diff --git a/sys/conf/defines b/sys/conf/defines new file mode 100644 index 00000000000..ea7046324e3 --- /dev/null +++ b/sys/conf/defines @@ -0,0 +1,15 @@ +/:#if.*[ \t]*KPROF/d +/:#if.*[ \t]*PGINPROF/d +/:#if.*[ \t]*UNFAST/d +/:#if.*[ \t]*INSECURE/d +/:#if.*[ \t]*TRACE/d +/:#if.*[ \t]*DISKMON/d +/:#if.*[ \t]*INTRLVE/d +/:#if.*[ \t]*lint/d +/:#if.*[ \t]*notdef/d +/:#if.*[ \t]*unneeded/d +/:#if.*[ \t]*vax/d +/:#if.*[ \t]*TCPTRUEOOB/d +/:#if.*[ \t]*irele/d +/:#if.*[ \t]*ilock/d +/:#if.*[ \t]*notyet/d diff --git a/sys/conf/files b/sys/conf/files new file mode 100644 index 00000000000..c083f2e1dee --- /dev/null +++ b/sys/conf/files @@ -0,0 +1,274 @@ +isofs/cd9660/cd9660_bmap.c optional cd9660 +isofs/cd9660/cd9660_lookup.c optional cd9660 +isofs/cd9660/cd9660_node.c optional cd9660 +isofs/cd9660/cd9660_rrip.c optional cd9660 +isofs/cd9660/cd9660_util.c optional cd9660 +isofs/cd9660/cd9660_vfsops.c optional cd9660 +isofs/cd9660/cd9660_vnops.c optional cd9660 +kdb/kdb_access.c optional kadb +kdb/kdb_command.c optional kadb +kdb/kdb_ctype.c optional kadb +kdb/kdb_expr.c optional kadb +kdb/kdb_format.c optional kadb +kdb/kdb_input.c optional kadb +kdb/kdb_message.c optional kadb +kdb/kdb_output.c optional kadb +kdb/kdb_pcs.c optional kadb +kdb/kdb_print.c optional kadb +kdb/kdb_runpcs.c optional kadb +kdb/kdb_sym.c optional kadb +kdb/kdb_trap.c optional kadb +kern/init_main.c standard +kern/init_sysent.c standard +kern/kern_acct.c standard +kern/kern_clock.c standard +kern/kern_descrip.c standard +kern/kern_exec.c standard +kern/kern_exit.c standard +kern/kern_fork.c standard +kern/kern_ktrace.c standard +kern/kern_malloc.c standard +kern/kern_physio.c standard +kern/kern_proc.c standard +kern/kern_prot.c standard +kern/kern_resource.c standard +kern/kern_sig.c standard +kern/kern_subr.c standard +kern/kern_synch.c standard +kern/kern_sysctl.c standard +kern/kern_time.c standard +kern/kern_xxx.c standard +kern/subr_log.c standard +kern/subr_prf.c standard +kern/subr_prof.c 
standard +kern/subr_rmap.c standard +kern/subr_xxx.c standard +kern/sys_generic.c standard +kern/sys_process.c standard +kern/sys_socket.c standard +kern/sysv_shm.c optional sysvshm +kern/tty.c standard +kern/tty_compat.c standard +kern/tty_conf.c standard +kern/tty_pty.c optional pty +kern/tty_subr.c standard +kern/tty_tb.c optional tb +kern/tty_tty.c standard +kern/uipc_domain.c standard +kern/uipc_mbuf.c standard +kern/uipc_proto.c standard +kern/uipc_socket.c standard +kern/uipc_socket2.c standard +kern/uipc_syscalls.c standard +kern/uipc_usrreq.c standard +kern/vfs_bio.c standard +kern/vfs_cache.c standard +kern/vfs_cluster.c standard +kern/vfs_conf.c standard +kern/vfs_init.c standard +kern/vfs_lookup.c standard +kern/vfs_subr.c standard +kern/vfs_syscalls.c standard +kern/vfs_vnops.c standard +miscfs/deadfs/dead_vnops.c standard +miscfs/fdesc/fdesc_vfsops.c optional fdesc +miscfs/fdesc/fdesc_vnops.c optional fdesc +miscfs/fifofs/fifo_vnops.c optional fifo +miscfs/kernfs/kernfs_vfsops.c optional kernfs +miscfs/kernfs/kernfs_vnops.c optional kernfs +miscfs/nullfs/null_subr.c optional nullfs +miscfs/nullfs/null_vfsops.c optional nullfs +miscfs/nullfs/null_vnops.c optional nullfs +miscfs/portal/portal_vfsops.c optional portal +miscfs/portal/portal_vnops.c optional portal +miscfs/procfs/procfs_subr.c optional procfs +miscfs/procfs/procfs_vnops.c optional procfs +miscfs/procfs/procfs_vfsops.c optional procfs +miscfs/procfs/procfs_note.c optional procfs +miscfs/procfs/procfs_mem.c optional procfs +miscfs/procfs/procfs_ctl.c optional procfs +miscfs/procfs/procfs_status.c optional procfs +miscfs/procfs/procfs_regs.c optional procfs +miscfs/procfs/procfs_fpregs.c optional procfs +miscfs/specfs/spec_vnops.c standard +miscfs/umapfs/umap_subr.c optional umapfs +miscfs/umapfs/umap_vfsops.c optional umapfs +miscfs/umapfs/umap_vnops.c optional umapfs +miscfs/union/union_subr.c optional union +miscfs/union/union_vfsops.c optional union +miscfs/union/union_vnops.c optional 
union +net/bpf.c optional bpfilter +net/bpf_filter.c optional bpfilter +net/if.c standard +net/if_ethersubr.c optional ether +net/if_loop.c optional loop +net/if_sl.c optional sl +net/radix.c standard +net/raw_cb.c standard +net/raw_usrreq.c standard +net/route.c standard +net/rtsock.c standard +net/slcompress.c optional sl +netccitt/ccitt_proto.c optional ccitt +netccitt/llc_input.c optional llc +netccitt/llc_output.c optional llc +netccitt/llc_subr.c optional llc +netccitt/llc_timer.c optional llc +netccitt/pk_llcsubr.c optional llc +netccitt/pk_llcsubr.c optional hdlc +netccitt/hd_debug.c optional hdlc +netccitt/hd_input.c optional hdlc +netccitt/hd_output.c optional hdlc +netccitt/hd_subr.c optional hdlc +netccitt/hd_timer.c optional hdlc +netccitt/if_x25subr.c optional ccitt +netccitt/pk_acct.c optional ccitt +netccitt/pk_debug.c optional ccitt +netccitt/pk_input.c optional ccitt +netccitt/pk_output.c optional ccitt +netccitt/pk_subr.c optional ccitt +netccitt/pk_timer.c optional ccitt +netccitt/pk_usrreq.c optional ccitt +netimp/if_imp.c optional imp +netimp/if_imphost.c optional imp +netimp/raw_imp.c optional imp +netinet/if_ether.c optional ether +netinet/igmp.c optional inet +netinet/in.c optional inet +netinet/in_pcb.c optional inet +netinet/in_proto.c optional inet +netinet/ip_icmp.c optional inet +netinet/ip_input.c optional inet +netinet/ip_mroute.c optional inet mrouting +netinet/ip_output.c optional inet +netinet/raw_ip.c optional inet +netinet/tcp_debug.c optional inet +netinet/tcp_input.c optional inet +netinet/tcp_output.c optional inet +netinet/tcp_subr.c optional inet +netinet/tcp_timer.c optional inet +netinet/tcp_usrreq.c optional inet +netinet/udp_usrreq.c optional inet +netiso/clnp_debug.c optional iso +netiso/clnp_er.c optional iso +netiso/clnp_frag.c optional iso +netiso/clnp_input.c optional iso +netiso/clnp_options.c optional iso +netiso/clnp_output.c optional iso +netiso/clnp_raw.c optional iso +netiso/clnp_subr.c optional iso 
+netiso/clnp_timer.c optional iso +netiso/cltp_usrreq.c optional iso +netiso/esis.c optional iso +netiso/idrp_usrreq.c optional iso +netiso/if_eon.c optional eon +netiso/iso.c optional iso +netiso/iso_chksum.c optional iso +netiso/iso_pcb.c optional iso +netiso/iso_proto.c optional iso +netiso/iso_snpac.c optional iso +netiso/tp_astring.c optional iso +netiso/tp_astring.c optional tpip +netiso/tp_cons.c optional iso +netiso/tp_driver.c optional iso +netiso/tp_driver.c optional tpip +netiso/tp_emit.c optional iso +netiso/tp_emit.c optional tpip +netiso/tp_inet.c optional iso +netiso/tp_inet.c optional tpip +netiso/tp_input.c optional iso +netiso/tp_input.c optional tpip +netiso/tp_iso.c optional iso +netiso/tp_meas.c optional iso +netiso/tp_meas.c optional tpip +netiso/tp_output.c optional iso +netiso/tp_output.c optional tpip +netiso/tp_pcb.c optional iso +netiso/tp_pcb.c optional tpip +netiso/tp_subr.c optional iso +netiso/tp_subr.c optional tpip +netiso/tp_subr2.c optional iso +netiso/tp_subr2.c optional tpip +netiso/tp_timer.c optional iso +netiso/tp_timer.c optional tpip +netiso/tp_trace.c optional iso +netiso/tp_trace.c optional tpip +netiso/tp_usrreq.c optional iso +netiso/tp_usrreq.c optional tpip +netiso/tuba_subr.c optional iso tuba +netiso/tuba_table.c optional iso tuba +netiso/tuba_usrreq.c optional iso tuba +netns/idp_usrreq.c optional ns +netns/ns.c optional ns +netns/ns_error.c optional ns +netns/ns_input.c optional ns +netns/ns_ip.c optional ns +netns/ns_output.c optional ns +netns/ns_pcb.c optional ns +netns/ns_proto.c optional ns +netns/spp_debug.c optional ns +netns/spp_usrreq.c optional ns +nfs/nfs_bio.c optional nfs +nfs/nfs_node.c optional nfs +nfs/nfs_nqlease.c optional nfs +nfs/nfs_serv.c optional nfs +nfs/nfs_socket.c optional nfs +nfs/nfs_srvcache.c optional nfs +nfs/nfs_subs.c optional nfs +nfs/nfs_syscalls.c optional nfs +nfs/nfs_vfsops.c optional nfs +nfs/nfs_vnops.c optional nfs +ufs/ffs/ffs_alloc.c optional ffs +ufs/ffs/ffs_alloc.c 
optional mfs +ufs/ffs/ffs_balloc.c optional ffs +ufs/ffs/ffs_balloc.c optional mfs +ufs/ffs/ffs_inode.c optional ffs +ufs/ffs/ffs_inode.c optional mfs +ufs/ffs/ffs_subr.c optional ffs +ufs/ffs/ffs_subr.c optional mfs +ufs/ffs/ffs_tables.c optional ffs +ufs/ffs/ffs_tables.c optional mfs +ufs/ffs/ffs_vfsops.c optional ffs +ufs/ffs/ffs_vfsops.c optional mfs +ufs/ffs/ffs_vnops.c optional ffs +ufs/ffs/ffs_vnops.c optional mfs +ufs/lfs/lfs_alloc.c optional lfs +ufs/lfs/lfs_bio.c optional lfs +ufs/lfs/lfs_balloc.c optional lfs +ufs/lfs/lfs_cksum.c optional lfs +ufs/lfs/lfs_debug.c optional lfs +ufs/lfs/lfs_inode.c optional lfs +ufs/lfs/lfs_segment.c optional lfs +ufs/lfs/lfs_subr.c optional lfs +ufs/lfs/lfs_syscalls.c optional lfs +ufs/lfs/lfs_vfsops.c optional lfs +ufs/lfs/lfs_vnops.c optional lfs +ufs/mfs/mfs_vfsops.c optional mfs +ufs/mfs/mfs_vnops.c optional mfs +ufs/ufs/ufs_bmap.c standard +ufs/ufs/ufs_disksubr.c standard +ufs/ufs/ufs_ihash.c standard +ufs/ufs/ufs_inode.c standard +ufs/ufs/ufs_lockf.c standard +ufs/ufs/ufs_lookup.c standard +ufs/ufs/ufs_quota.c standard +ufs/ufs/ufs_vfsops.c standard +ufs/ufs/ufs_vnops.c standard +vm/device_pager.c optional devpager +vm/kern_lock.c standard +vm/swap_pager.c optional swappager +vm/vm_fault.c standard +vm/vm_glue.c standard +vm/vm_init.c standard +vm/vm_kern.c standard +vm/vm_map.c standard +vm/vm_meter.c standard +vm/vm_mmap.c standard +vm/vm_object.c standard +vm/vm_page.c standard +vm/vm_pageout.c standard +vm/vm_pager.c standard +vm/vm_swap.c standard +vm/vm_unix.c standard +vm/vm_user.c standard +vm/vnode_pager.c optional vnodepager diff --git a/sys/conf/files.newconf b/sys/conf/files.newconf new file mode 100644 index 00000000000..7b0907ab894 --- /dev/null +++ b/sys/conf/files.newconf @@ -0,0 +1,274 @@ +# @(#)files.newconf 8.9 (Berkeley) 3/31/94 + +# generic attributes +define disk +define tape +define ifnet +define tty + +# net device attributes - we have generic code for ether. 
+# we should have imp but right now it is a pseudo-device. +define ether +# define imp +pseudo-device imp + +# scsi driver and associated stuff +define scsi { target = -1 } +device tg at scsi { drive = -1 } +file dev/scsi/scsi_subr.c scsi + +device sd at tg: disk +file dev/scsi/sd.c sd needs-flag + +# device st at tg: tape -- not yet + +# legitimate pseudo-devices +pseudo-device bpfilter +pseudo-device cd: disk +pseudo-device loop +pseudo-device pty: tty +pseudo-device sl +pseudo-device vn: disk + +# kernel sources +file isofs/cd9660/isofs_bmap.c isofs +file isofs/cd9660/isofs_lookup.c isofs +file isofs/cd9660/isofs_node.c isofs +file isofs/cd9660/isofs_rrip.c isofs +file isofs/cd9660/isofs_util.c isofs +file isofs/cd9660/isofs_vfsops.c isofs +file isofs/cd9660/isofs_vnops.c isofs +file kern/init_main.c +file kern/init_sysent.c +file kern/kern_acct.c +file kern/kern_clock.c +file kern/kern_descrip.c +file kern/kern_exec.c +file kern/kern_exit.c +file kern/kern_fork.c +file kern/kern_ktrace.c ktrace +file kern/kern_malloc.c +file kern/kern_physio.c +file kern/kern_proc.c +file kern/kern_prot.c +file kern/kern_resource.c +file kern/kern_sig.c +file kern/kern_subr.c +file kern/kern_synch.c +file kern/kern_sysctl.c +file kern/kern_time.c +file kern/kern_xxx.c +file kern/subr_autoconf.c +file kern/subr_log.c +file kern/subr_prf.c +file kern/subr_prof.c +file kern/subr_rmap.c +file kern/subr_xxx.c +file kern/sys_generic.c +file kern/sys_process.c +file kern/sys_socket.c +file kern/sysv_shm.c sysvshm +file kern/tty.c +file kern/tty_compat.c +file kern/tty_conf.c +file kern/tty_pty.c pty needs-count +file kern/tty_subr.c +file kern/tty_tb.c tb needs-flag +file kern/tty_tty.c +file kern/uipc_domain.c +file kern/uipc_mbuf.c +file kern/uipc_proto.c +file kern/uipc_socket.c +file kern/uipc_socket2.c +file kern/uipc_syscalls.c +file kern/uipc_usrreq.c +file kern/vfs_bio.c +file kern/vfs_cache.c +file kern/vfs_cluster.c +file kern/vfs_conf.c +file kern/vfs_init.c +file 
kern/vfs_lookup.c +file kern/vfs_subr.c +file kern/vfs_syscalls.c +file kern/vfs_vnops.c +file miscfs/deadfs/dead_vnops.c +file miscfs/fdesc/fdesc_vfsops.c fdesc +file miscfs/fdesc/fdesc_vnops.c fdesc +file miscfs/fifofs/fifo_vnops.c fifo +file miscfs/kernfs/kernfs_vfsops.c kernfs +file miscfs/kernfs/kernfs_vnops.c kernfs +file miscfs/nullfs/null_subr.c nullfs +file miscfs/nullfs/null_vfsops.c nullfs +file miscfs/nullfs/null_vnops.c nullfs +file miscfs/portal/portal_vfsops.c portal +file miscfs/portal/portal_vnops.c portal +file miscfs/procfs/procfs_subr.c procfs +file miscfs/procfs/procfs_vnops.c procfs +file miscfs/procfs/procfs_vfsops.c procfs +file miscfs/procfs/procfs_note.c procfs +file miscfs/procfs/procfs_mem.c procfs +file miscfs/procfs/procfs_ctl.c procfs +file miscfs/procfs/procfs_status.c procfs +file miscfs/procfs/procfs_regs.c procfs +file miscfs/procfs/procfs_fpregs.c procfs +file miscfs/specfs/spec_vnops.c +file miscfs/umapfs/umap_subr.c umapfs +file miscfs/umapfs/umap_vfsops.c umapfs +file miscfs/umapfs/umap_vnops.c umapfs +file miscfs/union/union_subr.c union +file miscfs/union/union_vfsops.c union +file miscfs/union/union_vnops.c union +file net/bpf.c bpfilter needs-count +file net/bpf_filter.c bpfilter needs-count +file net/if.c +file net/if_ethersubr.c ether needs-flag +file net/if_loop.c loop needs-count +file net/if_sl.c sl needs-count +file net/radix.c +file net/raw_cb.c +file net/raw_usrreq.c +file net/route.c +file net/rtsock.c +file net/slcompress.c sl +file netccitt/ccitt_proto.c ccitt +file netccitt/llc_input.c llc +file netccitt/llc_output.c llc +file netccitt/llc_subr.c llc +file netccitt/llc_timer.c llc +file netccitt/hd_debug.c hdlc +file netccitt/hd_input.c hdlc +file netccitt/hd_output.c hdlc +file netccitt/hd_subr.c hdlc +file netccitt/hd_timer.c hdlc +file netccitt/if_x25subr.c ccitt +file netccitt/pk_acct.c ccitt +file netccitt/pk_debug.c ccitt +file netccitt/pk_input.c ccitt +file netccitt/pk_llcsubr.c llc hdlc +file 
netccitt/pk_output.c ccitt +file netccitt/pk_subr.c ccitt +file netccitt/pk_timer.c ccitt +file netccitt/pk_usrreq.c ccitt +file netimp/if_imp.c imp needs-count +file netimp/if_imphost.c imp needs-count +file netimp/raw_imp.c imp +file netinet/if_ether.c ether +file netinet/igmp.c inet +file netinet/in.c inet +file netinet/in_pcb.c inet +file netinet/in_proto.c inet +file netinet/ip_icmp.c inet +file netinet/ip_input.c inet +file netinet/ip_mroute.c inet +file netinet/ip_output.c inet +file netinet/raw_ip.c inet +file netinet/tcp_debug.c inet +file netinet/tcp_input.c inet +file netinet/tcp_output.c inet +file netinet/tcp_subr.c inet +file netinet/tcp_timer.c inet +file netinet/tcp_usrreq.c inet +file netinet/udp_usrreq.c inet +file netiso/clnp_debug.c iso +file netiso/clnp_er.c iso +file netiso/clnp_frag.c iso +file netiso/clnp_input.c iso +file netiso/clnp_options.c iso +file netiso/clnp_output.c iso +file netiso/clnp_raw.c iso +file netiso/clnp_subr.c iso +file netiso/clnp_timer.c iso +file netiso/cltp_usrreq.c iso +file netiso/esis.c iso +file netiso/if_eon.c eon +file netiso/idrp_usrreq.c iso +file netiso/iso.c iso +file netiso/iso_chksum.c iso +file netiso/iso_pcb.c iso +file netiso/iso_proto.c iso +file netiso/iso_snpac.c iso +file netiso/tp_astring.c iso tpip +file netiso/tp_cons.c iso +file netiso/tp_driver.c iso tpip +file netiso/tp_emit.c iso tpip +file netiso/tp_inet.c iso tpip +file netiso/tp_input.c iso tpip +file netiso/tp_iso.c iso +file netiso/tp_meas.c iso tpip +file netiso/tp_output.c iso tpip +file netiso/tp_pcb.c iso tpip +file netiso/tp_subr.c iso tpip +file netiso/tp_subr2.c iso tpip +file netiso/tp_timer.c iso tpip +file netiso/tp_trace.c iso tpip +file netiso/tp_usrreq.c iso tpip +file netiso/tuba_subr.c iso tuba +file netiso/tuba_table.c iso tuba +file netiso/tuba_usrreq.c iso tuba +file netns/idp_usrreq.c ns +file netns/ns.c ns +file netns/ns_error.c ns +file netns/ns_input.c ns +file netns/ns_ip.c ns +file netns/ns_output.c ns +file 
netns/ns_pcb.c ns +file netns/ns_proto.c ns +file netns/spp_debug.c ns +file netns/spp_usrreq.c ns +file nfs/nfs_bio.c nfs +file nfs/nfs_node.c nfs +file nfs/nfs_nqlease.c nfs +file nfs/nfs_serv.c nfs +file nfs/nfs_socket.c nfs +file nfs/nfs_srvcache.c nfs +file nfs/nfs_subs.c nfs +file nfs/nfs_syscalls.c nfs +file nfs/nfs_vfsops.c nfs +file nfs/nfs_vnops.c nfs +file ufs/ffs/ffs_alloc.c ffs mfs +file ufs/ffs/ffs_balloc.c ffs mfs +file ufs/ffs/ffs_inode.c ffs mfs +file ufs/ffs/ffs_subr.c ffs mfs +file ufs/ffs/ffs_tables.c ffs mfs +file ufs/ffs/ffs_vfsops.c ffs mfs +file ufs/ffs/ffs_vnops.c ffs mfs +file ufs/lfs/lfs_alloc.c lfs +file ufs/lfs/lfs_bio.c lfs +file ufs/lfs/lfs_balloc.c lfs +file ufs/lfs/lfs_cksum.c lfs +file ufs/lfs/lfs_debug.c lfs +file ufs/lfs/lfs_inode.c lfs +file ufs/lfs/lfs_segment.c lfs +file ufs/lfs/lfs_subr.c lfs +file ufs/lfs/lfs_syscalls.c lfs +file ufs/lfs/lfs_vfsops.c lfs +file ufs/lfs/lfs_vnops.c lfs +file ufs/mfs/mfs_vfsops.c mfs +file ufs/mfs/mfs_vnops.c mfs +file ufs/ufs/ufs_bmap.c ffs lfs mfs +file ufs/ufs/ufs_disksubr.c ffs lfs mfs +file ufs/ufs/ufs_ihash.c ffs lfs mfs +file ufs/ufs/ufs_inode.c ffs lfs mfs +file ufs/ufs/ufs_lockf.c ffs lfs mfs +file ufs/ufs/ufs_lookup.c ffs lfs mfs +file ufs/ufs/ufs_quota.c ffs lfs mfs +file ufs/ufs/ufs_vfsops.c ffs lfs mfs +file ufs/ufs/ufs_vnops.c ffs lfs mfs +file vm/device_pager.c devpager +file vm/kern_lock.c +file vm/swap_pager.c swappager +file vm/vm_fault.c +file vm/vm_glue.c +file vm/vm_init.c +file vm/vm_kern.c +file vm/vm_map.c +file vm/vm_meter.c +file vm/vm_mmap.c +file vm/vm_object.c +file vm/vm_page.c +file vm/vm_pageout.c +file vm/vm_pager.c +file vm/vm_swap.c +file vm/vm_unix.c +file vm/vm_user.c +file vm/vnode_pager.c vnodepager diff --git a/sys/conf/newvers.sh b/sys/conf/newvers.sh new file mode 100644 index 00000000000..83a2f04ad7a --- /dev/null +++ b/sys/conf/newvers.sh @@ -0,0 +1,48 @@ +#!/bin/sh - +# +# Copyright (c) 1984, 1986, 1990, 1993 +# The Regents of the University of 
California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# @(#)newvers.sh 8.1 (Berkeley) 4/20/94 + +if [ ! 
-r version ] +then + echo 0 > version +fi + +touch version +v=`cat version` u=${USER-root} d=`pwd` h=`hostname` t=`date` +echo "char ostype[] = \"4.4BSD\";" > vers.c +echo "char osrelease[] = \"4.4BSD-Lite\";" >> vers.c +echo "char sccs[4] = { '@', '(', '#', ')' };" >>vers.c +echo "char version[] = \"4.4BSD-Lite #${v}: ${t}\\n ${u}@${h}:${d}\\n\";" >>vers.c + +echo `expr ${v} + 1` > version diff --git a/sys/conf/nfsswapvmunix.c b/sys/conf/nfsswapvmunix.c new file mode 100644 index 00000000000..f9812eb3314 --- /dev/null +++ b/sys/conf/nfsswapvmunix.c @@ -0,0 +1,152 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfsswapvmunix.c 8.1 (Berkeley) 6/10/93 + */ + +/* + * Sample NFS swapvmunix configuration file. + * This should be filled in by the bootstrap program. + * See /sys/nfs/nfsdiskless.h for details of the fields. + */ + +#include +#include +#include +#include + +#include +#include +#include + +extern int nfs_mountroot(); +int (*mountroot)() = nfs_mountroot; + +dev_t rootdev = NODEV; +dev_t argdev = NODEV; +dev_t dumpdev = NODEV; + +struct swdevt swdevt[] = { + { NODEV, 0, 5000 }, /* happy:/u/swap.dopey */ + { 0, 0, 0 } +}; +struct nfs_diskless nfs_diskless = { + { { 'q', 'e', '0', '\0' }, + { 0x10, 0x2, { 0x0, 0x0, 0x83, 0x68, 0x30, 0x2, } }, + { 0x10, 0x2, { 0x0, 0x0, 0x83, 0x68, 0x30, 0xff, } }, + { 0x10, 0x0, { 0x0, 0x0, 0xff, 0xff, 0xff, 0x0, } }, + }, + { 0x10, 0x2, { 0x0, 0x0, 0x83, 0x68, 0x30, 0x12, } }, + { + (struct sockaddr *)0, SOCK_DGRAM, 0, (nfsv2fh_t *)0, + 0, 8192, 8192, 10, 100, (char *)0, + }, + { + 0xf, + 0x9, + 0x0, + 0x0, + 0x1, + 0x0, + 0x0, + 0x0, + 0xc, + 0x0, + 0x0, + 0x0, + 0x6, + 0x0, + 0x0, + 0x0, + 0x27, + 0x18, + 0x79, + 0x27, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + }, + { 0x10, 0x2, { 0x8, 0x1, 0x83, 0x68, 0x30, 0x5, } }, + "happy", + { + (struct sockaddr *)0, SOCK_DGRAM, 0, (nfsv2fh_t *)0, + 0, 8192, 8192, 10, 100, (char *)0, + }, + { + 0x0, + 0x9, + 0x0, + 0x0, + 0x1, + 0x0, + 0x0, + 0x0, + 0xc, + 0x0, + 0x0, + 0x0, + 0x2, + 0x0, + 0x0, + 0x0, + 0xd0, + 
0x48, + 0x42, + 0x25, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + 0x0, + }, + { 0x10, 0x2, { 0x8, 0x1, 0x83, 0x68, 0x30, 0x5, } }, + "happy", +}; diff --git a/sys/conf/param.c b/sys/conf/param.c new file mode 100644 index 00000000000..9f4e2cae857 --- /dev/null +++ b/sys/conf/param.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 1980, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)param.c 8.2 (Berkeley) 1/21/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef SYSVSHM +#include +#include +#endif + +/* + * System parameter formulae. + * + * This file is copied into each directory where we compile + * the kernel; it should be modified there to suit local taste + * if necessary. + * + * Compiled with -DHZ=xx -DTIMEZONE=x -DDST=x -DMAXUSERS=xx + */ + +#ifndef HZ +#define HZ 100 +#endif +int hz = HZ; +int tick = 1000000 / HZ; +int tickadj = 30000 / (60 * HZ); /* can adjust 30ms in 60s */ +struct timezone tz = { TIMEZONE, DST }; +#define NPROC (20 + 16 * MAXUSERS) +int maxproc = NPROC; +#define NTEXT (80 + NPROC / 8) /* actually the object cache */ +#define NVNODE (NPROC + NTEXT + 100) +int desiredvnodes = NVNODE; +int maxfiles = 3 * (NPROC + MAXUSERS) + 80; +int ncallout = 16 + NPROC; +int nclist = 60 + 12 * MAXUSERS; +int nmbclusters = NMBCLUSTERS; +int fscale = FSCALE; /* kernel uses `FSCALE', user uses `fscale' */ + +/* + * Values in support of System V compatible shared memory. 
XXX + */ +#ifdef SYSVSHM +#define SHMMAX (SHMMAXPGS*NBPG) +#define SHMMIN 1 +#define SHMMNI 32 /* <= SHMMMNI in shm.h */ +#define SHMSEG 8 +#define SHMALL (SHMMAXPGS/CLSIZE) + +struct shminfo shminfo = { + SHMMAX, + SHMMIN, + SHMMNI, + SHMSEG, + SHMALL +}; +#endif + +/* + * These are initialized at bootstrap time + * to values dependent on memory size + */ +int nbuf, nswbuf; + +/* + * These have to be allocated somewhere; allocating + * them here forces loader errors if this file is omitted + * (if they've been externed everywhere else; hah!). + */ +struct callout *callout; +struct cblock *cfree; +struct buf *buf, *swbuf; +char *buffers; + +/* + * Proc/pgrp hashing. + * Here so that hash table sizes can depend on MAXUSERS/NPROC. + * Hash size must be a power of two. + * NOW omission of this file will cause loader errors! + */ + +#if NPROC > 1024 +#define PIDHSZ 512 +#else +#if NPROC > 512 +#define PIDHSZ 256 +#else +#if NPROC > 256 +#define PIDHSZ 128 +#else +#define PIDHSZ 64 +#endif +#endif +#endif + +struct proc *pidhash[PIDHSZ]; +struct pgrp *pgrphash[PIDHSZ]; +int pidhashmask = PIDHSZ - 1; diff --git a/sys/conf/systags.sh b/sys/conf/systags.sh new file mode 100644 index 00000000000..90714d76974 --- /dev/null +++ b/sys/conf/systags.sh @@ -0,0 +1,72 @@ +#! /bin/sh +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. 
All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# @(#)systags.sh 8.1 (Berkeley) 6/10/93 +# +# systags.sh - construct a system tags file using dependence relations +# in a .depend file +# +# First written May 16, 1992 by Van Jacobson, Lawrence Berkeley Laboratory. 
+# +# from: $Header: systags.sh,v 1.7 92/07/12 08:18:21 torek Exp $ + +rm -f tags tags.tmp tags.cfiles tags.sfiles tags.hfiles +MACHINE=`machine` +sed -e "s,\./machine/,../../$MACHINE/include/,g" \ + -e 's,[a-z][^/ ]*/\.\./,,g' .depend | awk '{ + for (i = 1; i <= NF; ++i) { + t = substr($i, length($i) - 1) + if (t == ".c") + cfiles[$i] = 1; + else if (t == ".h") + hfiles[$i] = 1; + else if (t == ".s") + sfiles[$i] = 1; + } + }; + END { + for (i in cfiles) + print i > "tags.cfiles"; + for (i in sfiles) + print i > "tags.sfiles"; + for (i in hfiles) + print i > "tags.hfiles"; + }' + +ctags -t -d -w `cat tags.cfiles tags.hfiles tags.sfiles` +egrep -o "^ENTRY\(.*\)|^ALTENTRY\(.*\)" `cat tags.sfiles` | \ + sed "s;\([^:]*\):\([^(]*\)(\([^, )]*\)\(.*\);\3 \1 /^\2(\3\4$/;" >> tags + +mv tags tags.tmp +sort -u tags.tmp > tags +rm tags.tmp tags.cfiles tags.sfiles tags.hfiles diff --git a/sys/fs/cd9660/TODO b/sys/fs/cd9660/TODO new file mode 100644 index 00000000000..555d26ad7d1 --- /dev/null +++ b/sys/fs/cd9660/TODO @@ -0,0 +1,77 @@ +# $Id: TODO,v 1.4 1993/09/07 15:40:51 ws Exp $ + + 1) should understand "older", original High Sierra ("CDROM001") type + + Not yet. ( I don't have this technical information, yet. ) + + 2) should understand Rock Ridge + + Yes, we have follows function. + + o Symbolic Link + o Real Name(long name) + o File Attribute + o Time stamp + o uid, gid + o Devices + o Relocated directories + + Except follows: + + o POSIX device number mapping + + There is some preliminary stuff in there that (ab-)uses the mknod + system call, but this needs a writable filesystem + + 3) should be called cdfs, as there are other ISO file system soon possible + + Not yet. Probably we should make another file system when the ECMA draft + is valid and do it. For doing Rock Ridge Support, I can use almost same + code. So I just use the same file system interface... + + 4) should have file handles implemented for use with NFS, etc + + Yes. 
We already have this one, and I based it for this release. + + 5) should have name translation enabled by mount flag + + Yes. We can disable the Rock Ridge Extension with the following option: + + "mount -t isofs -o -norrip /dev/cd0d /cdrom" + + 6) should run as a user process, and not take up kernel space (cdroms + are slow) + + Not yet. + + 7) ECMA support. + + Not yet. We need not only a technical spec but also an ECMA format + cd-rom itself! + + 8) Character set change by SVD ( multi SVD support ) + + Not yet. We should also hack the other part of system as 8 bit + clean. As far as I know, if you export the cdrom by NFS, the client + can access the 8 bit clean (i.e. Solaris Japanese with EUC code) + + 9) Access checks in isofs_access + + Not yet. + + 10) Support for generation numbers + + Yes. Default is to list only the last file (the one with the highest + generation number). If you mount with -gen, all files are shown with + their generation numbers. In both cases you can specify the generation + number on opening files (if you happen to know it) or leave it off, + when it will again find the last file. + + 11) Support for extended attributes + + Yes. Since this requires an extra block buffer for the attributes + this must be enabled on mounting with the option -extattr. + +---------- +Last update July 19, '93 by Atsushi Murai. (amurai@spec.co.jp) +Last update August 19, '93 by Wolfgang Solfrank. (ws@tools.de) diff --git a/sys/fs/cd9660/TODO.hibler b/sys/fs/cd9660/TODO.hibler new file mode 100644 index 00000000000..3501aa296cd --- /dev/null +++ b/sys/fs/cd9660/TODO.hibler @@ -0,0 +1,22 @@ +1. Investigate making ISOFS another UFS shared filesystem (ala FFS/MFS/LFS). + Since it was modelled after the inode code, we might be able to merge + them back. It looks like a separate (but very similar) lookup routine + will be needed due to the associated file stuff. + +2. Make filesystem exportable. This comes for free if stacked with UFS.
+ Otherwise, the ufs_export routines need to be elevated to vfs_* routines. + [ DONE - hibler ] + +3. If it can't be merged with UFS, at least get them in sync. For example, + it could use the same style hashing routines as in ufs/ufs_ihash.c + +4. It would be nice to be able to use the vfs_cluster code. + Unfortunately, if the logical block size is smaller than the page size, + it won't work. Also, if throughput is relatively constant for any + block size (as it is for the HP drive--150kbs) then clustering may not + buy much (or may even hurt when vfs_cluster comes up with a large sync + cluster). + +5. Seems like there should be a "notrans" or some such mount option to show + filenames as they really are without lower-casing, stripping of version + numbers, etc. Does this make sense? diff --git a/sys/fs/cd9660/cd9660_bmap.c b/sys/fs/cd9660/cd9660_bmap.c new file mode 100644 index 00000000000..911eedfd06a --- /dev/null +++ b/sys/fs/cd9660/cd9660_bmap.c @@ -0,0 +1,102 @@ +/*- + * Copyright (c) 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley + * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension + * Support code is derived from software contributed to Berkeley + * by Atsushi Murai (amurai@spec.co.jp). + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)cd9660_bmap.c 8.3 (Berkeley) 1/23/94 + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * Bmap converts the logical block number of a file to its physical block + * number on the disk. The conversion is done by using the logical block + * number to index into the data block (extent) for the file. + */ +int +cd9660_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + } */ *ap; +{ + struct iso_node *ip = VTOI(ap->a_vp); + daddr_t lblkno = ap->a_bn; + long bsize; + + /* + * Check for underlying vnode requests and ensure that logical + * to physical mapping is requested.
+ */ + if (ap->a_vpp != NULL) + *ap->a_vpp = ip->i_devvp; + if (ap->a_bnp == NULL) + return (0); + + /* + * Compute the requested block number + */ + bsize = ip->i_mnt->logical_block_size; + *ap->a_bnp = (ip->iso_start + lblkno) * btodb(bsize); + + /* + * Determine maximum number of readahead blocks following the + * requested block. + */ + if (ap->a_runp) { + int nblk; + + nblk = (ip->i_size - (lblkno + 1) * bsize) / bsize; + if (nblk <= 0) + *ap->a_runp = 0; + else if (nblk >= MAXBSIZE/bsize) + *ap->a_runp = MAXBSIZE/bsize - 1; + else + *ap->a_runp = nblk; + } + + return 0; +} diff --git a/sys/fs/cd9660/cd9660_lookup.c b/sys/fs/cd9660/cd9660_lookup.c new file mode 100644 index 00000000000..62d1d3fc791 --- /dev/null +++ b/sys/fs/cd9660/cd9660_lookup.c @@ -0,0 +1,465 @@ +/*- + * Copyright (c) 1989, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley + * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension + * Support code is derived from software contributed to Berkeley + * by Atsushi Murai (amurai@spec.co.jp). + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)ufs_lookup.c 7.33 (Berkeley) 5/19/91 + * + * @(#)cd9660_lookup.c 8.2 (Berkeley) 1/23/94 + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +struct nchstats iso_nchstats; + +/* + * Convert a component of a pathname into a pointer to a locked inode. + * This is a very central and rather complicated routine. + * If the file system is not maintained in a strict tree hierarchy, + * this can result in a deadlock situation (see comments in code below). + * + * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on + * whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * If flag has LOCKPARENT or'ed into it and the target of the pathname + * exists, lookup returns both the target and its parent directory locked. 
+ * When creating or renaming and LOCKPARENT is specified, the target may + * not be ".". When deleting and LOCKPARENT is specified, the target may + * be "."., but the caller must check to ensure it does an vrele and iput + * instead of two iputs. + * + * Overall outline of ufs_lookup: + * + * check accessibility of directory + * look for name in cache, if found, then if at end of path + * and deleting or creating, drop it, else return name + * search for name in directory, to found or notfound + * notfound: + * if creating, return locked directory, leaving info on available slots + * else return error + * found: + * if at end of path and deleting, return information to allow delete + * if at end of path and rewriting (RENAME and LOCKPARENT), lock target + * inode and return info to allow rewrite + * if not at end, add name to cache; if at end and neither creating + * nor deleting, add name to cache + * + * NOTE: (LOOKUP | LOCKPARENT) currently returns the parent inode unlocked. + */ +cd9660_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + register struct vnode *vdp; /* vnode for directory being searched */ + register struct iso_node *dp; /* inode for directory being searched */ + register struct iso_mnt *imp; /* file system that directory is in */ + struct buf *bp; /* a buffer of directory entries */ + struct iso_directory_record *ep;/* the current directory entry */ + int entryoffsetinblock; /* offset of ep in bp's buffer */ + int saveoffset; /* offset of last directory entry in dir */ + int numdirpasses; /* strategy for directory search */ + doff_t endsearch; /* offset to end directory search */ + struct iso_node *pdp; /* saved dp during symlink work */ + struct iso_node *tdp; /* returned by iget */ + int lockparent; /* 1 => lockparent flag is set */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int error; + ino_t ino = 0; + int reclen; + u_short namelen; + 
char altname[NAME_MAX]; + int res; + int assoc, len; + char *name; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + struct ucred *cred = cnp->cn_cred; + int flags = cnp->cn_flags; + int nameiop = cnp->cn_nameiop; + + bp = NULL; + *vpp = NULL; + vdp = ap->a_dvp; + dp = VTOI(vdp); + imp = dp->i_mnt; + lockparent = flags & LOCKPARENT; + wantparent = flags & (LOCKPARENT|WANTPARENT); + + /* + * Check accessiblity of directory. + */ + if (vdp->v_type != VDIR) + return (ENOTDIR); + if (error = VOP_ACCESS(vdp, VEXEC, cred, cnp->cn_proc)) + return (error); + + /* + * We now have a segment name to search for, and a directory to search. + * + * Before tediously performing a linear scan of the directory, + * check the name cache to see if the directory/name pair + * we are looking for is known already. + */ + if (error = cache_lookup(vdp, vpp, cnp)) { + int vpid; /* capability number of vnode */ + + if (error == ENOENT) + return (error); +#ifdef PARANOID + if ((vdp->v_flag & VROOT) && (flags & ISDOTDOT)) + panic("ufs_lookup: .. through root"); +#endif + /* + * Get the next vnode in the path. + * See comment below starting `Step through' for + * an explaination of the locking protocol. + */ + pdp = dp; + dp = VTOI(*vpp); + vdp = *vpp; + vpid = vdp->v_id; + if (pdp == dp) { + VREF(vdp); + error = 0; + } else if (flags & ISDOTDOT) { + ISO_IUNLOCK(pdp); + error = vget(vdp, 1); + if (!error && lockparent && (flags & ISLASTCN)) + ISO_ILOCK(pdp); + } else { + error = vget(vdp, 1); + if (!lockparent || error || !(flags & ISLASTCN)) + ISO_IUNLOCK(pdp); + } + /* + * Check that the capability number did not change + * while we were waiting for the lock. 
+ */ + if (!error) { + if (vpid == vdp->v_id) + return (0); + iso_iput(dp); + if (lockparent && pdp != dp && (flags & ISLASTCN)) + ISO_IUNLOCK(pdp); + } + ISO_ILOCK(pdp); + dp = pdp; + vdp = ITOV(dp); + *vpp = NULL; + } + + len = cnp->cn_namelen; + name = cnp->cn_nameptr; + /* + * A leading `=' means, we are looking for an associated file + */ + if (assoc = (imp->iso_ftype != ISO_FTYPE_RRIP && *name == ASSOCCHAR)) { + len--; + name++; + } + + /* + * If there is cached information on a previous search of + * this directory, pick up where we last left off. + * We cache only lookups as these are the most common + * and have the greatest payoff. Caching CREATE has little + * benefit as it usually must search the entire directory + * to determine that the entry does not exist. Caching the + * location of the last DELETE or RENAME has not reduced + * profiling time and hence has been removed in the interest + * of simplicity. + */ + if (nameiop != LOOKUP || dp->i_diroff == 0 || + dp->i_diroff > dp->i_size) { + entryoffsetinblock = 0; + dp->i_offset = 0; + numdirpasses = 1; + } else { + dp->i_offset = dp->i_diroff; + entryoffsetinblock = iso_blkoff(imp, dp->i_offset); + if (entryoffsetinblock != 0) { + if (error = iso_blkatoff(dp, dp->i_offset, &bp)) + return (error); + } + numdirpasses = 2; + iso_nchstats.ncs_2passes++; + } + endsearch = roundup(dp->i_size, imp->logical_block_size); + +searchloop: + while (dp->i_offset < endsearch) { + /* + * If offset is on a block boundary, + * read the next directory block. + * Release previous if it exists. + */ + if (iso_blkoff(imp, dp->i_offset) == 0) { + if (bp != NULL) + brelse(bp); + if (error = iso_blkatoff(dp, dp->i_offset, &bp)) + return (error); + entryoffsetinblock = 0; + } + /* + * Get pointer to next entry. 
+ */ + ep = (struct iso_directory_record *) + (bp->b_un.b_addr + entryoffsetinblock); + + reclen = isonum_711 (ep->length); + if (reclen == 0) { + /* skip to next block, if any */ + dp->i_offset = + roundup(dp->i_offset, imp->logical_block_size); + continue; + } + + if (reclen < ISO_DIRECTORY_RECORD_SIZE) + /* illegal entry, stop */ + break; + + if (entryoffsetinblock + reclen > imp->logical_block_size) + /* entries are not allowed to cross boundaries */ + break; + + /* + * Check for a name match. + */ + namelen = isonum_711(ep->name_len); + + if (reclen < ISO_DIRECTORY_RECORD_SIZE + namelen) + /* illegal entry, stop */ + break; + + switch (imp->iso_ftype) { + default: + if ((!(isonum_711(ep->flags)&4)) == !assoc) { + if ((len == 1 + && *name == '.') + || (flags & ISDOTDOT)) { + if (namelen == 1 + && ep->name[0] == ((flags & ISDOTDOT) ? 1 : 0)) { + /* + * Save directory entry's inode number and + * reclen in ndp->ni_ufs area, and release + * directory buffer. + */ + isodirino(&dp->i_ino,ep,imp); + goto found; + } + if (namelen != 1 + || ep->name[0] != 0) + goto notfound; + } else if (!(res = isofncmp(name,len, + ep->name,namelen))) { + if (isonum_711(ep->flags)&2) + isodirino(&ino,ep,imp); + else + ino = dbtob(bp->b_blkno) + + entryoffsetinblock; + saveoffset = dp->i_offset; + } else if (ino) + goto foundino; +#ifdef NOSORTBUG /* On some CDs directory entries are not sorted correctly */ + else if (res < 0) + goto notfound; + else if (res > 0 && numdirpasses == 2) + numdirpasses++; +#endif + } + break; + case ISO_FTYPE_RRIP: + if (isonum_711(ep->flags)&2) + isodirino(&ino,ep,imp); + else + ino = dbtob(bp->b_blkno) + entryoffsetinblock; + dp->i_ino = ino; + cd9660_rrip_getname(ep,altname,&namelen,&dp->i_ino,imp); + if (namelen == cnp->cn_namelen + && !bcmp(name,altname,namelen)) + goto found; + ino = 0; + break; + } + dp->i_offset += reclen; + entryoffsetinblock += reclen; + } + if (ino) { +foundino: + dp->i_ino = ino; + if (saveoffset != dp->i_offset) { + if 
(iso_lblkno(imp,dp->i_offset) + != iso_lblkno(imp,saveoffset)) { + if (bp != NULL) + brelse(bp); + if (error = iso_blkatoff(dp, saveoffset, &bp)) + return (error); + } + ep = (struct iso_directory_record *)(bp->b_un.b_addr + + iso_blkoff(imp,saveoffset)); + dp->i_offset = saveoffset; + } + goto found; + } +notfound: + /* + * If we started in the middle of the directory and failed + * to find our target, we must check the beginning as well. + */ + if (numdirpasses == 2) { + numdirpasses--; + dp->i_offset = 0; + endsearch = dp->i_diroff; + goto searchloop; + } + if (bp != NULL) + brelse(bp); + /* + * Insert name into cache (as non-existent) if appropriate. + */ + if (cnp->cn_flags & MAKEENTRY) + cache_enter(vdp, *vpp, cnp); + if (nameiop == CREATE || nameiop == RENAME) + return (EJUSTRETURN); + return (ENOENT); + +found: + if (numdirpasses == 2) + iso_nchstats.ncs_pass2++; + if (bp != NULL) + brelse(bp); + + /* + * Found component in pathname. + * If the final component of path name, save information + * in the cache as to where the entry was found. + */ + if ((flags & ISLASTCN) && nameiop == LOOKUP) + dp->i_diroff = dp->i_offset; + + /* + * Step through the translation in the name. We do not `iput' the + * directory because we may need it again if a symbolic link + * is relative to the current directory. Instead we save it + * unlocked as "pdp". We must get the target inode before unlocking + * the directory to insure that the inode will not be removed + * before we get it. We prevent deadlock by always fetching + * inodes from the root, moving down the directory tree. Thus + * when following backward pointers ".." we must unlock the + * parent directory before getting the requested directory. + * There is a potential race condition here if both the current + * and parent directories are removed before the `iget' for the + * inode associated with ".." returns. 
We hope that this occurs + * infrequently since we cannot avoid this race condition without + * implementing a sophisticated deadlock detection algorithm. + * Note also that this simple deadlock detection scheme will not + * work if the file system has any hard links other than ".." + * that point backwards in the directory structure. + */ + pdp = dp; + /* + * If ino is different from dp->i_ino, + * it's a relocated directory. + */ + if (flags & ISDOTDOT) { + ISO_IUNLOCK(pdp); /* race to get the inode */ + if (error = iso_iget(dp,dp->i_ino, + dp->i_ino != ino, + &tdp,ep)) { + ISO_ILOCK(pdp); + return (error); + } + if (lockparent && (flags & ISLASTCN)) + ISO_ILOCK(pdp); + *vpp = ITOV(tdp); + } else if (dp->i_number == dp->i_ino) { + VREF(vdp); /* we want ourself, ie "." */ + *vpp = vdp; + } else { + if (error = iso_iget(dp,dp->i_ino,dp->i_ino!=ino,&tdp,ep)) + return (error); + if (!lockparent || !(flags & ISLASTCN)) + ISO_IUNLOCK(pdp); + *vpp = ITOV(tdp); + } + + /* + * Insert name into cache if appropriate. + */ + if (cnp->cn_flags & MAKEENTRY) + cache_enter(vdp, *vpp, cnp); + return (0); +} + +/* + * Return buffer with contents of block "offset" + * from the beginning of directory "ip". On success + * "*bpp" points to that buffer; on a read error the + * buffer is released, "*bpp" is cleared and the error returned. + */ +iso_blkatoff(ip, offset, bpp) + struct iso_node *ip; + doff_t offset; + struct buf **bpp; +{ + register struct iso_mnt *imp = ip->i_mnt; + daddr_t lbn = iso_lblkno(imp,offset); + int bsize = iso_blksize(imp,ip,lbn); + struct buf *bp; + int error; + + if (error = bread(ITOV(ip),lbn,bsize,NOCRED,&bp)) { + brelse(bp); + *bpp = 0; + return (error); + } + *bpp = bp; + + return (0); +} diff --git a/sys/fs/cd9660/cd9660_node.c b/sys/fs/cd9660/cd9660_node.c new file mode 100644 index 00000000000..d83a7a6f126 --- /dev/null +++ b/sys/fs/cd9660/cd9660_node.c @@ -0,0 +1,648 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1994 + * The Regents of the University of California. All rights reserved.
+ * + * This code is derived from software contributed to Berkeley + * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension + * Support code is derived from software contributed to Berkeley + * by Atsushi Murai (amurai@spec.co.jp). + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)cd9660_node.c 8.2 (Berkeley) 1/23/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define INOHSZ 512 +#if ((INOHSZ&(INOHSZ-1)) == 0) +#define INOHASH(dev,ino) (((dev)+((ino)>>12))&(INOHSZ-1)) +#else +#define INOHASH(dev,ino) (((unsigned)((dev)+((ino)>>12)))%INOHSZ) +#endif + +union iso_ihead { + union iso_ihead *ih_head[2]; + struct iso_node *ih_chain[2]; +} iso_ihead[INOHSZ]; + +#ifdef ISODEVMAP +#define DNOHSZ 64 +#if ((DNOHSZ&(DNOHSZ-1)) == 0) +#define DNOHASH(dev,ino) (((dev)+((ino)>>12))&(DNOHSZ-1)) +#else +#define DNOHASH(dev,ino) (((unsigned)((dev)+((ino)>>12)))%DNOHSZ) +#endif + +union iso_dhead { + union iso_dhead *dh_head[2]; + struct iso_dnode *dh_chain[2]; +} iso_dhead[DNOHSZ]; +#endif + +int prtactive; /* 1 => print out reclaim of active vnodes */ + +/* + * Initialize hash links for inodes and dnodes. + */ +cd9660_init() +{ + register int i; + register union iso_ihead *ih = iso_ihead; +#ifdef ISODEVMAP + register union iso_dhead *dh = iso_dhead; +#endif + + for (i = INOHSZ; --i >= 0; ih++) { + ih->ih_head[0] = ih; + ih->ih_head[1] = ih; + } +#ifdef ISODEVMAP + for (i = DNOHSZ; --i >= 0; dh++) { + dh->dh_head[0] = dh; + dh->dh_head[1] = dh; + } +#endif +} + +#ifdef ISODEVMAP +/* + * Enter a new node into the device hash list + */ +struct iso_dnode * +iso_dmap(dev,ino,create) + dev_t dev; + ino_t ino; + int create; +{ + struct iso_dnode *dp; + union iso_dhead *dh; + + dh = &iso_dhead[DNOHASH(dev, ino)]; + for (dp = dh->dh_chain[0]; + dp != (struct iso_dnode *)dh; + dp = dp->d_forw) + if (ino == dp->i_number && dev == dp->i_dev) + return dp; + + if (!create) + return (struct iso_dnode *)0; + + MALLOC(dp,struct iso_dnode *,sizeof(struct iso_dnode),M_CACHE,M_WAITOK); + dp->i_dev = dev; + dp->i_number = ino; + insque(dp,dh); + + return dp; +} + +void +iso_dunmap(dev) + dev_t dev; +{ + struct iso_dnode *dp, *dq; + union iso_dhead *dh; + + for (dh = 
iso_dhead; dh < iso_dhead + DNOHSZ; dh++) { + for (dp = dh->dh_chain[0]; + dp != (struct iso_dnode *)dh; + dp = dq) { + dq = dp->d_forw; + if (dev == dp->i_dev) { + remque(dp); + FREE(dp,M_CACHE); + } + } + } +} +#endif + +/* + * Look up a ISOFS dinode number to find its incore vnode. + * If it is not in core, read it in from the specified device. + * If it is in core, wait for the lock bit to clear, then + * return the inode locked. Detection and handling of mount + * points must be done by the calling routine. + */ +iso_iget(xp, ino, relocated, ipp, isodir) + struct iso_node *xp; + ino_t ino; + struct iso_node **ipp; + struct iso_directory_record *isodir; +{ + dev_t dev = xp->i_dev; + struct mount *mntp = ITOV(xp)->v_mount; + register struct iso_node *ip, *iq; + register struct vnode *vp; + register struct iso_dnode *dp; + struct vnode *nvp; + struct buf *bp = NULL, *bp2 = NULL; + union iso_ihead *ih; + union iso_dhead *dh; + int i, error, result; + struct iso_mnt *imp; + ino_t defino; + + ih = &iso_ihead[INOHASH(dev, ino)]; +loop: + for (ip = ih->ih_chain[0]; + ip != (struct iso_node *)ih; + ip = ip->i_forw) { + if (ino != ip->i_number || dev != ip->i_dev) + continue; + if ((ip->i_flag&ILOCKED) != 0) { + ip->i_flag |= IWANT; + sleep((caddr_t)ip, PINOD); + goto loop; + } + if (vget(ITOV(ip), 1)) + goto loop; + *ipp = ip; + return 0; + } + /* + * Allocate a new vnode/iso_node. 
+ */ + if (error = getnewvnode(VT_ISOFS, mntp, cd9660_vnodeop_p, &nvp)) { + *ipp = 0; + return error; + } + MALLOC(ip, struct iso_node *, sizeof(struct iso_node), + M_ISOFSNODE, M_WAITOK); + bzero((caddr_t)ip, sizeof(struct iso_node)); + nvp->v_data = ip; + ip->i_vnode = nvp; + ip->i_flag = 0; + ip->i_devvp = 0; + ip->i_diroff = 0; + ip->i_lockf = 0; + + /* + * Put it onto its hash chain and lock it so that other requests for + * this inode will block if they arrive while we are sleeping waiting + * for old data structures to be purged or for the contents of the + * disk portion of this inode to be read. + */ + ip->i_dev = dev; + ip->i_number = ino; + insque(ip, ih); + ISO_ILOCK(ip); + + imp = VFSTOISOFS (mntp); + ip->i_mnt = imp; + ip->i_devvp = imp->im_devvp; + VREF(ip->i_devvp); + + if (relocated) { + /* + * On relocated directories we must + * read the `.' entry out of a dir. + */ + ip->iso_start = ino >> imp->im_bshift; + if (error = iso_blkatoff(ip,0,&bp)) { + vrele(ip->i_devvp); + remque(ip); + ip->i_forw = ip; + ip->i_back = ip; + iso_iput(ip); + *ipp = 0; + return error; + } + isodir = (struct iso_directory_record *)bp->b_un.b_addr; + } + + ip->iso_extent = isonum_733(isodir->extent); + ip->i_size = isonum_733(isodir->size); + ip->iso_start = isonum_711(isodir->ext_attr_length) + ip->iso_extent; + + vp = ITOV(ip); + + /* + * Setup time stamp, attribute + */ + vp->v_type = VNON; + switch (imp->iso_ftype) { + default: /* ISO_FTYPE_9660 */ + if ((imp->im_flags&ISOFSMNT_EXTATT) + && isonum_711(isodir->ext_attr_length)) + iso_blkatoff(ip,-isonum_711(isodir->ext_attr_length), + &bp2); + cd9660_defattr(isodir,ip,bp2 ); + cd9660_deftstamp(isodir,ip,bp2 ); + break; + case ISO_FTYPE_RRIP: + result = cd9660_rrip_analyze(isodir,ip,imp); + break; + } + if (bp2) + brelse(bp2); + if (bp) + brelse(bp); + + /* + * Initialize the associated vnode + */ + vp->v_type = IFTOVT(ip->inode.iso_mode); + + if ( vp->v_type == VFIFO ) { +#ifdef FIFO + extern int (**cd9660_fifoop_p)(); 
+ vp->v_op = cd9660_fifoop_p; +#else + iso_iput(ip); + *ipp = 0; + return EOPNOTSUPP; +#endif /* FIFO */ + } else if ( vp->v_type == VCHR || vp->v_type == VBLK ) { + extern int (**cd9660_specop_p)(); + + /* + * if device, look at device number table for translation + */ +#ifdef ISODEVMAP + if (dp = iso_dmap(dev,ino,0)) + ip->inode.iso_rdev = dp->d_dev; +#endif + vp->v_op = cd9660_specop_p; + if (nvp = checkalias(vp, ip->inode.iso_rdev, mntp)) { + /* + * Reinitialize aliased inode. + */ + vp = nvp; + iq = VTOI(vp); + iq->i_vnode = vp; + iq->i_flag = 0; + ISO_ILOCK(iq); + iq->i_dev = dev; + iq->i_number = ino; + iq->i_mnt = ip->i_mnt; + bcopy(&ip->iso_extent,&iq->iso_extent, + (char *)(ip + 1) - (char *)&ip->iso_extent); + insque(iq, ih); + /* + * Discard unneeded vnode + * (This introduces the need of INACTIVE modification) + */ + ip->inode.iso_mode = 0; + iso_iput(ip); + ip = iq; + } + } + + if (ip->iso_extent == imp->root_extent) + vp->v_flag |= VROOT; + + *ipp = ip; + return 0; +} + +/* + * Unlock and decrement the reference count of an inode structure. + */ +iso_iput(ip) + register struct iso_node *ip; +{ + + if ((ip->i_flag & ILOCKED) == 0) + panic("iso_iput"); + ISO_IUNLOCK(ip); + vrele(ITOV(ip)); +} + +/* + * Last reference to an inode, write the inode out and if necessary, + * truncate and deallocate the file. + */ +int +cd9660_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + register struct iso_node *ip = VTOI(vp); + int mode, error = 0; + + if (prtactive && vp->v_usecount != 0) + vprint("cd9660_inactive: pushing active", vp); + + ip->i_flag = 0; + /* + * If we are done with the inode, reclaim it + * so that it can be reused immediately. + */ + if (vp->v_usecount == 0 && ip->inode.iso_mode == 0) + vgone(vp); + return error; +} + +/* + * Reclaim an inode so that it can be used for other purposes. 
+ */ +int +cd9660_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct iso_node *ip = VTOI(vp); + int i; + + if (prtactive && vp->v_usecount != 0) + vprint("cd9660_reclaim: pushing active", vp); + /* + * Remove the inode from its hash chain. + */ + remque(ip); + ip->i_forw = ip; + ip->i_back = ip; + /* + * Purge old data structures associated with the inode. + */ + cache_purge(vp); + if (ip->i_devvp) { + vrele(ip->i_devvp); + ip->i_devvp = 0; + } + FREE(vp->v_data, M_ISOFSNODE); + vp->v_data = NULL; + return 0; +} + +/* + * Lock an inode. If its already locked, set the WANT bit and sleep. + */ +iso_ilock(ip) + register struct iso_node *ip; +{ + + while (ip->i_flag & ILOCKED) { + ip->i_flag |= IWANT; + if (ip->i_spare0 == curproc->p_pid) + panic("locking against myself"); + ip->i_spare1 = curproc->p_pid; + (void) sleep((caddr_t)ip, PINOD); + } + ip->i_spare1 = 0; + ip->i_spare0 = curproc->p_pid; + ip->i_flag |= ILOCKED; +} + +/* + * Unlock an inode. If WANT bit is on, wakeup. + */ +iso_iunlock(ip) + register struct iso_node *ip; +{ + + if ((ip->i_flag & ILOCKED) == 0) + vprint("iso_iunlock: unlocked inode", ITOV(ip)); + ip->i_spare0 = 0; + ip->i_flag &= ~ILOCKED; + if (ip->i_flag&IWANT) { + ip->i_flag &= ~IWANT; + wakeup((caddr_t)ip); + } +} + +/* + * File attributes + */ +void +cd9660_defattr(isodir,inop,bp) + struct iso_directory_record *isodir; + struct iso_node *inop; + struct buf *bp; +{ + struct buf *bp2 = NULL; + struct iso_mnt *imp; + struct iso_extended_attributes *ap = NULL; + int off; + + if (isonum_711(isodir->flags)&2) { + inop->inode.iso_mode = S_IFDIR; + /* + * If we return 2, fts() will assume there are no subdirectories + * (just links for the path and .), so instead we return 1. 
+ */ + inop->inode.iso_links = 1; + } else { + inop->inode.iso_mode = S_IFREG; + inop->inode.iso_links = 1; + } + if (!bp + && ((imp = inop->i_mnt)->im_flags&ISOFSMNT_EXTATT) + && (off = isonum_711(isodir->ext_attr_length))) { + iso_blkatoff(inop,-off * imp->logical_block_size,&bp2); + bp = bp2; + } + if (bp) { + ap = (struct iso_extended_attributes *)bp->b_un.b_addr; + + if (isonum_711(ap->version) == 1) { + if (!(ap->perm[0]&0x40)) + inop->inode.iso_mode |= VEXEC >> 6; + if (!(ap->perm[0]&0x10)) + inop->inode.iso_mode |= VREAD >> 6; + if (!(ap->perm[0]&4)) + inop->inode.iso_mode |= VEXEC >> 3; + if (!(ap->perm[0]&1)) + inop->inode.iso_mode |= VREAD >> 3; + if (!(ap->perm[1]&0x40)) + inop->inode.iso_mode |= VEXEC; + if (!(ap->perm[1]&0x10)) + inop->inode.iso_mode |= VREAD; + inop->inode.iso_uid = isonum_723(ap->owner); /* what about 0? */ + inop->inode.iso_gid = isonum_723(ap->group); /* what about 0? */ + } else + ap = NULL; + } + if (!ap) { + inop->inode.iso_mode |= VREAD|VEXEC|(VREAD|VEXEC)>>3|(VREAD|VEXEC)>>6; + inop->inode.iso_uid = (uid_t)0; + inop->inode.iso_gid = (gid_t)0; + } + if (bp2) + brelse(bp2); +} + +/* + * Time stamps + */ +void +cd9660_deftstamp(isodir,inop,bp) + struct iso_directory_record *isodir; + struct iso_node *inop; + struct buf *bp; +{ + struct buf *bp2 = NULL; + struct iso_mnt *imp; + struct iso_extended_attributes *ap = NULL; + int off; + + if (!bp + && ((imp = inop->i_mnt)->im_flags&ISOFSMNT_EXTATT) + && (off = isonum_711(isodir->ext_attr_length))) { + iso_blkatoff(inop,-off * imp->logical_block_size,&bp2); + bp = bp2; + } + if (bp) { + ap = (struct iso_extended_attributes *)bp->b_un.b_addr; + + if (isonum_711(ap->version) == 1) { + if (!cd9660_tstamp_conv17(ap->ftime,&inop->inode.iso_atime)) + cd9660_tstamp_conv17(ap->ctime,&inop->inode.iso_atime); + if (!cd9660_tstamp_conv17(ap->ctime,&inop->inode.iso_ctime)) + inop->inode.iso_ctime = inop->inode.iso_atime; + if (!cd9660_tstamp_conv17(ap->mtime,&inop->inode.iso_mtime)) + 
/*
 * Convert a 7-byte ISO 9660 directory-record time stamp to seconds since
 * Jan. 1st, 1970 GMT.
 *
 * pi[0] is the number of years since 1900, pi[1]..pi[5] are month, day,
 * hour, minute and second, and pi[6] is the signed offset from GMT in
 * 15-minute units.  All bytes are read as unsigned quantities (only the
 * offset is sign-extended) so that years from 2028 on do not turn
 * negative on machines where plain char is signed.
 *
 * Returns 1 and stores the result in *pu; for dates before 1970 *pu is
 * zeroed and 0 is returned.
 */
int
cd9660_tstamp_conv7(pi,pu)
	char *pi;
	struct timeval *pu;
{
	unsigned char *p = (unsigned char *)pi;
	int crtime, days;
	int y, m, d, hour, minute, second, tz;

	y = p[0] + 1900;
	m = p[1];
	d = p[2];
	hour = p[3];
	minute = p[4];
	second = p[5];
	/* the GMT offset is the only signed byte of the stamp */
	tz = p[6];
	if (tz > 127)
		tz -= 256;

	if (y < 1970) {
		pu->tv_sec = 0;
		pu->tv_usec = 0;
		return 0;
	} else {
#ifdef ORIGINAL
		/* computes day number relative to Sept. 19th,1989 */
		/* don't even *THINK* about changing formula. It works! */
		days = 367*(y-1980)-7*(y+(m+9)/12)/4-3*((y+(m-9)/7)/100+1)/4+275*m/9+d-100;
#else
		/*
		 * Changed :-) to make it relative to Jan. 1st, 1970
		 * and to disambiguate negative division
		 */
		days = 367*(y-1960)-7*(y+(m+9)/12)/4-3*((y+(m+9)/12-1)/100+1)/4+275*m/9+d-239;
#endif
		crtime = ((((days * 24) + hour) * 60 + minute) * 60) + second;

		/*
		 * The recorded time is local time and tz is its offset
		 * from GMT (positive east of Greenwich), so GMT is local
		 * time minus the offset.
		 * timezone offset is unreliable on some disks
		 */
		if (-48 <= tz && tz <= 52)
			crtime -= tz * 15 * 60;
	}
	pu->tv_sec = crtime;
	pu->tv_usec = 0;
	return 1;
}

/*
 * Convert "len" decimal digit characters starting at "begin" to an
 * unsigned number.  Characters that are not digits (blank padding as in
 * " 1", or NUL bytes of an unrecorded field) count as zero instead of
 * corrupting the result.
 */
static unsigned
cd9660_chars2ui(begin,len)
	unsigned char *begin;
	int len;
{
	unsigned rc;
	unsigned char c;

	for (rc = 0; --len >= 0;) {
		c = *begin++;
		rc *= 10;
		if (c >= '0' && c <= '9')
			rc += c - '0';
	}
	return rc;
}

/*
 * Convert a 17-byte ISO 9660 time stamp (16 digit characters
 * "YYYYMMDDHHMMSShh" followed by one GMT offset byte) by repacking it
 * into the 7-byte form and handing that to cd9660_tstamp_conv7().
 *
 * Years that the one-byte "years since 1900" field cannot hold, or that
 * lie before 1970 (including the all-zero "not specified" stamp), give
 * a zeroed *pu and a return value of 0 rather than wrapping around to
 * an unrelated date.
 */
int
cd9660_tstamp_conv17(pi,pu)
	unsigned char *pi;
	struct timeval *pu;
{
	unsigned char buf[7];
	unsigned year;

	/* year:"0001"-"9999" -> -1900 */
	year = cd9660_chars2ui(pi,4);
	if (year < 1970 || year > 1900 + 255) {
		pu->tv_sec = 0;
		pu->tv_usec = 0;
		return 0;
	}
	buf[0] = year - 1900;

	/* month: " 1"-"12" -> 1 - 12 */
	buf[1] = cd9660_chars2ui(pi + 4,2);

	/* day: " 1"-"31" -> 1 - 31 */
	buf[2] = cd9660_chars2ui(pi + 6,2);

	/* hour: " 0"-"23" -> 0 - 23 */
	buf[3] = cd9660_chars2ui(pi + 8,2);

	/* minute:" 0"-"59" -> 0 - 59 */
	buf[4] = cd9660_chars2ui(pi + 10,2);

	/* second:" 0"-"59" -> 0 - 59 */
	buf[5] = cd9660_chars2ui(pi + 12,2);

	/* difference of GMT */
	buf[6] = pi[16];

	return cd9660_tstamp_conv7((char *)buf,pu);
}
12,2); + + /* difference of GMT */ + buf[6] = pi[16]; + + return cd9660_tstamp_conv7(buf,pu); +} + +void +isodirino(inump,isodir,imp) + ino_t *inump; + struct iso_directory_record *isodir; + struct iso_mnt *imp; +{ + *inump = (isonum_733(isodir->extent) + isonum_711(isodir->ext_attr_length)) + * imp->logical_block_size; +} diff --git a/sys/fs/cd9660/cd9660_node.h b/sys/fs/cd9660/cd9660_node.h new file mode 100644 index 00000000000..45de67f1a6b --- /dev/null +++ b/sys/fs/cd9660/cd9660_node.h @@ -0,0 +1,143 @@ +/*- + * Copyright (c) 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley + * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension + * Support code is derived from software contributed to Berkeley + * by Atsushi Murai (amurai@spec.co.jp). + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)cd9660_node.h 8.2 (Berkeley) 1/23/94 + */ + +/* + * Theoretically, directories can be more than 2Gb in length, + * however, in practice this seems unlikely. So, we define + * the type doff_t as a long to keep down the cost of doing + * lookup on a 32-bit machine. If you are porting to a 64-bit + * architecture, you should make doff_t the same as off_t. 
/* Type of directory offsets; kept at long to make lookups cheap. */
#define doff_t long

/*
 * POSIX-style attributes of a file, filled in either from Rock Ridge
 * fields or from defaults derived from the plain ISO 9660 record.
 * NOTE(review): the time fields are struct timespec here while the
 * conversion routines take a struct timeval pointer -- confirm that the
 * two layouts agree on every supported platform.
 */
typedef struct {
	struct timespec iso_atime; /* time of last access */
	struct timespec iso_mtime; /* time of last modification */
	struct timespec iso_ctime; /* time file changed */
	u_short iso_mode; /* files access mode and type */
	uid_t iso_uid; /* owner user id */
	gid_t iso_gid; /* owner group id */
	short iso_links; /* links of file */
	dev_t iso_rdev; /* Major/Minor number for special */
} ISO_RRIP_INODE;

#ifdef ISODEVMAP
/*
 * For device# (major,minor) translation table
 */
struct iso_dnode {
	struct iso_dnode *d_chain[2]; /* hash chain, MUST be first */
	dev_t i_dev; /* device where dnode resides */
	ino_t i_number; /* the identity of the inode */
	dev_t d_dev; /* device # for translation */
};
#define d_forw d_chain[0]
#define d_back d_chain[1]
#endif

/*
 * In-core inode of a cd9660 file.  The members from iso_extent to the
 * end of the structure are copied as one block when an aliased device
 * inode is reinitialized, so their order matters.
 */
struct iso_node {
	struct iso_node *i_chain[2]; /* hash chain, MUST be first */
	struct vnode *i_vnode; /* vnode associated with this inode */
	struct vnode *i_devvp; /* vnode for block I/O */
	u_long i_flag; /* see below */
	dev_t i_dev; /* device where inode resides */
	ino_t i_number; /* the identity of the inode */
	/* we use the actual starting block of the file */
	struct iso_mnt *i_mnt; /* filesystem associated with this inode */
	struct lockf *i_lockf; /* head of byte-level lock list */
	doff_t i_endoff; /* end of useful stuff in directory */
	doff_t i_diroff; /* offset in dir, where we found last entry */
	doff_t i_offset; /* offset of free space in directory */
	ino_t i_ino; /* inode number of found directory */
	long i_spare0; /* pid of the lock holder (see iso_ilock) */
	long i_spare1; /* pid of the last process waiting for the lock */

	long iso_extent; /* extent of file */
	long i_size;
	long iso_start; /* actual start of data of file (may be different */
	/* from iso_extent, if file has extended attributes) */
	ISO_RRIP_INODE inode;
};

#define i_forw i_chain[0]
#define i_back i_chain[1]

/* flags */
#define ILOCKED 0x0001 /* inode is locked */
#define IWANT 0x0002 /* some process waiting on lock */
#define IACC 0x0020 /* inode access time to be updated */

/* Conversions between a vnode and the cd9660 inode hanging off it. */
#define VTOI(vp) ((struct iso_node *)(vp)->v_data)
#define ITOV(ip) ((ip)->i_vnode)

/* Inode locking; see iso_ilock() and iso_iunlock(). */
#define ISO_ILOCK(ip) iso_ilock(ip)
#define ISO_IUNLOCK(ip) iso_iunlock(ip)

/*
 * Prototypes for ISOFS vnode operations
 */
int cd9660_lookup __P((struct vop_lookup_args *));
int cd9660_open __P((struct vop_open_args *));
int cd9660_close __P((struct vop_close_args *));
int cd9660_access __P((struct vop_access_args *));
int cd9660_getattr __P((struct vop_getattr_args *));
int cd9660_read __P((struct vop_read_args *));
int cd9660_ioctl __P((struct vop_ioctl_args *));
int cd9660_select __P((struct vop_select_args *));
int cd9660_mmap __P((struct vop_mmap_args *));
int cd9660_seek __P((struct vop_seek_args *));
int cd9660_readdir __P((struct vop_readdir_args *));
int cd9660_abortop __P((struct vop_abortop_args *));
int cd9660_inactive __P((struct vop_inactive_args *));
int cd9660_reclaim __P((struct vop_reclaim_args *));
int cd9660_bmap __P((struct vop_bmap_args *));
int cd9660_lock __P((struct vop_lock_args *));
int cd9660_unlock __P((struct vop_unlock_args *));
int cd9660_strategy __P((struct vop_strategy_args *));
int cd9660_print __P((struct vop_print_args *));
int cd9660_islocked __P((struct vop_islocked_args *));
/* Default attributes and time stamps taken from the plain ISO record. */
void cd9660_defattr __P((struct iso_directory_record *,
			struct iso_node *, struct buf *));
void cd9660_deftstamp __P((struct iso_directory_record *,
			struct iso_node *, struct buf *));
#ifdef ISODEVMAP
/* Device number translation table, see struct iso_dnode. */
struct iso_dnode *iso_dmap __P((dev_t, ino_t, int));
void iso_dunmap __P((dev_t));
#endif
+ * + * This code is derived from software contributed to Berkeley + * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension + * Support code is derived from software contributed to Berkeley + * by Atsushi Murai (amurai@spec.co.jp). + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)cd9660_rrip.c 8.2 (Berkeley) 1/23/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +/* + * POSIX file attribute + */ +static int +cd9660_rrip_attr(p,ana) + ISO_RRIP_ATTR *p; + ISO_RRIP_ANALYZE *ana; +{ + ana->inop->inode.iso_mode = isonum_731(p->mode_l); + ana->inop->inode.iso_uid = (uid_t)isonum_731(p->uid_l); + ana->inop->inode.iso_gid = (gid_t)isonum_731(p->gid_l); + ana->inop->inode.iso_links = isonum_731(p->links_l); + ana->fields &= ~ISO_SUSP_ATTR; + return ISO_SUSP_ATTR; +} + +static void +cd9660_rrip_defattr(isodir,ana) + struct iso_directory_record *isodir; + ISO_RRIP_ANALYZE *ana; +{ + /* But this is a required field! */ + printf("RRIP without PX field?\n"); + cd9660_defattr(isodir,ana->inop,NULL); +} + +/* + * Symbolic Links + */ +static int +cd9660_rrip_slink(p,ana) + ISO_RRIP_SLINK *p; + ISO_RRIP_ANALYZE *ana; +{ + register ISO_RRIP_SLINK_COMPONENT *pcomp; + register ISO_RRIP_SLINK_COMPONENT *pcompe; + int len, wlen, cont; + char *outbuf, *inbuf; + + pcomp = (ISO_RRIP_SLINK_COMPONENT *)p->component; + pcompe = (ISO_RRIP_SLINK_COMPONENT *)((char *)p + isonum_711(p->h.length)); + len = *ana->outlen; + outbuf = ana->outbuf; + cont = ana->cont; + + /* + * Gathering a Symbolic name from each component with path + */ + for (; + pcomp < pcompe; + pcomp = (ISO_RRIP_SLINK_COMPONENT *)((char *)pcomp + ISO_RRIP_SLSIZ + + isonum_711(pcomp->clen))) { + + if (!cont) { + if (len < ana->maxlen) { + len++; + *outbuf++ = '/'; + } + } + cont = 0; + + inbuf = ".."; + wlen = 0; + + switch (*pcomp->cflag) { + + case ISO_SUSP_CFLAG_CURRENT: + /* Inserting Current */ + wlen = 1; + break; + + case ISO_SUSP_CFLAG_PARENT: + /* Inserting Parent */ + wlen = 2; + break; + + case ISO_SUSP_CFLAG_ROOT: + /* Inserting slash for ROOT */ + /* start over from beginning(?) 
*/ + outbuf -= len; + len = 0; + break; + + case ISO_SUSP_CFLAG_VOLROOT: + /* Inserting a mount point i.e. "/cdrom" */ + /* same as above */ + outbuf -= len; + len = 0; + inbuf = ana->imp->im_mountp->mnt_stat.f_mntonname; + wlen = strlen(inbuf); + break; + + case ISO_SUSP_CFLAG_HOST: + /* Inserting hostname i.e. "kurt.tools.de" */ + inbuf = hostname; + wlen = hostnamelen; + break; + + case ISO_SUSP_CFLAG_CONTINUE: + cont = 1; + /* fall thru */ + case 0: + /* Inserting component */ + wlen = isonum_711(pcomp->clen); + inbuf = pcomp->name; + break; + default: + printf("RRIP with incorrect flags?"); + wlen = ana->maxlen + 1; + break; + } + + if (len + wlen > ana->maxlen) { + /* indicate error to caller */ + ana->cont = 1; + ana->fields = 0; + ana->outbuf -= *ana->outlen; + *ana->outlen = 0; + return 0; + } + + bcopy(inbuf,outbuf,wlen); + outbuf += wlen; + len += wlen; + + } + ana->outbuf = outbuf; + *ana->outlen = len; + ana->cont = cont; + + if (!isonum_711(p->flags)) { + ana->fields &= ~ISO_SUSP_SLINK; + return ISO_SUSP_SLINK; + } + return 0; +} + +/* + * Alternate name + */ +static int +cd9660_rrip_altname(p,ana) + ISO_RRIP_ALTNAME *p; + ISO_RRIP_ANALYZE *ana; +{ + char *inbuf; + int wlen; + int cont; + + inbuf = ".."; + wlen = 0; + cont = 0; + + switch (*p->flags) { + case ISO_SUSP_CFLAG_CURRENT: + /* Inserting Current */ + wlen = 1; + break; + + case ISO_SUSP_CFLAG_PARENT: + /* Inserting Parent */ + wlen = 2; + break; + + case ISO_SUSP_CFLAG_HOST: + /* Inserting hostname i.e. 
"kurt.tools.de" */ + inbuf = hostname; + wlen = hostnamelen; + break; + + case ISO_SUSP_CFLAG_CONTINUE: + cont = 1; + /* fall thru */ + case 0: + /* Inserting component */ + wlen = isonum_711(p->h.length) - 5; + inbuf = (char *)p + 5; + break; + + default: + printf("RRIP with incorrect NM flags?\n"); + wlen = ana->maxlen + 1; + break; + } + + if ((*ana->outlen += wlen) > ana->maxlen) { + /* treat as no name field */ + ana->fields &= ~ISO_SUSP_ALTNAME; + ana->outbuf -= *ana->outlen - wlen; + *ana->outlen = 0; + return 0; + } + + bcopy(inbuf,ana->outbuf,wlen); + ana->outbuf += wlen; + + if (!cont) { + ana->fields &= ~ISO_SUSP_ALTNAME; + return ISO_SUSP_ALTNAME; + } + return 0; +} + +static void +cd9660_rrip_defname(isodir,ana) + struct iso_directory_record *isodir; + ISO_RRIP_ANALYZE *ana; +{ + strcpy(ana->outbuf,".."); + switch (*isodir->name) { + default: + isofntrans(isodir->name,isonum_711(isodir->name_len), + ana->outbuf,ana->outlen, + 1,isonum_711(isodir->flags)&4); + break; + case 0: + *ana->outlen = 1; + break; + case 1: + *ana->outlen = 2; + break; + } +} + +/* + * Parent or Child Link + */ +static int +cd9660_rrip_pclink(p,ana) + ISO_RRIP_CLINK *p; + ISO_RRIP_ANALYZE *ana; +{ + *ana->inump = isonum_733(p->dir_loc) << ana->imp->im_bshift; + ana->fields &= ~(ISO_SUSP_CLINK|ISO_SUSP_PLINK); + return *p->h.type == 'C' ? 
ISO_SUSP_CLINK : ISO_SUSP_PLINK; +} + +/* + * Relocated directory + */ +static int +cd9660_rrip_reldir(p,ana) + ISO_RRIP_RELDIR *p; + ISO_RRIP_ANALYZE *ana; +{ + /* special hack to make caller aware of RE field */ + *ana->outlen = 0; + ana->fields = 0; + return ISO_SUSP_RELDIR|ISO_SUSP_ALTNAME|ISO_SUSP_CLINK|ISO_SUSP_PLINK; +} + +static int +cd9660_rrip_tstamp(p,ana) + ISO_RRIP_TSTAMP *p; + ISO_RRIP_ANALYZE *ana; +{ + unsigned char *ptime; + + ptime = p->time; + + /* Check a format of time stamp (7bytes/17bytes) */ + if (!(*p->flags&ISO_SUSP_TSTAMP_FORM17)) { + if (*p->flags&ISO_SUSP_TSTAMP_CREAT) + ptime += 7; + + if (*p->flags&ISO_SUSP_TSTAMP_MODIFY) { + cd9660_tstamp_conv7(ptime,&ana->inop->inode.iso_mtime); + ptime += 7; + } else + bzero(&ana->inop->inode.iso_mtime,sizeof(struct timeval)); + + if (*p->flags&ISO_SUSP_TSTAMP_ACCESS) { + cd9660_tstamp_conv7(ptime,&ana->inop->inode.iso_atime); + ptime += 7; + } else + ana->inop->inode.iso_atime = ana->inop->inode.iso_mtime; + + if (*p->flags&ISO_SUSP_TSTAMP_ATTR) + cd9660_tstamp_conv7(ptime,&ana->inop->inode.iso_ctime); + else + ana->inop->inode.iso_ctime = ana->inop->inode.iso_mtime; + + } else { + if (*p->flags&ISO_SUSP_TSTAMP_CREAT) + ptime += 17; + + if (*p->flags&ISO_SUSP_TSTAMP_MODIFY) { + cd9660_tstamp_conv17(ptime,&ana->inop->inode.iso_mtime); + ptime += 17; + } else + bzero(&ana->inop->inode.iso_mtime,sizeof(struct timeval)); + + if (*p->flags&ISO_SUSP_TSTAMP_ACCESS) { + cd9660_tstamp_conv17(ptime,&ana->inop->inode.iso_atime); + ptime += 17; + } else + ana->inop->inode.iso_atime = ana->inop->inode.iso_mtime; + + if (*p->flags&ISO_SUSP_TSTAMP_ATTR) + cd9660_tstamp_conv17(ptime,&ana->inop->inode.iso_ctime); + else + ana->inop->inode.iso_ctime = ana->inop->inode.iso_mtime; + + } + ana->fields &= ~ISO_SUSP_TSTAMP; + return ISO_SUSP_TSTAMP; +} + +static void +cd9660_rrip_deftstamp(isodir,ana) + struct iso_directory_record *isodir; + ISO_RRIP_ANALYZE *ana; +{ + cd9660_deftstamp(isodir,ana->inop,NULL); +} + +/* + 
* POSIX device modes + */ +static int +cd9660_rrip_device(p,ana) + ISO_RRIP_DEVICE *p; + ISO_RRIP_ANALYZE *ana; +{ + unsigned high, low; + + high = isonum_733(p->dev_t_high_l); + low = isonum_733(p->dev_t_low_l); + + if ( high == 0 ) { + ana->inop->inode.iso_rdev = makedev( major(low), minor(low) ); + } else { + ana->inop->inode.iso_rdev = makedev( high, minor(low) ); + } + ana->fields &= ~ISO_SUSP_DEVICE; + return ISO_SUSP_DEVICE; +} + +/* + * Flag indicating + */ +static int +cd9660_rrip_idflag(p,ana) + ISO_RRIP_IDFLAG *p; + ISO_RRIP_ANALYZE *ana; +{ + ana->fields &= isonum_711(p->flags)|~0xff; /* don't touch high bits */ + /* special handling of RE field */ + if (ana->fields&ISO_SUSP_RELDIR) + return cd9660_rrip_reldir(p,ana); + + return ISO_SUSP_IDFLAG; +} + +/* + * Continuation pointer + */ +static int +cd9660_rrip_cont(p,ana) + ISO_RRIP_CONT *p; + ISO_RRIP_ANALYZE *ana; +{ + ana->iso_ce_blk = isonum_733(p->location); + ana->iso_ce_off = isonum_733(p->offset); + ana->iso_ce_len = isonum_733(p->length); + return ISO_SUSP_CONT; +} + +/* + * System Use end + */ +static int +cd9660_rrip_stop(p,ana) + ISO_SUSP_HEADER *p; + ISO_RRIP_ANALYZE *ana; +{ + /* stop analyzing */ + ana->fields = 0; + return ISO_SUSP_STOP; +} + +/* + * Extension reference + */ +static int +cd9660_rrip_extref(p,ana) + ISO_RRIP_EXTREF *p; + ISO_RRIP_ANALYZE *ana; +{ + if (isonum_711(p->len_id) != 10 + || bcmp((char *)p + 8,"RRIP_1991A",10) + || isonum_711(p->version) != 1) + return 0; + ana->fields &= ~ISO_SUSP_EXTREF; + return ISO_SUSP_EXTREF; +} + +typedef struct { + char type[2]; + int (*func)(); + void (*func2)(); + int result; +} RRIP_TABLE; + +static int +cd9660_rrip_loop(isodir,ana,table) + struct iso_directory_record *isodir; + ISO_RRIP_ANALYZE *ana; + RRIP_TABLE *table; +{ + register RRIP_TABLE *ptable; + register ISO_SUSP_HEADER *phead; + register ISO_SUSP_HEADER *pend; + struct buf *bp = NULL; + int i; + char *pwhead; + int result; + + /* + * Note: If name length is odd, + * it will 
be padding 1 byte after the name + */ + pwhead = isodir->name + isonum_711(isodir->name_len); + if (!(isonum_711(isodir->name_len)&1)) + pwhead++; + + /* If it's not the '.' entry of the root dir obey SP field */ + if (*isodir->name != 0 + || isonum_733(isodir->extent) != ana->imp->root_extent) + pwhead += ana->imp->rr_skip; + else + pwhead += ana->imp->rr_skip0; + + phead = (ISO_SUSP_HEADER *)pwhead; + pend = (ISO_SUSP_HEADER *)((char *)isodir + isonum_711(isodir->length)); + + result = 0; + while (1) { + ana->iso_ce_len = 0; + /* + * Note: "pend" should be more than one SUSP header + */ + while (pend >= phead + 1) { + if (isonum_711(phead->version) == 1) { + for (ptable = table; ptable->func; ptable++) { + if (*phead->type == *ptable->type + && phead->type[1] == ptable->type[1]) { + result |= ptable->func(phead,ana); + break; + } + } + if (!ana->fields) + break; + } + /* + * move to next SUSP + * Hopefully this works with newer versions, too + */ + phead = (ISO_SUSP_HEADER *)((char *)phead + isonum_711(phead->length)); + } + + if ( ana->fields && ana->iso_ce_len ) { + if (ana->iso_ce_blk >= ana->imp->volume_space_size + || ana->iso_ce_off + ana->iso_ce_len > ana->imp->logical_block_size + || bread(ana->imp->im_devvp, + ana->iso_ce_blk * ana->imp->logical_block_size / DEV_BSIZE, + ana->imp->logical_block_size,NOCRED,&bp)) + /* what to do now? 
*/ + break; + phead = (ISO_SUSP_HEADER *)(bp->b_un.b_addr + ana->iso_ce_off); + pend = (ISO_SUSP_HEADER *) ((char *)phead + ana->iso_ce_len); + } else + break; + } + if (bp) + brelse(bp); + /* + * If we don't find the Basic SUSP stuffs, just set default value + * ( attribute/time stamp ) + */ + for (ptable = table; ptable->func2; ptable++) + if (!(ptable->result&result)) + ptable->func2(isodir,ana); + + return result; +} + +static RRIP_TABLE rrip_table_analyze[] = { + { "PX", cd9660_rrip_attr, cd9660_rrip_defattr, ISO_SUSP_ATTR }, + { "TF", cd9660_rrip_tstamp, cd9660_rrip_deftstamp, ISO_SUSP_TSTAMP }, + { "PN", cd9660_rrip_device, 0, ISO_SUSP_DEVICE }, + { "RR", cd9660_rrip_idflag, 0, ISO_SUSP_IDFLAG }, + { "CE", cd9660_rrip_cont, 0, ISO_SUSP_CONT }, + { "ST", cd9660_rrip_stop, 0, ISO_SUSP_STOP }, + { "", 0, 0, 0 } +}; + +int +cd9660_rrip_analyze(isodir,inop,imp) + struct iso_directory_record *isodir; + struct iso_node *inop; + struct iso_mnt *imp; +{ + ISO_RRIP_ANALYZE analyze; + + analyze.inop = inop; + analyze.imp = imp; + analyze.fields = ISO_SUSP_ATTR|ISO_SUSP_TSTAMP|ISO_SUSP_DEVICE; + + return cd9660_rrip_loop(isodir,&analyze,rrip_table_analyze); +} + +/* + * Get Alternate Name from 'AL' record + * If either no AL record or 0 length, + * it will be return the translated ISO9660 name, + */ +static RRIP_TABLE rrip_table_getname[] = { + { "NM", cd9660_rrip_altname, cd9660_rrip_defname, ISO_SUSP_ALTNAME }, + { "CL", cd9660_rrip_pclink, 0, ISO_SUSP_CLINK|ISO_SUSP_PLINK }, + { "PL", cd9660_rrip_pclink, 0, ISO_SUSP_CLINK|ISO_SUSP_PLINK }, + { "RE", cd9660_rrip_reldir, 0, ISO_SUSP_RELDIR }, + { "RR", cd9660_rrip_idflag, 0, ISO_SUSP_IDFLAG }, + { "CE", cd9660_rrip_cont, 0, ISO_SUSP_CONT }, + { "ST", cd9660_rrip_stop, 0, ISO_SUSP_STOP }, + { "", 0, 0, 0 } +}; + +int +cd9660_rrip_getname(isodir,outbuf,outlen,inump,imp) + struct iso_directory_record *isodir; + char *outbuf; + u_short *outlen; + ino_t *inump; + struct iso_mnt *imp; +{ + ISO_RRIP_ANALYZE analyze; + 
RRIP_TABLE *tab; + + analyze.outbuf = outbuf; + analyze.outlen = outlen; + analyze.maxlen = NAME_MAX; + analyze.inump = inump; + analyze.imp = imp; + analyze.fields = ISO_SUSP_ALTNAME|ISO_SUSP_RELDIR|ISO_SUSP_CLINK|ISO_SUSP_PLINK; + *outlen = 0; + + tab = rrip_table_getname; + if (*isodir->name == 0 + || *isodir->name == 1) { + cd9660_rrip_defname(isodir,&analyze); + + analyze.fields &= ~ISO_SUSP_ALTNAME; + tab++; + } + + return cd9660_rrip_loop(isodir,&analyze,tab); +} + +/* + * Get Symbolic Name from 'SL' record + * + * Note: isodir should contains SL record! + */ +static RRIP_TABLE rrip_table_getsymname[] = { + { "SL", cd9660_rrip_slink, 0, ISO_SUSP_SLINK }, + { "RR", cd9660_rrip_idflag, 0, ISO_SUSP_IDFLAG }, + { "CE", cd9660_rrip_cont, 0, ISO_SUSP_CONT }, + { "ST", cd9660_rrip_stop, 0, ISO_SUSP_STOP }, + { "", 0, 0, 0 } +}; + +int +cd9660_rrip_getsymname(isodir,outbuf,outlen,imp) + struct iso_directory_record *isodir; + char *outbuf; + u_short *outlen; + struct iso_mnt *imp; +{ + ISO_RRIP_ANALYZE analyze; + + analyze.outbuf = outbuf; + analyze.outlen = outlen; + *outlen = 0; + analyze.maxlen = MAXPATHLEN; + analyze.cont = 1; /* don't start with a slash */ + analyze.imp = imp; + analyze.fields = ISO_SUSP_SLINK; + + return (cd9660_rrip_loop(isodir,&analyze,rrip_table_getsymname)&ISO_SUSP_SLINK); +} + +static RRIP_TABLE rrip_table_extref[] = { + { "ER", cd9660_rrip_extref, 0, ISO_SUSP_EXTREF }, + { "CE", cd9660_rrip_cont, 0, ISO_SUSP_CONT }, + { "ST", cd9660_rrip_stop, 0, ISO_SUSP_STOP }, + { "", 0, 0, 0 } +}; + +/* + * Check for Rock Ridge Extension and return offset of its fields. + * Note: We require the ER field. + */ +int +cd9660_rrip_offset(isodir,imp) + struct iso_directory_record *isodir; + struct iso_mnt *imp; +{ + ISO_RRIP_OFFSET *p; + ISO_RRIP_ANALYZE analyze; + + imp->rr_skip0 = 0; + p = (ISO_RRIP_OFFSET *)(isodir->name + 1); + if (bcmp(p,"SP\7\1\276\357",6)) { + /* Maybe, it's a CDROM XA disc? 
*/ + imp->rr_skip0 = 15; + p = (ISO_RRIP_OFFSET *)((char *)p + 15); + if (bcmp(p,"SP\7\1\276\357",6)) + return -1; + } + + analyze.imp = imp; + analyze.fields = ISO_SUSP_EXTREF; + if (!(cd9660_rrip_loop(isodir,&analyze,rrip_table_extref)&ISO_SUSP_EXTREF)) + return -1; + + return isonum_711(p->skip); +} diff --git a/sys/fs/cd9660/cd9660_rrip.h b/sys/fs/cd9660/cd9660_rrip.h new file mode 100644 index 00000000000..b4017281f06 --- /dev/null +++ b/sys/fs/cd9660/cd9660_rrip.h @@ -0,0 +1,146 @@ +/*- + * Copyright (c) 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley + * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension + * Support code is derived from software contributed to Berkeley + * by Atsushi Murai (amurai@spec.co.jp). + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
/*
 * On-disc layouts of the System Use fields.  The comments 711, 731,
 * 732 and 733 name the isonum_7xx() routine that decodes the member.
 * NOTE(review): the ISODCL() arguments look like first and last byte
 * position of the member within the field -- confirm against the
 * macro's definition.
 */

/* Header common to all System Use fields. */
typedef struct {
	char type [ISODCL ( 0, 1)];
	unsigned char length [ISODCL ( 2, 2)]; /* 711 */
	unsigned char version [ISODCL ( 3, 3)];
} ISO_SUSP_HEADER;

/* "PX": POSIX file attributes. */
typedef struct {
	ISO_SUSP_HEADER h;
	char mode_l [ISODCL ( 4, 7)]; /* 731 */
	char mode_m [ISODCL ( 8, 11)]; /* 732 */
	char links_l [ISODCL ( 12, 15)]; /* 731 */
	char links_m [ISODCL ( 16, 19)]; /* 732 */
	char uid_l [ISODCL ( 20, 23)]; /* 731 */
	char uid_m [ISODCL ( 24, 27)]; /* 732 */
	char gid_l [ISODCL ( 28, 31)]; /* 731 */
	char gid_m [ISODCL ( 32, 35)]; /* 732 */
} ISO_RRIP_ATTR;

/* "PN": POSIX device number. */
typedef struct {
	ISO_SUSP_HEADER h;
	char dev_t_high_l [ISODCL ( 4, 7)]; /* 731 */
	char dev_t_high_m [ISODCL ( 8, 11)]; /* 732 */
	char dev_t_low_l [ISODCL ( 12, 15)]; /* 731 */
	char dev_t_low_m [ISODCL ( 16, 19)]; /* 732 */
} ISO_RRIP_DEVICE;

/* Component flags of symbolic link and alternate name fields. */
#define ISO_SUSP_CFLAG_CONTINUE 0x01
#define ISO_SUSP_CFLAG_CURRENT 0x02
#define ISO_SUSP_CFLAG_PARENT 0x04
#define ISO_SUSP_CFLAG_ROOT 0x08
#define ISO_SUSP_CFLAG_VOLROOT 0x10
#define ISO_SUSP_CFLAG_HOST 0x20

/* One component of a symbolic link; "name" is clen bytes long. */
typedef struct {
	u_char cflag [ISODCL ( 1, 1)];
	u_char clen [ISODCL ( 2, 2)];
	u_char name [0];
} ISO_RRIP_SLINK_COMPONENT;
/* Size of the fixed part (flag and length byte) of a link component. */
#define ISO_RRIP_SLSIZ 2

/* "SL": symbolic link; "component" is the first of its components. */
typedef struct {
	ISO_SUSP_HEADER h;
	u_char flags [ISODCL ( 4, 4)];
	u_char component [ISODCL ( 5, 5)];
} ISO_RRIP_SLINK;

/* "NM": alternate name; the name bytes follow the flags. */
typedef struct {
	ISO_SUSP_HEADER h;
	char flags [ISODCL ( 4, 4)];
} ISO_RRIP_ALTNAME;

/* "CL": child link. */
typedef struct {
	ISO_SUSP_HEADER h;
	char dir_loc [ISODCL ( 4, 11)]; /* 733 */
} ISO_RRIP_CLINK;

/* "PL": parent link. */
typedef struct {
	ISO_SUSP_HEADER h;
	char dir_loc [ISODCL ( 4, 11)]; /* 733 */
} ISO_RRIP_PLINK;

/* "RE": relocated directory; consists of the header only. */
typedef struct {
	ISO_SUSP_HEADER h;
} ISO_RRIP_RELDIR;

/* Flags of the time stamp field. */
#define ISO_SUSP_TSTAMP_FORM17 0x80
#define ISO_SUSP_TSTAMP_FORM7 0x00
#define ISO_SUSP_TSTAMP_CREAT 0x01
#define ISO_SUSP_TSTAMP_MODIFY 0x02
#define ISO_SUSP_TSTAMP_ACCESS 0x04
#define ISO_SUSP_TSTAMP_ATTR 0x08
#define ISO_SUSP_TSTAMP_BACKUP 0x10
#define ISO_SUSP_TSTAMP_EXPIRE 0x20
#define ISO_SUSP_TSTAMP_EFFECT 0x40

/* "TF": time stamps; "time" is the first byte of the recorded stamps. */
typedef struct {
	ISO_SUSP_HEADER h;
	unsigned char flags [ISODCL ( 4, 4)];
	unsigned char time [ISODCL ( 5, 5)];
} ISO_RRIP_TSTAMP;

/* "RR": flags telling which fields are recorded. */
typedef struct {
	ISO_SUSP_HEADER h;
	unsigned char flags [ISODCL ( 4, 4)];
} ISO_RRIP_IDFLAG;

/* "ER": extension reference. */
typedef struct {
	ISO_SUSP_HEADER h;
	char len_id [ISODCL ( 4, 4)];
	char len_des [ISODCL ( 5, 5)];
	char len_src [ISODCL ( 6, 6)];
	char version [ISODCL ( 7, 7)];
} ISO_RRIP_EXTREF;

/* "SP": check bytes and number of bytes to skip in each System Use area. */
typedef struct {
	ISO_SUSP_HEADER h;
	char check [ISODCL ( 4, 5)];
	char skip [ISODCL ( 6, 6)];
} ISO_RRIP_OFFSET;

/* "CE": location of a continuation area. */
typedef struct {
	ISO_SUSP_HEADER h;
	char location [ISODCL ( 4, 11)];
	char offset [ISODCL ( 12, 19)];
	char length [ISODCL ( 20, 27)];
} ISO_RRIP_CONT;
The Rock Ridge Extension + * Support code is derived from software contributed to Berkeley + * by Atsushi Murai (amurai@spec.co.jp). + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)cd9660_util.c 8.1 (Berkeley) 1/21/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* XXX */ +#include /* XXX */ +#include +#include + +#include + +#ifdef __notanymore__ +int +isonum_711 (p) +unsigned char *p; +{ + return (*p); +} + +int +isonum_712 (p) +signed char *p; +{ + return (*p); +} + +int +isonum_721 (p) +unsigned char *p; +{ + /* little endian short */ +#if BYTE_ORDER != LITTLE_ENDIAN + printf ("isonum_721 called on non little-endian machine!\n"); +#endif + + return *(short *)p; +} + +int +isonum_722 (p) +unsigned char *p; +{ + /* big endian short */ +#if BYTE_ORDER != BIG_ENDIAN + printf ("isonum_722 called on non big-endian machine!\n"); +#endif + + return *(short *)p; +} + +int +isonum_723 (p) +unsigned char *p; +{ +#if BYTE_ORDER == BIG_ENDIAN + return isonum_722 (p + 2); +#elif BYTE_ORDER == LITTLE_ENDIAN + return isonum_721 (p); +#else + printf ("isonum_723 unsupported byte order!\n"); + return 0; +#endif +} + +int +isonum_731 (p) +unsigned char *p; +{ + /* little endian long */ +#if BYTE_ORDER != LITTLE_ENDIAN + printf ("isonum_731 called on non little-endian machine!\n"); +#endif + + return *(long *)p; +} + +int +isonum_732 (p) +unsigned char *p; +{ + /* big endian long */ +#if BYTE_ORDER != BIG_ENDIAN + printf ("isonum_732 called on non big-endian machine!\n"); +#endif + + return *(long *)p; +} + +int +isonum_733 (p) +unsigned char *p; +{ +#if BYTE_ORDER == BIG_ENDIAN + return isonum_732 (p + 4); +#elif BYTE_ORDER == LITTLE_ENDIAN + return isonum_731 (p); +#else + printf ("isonum_733 unsupported byte order!\n"); + return 0; +#endif +} +#endif /* __notanymore__ */ + +/* + * translate and compare a filename + * Note: Version number plus ';' may be omitted. 
+ */
+/*
+ * isofncmp: compare the looked-up name fn[0..fnlen) against the directory
+ * record name isofn[0..isolen).
+ *
+ * An upper-case letter in isofn also matches its lower-case form in fn.
+ * When isofn reaches its ';' separator, fn must either end there (NUL
+ * byte: match) or carry ';' followed by decimal digits, in which case the
+ * two version numbers are compared numerically.  When fn is exhausted
+ * first, the rest of isofn must start with ";" or ".;" for a match.
+ *
+ * Returns 0 on a match and non-zero otherwise.
+ */
+int
+isofncmp(unsigned char *fn,int fnlen,unsigned char *isofn,int isolen)
+{
+	int i, j;
+	/*
+	 * Unsigned so that bytes >= 0x80 compare equal to the same byte in
+	 * fn; a plain (possibly signed) char would never match them.
+	 */
+	unsigned char c;
+
+	while (--fnlen >= 0) {
+		if (--isolen < 0)
+			return *fn;
+		if ((c = *isofn++) == ';') {
+			switch (*fn++) {
+			default:
+				return *--fn;
+			case 0:
+				return 0;
+			case ';':
+				break;
+			}
+			/* Version requested by the caller; digits only. */
+			for (i = 0; --fnlen >= 0; i = i * 10 + *fn++ - '0') {
+				if (*fn < '0' || *fn > '9') {
+					return -1;
+				}
+			}
+			/* Version recorded on the disc. */
+			for (j = 0; --isolen >= 0; j = j * 10 + *isofn++ - '0');
+			return i - j;
+		}
+		if (c != *fn) {
+			if (c >= 'A' && c <= 'Z') {
+				if (c + ('a' - 'A') != *fn) {
+					if (*fn >= 'a' && *fn <= 'z')
+						return *fn - ('a' - 'A') - c;
+					else
+						return *fn - c;
+				}
+			} else
+				return *fn - c;
+		}
+		fn++;
+	}
+	if (isolen > 0) {
+		switch (*isofn) {
+		default:
+			return -1;
+		case '.':
+			/* Only look at isofn[1] if it is part of the name. */
+			if (isolen < 2 || isofn[1] != ';')
+				return -1;
+			/* FALLTHROUGH */
+		case ';':
+			return 0;
+		}
+	}
+	return 0;
+}
+
+/*
+ * translate a filename
+ *
+ * Copies infn[0..infnlen) to outfn and stores the number of bytes written
+ * in *outfnlen; outfn is not NUL-terminated here.  If assoc is set,
+ * ASSOCCHAR is written first and counted.  Unless original is set,
+ * upper-case letters are folded to lower case and the copy stops at the
+ * version separator (a ';', or a '.' immediately followed by ';').
+ */
+void
+isofntrans(unsigned char *infn,int infnlen,
+	   unsigned char *outfn,unsigned short *outfnlen,
+	   int original,int assoc)
+{
+	int fnidx = 0;		/* bytes written to outfn */
+	int inidx;		/* bytes consumed from infn */
+
+	if (assoc) {
+		*outfn++ = ASSOCCHAR;
+		fnidx++;
+	}
+	/*
+	 * Bound the loop by the input index: bounding it by the output
+	 * index would drop the last input character whenever the
+	 * ASSOCCHAR prefix has been emitted.
+	 */
+	for (inidx = 0; inidx < infnlen; inidx++, fnidx++) {
+		unsigned char c = *infn++;
+
+		if (!original && c >= 'A' && c <= 'Z')
+			*outfn++ = c + ('a' - 'A');
+		else if (!original && c == '.' &&
+			 inidx + 1 < infnlen && *infn == ';')
+			break;	/* peek only while still inside infn */
+		else if (!original && c == ';')
+			break;
+		else
+			*outfn++ = c;
+	}
+	*outfnlen = fnidx;
+}
diff --git a/sys/fs/cd9660/cd9660_vfsops.c b/sys/fs/cd9660/cd9660_vfsops.c
new file mode 100644
index 00000000000..02dd92af66f
--- /dev/null
+++ b/sys/fs/cd9660/cd9660_vfsops.c
@@ -0,0 +1,681 @@
+/*-
+ * Copyright (c) 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley
+ * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension
+ * Support code is derived from software contributed to Berkeley
+ * by Atsushi Murai (amurai@spec.co.jp).
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)cd9660_vfsops.c 8.3 (Berkeley) 1/31/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +extern int enodev (); + +struct vfsops cd9660_vfsops = { + cd9660_mount, + cd9660_start, + cd9660_unmount, + cd9660_root, + cd9660_quotactl, + cd9660_statfs, + cd9660_sync, + cd9660_vget, + cd9660_fhtovp, + cd9660_vptofh, + cd9660_init, +}; + +/* + * Called by vfs_mountroot when iso is going to be mounted as root. + * + * Name is updated by mount(8) after booting. + */ +#define ROOTNAME "root_device" + +static iso_mountfs(); + +cd9660_mountroot() +{ + register struct mount *mp; + extern struct vnode *rootvp; + struct proc *p = curproc; /* XXX */ + struct iso_mnt *imp; + register struct fs *fs; + u_int size; + int error; + struct iso_args args; + + /* + * Get vnodes for swapdev and rootdev. + */ + if (bdevvp(swapdev, &swapdev_vp) || bdevvp(rootdev, &rootvp)) + panic("cd9660_mountroot: can't setup bdevvp's"); + + mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + mp->mnt_op = &cd9660_vfsops; + mp->mnt_flag = MNT_RDONLY; + args.flags = ISOFSMNT_ROOT; + if (error = iso_mountfs(rootvp, mp, p, &args)) { + free(mp, M_MOUNT); + return (error); + } + if (error = vfs_lock(mp)) { + (void)cd9660_unmount(mp, 0, p); + free(mp, M_MOUNT); + return (error); + } + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mp->mnt_flag |= MNT_ROOTFS; + mp->mnt_vnodecovered = NULLVP; + imp = VFSTOISOFS(mp); + bzero(imp->im_fsmnt, sizeof(imp->im_fsmnt)); + imp->im_fsmnt[0] = '/'; + bcopy((caddr_t)imp->im_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, + MNAMELEN); + (void) copystr(ROOTNAME, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + (void) cd9660_statfs(mp, &mp->mnt_stat, p); + vfs_unlock(mp); + return (0); +} + +/* + * Flag to allow 
forcible unmounting.
+ */
+int iso_doforce = 1;
+
+/*
+ * VFS Operations.
+ *
+ * mount system call
+ *
+ * Copies the iso_args structure in from user space, refuses anything that
+ * is not a read-only mount, and then either updates the export list
+ * (MNT_UPDATE without a device name) or looks up the block device and
+ * hands it to iso_mountfs().  On success the mount-point and device names
+ * are recorded in the mount structure.  Returns 0 or an errno value.
+ */
+cd9660_mount(mp, path, data, ndp, p)
+	register struct mount *mp;
+	char *path;
+	caddr_t data;
+	struct nameidata *ndp;
+	struct proc *p;
+{
+	struct vnode *devvp;
+	struct iso_args args;
+	u_int size;
+	int error;
+	struct iso_mnt *imp;
+
+	if (error = copyin(data, (caddr_t)&args, sizeof (struct iso_args)))
+		return (error);
+
+	if ((mp->mnt_flag & MNT_RDONLY) == 0)
+		return (EROFS);
+
+	/*
+	 * If updating, check whether changing from read-only to
+	 * read/write; if there is no device name, that's all we do.
+	 */
+	if (mp->mnt_flag & MNT_UPDATE) {
+		imp = VFSTOISOFS(mp);
+		if (args.fspec == 0)
+			return (vfs_export(mp, &imp->im_export, &args.export));
+	}
+	/*
+	 * Not an update, or updating the name: look up the name
+	 * and verify that it refers to a sensible block device.
+	 */
+	NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p);
+	if (error = namei(ndp))
+		return (error);
+	devvp = ndp->ni_vp;
+
+	if (devvp->v_type != VBLK) {
+		vrele(devvp);
+		return ENOTBLK;
+	}
+	if (major(devvp->v_rdev) >= nblkdev) {
+		vrele(devvp);
+		return ENXIO;
+	}
+	if ((mp->mnt_flag & MNT_UPDATE) == 0)
+		error = iso_mountfs(devvp, mp, p, &args);
+	else {
+		/* imp was set in the MNT_UPDATE branch above. */
+		if (devvp != imp->im_devvp)
+			error = EINVAL;	/* needs translation */
+		else
+			vrele(devvp);
+	}
+	if (error) {
+		vrele(devvp);
+		return error;
+	}
+	imp = VFSTOISOFS(mp);
+	/* Record both names, zero-filling the unused tail of each buffer. */
+	(void) copyinstr(path, imp->im_fsmnt, sizeof(imp->im_fsmnt)-1, &size);
+	bzero(imp->im_fsmnt + size, sizeof(imp->im_fsmnt) - size);
+	bcopy((caddr_t)imp->im_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname,
+	    MNAMELEN);
+	(void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1,
+	    &size);
+	bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
+	(void) cd9660_statfs(mp, &mp->mnt_stat, p);
+	return 0;
+}
+
+/*
+ * Common code for mount and mountroot
+ *
+ * Opens devvp, scans blocks 16..99 for the primary volume descriptor,
+ * validates the logical block size, allocates and fills the iso_mnt
+ * structure hung off mp->mnt_data, and (unless ISOFSMNT_NORRIP is set in
+ * argp->flags) probes the root directory for Rock Ridge data.
+ * argp->flags may be modified.  Returns 0 or an errno value; on failure
+ * the device is closed again and everything set up here is undone.
+ */
+static iso_mountfs(devvp, mp, p, argp)
+	register struct vnode *devvp;
+	struct mount *mp;
+	struct proc *p;
+	struct iso_args *argp;
+{
+	register struct iso_mnt *isomp = (struct iso_mnt *)0;
+	struct buf *bp = NULL;
+	dev_t dev = devvp->v_rdev;
+	caddr_t base, space;
+	int havepart = 0, blks;
+	int error = EINVAL, i, size;
+	int needclose = 0;
+	int ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
+	extern struct vnode *rootvp;
+	int j;
+	int iso_bsize;
+	int iso_blknum;
+	struct iso_volume_descriptor *vdp;
+	struct iso_primary_descriptor *pri;
+	struct iso_directory_record *rootp;
+	int logical_block_size;
+
+	if (!ronly)
+		return EROFS;
+
+	/*
+	 * Disallow multiple mounts of the same device.
+	 * Disallow mounting of a device that is currently in use
+	 * (except for root, which might share swap device for miniroot).
+	 * Flush out any old buffers remaining from a previous use.
+	 */
+	if (error = vfs_mountedon(devvp))
+		return error;
+	if (vcount(devvp) > 1 && devvp != rootvp)
+		return EBUSY;
+	if (error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0))
+		return (error);
+
+	if (error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p))
+		return error;
+	needclose = 1;
+
+	/* This is the "logical sector size". The standard says this
+	 * should be 2048 or the physical sector size on the device,
+	 * whichever is greater. For now, we'll just use a constant.
+	 */
+	iso_bsize = ISO_DEFAULT_BLOCK_SIZE;
+
+	for (iso_blknum = 16; iso_blknum < 100; iso_blknum++) {
+		if (error = bread (devvp, btodb(iso_blknum * iso_bsize),
+				   iso_bsize, NOCRED, &bp))
+			goto out;
+
+		vdp = (struct iso_volume_descriptor *)bp->b_un.b_addr;
+		if (bcmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) != 0) {
+			error = EINVAL;
+			goto out;
+		}
+
+		if (isonum_711 (vdp->type) == ISO_VD_END) {
+			error = EINVAL;
+			goto out;
+		}
+
+		if (isonum_711 (vdp->type) == ISO_VD_PRIMARY)
+			break;
+		brelse(bp);
+		/*
+		 * Forget the released buffer so that neither the check
+		 * below nor the "out" path touches it again.
+		 */
+		bp = NULL;
+	}
+
+	/*
+	 * bp is still held only if the loop stopped on a primary
+	 * descriptor; otherwise every candidate block was scanned
+	 * without finding one and vdp no longer points at valid data.
+	 */
+	if (bp == NULL) {
+		error = EINVAL;
+		goto out;
+	}
+
+	pri = (struct iso_primary_descriptor *)vdp;
+
+	logical_block_size = isonum_723 (pri->logical_block_size);
+
+	/* Must be a power of two between DEV_BSIZE and MAXBSIZE. */
+	if (logical_block_size < DEV_BSIZE || logical_block_size > MAXBSIZE
+	    || (logical_block_size & (logical_block_size - 1)) != 0) {
+		error = EINVAL;
+		goto out;
+	}
+
+	rootp = (struct iso_directory_record *)pri->root_directory_record;
+
+	isomp = malloc(sizeof *isomp, M_ISOFSMNT, M_WAITOK);
+	bzero((caddr_t)isomp, sizeof *isomp);
+	isomp->logical_block_size = logical_block_size;
+	isomp->volume_space_size = isonum_733 (pri->volume_space_size);
+	bcopy (rootp, isomp->root, sizeof isomp->root);
+	isomp->root_extent = isonum_733 (rootp->extent);
+	isomp->root_size = isonum_733 (rootp->size);
+
+	/* im_bshift = log2(logical_block_size) */
+	isomp->im_bmask = logical_block_size - 1;
+	isomp->im_bshift = 0;
+	while ((1 << isomp->im_bshift) < isomp->logical_block_size)
+		isomp->im_bshift++;
+
+	bp->b_flags |= B_AGE;
+	brelse(bp);
+	bp = NULL;
+	/*
+	 * The descriptor buffer is gone; from here on use the copy of the
+	 * root directory record saved in the mount structure.
+	 */
+	rootp = (struct iso_directory_record *)isomp->root;
+
+	mp->mnt_data = (qaddr_t)isomp;
+	mp->mnt_stat.f_fsid.val[0] = (long)dev;
+	mp->mnt_stat.f_fsid.val[1] = MOUNT_CD9660;
+	mp->mnt_maxsymlinklen = 0;
+	mp->mnt_flag |= MNT_LOCAL;
+	isomp->im_mountp = mp;
+	isomp->im_dev = dev;
+	isomp->im_devvp = devvp;
+
+	devvp->v_specflags |= SI_MOUNTEDON;
+
+	/* Check the Rock Ridge Extension support */
+	if (!(argp->flags & ISOFSMNT_NORRIP)) {
+		if (error = bread (isomp->im_devvp,
+				   (isomp->root_extent + isonum_711(rootp->ext_attr_length))
+				   * isomp->logical_block_size / DEV_BSIZE,
+				   isomp->logical_block_size,NOCRED,&bp))
+			goto out;
+
+		rootp = (struct iso_directory_record *)bp->b_un.b_addr;
+
+		if ((isomp->rr_skip = cd9660_rrip_offset(rootp,isomp)) < 0) {
+			argp->flags |= ISOFSMNT_NORRIP;
+		} else {
+			argp->flags &= ~ISOFSMNT_GENS;
+		}
+
+		/*
+		 * The contents are valid,
+		 * but they will get reread as part of another vnode, so...
+		 */
+		bp->b_flags |= B_AGE;
+		brelse(bp);
+		bp = NULL;
+	}
+	isomp->im_flags = argp->flags&(ISOFSMNT_NORRIP|ISOFSMNT_GENS|ISOFSMNT_EXTATT);
+	switch (isomp->im_flags&(ISOFSMNT_NORRIP|ISOFSMNT_GENS)) {
+	default:
+		isomp->iso_ftype = ISO_FTYPE_DEFAULT;
+		break;
+	case ISOFSMNT_GENS|ISOFSMNT_NORRIP:
+		isomp->iso_ftype = ISO_FTYPE_9660;
+		break;
+	case 0:
+		isomp->iso_ftype = ISO_FTYPE_RRIP;
+		break;
+	}
+
+	return 0;
+out:
+	if (bp)
+		brelse(bp);
+	if (needclose)
+		(void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, NOCRED, p);
+	if (isomp) {
+		/*
+		 * The device may already have been marked as mounted on;
+		 * undo that so a later mount attempt is not refused.
+		 */
+		devvp->v_specflags &= ~SI_MOUNTEDON;
+		free((caddr_t)isomp, M_ISOFSMNT);
+		mp->mnt_data = (qaddr_t)0;
+	}
+	return error;
+}
+
+/*
+ * Make a filesystem operational.
+ * Nothing to do at the moment.
+ */ +/* ARGSUSED */ +cd9660_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + return 0; +} + +/* + * unmount system call + */ +int +cd9660_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + register struct iso_mnt *isomp; + int i, error, ronly, flags = 0; + + if (mntflags & MNT_FORCE) { + if (!iso_doforce || (mp->mnt_flag & MNT_ROOTFS)) + return (EINVAL); + flags |= FORCECLOSE; + } +#if 0 + mntflushbuf(mp, 0); + if (mntinvalbuf(mp)) + return EBUSY; +#endif + if (error = vflush(mp, NULLVP, flags)) + return (error); + + isomp = VFSTOISOFS(mp); + +#ifdef ISODEVMAP + if (isomp->iso_ftype == ISO_FTYPE_RRIP) + iso_dunmap(isomp->im_dev); +#endif + + isomp->im_devvp->v_specflags &= ~SI_MOUNTEDON; + error = VOP_CLOSE(isomp->im_devvp, FREAD, NOCRED, p); + vrele(isomp->im_devvp); + free((caddr_t)isomp, M_ISOFSMNT); + mp->mnt_data = (qaddr_t)0; + mp->mnt_flag &= ~MNT_LOCAL; + return (error); +} + +/* + * Return root of a filesystem + */ +cd9660_root(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + register struct iso_node *ip; + struct iso_node tip, *nip; + struct vnode tvp; + int error; + struct iso_mnt *imp = VFSTOISOFS (mp); + struct iso_directory_record *dp; + + tvp.v_mount = mp; + tvp.v_data = &tip; + ip = VTOI(&tvp); + ip->i_vnode = &tvp; + ip->i_dev = imp->im_dev; + ip->i_diroff = 0; + dp = (struct iso_directory_record *)imp->root; + isodirino(&ip->i_number,dp,imp); + + /* + * With RRIP we must use the `.' entry of the root directory. + * Simply tell iget, that it's a relocated directory. 
+ */ + error = iso_iget(ip,ip->i_number, + imp->iso_ftype == ISO_FTYPE_RRIP, + &nip,dp); + if (error) + return error; + *vpp = ITOV(nip); + return 0; +} + +/* + * Do operations associated with quotas, not supported + */ +/* ARGSUSED */ +int +cd9660_quotactl(mp, cmd, uid, arg, p) + struct mount *mp; + int cmd; + uid_t uid; + caddr_t arg; + struct proc *p; +{ + + return (EOPNOTSUPP); +} + +/* + * Get file system statistics. + */ +cd9660_statfs(mp, sbp, p) + struct mount *mp; + register struct statfs *sbp; + struct proc *p; +{ + register struct iso_mnt *isomp; + register struct fs *fs; + + isomp = VFSTOISOFS(mp); + + sbp->f_type = MOUNT_CD9660; + sbp->f_bsize = isomp->logical_block_size; + sbp->f_iosize = sbp->f_bsize; /* XXX */ + sbp->f_blocks = isomp->volume_space_size; + sbp->f_bfree = 0; /* total free blocks */ + sbp->f_bavail = 0; /* blocks free for non superuser */ + sbp->f_files = 0; /* total files */ + sbp->f_ffree = 0; /* free file nodes */ + if (sbp != &mp->mnt_stat) { + bcopy((caddr_t)mp->mnt_stat.f_mntonname, + (caddr_t)&sbp->f_mntonname[0], MNAMELEN); + bcopy((caddr_t)mp->mnt_stat.f_mntfromname, + (caddr_t)&sbp->f_mntfromname[0], MNAMELEN); + } + /* Use the first spare for flags: */ + sbp->f_spare[0] = isomp->im_flags; + return 0; +} + +/* ARGSUSED */ +int +cd9660_sync(mp, waitfor, cred, p) + struct mount *mp; + int waitfor; + struct ucred *cred; + struct proc *p; +{ + return (0); +} + +/* + * Flat namespace lookup. + * Currently unsupported. 
+ */
+/* ARGSUSED */
+int
+cd9660_vget(mp, ino, vpp)
+	struct mount *mp;
+	ino_t ino;
+	struct vnode **vpp;
+{
+
+	return (EOPNOTSUPP);
+}
+
+/*
+ * File handle to vnode
+ *
+ * Have to be really careful about stale file handles:
+ * - check that the inode number is in range
+ * - call iget() to get the locked inode
+ * - check for an unallocated inode (i_mode == 0)
+ * - check that the generation number matches
+ */
+
+/*
+ * Layout of a cd9660 file handle as filled in by cd9660_vptofh():
+ * ifid_ino is the inode number and ifid_start the file's start value,
+ * which cd9660_fhtovp() re-checks against the directory record.
+ */
+struct ifid {
+	ushort ifid_len;
+	ushort ifid_pad;
+	int ifid_ino;
+	long ifid_start;
+};
+
+/*
+ * Translate the file handle fhp into a vnode returned through *vpp.
+ *
+ * The handle's inode number is split into a logical block and an offset
+ * within it; the directory record found there must lie inside the block
+ * and must describe the start value stored in the handle.  Export flags
+ * and anonymous credentials for the client address nam are returned
+ * through exflagsp and credanonp.  Returns 0, EACCES if the client is
+ * not in the export list, ESTALE for a handle that fails validation, or
+ * the error from bread()/iso_iget().
+ */
+/* ARGSUSED */
+int
+cd9660_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp)
+	register struct mount *mp;
+	struct fid *fhp;
+	struct mbuf *nam;
+	struct vnode **vpp;
+	int *exflagsp;
+	struct ucred **credanonp;
+{
+	struct vnode tvp;
+	int error;
+	int lbn, off;
+	int reclen;
+	long start;
+	struct ifid *ifhp;
+	struct iso_mnt *imp;
+	struct buf *bp;
+	struct iso_directory_record *dirp;
+	struct iso_node tip, *ip, *nip;
+	struct netcred *np;
+	u_char dirbuf[256];	/* private copy of the directory record */
+
+	imp = VFSTOISOFS (mp);
+	ifhp = (struct ifid *)fhp;
+
+#ifdef ISOFS_DBG
+	printf("fhtovp: ino %d, start %ld\n",
+	       ifhp->ifid_ino, ifhp->ifid_start);
+#endif
+
+	np = vfs_export_lookup(mp, &imp->im_export, nam);
+	if (np == NULL)
+		return (EACCES);
+
+	/*
+	 * The handle is supplied by the caller; a negative inode number
+	 * would slip past the upper-bound check below.
+	 */
+	if (ifhp->ifid_ino < 0)
+		return (ESTALE);
+
+	lbn = iso_lblkno(imp, ifhp->ifid_ino);
+	if (lbn >= imp->volume_space_size) {
+		printf("fhtovp: lbn exceed volume space %d\n", lbn);
+		return (ESTALE);
+	}
+
+	off = iso_blkoff(imp, ifhp->ifid_ino);
+	if (off + ISO_DIRECTORY_RECORD_SIZE > imp->logical_block_size) {
+		printf("fhtovp: crosses block boundary %d\n",
+		       off + ISO_DIRECTORY_RECORD_SIZE);
+		return (ESTALE);
+	}
+
+	error = bread(imp->im_devvp, btodb(lbn * imp->logical_block_size),
+		      imp->logical_block_size, NOCRED, &bp);
+	if (error) {
+		printf("fhtovp: bread error %d\n",error);
+		brelse(bp);
+		return (error);
+	}
+
+	dirp = (struct iso_directory_record *)(bp->b_un.b_addr + off);
+	reclen = isonum_711(dirp->length);
+	if (off + reclen > imp->logical_block_size) {
+		brelse(bp);
+		printf("fhtovp: directory crosses block boundary %d[off=%d/len=%d]\n",
+		       off+reclen, off,
+		       reclen);
+		return (ESTALE);
+	}
+
+	start = isonum_733(dirp->extent) + isonum_711(dirp->ext_attr_length);
+	if (start != ifhp->ifid_start) {
+		brelse(bp);
+		/* ifid_start is a long, so print both values with %ld. */
+		printf("fhtovp: file start miss %ld vs %ld\n",
+		       start,
+		       ifhp->ifid_start);
+		return (ESTALE);
+	}
+
+	/*
+	 * iso_iget() still needs the directory record, so take a private
+	 * copy before giving the buffer back instead of handing it a
+	 * pointer into a released buffer.  A record shorter than the
+	 * fixed part, or one that does not fit the copy, is rejected.
+	 */
+	if (reclen < ISO_DIRECTORY_RECORD_SIZE ||
+	    reclen > (int)sizeof(dirbuf)) {
+		brelse(bp);
+		return (ESTALE);
+	}
+	bcopy((caddr_t)dirp, (caddr_t)dirbuf, reclen);
+	brelse(bp);
+	dirp = (struct iso_directory_record *)dirbuf;
+
+	/* Scratch inode/vnode pair passed to iso_iget(). */
+	ip = &tip;
+	tvp.v_mount = mp;
+	tvp.v_data = ip;
+	ip->i_vnode = &tvp;
+	ip->i_dev = imp->im_dev;
+	if (error = iso_iget(ip, ifhp->ifid_ino, 0, &nip, dirp)) {
+		*vpp = NULLVP;
+		printf("fhtovp: failed to get inode\n");
+		return (error);
+	}
+	ip = nip;
+	/*
+	 * XXX need generation number?
+	 */
+	if (ip->inode.iso_mode == 0) {
+		iso_iput(ip);
+		*vpp = NULLVP;
+		printf("fhtovp: inode mode == 0\n");
+		return (ESTALE);
+	}
+	*vpp = ITOV(ip);
+	*exflagsp = np->netc_exflags;
+	*credanonp = &np->netc_anon;
+	return 0;
+}
+
+/*
+ * Vnode pointer to File handle
+ *
+ * Fills fhp (viewed as a struct ifid) with the inode number and start
+ * value of vp.  Always returns 0.
+ */
+/* ARGSUSED */
+cd9660_vptofh(vp, fhp)
+	struct vnode *vp;
+	struct fid *fhp;
+{
+	register struct iso_node *ip = VTOI(vp);
+	register struct ifid *ifhp;
+
+	ifhp = (struct ifid *)fhp;
+	ifhp->ifid_len = sizeof(struct ifid);
+	/* Do not hand out whatever happened to be in the padding. */
+	ifhp->ifid_pad = 0;
+
+	ifhp->ifid_ino = ip->i_number;
+	ifhp->ifid_start = ip->iso_start;
+
+#ifdef ISOFS_DBG
+	printf("vptofh: ino %d, start %ld\n",
+	       ifhp->ifid_ino,ifhp->ifid_start);
+#endif
+	return 0;
+}
diff --git a/sys/fs/cd9660/cd9660_vnops.c b/sys/fs/cd9660/cd9660_vnops.c
new file mode 100644
index 00000000000..59f5a73f5c8
--- /dev/null
+++ b/sys/fs/cd9660/cd9660_vnops.c
@@ -0,0 +1,1038 @@
+/*-
+ * Copyright (c) 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley
+ * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension
+ * Support code is derived from software contributed to Berkeley
+ * by Atsushi Murai (amurai@spec.co.jp).
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)cd9660_vnops.c 8.3 (Berkeley) 1/23/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#if 0 +/* + * Mknod vnode call + * Actually remap the device number + */ +cd9660_mknod(ndp, vap, cred, p) + struct nameidata *ndp; + struct ucred *cred; + struct vattr *vap; + struct proc *p; +{ +#ifndef ISODEVMAP + free(ndp->ni_pnbuf, M_NAMEI); + vput(ndp->ni_dvp); + vput(ndp->ni_vp); + return EINVAL; +#else + register struct vnode *vp; + struct iso_node *ip; + struct iso_dnode *dp; + int error; + + vp = ndp->ni_vp; + ip = VTOI(vp); + + if (ip->i_mnt->iso_ftype != ISO_FTYPE_RRIP + || vap->va_type != vp->v_type + || (vap->va_type != VCHR && vap->va_type != VBLK)) { + free(ndp->ni_pnbuf, M_NAMEI); + vput(ndp->ni_dvp); + vput(ndp->ni_vp); + return EINVAL; + } + + dp = iso_dmap(ip->i_dev,ip->i_number,1); + if (ip->inode.iso_rdev == vap->va_rdev || vap->va_rdev == VNOVAL) { + /* same as the unmapped one, delete the mapping */ + remque(dp); + FREE(dp,M_CACHE); + } else + /* enter new mapping */ + dp->d_dev = vap->va_rdev; + + /* + * Remove inode so that it will be reloaded by iget and + * checked to see if it is an alias of an existing entry + * in the inode cache. + */ + vput(vp); + vp->v_type = VNON; + vgone(vp); + return (0); +#endif +} +#endif + +/* + * Open called. + * + * Nothing to do. + */ +/* ARGSUSED */ +int +cd9660_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + return (0); +} + +/* + * Close called + * + * Update the times on the inode on writeable file systems. + */ +/* ARGSUSED */ +int +cd9660_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + return (0); +} + +/* + * Check mode permission on inode pointer. Mode is READ, WRITE or EXEC. 
+ * The mode is shifted to select the owner/group/other fields. The + * super user is granted all permissions. + */ +/* ARGSUSED */ +cd9660_access(ap) + struct vop_access_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + return (0); +} + +cd9660_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; + +{ + struct vnode *vp = ap->a_vp; + register struct vattr *vap = ap->a_vap; + register struct iso_node *ip = VTOI(vp); + int i; + + vap->va_fsid = ip->i_dev; + vap->va_fileid = ip->i_number; + + vap->va_mode = ip->inode.iso_mode; + vap->va_nlink = ip->inode.iso_links; + vap->va_uid = ip->inode.iso_uid; + vap->va_gid = ip->inode.iso_gid; + vap->va_atime = ip->inode.iso_atime; + vap->va_mtime = ip->inode.iso_mtime; + vap->va_ctime = ip->inode.iso_ctime; + vap->va_rdev = ip->inode.iso_rdev; + + vap->va_size = (u_quad_t) ip->i_size; + vap->va_flags = 0; + vap->va_gen = 1; + vap->va_blocksize = ip->i_mnt->logical_block_size; + vap->va_bytes = (u_quad_t) ip->i_size; + vap->va_type = vp->v_type; + return (0); +} + +#if ISO_DEFAULT_BLOCK_SIZE >= NBPG +#ifdef DEBUG +extern int doclusterread; +#else +#define doclusterread 1 +#endif +#else +/* XXX until cluster routines can handle block sizes less than one page */ +#define doclusterread 0 +#endif + +/* + * Vnode op for reading. 
+ */ +cd9660_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + register struct uio *uio = ap->a_uio; + register struct iso_node *ip = VTOI(vp); + register struct iso_mnt *imp; + struct buf *bp; + daddr_t lbn, bn, rablock; + off_t diff; + int rasize, error = 0; + long size, n, on; + + if (uio->uio_resid == 0) + return (0); + if (uio->uio_offset < 0) + return (EINVAL); + ip->i_flag |= IACC; + imp = ip->i_mnt; + do { + lbn = iso_lblkno(imp, uio->uio_offset); + on = iso_blkoff(imp, uio->uio_offset); + n = min((unsigned)(imp->logical_block_size - on), + uio->uio_resid); + diff = (off_t)ip->i_size - uio->uio_offset; + if (diff <= 0) + return (0); + if (diff < n) + n = diff; + size = iso_blksize(imp, ip, lbn); + rablock = lbn + 1; + if (doclusterread) { + if (iso_lblktosize(imp, rablock) <= ip->i_size) + error = cluster_read(vp, (off_t)ip->i_size, + lbn, size, NOCRED, &bp); + else + error = bread(vp, lbn, size, NOCRED, &bp); + } else { + if (vp->v_lastr + 1 == lbn && + iso_lblktosize(imp, rablock) < ip->i_size) { + rasize = iso_blksize(imp, ip, rablock); + error = breadn(vp, lbn, size, &rablock, + &rasize, 1, NOCRED, &bp); + } else + error = bread(vp, lbn, size, NOCRED, &bp); + } + vp->v_lastr = lbn; + n = min(n, size - bp->b_resid); + if (error) { + brelse(bp); + return (error); + } + + error = uiomove(bp->b_un.b_addr + on, (int)n, uio); + if (n + on == imp->logical_block_size || + uio->uio_offset == (off_t)ip->i_size) + bp->b_flags |= B_AGE; + brelse(bp); + } while (error == 0 && uio->uio_resid > 0 && n != 0); + return (error); +} + +/* ARGSUSED */ +int +cd9660_ioctl(ap) + struct vop_ioctl_args /* { + struct vnode *a_vp; + int a_command; + caddr_t a_data; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + printf("You did ioctl for isofs !!\n"); + return (ENOTTY); +} + +/* ARGSUSED */ +int +cd9660_select(ap) + struct 
vop_select_args /* { + struct vnode *a_vp; + int a_which; + int a_fflags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + /* + * We should really check to see if I/O is possible. + */ + return (1); +} + +/* + * Mmap a file + * + * NB Currently unsupported. + */ +/* ARGSUSED */ +int +cd9660_mmap(ap) + struct vop_mmap_args /* { + struct vnode *a_vp; + int a_fflags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + return (EINVAL); +} + +/* + * Seek on a file + * + * Nothing to do, so just return. + */ +/* ARGSUSED */ +int +cd9660_seek(ap) + struct vop_seek_args /* { + struct vnode *a_vp; + off_t a_oldoff; + off_t a_newoff; + struct ucred *a_cred; + } */ *ap; +{ + + return (0); +} + +/* + * Structure for reading directories + */ +struct isoreaddir { + struct dirent saveent; + struct dirent assocent; + struct dirent current; + off_t saveoff; + off_t assocoff; + off_t curroff; + struct uio *uio; + off_t uio_off; + u_int *cookiep; + int ncookies; + int eof; +}; + +static int +iso_uiodir(idp,dp,off) + struct isoreaddir *idp; + struct dirent *dp; + off_t off; +{ + int error; + + dp->d_name[dp->d_namlen] = 0; + dp->d_reclen = DIRSIZ(dp); + + if (idp->uio->uio_resid < dp->d_reclen) { + idp->eof = 0; + return -1; + } + + if (idp->cookiep) { + if (idp->ncookies <= 0) { + idp->eof = 0; + return -1; + } + + *idp->cookiep++ = off; + --idp->ncookies; + } + + if (error = uiomove(dp,dp->d_reclen,idp->uio)) + return error; + idp->uio_off = off; + return 0; +} + +static int +iso_shipdir(idp) + struct isoreaddir *idp; +{ + struct dirent *dp; + int cl, sl, assoc; + int error; + char *cname, *sname; + + cl = idp->current.d_namlen; + cname = idp->current.d_name; + if (assoc = cl > 1 && *cname == ASSOCCHAR) { + cl--; + cname++; + } + + dp = &idp->saveent; + sname = dp->d_name; + if (!(sl = dp->d_namlen)) { + dp = &idp->assocent; + sname = dp->d_name + 1; + sl = dp->d_namlen - 1; + } + if (sl > 0) { + if (sl != cl + || bcmp(sname,cname,sl)) { + if 
(idp->assocent.d_namlen) { + if (error = iso_uiodir(idp,&idp->assocent,idp->assocoff)) + return error; + idp->assocent.d_namlen = 0; + } + if (idp->saveent.d_namlen) { + if (error = iso_uiodir(idp,&idp->saveent,idp->saveoff)) + return error; + idp->saveent.d_namlen = 0; + } + } + } + idp->current.d_reclen = DIRSIZ(&idp->current); + if (assoc) { + idp->assocoff = idp->curroff; + bcopy(&idp->current,&idp->assocent,idp->current.d_reclen); + } else { + idp->saveoff = idp->curroff; + bcopy(&idp->current,&idp->saveent,idp->current.d_reclen); + } + return 0; +} + +/* + * Vnode op for readdir + * XXX make sure everything still works now that eofflagp and cookiep + * are no longer args. + */ +int +cd9660_readdir(ap) + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + register struct uio *uio = ap->a_uio; + struct isoreaddir *idp; + int entryoffsetinblock; + int error = 0; + int endsearch; + struct iso_directory_record *ep; + u_short elen; + int reclen; + struct iso_mnt *imp; + struct iso_node *ip; + struct buf *bp = NULL; + + ip = VTOI(ap->a_vp); + imp = ip->i_mnt; + + MALLOC(idp,struct isoreaddir *,sizeof(*idp),M_TEMP,M_WAITOK); + idp->saveent.d_namlen = 0; + idp->assocent.d_namlen = 0; + idp->uio = uio; +#if 0 + idp->cookiep = cookies; + idp->ncookies = ncookies; + idp->eof = 1; +#else + idp->cookiep = 0; +#endif + idp->curroff = uio->uio_offset; + + entryoffsetinblock = iso_blkoff(imp, idp->curroff); + if (entryoffsetinblock != 0) { + if (error = iso_blkatoff(ip, idp->curroff, &bp)) { + FREE(idp,M_TEMP); + return (error); + } + } + + endsearch = ip->i_size; + + while (idp->curroff < endsearch) { + /* + * If offset is on a block boundary, + * read the next directory block. + * Release previous if it exists. 
+ */ + + if (iso_blkoff(imp, idp->curroff) == 0) { + if (bp != NULL) + brelse(bp); + if (error = iso_blkatoff(ip, idp->curroff, &bp)) + break; + entryoffsetinblock = 0; + } + /* + * Get pointer to next entry. + */ + + ep = (struct iso_directory_record *) + (bp->b_un.b_addr + entryoffsetinblock); + + reclen = isonum_711 (ep->length); + if (reclen == 0) { + /* skip to next block, if any */ + idp->curroff = roundup (idp->curroff, + imp->logical_block_size); + continue; + } + + if (reclen < ISO_DIRECTORY_RECORD_SIZE) { + error = EINVAL; + /* illegal entry, stop */ + break; + } + + if (entryoffsetinblock + reclen > imp->logical_block_size) { + error = EINVAL; + /* illegal directory, so stop looking */ + break; + } + + idp->current.d_namlen = isonum_711 (ep->name_len); + if (isonum_711(ep->flags)&2) + isodirino(&idp->current.d_fileno,ep,imp); + else + idp->current.d_fileno = dbtob(bp->b_blkno) + + idp->curroff; + + if (reclen < ISO_DIRECTORY_RECORD_SIZE + idp->current.d_namlen) { + error = EINVAL; + /* illegal entry, stop */ + break; + } + + idp->curroff += reclen; + /* + * + */ + switch (imp->iso_ftype) { + case ISO_FTYPE_RRIP: + cd9660_rrip_getname(ep,idp->current.d_name, + (u_short *)&idp->current.d_namlen, + &idp->current.d_fileno,imp); + if (idp->current.d_namlen) + error = iso_uiodir(idp,&idp->current,idp->curroff); + break; + default: /* ISO_FTYPE_DEFAULT || ISO_FTYPE_9660 */ + strcpy(idp->current.d_name,".."); + switch (ep->name[0]) { + case 0: + idp->current.d_namlen = 1; + error = iso_uiodir(idp,&idp->current,idp->curroff); + break; + case 1: + idp->current.d_namlen = 2; + error = iso_uiodir(idp,&idp->current,idp->curroff); + break; + default: + isofntrans(ep->name,idp->current.d_namlen, + idp->current.d_name, &elen, + imp->iso_ftype == ISO_FTYPE_9660, + isonum_711(ep->flags)&4); + idp->current.d_namlen = (u_char)elen; + if (imp->iso_ftype == ISO_FTYPE_DEFAULT) + error = iso_shipdir(idp); + else + error = iso_uiodir(idp,&idp->current,idp->curroff); + break; + } 
+ } + if (error) + break; + + entryoffsetinblock += reclen; + } + + if (!error && imp->iso_ftype == ISO_FTYPE_DEFAULT) { + idp->current.d_namlen = 0; + error = iso_shipdir(idp); + } + if (error < 0) + error = 0; + + if (bp) + brelse (bp); + + uio->uio_offset = idp->uio_off; +#if 0 + *eofflagp = idp->eof; +#endif + + FREE(idp,M_TEMP); + + return (error); +} + +/* + * Return target name of a symbolic link + * Shouldn't we get the parent vnode and read the data from there? + * This could eventually result in deadlocks in cd9660_lookup. + * But otherwise the block read here is in the block buffer two times. + */ +typedef struct iso_directory_record ISODIR; +typedef struct iso_node ISONODE; +typedef struct iso_mnt ISOMNT; +int +cd9660_readlink(ap) + struct vop_readlink_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + ISONODE *ip; + ISODIR *dirp; + ISOMNT *imp; + struct buf *bp; + u_short symlen; + int error; + char *symname; + ino_t ino; + + ip = VTOI(ap->a_vp); + imp = ip->i_mnt; + + if (imp->iso_ftype != ISO_FTYPE_RRIP) + return EINVAL; + + /* + * Get parents directory record block that this inode included. + */ + error = bread(imp->im_devvp, + (daddr_t)(ip->i_number / DEV_BSIZE), + imp->logical_block_size, + NOCRED, + &bp); + if (error) { + brelse(bp); + return EINVAL; + } + + /* + * Setup the directory pointer for this inode + */ + dirp = (ISODIR *)(bp->b_un.b_addr + (ip->i_number & imp->im_bmask)); +#ifdef DEBUG + printf("lbn=%d,off=%d,bsize=%d,DEV_BSIZE=%d, dirp= %08x, b_addr=%08x, offset=%08x(%08x)\n", + (daddr_t)(ip->i_number >> imp->im_bshift), + ip->i_number & imp->im_bmask, + imp->logical_block_size, + DEV_BSIZE, + dirp, + bp->b_un.b_addr, + ip->i_number, + ip->i_number & imp->im_bmask ); +#endif + + /* + * Just make sure, we have a right one.... 
+ * 1: Check not cross boundary on block + */ + if ((ip->i_number & imp->im_bmask) + isonum_711(dirp->length) + > imp->logical_block_size) { + brelse(bp); + return EINVAL; + } + + /* + * Now get a buffer + * Abuse a namei buffer for now. + */ + MALLOC(symname,char *,MAXPATHLEN,M_NAMEI,M_WAITOK); + + /* + * Ok, we just gathering a symbolic name in SL record. + */ + if (cd9660_rrip_getsymname(dirp,symname,&symlen,imp) == 0) { + FREE(symname,M_NAMEI); + brelse(bp); + return EINVAL; + } + /* + * Don't forget before you leave from home ;-) + */ + brelse(bp); + + /* + * return with the symbolic name to caller's. + */ + error = uiomove(symname,symlen,ap->a_uio); + + FREE(symname,M_NAMEI); + + return error; +} + +/* + * Ufs abort op, called after namei() when a CREATE/DELETE isn't actually + * done. If a buffer has been saved in anticipation of a CREATE, delete it. + */ +int +cd9660_abortop(ap) + struct vop_abortop_args /* { + struct vnode *a_dvp; + struct componentname *a_cnp; + } */ *ap; +{ + if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) + FREE(ap->a_cnp->cn_pnbuf, M_NAMEI); + return 0; +} + +/* + * Lock an inode. + */ +int +cd9660_lock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct iso_node *ip = VTOI(ap->a_vp); + + ISO_ILOCK(ip); + return 0; +} + +/* + * Unlock an inode. + */ +int +cd9660_unlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct iso_node *ip = VTOI(ap->a_vp); + + if (!(ip->i_flag & ILOCKED)) + panic("cd9660_unlock NOT LOCKED"); + ISO_IUNLOCK(ip); + return 0; +} + +/* + * Check for a locked inode. + */ +int +cd9660_islocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + if (VTOI(ap->a_vp)->i_flag & ILOCKED) + return 1; + return 0; +} + +/* + * Calculate the logical to physical mapping if not done already, + * then call the device strategy routine. 
+ */ +int +cd9660_strategy(ap) + struct vop_strategy_args /* { + struct buf *a_bp; + } */ *ap; +{ + register struct buf *bp = ap->a_bp; + register struct vnode *vp = bp->b_vp; + register struct iso_node *ip; + int error; + + ip = VTOI(vp); + if (vp->v_type == VBLK || vp->v_type == VCHR) + panic("cd9660_strategy: spec"); + if (bp->b_blkno == bp->b_lblkno) { + if (error = + VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL)) { + bp->b_error = error; + bp->b_flags |= B_ERROR; + biodone(bp); + return (error); + } + if ((long)bp->b_blkno == -1) + clrbuf(bp); + } + if ((long)bp->b_blkno == -1) { + biodone(bp); + return (0); + } + vp = ip->i_devvp; + bp->b_dev = vp->v_rdev; + VOCALL (vp->v_op, VOFFSET(vop_strategy), ap); + return (0); +} + +/* + * Print out the contents of an inode. + */ +int +cd9660_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + printf("tag VT_ISOFS, isofs vnode\n"); + return 0; +} + +/* + * Unsupported operation + */ +int +cd9660_enotsupp() +{ + + return (EOPNOTSUPP); +} + +/* + * Global vfs data structures for isofs + */ +#define cd9660_create \ + ((int (*) __P((struct vop_create_args *)))cd9660_enotsupp) +#define cd9660_mknod ((int (*) __P((struct vop_mknod_args *)))cd9660_enotsupp) +#define cd9660_setattr \ + ((int (*) __P((struct vop_setattr_args *)))cd9660_enotsupp) +#define cd9660_write ((int (*) __P((struct vop_write_args *)))cd9660_enotsupp) +#define cd9660_fsync ((int (*) __P((struct vop_fsync_args *)))nullop) +#define cd9660_remove \ + ((int (*) __P((struct vop_remove_args *)))cd9660_enotsupp) +#define cd9660_link ((int (*) __P((struct vop_link_args *)))cd9660_enotsupp) +#define cd9660_rename \ + ((int (*) __P((struct vop_rename_args *)))cd9660_enotsupp) +#define cd9660_mkdir ((int (*) __P((struct vop_mkdir_args *)))cd9660_enotsupp) +#define cd9660_rmdir ((int (*) __P((struct vop_rmdir_args *)))cd9660_enotsupp) +#define cd9660_symlink \ + ((int (*) __P((struct vop_symlink_args *)))cd9660_enotsupp) +#define 
cd9660_pathconf \ + ((int (*) __P((struct vop_pathconf_args *)))cd9660_enotsupp) +#define cd9660_advlock \ + ((int (*) __P((struct vop_advlock_args *)))cd9660_enotsupp) +#define cd9660_blkatoff \ + ((int (*) __P((struct vop_blkatoff_args *)))cd9660_enotsupp) +#define cd9660_valloc ((int(*) __P(( \ + struct vnode *pvp, \ + int mode, \ + struct ucred *cred, \ + struct vnode **vpp))) cd9660_enotsupp) +#define cd9660_vfree ((int (*) __P((struct vop_vfree_args *)))cd9660_enotsupp) +#define cd9660_truncate \ + ((int (*) __P((struct vop_truncate_args *)))cd9660_enotsupp) +#define cd9660_update \ + ((int (*) __P((struct vop_update_args *)))cd9660_enotsupp) +#define cd9660_bwrite \ + ((int (*) __P((struct vop_bwrite_args *)))cd9660_enotsupp) + +/* + * Global vfs data structures for nfs + */ +int (**cd9660_vnodeop_p)(); +struct vnodeopv_entry_desc cd9660_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, cd9660_lookup }, /* lookup */ + { &vop_create_desc, cd9660_create }, /* create */ + { &vop_mknod_desc, cd9660_mknod }, /* mknod */ + { &vop_open_desc, cd9660_open }, /* open */ + { &vop_close_desc, cd9660_close }, /* close */ + { &vop_access_desc, cd9660_access }, /* access */ + { &vop_getattr_desc, cd9660_getattr }, /* getattr */ + { &vop_setattr_desc, cd9660_setattr }, /* setattr */ + { &vop_read_desc, cd9660_read }, /* read */ + { &vop_write_desc, cd9660_write }, /* write */ + { &vop_ioctl_desc, cd9660_ioctl }, /* ioctl */ + { &vop_select_desc, cd9660_select }, /* select */ + { &vop_mmap_desc, cd9660_mmap }, /* mmap */ + { &vop_fsync_desc, cd9660_fsync }, /* fsync */ + { &vop_seek_desc, cd9660_seek }, /* seek */ + { &vop_remove_desc, cd9660_remove }, /* remove */ + { &vop_link_desc, cd9660_link }, /* link */ + { &vop_rename_desc, cd9660_rename }, /* rename */ + { &vop_mkdir_desc, cd9660_mkdir }, /* mkdir */ + { &vop_rmdir_desc, cd9660_rmdir }, /* rmdir */ + { &vop_symlink_desc, cd9660_symlink }, /* symlink */ + { &vop_readdir_desc, 
cd9660_readdir }, /* readdir */ + { &vop_readlink_desc, cd9660_readlink },/* readlink */ + { &vop_abortop_desc, cd9660_abortop }, /* abortop */ + { &vop_inactive_desc, cd9660_inactive },/* inactive */ + { &vop_reclaim_desc, cd9660_reclaim }, /* reclaim */ + { &vop_lock_desc, cd9660_lock }, /* lock */ + { &vop_unlock_desc, cd9660_unlock }, /* unlock */ + { &vop_bmap_desc, cd9660_bmap }, /* bmap */ + { &vop_strategy_desc, cd9660_strategy },/* strategy */ + { &vop_print_desc, cd9660_print }, /* print */ + { &vop_islocked_desc, cd9660_islocked },/* islocked */ + { &vop_pathconf_desc, cd9660_pathconf },/* pathconf */ + { &vop_advlock_desc, cd9660_advlock }, /* advlock */ + { &vop_blkatoff_desc, cd9660_blkatoff },/* blkatoff */ + { &vop_valloc_desc, cd9660_valloc }, /* valloc */ + { &vop_vfree_desc, cd9660_vfree }, /* vfree */ + { &vop_truncate_desc, cd9660_truncate },/* truncate */ + { &vop_update_desc, cd9660_update }, /* update */ + { &vop_bwrite_desc, vn_bwrite }, + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc cd9660_vnodeop_opv_desc = + { &cd9660_vnodeop_p, cd9660_vnodeop_entries }; + +/* + * Special device vnode ops + */ +int (**cd9660_specop_p)(); +struct vnodeopv_entry_desc cd9660_specop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, spec_lookup }, /* lookup */ + { &vop_create_desc, cd9660_create }, /* create */ + { &vop_mknod_desc, cd9660_mknod }, /* mknod */ + { &vop_open_desc, spec_open }, /* open */ + { &vop_close_desc, spec_close }, /* close */ + { &vop_access_desc, cd9660_access }, /* access */ + { &vop_getattr_desc, cd9660_getattr }, /* getattr */ + { &vop_setattr_desc, cd9660_setattr }, /* setattr */ + { &vop_read_desc, spec_read }, /* read */ + { &vop_write_desc, spec_write }, /* write */ + { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ + { &vop_select_desc, spec_select }, /* select */ + { &vop_mmap_desc, spec_mmap }, /* mmap */ + { &vop_fsync_desc, spec_fsync }, /* fsync */ + { &vop_seek_desc, 
spec_seek }, /* seek */ + { &vop_remove_desc, cd9660_remove }, /* remove */ + { &vop_link_desc, cd9660_link }, /* link */ + { &vop_rename_desc, cd9660_rename }, /* rename */ + { &vop_mkdir_desc, cd9660_mkdir }, /* mkdir */ + { &vop_rmdir_desc, cd9660_rmdir }, /* rmdir */ + { &vop_symlink_desc, cd9660_symlink }, /* symlink */ + { &vop_readdir_desc, spec_readdir }, /* readdir */ + { &vop_readlink_desc, spec_readlink }, /* readlink */ + { &vop_abortop_desc, spec_abortop }, /* abortop */ + { &vop_inactive_desc, cd9660_inactive },/* inactive */ + { &vop_reclaim_desc, cd9660_reclaim }, /* reclaim */ + { &vop_lock_desc, cd9660_lock }, /* lock */ + { &vop_unlock_desc, cd9660_unlock }, /* unlock */ + { &vop_bmap_desc, spec_bmap }, /* bmap */ + /* XXX strategy: panics, should be notsupp instead? */ + { &vop_strategy_desc, cd9660_strategy },/* strategy */ + { &vop_print_desc, cd9660_print }, /* print */ + { &vop_islocked_desc, cd9660_islocked },/* islocked */ + { &vop_pathconf_desc, spec_pathconf }, /* pathconf */ + { &vop_advlock_desc, spec_advlock }, /* advlock */ + { &vop_blkatoff_desc, spec_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, spec_valloc }, /* valloc */ + { &vop_vfree_desc, spec_vfree }, /* vfree */ + { &vop_truncate_desc, spec_truncate }, /* truncate */ + { &vop_update_desc, cd9660_update }, /* update */ + { &vop_bwrite_desc, vn_bwrite }, + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc cd9660_specop_opv_desc = + { &cd9660_specop_p, cd9660_specop_entries }; + +#ifdef FIFO +int (**cd9660_fifoop_p)(); +struct vnodeopv_entry_desc cd9660_fifoop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, fifo_lookup }, /* lookup */ + { &vop_create_desc, cd9660_create }, /* create */ + { &vop_mknod_desc, cd9660_mknod }, /* mknod */ + { &vop_open_desc, fifo_open }, /* open */ + { &vop_close_desc, fifo_close }, /* close */ + { &vop_access_desc, cd9660_access }, /* access */ + { &vop_getattr_desc, cd9660_getattr }, /* 
getattr */ + { &vop_setattr_desc, cd9660_setattr }, /* setattr */ + { &vop_read_desc, fifo_read }, /* read */ + { &vop_write_desc, fifo_write }, /* write */ + { &vop_ioctl_desc, fifo_ioctl }, /* ioctl */ + { &vop_select_desc, fifo_select }, /* select */ + { &vop_mmap_desc, fifo_mmap }, /* mmap */ + { &vop_fsync_desc, fifo_fsync }, /* fsync */ + { &vop_seek_desc, fifo_seek }, /* seek */ + { &vop_remove_desc, cd9660_remove }, /* remove */ + { &vop_link_desc, cd9660_link }, /* link */ + { &vop_rename_desc, cd9660_rename }, /* rename */ + { &vop_mkdir_desc, cd9660_mkdir }, /* mkdir */ + { &vop_rmdir_desc, cd9660_rmdir }, /* rmdir */ + { &vop_symlink_desc, cd9660_symlink }, /* symlink */ + { &vop_readdir_desc, fifo_readdir }, /* readdir */ + { &vop_readlink_desc, fifo_readlink }, /* readlink */ + { &vop_abortop_desc, fifo_abortop }, /* abortop */ + { &vop_inactive_desc, cd9660_inactive },/* inactive */ + { &vop_reclaim_desc, cd9660_reclaim }, /* reclaim */ + { &vop_lock_desc, cd9660_lock }, /* lock */ + { &vop_unlock_desc, cd9660_unlock }, /* unlock */ + { &vop_bmap_desc, fifo_bmap }, /* bmap */ + { &vop_strategy_desc, fifo_badop }, /* strategy */ + { &vop_print_desc, cd9660_print }, /* print */ + { &vop_islocked_desc, cd9660_islocked },/* islocked */ + { &vop_pathconf_desc, fifo_pathconf }, /* pathconf */ + { &vop_advlock_desc, fifo_advlock }, /* advlock */ + { &vop_blkatoff_desc, fifo_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, fifo_valloc }, /* valloc */ + { &vop_vfree_desc, fifo_vfree }, /* vfree */ + { &vop_truncate_desc, fifo_truncate }, /* truncate */ + { &vop_update_desc, cd9660_update }, /* update */ + { &vop_bwrite_desc, vn_bwrite }, + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc cd9660_fifoop_opv_desc = + { &cd9660_fifoop_p, cd9660_fifoop_entries }; +#endif /* FIFO */ diff --git a/sys/fs/cd9660/iso.h b/sys/fs/cd9660/iso.h new file mode 100644 index 00000000000..e3567066e1c --- /dev/null +++ b/sys/fs/cd9660/iso.h @@ -0,0 +1,256 
@@ +/*- + * Copyright (c) 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley + * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension + * Support code is derived from software contributed to Berkeley + * by Atsushi Murai (amurai@spec.co.jp). + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)iso.h 8.2 (Berkeley) 1/23/94 + */ + +#define ISODCL(from, to) (to - from + 1) + +struct iso_volume_descriptor { + char type[ISODCL(1,1)]; /* 711 */ + char id[ISODCL(2,6)]; + char version[ISODCL(7,7)]; + char data[ISODCL(8,2048)]; +}; + +/* volume descriptor types */ +#define ISO_VD_PRIMARY 1 +#define ISO_VD_END 255 + +#define ISO_STANDARD_ID "CD001" +#define ISO_ECMA_ID "CDW01" + +struct iso_primary_descriptor { + char type [ISODCL ( 1, 1)]; /* 711 */ + char id [ISODCL ( 2, 6)]; + char version [ISODCL ( 7, 7)]; /* 711 */ + char unused1 [ISODCL ( 8, 8)]; + char system_id [ISODCL ( 9, 40)]; /* achars */ + char volume_id [ISODCL ( 41, 72)]; /* dchars */ + char unused2 [ISODCL ( 73, 80)]; + char volume_space_size [ISODCL ( 81, 88)]; /* 733 */ + char unused3 [ISODCL ( 89, 120)]; + char volume_set_size [ISODCL (121, 124)]; /* 723 */ + char volume_sequence_number [ISODCL (125, 128)]; /* 723 */ + char logical_block_size [ISODCL (129, 132)]; /* 723 */ + char path_table_size [ISODCL (133, 140)]; /* 733 */ + char type_l_path_table [ISODCL (141, 144)]; /* 731 */ + char opt_type_l_path_table [ISODCL (145, 148)]; /* 731 */ + char type_m_path_table [ISODCL (149, 152)]; /* 732 */ + char opt_type_m_path_table [ISODCL (153, 156)]; /* 732 */ + char root_directory_record [ISODCL (157, 190)]; /* 9.1 */ + char volume_set_id [ISODCL (191, 318)]; /* dchars */ + char publisher_id [ISODCL (319, 446)]; /* achars */ + char preparer_id [ISODCL (447, 574)]; 
/* achars */ + char application_id [ISODCL (575, 702)]; /* achars */ + char copyright_file_id [ISODCL (703, 739)]; /* 7.5 dchars */ + char abstract_file_id [ISODCL (740, 776)]; /* 7.5 dchars */ + char bibliographic_file_id [ISODCL (777, 813)]; /* 7.5 dchars */ + char creation_date [ISODCL (814, 830)]; /* 8.4.26.1 */ + char modification_date [ISODCL (831, 847)]; /* 8.4.26.1 */ + char expiration_date [ISODCL (848, 864)]; /* 8.4.26.1 */ + char effective_date [ISODCL (865, 881)]; /* 8.4.26.1 */ + char file_structure_version [ISODCL (882, 882)]; /* 711 */ + char unused4 [ISODCL (883, 883)]; + char application_data [ISODCL (884, 1395)]; + char unused5 [ISODCL (1396, 2048)]; +}; +#define ISO_DEFAULT_BLOCK_SIZE 2048 + +struct iso_directory_record { + char length [ISODCL (1, 1)]; /* 711 */ + char ext_attr_length [ISODCL (2, 2)]; /* 711 */ + unsigned char extent [ISODCL (3, 10)]; /* 733 */ + unsigned char size [ISODCL (11, 18)]; /* 733 */ + char date [ISODCL (19, 25)]; /* 7 by 711 */ + char flags [ISODCL (26, 26)]; + char file_unit_size [ISODCL (27, 27)]; /* 711 */ + char interleave [ISODCL (28, 28)]; /* 711 */ + char volume_sequence_number [ISODCL (29, 32)]; /* 723 */ + char name_len [ISODCL (33, 33)]; /* 711 */ + char name [0]; +}; +/* can't take sizeof(iso_directory_record), because of possible alignment + of the last entry (34 instead of 33) */ +#define ISO_DIRECTORY_RECORD_SIZE 33 + +struct iso_extended_attributes { + unsigned char owner [ISODCL (1, 4)]; /* 723 */ + unsigned char group [ISODCL (5, 8)]; /* 723 */ + unsigned char perm [ISODCL (9, 10)]; /* 9.5.3 */ + char ctime [ISODCL (11, 27)]; /* 8.4.26.1 */ + char mtime [ISODCL (28, 44)]; /* 8.4.26.1 */ + char xtime [ISODCL (45, 61)]; /* 8.4.26.1 */ + char ftime [ISODCL (62, 78)]; /* 8.4.26.1 */ + char recfmt [ISODCL (79, 79)]; /* 711 */ + char recattr [ISODCL (80, 80)]; /* 711 */ + unsigned char reclen [ISODCL (81, 84)]; /* 723 */ + char system_id [ISODCL (85, 116)]; /* achars */ + char system_use [ISODCL (117, 180)]; 
+ char version [ISODCL (181, 181)]; /* 711 */ + char len_esc [ISODCL (182, 182)]; /* 711 */ + char reserved [ISODCL (183, 246)]; + unsigned char len_au [ISODCL (247, 250)]; /* 723 */ +}; + +/* CD-ROM Format type */ +enum ISO_FTYPE { ISO_FTYPE_DEFAULT, ISO_FTYPE_9660, ISO_FTYPE_RRIP, ISO_FTYPE_ECMA }; + +#ifndef ISOFSMNT_ROOT +#define ISOFSMNT_ROOT 0 +#endif + +struct iso_mnt { + int im_flags; + + struct mount *im_mountp; + dev_t im_dev; + struct vnode *im_devvp; + + int logical_block_size; + int im_bshift; + int im_bmask; + + int volume_space_size; + char im_fsmnt[50]; + struct netexport im_export; + + char root[ISODCL (157, 190)]; + int root_extent; + int root_size; + enum ISO_FTYPE iso_ftype; + + int rr_skip; + int rr_skip0; +}; + +#define VFSTOISOFS(mp) ((struct iso_mnt *)((mp)->mnt_data)) + +#define iso_blkoff(imp, loc) ((loc) & (imp)->im_bmask) +#define iso_lblkno(imp, loc) ((loc) >> (imp)->im_bshift) +#define iso_blksize(imp, ip, lbn) ((imp)->logical_block_size) +#define iso_lblktosize(imp, blk) ((blk) << (imp)->im_bshift) + +int cd9660_mount __P((struct mount *, + char *, caddr_t, struct nameidata *, struct proc *)); +int cd9660_start __P((struct mount *, int, struct proc *)); +int cd9660_unmount __P((struct mount *, int, struct proc *)); +int cd9660_root __P((struct mount *, struct vnode **)); +int cd9660_quotactl __P((struct mount *, int, uid_t, caddr_t, struct proc *)); +int cd9660_statfs __P((struct mount *, struct statfs *, struct proc *)); +int cd9660_sync __P((struct mount *, int, struct ucred *, struct proc *)); +int cd9660_vget __P((struct mount *, ino_t, struct vnode **)); +int cd9660_fhtovp __P((struct mount *, struct fid *, struct mbuf *, + struct vnode **, int *, struct ucred **)); +int cd9660_vptofh __P((struct vnode *, struct fid *)); +int cd9660_init __P(()); + +struct iso_node; +int iso_blkatoff __P((struct iso_node *ip, long offset, struct buf **bpp)); +int iso_iget __P((struct iso_node *xp, ino_t ino, int relocated, + struct iso_node 
/*
 * Decoders for the ISO 9660 numeric field encodings.  The three digit
 * suffix is the section of the standard that defines the encoding (the
 * same numbers used in the field comments of the structures above).
 *
 * These are static inline so the header can be included from any number
 * of files under both the traditional GNU and the ISO C inline rules.
 * All multi-byte values are assembled from unsigned bytes: casting a byte
 * to char first sign-extends it, and shifting a byte into bit 31 of a
 * signed int overflows.
 */

/* 7.1.1: unsigned 8-bit value. */
static inline int
isonum_711(p)
	unsigned char *p;
{
	return *p;
}

/* 7.1.2: signed 8-bit value, independent of the signedness of plain char. */
static inline int
isonum_712(p)
	char *p;
{
	return (signed char)*p;
}

/* 7.2.1: unsigned 16-bit value, least significant byte first. */
static inline int
isonum_721(p)
	unsigned char *p;
{
	return p[0] | (p[1] << 8);
}

/* 7.2.2: unsigned 16-bit value, most significant byte first. */
static inline int
isonum_722(p)
	unsigned char *p;
{
	return (p[0] << 8) | p[1];
}

/* 7.2.3: 16-bit value stored in both byte orders; the LSB-first copy is used. */
static inline int
isonum_723(p)
	unsigned char *p;
{
	return isonum_721(p);
}

/* 7.3.1: 32-bit value, least significant byte first. */
static inline int
isonum_731(p)
	unsigned char *p;
{
	return (int)((unsigned int)p[0] | ((unsigned int)p[1] << 8) |
	    ((unsigned int)p[2] << 16) | ((unsigned int)p[3] << 24));
}

/* 7.3.2: 32-bit value, most significant byte first. */
static inline int
isonum_732(p)
	unsigned char *p;
{
	return (int)(((unsigned int)p[0] << 24) | ((unsigned int)p[1] << 16) |
	    ((unsigned int)p[2] << 8) | (unsigned int)p[3]);
}

/* 7.3.3: 32-bit value stored in both byte orders; the LSB-first copy is used. */
static inline int
isonum_733(p)
	unsigned char *p;
{
	return isonum_731(p);
}
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)iso_rrip.h 8.2 (Berkeley) 1/23/94 + */ + + +/* + * Analyze function flag (similar to RR field bits) + */ +#define ISO_SUSP_ATTR 0x0001 +#define ISO_SUSP_DEVICE 0x0002 +#define ISO_SUSP_SLINK 0x0004 +#define ISO_SUSP_ALTNAME 0x0008 +#define ISO_SUSP_CLINK 0x0010 +#define ISO_SUSP_PLINK 0x0020 +#define ISO_SUSP_RELDIR 0x0040 +#define ISO_SUSP_TSTAMP 0x0080 +#define ISO_SUSP_IDFLAG 0x0100 +#define ISO_SUSP_EXTREF 0x0200 +#define ISO_SUSP_CONT 0x0400 +#define ISO_SUSP_OFFSET 0x0800 +#define ISO_SUSP_STOP 0x1000 +#define ISO_SUSP_UNKNOWN 0x8000 + +typedef struct { + struct iso_node *inop; + int fields; /* interesting fields in this analysis */ + daddr_t iso_ce_blk; /* block of continuation area */ + off_t iso_ce_off; /* offset of continuation area */ + int iso_ce_len; /* length of continuation area */ + struct iso_mnt *imp; /* mount structure */ + ino_t *inump; /* inode number pointer */ + char *outbuf; /* name/symbolic link output area */ + u_short *outlen; /* length of above */ + u_short maxlen; /* maximum length of above */ + int cont; /* continuation of above */ +} ISO_RRIP_ANALYZE; + +int cd9660_rrip_analyze __P((struct iso_directory_record *isodir, + struct iso_node *inop, struct iso_mnt *imp)); +int cd9660_rrip_getname __P((struct iso_directory_record *isodir, + char *outbuf, u_short *outlen, + ino_t *inump, struct iso_mnt *imp)); +int cd9660_rrip_getsymname __P((struct iso_directory_record *isodir, + char *outbuf, u_short *outlen, + struct iso_mnt *imp)); +int cd9660_rrip_offset __P((struct iso_directory_record *isodir, + struct iso_mnt *imp)); diff --git a/sys/fs/deadfs/dead_vnops.c b/sys/fs/deadfs/dead_vnops.c new file mode 100644 index 00000000000..9d04652b7fc --- /dev/null +++ b/sys/fs/deadfs/dead_vnops.c @@ -0,0 +1,354 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)dead_vnops.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Prototypes for dead operations on vnodes. 
+ */ +int dead_badop(), + dead_ebadf(); +int dead_lookup __P((struct vop_lookup_args *)); +#define dead_create ((int (*) __P((struct vop_create_args *)))dead_badop) +#define dead_mknod ((int (*) __P((struct vop_mknod_args *)))dead_badop) +int dead_open __P((struct vop_open_args *)); +#define dead_close ((int (*) __P((struct vop_close_args *)))nullop) +#define dead_access ((int (*) __P((struct vop_access_args *)))dead_ebadf) +#define dead_getattr ((int (*) __P((struct vop_getattr_args *)))dead_ebadf) +#define dead_setattr ((int (*) __P((struct vop_setattr_args *)))dead_ebadf) +int dead_read __P((struct vop_read_args *)); +int dead_write __P((struct vop_write_args *)); +int dead_ioctl __P((struct vop_ioctl_args *)); +int dead_select __P((struct vop_select_args *)); +#define dead_mmap ((int (*) __P((struct vop_mmap_args *)))dead_badop) +#define dead_fsync ((int (*) __P((struct vop_fsync_args *)))nullop) +#define dead_seek ((int (*) __P((struct vop_seek_args *)))nullop) +#define dead_remove ((int (*) __P((struct vop_remove_args *)))dead_badop) +#define dead_link ((int (*) __P((struct vop_link_args *)))dead_badop) +#define dead_rename ((int (*) __P((struct vop_rename_args *)))dead_badop) +#define dead_mkdir ((int (*) __P((struct vop_mkdir_args *)))dead_badop) +#define dead_rmdir ((int (*) __P((struct vop_rmdir_args *)))dead_badop) +#define dead_symlink ((int (*) __P((struct vop_symlink_args *)))dead_badop) +#define dead_readdir ((int (*) __P((struct vop_readdir_args *)))dead_ebadf) +#define dead_readlink ((int (*) __P((struct vop_readlink_args *)))dead_ebadf) +#define dead_abortop ((int (*) __P((struct vop_abortop_args *)))dead_badop) +#define dead_inactive ((int (*) __P((struct vop_inactive_args *)))nullop) +#define dead_reclaim ((int (*) __P((struct vop_reclaim_args *)))nullop) +int dead_lock __P((struct vop_lock_args *)); +#define dead_unlock ((int (*) __P((struct vop_unlock_args *)))nullop) +int dead_bmap __P((struct vop_bmap_args *)); +int dead_strategy __P((struct 
vop_strategy_args *)); +int dead_print __P((struct vop_print_args *)); +#define dead_islocked ((int (*) __P((struct vop_islocked_args *)))nullop) +#define dead_pathconf ((int (*) __P((struct vop_pathconf_args *)))dead_ebadf) +#define dead_advlock ((int (*) __P((struct vop_advlock_args *)))dead_ebadf) +#define dead_blkatoff ((int (*) __P((struct vop_blkatoff_args *)))dead_badop) +#define dead_valloc ((int (*) __P((struct vop_valloc_args *)))dead_badop) +#define dead_vfree ((int (*) __P((struct vop_vfree_args *)))dead_badop) +#define dead_truncate ((int (*) __P((struct vop_truncate_args *)))nullop) +#define dead_update ((int (*) __P((struct vop_update_args *)))nullop) +#define dead_bwrite ((int (*) __P((struct vop_bwrite_args *)))nullop) + +int (**dead_vnodeop_p)(); +struct vnodeopv_entry_desc dead_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, dead_lookup }, /* lookup */ + { &vop_create_desc, dead_create }, /* create */ + { &vop_mknod_desc, dead_mknod }, /* mknod */ + { &vop_open_desc, dead_open }, /* open */ + { &vop_close_desc, dead_close }, /* close */ + { &vop_access_desc, dead_access }, /* access */ + { &vop_getattr_desc, dead_getattr }, /* getattr */ + { &vop_setattr_desc, dead_setattr }, /* setattr */ + { &vop_read_desc, dead_read }, /* read */ + { &vop_write_desc, dead_write }, /* write */ + { &vop_ioctl_desc, dead_ioctl }, /* ioctl */ + { &vop_select_desc, dead_select }, /* select */ + { &vop_mmap_desc, dead_mmap }, /* mmap */ + { &vop_fsync_desc, dead_fsync }, /* fsync */ + { &vop_seek_desc, dead_seek }, /* seek */ + { &vop_remove_desc, dead_remove }, /* remove */ + { &vop_link_desc, dead_link }, /* link */ + { &vop_rename_desc, dead_rename }, /* rename */ + { &vop_mkdir_desc, dead_mkdir }, /* mkdir */ + { &vop_rmdir_desc, dead_rmdir }, /* rmdir */ + { &vop_symlink_desc, dead_symlink }, /* symlink */ + { &vop_readdir_desc, dead_readdir }, /* readdir */ + { &vop_readlink_desc, dead_readlink }, /* readlink */ + { 
&vop_abortop_desc, dead_abortop }, /* abortop */ + { &vop_inactive_desc, dead_inactive }, /* inactive */ + { &vop_reclaim_desc, dead_reclaim }, /* reclaim */ + { &vop_lock_desc, dead_lock }, /* lock */ + { &vop_unlock_desc, dead_unlock }, /* unlock */ + { &vop_bmap_desc, dead_bmap }, /* bmap */ + { &vop_strategy_desc, dead_strategy }, /* strategy */ + { &vop_print_desc, dead_print }, /* print */ + { &vop_islocked_desc, dead_islocked }, /* islocked */ + { &vop_pathconf_desc, dead_pathconf }, /* pathconf */ + { &vop_advlock_desc, dead_advlock }, /* advlock */ + { &vop_blkatoff_desc, dead_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, dead_valloc }, /* valloc */ + { &vop_vfree_desc, dead_vfree }, /* vfree */ + { &vop_truncate_desc, dead_truncate }, /* truncate */ + { &vop_update_desc, dead_update }, /* update */ + { &vop_bwrite_desc, dead_bwrite }, /* bwrite */ + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc dead_vnodeop_opv_desc = + { &dead_vnodeop_p, dead_vnodeop_entries }; + +/* + * Trivial lookup routine that always fails. + */ +/* ARGSUSED */ +int +dead_lookup(ap) + struct vop_lookup_args /* { + struct vnode * a_dvp; + struct vnode ** a_vpp; + struct componentname * a_cnp; + } */ *ap; +{ + + *ap->a_vpp = NULL; + return (ENOTDIR); +} + +/* + * Open always fails as if device did not exist. 
+ */ +/* ARGSUSED */ +dead_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + return (ENXIO); +} + +/* + * Vnode op for read + */ +/* ARGSUSED */ +dead_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + + if (chkvnlock(ap->a_vp)) + panic("dead_read: lock"); + /* + * Return EOF for character devices, EIO for others + */ + if (ap->a_vp->v_type != VCHR) + return (EIO); + return (0); +} + +/* + * Vnode op for write + */ +/* ARGSUSED */ +dead_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + + if (chkvnlock(ap->a_vp)) + panic("dead_write: lock"); + return (EIO); +} + +/* + * Device ioctl operation. + */ +/* ARGSUSED */ +dead_ioctl(ap) + struct vop_ioctl_args /* { + struct vnode *a_vp; + int a_command; + caddr_t a_data; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + if (!chkvnlock(ap->a_vp)) + return (EBADF); + return (VCALL(ap->a_vp, VOFFSET(vop_ioctl), ap)); +} + +/* ARGSUSED */ +dead_select(ap) + struct vop_select_args /* { + struct vnode *a_vp; + int a_which; + int a_fflags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + /* + * Let the user find out that the descriptor is gone. + */ + return (1); +} + +/* + * Just call the device strategy routine + */ +dead_strategy(ap) + struct vop_strategy_args /* { + struct buf *a_bp; + } */ *ap; +{ + + if (ap->a_bp->b_vp == NULL || !chkvnlock(ap->a_bp->b_vp)) { + ap->a_bp->b_flags |= B_ERROR; + biodone(ap->a_bp); + return (EIO); + } + return (VOP_STRATEGY(ap->a_bp)); +} + +/* + * Wait until the vnode has finished changing state. 
+ */ +dead_lock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + if (!chkvnlock(ap->a_vp)) + return (0); + return (VCALL(ap->a_vp, VOFFSET(vop_lock), ap)); +} + +/* + * Wait until the vnode has finished changing state. + */ +dead_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + } */ *ap; +{ + + if (!chkvnlock(ap->a_vp)) + return (EIO); + return (VOP_BMAP(ap->a_vp, ap->a_bn, ap->a_vpp, ap->a_bnp, ap->a_runp)); +} + +/* + * Print out the contents of a dead vnode. + */ +/* ARGSUSED */ +dead_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + printf("tag VT_NON, dead vnode\n"); +} + +/* + * Empty vnode failed operation + */ +dead_ebadf() +{ + + return (EBADF); +} + +/* + * Empty vnode bad operation + */ +dead_badop() +{ + + panic("dead_badop called"); + /* NOTREACHED */ +} + +/* + * Empty vnode null operation + */ +dead_nullop() +{ + + return (0); +} + +/* + * We have to wait during times when the vnode is + * in a state of change. + */ +chkvnlock(vp) + register struct vnode *vp; +{ + int locked = 0; + + while (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + sleep((caddr_t)vp, PINOD); + locked = 1; + } + return (locked); +} diff --git a/sys/fs/fdescfs/fdesc.h b/sys/fs/fdescfs/fdesc.h new file mode 100644 index 00000000000..4c682e7bd37 --- /dev/null +++ b/sys/fs/fdescfs/fdesc.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)fdesc.h 8.5 (Berkeley) 1/21/94 + * + * $Id: fdesc.h,v 1.8 1993/04/06 15:28:33 jsp Exp $ + */ + +#ifdef KERNEL +struct fdescmount { + struct vnode *f_root; /* Root node */ +}; + +#define FD_ROOT 2 +#define FD_DEVFD 3 +#define FD_STDIN 4 +#define FD_STDOUT 5 +#define FD_STDERR 6 +#define FD_CTTY 7 +#define FD_DESC 8 +#define FD_MAX 12 + +typedef enum { + Froot, + Fdevfd, + Fdesc, + Flink, + Fctty +} fdntype; + +struct fdescnode { + struct fdescnode *fd_forw; /* Hash chain */ + struct fdescnode *fd_back; + struct vnode *fd_vnode; /* Back ptr to vnode */ + fdntype fd_type; /* Type of this node */ + unsigned fd_fd; /* Fd to be dup'ed */ + char *fd_link; /* Link to fd/n */ + int fd_ix; /* filesystem index */ +}; + +#define VFSTOFDESC(mp) ((struct fdescmount *)((mp)->mnt_data)) +#define VTOFDESC(vp) ((struct fdescnode *)(vp)->v_data) + +extern dev_t devctty; +extern int fdesc_init __P((void)); +extern int fdesc_root __P((struct mount *, struct vnode **)); +extern int fdesc_allocvp __P((fdntype, int, struct mount *, struct vnode **)); +extern int (**fdesc_vnodeop_p)(); +extern struct vfsops fdesc_vfsops; +#endif /* KERNEL */ diff --git a/sys/fs/fdescfs/fdesc_vfsops.c b/sys/fs/fdescfs/fdesc_vfsops.c new file mode 100644 index 00000000000..80c543da655 --- /dev/null +++ b/sys/fs/fdescfs/fdesc_vfsops.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)fdesc_vfsops.c 8.4 (Berkeley) 1/21/94 + * + * $Id: fdesc_vfsops.c,v 1.9 1993/04/06 15:28:33 jsp Exp $ + */ + +/* + * /dev/fd Filesystem + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Mount the per-process file descriptors (/dev/fd) + */ +int +fdesc_mount(mp, path, data, ndp, p) + struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + int error = 0; + u_int size; + struct fdescmount *fmp; + struct vnode *rvp; + + /* + * Update is a no-op + */ + if (mp->mnt_flag & MNT_UPDATE) + return (EOPNOTSUPP); + + error = fdesc_allocvp(Froot, FD_ROOT, mp, &rvp); + if (error) + return (error); + + MALLOC(fmp, struct fdescmount *, sizeof(struct fdescmount), + M_UFSMNT, M_WAITOK); /* XXX */ + rvp->v_type = VDIR; + rvp->v_flag |= VROOT; + fmp->f_root = rvp; + /* XXX -- don't mark as local to work around fts() problems */ + /*mp->mnt_flag |= MNT_LOCAL;*/ + mp->mnt_data = (qaddr_t) fmp; + getnewfsid(mp, MOUNT_FDESC); + + (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); + bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); + bzero(mp->mnt_stat.f_mntfromname, MNAMELEN); + bcopy("fdesc", mp->mnt_stat.f_mntfromname, sizeof("fdesc")); + return (0); +} + +int +fdesc_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + return (0); +} + +int +fdesc_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + int error; + int flags = 0; + extern int doforce; + struct vnode *rootvp = VFSTOFDESC(mp)->f_root; + + if (mntflags & MNT_FORCE) { + /* fdesc can never be rootfs so don't check for it */ + if (!doforce) + return (EINVAL); + flags |= FORCECLOSE; + } + + /* + * Clear out buffer cache. I don't think we + * ever get anything cached at this level at the + * moment, but who knows... 
+ */ + if (rootvp->v_usecount > 1) + return (EBUSY); + if (error = vflush(mp, rootvp, flags)) + return (error); + + /* + * Release reference on underlying root vnode + */ + vrele(rootvp); + /* + * And blow it away for future re-use + */ + vgone(rootvp); + /* + * Finally, throw away the fdescmount structure + */ + free(mp->mnt_data, M_UFSMNT); /* XXX */ + mp->mnt_data = 0; + + return (0); +} + +int +fdesc_root(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + struct vnode *vp; + + /* + * Return locked reference to root. + */ + vp = VFSTOFDESC(mp)->f_root; + VREF(vp); + VOP_LOCK(vp); + *vpp = vp; + return (0); +} + +int +fdesc_quotactl(mp, cmd, uid, arg, p) + struct mount *mp; + int cmd; + uid_t uid; + caddr_t arg; + struct proc *p; +{ + + return (EOPNOTSUPP); +} + +int +fdesc_statfs(mp, sbp, p) + struct mount *mp; + struct statfs *sbp; + struct proc *p; +{ + struct filedesc *fdp; + int lim; + int i; + int last; + int freefd; + + /* + * Compute number of free file descriptors. + * [ Strange results will ensue if the open file + * limit is ever reduced below the current number + * of open files... ] + */ + lim = p->p_rlimit[RLIMIT_NOFILE].rlim_cur; + fdp = p->p_fd; + last = min(fdp->fd_nfiles, lim); + freefd = 0; + for (i = fdp->fd_freefile; i < last; i++) + if (fdp->fd_ofiles[i] == NULL) + freefd++; + + /* + * Adjust for the fact that the fdesc array may not + * have been fully allocated yet. + */ + if (fdp->fd_nfiles < lim) + freefd += (lim - fdp->fd_nfiles); + + sbp->f_type = MOUNT_FDESC; + sbp->f_flags = 0; + sbp->f_bsize = DEV_BSIZE; + sbp->f_iosize = DEV_BSIZE; + sbp->f_blocks = 2; /* 1K to keep df happy */ + sbp->f_bfree = 0; + sbp->f_bavail = 0; + sbp->f_files = lim + 1; /* Allow for "." 
*/ + sbp->f_ffree = freefd; /* See comments above */ + if (sbp != &mp->mnt_stat) { + bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); + bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); + bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); + } + return (0); +} + +int +fdesc_sync(mp, waitfor) + struct mount *mp; + int waitfor; +{ + + return (0); +} + +/* + * Fdesc flat namespace lookup. + * Currently unsupported. + */ +int +fdesc_vget(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + + return (EOPNOTSUPP); +} + +int +fdesc_fhtovp(mp, fhp, setgen, vpp) + struct mount *mp; + struct fid *fhp; + int setgen; + struct vnode **vpp; +{ + return (EOPNOTSUPP); +} + +int +fdesc_vptofh(vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + + return (EOPNOTSUPP); +} + +struct vfsops fdesc_vfsops = { + fdesc_mount, + fdesc_start, + fdesc_unmount, + fdesc_root, + fdesc_quotactl, + fdesc_statfs, + fdesc_sync, + fdesc_vget, + fdesc_fhtovp, + fdesc_vptofh, + fdesc_init, +}; diff --git a/sys/fs/fdescfs/fdesc_vnops.c b/sys/fs/fdescfs/fdesc_vnops.c new file mode 100644 index 00000000000..00d8675aea2 --- /dev/null +++ b/sys/fs/fdescfs/fdesc_vnops.c @@ -0,0 +1,974 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)fdesc_vnops.c 8.9 (Berkeley) 1/21/94 + * + * $Id: fdesc_vnops.c,v 1.12 1993/04/06 16:17:17 jsp Exp $ + */ + +/* + * /dev/fd Filesystem + */ + +#include +#include +#include +#include +#include +#include /* boottime */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define cttyvp(p) ((p)->p_flag & P_CONTROLT ? 
(p)->p_session->s_ttyvp : NULL) + +#define FDL_WANT 0x01 +#define FDL_LOCKED 0x02 +static int fdcache_lock; + +dev_t devctty; + +#if (FD_STDIN != FD_STDOUT-1) || (FD_STDOUT != FD_STDERR-1) +FD_STDIN, FD_STDOUT, FD_STDERR must be a sequence n, n+1, n+2 +#endif + +#define NFDCACHE 3 +#define FD_NHASH(ix) ((ix) & NFDCACHE) + +/* + * Cache head + */ +struct fdcache { + struct fdescnode *fc_forw; + struct fdescnode *fc_back; +}; + +static struct fdcache fdcache[NFDCACHE]; + +/* + * Initialise cache headers + */ +fdesc_init() +{ + struct fdcache *fc; + + devctty = makedev(nchrdev, 0); + + for (fc = fdcache; fc < fdcache + NFDCACHE; fc++) + fc->fc_forw = fc->fc_back = (struct fdescnode *) fc; +} + +/* + * Compute hash list for given target vnode + */ +static struct fdcache * +fdesc_hash(ix) + int ix; +{ + + return (&fdcache[FD_NHASH(ix)]); +} + +int +fdesc_allocvp(ftype, ix, mp, vpp) + fdntype ftype; + int ix; + struct mount *mp; + struct vnode **vpp; +{ + struct fdcache *fc; + struct fdescnode *fd; + int error = 0; + +loop: + fc = fdesc_hash(ix); + for (fd = fc->fc_forw; fd != (struct fdescnode *) fc; fd = fd->fd_forw) { + if (fd->fd_ix == ix && fd->fd_vnode->v_mount == mp) { + if (vget(fd->fd_vnode, 0)) + goto loop; + *vpp = fd->fd_vnode; + return (error); + } + } + + /* + * otherwise lock the array while we call getnewvnode + * since that can block. 
+ */ + if (fdcache_lock & FDL_LOCKED) { + fdcache_lock |= FDL_WANT; + sleep((caddr_t) &fdcache_lock, PINOD); + goto loop; + } + fdcache_lock |= FDL_LOCKED; + + error = getnewvnode(VT_FDESC, mp, fdesc_vnodeop_p, vpp); + if (error) + goto out; + MALLOC(fd, void *, sizeof(struct fdescnode), M_TEMP, M_WAITOK); + (*vpp)->v_data = fd; + fd->fd_vnode = *vpp; + fd->fd_type = ftype; + fd->fd_fd = -1; + fd->fd_link = 0; + fd->fd_ix = ix; + fc = fdesc_hash(ix); + insque(fd, fc); + +out:; + fdcache_lock &= ~FDL_LOCKED; + + if (fdcache_lock & FDL_WANT) { + fdcache_lock &= ~FDL_WANT; + wakeup((caddr_t) &fdcache_lock); + } + + return (error); +} + +/* + * vp is the current namei directory + * ndp is the name to locate in that directory... + */ +int +fdesc_lookup(ap) + struct vop_lookup_args /* { + struct vnode * a_dvp; + struct vnode ** a_vpp; + struct componentname * a_cnp; + } */ *ap; +{ + struct vnode **vpp = ap->a_vpp; + struct vnode *dvp = ap->a_dvp; + char *pname; + struct proc *p; + int nfiles; + unsigned fd; + int error; + struct vnode *fvp; + char *ln; + + pname = ap->a_cnp->cn_nameptr; + if (ap->a_cnp->cn_namelen == 1 && *pname == '.') { + *vpp = dvp; + VREF(dvp); + VOP_LOCK(dvp); + return (0); + } + + p = ap->a_cnp->cn_proc; + nfiles = p->p_fd->fd_nfiles; + + switch (VTOFDESC(dvp)->fd_type) { + default: + case Flink: + case Fdesc: + case Fctty: + error = ENOTDIR; + goto bad; + + case Froot: + if (ap->a_cnp->cn_namelen == 2 && bcmp(pname, "fd", 2) == 0) { + error = fdesc_allocvp(Fdevfd, FD_DEVFD, dvp->v_mount, &fvp); + if (error) + goto bad; + *vpp = fvp; + fvp->v_type = VDIR; + VOP_LOCK(fvp); + return (0); + } + + if (ap->a_cnp->cn_namelen == 3 && bcmp(pname, "tty", 3) == 0) { + struct vnode *ttyvp = cttyvp(p); + if (ttyvp == NULL) { + error = ENXIO; + goto bad; + } + error = fdesc_allocvp(Fctty, FD_CTTY, dvp->v_mount, &fvp); + if (error) + goto bad; + *vpp = fvp; + fvp->v_type = VFIFO; + VOP_LOCK(fvp); + return (0); + } + + ln = 0; + switch (ap->a_cnp->cn_namelen) { + 
case 5: + if (bcmp(pname, "stdin", 5) == 0) { + ln = "fd/0"; + fd = FD_STDIN; + } + break; + case 6: + if (bcmp(pname, "stdout", 6) == 0) { + ln = "fd/1"; + fd = FD_STDOUT; + } else + if (bcmp(pname, "stderr", 6) == 0) { + ln = "fd/2"; + fd = FD_STDERR; + } + break; + } + + if (ln) { + error = fdesc_allocvp(Flink, fd, dvp->v_mount, &fvp); + if (error) + goto bad; + VTOFDESC(fvp)->fd_link = ln; + *vpp = fvp; + fvp->v_type = VLNK; + VOP_LOCK(fvp); + return (0); + } else { + error = ENOENT; + goto bad; + } + + /* FALL THROUGH */ + + case Fdevfd: + if (ap->a_cnp->cn_namelen == 2 && bcmp(pname, "..", 2) == 0) { + error = fdesc_root(dvp->v_mount, vpp); + return (error); + } + + fd = 0; + while (*pname >= '0' && *pname <= '9') { + fd = 10 * fd + *pname++ - '0'; + if (fd >= nfiles) + break; + } + + if (*pname != '\0') { + error = ENOENT; + goto bad; + } + + if (fd >= nfiles || p->p_fd->fd_ofiles[fd] == NULL) { + error = EBADF; + goto bad; + } + + error = fdesc_allocvp(Fdesc, FD_DESC+fd, dvp->v_mount, &fvp); + if (error) + goto bad; + VTOFDESC(fvp)->fd_fd = fd; + *vpp = fvp; + return (0); + } + +bad:; + *vpp = NULL; + return (error); +} + +int +fdesc_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + int error = 0; + + switch (VTOFDESC(vp)->fd_type) { + case Fdesc: + /* + * XXX Kludge: set p->p_dupfd to contain the value of the + * the file descriptor being sought for duplication. The error + * return ensures that the vnode for this device will be + * released by vn_open. Open will detect this special error and + * take the actions in dupfdopen. Other callers of vn_open or + * VOP_OPEN will simply report the error. 
+ */ + ap->a_p->p_dupfd = VTOFDESC(vp)->fd_fd; /* XXX */ + error = ENODEV; + break; + + case Fctty: + error = cttyopen(devctty, ap->a_mode, 0, ap->a_p); + break; + } + + return (error); +} + +static int +fdesc_attr(fd, vap, cred, p) + int fd; + struct vattr *vap; + struct ucred *cred; + struct proc *p; +{ + struct filedesc *fdp = p->p_fd; + struct file *fp; + struct stat stb; + int error; + + if (fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) + return (EBADF); + + switch (fp->f_type) { + case DTYPE_VNODE: + error = VOP_GETATTR((struct vnode *) fp->f_data, vap, cred, p); + if (error == 0 && vap->va_type == VDIR) { + /* + * don't allow directories to show up because + * that causes loops in the namespace. + */ + vap->va_type = VFIFO; + } + break; + + case DTYPE_SOCKET: + error = soo_stat((struct socket *)fp->f_data, &stb); + if (error == 0) { + vattr_null(vap); + vap->va_type = VSOCK; + vap->va_mode = stb.st_mode; + vap->va_nlink = stb.st_nlink; + vap->va_uid = stb.st_uid; + vap->va_gid = stb.st_gid; + vap->va_fsid = stb.st_dev; + vap->va_fileid = stb.st_ino; + vap->va_size = stb.st_size; + vap->va_blocksize = stb.st_blksize; + vap->va_atime = stb.st_atimespec; + vap->va_mtime = stb.st_mtimespec; + vap->va_ctime = stb.st_ctimespec; + vap->va_gen = stb.st_gen; + vap->va_flags = stb.st_flags; + vap->va_rdev = stb.st_rdev; + vap->va_bytes = stb.st_blocks * stb.st_blksize; + } + break; + + default: + panic("fdesc attr"); + break; + } + + return (error); +} + +int +fdesc_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct vattr *vap = ap->a_vap; + unsigned fd; + int error = 0; + + switch (VTOFDESC(vp)->fd_type) { + case Froot: + case Fdevfd: + case Flink: + case Fctty: + bzero((caddr_t) vap, sizeof(*vap)); + vattr_null(vap); + vap->va_fileid = VTOFDESC(vp)->fd_ix; + + switch (VTOFDESC(vp)->fd_type) { + case Flink: + 
vap->va_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH; + vap->va_type = VLNK; + vap->va_nlink = 1; + vap->va_size = strlen(VTOFDESC(vp)->fd_link); + break; + + case Fctty: + vap->va_mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH; + vap->va_type = VFIFO; + vap->va_nlink = 1; + vap->va_size = 0; + break; + + default: + vap->va_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH; + vap->va_type = VDIR; + vap->va_nlink = 2; + vap->va_size = DEV_BSIZE; + break; + } + vap->va_uid = 0; + vap->va_gid = 0; + vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; + vap->va_blocksize = DEV_BSIZE; + vap->va_atime.ts_sec = boottime.tv_sec; + vap->va_atime.ts_nsec = 0; + vap->va_mtime = vap->va_atime; + vap->va_ctime = vap->va_mtime; + vap->va_gen = 0; + vap->va_flags = 0; + vap->va_rdev = 0; + vap->va_bytes = 0; + break; + + case Fdesc: + fd = VTOFDESC(vp)->fd_fd; + error = fdesc_attr(fd, vap, ap->a_cred, ap->a_p); + break; + + default: + panic("fdesc_getattr"); + break; + } + + if (error == 0) + vp->v_type = vap->va_type; + + return (error); +} + +int +fdesc_setattr(ap) + struct vop_setattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct filedesc *fdp = ap->a_p->p_fd; + struct file *fp; + unsigned fd; + int error; + + /* + * Can't mess with the root vnode + */ + switch (VTOFDESC(ap->a_vp)->fd_type) { + case Fdesc: + break; + + case Fctty: + return (0); + + default: + return (EACCES); + } + + fd = VTOFDESC(ap->a_vp)->fd_fd; + if (fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) { + return (EBADF); + } + + /* + * Can setattr the underlying vnode, but not sockets! 
+ */ + switch (fp->f_type) { + case DTYPE_VNODE: + error = VOP_SETATTR((struct vnode *) fp->f_data, ap->a_vap, ap->a_cred, ap->a_p); + break; + + case DTYPE_SOCKET: + error = 0; + break; + + default: + panic("fdesc setattr"); + break; + } + + return (error); +} + +#define UIO_MX 16 + +static struct dirtmp { + u_long d_fileno; + u_short d_reclen; + u_short d_namlen; + char d_name[8]; +} rootent[] = { + { FD_DEVFD, UIO_MX, 2, "fd" }, + { FD_STDIN, UIO_MX, 5, "stdin" }, + { FD_STDOUT, UIO_MX, 6, "stdout" }, + { FD_STDERR, UIO_MX, 6, "stderr" }, + { FD_CTTY, UIO_MX, 3, "tty" }, + { 0 } +}; + +int +fdesc_readdir(ap) + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + struct uio *uio = ap->a_uio; + struct filedesc *fdp; + int i; + int error; + + switch (VTOFDESC(ap->a_vp)->fd_type) { + case Fctty: + return (0); + + case Fdesc: + return (ENOTDIR); + + default: + break; + } + + fdp = uio->uio_procp->p_fd; + + if (VTOFDESC(ap->a_vp)->fd_type == Froot) { + struct dirent d; + struct dirent *dp = &d; + struct dirtmp *dt; + + i = uio->uio_offset / UIO_MX; + error = 0; + + while (uio->uio_resid > 0) { + dt = &rootent[i]; + if (dt->d_fileno == 0) { + /**eofflagp = 1;*/ + break; + } + i++; + + switch (dt->d_fileno) { + case FD_CTTY: + if (cttyvp(uio->uio_procp) == NULL) + continue; + break; + + case FD_STDIN: + case FD_STDOUT: + case FD_STDERR: + if ((dt->d_fileno-FD_STDIN) >= fdp->fd_nfiles) + continue; + if (fdp->fd_ofiles[dt->d_fileno-FD_STDIN] == NULL) + continue; + break; + } + bzero((caddr_t) dp, UIO_MX); + dp->d_fileno = dt->d_fileno; + dp->d_namlen = dt->d_namlen; + dp->d_type = DT_UNKNOWN; + dp->d_reclen = dt->d_reclen; + bcopy(dt->d_name, dp->d_name, dp->d_namlen+1); + error = uiomove((caddr_t) dp, UIO_MX, uio); + if (error) + break; + } + uio->uio_offset = i * UIO_MX; + return (error); + } + + i = uio->uio_offset / UIO_MX; + error = 0; + while (uio->uio_resid > 0) { + if (i >= fdp->fd_nfiles) + break; + + if 
(fdp->fd_ofiles[i] != NULL) { + struct dirent d; + struct dirent *dp = &d; + + bzero((caddr_t) dp, UIO_MX); + + dp->d_namlen = sprintf(dp->d_name, "%d", i); + dp->d_reclen = UIO_MX; + dp->d_type = DT_UNKNOWN; + dp->d_fileno = i + FD_STDIN; + /* + * And ship to userland + */ + error = uiomove((caddr_t) dp, UIO_MX, uio); + if (error) + break; + } + i++; + } + + uio->uio_offset = i * UIO_MX; + return (error); +} + +int +fdesc_readlink(ap) + struct vop_readlink_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + int error; + + if (vp->v_type != VLNK) + return (EPERM); + + if (VTOFDESC(vp)->fd_type == Flink) { + char *ln = VTOFDESC(vp)->fd_link; + error = uiomove(ln, strlen(ln), ap->a_uio); + } else { + error = EOPNOTSUPP; + } + + return (error); +} + +int +fdesc_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + int error = EOPNOTSUPP; + + switch (VTOFDESC(ap->a_vp)->fd_type) { + case Fctty: + error = cttyread(devctty, ap->a_uio, ap->a_ioflag); + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + +int +fdesc_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + int error = EOPNOTSUPP; + + switch (VTOFDESC(ap->a_vp)->fd_type) { + case Fctty: + error = cttywrite(devctty, ap->a_uio, ap->a_ioflag); + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + +int +fdesc_ioctl(ap) + struct vop_ioctl_args /* { + struct vnode *a_vp; + int a_command; + caddr_t a_data; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + int error = EOPNOTSUPP; + + switch (VTOFDESC(ap->a_vp)->fd_type) { + case Fctty: + error = cttyioctl(devctty, ap->a_command, ap->a_data, + ap->a_fflag, ap->a_p); + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + +int 
+fdesc_select(ap) + struct vop_select_args /* { + struct vnode *a_vp; + int a_which; + int a_fflags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + int error = EOPNOTSUPP; + + switch (VTOFDESC(ap->a_vp)->fd_type) { + case Fctty: + error = cttyselect(devctty, ap->a_fflags, ap->a_p); + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + +int +fdesc_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + /* + * Clear out the v_type field to avoid + * nasty things happening in vgone(). + */ + vp->v_type = VNON; + return (0); +} + +int +fdesc_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + remque(VTOFDESC(vp)); + FREE(vp->v_data, M_TEMP); + vp->v_data = 0; + + return (0); +} + +/* + * Return POSIX pathconf information applicable to special devices. + */ +fdesc_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = LINK_MAX; + return (0); + case _PC_MAX_CANON: + *ap->a_retval = MAX_CANON; + return (0); + case _PC_MAX_INPUT: + *ap->a_retval = MAX_INPUT; + return (0); + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + return (0); + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + return (0); + case _PC_VDISABLE: + *ap->a_retval = _POSIX_VDISABLE; + return (0); + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Print out the contents of a /dev/fd vnode. 
+ */ +/* ARGSUSED */ +int +fdesc_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + printf("tag VT_NON, fdesc vnode\n"); + return (0); +} + +/*void*/ +int +fdesc_vfree(ap) + struct vop_vfree_args /* { + struct vnode *a_pvp; + ino_t a_ino; + int a_mode; + } */ *ap; +{ + + return (0); +} + +/* + * /dev/fd vnode unsupported operation + */ +int +fdesc_enotsupp() +{ + + return (EOPNOTSUPP); +} + +/* + * /dev/fd "should never get here" operation + */ +int +fdesc_badop() +{ + + panic("fdesc: bad op"); + /* NOTREACHED */ +} + +/* + * /dev/fd vnode null operation + */ +int +fdesc_nullop() +{ + + return (0); +} + +#define fdesc_create ((int (*) __P((struct vop_create_args *)))fdesc_enotsupp) +#define fdesc_mknod ((int (*) __P((struct vop_mknod_args *)))fdesc_enotsupp) +#define fdesc_close ((int (*) __P((struct vop_close_args *)))nullop) +#define fdesc_access ((int (*) __P((struct vop_access_args *)))nullop) +#define fdesc_mmap ((int (*) __P((struct vop_mmap_args *)))fdesc_enotsupp) +#define fdesc_fsync ((int (*) __P((struct vop_fsync_args *)))nullop) +#define fdesc_seek ((int (*) __P((struct vop_seek_args *)))nullop) +#define fdesc_remove ((int (*) __P((struct vop_remove_args *)))fdesc_enotsupp) +#define fdesc_link ((int (*) __P((struct vop_link_args *)))fdesc_enotsupp) +#define fdesc_rename ((int (*) __P((struct vop_rename_args *)))fdesc_enotsupp) +#define fdesc_mkdir ((int (*) __P((struct vop_mkdir_args *)))fdesc_enotsupp) +#define fdesc_rmdir ((int (*) __P((struct vop_rmdir_args *)))fdesc_enotsupp) +#define fdesc_symlink ((int (*) __P((struct vop_symlink_args *)))fdesc_enotsupp) +#define fdesc_abortop ((int (*) __P((struct vop_abortop_args *)))nullop) +#define fdesc_lock ((int (*) __P((struct vop_lock_args *)))nullop) +#define fdesc_unlock ((int (*) __P((struct vop_unlock_args *)))nullop) +#define fdesc_bmap ((int (*) __P((struct vop_bmap_args *)))fdesc_badop) +#define fdesc_strategy ((int (*) __P((struct vop_strategy_args *)))fdesc_badop) 
+#define fdesc_islocked ((int (*) __P((struct vop_islocked_args *)))nullop) +#define fdesc_advlock ((int (*) __P((struct vop_advlock_args *)))fdesc_enotsupp) +#define fdesc_blkatoff \ + ((int (*) __P((struct vop_blkatoff_args *)))fdesc_enotsupp) +#define fdesc_vget ((int (*) __P((struct vop_vget_args *)))fdesc_enotsupp) +#define fdesc_valloc ((int(*) __P(( \ + struct vnode *pvp, \ + int mode, \ + struct ucred *cred, \ + struct vnode **vpp))) fdesc_enotsupp) +#define fdesc_truncate \ + ((int (*) __P((struct vop_truncate_args *)))fdesc_enotsupp) +#define fdesc_update ((int (*) __P((struct vop_update_args *)))fdesc_enotsupp) +#define fdesc_bwrite ((int (*) __P((struct vop_bwrite_args *)))fdesc_enotsupp) + +int (**fdesc_vnodeop_p)(); +struct vnodeopv_entry_desc fdesc_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, fdesc_lookup }, /* lookup */ + { &vop_create_desc, fdesc_create }, /* create */ + { &vop_mknod_desc, fdesc_mknod }, /* mknod */ + { &vop_open_desc, fdesc_open }, /* open */ + { &vop_close_desc, fdesc_close }, /* close */ + { &vop_access_desc, fdesc_access }, /* access */ + { &vop_getattr_desc, fdesc_getattr }, /* getattr */ + { &vop_setattr_desc, fdesc_setattr }, /* setattr */ + { &vop_read_desc, fdesc_read }, /* read */ + { &vop_write_desc, fdesc_write }, /* write */ + { &vop_ioctl_desc, fdesc_ioctl }, /* ioctl */ + { &vop_select_desc, fdesc_select }, /* select */ + { &vop_mmap_desc, fdesc_mmap }, /* mmap */ + { &vop_fsync_desc, fdesc_fsync }, /* fsync */ + { &vop_seek_desc, fdesc_seek }, /* seek */ + { &vop_remove_desc, fdesc_remove }, /* remove */ + { &vop_link_desc, fdesc_link }, /* link */ + { &vop_rename_desc, fdesc_rename }, /* rename */ + { &vop_mkdir_desc, fdesc_mkdir }, /* mkdir */ + { &vop_rmdir_desc, fdesc_rmdir }, /* rmdir */ + { &vop_symlink_desc, fdesc_symlink }, /* symlink */ + { &vop_readdir_desc, fdesc_readdir }, /* readdir */ + { &vop_readlink_desc, fdesc_readlink }, /* readlink */ + { 
&vop_abortop_desc, fdesc_abortop }, /* abortop */ + { &vop_inactive_desc, fdesc_inactive }, /* inactive */ + { &vop_reclaim_desc, fdesc_reclaim }, /* reclaim */ + { &vop_lock_desc, fdesc_lock }, /* lock */ + { &vop_unlock_desc, fdesc_unlock }, /* unlock */ + { &vop_bmap_desc, fdesc_bmap }, /* bmap */ + { &vop_strategy_desc, fdesc_strategy }, /* strategy */ + { &vop_print_desc, fdesc_print }, /* print */ + { &vop_islocked_desc, fdesc_islocked }, /* islocked */ + { &vop_pathconf_desc, fdesc_pathconf }, /* pathconf */ + { &vop_advlock_desc, fdesc_advlock }, /* advlock */ + { &vop_blkatoff_desc, fdesc_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, fdesc_valloc }, /* valloc */ + { &vop_vfree_desc, fdesc_vfree }, /* vfree */ + { &vop_truncate_desc, fdesc_truncate }, /* truncate */ + { &vop_update_desc, fdesc_update }, /* update */ + { &vop_bwrite_desc, fdesc_bwrite }, /* bwrite */ + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc fdesc_vnodeop_opv_desc = + { &fdesc_vnodeop_p, fdesc_vnodeop_entries }; diff --git a/sys/fs/fifofs/fifo.h b/sys/fs/fifofs/fifo.h new file mode 100644 index 00000000000..e89186d8b89 --- /dev/null +++ b/sys/fs/fifofs/fifo.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)fifo.h 8.2 (Berkeley) 2/2/94 + */ + +#ifdef FIFO +/* + * Prototypes for fifo operations on vnodes. 
+ */ +int fifo_badop(), + fifo_ebadf(); + +int fifo_lookup __P((struct vop_lookup_args *)); +#define fifo_create ((int (*) __P((struct vop_create_args *)))fifo_badop) +#define fifo_mknod ((int (*) __P((struct vop_mknod_args *)))fifo_badop) +int fifo_open __P((struct vop_open_args *)); +int fifo_close __P((struct vop_close_args *)); +#define fifo_access ((int (*) __P((struct vop_access_args *)))fifo_ebadf) +#define fifo_getattr ((int (*) __P((struct vop_getattr_args *)))fifo_ebadf) +#define fifo_setattr ((int (*) __P((struct vop_setattr_args *)))fifo_ebadf) +int fifo_read __P((struct vop_read_args *)); +int fifo_write __P((struct vop_write_args *)); +int fifo_ioctl __P((struct vop_ioctl_args *)); +int fifo_select __P((struct vop_select_args *)); +#define fifo_mmap ((int (*) __P((struct vop_mmap_args *)))fifo_badop) +#define fifo_fsync ((int (*) __P((struct vop_fsync_args *)))nullop) +#define fifo_seek ((int (*) __P((struct vop_seek_args *)))fifo_badop) +#define fifo_remove ((int (*) __P((struct vop_remove_args *)))fifo_badop) +#define fifo_link ((int (*) __P((struct vop_link_args *)))fifo_badop) +#define fifo_rename ((int (*) __P((struct vop_rename_args *)))fifo_badop) +#define fifo_mkdir ((int (*) __P((struct vop_mkdir_args *)))fifo_badop) +#define fifo_rmdir ((int (*) __P((struct vop_rmdir_args *)))fifo_badop) +#define fifo_symlink ((int (*) __P((struct vop_symlink_args *)))fifo_badop) +#define fifo_readdir ((int (*) __P((struct vop_readdir_args *)))fifo_badop) +#define fifo_readlink ((int (*) __P((struct vop_readlink_args *)))fifo_badop) +#define fifo_abortop ((int (*) __P((struct vop_abortop_args *)))fifo_badop) +#define fifo_inactive ((int (*) __P((struct vop_inactive_args *)))nullop) +#define fifo_reclaim ((int (*) __P((struct vop_reclaim_args *)))nullop) +int fifo_lock __P((struct vop_lock_args *)); +int fifo_unlock __P((struct vop_unlock_args *)); +int fifo_bmap __P((struct vop_bmap_args *)); +#define fifo_strategy ((int (*) __P((struct vop_strategy_args 
*)))fifo_badop) +int fifo_print __P((struct vop_print_args *)); +#define fifo_islocked ((int (*) __P((struct vop_islocked_args *)))nullop) +int fifo_pathconf __P((struct vop_pathconf_args *)); +int fifo_advlock __P((struct vop_advlock_args *)); +#define fifo_blkatoff ((int (*) __P((struct vop_blkatoff_args *)))fifo_badop) +#define fifo_valloc ((int (*) __P((struct vop_valloc_args *)))fifo_badop) +#define fifo_reallocblks \ + ((int (*) __P((struct vop_reallocblks_args *)))fifo_badop) +#define fifo_vfree ((int (*) __P((struct vop_vfree_args *)))fifo_badop) +#define fifo_truncate ((int (*) __P((struct vop_truncate_args *)))nullop) +#define fifo_update ((int (*) __P((struct vop_update_args *)))nullop) +#define fifo_bwrite ((int (*) __P((struct vop_bwrite_args *)))nullop) +#endif /* FIFO */ diff --git a/sys/fs/fifofs/fifo_vnops.c b/sys/fs/fifofs/fifo_vnops.c new file mode 100644 index 00000000000..bad33a430b6 --- /dev/null +++ b/sys/fs/fifofs/fifo_vnops.c @@ -0,0 +1,494 @@ +/* + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)fifo_vnops.c 8.2 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This structure is associated with the FIFO vnode and stores + * the state associated with the FIFO. 
+ */ +struct fifoinfo { + struct socket *fi_readsock; + struct socket *fi_writesock; + long fi_readers; + long fi_writers; +}; + +int (**fifo_vnodeop_p)(); +struct vnodeopv_entry_desc fifo_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, fifo_lookup }, /* lookup */ + { &vop_create_desc, fifo_create }, /* create */ + { &vop_mknod_desc, fifo_mknod }, /* mknod */ + { &vop_open_desc, fifo_open }, /* open */ + { &vop_close_desc, fifo_close }, /* close */ + { &vop_access_desc, fifo_access }, /* access */ + { &vop_getattr_desc, fifo_getattr }, /* getattr */ + { &vop_setattr_desc, fifo_setattr }, /* setattr */ + { &vop_read_desc, fifo_read }, /* read */ + { &vop_write_desc, fifo_write }, /* write */ + { &vop_ioctl_desc, fifo_ioctl }, /* ioctl */ + { &vop_select_desc, fifo_select }, /* select */ + { &vop_mmap_desc, fifo_mmap }, /* mmap */ + { &vop_fsync_desc, fifo_fsync }, /* fsync */ + { &vop_seek_desc, fifo_seek }, /* seek */ + { &vop_remove_desc, fifo_remove }, /* remove */ + { &vop_link_desc, fifo_link }, /* link */ + { &vop_rename_desc, fifo_rename }, /* rename */ + { &vop_mkdir_desc, fifo_mkdir }, /* mkdir */ + { &vop_rmdir_desc, fifo_rmdir }, /* rmdir */ + { &vop_symlink_desc, fifo_symlink }, /* symlink */ + { &vop_readdir_desc, fifo_readdir }, /* readdir */ + { &vop_readlink_desc, fifo_readlink }, /* readlink */ + { &vop_abortop_desc, fifo_abortop }, /* abortop */ + { &vop_inactive_desc, fifo_inactive }, /* inactive */ + { &vop_reclaim_desc, fifo_reclaim }, /* reclaim */ + { &vop_lock_desc, fifo_lock }, /* lock */ + { &vop_unlock_desc, fifo_unlock }, /* unlock */ + { &vop_bmap_desc, fifo_bmap }, /* bmap */ + { &vop_strategy_desc, fifo_strategy }, /* strategy */ + { &vop_print_desc, fifo_print }, /* print */ + { &vop_islocked_desc, fifo_islocked }, /* islocked */ + { &vop_pathconf_desc, fifo_pathconf }, /* pathconf */ + { &vop_advlock_desc, fifo_advlock }, /* advlock */ + { &vop_blkatoff_desc, fifo_blkatoff }, /* blkatoff */ + { 
&vop_valloc_desc, fifo_valloc }, /* valloc */ + { &vop_vfree_desc, fifo_vfree }, /* vfree */ + { &vop_truncate_desc, fifo_truncate }, /* truncate */ + { &vop_update_desc, fifo_update }, /* update */ + { &vop_bwrite_desc, fifo_bwrite }, /* bwrite */ + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc fifo_vnodeop_opv_desc = + { &fifo_vnodeop_p, fifo_vnodeop_entries }; + +/* + * Trivial lookup routine that always fails. + */ +/* ARGSUSED */ +fifo_lookup(ap) + struct vop_lookup_args /* { + struct vnode * a_dvp; + struct vnode ** a_vpp; + struct componentname * a_cnp; + } */ *ap; +{ + + *ap->a_vpp = NULL; + return (ENOTDIR); +} + +/* + * Open called to set up a new instance of a fifo or + * to find an active instance of a fifo. + */ +/* ARGSUSED */ +fifo_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct fifoinfo *fip; + struct socket *rso, *wso; + int error; + static char openstr[] = "fifo"; + + if ((ap->a_mode & (FREAD|FWRITE)) == (FREAD|FWRITE)) + return (EINVAL); + if ((fip = vp->v_fifoinfo) == NULL) { + MALLOC(fip, struct fifoinfo *, sizeof(*fip), M_VNODE, M_WAITOK); + vp->v_fifoinfo = fip; + if (error = socreate(AF_UNIX, &rso, SOCK_STREAM, 0)) { + free(fip, M_VNODE); + vp->v_fifoinfo = NULL; + return (error); + } + fip->fi_readsock = rso; + if (error = socreate(AF_UNIX, &wso, SOCK_STREAM, 0)) { + (void)soclose(rso); + free(fip, M_VNODE); + vp->v_fifoinfo = NULL; + return (error); + } + fip->fi_writesock = wso; + if (error = unp_connect2(wso, rso)) { + (void)soclose(wso); + (void)soclose(rso); + free(fip, M_VNODE); + vp->v_fifoinfo = NULL; + return (error); + } + fip->fi_readers = fip->fi_writers = 0; + wso->so_state |= SS_CANTRCVMORE; + rso->so_state |= SS_CANTSENDMORE; + } + error = 0; + if (ap->a_mode & FREAD) { + fip->fi_readers++; + if (fip->fi_readers == 1) { + fip->fi_writesock->so_state &= 
~SS_CANTSENDMORE; + if (fip->fi_writers > 0) + wakeup((caddr_t)&fip->fi_writers); + } + if (ap->a_mode & O_NONBLOCK) + return (0); + while (fip->fi_writers == 0) { + VOP_UNLOCK(vp); + error = tsleep((caddr_t)&fip->fi_readers, + PCATCH | PSOCK, openstr, 0); + VOP_LOCK(vp); + if (error) + break; + } + } else { + fip->fi_writers++; + if (fip->fi_readers == 0 && (ap->a_mode & O_NONBLOCK)) { + error = ENXIO; + } else { + if (fip->fi_writers == 1) { + fip->fi_readsock->so_state &= ~SS_CANTRCVMORE; + if (fip->fi_readers > 0) + wakeup((caddr_t)&fip->fi_readers); + } + while (fip->fi_readers == 0) { + VOP_UNLOCK(vp); + error = tsleep((caddr_t)&fip->fi_writers, + PCATCH | PSOCK, openstr, 0); + VOP_LOCK(vp); + if (error) + break; + } + } + } + if (error) + VOP_CLOSE(vp, ap->a_mode, ap->a_cred, ap->a_p); + return (error); +} + +/* + * Vnode op for read + */ +/* ARGSUSED */ +fifo_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + register struct uio *uio = ap->a_uio; + register struct socket *rso = ap->a_vp->v_fifoinfo->fi_readsock; + int error, startresid; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_READ) + panic("fifo_read mode"); +#endif + if (uio->uio_resid == 0) + return (0); + if (ap->a_ioflag & IO_NDELAY) + rso->so_state |= SS_NBIO; + startresid = uio->uio_resid; + VOP_UNLOCK(ap->a_vp); + error = soreceive(rso, (struct mbuf **)0, uio, (int *)0, + (struct mbuf **)0, (struct mbuf **)0); + VOP_LOCK(ap->a_vp); + /* + * Clear EOF indication after first such return. 
+ */ + if (uio->uio_resid == startresid) + rso->so_state &= ~SS_CANTRCVMORE; + if (ap->a_ioflag & IO_NDELAY) + rso->so_state &= ~SS_NBIO; + return (error); +} + +/* + * Vnode op for write + */ +/* ARGSUSED */ +fifo_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + struct socket *wso = ap->a_vp->v_fifoinfo->fi_writesock; + int error; + +#ifdef DIAGNOSTIC + if (ap->a_uio->uio_rw != UIO_WRITE) + panic("fifo_write mode"); +#endif + if (ap->a_ioflag & IO_NDELAY) + wso->so_state |= SS_NBIO; + VOP_UNLOCK(ap->a_vp); + error = sosend(wso, (struct mbuf *)0, ap->a_uio, 0, (struct mbuf *)0, 0); + VOP_LOCK(ap->a_vp); + if (ap->a_ioflag & IO_NDELAY) + wso->so_state &= ~SS_NBIO; + return (error); +} + +/* + * Device ioctl operation. + */ +/* ARGSUSED */ +fifo_ioctl(ap) + struct vop_ioctl_args /* { + struct vnode *a_vp; + int a_command; + caddr_t a_data; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct file filetmp; + + if (ap->a_command == FIONBIO) + return (0); + if (ap->a_fflag & FREAD) + filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_readsock; + else + filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_writesock; + return (soo_ioctl(&filetmp, ap->a_command, ap->a_data, ap->a_p)); +} + +/* ARGSUSED */ +fifo_select(ap) + struct vop_select_args /* { + struct vnode *a_vp; + int a_which; + int a_fflags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct file filetmp; + + if (ap->a_fflags & FREAD) + filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_readsock; + else + filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_writesock; + return (soo_select(&filetmp, ap->a_which, ap->a_p)); +} + +/* + * This is a noop, simply returning what one has been given. 
+ */ +fifo_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + } */ *ap; +{ + + if (ap->a_vpp != NULL) + *ap->a_vpp = ap->a_vp; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn; + return (0); +} + +/* + * At the moment we do not do any locking. + */ +/* ARGSUSED */ +fifo_lock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + return (0); +} + +/* ARGSUSED */ +fifo_unlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + return (0); +} + +/* + * Device close routine + */ +/* ARGSUSED */ +fifo_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct fifoinfo *fip = vp->v_fifoinfo; + int error1, error2; + + if (ap->a_fflag & FWRITE) { + fip->fi_writers--; + if (fip->fi_writers == 0) + socantrcvmore(fip->fi_readsock); + } else { + fip->fi_readers--; + if (fip->fi_readers == 0) + socantsendmore(fip->fi_writesock); + } + if (vp->v_usecount > 1) + return (0); + error1 = soclose(fip->fi_readsock); + error2 = soclose(fip->fi_writesock); + FREE(fip, M_VNODE); + vp->v_fifoinfo = NULL; + if (error1) + return (error1); + return (error2); +} + +/* + * Print out the contents of a fifo vnode. + */ +fifo_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + printf("tag VT_NON"); + fifo_printinfo(ap->a_vp); + printf("\n"); +} + +/* + * Print out internal contents of a fifo vnode. + */ +fifo_printinfo(vp) + struct vnode *vp; +{ + register struct fifoinfo *fip = vp->v_fifoinfo; + + printf(", fifo with %d readers and %d writers", + fip->fi_readers, fip->fi_writers); +} + +/* + * Return POSIX pathconf information applicable to fifo's. 
+ */ +fifo_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = LINK_MAX; + return (0); + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + return (0); + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + return (0); + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Fifo failed operation + */ +fifo_ebadf() +{ + + return (EBADF); +} + +/* + * Fifo advisory byte-level locks. + */ +/* ARGSUSED */ +fifo_advlock(ap) + struct vop_advlock_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap; +{ + + return (EOPNOTSUPP); +} + +/* + * Fifo bad operation + */ +fifo_badop() +{ + + panic("fifo_badop called"); + /* NOTREACHED */ +} diff --git a/sys/fs/nullfs/null.h b/sys/fs/nullfs/null.h new file mode 100644 index 00000000000..14286ffeee0 --- /dev/null +++ b/sys/fs/nullfs/null.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)null.h 8.2 (Berkeley) 1/21/94 + * + * $Id: lofs.h,v 1.8 1992/05/30 10:05:43 jsp Exp jsp $ + */ + +struct null_args { + char *target; /* Target of loopback */ +}; + +struct null_mount { + struct mount *nullm_vfs; + struct vnode *nullm_rootvp; /* Reference to root null_node */ +}; + +#ifdef KERNEL +/* + * A cache of vnode references + */ +struct null_node { + struct null_node *null_forw; /* Hash chain */ + struct null_node *null_back; + struct vnode *null_lowervp; /* VREFed once */ + struct vnode *null_vnode; /* Back pointer */ +}; + +extern int null_node_create __P((struct mount *mp, struct vnode *target, struct vnode **vpp)); + +#define MOUNTTONULLMOUNT(mp) ((struct null_mount *)((mp)->mnt_data)) +#define VTONULL(vp) ((struct null_node *)(vp)->v_data) +#define NULLTOV(xp) ((xp)->null_vnode) +#ifdef NULLFS_DIAGNOSTIC +extern struct vnode *null_checkvp __P((struct vnode *vp, char *fil, int lno)); +#define NULLVPTOLOWERVP(vp) null_checkvp((vp), __FILE__, __LINE__) +#else +#define 
NULLVPTOLOWERVP(vp) (VTONULL(vp)->null_lowervp) +#endif + +extern int (**null_vnodeop_p)(); +extern struct vfsops null_vfsops; +#endif /* KERNEL */ diff --git a/sys/fs/nullfs/null_subr.c b/sys/fs/nullfs/null_subr.c new file mode 100644 index 00000000000..a31723fe4c2 --- /dev/null +++ b/sys/fs/nullfs/null_subr.c @@ -0,0 +1,293 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)null_subr.c 8.4 (Berkeley) 1/21/94 + * + * $Id: lofs_subr.c,v 1.11 1992/05/30 10:05:43 jsp Exp jsp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LOG2_SIZEVNODE 7 /* log2(sizeof struct vnode) */ +#define NNULLNODECACHE 16 +#define NULL_NHASH(vp) ((((u_long)vp)>>LOG2_SIZEVNODE) & (NNULLNODECACHE-1)) + +/* + * Null layer cache: + * Each cache entry holds a reference to the lower vnode + * along with a pointer to the alias vnode. When an + * entry is added the lower vnode is VREF'd. When the + * alias is removed the lower vnode is vrele'd. + */ + +/* + * Cache head + */ +struct null_node_cache { + struct null_node *ac_forw; + struct null_node *ac_back; +}; + +static struct null_node_cache null_node_cache[NNULLNODECACHE]; + +/* + * Initialise cache headers + */ +nullfs_init() +{ + struct null_node_cache *ac; +#ifdef NULLFS_DIAGNOSTIC + printf("nullfs_init\n"); /* printed during system boot */ +#endif + + for (ac = null_node_cache; ac < null_node_cache + NNULLNODECACHE; ac++) + ac->ac_forw = ac->ac_back = (struct null_node *) ac; +} + +/* + * Compute hash list for given lower vnode + */ +static struct null_node_cache * +null_node_hash(lowervp) +struct vnode *lowervp; +{ + + return (&null_node_cache[NULL_NHASH(lowervp)]); +} + +/* + * Return a VREF'ed alias for lower vnode if already exists, else 0. 
+ */ +static struct vnode * +null_node_find(mp, lowervp) + struct mount *mp; + struct vnode *lowervp; +{ + struct null_node_cache *hd; + struct null_node *a; + struct vnode *vp; + + /* + * Find hash base, and then search the (two-way) linked + * list looking for a null_node structure which is referencing + * the lower vnode. If found, the increment the null_node + * reference count (but NOT the lower vnode's VREF counter). + */ + hd = null_node_hash(lowervp); +loop: + for (a = hd->ac_forw; a != (struct null_node *) hd; a = a->null_forw) { + if (a->null_lowervp == lowervp && NULLTOV(a)->v_mount == mp) { + vp = NULLTOV(a); + /* + * We need vget for the VXLOCK + * stuff, but we don't want to lock + * the lower node. + */ + if (vget(vp, 0)) { + printf ("null_node_find: vget failed.\n"); + goto loop; + }; + return (vp); + } + } + + return NULL; +} + + +/* + * Make a new null_node node. + * Vp is the alias vnode, lofsvp is the lower vnode. + * Maintain a reference to (lowervp). + */ +static int +null_node_alloc(mp, lowervp, vpp) + struct mount *mp; + struct vnode *lowervp; + struct vnode **vpp; +{ + struct null_node_cache *hd; + struct null_node *xp; + struct vnode *othervp, *vp; + int error; + + if (error = getnewvnode(VT_NULL, mp, null_vnodeop_p, vpp)) + return (error); + vp = *vpp; + + MALLOC(xp, struct null_node *, sizeof(struct null_node), M_TEMP, M_WAITOK); + vp->v_type = lowervp->v_type; + xp->null_vnode = vp; + vp->v_data = xp; + xp->null_lowervp = lowervp; + /* + * Before we insert our new node onto the hash chains, + * check to see if someone else has beaten us to it. + * (We could have slept in MALLOC.) 
+ */ + if (othervp = null_node_find(lowervp)) { + FREE(xp, M_TEMP); + vp->v_type = VBAD; /* node is discarded */ + vp->v_usecount = 0; /* XXX */ + *vpp = othervp; + return 0; + }; + VREF(lowervp); /* Extra VREF will be vrele'd in null_node_create */ + hd = null_node_hash(lowervp); + insque(xp, hd); + return 0; +} + + +/* + * Try to find an existing null_node vnode refering + * to it, otherwise make a new null_node vnode which + * contains a reference to the lower vnode. + */ +int +null_node_create(mp, lowervp, newvpp) + struct mount *mp; + struct vnode *lowervp; + struct vnode **newvpp; +{ + struct vnode *aliasvp; + + if (aliasvp = null_node_find(mp, lowervp)) { + /* + * null_node_find has taken another reference + * to the alias vnode. + */ +#ifdef NULLFS_DIAGNOSTIC + vprint("null_node_create: exists", NULLTOV(ap)); +#endif + /* VREF(aliasvp); --- done in null_node_find */ + } else { + int error; + + /* + * Get new vnode. + */ +#ifdef NULLFS_DIAGNOSTIC + printf("null_node_create: create new alias vnode\n"); +#endif + + /* + * Make new vnode reference the null_node. + */ + if (error = null_node_alloc(mp, lowervp, &aliasvp)) + return error; + + /* + * aliasvp is already VREF'd by getnewvnode() + */ + } + + vrele(lowervp); + +#ifdef DIAGNOSTIC + if (lowervp->v_usecount < 1) { + /* Should never happen... */ + vprint ("null_node_create: alias "); + vprint ("null_node_create: lower "); + printf ("null_node_create: lower has 0 usecount.\n"); + panic ("null_node_create: lower has 0 usecount."); + }; +#endif + +#ifdef NULLFS_DIAGNOSTIC + vprint("null_node_create: alias", aliasvp); + vprint("null_node_create: lower", lowervp); +#endif + + *newvpp = aliasvp; + return (0); +} +#ifdef NULLFS_DIAGNOSTIC +struct vnode * +null_checkvp(vp, fil, lno) + struct vnode *vp; + char *fil; + int lno; +{ + struct null_node *a = VTONULL(vp); +#ifdef notyet + /* + * Can't do this check because vop_reclaim runs + * with a funny vop vector. 
+ */ + if (vp->v_op != null_vnodeop_p) { + printf ("null_checkvp: on non-null-node\n"); + while (null_checkvp_barrier) /*WAIT*/ ; + panic("null_checkvp"); + }; +#endif + if (a->null_lowervp == NULL) { + /* Should never happen */ + int i; u_long *p; + printf("vp = %x, ZERO ptr\n", vp); + for (p = (u_long *) a, i = 0; i < 8; i++) + printf(" %x", p[i]); + printf("\n"); + /* wait for debugger */ + while (null_checkvp_barrier) /*WAIT*/ ; + panic("null_checkvp"); + } + if (a->null_lowervp->v_usecount < 1) { + int i; u_long *p; + printf("vp = %x, unref'ed lowervp\n", vp); + for (p = (u_long *) a, i = 0; i < 8; i++) + printf(" %x", p[i]); + printf("\n"); + /* wait for debugger */ + while (null_checkvp_barrier) /*WAIT*/ ; + panic ("null with unref'ed lowervp"); + }; +#ifdef notyet + printf("null %x/%d -> %x/%d [%s, %d]\n", + NULLTOV(a), NULLTOV(a)->v_usecount, + a->null_lowervp, a->null_lowervp->v_usecount, + fil, lno); +#endif + return a->null_lowervp; +} +#endif diff --git a/sys/fs/nullfs/null_vfsops.c b/sys/fs/nullfs/null_vfsops.c new file mode 100644 index 00000000000..b0d2df75cda --- /dev/null +++ b/sys/fs/nullfs/null_vfsops.c @@ -0,0 +1,366 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)null_vfsops.c 8.2 (Berkeley) 1/21/94 + * + * @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92 + * $Id: lofs_vfsops.c,v 1.9 1992/05/30 10:26:24 jsp Exp jsp $ + */ + +/* + * Null Layer + * (See null_vnops.c for a description of what this does.) 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Mount null layer + */ +int +nullfs_mount(mp, path, data, ndp, p) + struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + int error = 0; + struct null_args args; + struct vnode *lowerrootvp, *vp; + struct vnode *nullm_rootvp; + struct null_mount *xmp; + u_int size; + +#ifdef NULLFS_DIAGNOSTIC + printf("nullfs_mount(mp = %x)\n", mp); +#endif + + /* + * Update is a no-op + */ + if (mp->mnt_flag & MNT_UPDATE) { + return (EOPNOTSUPP); + /* return VFS_MOUNT(MOUNTTONULLMOUNT(mp)->nullm_vfs, path, data, ndp, p);*/ + } + + /* + * Get argument + */ + if (error = copyin(data, (caddr_t)&args, sizeof(struct null_args))) + return (error); + + /* + * Find lower node + */ + NDINIT(ndp, LOOKUP, FOLLOW|WANTPARENT|LOCKLEAF, + UIO_USERSPACE, args.target, p); + if (error = namei(ndp)) + return (error); + + /* + * Sanity check on lower vnode + */ + lowerrootvp = ndp->ni_vp; + + vrele(ndp->ni_dvp); + ndp->ni_dvp = NULL; + + xmp = (struct null_mount *) malloc(sizeof(struct null_mount), + M_UFSMNT, M_WAITOK); /* XXX */ + + /* + * Save reference to underlying FS + */ + xmp->nullm_vfs = lowerrootvp->v_mount; + + /* + * Save reference. Each mount also holds + * a reference on the root vnode. + */ + error = null_node_create(mp, lowerrootvp, &vp); + /* + * Unlock the node (either the lower or the alias) + */ + VOP_UNLOCK(vp); + /* + * Make sure the node alias worked + */ + if (error) { + vrele(lowerrootvp); + free(xmp, M_UFSMNT); /* XXX */ + return (error); + } + + /* + * Keep a held reference to the root vnode. + * It is vrele'd in nullfs_unmount. 
+ */ + nullm_rootvp = vp; + nullm_rootvp->v_flag |= VROOT; + xmp->nullm_rootvp = nullm_rootvp; + if (NULLVPTOLOWERVP(nullm_rootvp)->v_mount->mnt_flag & MNT_LOCAL) + mp->mnt_flag |= MNT_LOCAL; + mp->mnt_data = (qaddr_t) xmp; + getnewfsid(mp, MOUNT_LOFS); + + (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); + bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); + (void) copyinstr(args.target, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); +#ifdef NULLFS_DIAGNOSTIC + printf("nullfs_mount: lower %s, alias at %s\n", + mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname); +#endif + return (0); +} + +/* + * VFS start. Nothing needed here - the start routine + * on the underlying filesystem will have been called + * when that filesystem was mounted. + */ +int +nullfs_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + return (0); + /* return VFS_START(MOUNTTONULLMOUNT(mp)->nullm_vfs, flags, p); */ +} + +/* + * Free reference to null layer + */ +int +nullfs_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + struct vnode *nullm_rootvp = MOUNTTONULLMOUNT(mp)->nullm_rootvp; + int error; + int flags = 0; + extern int doforce; + +#ifdef NULLFS_DIAGNOSTIC + printf("nullfs_unmount(mp = %x)\n", mp); +#endif + + if (mntflags & MNT_FORCE) { + /* lofs can never be rootfs so don't check for it */ + if (!doforce) + return (EINVAL); + flags |= FORCECLOSE; + } + + /* + * Clear out buffer cache. I don't think we + * ever get anything cached at this level at the + * moment, but who knows... 
+ */ +#if 0 + mntflushbuf(mp, 0); + if (mntinvalbuf(mp, 1)) + return (EBUSY); +#endif + if (nullm_rootvp->v_usecount > 1) + return (EBUSY); + if (error = vflush(mp, nullm_rootvp, flags)) + return (error); + +#ifdef NULLFS_DIAGNOSTIC + vprint("alias root of lower", nullm_rootvp); +#endif + /* + * Release reference on underlying root vnode + */ + vrele(nullm_rootvp); + /* + * And blow it away for future re-use + */ + vgone(nullm_rootvp); + /* + * Finally, throw away the null_mount structure + */ + free(mp->mnt_data, M_UFSMNT); /* XXX */ + mp->mnt_data = 0; + return 0; +} + +int +nullfs_root(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + struct vnode *vp; + +#ifdef NULLFS_DIAGNOSTIC + printf("nullfs_root(mp = %x, vp = %x->%x)\n", mp, + MOUNTTONULLMOUNT(mp)->nullm_rootvp, + NULLVPTOLOWERVP(MOUNTTONULLMOUNT(mp)->nullm_rootvp) + ); +#endif + + /* + * Return locked reference to root. + */ + vp = MOUNTTONULLMOUNT(mp)->nullm_rootvp; + VREF(vp); + VOP_LOCK(vp); + *vpp = vp; + return 0; +} + +int +nullfs_quotactl(mp, cmd, uid, arg, p) + struct mount *mp; + int cmd; + uid_t uid; + caddr_t arg; + struct proc *p; +{ + return VFS_QUOTACTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, uid, arg, p); +} + +int +nullfs_statfs(mp, sbp, p) + struct mount *mp; + struct statfs *sbp; + struct proc *p; +{ + int error; + struct statfs mstat; + +#ifdef NULLFS_DIAGNOSTIC + printf("nullfs_statfs(mp = %x, vp = %x->%x)\n", mp, + MOUNTTONULLMOUNT(mp)->nullm_rootvp, + NULLVPTOLOWERVP(MOUNTTONULLMOUNT(mp)->nullm_rootvp) + ); +#endif + + bzero(&mstat, sizeof(mstat)); + + error = VFS_STATFS(MOUNTTONULLMOUNT(mp)->nullm_vfs, &mstat, p); + if (error) + return (error); + + /* now copy across the "interesting" information and fake the rest */ + sbp->f_type = mstat.f_type; + sbp->f_flags = mstat.f_flags; + sbp->f_bsize = mstat.f_bsize; + sbp->f_iosize = mstat.f_iosize; + sbp->f_blocks = mstat.f_blocks; + sbp->f_bfree = mstat.f_bfree; + sbp->f_bavail = mstat.f_bavail; + sbp->f_files = mstat.f_files; + 
sbp->f_ffree = mstat.f_ffree; + if (sbp != &mp->mnt_stat) { + bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); + bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); + bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); + } + return (0); +} + +int +nullfs_sync(mp, waitfor, cred, p) + struct mount *mp; + int waitfor; + struct ucred *cred; + struct proc *p; +{ + /* + * XXX - Assumes no data cached at null layer. + */ + return (0); +} + +int +nullfs_vget(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + + return VFS_VGET(MOUNTTONULLMOUNT(mp)->nullm_vfs, ino, vpp); +} + +int +nullfs_fhtovp(mp, fidp, nam, vpp, exflagsp, credanonp) + struct mount *mp; + struct fid *fidp; + struct mbuf *nam; + struct vnode **vpp; + int *exflagsp; + struct ucred**credanonp; +{ + + return VFS_FHTOVP(MOUNTTONULLMOUNT(mp)->nullm_vfs, fidp, nam, vpp, exflagsp,credanonp); +} + +int +nullfs_vptofh(vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + return VFS_VPTOFH(NULLVPTOLOWERVP(vp), fhp); +} + +int nullfs_init __P((void)); + +struct vfsops null_vfsops = { + nullfs_mount, + nullfs_start, + nullfs_unmount, + nullfs_root, + nullfs_quotactl, + nullfs_statfs, + nullfs_sync, + nullfs_vget, + nullfs_fhtovp, + nullfs_vptofh, + nullfs_init, +}; diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c new file mode 100644 index 00000000000..115ff6f4643 --- /dev/null +++ b/sys/fs/nullfs/null_vnops.c @@ -0,0 +1,462 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * John Heidemann of the UCLA Ficus project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)null_vnops.c 8.1 (Berkeley) 6/10/93 + * + * Ancestors: + * @(#)lofs_vnops.c 1.2 (Berkeley) 6/18/92 + * $Id: lofs_vnops.c,v 1.11 1992/05/30 10:05:43 jsp Exp jsp $ + * ...and... + * @(#)null_vnodeops.c 1.20 92/07/07 UCLA Ficus project + */ + +/* + * Null Layer + * + * (See mount_null(8) for more information.) + * + * The null layer duplicates a portion of the file system + * name space under a new name. In this respect, it is + * similar to the loopback file system. 
It differs from + * the loopback fs in two respects: it is implemented using + * stackable-layers techniques, and its "null-node"s stack above + * all lower-layer vnodes, not just over directory vnodes. + * + * The null layer has two purposes. First, it serves as a demonstration + * of layering by providing a layer which does nothing. (It actually + * does everything the loopback file system does, which is slightly + * more than nothing.) Second, the null layer can serve as a prototype + * layer. Since it provides all necessary layer framework, + * new file system layers can be created very easily by starting + * with a null layer. + * + * The remainder of this man page examines the null layer as a basis + * for constructing new layers. + * + * + * INSTANTIATING NEW NULL LAYERS + * + * New null layers are created with mount_null(8). + * Mount_null(8) takes two arguments, the pathname + * of the lower vfs (target-pn) and the pathname where the null + * layer will appear in the namespace (alias-pn). After + * the null layer is put into place, the contents + * of target-pn subtree will be aliased under alias-pn. + * + * + * OPERATION OF A NULL LAYER + * + * The null layer is the minimum file system layer, + * simply bypassing all possible operations to the lower layer + * for processing there. The majority of its activity centers + * on the bypass routine, through which nearly all vnode operations + * pass. + * + * The bypass routine accepts arbitrary vnode operations for + * handling by the lower layer. It begins by examining vnode + * operation arguments and replacing any null-nodes by their + * lower-layer equivalents. It then invokes the operation + * on the lower layer. Finally, it replaces the null-nodes + * in the arguments and, if a vnode is returned by the operation, + * stacks a null-node on top of the returned vnode. + * + * Although bypass handles most operations, + * vop_getattr, _inactive, _reclaim, and _print are not bypassed.
+ * Vop_getattr must change the fsid being returned. + * Vop_inactive and vop_reclaim are not bypassed so that + * they can handle freeing null-layer specific data. + * Vop_print is not bypassed to avoid excessive debugging + * information. + * + * + * INSTANTIATING VNODE STACKS + * + * Mounting associates the null layer with a lower layer, + * in effect stacking two VFSes. Vnode stacks are instead + * created on demand as files are accessed. + * + * The initial mount creates a single vnode stack for the + * root of the new null layer. All other vnode stacks + * are created as a result of vnode operations on + * this or other null vnode stacks. + * + * New vnode stacks come into existence as a result of + * an operation which returns a vnode. + * The bypass routine stacks a null-node above the new + * vnode before returning it to the caller. + * + * For example, imagine mounting a null layer with + * "mount_null /usr/include /dev/layer/null". + * Changing directory to /dev/layer/null will assign + * the root null-node (which was created when the null layer was mounted). + * Now consider opening "sys". A vop_lookup would be + * done on the root null-node. This operation would bypass through + * to the lower layer which would return a vnode representing + * the UFS "sys". Null_bypass then builds a null-node + * aliasing the UFS "sys" and returns this to the caller. + * Later operations on the null-node "sys" will repeat this + * process when constructing other vnode stacks. + * + * + * CREATING OTHER FILE SYSTEM LAYERS + * + * One of the easiest ways to construct new file system layers is to make + * a copy of the null layer, rename all files and variables, and + * then begin modifying the copy. Sed can be used to easily rename + * all variables. + * + * The umap layer is an example of a layer descended from the + * null layer.
+ * + * + * INVOKING OPERATIONS ON LOWER LAYERS + * + * There are two techniques to invoke operations on a lower layer + * when the operation cannot be completely bypassed. Each method + * is appropriate in different situations. In both cases, + * it is the responsibility of the aliasing layer to make + * the operation arguments "correct" for the lower layer + * by mapping any vnode arguments to the lower layer. + * + * The first approach is to call the aliasing layer's bypass routine. + * This method is most suitable when you wish to invoke the operation + * currently being handled on the lower layer. It has the advantage + * that the bypass routine already must do argument mapping. + * An example of this is null_getattr in the null layer. + * + * A second approach is to directly invoke vnode operations on + * the lower layer with the VOP_OPERATIONNAME interface. + * The advantage of this method is that it is easy to invoke + * arbitrary operations on the lower layer. The disadvantage + * is that vnode arguments must be manually mapped. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +int null_bug_bypass = 0; /* for debugging: enables bypass printf'ing */ + +/* + * This is the 10-Apr-92 bypass routine. + * This version has been optimized for speed, throwing away some + * safety checks. It should still always work, but it's not as + * robust to programmer errors. + * Define SAFETY to include some error checking code. + * + * In general, we map all vnodes going down and unmap them on the way back. + * As an exception to this, vnodes can be marked "unmapped" by setting + * the Nth bit in operation's vdesc_flags. + * + * Also, some BSD vnode operations have the side effect of vrele'ing + * their arguments. With stacking, the reference counts are held + * by the upper node, not the lower one, so we must handle these + * side-effects here.
This is not of concern in Sun-derived systems + * since there are no such side-effects. + * + * This makes the following assumptions: + * - only one returned vpp + * - no INOUT vpp's (Sun's vop_open has one of these) + * - the vnode operation vector of the first vnode should be used + * to determine what implementation of the op should be invoked + * - all mapped vnodes are of our vnode-type (NEEDSWORK: + * problems on rmdir'ing mount points and renaming?) + */ +int +null_bypass(ap) + struct vop_generic_args /* { + struct vnodeop_desc *a_desc; + + } */ *ap; +{ + extern int (**null_vnodeop_p)(); /* not extern, really "forward" */ + register struct vnode **this_vp_p; + int error; + struct vnode *old_vps[VDESC_MAX_VPS]; /* original (upper) vnodes; NULL marks a slot left unmapped */ + struct vnode **vps_p[VDESC_MAX_VPS]; /* addresses of the vnode slots inside *ap */ + struct vnode ***vppp; + struct vnodeop_desc *descp = ap->a_desc; + int reles, i; + + if (null_bug_bypass) + printf ("null_bypass: %s\n", descp->vdesc_name); + +#ifdef SAFETY + /* + * We require at least one vp. + */ + if (descp->vdesc_vp_offsets == NULL || + descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET) + panic ("null_bypass: no vp's in map.\n"); +#endif + + /* + * Map the vnodes going in. + * Later, we'll invoke the operation based on + * the first mapped vnode's operation vector. + */ + reles = descp->vdesc_flags; + for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { + if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) + break; /* bail out at end of list */ + vps_p[i] = this_vp_p = + VOPARG_OFFSETTO(struct vnode**,descp->vdesc_vp_offsets[i],ap); + /* + * We're not guaranteed that any but the first vnode + * are of our type. Check for and don't map any + * that aren't. (We must always map first vp or vclean fails.) + */ + if (i && (*this_vp_p)->v_op != null_vnodeop_p) { + old_vps[i] = NULL; + } else { + old_vps[i] = *this_vp_p; + *(vps_p[i]) = NULLVPTOLOWERVP(*this_vp_p); + /* + * XXX - Several operations have the side effect + * of vrele'ing their vp's. We must account for + * that.
(This should go away in the future.) + */ + if (reles & 1) + VREF(*this_vp_p); /* slot already holds the lower vnode here */ + } + + } + + /* + * Call the operation on the lower layer + * with the modified argument structure. + */ + error = VCALL(*(vps_p[0]), descp->vdesc_offset, ap); + + /* + * Maintain the illusion of call-by-value + * by restoring vnodes in the argument structure + * to their original value. + */ + reles = descp->vdesc_flags; + for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { + if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) + break; /* bail out at end of list */ + if (old_vps[i]) { + *(vps_p[i]) = old_vps[i]; + if (reles & 1) + vrele(*(vps_p[i])); /* slot holds the upper vnode again */ + } + } + + /* + * Map the possible out-going vpp + * (Assumes that the lower layer always returns + * a VREF'ed vpp unless it gets an error.) + */ + if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET && + !(descp->vdesc_flags & VDESC_NOMAP_VPP) && + !error) { + /* + * XXX - even though some ops have vpp returned vp's, + * several ops actually vrele this before returning. + * We must avoid these ops. + * (This should go away when these ops are regularized.) + */ + if (descp->vdesc_flags & VDESC_VPP_WILLRELE) + goto out; + vppp = VOPARG_OFFSETTO(struct vnode***, + descp->vdesc_vpp_offset,ap); + error = null_node_create(old_vps[0]->v_mount, **vppp, *vppp); + } + + out: + return (error); +} + + +/* + * We handle getattr only to change the fsid (to this mount's f_fsid.val[0]). + */ +int +null_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + int error; + if (error = null_bypass(ap)) + return (error); + /* Requires that arguments be restored: a_vp is dereferenced for its v_mount. */ + ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; + return (0); +} + + +int +null_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap; +{ + /* + * Do nothing (and _don't_ bypass). + * Wait to vrele lowervp until reclaim, + * so that until then our null_node is in the + * cache and reusable.
+ * + * NEEDSWORK: Someday, consider inactive'ing + * the lowervp and then trying to reactivate it + * with capabilities (v_id) + * like they do in the name lookup cache code. + * That's too much work for now. + */ + return (0); +} + +int +null_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct null_node *xp = VTONULL(vp); + struct vnode *lowervp = xp->null_lowervp; /* saved: xp is freed before the vrele below */ + + /* + * Note: in vop_reclaim, vp->v_op == dead_vnodeop_p, + * so we can't call VOPs on ourself. + */ + /* After this assignment, this node will not be re-used. */ + xp->null_lowervp = NULL; + remque(xp); + FREE(vp->v_data, M_TEMP); + vp->v_data = NULL; + vrele (lowervp); /* the vrele that null_inactive deferred to reclaim */ + return (0); +} + + +int +null_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + printf ("\ttag VT_NULLFS, vp=%x, lowervp=%x\n", vp, NULLVPTOLOWERVP(vp)); + return (0); +} + + +/* + * XXX - vop_strategy must be hand coded because it has no + * vnode in its arguments. + * This goes away with a merged VM/buffer cache. + */ +int +null_strategy(ap) + struct vop_strategy_args /* { + struct buf *a_bp; + } */ *ap; +{ + struct buf *bp = ap->a_bp; + int error; + struct vnode *savedvp; + + savedvp = bp->b_vp; + bp->b_vp = NULLVPTOLOWERVP(bp->b_vp); /* point the buf at the lower vnode for the call */ + + error = VOP_STRATEGY(bp); + + bp->b_vp = savedvp; /* put the original vnode back */ + + return (error); +} + + +/* + * XXX - like vop_strategy, vop_bwrite must be hand coded because it has no + * vnode in its arguments. + * This goes away with a merged VM/buffer cache.
+ */ +int +null_bwrite(ap) + struct vop_bwrite_args /* { + struct buf *a_bp; + } */ *ap; +{ + struct buf *bp = ap->a_bp; + int error; + struct vnode *savedvp; + + savedvp = bp->b_vp; + bp->b_vp = NULLVPTOLOWERVP(bp->b_vp); /* point the buf at the lower vnode for the call */ + + error = VOP_BWRITE(bp); + + bp->b_vp = savedvp; /* put the original vnode back */ + + return (error); +} + +/* + * Global vfs data structures (null_bypass is registered as the default entry) + */ +int (**null_vnodeop_p)(); +struct vnodeopv_entry_desc null_vnodeop_entries[] = { + { &vop_default_desc, null_bypass }, + + { &vop_getattr_desc, null_getattr }, + { &vop_inactive_desc, null_inactive }, + { &vop_reclaim_desc, null_reclaim }, + { &vop_print_desc, null_print }, + + { &vop_strategy_desc, null_strategy }, + { &vop_bwrite_desc, null_bwrite }, + + { (struct vnodeop_desc*)NULL, (int(*)())NULL } /* NULL/NULL pair ends the table */ +}; +struct vnodeopv_desc null_vnodeop_opv_desc = + { &null_vnodeop_p, null_vnodeop_entries }; diff --git a/sys/fs/portalfs/portal.h b/sys/fs/portalfs/portal.h new file mode 100644 index 00000000000..38d7ee0cdd2 --- /dev/null +++ b/sys/fs/portalfs/portal.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4.
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)portal.h 8.4 (Berkeley) 1/21/94 + * + * $Id: portal.h,v 1.3 1992/05/30 10:05:24 jsp Exp jsp $ + */ + +struct portal_args { /* mount arguments */ + char *pa_config; /* Config file */ + int pa_socket; /* Socket to server */ +}; + +struct portal_cred { + int pcr_flag; /* File open mode */ + uid_t pcr_uid; /* From ucred */ + short pcr_ngroups; /* From ucred */ + gid_t pcr_groups[NGROUPS]; /* From ucred */ +}; + +#ifdef KERNEL +struct portalmount { /* per-mount data; see VFSTOPORTAL() */ + struct vnode *pm_root; /* Root node */ + struct file *pm_server; /* Held reference to server socket */ +}; + +struct portalnode { /* per-vnode data; see VTOPORTAL() */ + int pt_size; /* Length of Arg */ + char *pt_arg; /* Arg to send to server */ + int pt_fileid; /* cookie */ +}; + +#define VFSTOPORTAL(mp) ((struct portalmount *)((mp)->mnt_data)) /* mount -> portalmount kept in mnt_data */ +#define VTOPORTAL(vp) ((struct portalnode *)(vp)->v_data) /* vnode -> portalnode kept in v_data */ + +#define PORTAL_ROOTFILEID 2 /* pt_fileid value for the root node */ + +extern int (**portal_vnodeop_p)(); +extern struct vfsops portal_vfsops; +#endif /* KERNEL */ diff --git a/sys/fs/portalfs/portal_vfsops.c b/sys/fs/portalfs/portal_vfsops.c new
file mode 100644 index 00000000000..39e8563009b --- /dev/null +++ b/sys/fs/portalfs/portal_vfsops.c @@ -0,0 +1,313 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)portal_vfsops.c 8.6 (Berkeley) 1/21/94 + * + * $Id: portal_vfsops.c,v 1.5 1992/05/30 10:25:27 jsp Exp jsp $ + */ + +/* + * Portal Filesystem + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int +portal_init() +{ + + return (0); +} + +/* + * Mount a portal filesystem; args.pa_socket must name an AF_UNIX socket + */ +int +portal_mount(mp, path, data, ndp, p) + struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + struct file *fp; + struct portal_args args; + struct portalmount *fmp; + struct socket *so; + struct vnode *rvp; + u_int size; + int error; + + /* + * Update is not supported + */ + if (mp->mnt_flag & MNT_UPDATE) + return (EOPNOTSUPP); + + if (error = copyin(data, (caddr_t) &args, sizeof(struct portal_args))) + return (error); + + if (error = getsock(p->p_fd, args.pa_socket, &fp)) + return (error); + so = (struct socket *) fp->f_data; + if (so->so_proto->pr_domain->dom_family != AF_UNIX) + return (ESOCKTNOSUPPORT); + + error = getnewvnode(VT_PORTAL, mp, portal_vnodeop_p, &rvp); /* XXX */ + if (error) + return (error); + MALLOC(rvp->v_data, void *, sizeof(struct portalnode), + M_TEMP, M_WAITOK); + + fmp = (struct portalmount *) malloc(sizeof(struct portalmount), + M_UFSMNT, M_WAITOK); /* XXX */ + rvp->v_type = VDIR; + rvp->v_flag |= VROOT; + VTOPORTAL(rvp)->pt_arg = 0; +
VTOPORTAL(rvp)->pt_size = 0; + VTOPORTAL(rvp)->pt_fileid = PORTAL_ROOTFILEID; + fmp->pm_root = rvp; + fmp->pm_server = fp; fp->f_count++; /* hold the server's file; released by closef() in portal_unmount */ + + mp->mnt_flag |= MNT_LOCAL; + mp->mnt_data = (qaddr_t) fmp; + getnewfsid(mp, MOUNT_PORTAL); + + (void)copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); + bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); + (void)copyinstr(args.pa_config, + mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + +#ifdef notdef + bzero(mp->mnt_stat.f_mntfromname, MNAMELEN); + bcopy("portal", mp->mnt_stat.f_mntfromname, sizeof("portal")); +#endif + + return (0); +} + +int +portal_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + + return (0); +} + +int +portal_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + extern int doforce; + struct vnode *rootvp = VFSTOPORTAL(mp)->pm_root; + int error, flags = 0; + + + if (mntflags & MNT_FORCE) { + /* portal can never be rootfs so don't check for it */ + if (!doforce) + return (EINVAL); + flags |= FORCECLOSE; + } + + /* + * Clear out buffer cache. I don't think we + * ever get anything cached at this level at the + * moment, but who knows... + */ +#ifdef notyet + mntflushbuf(mp, 0); + if (mntinvalbuf(mp, 1)) + return (EBUSY); +#endif + if (rootvp->v_usecount > 1) + return (EBUSY); + if (error = vflush(mp, rootvp, flags)) + return (error); + + /* + * Release reference on the portal root vnode + */ + vrele(rootvp); + /* + * And blow it away for future re-use + */ + vgone(rootvp); + /* + * Shutdown the socket. This will cause the select in the + * daemon to wake up, and then the accept will get ECONNABORTED + * which it interprets as a request to go and bury itself. + */ + soshutdown((struct socket *) VFSTOPORTAL(mp)->pm_server->f_data, 2); + /* + * Discard reference to underlying file. Must call closef because + * this may be the last reference.
+ */ + closef(VFSTOPORTAL(mp)->pm_server, (struct proc *) 0); + /* + * Finally, throw away the portalmount structure + */ + free(mp->mnt_data, M_UFSMNT); /* XXX */ + mp->mnt_data = 0; + return (0); +} + +int +portal_root(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + struct vnode *vp; + + + /* + * Return locked reference to root. + */ + vp = VFSTOPORTAL(mp)->pm_root; + VREF(vp); + VOP_LOCK(vp); + *vpp = vp; + return (0); +} + +int +portal_quotactl(mp, cmd, uid, arg, p) + struct mount *mp; + int cmd; + uid_t uid; + caddr_t arg; + struct proc *p; +{ + + return (EOPNOTSUPP); +} + +int +portal_statfs(mp, sbp, p) + struct mount *mp; + struct statfs *sbp; + struct proc *p; +{ + + sbp->f_type = MOUNT_PORTAL; + sbp->f_flags = 0; + sbp->f_bsize = DEV_BSIZE; + sbp->f_iosize = DEV_BSIZE; + sbp->f_blocks = 2; /* 1K to keep df happy */ + sbp->f_bfree = 0; + sbp->f_bavail = 0; + sbp->f_files = 1; /* Allow for "." */ + sbp->f_ffree = 0; /* no free files reported */ + if (sbp != &mp->mnt_stat) { + bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); + bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); + bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); + } + return (0); +} + +int +portal_sync(mp, waitfor) + struct mount *mp; + int waitfor; +{ + + return (0); +} + +int +portal_vget(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + + return (EOPNOTSUPP); +} + +int +portal_fhtovp(mp, fhp, vpp) + struct mount *mp; + struct fid *fhp; + struct vnode **vpp; +{ + + return (EOPNOTSUPP); +} + +int +portal_vptofh(vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + + return (EOPNOTSUPP); +} + +struct vfsops portal_vfsops = { + portal_mount, + portal_start, + portal_unmount, + portal_root, + portal_quotactl, + portal_statfs, + portal_sync, + portal_vget, + portal_fhtovp, + portal_vptofh, + portal_init, +}; diff --git a/sys/fs/portalfs/portal_vnops.c b/sys/fs/portalfs/portal_vnops.c new file mode 100644 index
00000000000..5e170261e71 --- /dev/null +++ b/sys/fs/portalfs/portal_vnops.c @@ -0,0 +1,707 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)portal_vnops.c 8.8 (Berkeley) 1/21/94 + * + * $Id: portal_vnops.c,v 1.4 1992/05/30 10:05:24 jsp Exp jsp $ + */ + +/* + * Portal Filesystem + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int portal_fileid = PORTAL_ROOTFILEID+1; + +static void +portal_closefd(p, fd) + struct proc *p; + int fd; +{ + int error; + struct { + int fd; + } ua; + int rc; + + ua.fd = fd; + error = close(p, &ua, &rc); + /* + * We should never get an error, and there isn't anything + * we could do if we got one, so just print a message. + */ + if (error) + printf("portal_closefd: error = %d\n", error); +} + +/* + * vp is the current namei directory + * cnp is the name to locate in that directory... 
+ */ +int +portal_lookup(ap) + struct vop_lookup_args /* { + struct vnode * a_dvp; + struct vnode ** a_vpp; + struct componentname * a_cnp; + } */ *ap; +{ + char *pname = ap->a_cnp->cn_nameptr; + struct portalnode *pt; + int error; + struct vnode *fvp = 0; + char *path; + int size; + + if (ap->a_cnp->cn_namelen == 1 && *pname == '.') { + *ap->a_vpp = ap->a_dvp; + VREF(ap->a_dvp); + /*VOP_LOCK(ap->a_dvp);*/ + return (0); + } + + + error = getnewvnode(VT_PORTAL, ap->a_dvp->v_mount, portal_vnodeop_p, &fvp); + if (error) + goto bad; + fvp->v_type = VREG; + MALLOC(fvp->v_data, void *, sizeof(struct portalnode), + M_TEMP, M_WAITOK); + + pt = VTOPORTAL(fvp); + /* + * Save all of the remaining pathname and + * advance the namei next pointer to the end + * of the string. + */ + for (size = 0, path = pname; *path; path++) + size++; + ap->a_cnp->cn_consume = size - ap->a_cnp->cn_namelen; + + pt->pt_arg = malloc(size+1, M_TEMP, M_WAITOK); + pt->pt_size = size+1; + bcopy(pname, pt->pt_arg, pt->pt_size); + pt->pt_fileid = portal_fileid++; + + *ap->a_vpp = fvp; + /*VOP_LOCK(fvp);*/ + return (0); + +bad:; + if (fvp) { + vrele(fvp); + } + *ap->a_vpp = NULL; + return (error); +} + +static int +portal_connect(so, so2) + struct socket *so; + struct socket *so2; +{ + /* from unp_connect, bypassing the namei stuff... 
*/ + struct socket *so3; + struct unpcb *unp2; + struct unpcb *unp3; + + if (so2 == 0) + return (ECONNREFUSED); + + if (so->so_type != so2->so_type) + return (EPROTOTYPE); + + if ((so2->so_options & SO_ACCEPTCONN) == 0) + return (ECONNREFUSED); + + if ((so3 = sonewconn(so2, 0)) == 0) + return (ECONNREFUSED); + + unp2 = sotounpcb(so2); + unp3 = sotounpcb(so3); + if (unp2->unp_addr) + unp3->unp_addr = m_copy(unp2->unp_addr, 0, (int)M_COPYALL); + + so2 = so3; + + + return (unp_connect2(so, so2)); +} + +int +portal_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct socket *so = 0; + struct portalnode *pt; + struct proc *p = ap->a_p; + struct vnode *vp = ap->a_vp; + int s; + struct uio auio; + struct iovec aiov[2]; + int res; + struct mbuf *cm = 0; + struct cmsghdr *cmsg; + int newfds; + int *ip; + int fd; + int error; + int len; + struct portalmount *fmp; + struct file *fp; + struct portal_cred pcred; + + /* + * Nothing to do when opening the root node. + */ + if (vp->v_flag & VROOT) + return (0); + + /* + * Can't be opened unless the caller is set up + * to deal with the side effects. Check for this + * by testing whether the p_dupfd has been set. + */ + if (p->p_dupfd >= 0) + return (ENODEV); + + pt = VTOPORTAL(vp); + fmp = VFSTOPORTAL(vp->v_mount); + + /* + * Create a new socket. + */ + error = socreate(AF_UNIX, &so, SOCK_STREAM, 0); + if (error) + goto bad; + + /* + * Reserve some buffer space + */ + res = pt->pt_size + sizeof(pcred) + 512; /* XXX */ + error = soreserve(so, res, res); + if (error) + goto bad; + + /* + * Kick off connection + */ + error = portal_connect(so, (struct socket *)fmp->pm_server->f_data); + if (error) + goto bad; + + /* + * Wait for connection to complete + */ + /* + * XXX: Since the mount point is holding a reference on the + * underlying server socket, it is not easy to find out whether + * the server process is still running. 
To handle this problem + * we loop waiting for the new socket to be connected (something + * which will only happen if the server is still running) or for + * the reference count on the server socket to drop to 1, which + * will happen if the server dies. Sleep for 5 second intervals + * and keep polling the reference count. XXX. + */ + s = splnet(); + while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { + if (fmp->pm_server->f_count == 1) { + error = ECONNREFUSED; + splx(s); + goto bad; + } + (void) tsleep((caddr_t) &so->so_timeo, PSOCK, "portalcon", 5 * hz); + } + splx(s); + + if (so->so_error) { + error = so->so_error; + goto bad; + } + + /* + * Set miscellaneous flags + */ + so->so_rcv.sb_timeo = 0; + so->so_snd.sb_timeo = 0; + so->so_rcv.sb_flags |= SB_NOINTR; + so->so_snd.sb_flags |= SB_NOINTR; + + + pcred.pcr_flag = ap->a_mode; + pcred.pcr_uid = ap->a_cred->cr_uid; + pcred.pcr_ngroups = ap->a_cred->cr_ngroups; + bcopy(ap->a_cred->cr_groups, pcred.pcr_groups, NGROUPS * sizeof(gid_t)); + aiov[0].iov_base = (caddr_t) &pcred; + aiov[0].iov_len = sizeof(pcred); + aiov[1].iov_base = pt->pt_arg; + aiov[1].iov_len = pt->pt_size; + auio.uio_iov = aiov; + auio.uio_iovcnt = 2; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_procp = p; + auio.uio_offset = 0; + auio.uio_resid = aiov[0].iov_len + aiov[1].iov_len; + + error = sosend(so, (struct mbuf *) 0, &auio, + (struct mbuf *) 0, (struct mbuf *) 0, 0); + if (error) + goto bad; + + len = auio.uio_resid = sizeof(int); + do { + struct mbuf *m = 0; + int flags = MSG_WAITALL; + error = soreceive(so, (struct mbuf **) 0, &auio, + &m, &cm, &flags); + if (error) + goto bad; + + /* + * Grab an error code from the mbuf. + */ + if (m) { + m = m_pullup(m, sizeof(int)); /* Needed? 
*/ + if (m) { + error = *(mtod(m, int *)); + m_freem(m); + } else { + error = EINVAL; + } + } else { + if (cm == 0) { + error = ECONNRESET; /* XXX */ +#ifdef notdef + break; +#endif + } + } + } while (cm == 0 && auio.uio_resid == len && !error); + + if (cm == 0) + goto bad; + + if (auio.uio_resid) { + error = 0; +#ifdef notdef + error = EMSGSIZE; + goto bad; +#endif + } + + /* + * XXX: Break apart the control message, and retrieve the + * received file descriptor. Note that more than one descriptor + * may have been received, or that the rights chain may have more + * than a single mbuf in it. What to do? + */ + cmsg = mtod(cm, struct cmsghdr *); + newfds = (cmsg->cmsg_len - sizeof(*cmsg)) / sizeof (int); + if (newfds == 0) { + error = ECONNREFUSED; + goto bad; + } + /* + * At this point the rights message consists of a control message + * header, followed by a data region containing a vector of + * integer file descriptors. The fds were allocated by the action + * of receiving the control message. + */ + ip = (int *) (cmsg + 1); + fd = *ip++; + if (newfds > 1) { + /* + * Close extra fds. + */ + int i; + printf("portal_open: %d extra fds\n", newfds - 1); + for (i = 1; i < newfds; i++) { + portal_closefd(p, *ip); + ip++; + } + } + + /* + * Check that the mode the file is being opened for is a subset + * of the mode of the existing descriptor. + */ + fp = p->p_fd->fd_ofiles[fd]; + if (((ap->a_mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { + portal_closefd(p, fd); + error = EACCES; + goto bad; + } + + /* + * Save the dup fd in the proc structure then return the + * special error code (ENXIO) which causes magic things to + * happen in vn_open. The whole concept is, well, hmmm. + */ + p->p_dupfd = fd; + error = ENXIO; + +bad:; + /* + * And discard the control message. 
+ */ + if (cm) { + m_freem(cm); + } + + if (so) { + soshutdown(so, 2); + soclose(so); + } + return (error); +} + +int +portal_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct vattr *vap = ap->a_vap; + + bzero(vap, sizeof(*vap)); + vattr_null(vap); + vap->va_uid = 0; + vap->va_gid = 0; + vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; + vap->va_size = DEV_BSIZE; + vap->va_blocksize = DEV_BSIZE; + microtime(&vap->va_atime); /* the current time is reported for all three timestamps */ + vap->va_mtime = vap->va_atime; + vap->va_ctime = vap->va_atime; + vap->va_gen = 0; + vap->va_flags = 0; + vap->va_rdev = 0; + /* vap->va_qbytes = 0; */ + vap->va_bytes = 0; + /* vap->va_qsize = 0; */ + if (vp->v_flag & VROOT) { + vap->va_type = VDIR; + vap->va_mode = S_IRUSR|S_IWUSR|S_IXUSR| + S_IRGRP|S_IWGRP|S_IXGRP| + S_IROTH|S_IWOTH|S_IXOTH; + vap->va_nlink = 2; + vap->va_fileid = 2; + } else { + vap->va_type = VREG; + vap->va_mode = S_IRUSR|S_IWUSR| + S_IRGRP|S_IWGRP| + S_IROTH|S_IWOTH; + vap->va_nlink = 1; + vap->va_fileid = VTOPORTAL(vp)->pt_fileid; + } + return (0); +} + +int +portal_setattr(ap) + struct vop_setattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + /* + * Can't mess with the root vnode + */ + if (ap->a_vp->v_flag & VROOT) + return (EACCES); + + return (0); +} + +/* + * Fake readdir, just return empty directory. + * It is hard to deal with '.' and '..' so don't bother. 
+ */ +int +portal_readdir(ap) + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + + return (0); +} + +int +portal_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + return (0); +} + +int +portal_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct portalnode *pt = VTOPORTAL(ap->a_vp); + + if (pt->pt_arg) { + free((caddr_t) pt->pt_arg, M_TEMP); + pt->pt_arg = 0; + } + FREE(ap->a_vp->v_data, M_TEMP); + ap->a_vp->v_data = 0; + + return (0); +} + +/* + * Return POSIX pathconf information for portal vnodes. + */ +int portal_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = LINK_MAX; + return (0); + case _PC_MAX_CANON: + *ap->a_retval = MAX_CANON; + return (0); + case _PC_MAX_INPUT: + *ap->a_retval = MAX_INPUT; + return (0); + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + return (0); + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + return (0); + case _PC_VDISABLE: + *ap->a_retval = _POSIX_VDISABLE; + return (0); + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Print out the contents of a Portal vnode. 
+ */ +/* ARGSUSED */ +int +portal_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + printf("tag VT_PORTAL, portal vnode\n"); + return (0); +} + +/*void*/ +int +portal_vfree(ap) + struct vop_vfree_args /* { + struct vnode *a_pvp; + ino_t a_ino; + int a_mode; + } */ *ap; +{ + + return (0); +} + + +/* + * Portal vnode unsupported operation + */ +int +portal_enotsupp() +{ + + return (EOPNOTSUPP); +} + +/* + * Portal "should never get here" operation + */ +int +portal_badop() +{ + + panic("portal: bad op"); + /* NOTREACHED */ +} + +/* + * Portal vnode null operation + */ +int +portal_nullop() +{ + + return (0); +} + +#define portal_create ((int (*) __P((struct vop_create_args *)))portal_enotsupp) +#define portal_mknod ((int (*) __P((struct vop_mknod_args *)))portal_enotsupp) +#define portal_close ((int (*) __P((struct vop_close_args *)))nullop) +#define portal_access ((int (*) __P((struct vop_access_args *)))nullop) +#define portal_read ((int (*) __P((struct vop_read_args *)))portal_enotsupp) +#define portal_write ((int (*) __P((struct vop_write_args *)))portal_enotsupp) +#define portal_ioctl ((int (*) __P((struct vop_ioctl_args *)))portal_enotsupp) +#define portal_select ((int (*) __P((struct vop_select_args *)))portal_enotsupp) +#define portal_mmap ((int (*) __P((struct vop_mmap_args *)))portal_enotsupp) +#define portal_fsync ((int (*) __P((struct vop_fsync_args *)))nullop) +#define portal_seek ((int (*) __P((struct vop_seek_args *)))nullop) +#define portal_remove ((int (*) __P((struct vop_remove_args *)))portal_enotsupp) +#define portal_link ((int (*) __P((struct vop_link_args *)))portal_enotsupp) +#define portal_rename ((int (*) __P((struct vop_rename_args *)))portal_enotsupp) +#define portal_mkdir ((int (*) __P((struct vop_mkdir_args *)))portal_enotsupp) +#define portal_rmdir ((int (*) __P((struct vop_rmdir_args *)))portal_enotsupp) +#define portal_symlink \ + ((int (*) __P((struct vop_symlink_args *)))portal_enotsupp) +#define 
portal_readlink \ + ((int (*) __P((struct vop_readlink_args *)))portal_enotsupp) +#define portal_abortop ((int (*) __P((struct vop_abortop_args *)))nullop) +#define portal_lock ((int (*) __P((struct vop_lock_args *)))nullop) +#define portal_unlock ((int (*) __P((struct vop_unlock_args *)))nullop) +#define portal_bmap ((int (*) __P((struct vop_bmap_args *)))portal_badop) +#define portal_strategy \ + ((int (*) __P((struct vop_strategy_args *)))portal_badop) +#define portal_islocked ((int (*) __P((struct vop_islocked_args *)))nullop) +#define portal_advlock \ + ((int (*) __P((struct vop_advlock_args *)))portal_enotsupp) +#define portal_blkatoff \ + ((int (*) __P((struct vop_blkatoff_args *)))portal_enotsupp) +#define portal_valloc ((int(*) __P(( \ + struct vnode *pvp, \ + int mode, \ + struct ucred *cred, \ + struct vnode **vpp))) portal_enotsupp) +#define portal_truncate \ + ((int (*) __P((struct vop_truncate_args *)))portal_enotsupp) +#define portal_update ((int (*) __P((struct vop_update_args *)))portal_enotsupp) +#define portal_bwrite ((int (*) __P((struct vop_bwrite_args *)))portal_enotsupp) + +int (**portal_vnodeop_p)(); +struct vnodeopv_entry_desc portal_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, portal_lookup }, /* lookup */ + { &vop_create_desc, portal_create }, /* create */ + { &vop_mknod_desc, portal_mknod }, /* mknod */ + { &vop_open_desc, portal_open }, /* open */ + { &vop_close_desc, portal_close }, /* close */ + { &vop_access_desc, portal_access }, /* access */ + { &vop_getattr_desc, portal_getattr }, /* getattr */ + { &vop_setattr_desc, portal_setattr }, /* setattr */ + { &vop_read_desc, portal_read }, /* read */ + { &vop_write_desc, portal_write }, /* write */ + { &vop_ioctl_desc, portal_ioctl }, /* ioctl */ + { &vop_select_desc, portal_select }, /* select */ + { &vop_mmap_desc, portal_mmap }, /* mmap */ + { &vop_fsync_desc, portal_fsync }, /* fsync */ + { &vop_seek_desc, portal_seek }, /* seek */ + { 
&vop_remove_desc, portal_remove }, /* remove */ + { &vop_link_desc, portal_link }, /* link */ + { &vop_rename_desc, portal_rename }, /* rename */ + { &vop_mkdir_desc, portal_mkdir }, /* mkdir */ + { &vop_rmdir_desc, portal_rmdir }, /* rmdir */ + { &vop_symlink_desc, portal_symlink }, /* symlink */ + { &vop_readdir_desc, portal_readdir }, /* readdir */ + { &vop_readlink_desc, portal_readlink }, /* readlink */ + { &vop_abortop_desc, portal_abortop }, /* abortop */ + { &vop_inactive_desc, portal_inactive }, /* inactive */ + { &vop_reclaim_desc, portal_reclaim }, /* reclaim */ + { &vop_lock_desc, portal_lock }, /* lock */ + { &vop_unlock_desc, portal_unlock }, /* unlock */ + { &vop_bmap_desc, portal_bmap }, /* bmap */ + { &vop_strategy_desc, portal_strategy }, /* strategy */ + { &vop_print_desc, portal_print }, /* print */ + { &vop_islocked_desc, portal_islocked }, /* islocked */ + { &vop_pathconf_desc, portal_pathconf }, /* pathconf */ + { &vop_advlock_desc, portal_advlock }, /* advlock */ + { &vop_blkatoff_desc, portal_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, portal_valloc }, /* valloc */ + { &vop_vfree_desc, portal_vfree }, /* vfree */ + { &vop_truncate_desc, portal_truncate }, /* truncate */ + { &vop_update_desc, portal_update }, /* update */ + { &vop_bwrite_desc, portal_bwrite }, /* bwrite */ + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc portal_vnodeop_opv_desc = + { &portal_vnodeop_p, portal_vnodeop_entries }; diff --git a/sys/fs/procfs/README b/sys/fs/procfs/README new file mode 100644 index 00000000000..38811b3f6e3 --- /dev/null +++ b/sys/fs/procfs/README @@ -0,0 +1,113 @@ +saute procfs lyonnais + +procfs supports two levels of directory. the filesystem root +directory contains a representation of the system process table. +this consists of an entry for each active and zombie process, and +an additional entry "curproc" which always represents the process +making the lookup request. 
+ +each of the sub-directories contains several files. these files +are used to control and interrogate processes. the files implemented +are: + + file - xxx. the exec'ed file. + + status - r/o. returns process status. + + ctl - w/o. sends a control message to the process. + for example: + echo hup > /proc/curproc/note + will send a SIGHUP to the shell. + whereas + echo attach > /proc/1293/ctl + would set up process 1293 for debugging. + see below for more details. + + mem - r/w. virtual memory image of the process. + parts of the address space are readable + only if they exist in the target process. + a more reasonable alternative might be + to return zero pages instead of an error. + comments? + + note - w/o. writing a string here sends the + equivalent note to the process. + [ not implemented. ] + + notepg - w/o. the same as note, but sends to all + members of the process group. + [ not implemented. ] + + regs - r/w. process register set. this can be read + or written any time even if the process + is not stopped. since the bsd kernel + is single-processor, this implementation + will get the "right" register values. + a multi-proc kernel would need to do some + synchronisation. 
+ +this then looks like: + +% ls -li /proc +total 0 + 9 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 0 + 17 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 1 + 89 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 10 + 25 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 2 +2065 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 257 +2481 dr-xr-xr-x 2 jsp staff 0 Sep 21 15:06 309 + 265 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 32 +3129 dr-xr-xr-x 2 jsp staff 0 Sep 21 15:06 390 +3209 dr-xr-xr-x 2 jsp staff 0 Sep 21 15:06 400 +3217 dr-xr-xr-x 2 jsp staff 0 Sep 21 15:06 401 +3273 dr-xr-xr-x 2 jsp staff 0 Sep 21 15:06 408 + 393 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 48 + 409 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 50 + 465 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 57 + 481 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 59 + 537 dr-xr-xr-x 2 root kmem 0 Sep 21 15:06 66 + 545 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 67 + 657 dr-xr-xr-x 2 jsp staff 0 Sep 21 15:06 81 + 665 dr-xr-xr-x 2 jsp staff 0 Sep 21 15:06 82 + 673 dr-xr-xr-x 2 jsp staff 0 Sep 21 15:06 83 + 681 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 84 +3273 dr-xr-xr-x 2 jsp staff 0 Sep 21 15:06 curproc +% ls -li /proc/curproc +total 408 +3341 --w------- 1 jsp staff 0 Sep 21 15:06 ctl +1554 -r-xr-xr-x 1 bin bin 90112 Mar 29 04:52 file +3339 -rw------- 1 jsp staff 118784 Sep 21 15:06 mem +3343 --w------- 1 jsp staff 0 Sep 21 15:06 note +3344 --w------- 1 jsp staff 0 Sep 21 15:06 notepg +3340 -rw------- 1 jsp staff 0 Sep 21 15:06 regs +3342 -r--r--r-- 1 jsp staff 0 Sep 21 15:06 status +% df /proc/curproc /proc/curproc/file +Filesystem 512-blocks Used Avail Capacity Mounted on +proc 2 2 0 100% /proc +/dev/wd0a 16186 13548 1018 93% / +% cat /proc/curproc/status +cat 446 439 400 81 12,0 ctty 748620684 270000 0 0 0 20000 nochan 11 20 20 20 0 21 117 + + + +the basic sequence of commands written to "ctl" would be + + attach - this stops the target process and + arranges for the sending process + to become the debug control process + wait - wait for the target process to come to + a steady 
state ready for debugging. + step - single step, with no signal delivery. + run - continue running, with no signal delivery, + until next trap or breakpoint. + <signame> - deliver signal <signame> and continue running. + detach - continue execution of the target process + and remove it from control by the debug process + +in a normal debugging environment, where the target is fork/exec'd by +the debugger, the debugger should fork and the child should stop itself +(with a self-inflicted SIGSTOP). the parent should do a "wait" then an +"attach". as before, the child will hit a breakpoint on the first +instruction in any newly exec'd image. + +$Id: README,v 3.1 1993/12/15 09:40:17 jsp Exp $ diff --git a/sys/fs/procfs/procfs.h b/sys/fs/procfs/procfs.h new file mode 100644 index 00000000000..f7b8fa3ef0e --- /dev/null +++ b/sys/fs/procfs/procfs.h @@ -0,0 +1,186 @@ +/* + * Copyright (c) 1993 Jan-Simon Pendry + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)procfs.h 8.6 (Berkeley) 2/3/94 + * + * From: + * $Id: procfs.h,v 3.2 1993/12/15 09:40:17 jsp Exp $ + */ + +/* + * The different types of node in a procfs filesystem + */ +typedef enum { + Proot, /* the filesystem root */ + Pproc, /* a process-specific sub-directory */ + Pfile, /* the executable file */ + Pmem, /* the process's memory image */ + Pregs, /* the process's register set */ + Pfpregs, /* the process's FP register set */ + Pctl, /* process control */ + Pstatus, /* process status */ + Pnote, /* process notifier */ + Pnotepg /* process group notifier */ +} pfstype; + +/* + * control data for the proc file system. 
+ */ +struct pfsnode { + struct pfsnode *pfs_next; /* next on list */ + struct vnode *pfs_vnode; /* vnode associated with this pfsnode */ + pfstype pfs_type; /* type of procfs node */ + pid_t pfs_pid; /* associated process */ + u_short pfs_mode; /* mode bits for stat() */ + u_long pfs_flags; /* open flags */ + u_long pfs_fileno; /* unique file id */ +}; + +#define PROCFS_NOTELEN 64 /* max length of a note (/proc/$pid/note) */ +#define PROCFS_CTLLEN 8 /* max length of a ctl msg (/proc/$pid/ctl) */ + +/* + * Kernel stuff follows + */ +#ifdef KERNEL +#define CNEQ(cnp, s, len) \ + ((cnp)->cn_namelen == (len) && \ + (bcmp((s), (cnp)->cn_nameptr, (len)) == 0)) + +/* + * Format of a directory entry in /proc, ... + * This must map onto struct dirent (see <dirent.h>) + */ +#define PROCFS_NAMELEN 8 +struct pfsdent { + u_long d_fileno; + u_short d_reclen; + u_char d_type; + u_char d_namlen; + char d_name[PROCFS_NAMELEN]; +}; +#define UIO_MX sizeof(struct pfsdent) +#define PROCFS_FILENO(pid, type) \ + (((type) == Proot) ? \ + 2 : \ + ((((pid)+1) << 3) + ((int) (type)))) + +/* + * Convert between pfsnode and vnode + */ +#define VTOPFS(vp) ((struct pfsnode *)(vp)->v_data) +#define PFSTOV(pfs) ((pfs)->pfs_vnode) + +typedef struct vfs_namemap vfs_namemap_t; +struct vfs_namemap { + const char *nm_name; + int nm_val; +}; + +extern int vfs_getuserstr __P((struct uio *, char *, int *)); +extern vfs_namemap_t *vfs_findname __P((vfs_namemap_t *, char *, int)); + +/* <machine/reg.h> */ +struct reg; +struct fpreg; + +#define PFIND(pid) ((pid) ? 
pfind(pid) : &proc0) +extern int procfs_freevp __P((struct vnode *)); +extern int procfs_allocvp __P((struct mount *, struct vnode **, long, pfstype)); +extern struct vnode *procfs_findtextvp __P((struct proc *)); +extern int procfs_sstep __P((struct proc *)); +extern void procfs_fix_sstep __P((struct proc *)); +extern int procfs_read_regs __P((struct proc *, struct reg *)); +extern int procfs_write_regs __P((struct proc *, struct reg *)); +extern int procfs_read_fpregs __P((struct proc *, struct fpreg *)); +extern int procfs_write_fpregs __P((struct proc *, struct fpreg *)); +extern int procfs_donote __P((struct proc *, struct proc *, struct pfsnode *pfsp, struct uio *uio)); +extern int procfs_doregs __P((struct proc *, struct proc *, struct pfsnode *pfsp, struct uio *uio)); +extern int procfs_dofpregs __P((struct proc *, struct proc *, struct pfsnode *pfsp, struct uio *uio)); +extern int procfs_domem __P((struct proc *, struct proc *, struct pfsnode *pfsp, struct uio *uio)); +extern int procfs_doctl __P((struct proc *, struct proc *, struct pfsnode *pfsp, struct uio *uio)); +extern int procfs_dostatus __P((struct proc *, struct proc *, struct pfsnode *pfsp, struct uio *uio)); + +#define PROCFS_LOCKED 0x01 +#define PROCFS_WANT 0x02 + +extern int (**procfs_vnodeop_p)(); +extern struct vfsops procfs_vfsops; + +/* + * Prototypes for procfs vnode ops + */ +int procfs_badop(); /* varargs */ +int procfs_rw __P((struct vop_read_args *)); +int procfs_lookup __P((struct vop_lookup_args *)); +#define procfs_create ((int (*) __P((struct vop_create_args *))) procfs_badop) +#define procfs_mknod ((int (*) __P((struct vop_mknod_args *))) procfs_badop) +int procfs_open __P((struct vop_open_args *)); +int procfs_close __P((struct vop_close_args *)); +int procfs_access __P((struct vop_access_args *)); +int procfs_getattr __P((struct vop_getattr_args *)); +int procfs_setattr __P((struct vop_setattr_args *)); +#define procfs_read procfs_rw +#define procfs_write procfs_rw +int 
procfs_ioctl __P((struct vop_ioctl_args *)); +#define procfs_select ((int (*) __P((struct vop_select_args *))) procfs_badop) +#define procfs_mmap ((int (*) __P((struct vop_mmap_args *))) procfs_badop) +#define procfs_fsync ((int (*) __P((struct vop_fsync_args *))) procfs_badop) +#define procfs_seek ((int (*) __P((struct vop_seek_args *))) procfs_badop) +#define procfs_remove ((int (*) __P((struct vop_remove_args *))) procfs_badop) +#define procfs_link ((int (*) __P((struct vop_link_args *))) procfs_badop) +#define procfs_rename ((int (*) __P((struct vop_rename_args *))) procfs_badop) +#define procfs_mkdir ((int (*) __P((struct vop_mkdir_args *))) procfs_badop) +#define procfs_rmdir ((int (*) __P((struct vop_rmdir_args *))) procfs_badop) +#define procfs_symlink ((int (*) __P((struct vop_symlink_args *))) procfs_badop) +int procfs_readdir __P((struct vop_readdir_args *)); +#define procfs_readlink ((int (*) __P((struct vop_readlink_args *))) procfs_badop) +int procfs_abortop __P((struct vop_abortop_args *)); +int procfs_inactive __P((struct vop_inactive_args *)); +int procfs_reclaim __P((struct vop_reclaim_args *)); +#define procfs_lock ((int (*) __P((struct vop_lock_args *))) nullop) +#define procfs_unlock ((int (*) __P((struct vop_unlock_args *))) nullop) +int procfs_bmap __P((struct vop_bmap_args *)); +#define procfs_strategy ((int (*) __P((struct vop_strategy_args *))) procfs_badop) +int procfs_print __P((struct vop_print_args *)); +#define procfs_islocked ((int (*) __P((struct vop_islocked_args *))) nullop) +#define procfs_advlock ((int (*) __P((struct vop_advlock_args *))) procfs_badop) +#define procfs_blkatoff ((int (*) __P((struct vop_blkatoff_args *))) procfs_badop) +#define procfs_valloc ((int (*) __P((struct vop_valloc_args *))) procfs_badop) +#define procfs_vfree ((int (*) __P((struct vop_vfree_args *))) nullop) +#define procfs_truncate ((int (*) __P((struct vop_truncate_args *))) procfs_badop) +#define procfs_update ((int (*) __P((struct vop_update_args 
*))) nullop) +#endif /* KERNEL */ diff --git a/sys/fs/procfs/procfs_ctl.c b/sys/fs/procfs/procfs_ctl.c new file mode 100644 index 00000000000..a42a03ce91c --- /dev/null +++ b/sys/fs/procfs/procfs_ctl.c @@ -0,0 +1,302 @@ +/* + * Copyright (c) 1993 Jan-Simon Pendry + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)procfs_ctl.c 8.3 (Berkeley) 1/21/94 + * + * From: + * $Id: procfs_ctl.c,v 3.2 1993/12/15 09:40:17 jsp Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * True iff process (p) is in trace wait state + * relative to process (curp) + */ +#define TRACE_WAIT_P(curp, p) \ + ((p)->p_stat == SSTOP && \ + (p)->p_pptr == (curp) && \ + ((p)->p_flag & P_TRACED)) + +#ifdef notdef +#define FIX_SSTEP(p) { \ + procfs_fix_sstep(p); \ + } \ +} +#else +#define FIX_SSTEP(p) +#endif + +#define PROCFS_CTL_ATTACH 1 +#define PROCFS_CTL_DETACH 2 +#define PROCFS_CTL_STEP 3 +#define PROCFS_CTL_RUN 4 +#define PROCFS_CTL_WAIT 5 + +static vfs_namemap_t ctlnames[] = { + /* special /proc commands */ + { "attach", PROCFS_CTL_ATTACH }, + { "detach", PROCFS_CTL_DETACH }, + { "step", PROCFS_CTL_STEP }, + { "run", PROCFS_CTL_RUN }, + { "wait", PROCFS_CTL_WAIT }, + { 0 }, +}; + +static vfs_namemap_t signames[] = { + /* regular signal names */ + { "hup", SIGHUP }, { "int", SIGINT }, + { "quit", SIGQUIT }, { "ill", SIGILL }, + { "trap", SIGTRAP }, { "abrt", SIGABRT }, + { "iot", SIGIOT }, { "emt", SIGEMT }, + { "fpe", SIGFPE }, { "kill", SIGKILL }, + { "bus", SIGBUS }, { "segv", SIGSEGV }, + { "sys", SIGSYS }, { "pipe", SIGPIPE }, + { "alrm", SIGALRM }, { "term", SIGTERM }, + { "urg", SIGURG }, { "stop", SIGSTOP }, + { "tstp", SIGTSTP }, { "cont", SIGCONT }, + { "chld", SIGCHLD }, { 
"ttin", SIGTTIN }, + { "ttou", SIGTTOU }, { "io", SIGIO }, + { "xcpu", SIGXCPU }, { "xfsz", SIGXFSZ }, + { "vtalrm", SIGVTALRM }, { "prof", SIGPROF }, + { "winch", SIGWINCH }, { "info", SIGINFO }, + { "usr1", SIGUSR1 }, { "usr2", SIGUSR2 }, + { 0 }, +}; + +static int +procfs_control(curp, p, op) + struct proc *curp; + struct proc *p; + int op; +{ + int error; + + /* + * Attach - attaches the target process for debugging + * by the calling process. + */ + if (op == PROCFS_CTL_ATTACH) { + /* check whether already being traced */ + if (p->p_flag & P_TRACED) + return (EBUSY); + + /* can't trace yourself! */ + if (p->p_pid == curp->p_pid) + return (EINVAL); + + /* + * Go ahead and set the trace flag. + * Save the old parent (it's reset in + * _DETACH, and also in kern_exit.c:wait4() + * Reparent the process so that the tracing + * proc gets to see all the action. + * Stop the target. + */ + p->p_flag |= P_TRACED; + p->p_xstat = 0; /* XXX ? */ + if (p->p_pptr != curp) { + p->p_oppid = p->p_pptr->p_pid; + proc_reparent(p, curp); + } + psignal(p, SIGSTOP); + return (0); + } + + /* + * Target process must be stopped, owned by (curp) and + * be set up for tracing (P_TRACED flag set). + * Allow DETACH to take place at any time for sanity. + * Allow WAIT any time, of course. + */ + switch (op) { + case PROCFS_CTL_DETACH: + case PROCFS_CTL_WAIT: + break; + + default: + if (!TRACE_WAIT_P(curp, p)) + return (EBUSY); + } + + /* + * do single-step fixup if needed + */ + FIX_SSTEP(p); + + /* + * Don't deliver any signal by default. + * To continue with a signal, just send + * the signal name to the ctl file + */ + p->p_xstat = 0; + + switch (op) { + /* + * Detach. Cleans up the target process, reparent it if possible + * and set it running once more. 
+ */ + case PROCFS_CTL_DETACH: + /* if not being traced, then this is a painless no-op */ + if ((p->p_flag & P_TRACED) == 0) + return (0); + + /* not being traced any more */ + p->p_flag &= ~P_TRACED; + + /* give process back to original parent */ + if (p->p_oppid != p->p_pptr->p_pid) { + struct proc *pp; + + pp = pfind(p->p_oppid); + if (pp) + proc_reparent(p, pp); + } + + p->p_oppid = 0; + p->p_flag &= ~P_WAITED; /* XXX ? */ + wakeup((caddr_t) curp); /* XXX for CTL_WAIT below ? */ + + break; + + /* + * Step. Let the target process execute a single instruction. + */ + case PROCFS_CTL_STEP: + procfs_sstep(p); + break; + + /* + * Run. Let the target process continue running until a breakpoint + * or some other trap. + */ + case PROCFS_CTL_RUN: + break; + + /* + * Wait for the target process to stop. + * If the target is not being traced then just wait + * to enter + */ + case PROCFS_CTL_WAIT: + error = 0; + if (p->p_flag & P_TRACED) { + while (error == 0 && + (p->p_stat != SSTOP) && + (p->p_flag & P_TRACED) && + (p->p_pptr == curp)) { + error = tsleep((caddr_t) p, + PWAIT|PCATCH, "procfsx", 0); + } + if (error == 0 && !TRACE_WAIT_P(curp, p)) + error = EBUSY; + } else { + while (error == 0 && p->p_stat != SSTOP) { + error = tsleep((caddr_t) p, + PWAIT|PCATCH, "procfs", 0); + } + } + return (error); + + default: + panic("procfs_control"); + } + + if (p->p_stat == SSTOP) + setrunnable(p); + return (0); +} + +int +procfs_doctl(curp, p, pfs, uio) + struct proc *curp; + struct pfsnode *pfs; + struct uio *uio; + struct proc *p; +{ + int xlen; + int error; + char msg[PROCFS_CTLLEN+1]; + vfs_namemap_t *nm; + + if (uio->uio_rw != UIO_WRITE) + return (EOPNOTSUPP); + + xlen = PROCFS_CTLLEN; + error = vfs_getuserstr(uio, msg, &xlen); + if (error) + return (error); + + /* + * Map signal names into signal generation + * or debug control. Unknown commands and/or signals + * return EOPNOTSUPP. 
+ * + * Sending a signal while the process is being debugged + * also has the side effect of letting the target continue + * to run. There is no way to single-step a signal delivery. + */ + error = EOPNOTSUPP; + + nm = vfs_findname(ctlnames, msg, xlen); + if (nm) { + error = procfs_control(curp, p, nm->nm_val); + } else { + nm = vfs_findname(signames, msg, xlen); + if (nm) { + if (TRACE_WAIT_P(curp, p)) { + p->p_xstat = nm->nm_val; + FIX_SSTEP(p); + setrunnable(p); + } else { + psignal(p, nm->nm_val); + } + error = 0; + } + } + + return (error); +} diff --git a/sys/fs/procfs/procfs_fpregs.c b/sys/fs/procfs/procfs_fpregs.c new file mode 100644 index 00000000000..6d850a6a881 --- /dev/null +++ b/sys/fs/procfs/procfs_fpregs.c @@ -0,0 +1,87 @@ +/* + * Copyright (c) 1993 Jan-Simon Pendry + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)procfs_fpregs.c 8.1 (Berkeley) 1/27/94 + * + * From: + * $Id: procfs_regs.c,v 3.2 1993/12/15 09:40:17 jsp Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int +procfs_dofpregs(curp, p, pfs, uio) + struct proc *curp; + struct proc *p; + struct pfsnode *pfs; + struct uio *uio; +{ + int error; + struct fpreg r; + char *kv; + int kl; + + kl = sizeof(r); + kv = (char *) &r; + + kv += uio->uio_offset; + kl -= uio->uio_offset; + if (kl > uio->uio_resid) + kl = uio->uio_resid; + + if (kl < 0) + error = EINVAL; + else + error = procfs_read_fpregs(p, &r); + if (error == 0) + error = uiomove(kv, kl, uio); + if (error == 0 && uio->uio_rw == UIO_WRITE) { + if (p->p_stat != SSTOP) + error = EBUSY; + else + error = procfs_write_fpregs(p, &r); + } + + uio->uio_offset = 0; + return (error); +} diff --git a/sys/fs/procfs/procfs_mem.c b/sys/fs/procfs/procfs_mem.c new file mode 100644 index 00000000000..039983da09c --- /dev/null +++ b/sys/fs/procfs/procfs_mem.c @@ -0,0 +1,302 @@ +/* + * Copyright (c) 1993 Jan-Simon Pendry + * Copyright (c) 1993 Sean Eric Fagan + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry and Sean Eric Fagan. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
/*
 * Transfer data between (uio) and the address space of process (p).
 * The uio offset is used as the user virtual address in (p); the
 * transfer direction is taken from uio->uio_rw.
 *
 * The loop handles at most one page per iteration and stops when
 * the request is satisfied, an error occurs, or the address goes
 * above VM_MAXUSER_ADDRESS (which ends the transfer with error 0).
 * Returns 0 or the first error reported by the VM calls or uiomove().
 */
static int
procfs_rwmem(p, uio)
	struct proc *p;
	struct uio *uio;
{
	int error;
	int writing;

	writing = uio->uio_rw == UIO_WRITE;

	/*
	 * Only map in one page at a time.  We don't have to, but it
	 * makes things easier.  This way is trivial - right?
	 */
	do {
		vm_map_t map, tmap;
		vm_object_t object;
		vm_offset_t kva;
		vm_offset_t uva;
		int page_offset;		/* offset into page */
		vm_offset_t pageno;		/* page number */
		vm_map_entry_t out_entry;
		vm_prot_t out_prot;
		vm_page_t m;
		boolean_t wired, single_use;
		vm_offset_t off;
		u_int len;
		int fix_prot;

		uva = (vm_offset_t) uio->uio_offset;
		if (uva > VM_MAXUSER_ADDRESS) {
			error = 0;
			break;
		}

		/*
		 * Get the page number of this segment.
		 */
		pageno = trunc_page(uva);
		page_offset = uva - pageno;

		/*
		 * How many bytes to copy
		 * (never past the end of the current page)
		 */
		len = min(PAGE_SIZE - page_offset, uio->uio_resid);

		/*
		 * The map we want...
		 */
		map = &p->p_vmspace->vm_map;

		/*
		 * Check the permissions for the area we're interested
		 * in.  Only writes need the protection changed.
		 */
		fix_prot = 0;
		if (writing)
			fix_prot = !vm_map_check_protection(map, pageno,
					pageno + PAGE_SIZE, VM_PROT_WRITE);

		if (fix_prot) {
			/*
			 * If the page is not writable, we make it so.
			 * XXX It is possible that a page may *not* be
			 * read/executable, if a process changes that!
			 * We will assume, for now, that a page is either
			 * VM_PROT_ALL, or VM_PROT_READ|VM_PROT_EXECUTE.
			 */
			error = vm_map_protect(map, pageno,
					pageno + PAGE_SIZE, VM_PROT_ALL, 0);
			if (error)
				break;
		}

		/*
		 * Now we need to get the page.  out_entry, out_prot, wired,
		 * and single_use aren't used.  One would think the vm code
		 * would be a *bit* nicer...  We use tmap because
		 * vm_map_lookup() can change the map argument.
		 */
		tmap = map;
		error = vm_map_lookup(&tmap, pageno,
				      writing ? VM_PROT_WRITE : VM_PROT_READ,
				      &out_entry, &object, &off, &out_prot,
				      &wired, &single_use);
		/*
		 * We're done with tmap now.
		 */
		if (!error)
			vm_map_lookup_done(tmap, out_entry);

		/*
		 * Fault the page in...
		 * (only for writes to an object with a shadow, when the
		 * page is absent or still marked copy-on-write)
		 */
		if (!error && writing && object->shadow) {
			m = vm_page_lookup(object, off);
			if (m == 0 || (m->flags & PG_COPYONWRITE))
				error = vm_fault(map, pageno,
							VM_PROT_WRITE, FALSE);
		}

		/* Find space in kernel_map for the page we're interested in */
		if (!error)
			error = vm_map_find(kernel_map, object, off, &kva,
					PAGE_SIZE, 1);

		if (!error) {
			/*
			 * Neither vm_map_lookup() nor vm_map_find() appear
			 * to add a reference count to the object, so we do
			 * that here and now.
			 */
			vm_object_reference(object);

			/*
			 * Mark the page we just found as pageable.
			 */
			error = vm_map_pageable(kernel_map, kva,
				kva + PAGE_SIZE, 0);

			/*
			 * Now do the i/o move.
			 */
			if (!error)
				error = uiomove(kva + page_offset, len, uio);

			/* drop the temporary kernel mapping in every case */
			vm_map_remove(kernel_map, kva, kva + PAGE_SIZE);
		}
		/*
		 * Undo the protection change made above; per the XXX
		 * note the page is assumed to have been read/execute.
		 */
		if (fix_prot)
			vm_map_protect(map, pageno, pageno + PAGE_SIZE,
					VM_PROT_READ|VM_PROT_EXECUTE, 0);
	} while (error == 0 && uio->uio_resid > 0);

	return (error);
}

/*
 * Copy data in and out of the target process.
 * We do this by mapping the process's page into
 * the kernel and then doing a uiomove direct
 * from the kernel address space.
 *
 * A zero-length request succeeds immediately; otherwise the
 * result of procfs_rwmem() is returned.
 */
int
procfs_domem(curp, p, pfs, uio)
	struct proc *curp;
	struct proc *p;
	struct pfsnode *pfs;
	struct uio *uio;
{
	int error;

	if (uio->uio_resid == 0)
		return (0);

	error = procfs_rwmem(p, uio);

	return (error);
}

/*
 * Given process (p), find the vnode from which
 * its text segment is being executed.
 *
 * It would be nice to grab this information from
 * the VM system, however, there is no sure-fire
 * way of doing that.  Instead, fork(), exec() and
 * wait() all maintain the p_textvp field in the
 * process proc structure which contains a held
 * reference to the exec'ed vnode.
 */
struct vnode *
procfs_findtextvp(p)
	struct proc *p;
{
	return (p->p_textvp);
}


#ifdef probably_never
/*
 * Given process (p), find the vnode from which
 * its text segment is being mapped.
 *
 * (This is here, rather than in procfs_subr in order
 * to keep all the VM related code in one place.)
 *
 * Disabled alternative to the p_textvp version above: it walks
 * the user address range a page at a time and returns the vnode
 * behind the first object found with a PG_VNODE pager, or 0 if
 * there is none.  It prints debugging output as it goes.
 */
struct vnode *
procfs_findtextvp(p)
	struct proc *p;
{
	int error;
	vm_object_t object;
	vm_offset_t pageno;		/* page number */

	/* find a vnode pager for the user address space */

	for (pageno = VM_MIN_ADDRESS;
			pageno < VM_MAXUSER_ADDRESS;
			pageno += PAGE_SIZE) {
		vm_map_t map;
		vm_map_entry_t out_entry;
		vm_prot_t out_prot;
		boolean_t wired, single_use;
		vm_offset_t off;

		map = &p->p_vmspace->vm_map;
		error = vm_map_lookup(&map, pageno,
			      VM_PROT_READ,
			      &out_entry, &object, &off, &out_prot,
			      &wired, &single_use);

		if (!error) {
			vm_pager_t pager;

			printf("procfs: found vm object\n");
			vm_map_lookup_done(map, out_entry);
			printf("procfs: vm object = %x\n", object);

			/*
			 * At this point, assuming no errors, object
			 * is the VM object mapping UVA (pageno).
			 * Ensure it has a vnode pager, then grab
			 * the vnode from that pager's handle.
			 */

			pager = object->pager;
			printf("procfs: pager = %x\n", pager);
			if (pager)
				printf("procfs: found pager, type = %d\n", pager->pg_type);
			if (pager && pager->pg_type == PG_VNODE) {
				struct vnode *vp;

				vp = (struct vnode *) pager->pg_handle;
				printf("procfs: vp = 0x%x\n", vp);
				return (vp);
			}
		}
	}

	printf("procfs: text object not found\n");
	return (0);
}
#endif /* probably_never */
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)procfs_note.c 8.2 (Berkeley) 1/21/94 + * + * From: + * $Id: procfs_note.c,v 3.2 1993/12/15 09:40:17 jsp Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int +procfs_donote(curp, p, pfs, uio) + struct proc *curp; + struct proc *p; + struct pfsnode *pfs; + struct uio *uio; +{ + int xlen; + int error; + char note[PROCFS_NOTELEN+1]; + + if (uio->uio_rw != UIO_WRITE) + return (EINVAL); + + xlen = PROCFS_NOTELEN; + error = vfs_getuserstr(uio, note, &xlen); + if (error) + return (error); + + /* send to process's notify function */ + return (EOPNOTSUPP); +} diff --git a/sys/fs/procfs/procfs_regs.c b/sys/fs/procfs/procfs_regs.c new file mode 100644 index 00000000000..fa95fef8f10 --- /dev/null +++ b/sys/fs/procfs/procfs_regs.c @@ -0,0 +1,87 @@ +/* + * Copyright (c) 1993 Jan-Simon Pendry + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)procfs_regs.c 8.3 (Berkeley) 1/27/94 + * + * From: + * $Id: procfs_regs.c,v 3.2 1993/12/15 09:40:17 jsp Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int +procfs_doregs(curp, p, pfs, uio) + struct proc *curp; + struct proc *p; + struct pfsnode *pfs; + struct uio *uio; +{ + int error; + struct reg r; + char *kv; + int kl; + + kl = sizeof(r); + kv = (char *) &r; + + kv += uio->uio_offset; + kl -= uio->uio_offset; + if (kl > uio->uio_resid) + kl = uio->uio_resid; + + if (kl < 0) + error = EINVAL; + else + error = procfs_read_regs(p, &r); + if (error == 0) + error = uiomove(kv, kl, uio); + if (error == 0 && uio->uio_rw == UIO_WRITE) { + if (p->p_stat != SSTOP) + error = EBUSY; + else + error = procfs_write_regs(p, &r); + } + + uio->uio_offset = 0; + return (error); +} diff --git a/sys/fs/procfs/procfs_status.c b/sys/fs/procfs/procfs_status.c new file mode 100644 index 00000000000..d88aaabdfb0 --- /dev/null +++ b/sys/fs/procfs/procfs_status.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 1993 Jan-Simon Pendry + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)procfs_status.c 8.3 (Berkeley) 2/17/94 + * + * From: + * $Id: procfs_status.c,v 3.1 1993/12/15 09:40:17 jsp Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int +procfs_dostatus(curp, p, pfs, uio) + struct proc *curp; + struct proc *p; + struct pfsnode *pfs; + struct uio *uio; +{ + struct session *sess; + struct tty *tp; + struct ucred *cr; + char *ps; + char *sep; + int pid, ppid, pgid, sid; + int i; + int xlen; + int error; + char psbuf[256]; /* XXX - conservative */ + + if (uio->uio_rw != UIO_READ) + return (EOPNOTSUPP); + + pid = p->p_pid; + ppid = p->p_pptr ? p->p_pptr->p_pid : 0, + pgid = p->p_pgrp->pg_id; + sess = p->p_pgrp->pg_session; + sid = sess->s_leader ? 
sess->s_leader->p_pid : 0; + +/* comm pid ppid pgid sid maj,min ctty,sldr start ut st wmsg uid groups ... */ + + ps = psbuf; + bcopy(p->p_comm, ps, MAXCOMLEN); + ps[MAXCOMLEN] = '\0'; + ps += strlen(ps); + ps += sprintf(ps, " %d %d %d %d ", pid, ppid, pgid, sid); + + if ((p->p_flag&P_CONTROLT) && (tp = sess->s_ttyp)) + ps += sprintf(ps, "%d,%d ", major(tp->t_dev), minor(tp->t_dev)); + else + ps += sprintf(ps, "%d,%d ", -1, -1); + + sep = ""; + if (sess->s_ttyvp) { + ps += sprintf(ps, "%sctty", sep); + sep = ","; + } + if (SESS_LEADER(p)) { + ps += sprintf(ps, "%ssldr", sep); + sep = ","; + } + if (*sep != ',') + ps += sprintf(ps, "noflags"); + + if (p->p_flag & P_INMEM) + ps += sprintf(ps, " %d,%d", + p->p_stats->p_start.tv_sec, + p->p_stats->p_start.tv_usec); + else + ps += sprintf(ps, " -1,-1"); + + { + struct timeval ut, st; + + calcru(p, &ut, &st, (void *) 0); + ps += sprintf(ps, " %d,%d %d,%d", + ut.tv_sec, + ut.tv_usec, + st.tv_sec, + st.tv_usec); + } + + ps += sprintf(ps, " %s", + (p->p_wchan && p->p_wmesg) ? p->p_wmesg : "nochan"); + + cr = p->p_ucred; + + ps += sprintf(ps, " %d", cr->cr_uid, cr->cr_gid); + for (i = 0; i < cr->cr_ngroups; i++) + ps += sprintf(ps, ",%d", cr->cr_groups[i]); + ps += sprintf(ps, "\n"); + + xlen = ps - psbuf; + xlen -= uio->uio_offset; + ps = psbuf + uio->uio_offset; + xlen = min(xlen, uio->uio_resid); + if (xlen <= 0) + error = 0; + else + error = uiomove(ps, xlen, uio); + + return (error); +} diff --git a/sys/fs/procfs/procfs_subr.c b/sys/fs/procfs/procfs_subr.c new file mode 100644 index 00000000000..b371af19af0 --- /dev/null +++ b/sys/fs/procfs/procfs_subr.c @@ -0,0 +1,314 @@ +/* + * Copyright (c) 1993 Jan-Simon Pendry + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)procfs_subr.c 8.4 (Berkeley) 1/27/94 + * + * From: + * $Id: procfs_subr.c,v 3.2 1993/12/15 09:40:17 jsp Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct pfsnode *pfshead; +static int pfsvplock; + +/* + * allocate a pfsnode/vnode pair. the vnode is + * referenced, but not locked. + * + * the pid, pfs_type, and mount point uniquely + * identify a pfsnode. the mount point is needed + * because someone might mount this filesystem + * twice. + * + * all pfsnodes are maintained on a singly-linked + * list. new nodes are only allocated when they cannot + * be found on this list. entries on the list are + * removed when the vfs reclaim entry is called. + * + * a single lock is kept for the entire list. this is + * needed because the getnewvnode() function can block + * waiting for a vnode to become free, in which case there + * may be more than one process trying to get the same + * vnode. this lock is only taken if we are going to + * call getnewvnode, since the kernel itself is single-threaded. + * + * if an entry is found on the list, then call vget() to + * take a reference. this is done because there may be + * zero references to it and so it needs to removed from + * the vnode free list. + */ +int +procfs_allocvp(mp, vpp, pid, pfs_type) + struct mount *mp; + struct vnode **vpp; + long pid; + pfstype pfs_type; +{ + int error; + struct pfsnode *pfs; + struct pfsnode **pp; + +loop: + for (pfs = pfshead; pfs != 0; pfs = pfs->pfs_next) { + if (pfs->pfs_pid == pid && + pfs->pfs_type == pfs_type && + PFSTOV(pfs)->v_mount == mp) { + if (vget(pfs->pfs_vnode, 0)) + goto loop; + *vpp = pfs->pfs_vnode; + return (0); + } + } + + /* + * otherwise lock the vp list while we call getnewvnode + * since that can block. 
+ */ + if (pfsvplock & PROCFS_LOCKED) { + pfsvplock |= PROCFS_WANT; + sleep((caddr_t) &pfsvplock, PINOD); + goto loop; + } + pfsvplock |= PROCFS_LOCKED; + + error = getnewvnode(VT_PROCFS, mp, procfs_vnodeop_p, vpp); + if (error) + goto out; + + MALLOC((*vpp)->v_data, void *, sizeof(struct pfsnode), + M_TEMP, M_WAITOK); + + pfs = VTOPFS(*vpp); + pfs->pfs_next = 0; + pfs->pfs_pid = (pid_t) pid; + pfs->pfs_type = pfs_type; + pfs->pfs_vnode = *vpp; + pfs->pfs_flags = 0; + pfs->pfs_fileno = PROCFS_FILENO(pid, pfs_type); + + switch (pfs_type) { + case Proot: /* /proc = dr-xr-xr-x */ + pfs->pfs_mode = (VREAD|VEXEC) | + (VREAD|VEXEC) >> 3 | + (VREAD|VEXEC) >> 6; + break; + + case Pproc: + pfs->pfs_mode = (VREAD|VEXEC) | + (VREAD|VEXEC) >> 3 | + (VREAD|VEXEC) >> 6; + break; + + case Pfile: + pfs->pfs_mode = (VREAD|VWRITE); + break; + + case Pmem: + pfs->pfs_mode = (VREAD|VWRITE); + break; + + case Pregs: + pfs->pfs_mode = (VREAD|VWRITE); + break; + + case Pfpregs: + pfs->pfs_mode = (VREAD|VWRITE); + break; + + case Pctl: + pfs->pfs_mode = (VWRITE); + break; + + case Pstatus: + pfs->pfs_mode = (VREAD) | + (VREAD >> 3) | + (VREAD >> 6); + break; + + case Pnote: + pfs->pfs_mode = (VWRITE); + break; + + case Pnotepg: + pfs->pfs_mode = (VWRITE); + break; + + default: + panic("procfs_allocvp"); + } + + /* add to procfs vnode list */ + for (pp = &pfshead; *pp; pp = &(*pp)->pfs_next) + continue; + *pp = pfs; + +out: + pfsvplock &= ~PROCFS_LOCKED; + + if (pfsvplock & PROCFS_WANT) { + pfsvplock &= ~PROCFS_WANT; + wakeup((caddr_t) &pfsvplock); + } + + return (error); +} + +int +procfs_freevp(vp) + struct vnode *vp; +{ + struct pfsnode **pfspp; + struct pfsnode *pfs = VTOPFS(vp); + + for (pfspp = &pfshead; *pfspp != 0; pfspp = &(*pfspp)->pfs_next) { + if (*pfspp == pfs) { + *pfspp = pfs->pfs_next; + break; + } + } + + FREE(vp->v_data, M_TEMP); + vp->v_data = 0; + return (0); +} + +int +procfs_rw(ap) + struct vop_read_args *ap; +{ + struct vnode *vp = ap->a_vp; + struct uio *uio = 
ap->a_uio; + struct proc *curp = uio->uio_procp; + struct pfsnode *pfs = VTOPFS(vp); + struct proc *p; + + p = PFIND(pfs->pfs_pid); + if (p == 0) + return (EINVAL); + + switch (pfs->pfs_type) { + case Pnote: + case Pnotepg: + return (procfs_donote(curp, p, pfs, uio)); + + case Pregs: + return (procfs_doregs(curp, p, pfs, uio)); + + case Pfpregs: + return (procfs_dofpregs(curp, p, pfs, uio)); + + case Pctl: + return (procfs_doctl(curp, p, pfs, uio)); + + case Pstatus: + return (procfs_dostatus(curp, p, pfs, uio)); + + case Pmem: + return (procfs_domem(curp, p, pfs, uio)); + + default: + return (EOPNOTSUPP); + } +} + +/* + * Get a string from userland into (buf). Strip a trailing + * nl character (to allow easy access from the shell). + * The buffer should be *buflenp + 1 chars long. vfs_getuserstr + * will automatically add a nul char at the end. + * + * Returns 0 on success or the following errors + * + * EINVAL: file offset is non-zero. + * EMSGSIZE: message is longer than kernel buffer + * EFAULT: user i/o buffer is not addressable + */ +int +vfs_getuserstr(uio, buf, buflenp) + struct uio *uio; + char *buf; + int *buflenp; +{ + int xlen; + int error; + + if (uio->uio_offset != 0) + return (EINVAL); + + xlen = *buflenp; + + /* must be able to read the whole string in one go */ + if (xlen < uio->uio_resid) + return (EMSGSIZE); + xlen = uio->uio_resid; + + error = uiomove(buf, xlen, uio); + if (error) + return (error); + + /* allow multiple writes without seeks */ + uio->uio_offset = 0; + + /* cleanup string and remove trailing newline */ + buf[xlen] = '\0'; + xlen = strlen(buf); + if (xlen > 0 && buf[xlen-1] == '\n') + buf[--xlen] = '\0'; + *buflenp = xlen; + + return (0); +} + +vfs_namemap_t * +vfs_findname(nm, buf, buflen) + vfs_namemap_t *nm; + char *buf; + int buflen; +{ + for (; nm->nm_name; nm++) + if (bcmp(buf, (char *) nm->nm_name, buflen+1) == 0) + return (nm); + + return (0); +} diff --git a/sys/fs/procfs/procfs_vfsops.c b/sys/fs/procfs/procfs_vfsops.c 
new file mode 100644 index 00000000000..3938ca12357 --- /dev/null +++ b/sys/fs/procfs/procfs_vfsops.c @@ -0,0 +1,243 @@ +/* + * Copyright (c) 1993 Jan-Simon Pendry + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)procfs_vfsops.c 8.4 (Berkeley) 1/21/94 + * + * From: + * $Id: procfs_vfsops.c,v 3.1 1993/12/15 09:40:17 jsp Exp $ + */ + +/* + * procfs VFS interface + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for PAGE_SIZE */ + +/* + * VFS Operations. + * + * mount system call + */ +/* ARGSUSED */ +procfs_mount(mp, path, data, ndp, p) + struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + u_int size; + + if (UIO_MX & (UIO_MX-1)) { + log(LOG_ERR, "procfs: invalid directory entry size"); + return (EINVAL); + } + + if (mp->mnt_flag & MNT_UPDATE) + return (EOPNOTSUPP); + + mp->mnt_flag |= MNT_LOCAL; + mp->mnt_data = 0; + getnewfsid(mp, MOUNT_PROCFS); + + (void) copyinstr(path, (caddr_t)mp->mnt_stat.f_mntonname, MNAMELEN, &size); + bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); + + size = sizeof("procfs") - 1; + bcopy("procfs", mp->mnt_stat.f_mntfromname, size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + + return (0); +} + +/* + * unmount system call + */ +procfs_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + int error; + extern int doforce; + int flags = 0; + + if (mntflags & MNT_FORCE) { + /* procfs can never be rootfs so don't check for it */ + if (!doforce) + return (EINVAL); + flags |= FORCECLOSE; + } + + if (error = vflush(mp, 0, flags)) + return (error); + + 
return (0); +} + +procfs_root(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + struct pfsnode *pfs; + struct vnode *vp; + int error; + + error = procfs_allocvp(mp, &vp, (pid_t) 0, Proot); + if (error) + return (error); + + vp->v_type = VDIR; + vp->v_flag = VROOT; + pfs = VTOPFS(vp); + + *vpp = vp; + return (0); +} + +/* + */ +/* ARGSUSED */ +procfs_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + + return (0); +} + +/* + * Get file system statistics. + */ +procfs_statfs(mp, sbp, p) + struct mount *mp; + struct statfs *sbp; + struct proc *p; +{ + sbp->f_type = MOUNT_PROCFS; + sbp->f_bsize = PAGE_SIZE; + sbp->f_iosize = PAGE_SIZE; + sbp->f_blocks = 1; /* avoid divide by zero in some df's */ + sbp->f_bfree = 0; + sbp->f_bavail = 0; + sbp->f_files = maxproc; /* approx */ + sbp->f_ffree = maxproc - nprocs; /* approx */ + + if (sbp != &mp->mnt_stat) { + bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); + bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); + bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); + } + + return (0); +} + + +procfs_quotactl(mp, cmds, uid, arg, p) + struct mount *mp; + int cmds; + uid_t uid; + caddr_t arg; + struct proc *p; +{ + + return (EOPNOTSUPP); +} + +procfs_sync(mp, waitfor) + struct mount *mp; + int waitfor; +{ + + return (0); +} + +procfs_vget(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + + return (EOPNOTSUPP); +} + +procfs_fhtovp(mp, fhp, vpp) + struct mount *mp; + struct fid *fhp; + struct vnode **vpp; +{ + + return (EINVAL); +} + +procfs_vptofh(vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + + return EINVAL; +} + +procfs_init() +{ + + return (0); +} + +struct vfsops procfs_vfsops = { + procfs_mount, + procfs_start, + procfs_unmount, + procfs_root, + procfs_quotactl, + procfs_statfs, + procfs_sync, + procfs_vget, + procfs_fhtovp, + procfs_vptofh, + procfs_init, +}; diff --git a/sys/fs/procfs/procfs_vnops.c 
b/sys/fs/procfs/procfs_vnops.c new file mode 100644 index 00000000000..4e1ee002bb9 --- /dev/null +++ b/sys/fs/procfs/procfs_vnops.c @@ -0,0 +1,814 @@ +/* + * Copyright (c) 1993 Jan-Simon Pendry + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)procfs_vnops.c 8.6 (Berkeley) 2/7/94 + * + * From: + * $Id: procfs_vnops.c,v 3.2 1993/12/15 09:40:17 jsp Exp $ + */ + +/* + * procfs vnode interface + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for PAGE_SIZE */ + +/* + * Vnode Operations. + * + */ + +/* + * This is a list of the valid names in the + * process-specific sub-directories. It is + * used in procfs_lookup and procfs_readdir + */ +static struct pfsnames { + u_short d_namlen; + char d_name[PROCFS_NAMELEN]; + pfstype d_pfstype; +} procent[] = { +#define N(s) sizeof(s)-1, s + /* namlen, nam, type */ + { N("file"), Pfile }, + { N("mem"), Pmem }, + { N("regs"), Pregs }, + { N("fpregs"), Pfpregs }, + { N("ctl"), Pctl }, + { N("status"), Pstatus }, + { N("note"), Pnote }, + { N("notepg"), Pnotepg }, +#undef N +}; +#define Nprocent (sizeof(procent)/sizeof(procent[0])) + +static pid_t atopid __P((const char *, u_int)); + +/* + * set things up for doing i/o on + * the pfsnode (vp). (vp) is locked + * on entry, and should be left locked + * on exit. + * + * for procfs we don't need to do anything + * in particular for i/o. all that is done + * is to support exclusive open on process + * memory images. 
+ */ +procfs_open(ap) + struct vop_open_args *ap; +{ + struct pfsnode *pfs = VTOPFS(ap->a_vp); + + switch (pfs->pfs_type) { + case Pmem: + if (PFIND(pfs->pfs_pid) == 0) + return (ENOENT); /* was ESRCH, jsp */ + + if ((pfs->pfs_flags & FWRITE) && (ap->a_mode & O_EXCL) || + (pfs->pfs_flags & O_EXCL) && (ap->a_mode & FWRITE)) + return (EBUSY); + + + if (ap->a_mode & FWRITE) + pfs->pfs_flags = ap->a_mode & (FWRITE|O_EXCL); + + return (0); + + default: + break; + } + + return (0); +} + +/* + * close the pfsnode (vp) after doing i/o. + * (vp) is not locked on entry or exit. + * + * nothing to do for procfs other than undo + * any exclusive open flag (see _open above). + */ +procfs_close(ap) + struct vop_close_args *ap; +{ + struct pfsnode *pfs = VTOPFS(ap->a_vp); + + switch (pfs->pfs_type) { + case Pmem: + if ((ap->a_fflag & FWRITE) && (pfs->pfs_flags & O_EXCL)) + pfs->pfs_flags &= ~(FWRITE|O_EXCL); + break; + } + + return (0); +} + +/* + * do an ioctl operation on pfsnode (vp). + * (vp) is not locked on entry or exit. + */ +procfs_ioctl(ap) + struct vop_ioctl_args *ap; +{ + + return (ENOTTY); +} + +/* + * do block mapping for pfsnode (vp). + * since we don't use the buffer cache + * for procfs this function should never + * be called. in any case, it's not clear + * what part of the kernel ever makes use + * of this function. for sanity, this is the + * usual no-op bmap, although returning + * (EIO) would be a reasonable alternative. + */ +procfs_bmap(ap) + struct vop_bmap_args *ap; +{ + + if (ap->a_vpp != NULL) + *ap->a_vpp = ap->a_vp; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn; + return (0); +} + +/* + * _inactive is called when the pfsnode + * is vrele'd and the reference count goes + * to zero. (vp) will be on the vnode free + * list, so to get it back vget() must be + * used. + * + * for procfs, check if the process is still + * alive and if it isn't then just throw away + * the vnode by calling vgone(). 
this may + * be overkill and a waste of time since the + * chances are that the process will still be + * there and PFIND is not free. + * + * (vp) is not locked on entry or exit. + */ +procfs_inactive(ap) + struct vop_inactive_args *ap; +{ + struct pfsnode *pfs = VTOPFS(ap->a_vp); + + if (PFIND(pfs->pfs_pid) == 0) + vgone(ap->a_vp); + + return (0); +} + +/* + * _reclaim is called when getnewvnode() + * wants to make use of an entry on the vnode + * free list. at this time the filesystem needs + * to free any private data and remove the node + * from any private lists. + */ +procfs_reclaim(ap) + struct vop_reclaim_args *ap; +{ + int error; + + error = procfs_freevp(ap->a_vp); + return (error); +} + +/* + * Return POSIX pathconf information applicable to special devices. + */ +procfs_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = LINK_MAX; + return (0); + case _PC_MAX_CANON: + *ap->a_retval = MAX_CANON; + return (0); + case _PC_MAX_INPUT: + *ap->a_retval = MAX_INPUT; + return (0); + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + return (0); + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + return (0); + case _PC_VDISABLE: + *ap->a_retval = _POSIX_VDISABLE; + return (0); + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * _print is used for debugging. + * just print a readable description + * of (vp). + */ +procfs_print(ap) + struct vop_print_args *ap; +{ + struct pfsnode *pfs = VTOPFS(ap->a_vp); + + printf("tag VT_PROCFS, pid %d, mode %x, flags %x\n", + pfs->pfs_pid, + pfs->pfs_mode, pfs->pfs_flags); +} + +/* + * _abortop is called when operations such as + * rename and create fail. this entry is responsible + * for undoing any side-effects caused by the lookup. + * this will always include freeing the pathname buffer. 
+ */ +procfs_abortop(ap) + struct vop_abortop_args *ap; +{ + + if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) + FREE(ap->a_cnp->cn_pnbuf, M_NAMEI); + return (0); +} + +/* + * generic entry point for unsupported operations + */ +procfs_badop() +{ + + return (EIO); +} + +/* + * Invent attributes for pfsnode (vp) and store + * them in (vap). + * Directories lengths are returned as zero since + * any real length would require the genuine size + * to be computed, and nothing cares anyway. + * + * this is relatively minimal for procfs. + */ +procfs_getattr(ap) + struct vop_getattr_args *ap; +{ + struct pfsnode *pfs = VTOPFS(ap->a_vp); + struct vattr *vap = ap->a_vap; + struct proc *procp; + int error; + + /* first check the process still exists */ + switch (pfs->pfs_type) { + case Proot: + procp = 0; + break; + + default: + procp = PFIND(pfs->pfs_pid); + if (procp == 0) + return (ENOENT); + } + + error = 0; + + /* start by zeroing out the attributes */ + VATTR_NULL(vap); + + /* next do all the common fields */ + vap->va_type = ap->a_vp->v_type; + vap->va_mode = pfs->pfs_mode; + vap->va_fileid = pfs->pfs_fileno; + vap->va_flags = 0; + vap->va_blocksize = PAGE_SIZE; + vap->va_bytes = vap->va_size = 0; + + /* + * If the process has exercised some setuid or setgid + * privilege, then rip away read/write permission so + * that only root can gain access. + */ + switch (pfs->pfs_type) { + case Pregs: + case Pfpregs: + case Pmem: + if (procp->p_flag & P_SUGID) + vap->va_mode &= ~((VREAD|VWRITE)| + ((VREAD|VWRITE)>>3)| + ((VREAD|VWRITE)>>6)); + break; + } + + /* + * Make all times be current TOD. + * It would be possible to get the process start + * time from the p_stat structure, but there's + * no "file creation" time stamp anyway, and the + * p_stat structure is not addressible if u. gets + * swapped out for that process. 
+ */ + microtime(&vap->va_ctime); + vap->va_atime = vap->va_mtime = vap->va_ctime; + + /* + * now do the object specific fields + * + * The size could be set from struct reg, but it's hardly + * worth the trouble, and it puts some (potentially) machine + * dependent data into this machine-independent code. If it + * becomes important then this function should break out into + * a per-file stat function in the corresponding .c file. + */ + + switch (pfs->pfs_type) { + case Proot: + vap->va_nlink = 2; + vap->va_uid = 0; + vap->va_gid = 0; + break; + + case Pproc: + vap->va_nlink = 2; + vap->va_uid = procp->p_ucred->cr_uid; + vap->va_gid = procp->p_ucred->cr_gid; + break; + + case Pfile: + error = EOPNOTSUPP; + break; + + case Pmem: + vap->va_nlink = 1; + vap->va_bytes = vap->va_size = + ctob(procp->p_vmspace->vm_tsize + + procp->p_vmspace->vm_dsize + + procp->p_vmspace->vm_ssize); + vap->va_uid = procp->p_ucred->cr_uid; + vap->va_gid = procp->p_ucred->cr_gid; + break; + + case Pregs: + case Pfpregs: + case Pctl: + case Pstatus: + case Pnote: + case Pnotepg: + vap->va_nlink = 1; + vap->va_uid = procp->p_ucred->cr_uid; + vap->va_gid = procp->p_ucred->cr_gid; + break; + + default: + panic("procfs_getattr"); + } + + return (error); +} + +procfs_setattr(ap) + struct vop_setattr_args *ap; +{ + /* + * just fake out attribute setting + * it's not good to generate an error + * return, otherwise things like creat() + * will fail when they try to set the + * file length to 0. worse, this means + * that echo $note > /proc/$pid/note will fail. + */ + + return (0); +} + +/* + * implement access checking. + * + * something very similar to this code is duplicated + * throughout the 4bsd kernel and should be moved + * into kern/vfs_subr.c sometime. + * + * actually, the check for super-user is slightly + * broken since it will allow read access to write-only + * objects. 
this doesn't cause any particular trouble + * but does mean that the i/o entry points need to check + * that the operation really does make sense. + */ +procfs_access(ap) + struct vop_access_args *ap; +{ + struct vattr *vap; + struct vattr vattr; + int error; + + /* + * If you're the super-user, + * you always get access. + */ + if (ap->a_cred->cr_uid == (uid_t) 0) + return (0); + vap = &vattr; + if (error = VOP_GETATTR(ap->a_vp, vap, ap->a_cred, ap->a_p)) + return (error); + + /* + * Access check is based on only one of owner, group, public. + * If not owner, then check group. If not a member of the + * group, then check public access. + */ + if (ap->a_cred->cr_uid != vap->va_uid) { + gid_t *gp; + int i; + + (ap->a_mode) >>= 3; + gp = ap->a_cred->cr_groups; + for (i = 0; i < ap->a_cred->cr_ngroups; i++, gp++) + if (vap->va_gid == *gp) + goto found; + ap->a_mode >>= 3; +found: + ; + } + + if ((vap->va_mode & ap->a_mode) == ap->a_mode) + return (0); + + return (EACCES); +} + +/* + * lookup. this is incredibly complicated in the + * general case, however for most pseudo-filesystems + * very little needs to be done. + * + * unless you want to get a migraine, just make sure your + * filesystem doesn't do any locking of its own. otherwise + * read and inwardly digest ufs_lookup(). 
+ */ +procfs_lookup(ap) + struct vop_lookup_args *ap; +{ + struct componentname *cnp = ap->a_cnp; + struct vnode **vpp = ap->a_vpp; + struct vnode *dvp = ap->a_dvp; + char *pname = cnp->cn_nameptr; + int error = 0; + pid_t pid; + struct vnode *nvp; + struct pfsnode *pfs; + struct proc *procp; + pfstype pfs_type; + int i; + + if (cnp->cn_namelen == 1 && *pname == '.') { + *vpp = dvp; + VREF(dvp); + /*VOP_LOCK(dvp);*/ + return (0); + } + + *vpp = NULL; + + pfs = VTOPFS(dvp); + switch (pfs->pfs_type) { + case Proot: + if (cnp->cn_flags & ISDOTDOT) + return (EIO); + + if (CNEQ(cnp, "curproc", 7)) + pid = cnp->cn_proc->p_pid; + else + pid = atopid(pname, cnp->cn_namelen); + if (pid == NO_PID) + return (ENOENT); + + procp = PFIND(pid); + if (procp == 0) + return (ENOENT); + + error = procfs_allocvp(dvp->v_mount, &nvp, pid, Pproc); + if (error) + return (error); + + nvp->v_type = VDIR; + pfs = VTOPFS(nvp); + + *vpp = nvp; + return (0); + + case Pproc: + if (cnp->cn_flags & ISDOTDOT) { + error = procfs_root(dvp->v_mount, vpp); + return (error); + } + + procp = PFIND(pfs->pfs_pid); + if (procp == 0) + return (ENOENT); + + for (i = 0; i < Nprocent; i++) { + struct pfsnames *dp = &procent[i]; + + if (cnp->cn_namelen == dp->d_namlen && + bcmp(pname, dp->d_name, dp->d_namlen) == 0) { + pfs_type = dp->d_pfstype; + goto found; + } + } + return (ENOENT); + + found: + if (pfs_type == Pfile) { + nvp = procfs_findtextvp(procp); + if (nvp) { + VREF(nvp); + VOP_LOCK(nvp); + } else { + error = ENXIO; + } + } else { + error = procfs_allocvp(dvp->v_mount, &nvp, + pfs->pfs_pid, pfs_type); + if (error) + return (error); + + nvp->v_type = VREG; + pfs = VTOPFS(nvp); + } + *vpp = nvp; + return (error); + + default: + return (ENOTDIR); + } +} + +/* + * readdir returns directory entries from pfsnode (vp). + * + * the strategy here with procfs is to generate a single + * directory entry at a time (struct pfsdent) and then + * copy that out to userland using uiomove. 
a more efficent + * though more complex implementation, would try to minimize + * the number of calls to uiomove(). for procfs, this is + * hardly worth the added code complexity. + * + * this should just be done through read() + */ +procfs_readdir(ap) + struct vop_readdir_args *ap; +{ + struct uio *uio = ap->a_uio; + struct pfsdent d; + struct pfsdent *dp = &d; + struct pfsnode *pfs; + int error; + int count; + int i; + + pfs = VTOPFS(ap->a_vp); + + if (uio->uio_resid < UIO_MX) + return (EINVAL); + if (uio->uio_offset & (UIO_MX-1)) + return (EINVAL); + if (uio->uio_offset < 0) + return (EINVAL); + + error = 0; + count = 0; + i = uio->uio_offset / UIO_MX; + + switch (pfs->pfs_type) { + /* + * this is for the process-specific sub-directories. + * all that is needed to is copy out all the entries + * from the procent[] table (top of this file). + */ + case Pproc: { + while (uio->uio_resid >= UIO_MX) { + struct pfsnames *dt; + + if (i >= Nprocent) + break; + + dt = &procent[i]; + + dp->d_reclen = UIO_MX; + dp->d_fileno = PROCFS_FILENO(pfs->pfs_pid, dt->d_pfstype); + dp->d_type = DT_REG; + dp->d_namlen = dt->d_namlen; + bcopy(dt->d_name, dp->d_name, sizeof(dt->d_name)-1); + error = uiomove((caddr_t) dp, UIO_MX, uio); + if (error) + break; + count += UIO_MX; + i++; + } + + break; + + } + + /* + * this is for the root of the procfs filesystem + * what is needed is a special entry for "curproc" + * followed by an entry for each process on allproc +#ifdef PROCFS_ZOMBIE + * and zombproc. 
+#endif + */ + + case Proot: { + int pcnt; +#ifdef PROCFS_ZOMBIE + int doingzomb = 0; +#endif + volatile struct proc *p; + + p = allproc; + +#define PROCFS_XFILES 1 /* number of other entries, like "curproc" */ + pcnt = PROCFS_XFILES; + + while (p && uio->uio_resid >= UIO_MX) { + bzero((char *) dp, UIO_MX); + dp->d_type = DT_DIR; + dp->d_reclen = UIO_MX; + + switch (i) { + case 0: + /* ship out entry for "curproc" */ + dp->d_fileno = PROCFS_FILENO(PID_MAX+1, Pproc); + dp->d_namlen = sprintf(dp->d_name, "curproc"); + break; + + default: + if (pcnt >= i) { + dp->d_fileno = PROCFS_FILENO(p->p_pid, Pproc); + dp->d_namlen = sprintf(dp->d_name, "%ld", (long) p->p_pid); + } + + p = p->p_next; + +#ifdef PROCFS_ZOMBIE + if (p == 0 && doingzomb == 0) { + doingzomb = 1; + p = zombproc; + } +#endif + + if (pcnt++ < i) + continue; + + break; + } + error = uiomove((caddr_t) dp, UIO_MX, uio); + if (error) + break; + count += UIO_MX; + i++; + } + + break; + + } + + default: + error = ENOTDIR; + break; + } + + uio->uio_offset = i * UIO_MX; + + return (error); +} + +/* + * convert decimal ascii to pid_t + */ +static pid_t +atopid(b, len) + const char *b; + u_int len; +{ + pid_t p = 0; + + while (len--) { + char c = *b++; + if (c < '0' || c > '9') + return (NO_PID); + p = 10 * p + (c - '0'); + if (p > PID_MAX) + return (NO_PID); + } + + return (p); +} + +/* + * procfs vnode operations. 
+ */ +int (**procfs_vnodeop_p)(); +struct vnodeopv_entry_desc procfs_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, procfs_lookup }, /* lookup */ + { &vop_create_desc, procfs_create }, /* create */ + { &vop_mknod_desc, procfs_mknod }, /* mknod */ + { &vop_open_desc, procfs_open }, /* open */ + { &vop_close_desc, procfs_close }, /* close */ + { &vop_access_desc, procfs_access }, /* access */ + { &vop_getattr_desc, procfs_getattr }, /* getattr */ + { &vop_setattr_desc, procfs_setattr }, /* setattr */ + { &vop_read_desc, procfs_read }, /* read */ + { &vop_write_desc, procfs_write }, /* write */ + { &vop_ioctl_desc, procfs_ioctl }, /* ioctl */ + { &vop_select_desc, procfs_select }, /* select */ + { &vop_mmap_desc, procfs_mmap }, /* mmap */ + { &vop_fsync_desc, procfs_fsync }, /* fsync */ + { &vop_seek_desc, procfs_seek }, /* seek */ + { &vop_remove_desc, procfs_remove }, /* remove */ + { &vop_link_desc, procfs_link }, /* link */ + { &vop_rename_desc, procfs_rename }, /* rename */ + { &vop_mkdir_desc, procfs_mkdir }, /* mkdir */ + { &vop_rmdir_desc, procfs_rmdir }, /* rmdir */ + { &vop_symlink_desc, procfs_symlink }, /* symlink */ + { &vop_readdir_desc, procfs_readdir }, /* readdir */ + { &vop_readlink_desc, procfs_readlink }, /* readlink */ + { &vop_abortop_desc, procfs_abortop }, /* abortop */ + { &vop_inactive_desc, procfs_inactive }, /* inactive */ + { &vop_reclaim_desc, procfs_reclaim }, /* reclaim */ + { &vop_lock_desc, procfs_lock }, /* lock */ + { &vop_unlock_desc, procfs_unlock }, /* unlock */ + { &vop_bmap_desc, procfs_bmap }, /* bmap */ + { &vop_strategy_desc, procfs_strategy }, /* strategy */ + { &vop_print_desc, procfs_print }, /* print */ + { &vop_islocked_desc, procfs_islocked }, /* islocked */ + { &vop_pathconf_desc, procfs_pathconf }, /* pathconf */ + { &vop_advlock_desc, procfs_advlock }, /* advlock */ + { &vop_blkatoff_desc, procfs_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, procfs_valloc }, /* valloc */ + 
{ &vop_vfree_desc, procfs_vfree }, /* vfree */ + { &vop_truncate_desc, procfs_truncate }, /* truncate */ + { &vop_update_desc, procfs_update }, /* update */ + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc procfs_vnodeop_opv_desc = + { &procfs_vnodeop_p, procfs_vnodeop_entries }; diff --git a/sys/fs/specfs/spec_vnops.c b/sys/fs/specfs/spec_vnops.c new file mode 100644 index 00000000000..111c517b162 --- /dev/null +++ b/sys/fs/specfs/spec_vnops.c @@ -0,0 +1,689 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)spec_vnops.c 8.6 (Berkeley) 4/9/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* symbolic sleep message strings for devices */ +char devopn[] = "devopn"; +char devio[] = "devio"; +char devwait[] = "devwait"; +char devin[] = "devin"; +char devout[] = "devout"; +char devioc[] = "devioc"; +char devcls[] = "devcls"; + +int (**spec_vnodeop_p)(); +struct vnodeopv_entry_desc spec_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, spec_lookup }, /* lookup */ + { &vop_create_desc, spec_create }, /* create */ + { &vop_mknod_desc, spec_mknod }, /* mknod */ + { &vop_open_desc, spec_open }, /* open */ + { &vop_close_desc, spec_close }, /* close */ + { &vop_access_desc, spec_access }, /* access */ + { &vop_getattr_desc, spec_getattr }, /* getattr */ + { &vop_setattr_desc, spec_setattr }, /* setattr */ + { &vop_read_desc, spec_read }, /* read */ + { &vop_write_desc, spec_write }, /* write */ + { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ + { &vop_select_desc, spec_select }, /* select */ + { &vop_mmap_desc, spec_mmap }, /* mmap */ + { &vop_fsync_desc, spec_fsync }, /* fsync */ + { &vop_seek_desc, spec_seek }, /* seek */ + { &vop_remove_desc, spec_remove }, /* remove */ + { &vop_link_desc, spec_link }, /* link */ + { &vop_rename_desc, spec_rename }, /* rename */ + { &vop_mkdir_desc, spec_mkdir }, 
/* mkdir */ + { &vop_rmdir_desc, spec_rmdir }, /* rmdir */ + { &vop_symlink_desc, spec_symlink }, /* symlink */ + { &vop_readdir_desc, spec_readdir }, /* readdir */ + { &vop_readlink_desc, spec_readlink }, /* readlink */ + { &vop_abortop_desc, spec_abortop }, /* abortop */ + { &vop_inactive_desc, spec_inactive }, /* inactive */ + { &vop_reclaim_desc, spec_reclaim }, /* reclaim */ + { &vop_lock_desc, spec_lock }, /* lock */ + { &vop_unlock_desc, spec_unlock }, /* unlock */ + { &vop_bmap_desc, spec_bmap }, /* bmap */ + { &vop_strategy_desc, spec_strategy }, /* strategy */ + { &vop_print_desc, spec_print }, /* print */ + { &vop_islocked_desc, spec_islocked }, /* islocked */ + { &vop_pathconf_desc, spec_pathconf }, /* pathconf */ + { &vop_advlock_desc, spec_advlock }, /* advlock */ + { &vop_blkatoff_desc, spec_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, spec_valloc }, /* valloc */ + { &vop_vfree_desc, spec_vfree }, /* vfree */ + { &vop_truncate_desc, spec_truncate }, /* truncate */ + { &vop_update_desc, spec_update }, /* update */ + { &vop_bwrite_desc, spec_bwrite }, /* bwrite */ + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc spec_vnodeop_opv_desc = + { &spec_vnodeop_p, spec_vnodeop_entries }; + +/* + * Trivial lookup routine that always fails. + */ +int +spec_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + + *ap->a_vpp = NULL; + return (ENOTDIR); +} + +/* + * Open a special file. + */ +/* ARGSUSED */ +spec_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct vnode *bvp, *vp = ap->a_vp; + dev_t bdev, dev = (dev_t)vp->v_rdev; + register int maj = major(dev); + int error; + + /* + * Don't allow open if fs is mounted -nodev. 
+ */ + if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV)) + return (ENXIO); + + switch (vp->v_type) { + + case VCHR: + if ((u_int)maj >= nchrdev) + return (ENXIO); + if (ap->a_cred != FSCRED && (ap->a_mode & FWRITE)) { + /* + * When running in very secure mode, do not allow + * opens for writing of any disk character devices. + */ + if (securelevel >= 2 && isdisk(dev, VCHR)) + return (EPERM); + /* + * When running in secure mode, do not allow opens + * for writing of /dev/mem, /dev/kmem, or character + * devices whose corresponding block devices are + * currently mounted. + */ + if (securelevel >= 1) { + if ((bdev = chrtoblk(dev)) != NODEV && + vfinddev(bdev, VBLK, &bvp) && + bvp->v_usecount > 0 && + (error = vfs_mountedon(bvp))) + return (error); + if (iskmemdev(dev)) + return (EPERM); + } + } + VOP_UNLOCK(vp); + error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, ap->a_p); + VOP_LOCK(vp); + return (error); + + case VBLK: + if ((u_int)maj >= nblkdev) + return (ENXIO); + /* + * When running in very secure mode, do not allow + * opens for writing of any disk block devices. + */ + if (securelevel >= 2 && ap->a_cred != FSCRED && + (ap->a_mode & FWRITE) && isdisk(dev, VBLK)) + return (EPERM); + /* + * Do not allow opens of block devices that are + * currently mounted. 
+ */ + if (error = vfs_mountedon(vp)) + return (error); + return ((*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, ap->a_p)); + } + return (0); +} + +/* + * Vnode op for read + */ +/* ARGSUSED */ +spec_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct uio *uio = ap->a_uio; + struct proc *p = uio->uio_procp; + struct buf *bp; + daddr_t bn, nextbn; + long bsize, bscale; + struct partinfo dpart; + int n, on, majordev, (*ioctl)(); + int error = 0; + dev_t dev; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_READ) + panic("spec_read mode"); + if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) + panic("spec_read proc"); +#endif + if (uio->uio_resid == 0) + return (0); + + switch (vp->v_type) { + + case VCHR: + VOP_UNLOCK(vp); + error = (*cdevsw[major(vp->v_rdev)].d_read) + (vp->v_rdev, uio, ap->a_ioflag); + VOP_LOCK(vp); + return (error); + + case VBLK: + if (uio->uio_offset < 0) + return (EINVAL); + bsize = BLKDEV_IOSIZE; + dev = vp->v_rdev; + if ((majordev = major(dev)) < nblkdev && + (ioctl = bdevsw[majordev].d_ioctl) != NULL && + (*ioctl)(dev, DIOCGPART, (caddr_t)&dpart, FREAD, p) == 0 && + dpart.part->p_fstype == FS_BSDFFS && + dpart.part->p_frag != 0 && dpart.part->p_fsize != 0) + bsize = dpart.part->p_frag * dpart.part->p_fsize; + bscale = bsize / DEV_BSIZE; + do { + bn = (uio->uio_offset / DEV_BSIZE) &~ (bscale - 1); + on = uio->uio_offset % bsize; + n = min((unsigned)(bsize - on), uio->uio_resid); + if (vp->v_lastr + bscale == bn) { + nextbn = bn + bscale; + error = breadn(vp, bn, (int)bsize, &nextbn, + (int *)&bsize, 1, NOCRED, &bp); + } else + error = bread(vp, bn, (int)bsize, NOCRED, &bp); + vp->v_lastr = bn; + n = min(n, bsize - bp->b_resid); + if (error) { + brelse(bp); + return (error); + } + error = uiomove((char *)bp->b_data + on, n, uio); + if (n + on == bsize) + bp->b_flags |= B_AGE; + brelse(bp); + 
} while (error == 0 && uio->uio_resid > 0 && n != 0); + return (error); + + default: + panic("spec_read type"); + } + /* NOTREACHED */ +} + +/* + * Vnode op for write + */ +/* ARGSUSED */ +spec_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct uio *uio = ap->a_uio; + struct proc *p = uio->uio_procp; + struct buf *bp; + daddr_t bn; + int bsize, blkmask; + struct partinfo dpart; + register int n, on; + int error = 0; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_WRITE) + panic("spec_write mode"); + if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) + panic("spec_write proc"); +#endif + + switch (vp->v_type) { + + case VCHR: + VOP_UNLOCK(vp); + error = (*cdevsw[major(vp->v_rdev)].d_write) + (vp->v_rdev, uio, ap->a_ioflag); + VOP_LOCK(vp); + return (error); + + case VBLK: + if (uio->uio_resid == 0) + return (0); + if (uio->uio_offset < 0) + return (EINVAL); + bsize = BLKDEV_IOSIZE; + if ((*bdevsw[major(vp->v_rdev)].d_ioctl)(vp->v_rdev, DIOCGPART, + (caddr_t)&dpart, FREAD, p) == 0) { + if (dpart.part->p_fstype == FS_BSDFFS && + dpart.part->p_frag != 0 && dpart.part->p_fsize != 0) + bsize = dpart.part->p_frag * + dpart.part->p_fsize; + } + blkmask = (bsize / DEV_BSIZE) - 1; + do { + bn = (uio->uio_offset / DEV_BSIZE) &~ blkmask; + on = uio->uio_offset % bsize; + n = min((unsigned)(bsize - on), uio->uio_resid); + if (n == bsize) + bp = getblk(vp, bn, bsize, 0, 0); + else + error = bread(vp, bn, bsize, NOCRED, &bp); + n = min(n, bsize - bp->b_resid); + if (error) { + brelse(bp); + return (error); + } + error = uiomove((char *)bp->b_data + on, n, uio); + if (n + on == bsize) { + bp->b_flags |= B_AGE; + bawrite(bp); + } else + bdwrite(bp); + } while (error == 0 && uio->uio_resid > 0 && n != 0); + return (error); + + default: + panic("spec_write type"); + } + /* NOTREACHED */ +} + +/* + * Device ioctl operation. 
+ */ +/* ARGSUSED */ +spec_ioctl(ap) + struct vop_ioctl_args /* { + struct vnode *a_vp; + int a_command; + caddr_t a_data; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + dev_t dev = ap->a_vp->v_rdev; + + switch (ap->a_vp->v_type) { + + case VCHR: + return ((*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, + ap->a_fflag, ap->a_p)); + + case VBLK: + if (ap->a_command == 0 && (int)ap->a_data == B_TAPE) + if (bdevsw[major(dev)].d_flags & B_TAPE) + return (0); + else + return (1); + return ((*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, + ap->a_fflag, ap->a_p)); + + default: + panic("spec_ioctl"); + /* NOTREACHED */ + } +} + +/* ARGSUSED */ +spec_select(ap) + struct vop_select_args /* { + struct vnode *a_vp; + int a_which; + int a_fflags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register dev_t dev; + + switch (ap->a_vp->v_type) { + + default: + return (1); /* XXX */ + + case VCHR: + dev = ap->a_vp->v_rdev; + return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_p); + } +} +/* + * Synch buffers associated with a block device + */ +/* ARGSUSED */ +int +spec_fsync(ap) + struct vop_fsync_args /* { + struct vnode *a_vp; + struct ucred *a_cred; + int a_waitfor; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct buf *bp; + struct buf *nbp; + int s; + + if (vp->v_type == VCHR) + return (0); + /* + * Flush all dirty buffers associated with a block device. 
+ */ +loop: + s = splbio(); + for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { + nbp = bp->b_vnbufs.le_next; + if ((bp->b_flags & B_BUSY)) + continue; + if ((bp->b_flags & B_DELWRI) == 0) + panic("spec_fsync: not dirty"); + bremfree(bp); + bp->b_flags |= B_BUSY; + splx(s); + bawrite(bp); + goto loop; + } + if (ap->a_waitfor == MNT_WAIT) { + while (vp->v_numoutput) { + vp->v_flag |= VBWAIT; + sleep((caddr_t)&vp->v_numoutput, PRIBIO + 1); + } +#ifdef DIAGNOSTIC + if (vp->v_dirtyblkhd.lh_first) { + vprint("spec_fsync: dirty", vp); + goto loop; + } +#endif + } + splx(s); + return (0); +} + +/* + * Just call the device strategy routine + */ +spec_strategy(ap) + struct vop_strategy_args /* { + struct buf *a_bp; + } */ *ap; +{ + + (*bdevsw[major(ap->a_bp->b_dev)].d_strategy)(ap->a_bp); + return (0); +} + +/* + * This is a noop, simply returning what one has been given. + */ +spec_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + } */ *ap; +{ + + if (ap->a_vpp != NULL) + *ap->a_vpp = ap->a_vp; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn; + return (0); +} + +/* + * At the moment we do not do any locking. + */ +/* ARGSUSED */ +spec_lock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + return (0); +} + +/* ARGSUSED */ +spec_unlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + return (0); +} + +/* + * Device close routine + */ +/* ARGSUSED */ +spec_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + dev_t dev = vp->v_rdev; + int (*devclose) __P((dev_t, int, int, struct proc *)); + int mode, error; + + switch (vp->v_type) { + + case VCHR: + /* + * Hack: a tty device that is a controlling terminal + * has a reference from the session structure. 
+ * We cannot easily tell that a character device is + * a controlling terminal, unless it is the closing + * process' controlling terminal. In that case, + * if the reference count is 2 (this last descriptor + * plus the session), release the reference from the session. + */ + if (vcount(vp) == 2 && ap->a_p && + vp == ap->a_p->p_session->s_ttyvp) { + vrele(vp); + ap->a_p->p_session->s_ttyvp = NULL; + } + /* + * If the vnode is locked, then we are in the midst + * of forcably closing the device, otherwise we only + * close on last reference. + */ + if (vcount(vp) > 1 && (vp->v_flag & VXLOCK) == 0) + return (0); + devclose = cdevsw[major(dev)].d_close; + mode = S_IFCHR; + break; + + case VBLK: + /* + * On last close of a block device (that isn't mounted) + * we must invalidate any in core blocks, so that + * we can, for instance, change floppy disks. + */ + if (error = vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 0, 0)) + return (error); + /* + * We do not want to really close the device if it + * is still in use unless we are trying to close it + * forcibly. Since every use (buffer, vnode, swap, cmap) + * holds a reference to the vnode, and because we mark + * any other vnodes that alias this device, when the + * sum of the reference counts on all the aliased + * vnodes descends to one, we are on last close. + */ + if (vcount(vp) > 1 && (vp->v_flag & VXLOCK) == 0) + return (0); + devclose = bdevsw[major(dev)].d_close; + mode = S_IFBLK; + break; + + default: + panic("spec_close: not special"); + } + + return ((*devclose)(dev, ap->a_fflag, mode, ap->a_p)); +} + +/* + * Print out the contents of a special device vnode. + */ +spec_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + printf("tag VT_NON, dev %d, %d\n", major(ap->a_vp->v_rdev), + minor(ap->a_vp->v_rdev)); +} + +/* + * Return POSIX pathconf information applicable to special devices. 
+ */ +spec_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = LINK_MAX; + return (0); + case _PC_MAX_CANON: + *ap->a_retval = MAX_CANON; + return (0); + case _PC_MAX_INPUT: + *ap->a_retval = MAX_INPUT; + return (0); + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + return (0); + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + return (0); + case _PC_VDISABLE: + *ap->a_retval = _POSIX_VDISABLE; + return (0); + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Special device advisory byte-level locks. + */ +/* ARGSUSED */ +spec_advlock(ap) + struct vop_advlock_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap; +{ + + return (EOPNOTSUPP); +} + +/* + * Special device failed operation + */ +spec_ebadf() +{ + + return (EBADF); +} + +/* + * Special device bad operation + */ +spec_badop() +{ + + panic("spec_badop called"); + /* NOTREACHED */ +} diff --git a/sys/fs/umapfs/umap.h b/sys/fs/umapfs/umap.h new file mode 100644 index 00000000000..9f4d1e7ace5 --- /dev/null +++ b/sys/fs/umapfs/umap.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * the UCLA Ficus project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)umap.h 8.3 (Berkeley) 1/21/94 + * + * @(#)null_vnops.c 1.5 (Berkeley) 7/10/92 + */ + +#define MAPFILEENTRIES 64 +#define GMAPFILEENTRIES 16 +#define NOBODY 32767 +#define NULLGROUP 65534 + +struct umap_args { + char *target; /* Target of loopback */ + int nentries; /* # of entries in user map array */ + int gnentries; /* # of entries in group map array */ + u_long (*mapdata)[2]; /* pointer to array of user mappings */ + u_long (*gmapdata)[2]; /* pointer to array of group mappings */ +}; + +struct umap_mount { + struct mount *umapm_vfs; + struct vnode *umapm_rootvp; /* Reference to root umap_node */ + int info_nentries; /* number of uid mappings */ + int info_gnentries; /* number of gid mappings */ + u_long info_mapdata[MAPFILEENTRIES][2]; /* mapping data for + user mapping in ficus */ + u_long info_gmapdata[GMAPFILEENTRIES][2]; /*mapping data for + group mapping in ficus */ +}; + +#ifdef KERNEL +/* + * A cache of vnode references + */ +struct umap_node { + struct umap_node *umap_forw; /* Hash chain */ + struct umap_node *umap_back; + struct vnode *umap_lowervp; /* Aliased vnode - VREFed once */ + struct vnode *umap_vnode; /* Back pointer to vnode/umap_node */ +}; + +extern int umap_node_create __P((struct mount *mp, struct vnode *target, struct vnode **vpp)); +extern u_long umap_reverse_findid __P((u_long id, u_long map[][2], int nentries)); +extern void umap_mapids __P((struct mount *v_mount, struct ucred *credp)); + +#define MOUNTTOUMAPMOUNT(mp) ((struct umap_mount *)((mp)->mnt_data)) +#define VTOUMAP(vp) ((struct umap_node *)(vp)->v_data) +#define UMAPTOV(xp) ((xp)->umap_vnode) +#ifdef UMAPFS_DIAGNOSTIC +extern struct vnode *umap_checkvp __P((struct vnode *vp, char *fil, int lno)); +#define UMAPVPTOLOWERVP(vp) umap_checkvp((vp), __FILE__, __LINE__) +#else +#define UMAPVPTOLOWERVP(vp) (VTOUMAP(vp)->umap_lowervp) +#endif + +extern int (**umap_vnodeop_p)(); +extern struct vfsops umap_vfsops; +#endif /* KERNEL */ diff --git a/sys/fs/umapfs/umap_subr.c 
b/sys/fs/umapfs/umap_subr.c new file mode 100644 index 00000000000..6f1f077a621 --- /dev/null +++ b/sys/fs/umapfs/umap_subr.c @@ -0,0 +1,397 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)umap_subr.c 8.6 (Berkeley) 1/26/94 + * + * $Id: lofs_subr.c, v 1.11 1992/05/30 10:05:43 jsp Exp jsp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LOG2_SIZEVNODE 7 /* log2(sizeof struct vnode) */ +#define NUMAPNODECACHE 16 +#define UMAP_NHASH(vp) ((((u_long) vp)>>LOG2_SIZEVNODE) & (NUMAPNODECACHE-1)) + +/* + * Null layer cache: + * Each cache entry holds a reference to the target vnode + * along with a pointer to the alias vnode. When an + * entry is added the target vnode is VREF'd. When the + * alias is removed the target vnode is vrele'd. + */ + +/* + * Cache head + */ +struct umap_node_cache { + struct umap_node *ac_forw; + struct umap_node *ac_back; +}; + +static struct umap_node_cache umap_node_cache[NUMAPNODECACHE]; + +/* + * Initialise cache headers + */ +umapfs_init() +{ + struct umap_node_cache *ac; +#ifdef UMAPFS_DIAGNOSTIC + printf("umapfs_init\n"); /* printed during system boot */ +#endif + + for (ac = umap_node_cache; ac < umap_node_cache + NUMAPNODECACHE; ac++) + ac->ac_forw = ac->ac_back = (struct umap_node *) ac; +} + +/* + * Compute hash list for given target vnode + */ +static struct umap_node_cache * +umap_node_hash(targetvp) + struct vnode *targetvp; +{ + + return (&umap_node_cache[UMAP_NHASH(targetvp)]); +} + +/* + * umap_findid is called by various routines in umap_vnodeops.c to + * find a user or group id in a map. 
+ */ +static u_long +umap_findid(id, map, nentries) + u_long id; + u_long map[][2]; + int nentries; +{ + int i; + + /* Find uid entry in map */ + i = 0; + while ((iac_forw; a != (struct umap_node *) hd; a = a->umap_forw) { + if (a->umap_lowervp == targetvp && + a->umap_vnode->v_mount == mp) { + vp = UMAPTOV(a); + /* + * We need vget for the VXLOCK + * stuff, but we don't want to lock + * the lower node. + */ + if (vget(vp, 0)) { +#ifdef UMAPFS_DIAGNOSTIC + printf ("umap_node_find: vget failed.\n"); +#endif + goto loop; + } + return (vp); + } + } + +#ifdef UMAPFS_DIAGNOSTIC + printf("umap_node_find(%x, %x): NOT found\n", mp, targetvp); +#endif + + return (0); +} + +/* + * Make a new umap_node node. + * Vp is the alias vnode, lofsvp is the target vnode. + * Maintain a reference to (targetvp). + */ +static int +umap_node_alloc(mp, lowervp, vpp) + struct mount *mp; + struct vnode *lowervp; + struct vnode **vpp; +{ + struct umap_node_cache *hd; + struct umap_node *xp; + struct vnode *othervp, *vp; + int error; + + if (error = getnewvnode(VT_UMAP, mp, umap_vnodeop_p, vpp)) + return (error); + vp = *vpp; + + MALLOC(xp, struct umap_node *, sizeof(struct umap_node), + M_TEMP, M_WAITOK); + vp->v_type = lowervp->v_type; + xp->umap_vnode = vp; + vp->v_data = xp; + xp->umap_lowervp = lowervp; + /* + * Before we insert our new node onto the hash chains, + * check to see if someone else has beaten us to it. + * (We could have slept in MALLOC.) + */ + if (othervp = umap_node_find(lowervp)) { + FREE(xp, M_TEMP); + vp->v_type = VBAD; /* node is discarded */ + vp->v_usecount = 0; /* XXX */ + *vpp = othervp; + return (0); + } + VREF(lowervp); /* Extra VREF will be vrele'd in umap_node_create */ + hd = umap_node_hash(lowervp); + insque(xp, hd); + return (0); +} + + +/* + * Try to find an existing umap_node vnode refering + * to it, otherwise make a new umap_node vnode which + * contains a reference to the target vnode. 
+ */ +int +umap_node_create(mp, targetvp, newvpp) + struct mount *mp; + struct vnode *targetvp; + struct vnode **newvpp; +{ + struct vnode *aliasvp; + + if (aliasvp = umap_node_find(mp, targetvp)) { + /* + * Take another reference to the alias vnode + */ +#ifdef UMAPFS_DIAGNOSTIC + vprint("umap_node_create: exists", ap->umap_vnode); +#endif + /* VREF(aliasvp); */ + } else { + int error; + + /* + * Get new vnode. + */ +#ifdef UMAPFS_DIAGNOSTIC + printf("umap_node_create: create new alias vnode\n"); +#endif + /* + * Make new vnode reference the umap_node. + */ + if (error = umap_node_alloc(mp, targetvp, &aliasvp)) + return (error); + + /* + * aliasvp is already VREF'd by getnewvnode() + */ + } + + vrele(targetvp); + +#ifdef UMAPFS_DIAGNOSTIC + vprint("umap_node_create: alias", aliasvp); + vprint("umap_node_create: target", targetvp); +#endif + + *newvpp = aliasvp; + return (0); +} + +#ifdef UMAPFS_DIAGNOSTIC +int umap_checkvp_barrier = 1; +struct vnode * +umap_checkvp(vp, fil, lno) + struct vnode *vp; + char *fil; + int lno; +{ + struct umap_node *a = VTOUMAP(vp); +#if 0 + /* + * Can't do this check because vop_reclaim runs + * with funny vop vector. 
+ */ + if (vp->v_op != umap_vnodeop_p) { + printf ("umap_checkvp: on non-umap-node\n"); + while (umap_checkvp_barrier) /*WAIT*/ ; + panic("umap_checkvp"); + } +#endif + if (a->umap_lowervp == NULL) { + /* Should never happen */ + int i; u_long *p; + printf("vp = %x, ZERO ptr\n", vp); + for (p = (u_long *) a, i = 0; i < 8; i++) + printf(" %x", p[i]); + printf("\n"); + /* wait for debugger */ + while (umap_checkvp_barrier) /*WAIT*/ ; + panic("umap_checkvp"); + } + if (a->umap_lowervp->v_usecount < 1) { + int i; u_long *p; + printf("vp = %x, unref'ed lowervp\n", vp); + for (p = (u_long *) a, i = 0; i < 8; i++) + printf(" %x", p[i]); + printf("\n"); + /* wait for debugger */ + while (umap_checkvp_barrier) /*WAIT*/ ; + panic ("umap with unref'ed lowervp"); + } +#if 0 + printf("umap %x/%d -> %x/%d [%s, %d]\n", + a->umap_vnode, a->umap_vnode->v_usecount, + a->umap_lowervp, a->umap_lowervp->v_usecount, + fil, lno); +#endif + return (a->umap_lowervp); +} +#endif + +/* umap_mapids maps all of the ids in a credential, both user and group. */ + +void +umap_mapids(v_mount, credp) + struct mount *v_mount; + struct ucred *credp; +{ + int i, unentries, gnentries; + u_long *groupmap, *usermap; + uid_t uid; + gid_t gid; + + unentries = MOUNTTOUMAPMOUNT(v_mount)->info_nentries; + usermap = &(MOUNTTOUMAPMOUNT(v_mount)->info_mapdata[0][0]); + gnentries = MOUNTTOUMAPMOUNT(v_mount)->info_gnentries; + groupmap = &(MOUNTTOUMAPMOUNT(v_mount)->info_gmapdata[0][0]); + + /* Find uid entry in map */ + + uid = (uid_t) umap_findid(credp->cr_uid, usermap, unentries); + + if (uid != -1) + credp->cr_uid = uid; + else + credp->cr_uid = (uid_t) NOBODY; + +#ifdef notdef + /* cr_gid is the same as cr_groups[0] in 4BSD */ + + /* Find gid entry in map */ + + gid = (gid_t) umap_findid(credp->cr_gid, groupmap, gnentries); + + if (gid != -1) + credp->cr_gid = gid; + else + credp->cr_gid = NULLGROUP; +#endif + + /* Now we must map each of the set of groups in the cr_groups + structure. 
*/ + + i = 0; + while (credp->cr_groups[i] != 0) { + gid = (gid_t) umap_findid(credp->cr_groups[i], + groupmap, gnentries); + + if (gid != -1) + credp->cr_groups[i++] = gid; + else + credp->cr_groups[i++] = NULLGROUP; + } +} diff --git a/sys/fs/umapfs/umap_vfsops.c b/sys/fs/umapfs/umap_vfsops.c new file mode 100644 index 00000000000..2480a85e440 --- /dev/null +++ b/sys/fs/umapfs/umap_vfsops.c @@ -0,0 +1,407 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * the UCLA Ficus project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)umap_vfsops.c 8.3 (Berkeley) 1/21/94 + * + * @(#)null_vfsops.c 1.5 (Berkeley) 7/10/92 + */ + +/* + * Umap Layer + * (See mount_umap(8) for a description of this layer.) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Mount umap layer + */ +int +umapfs_mount(mp, path, data, ndp, p) + struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + struct umap_args args; + struct vnode *lowerrootvp, *vp; + struct vnode *umapm_rootvp; + struct umap_mount *amp; + u_int size; + int error; + +#ifdef UMAPFS_DIAGNOSTIC + printf("umapfs_mount(mp = %x)\n", mp); +#endif + + /* + * Update is a no-op + */ + if (mp->mnt_flag & MNT_UPDATE) { + return (EOPNOTSUPP); + /* return (VFS_MOUNT(MOUNTTOUMAPMOUNT(mp)->umapm_vfs, path, data, ndp, p));*/ + } + + /* + * Get argument + */ + if (error = copyin(data, (caddr_t)&args, sizeof(struct umap_args))) + return (error); + + /* + * Find lower node + */ + NDINIT(ndp, LOOKUP, FOLLOW|WANTPARENT|LOCKLEAF, + UIO_USERSPACE, args.target, p); + if (error = namei(ndp)) + return (error); + + /* + * Sanity check on lower vnode + */ + lowerrootvp = ndp->ni_vp; +#ifdef UMAPFS_DIAGNOSTIC + printf("vp = %x, check for VDIR...\n", lowerrootvp); +#endif + vrele(ndp->ni_dvp); + ndp->ni_dvp = 0; + + if (lowerrootvp->v_type != VDIR) { + vput(lowerrootvp); + return (EINVAL); + } + +#ifdef UMAPFS_DIAGNOSTIC + printf("mp = %x\n", mp); 
+#endif + + amp = (struct umap_mount *) malloc(sizeof(struct umap_mount), + M_UFSMNT, M_WAITOK); /* XXX */ + + /* + * Save reference to underlying FS + */ + amp->umapm_vfs = lowerrootvp->v_mount; + + /* + * Now copy in the number of entries and maps for umap mapping. + */ + amp->info_nentries = args.nentries; + amp->info_gnentries = args.gnentries; + error = copyin(args.mapdata, (caddr_t)amp->info_mapdata, + 2*sizeof(u_long)*args.nentries); + if (error) + return (error); + +#ifdef UMAP_DIAGNOSTIC + printf("umap_mount:nentries %d\n",args.nentries); + for (i = 0; i < args.nentries; i++) + printf(" %d maps to %d\n", amp->info_mapdata[i][0], + amp->info_mapdata[i][1]); +#endif + + error = copyin(args.gmapdata, (caddr_t)amp->info_gmapdata, + 2*sizeof(u_long)*args.nentries); + if (error) + return (error); + +#ifdef UMAP_DIAGNOSTIC + printf("umap_mount:gnentries %d\n",args.gnentries); + for (i = 0; i < args.gnentries; i++) + printf(" group %d maps to %d\n", + amp->info_gmapdata[i][0], + amp->info_gmapdata[i][1]); +#endif + + + /* + * Save reference. Each mount also holds + * a reference on the root vnode. + */ + error = umap_node_create(mp, lowerrootvp, &vp); + /* + * Unlock the node (either the lower or the alias) + */ + VOP_UNLOCK(vp); + /* + * Make sure the node alias worked + */ + if (error) { + vrele(lowerrootvp); + free(amp, M_UFSMNT); /* XXX */ + return (error); + } + + /* + * Keep a held reference to the root vnode. + * It is vrele'd in umapfs_unmount. 
+ */ + umapm_rootvp = vp; + umapm_rootvp->v_flag |= VROOT; + amp->umapm_rootvp = umapm_rootvp; + if (UMAPVPTOLOWERVP(umapm_rootvp)->v_mount->mnt_flag & MNT_LOCAL) + mp->mnt_flag |= MNT_LOCAL; + mp->mnt_data = (qaddr_t) amp; + getnewfsid(mp, MOUNT_LOFS); + + (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); + bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); + (void) copyinstr(args.target, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); +#ifdef UMAPFS_DIAGNOSTIC + printf("umapfs_mount: lower %s, alias at %s\n", + mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname); +#endif + return (0); +} + +/* + * VFS start. Nothing needed here - the start routine + * on the underlying filesystem will have been called + * when that filesystem was mounted. + */ +int +umapfs_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + return (0); + /* return (VFS_START(MOUNTTOUMAPMOUNT(mp)->umapm_vfs, flags, p)); */ +} + +/* + * Free reference to umap layer + */ +int +umapfs_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + struct vnode *umapm_rootvp = MOUNTTOUMAPMOUNT(mp)->umapm_rootvp; + int error; + int flags = 0; + extern int doforce; + +#ifdef UMAPFS_DIAGNOSTIC + printf("umapfs_unmount(mp = %x)\n", mp); +#endif + + if (mntflags & MNT_FORCE) { + /* lofs can never be rootfs so don't check for it */ + if (!doforce) + return (EINVAL); + flags |= FORCECLOSE; + } + + /* + * Clear out buffer cache. I don't think we + * ever get anything cached at this level at the + * moment, but who knows... 
+ */ +#ifdef notyet + mntflushbuf(mp, 0); + if (mntinvalbuf(mp, 1)) + return (EBUSY); +#endif + if (umapm_rootvp->v_usecount > 1) + return (EBUSY); + if (error = vflush(mp, umapm_rootvp, flags)) + return (error); + +#ifdef UMAPFS_DIAGNOSTIC + vprint("alias root of lower", umapm_rootvp); +#endif + /* + * Release reference on underlying root vnode + */ + vrele(umapm_rootvp); + /* + * And blow it away for future re-use + */ + vgone(umapm_rootvp); + /* + * Finally, throw away the umap_mount structure + */ + free(mp->mnt_data, M_UFSMNT); /* XXX */ + mp->mnt_data = 0; + return (0); +} + +int +umapfs_root(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + struct vnode *vp; + +#ifdef UMAPFS_DIAGNOSTIC + printf("umapfs_root(mp = %x, vp = %x->%x)\n", mp, + MOUNTTOUMAPMOUNT(mp)->umapm_rootvp, + UMAPVPTOLOWERVP(MOUNTTOUMAPMOUNT(mp)->umapm_rootvp) + ); +#endif + + /* + * Return locked reference to root. + */ + vp = MOUNTTOUMAPMOUNT(mp)->umapm_rootvp; + VREF(vp); + VOP_LOCK(vp); + *vpp = vp; + return (0); +} + +int +umapfs_quotactl(mp, cmd, uid, arg, p) + struct mount *mp; + int cmd; + uid_t uid; + caddr_t arg; + struct proc *p; +{ + return (VFS_QUOTACTL(MOUNTTOUMAPMOUNT(mp)->umapm_vfs, cmd, uid, arg, p)); +} + +int +umapfs_statfs(mp, sbp, p) + struct mount *mp; + struct statfs *sbp; + struct proc *p; +{ + int error; + struct statfs mstat; + +#ifdef UMAPFS_DIAGNOSTIC + printf("umapfs_statfs(mp = %x, vp = %x->%x)\n", mp, + MOUNTTOUMAPMOUNT(mp)->umapm_rootvp, + UMAPVPTOLOWERVP(MOUNTTOUMAPMOUNT(mp)->umapm_rootvp) + ); +#endif + + bzero(&mstat, sizeof(mstat)); + + error = VFS_STATFS(MOUNTTOUMAPMOUNT(mp)->umapm_vfs, &mstat, p); + if (error) + return (error); + + /* now copy across the "interesting" information and fake the rest */ + sbp->f_type = mstat.f_type; + sbp->f_flags = mstat.f_flags; + sbp->f_bsize = mstat.f_bsize; + sbp->f_iosize = mstat.f_iosize; + sbp->f_blocks = mstat.f_blocks; + sbp->f_bfree = mstat.f_bfree; + sbp->f_bavail = mstat.f_bavail; + sbp->f_files = 
mstat.f_files; + sbp->f_ffree = mstat.f_ffree; + if (sbp != &mp->mnt_stat) { + bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); + bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); + bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); + } + return (0); +} + +int +umapfs_sync(mp, waitfor, cred, p) + struct mount *mp; + int waitfor; + struct ucred *cred; + struct proc *p; +{ + /* + * XXX - Assumes no data cached at umap layer. + */ + return (0); +} + +int +umapfs_vget(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + + return (VFS_VGET(MOUNTTOUMAPMOUNT(mp)->umapm_vfs, ino, vpp)); +} + +int +umapfs_fhtovp(mp, fidp, nam, vpp, exflagsp, credanonp) + struct mount *mp; + struct fid *fidp; + struct mbuf *nam; + struct vnode **vpp; + int *exflagsp; + struct ucred**credanonp; +{ + + return (VFS_FHTOVP(MOUNTTOUMAPMOUNT(mp)->umapm_vfs, fidp, nam, vpp, exflagsp,credanonp)); +} + +int +umapfs_vptofh(vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + return (VFS_VPTOFH(UMAPVPTOLOWERVP(vp), fhp)); +} + +int umapfs_init __P((void)); + +struct vfsops umap_vfsops = { + umapfs_mount, + umapfs_start, + umapfs_unmount, + umapfs_root, + umapfs_quotactl, + umapfs_statfs, + umapfs_sync, + umapfs_vget, + umapfs_fhtovp, + umapfs_vptofh, + umapfs_init, +}; diff --git a/sys/fs/umapfs/umap_vnops.c b/sys/fs/umapfs/umap_vnops.c new file mode 100644 index 00000000000..287804e1561 --- /dev/null +++ b/sys/fs/umapfs/umap_vnops.c @@ -0,0 +1,488 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * the UCLA Ficus project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)umap_vnops.c 8.3 (Berkeley) 1/5/94 + */ + +/* + * Umap Layer + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +int umap_bug_bypass = 0; /* for debugging: enables bypass printf'ing */ + +/* + * This is the 10-Apr-92 bypass routine. + * See null_vnops.c:null_bypass for more details. 
+ */ +int +umap_bypass(ap) + struct vop_generic_args /* { + struct vnodeop_desc *a_desc; + + } */ *ap; +{ + extern int (**umap_vnodeop_p)(); /* not extern, really "forward" */ + struct ucred **credpp = 0, *credp = 0; + struct ucred *savecredp, *savecompcredp = 0; + struct ucred *compcredp = 0; + struct vnode **this_vp_p; + int error; + struct vnode *old_vps[VDESC_MAX_VPS]; + struct vnode *vp1 = 0; + struct vnode **vps_p[VDESC_MAX_VPS]; + struct vnode ***vppp; + struct vnodeop_desc *descp = ap->a_desc; + int reles, i; + struct componentname **compnamepp = 0; + + if (umap_bug_bypass) + printf ("umap_bypass: %s\n", descp->vdesc_name); + +#ifdef SAFETY + /* + * We require at least one vp. + */ + if (descp->vdesc_vp_offsets == NULL || + descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET) + panic ("umap_bypass: no vp's in map.\n"); +#endif + + /* + * Map the vnodes going in. + * Later, we'll invoke the operation based on + * the first mapped vnode's operation vector. + */ + reles = descp->vdesc_flags; + for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { + if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) + break; /* bail out at end of list */ + vps_p[i] = this_vp_p = + VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[i], ap); + + if (i == 0) { + vp1 = *vps_p[0]; + } + + /* + * We're not guaranteed that any but the first vnode + * are of our type. Check for and don't map any + * that aren't. (Must map first vp or vclean fails.) + */ + + if (i && (*this_vp_p)->v_op != umap_vnodeop_p) { + old_vps[i] = NULL; + } else { + old_vps[i] = *this_vp_p; + *(vps_p[i]) = UMAPVPTOLOWERVP(*this_vp_p); + if (reles & 1) + VREF(*this_vp_p); + } + + } + + /* + * Fix the credentials. (That's the purpose of this layer.) 
+ */ + + if (descp->vdesc_cred_offset != VDESC_NO_OFFSET) { + + credpp = VOPARG_OFFSETTO(struct ucred**, + descp->vdesc_cred_offset, ap); + + /* Save old values */ + + savecredp = (*credpp); + (*credpp) = crdup(savecredp); + credp = *credpp; + + if (umap_bug_bypass && credp->cr_uid != 0) + printf("umap_bypass: user was %d, group %d\n", + credp->cr_uid, credp->cr_gid); + + /* Map all ids in the credential structure. */ + + umap_mapids(vp1->v_mount, credp); + + if (umap_bug_bypass && credp->cr_uid != 0) + printf("umap_bypass: user now %d, group %d\n", + credp->cr_uid, credp->cr_gid); + } + + /* BSD often keeps a credential in the componentname structure + * for speed. If there is one, it better get mapped, too. + */ + + if (descp->vdesc_componentname_offset != VDESC_NO_OFFSET) { + + compnamepp = VOPARG_OFFSETTO(struct componentname**, + descp->vdesc_componentname_offset, ap); + + compcredp = (*compnamepp)->cn_cred; + savecompcredp = compcredp; + compcredp = (*compnamepp)->cn_cred = crdup(savecompcredp); + + if (umap_bug_bypass && compcredp->cr_uid != 0) + printf("umap_bypass: component credit user was %d, group %d\n", + compcredp->cr_uid, compcredp->cr_gid); + + /* Map all ids in the credential structure. */ + + umap_mapids(vp1->v_mount, compcredp); + + if (umap_bug_bypass && compcredp->cr_uid != 0) + printf("umap_bypass: component credit user now %d, group %d\n", + compcredp->cr_uid, compcredp->cr_gid); + } + + /* + * Call the operation on the lower layer + * with the modified argument structure. + */ + error = VCALL(*(vps_p[0]), descp->vdesc_offset, ap); + + /* + * Maintain the illusion of call-by-value + * by restoring vnodes in the argument structure + * to their original value. 
+ */ + reles = descp->vdesc_flags; + for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { + if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) + break; /* bail out at end of list */ + if (old_vps[i]) { + *(vps_p[i]) = old_vps[i]; + if (reles & 1) + vrele(*(vps_p[i])); + }; + }; + + /* + * Map the possible out-going vpp + * (Assumes that the lower layer always returns + * a VREF'ed vpp unless it gets an error.) + */ + if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET && + !(descp->vdesc_flags & VDESC_NOMAP_VPP) && + !error) { + if (descp->vdesc_flags & VDESC_VPP_WILLRELE) + goto out; + vppp = VOPARG_OFFSETTO(struct vnode***, + descp->vdesc_vpp_offset, ap); + error = umap_node_create(old_vps[0]->v_mount, **vppp, *vppp); + }; + + out: + /* + * Free duplicate cred structure and restore old one. + */ + if (descp->vdesc_cred_offset != VDESC_NO_OFFSET) { + if (umap_bug_bypass && credp && credp->cr_uid != 0) + printf("umap_bypass: returning-user was %d\n", + credp->cr_uid); + + crfree(credp); + (*credpp) = savecredp; + if (umap_bug_bypass && credpp && (*credpp)->cr_uid != 0) + printf("umap_bypass: returning-user now %d\n\n", + (*credpp)->cr_uid); + } + + if (descp->vdesc_componentname_offset != VDESC_NO_OFFSET) { + if (umap_bug_bypass && compcredp && compcredp->cr_uid != 0) + printf("umap_bypass: returning-component-user was %d\n", + compcredp->cr_uid); + + crfree(compcredp); + (*compnamepp)->cn_cred = savecompcredp; + if (umap_bug_bypass && credpp && (*credpp)->cr_uid != 0) + printf("umap_bypass: returning-component-user now %d\n", + compcredp->cr_uid); + } + + return (error); +} + + +/* + * We handle getattr to change the fsid. 
+ */ +int +umap_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + short uid, gid; + int error, tmpid, nentries, gnentries; + u_long (*mapdata)[2], (*gmapdata)[2]; + struct vnode **vp1p; + struct vnodeop_desc *descp = ap->a_desc; + + if (error = umap_bypass(ap)) + return (error); + /* Requires that arguments be restored. */ + ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; + + /* + * Umap needs to map the uid and gid returned by a stat + * into the proper values for this site. This involves + * finding the returned uid in the mapping information, + * translating it into the uid on the other end, + * and filling in the proper field in the vattr + * structure pointed to by ap->a_vap. The group + * is easier, since currently all groups will be + * translate to the NULLGROUP. + */ + + /* Find entry in map */ + + uid = ap->a_vap->va_uid; + gid = ap->a_vap->va_gid; + if (umap_bug_bypass) + printf("umap_getattr: mapped uid = %d, mapped gid = %d\n", uid, + gid); + + vp1p = VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[0], ap); + nentries = MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_nentries; + mapdata = (MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_mapdata); + gnentries = MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_gnentries; + gmapdata = (MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_gmapdata); + + /* Reverse map the uid for the vnode. Since it's a reverse + map, we can't use umap_mapids() to do it. */ + + tmpid = umap_reverse_findid(uid, mapdata, nentries); + + if (tmpid != -1) { + + ap->a_vap->va_uid = (uid_t) tmpid; + if (umap_bug_bypass) + printf("umap_getattr: original uid = %d\n", uid); + } else + ap->a_vap->va_uid = (uid_t) NOBODY; + + /* Reverse map the gid for the vnode. 
*/ + + tmpid = umap_reverse_findid(gid, gmapdata, gnentries); + + if (tmpid != -1) { + + ap->a_vap->va_gid = (gid_t) tmpid; + if (umap_bug_bypass) + printf("umap_getattr: original gid = %d\n", gid); + } else + ap->a_vap->va_gid = (gid_t) NULLGROUP; + + return (0); +} + +int +umap_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap; +{ + /* + * Do nothing (and _don't_ bypass). + * Wait to vrele lowervp until reclaim, + * so that until then our umap_node is in the + * cache and reusable. + * + */ + return (0); +} + +int +umap_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct umap_node *xp = VTOUMAP(vp); + struct vnode *lowervp = xp->umap_lowervp; + + /* After this assignment, this node will not be re-used. */ + xp->umap_lowervp = NULL; + remque(xp); + FREE(vp->v_data, M_TEMP); + vp->v_data = NULL; + vrele(lowervp); + return (0); +} + +int +umap_strategy(ap) + struct vop_strategy_args /* { + struct buf *a_bp; + } */ *ap; +{ + struct buf *bp = ap->a_bp; + int error; + struct vnode *savedvp; + + savedvp = bp->b_vp; + bp->b_vp = UMAPVPTOLOWERVP(bp->b_vp); + + error = VOP_STRATEGY(ap->a_bp); + + bp->b_vp = savedvp; + + return (error); +} + +int +umap_bwrite(ap) + struct vop_bwrite_args /* { + struct buf *a_bp; + } */ *ap; +{ + struct buf *bp = ap->a_bp; + int error; + struct vnode *savedvp; + + savedvp = bp->b_vp; + bp->b_vp = UMAPVPTOLOWERVP(bp->b_vp); + + error = VOP_BWRITE(ap->a_bp); + + bp->b_vp = savedvp; + + return (error); +} + + +int +umap_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + printf("\ttag VT_UMAPFS, vp=%x, lowervp=%x\n", vp, UMAPVPTOLOWERVP(vp)); + return (0); +} + +int +umap_rename(ap) + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap; +{ + int error; 
+ struct componentname *compnamep; + struct ucred *compcredp, *savecompcredp; + struct vnode *vp; + + /* + * Rename is irregular, having two componentname structures. + * We need to map the cre in the second structure, + * and then bypass takes care of the rest. + */ + + vp = ap->a_fdvp; + compnamep = ap->a_tcnp; + compcredp = compnamep->cn_cred; + + savecompcredp = compcredp; + compcredp = compnamep->cn_cred = crdup(savecompcredp); + + if (umap_bug_bypass && compcredp->cr_uid != 0) + printf("umap_rename: rename component credit user was %d, group %d\n", + compcredp->cr_uid, compcredp->cr_gid); + + /* Map all ids in the credential structure. */ + + umap_mapids(vp->v_mount, compcredp); + + if (umap_bug_bypass && compcredp->cr_uid != 0) + printf("umap_rename: rename component credit user now %d, group %d\n", + compcredp->cr_uid, compcredp->cr_gid); + + error = umap_bypass(ap); + + /* Restore the additional mapped componentname cred structure. */ + + crfree(compcredp); + compnamep->cn_cred = savecompcredp; + + return error; +} + +/* + * Global vfs data structures + */ +/* + * XXX - strategy, bwrite are hand coded currently. They should + * go away with a merged buffer/block cache. + * + */ +int (**umap_vnodeop_p)(); +struct vnodeopv_entry_desc umap_vnodeop_entries[] = { + { &vop_default_desc, umap_bypass }, + + { &vop_getattr_desc, umap_getattr }, + { &vop_inactive_desc, umap_inactive }, + { &vop_reclaim_desc, umap_reclaim }, + { &vop_print_desc, umap_print }, + { &vop_rename_desc, umap_rename }, + + { &vop_strategy_desc, umap_strategy }, + { &vop_bwrite_desc, umap_bwrite }, + + { (struct vnodeop_desc*) NULL, (int(*)()) NULL } +}; +struct vnodeopv_desc umap_vnodeop_opv_desc = + { &umap_vnodeop_p, umap_vnodeop_entries }; diff --git a/sys/fs/unionfs/union.h b/sys/fs/unionfs/union.h new file mode 100644 index 00000000000..463218ac3ed --- /dev/null +++ b/sys/fs/unionfs/union.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 1994 The Regents of the University of California. 
+ * Copyright (c) 1994 Jan-Simon Pendry. + * All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ *	@(#)union.h	8.2 (Berkeley) 2/17/94
+ */
+
+struct union_args {
+	char		*target;	/* Target of loopback  */
+	int		mntflags;	/* Options on the mount (UNMNT_* below) */
+};
+
+#define UNMNT_ABOVE	0x0001		/* Target appears above mount point */
+#define UNMNT_BELOW	0x0002		/* Target appears below mount point */
+#define UNMNT_REPLACE	0x0003		/* Target replaces mount point */
+#define UNMNT_OPMASK	0x0003		/* Mask covering the three modes above */
+
+struct union_mount {
+	struct vnode	*um_uppervp;	/* presumably upper layer root -- confirm in union_vfsops.c */
+	struct vnode	*um_lowervp;	/* presumably lower layer root -- confirm in union_vfsops.c */
+	struct ucred	*um_cred;	/* Credentials of user calling mount */
+	int		um_cmode;	/* cmask from mount process */
+	int		um_op;		/* Operation mode */
+};
+
+#ifdef KERNEL
+
+/*
+ * UN_DIRMODE is the mode bits used to create a shadow directory:
+ * read, write and execute for owner, group and other.
+ * UN_FILEMODE is built the same way from read and write only.
+ */
+#define VRWXMODE (VREAD|VWRITE|VEXEC)
+#define VRWMODE (VREAD|VWRITE)
+#define UN_DIRMODE ((VRWXMODE)|(VRWXMODE>>3)|(VRWXMODE>>6))
+#define UN_FILEMODE ((VRWMODE)|(VRWMODE>>3)|(VRWMODE>>6))
+
+/*
+ * A cache of vnode references
+ */
+struct union_node {
+	LIST_ENTRY(union_node)	un_cache;	/* Hash chain */
+	struct vnode		*un_vnode;	/* Back pointer */
+	struct vnode	        *un_uppervp;	/* overlaying object */
+	struct vnode	        *un_lowervp;	/* underlying object */
+	struct vnode		*un_dirvp;	/* Parent dir of uppervp */
+	char			*un_path;	/* saved component name */
+	int			un_hash;	/* saved un_path hash value */
+	int			un_openl;	/* # of opens on lowervp */
+	int			un_flags;	/* UN_* bits defined below */
+#ifdef DIAGNOSTIC
+	pid_t			un_pid;		/* pid that set UN_LOCKED, or -1 */
+#endif
+};
+
+#define UN_WANT		0x01		/* Someone is sleeping until UN_LOCKED clears */
+#define UN_LOCKED	0x02		/* Node (or hash chain lock word) is locked */
+#define UN_ULOCK	0x04		/* Upper node is locked */
+#define UN_KLOCK	0x08		/* Keep upper node locked on vput */
+
+extern int union_allocvp __P((struct vnode **, struct mount *,
+				struct vnode *, struct vnode *,
+				struct componentname *, struct vnode *,
+				struct vnode *));
+extern int union_copyfile __P((struct proc *, struct ucred *,
+				struct vnode *, struct vnode *));
+extern int union_mkshadow __P((struct union_mount *, struct vnode *,
+				struct componentname *, struct vnode **));
+extern int union_vn_create __P((struct vnode **, struct union_node *,
+				struct proc *));
+extern int union_cn_close __P((struct vnode *, int, struct ucred *,
+				struct proc *));
+extern void union_removed_upper __P((struct union_node *un));
+extern struct vnode *union_lowervp __P((struct vnode *));
+extern void union_newlower __P((struct union_node *, struct vnode *));
+extern void union_newupper __P((struct union_node *, struct vnode *));
+
+#define	MOUNTTOUNIONMOUNT(mp) ((struct union_mount *)((mp)->mnt_data))
+#define	VTOUNION(vp) ((struct union_node *)(vp)->v_data)
+#define	UNIONTOV(un) ((un)->un_vnode)
+#define	LOWERVP(vp) (VTOUNION(vp)->un_lowervp)
+#define	UPPERVP(vp) (VTOUNION(vp)->un_uppervp)
+#define OTHERVP(vp) (UPPERVP(vp) ? UPPERVP(vp) : LOWERVP(vp))	/* upper if present, else lower */
+
+extern int (**union_vnodeop_p)();
+extern struct vfsops union_vfsops;
+#endif /* KERNEL */
diff --git a/sys/fs/unionfs/union_subr.c b/sys/fs/unionfs/union_subr.c
new file mode 100644
index 00000000000..77947d1dfbe
--- /dev/null
+++ b/sys/fs/unionfs/union_subr.c
@@ -0,0 +1,744 @@
+/*
+ * Copyright (c) 1994 Jan-Simon Pendry
+ * Copyright (c) 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Jan-Simon Pendry.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4.
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)union_subr.c 8.4 (Berkeley) 2/17/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DIAGNOSTIC +#include +#endif + +/* must be power of two, otherwise change UNION_HASH() */ +#define NHASH 32 + +/* unsigned int ... 
*/ +#define UNION_HASH(u, l) \ + (((((unsigned long) (u)) + ((unsigned long) l)) >> 8) & (NHASH-1)) + +static LIST_HEAD(unhead, union_node) unhead[NHASH]; +static int unvplock[NHASH]; + +int +union_init() +{ + int i; + + for (i = 0; i < NHASH; i++) + LIST_INIT(&unhead[i]); + bzero((caddr_t) unvplock, sizeof(unvplock)); +} + +static int +union_list_lock(ix) + int ix; +{ + + if (unvplock[ix] & UN_LOCKED) { + unvplock[ix] |= UN_WANT; + sleep((caddr_t) &unvplock[ix], PINOD); + return (1); + } + + unvplock[ix] |= UN_LOCKED; + + return (0); +} + +static void +union_list_unlock(ix) + int ix; +{ + + unvplock[ix] &= ~UN_LOCKED; + + if (unvplock[ix] & UN_WANT) { + unvplock[ix] &= ~UN_WANT; + wakeup((caddr_t) &unvplock[ix]); + } +} + +void +union_updatevp(un, uppervp, lowervp) + struct union_node *un; + struct vnode *uppervp; + struct vnode *lowervp; +{ + int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp); + int nhash = UNION_HASH(uppervp, lowervp); + + if (ohash != nhash) { + /* + * Ensure locking is ordered from lower to higher + * to avoid deadlocks. 
+ */ + if (nhash < ohash) { + int t = ohash; + ohash = nhash; + nhash = t; + } + + while (union_list_lock(ohash)) + continue; + + while (union_list_lock(nhash)) + continue; + + LIST_REMOVE(un, un_cache); + union_list_unlock(ohash); + } else { + while (union_list_lock(nhash)) + continue; + } + + if (un->un_lowervp != lowervp) { + if (un->un_lowervp) { + vrele(un->un_lowervp); + if (un->un_path) { + free(un->un_path, M_TEMP); + un->un_path = 0; + } + if (un->un_dirvp) { + vrele(un->un_dirvp); + un->un_dirvp = NULLVP; + } + } + un->un_lowervp = lowervp; + } + + if (un->un_uppervp != uppervp) { + if (un->un_uppervp) + vrele(un->un_uppervp); + + un->un_uppervp = uppervp; + } + + if (ohash != nhash) + LIST_INSERT_HEAD(&unhead[nhash], un, un_cache); + + union_list_unlock(nhash); +} + +void +union_newlower(un, lowervp) + struct union_node *un; + struct vnode *lowervp; +{ + + union_updatevp(un, un->un_uppervp, lowervp); +} + +void +union_newupper(un, uppervp) + struct union_node *un; + struct vnode *uppervp; +{ + + union_updatevp(un, uppervp, un->un_lowervp); +} + +/* + * allocate a union_node/vnode pair. the vnode is + * referenced and locked. the new vnode is returned + * via (vpp). (mp) is the mountpoint of the union filesystem, + * (dvp) is the parent directory where the upper layer object + * should exist (but doesn't) and (cnp) is the componentname + * information which is partially copied to allow the upper + * layer object to be created at a later time. (uppervp) + * and (lowervp) reference the upper and lower layer objects + * being mapped. either, but not both, can be nil. + * if supplied, (uppervp) is locked. + * the reference is either maintained in the new union_node + * object which is allocated, or they are vrele'd. + * + * all union_nodes are maintained on a singly-linked + * list. new nodes are only allocated when they cannot + * be found on this list. entries on the list are + * removed when the vfs reclaim entry is called. 
+ *
+ * one lock word is kept per hash chain (unvplock[]).  this is
+ * needed because the getnewvnode() function can block
+ * waiting for a vnode to become free, in which case there
+ * may be more than one process trying to get the same
+ * vnode.  the chain lock is held while a chain is searched
+ * and across the call to getnewvnode.
+ *
+ * if an entry is found on the list, then call vget() to
+ * take a reference.  this is done because there may be
+ * zero references to it and so it needs to be removed from
+ * the vnode free list.
+ *
+ * returns 0 with the vnode in (*vpp), or the error from getnewvnode().
+ *
+ * NOTE(review): when (uppervp) and (lowervp) differ in v_type the
+ * lower vnode is set aside in xlowervp; the only vrele() of it is on
+ * the path that allocates a new node.  the cache-hit return and the
+ * getnewvnode() failure path do not release it -- confirm whether
+ * that is intended.
+ */
+int
+union_allocvp(vpp, mp, undvp, dvp, cnp, uppervp, lowervp)
+	struct vnode **vpp;
+	struct mount *mp;
+	struct vnode *undvp;		/* not referenced in this function */
+	struct vnode *dvp;		/* may be null */
+	struct componentname *cnp;	/* may be null */
+	struct vnode *uppervp;		/* may be null */
+	struct vnode *lowervp;		/* may be null */
+{
+	int error;
+	struct union_node *un;
+	struct union_node **pp;		/* unused */
+	struct vnode *xlowervp = NULLVP;
+	int hash;
+	int try;
+
+	if (uppervp == NULLVP && lowervp == NULLVP)
+		panic("union: unidentifiable allocation");
+
+	if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) {
+		xlowervp = lowervp;	/* types differ: proceed as if there were no lower vnode */
+		lowervp = NULLVP;
+	}
+
+loop:	/* re-entered whenever the search has to start over after a sleep or a failed vget */
+	for (try = 0; try < 3; try++) {
+		switch (try) {
+		case 0:			/* chain for the (upper, lower) pair */
+			if (lowervp == NULLVP)
+				continue;
+			hash = UNION_HASH(uppervp, lowervp);
+			break;
+
+		case 1:			/* chain for upper only */
+			if (uppervp == NULLVP)
+				continue;
+			hash = UNION_HASH(uppervp, NULLVP);
+			break;
+
+		case 2:			/* chain for lower only */
+			if (lowervp == NULLVP)
+				continue;
+			hash = UNION_HASH(NULLVP, lowervp);
+			break;
+		}
+
+		while (union_list_lock(hash))
+			continue;
+
+		for (un = unhead[hash].lh_first; un != 0;
+					un = un->un_cache.le_next) {
+			if ((un->un_lowervp == lowervp ||
+			     un->un_lowervp == NULLVP) &&
+			    (un->un_uppervp == uppervp ||
+			     un->un_uppervp == NULLVP) &&
+			    (UNIONTOV(un)->v_mount == mp)) {
+				if (vget(UNIONTOV(un), 0)) {
+					union_list_unlock(hash);
+					goto loop;
+				}
+				break;
+			}
+		}
+
+		union_list_unlock(hash);
+
+		if (un)
+			break;
+	}
+
+	if (un) {
+		/*
+		 * Obtain a lock on the union_node.
+		 * uppervp is locked, though un->un_uppervp
+		 * may not be.  this doesn't break the locking
+		 * hierarchy since in the case that un->un_uppervp
+		 * is not yet locked it will be vrele'd and replaced
+		 * with uppervp.
+		 */
+
+		if ((dvp != NULLVP) && (uppervp == dvp)) {
+			/*
+			 * Access ``.'', so (un) will already
+			 * be locked.  Since this process has
+			 * the lock on (uppervp) no other
+			 * process can hold the lock on (un).
+			 */
+#ifdef DIAGNOSTIC
+			if ((un->un_flags & UN_LOCKED) == 0)
+				panic("union: . not locked");
+			else if (curproc && un->un_pid != curproc->p_pid &&
+				    un->un_pid > -1 && curproc->p_pid > -1)
+				panic("union: allocvp not lock owner");
+#endif
+		} else {
+			if (un->un_flags & UN_LOCKED) {
+				vrele(UNIONTOV(un));	/* give back the vget() reference before sleeping */
+				un->un_flags |= UN_WANT;
+				sleep((caddr_t) &un->un_flags, PINOD);
+				goto loop;
+			}
+			un->un_flags |= UN_LOCKED;
+
+#ifdef DIAGNOSTIC
+			if (curproc)
+				un->un_pid = curproc->p_pid;
+			else
+				un->un_pid = -1;
+#endif
+		}
+
+		/*
+		 * At this point, the union_node is locked,
+		 * un->un_uppervp may not be locked, and uppervp
+		 * is locked or nil.
+		 */
+
+		/*
+		 * Save information about the upper layer.
+		 * If the node already holds this upper vnode the
+		 * caller's extra reference is dropped instead.
+		 */
+		if (uppervp != un->un_uppervp) {
+			union_newupper(un, uppervp);
+		} else if (uppervp) {
+			vrele(uppervp);
+		}
+
+		if (un->un_uppervp) {
+			un->un_flags |= UN_ULOCK;
+			un->un_flags &= ~UN_KLOCK;
+		}
+
+		/*
+		 * Save information about the lower layer.
+		 * This needs to keep track of pathname
+		 * and directory information which union_vn_create
+		 * might need.
+		 *
+		 * NOTE(review): (dvp) is documented as possibly null but
+		 * is VREF'd whenever (cnp) is supplied with a regular-file
+		 * lower vnode; presumably callers pass both or neither.
+		 */
+		if (lowervp != un->un_lowervp) {
+			union_newlower(un, lowervp);
+			if (cnp && (lowervp != NULLVP) &&
+			    (lowervp->v_type == VREG)) {
+				un->un_hash = cnp->cn_hash;
+				un->un_path = malloc(cnp->cn_namelen+1,
+						M_TEMP, M_WAITOK);
+				bcopy(cnp->cn_nameptr, un->un_path,
+						cnp->cn_namelen);
+				un->un_path[cnp->cn_namelen] = '\0';
+				VREF(dvp);
+				un->un_dirvp = dvp;
+			}
+		} else if (lowervp) {
+			vrele(lowervp);
+		}
+		*vpp = UNIONTOV(un);
+		return (0);
+	}
+
+	/*
+	 * otherwise lock the vp list while we call getnewvnode
+	 * since that can block.
+	 */
+	hash = UNION_HASH(uppervp, lowervp);
+
+	if (union_list_lock(hash))
+		goto loop;
+
+	error = getnewvnode(VT_UNION, mp, union_vnodeop_p, vpp);
+	if (error) {
+		if (uppervp) {
+			if (dvp == uppervp)
+				vrele(uppervp);
+			else
+				vput(uppervp);
+		}
+		if (lowervp)
+			vrele(lowervp);
+
+		goto out;
+	}
+
+	MALLOC((*vpp)->v_data, void *, sizeof(struct union_node),
+		M_TEMP, M_WAITOK);
+
+	if (uppervp)
+		(*vpp)->v_type = uppervp->v_type;
+	else
+		(*vpp)->v_type = lowervp->v_type;
+	un = VTOUNION(*vpp);
+	un->un_vnode = *vpp;
+	un->un_uppervp = uppervp;
+	un->un_lowervp = lowervp;
+	un->un_openl = 0;
+	un->un_flags = UN_LOCKED;	/* the new node is returned locked */
+	if (un->un_uppervp)
+		un->un_flags |= UN_ULOCK;
+#ifdef DIAGNOSTIC
+	if (curproc)
+		un->un_pid = curproc->p_pid;
+	else
+		un->un_pid = -1;
+#endif
+	if (cnp && (lowervp != NULLVP) && (lowervp->v_type == VREG)) {
+		un->un_hash = cnp->cn_hash;
+		un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK);
+		bcopy(cnp->cn_nameptr, un->un_path, cnp->cn_namelen);
+		un->un_path[cnp->cn_namelen] = '\0';
+		VREF(dvp);
+		un->un_dirvp = dvp;
+	} else {
+		un->un_hash = 0;
+		un->un_path = 0;
+		un->un_dirvp = 0;
+	}
+
+	LIST_INSERT_HEAD(&unhead[hash], un, un_cache);
+
+	if (xlowervp)
+		vrele(xlowervp);	/* the set-aside lower vnode is not kept in the node */
+
+out:
+	union_list_unlock(hash);
+
+	return (error);
+}
+/*
+ * Undo union_allocvp for (vp): unlink the union_node from its hash
+ * chain, drop the references it holds on the upper, lower and
+ * directory vnodes, free the saved path and the node itself, and
+ * clear vp->v_data.  Always returns 0.
+ *
+ * NOTE(review): the chain is modified without taking the chain lock
+ * that union_allocvp and union_updatevp use -- confirm that is safe.
+ */
+int
+union_freevp(vp)
+	struct vnode *vp;
+{
+	struct union_node *un = VTOUNION(vp);
+
+	LIST_REMOVE(un, un_cache);
+
+	if (un->un_uppervp)
+		vrele(un->un_uppervp);
+	if (un->un_lowervp)
+		vrele(un->un_lowervp);
+	if (un->un_dirvp)
+		vrele(un->un_dirvp);
+	if (un->un_path)
+		free(un->un_path, M_TEMP);
+
+	FREE(vp->v_data, M_TEMP);
+	vp->v_data = 0;
+
+	return (0);
+}
+
+/*
+ * copyfile.  copy the vnode (fvp) to the vnode (tvp)
+ * using a sequence of reads and writes.  both (fvp)
+ * and (tvp) are locked on entry and exit.
+ *
+ * returns 0 once a read comes back empty, or the first error
+ * reported by VOP_READ or VOP_WRITE.
+ */
+int
+union_copyfile(p, cred, fvp, tvp)
+	struct proc *p;
+	struct ucred *cred;
+	struct vnode *fvp;
+	struct vnode *tvp;
+{
+	char *buf;
+	struct uio uio;
+	struct iovec iov;
+	int error = 0;
+
+	/*
+	 * strategy:
+	 * allocate a buffer of size MAXBSIZE.
+	 * loop doing reads and writes, keeping track
+	 * of the current uio offset.
+	 * give up at the first sign of trouble.
+	 */
+
+	uio.uio_procp = p;
+	uio.uio_segflg = UIO_SYSSPACE;
+	uio.uio_offset = 0;
+
+	VOP_UNLOCK(fvp);				/* XXX */
+	LEASE_CHECK(fvp, p, cred, LEASE_READ);
+	VOP_LOCK(fvp);					/* XXX */
+	VOP_UNLOCK(tvp);				/* XXX */
+	LEASE_CHECK(tvp, p, cred, LEASE_WRITE);
+	VOP_LOCK(tvp);					/* XXX */
+
+	buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);
+
+	/* ugly loop follows... */
+	do {
+		off_t offset = uio.uio_offset;	/* where this chunk starts in both files */
+
+		uio.uio_iov = &iov;
+		uio.uio_iovcnt = 1;
+		iov.iov_base = buf;
+		iov.iov_len = MAXBSIZE;
+		uio.uio_resid = iov.iov_len;
+		uio.uio_rw = UIO_READ;
+		error = VOP_READ(fvp, &uio, 0, cred);
+
+		if (error == 0) {
+			uio.uio_iov = &iov;
+			uio.uio_iovcnt = 1;
+			iov.iov_base = buf;
+			iov.iov_len = MAXBSIZE - uio.uio_resid;	/* bytes the read delivered */
+			uio.uio_offset = offset;	/* write at the offset the data was read from */
+			uio.uio_rw = UIO_WRITE;
+			uio.uio_resid = iov.iov_len;
+
+			if (uio.uio_resid == 0)		/* nothing was read: done */
+				break;
+
+			do {
+				error = VOP_WRITE(tvp, &uio, 0, cred);
+			} while ((uio.uio_resid > 0) && (error == 0));
+		}
+
+	} while (error == 0);
+
+	free(buf, M_TEMP);
+	return (error);
+}
+
+/*
+ * Create a shadow directory in the upper layer.
+ * The new vnode is returned locked.
+ *
+ * (um) points to the union mount structure for access to
+ * the mounting process's credentials.
+ * (dvp) is the directory in which to create the shadow directory. + * it is unlocked on entry and exit. + * (cnp) is the componentname to be created. + * (vpp) is the returned newly created shadow directory, which + * is returned locked. + */ +int +union_mkshadow(um, dvp, cnp, vpp) + struct union_mount *um; + struct vnode *dvp; + struct componentname *cnp; + struct vnode **vpp; +{ + int error; + struct vattr va; + struct proc *p = cnp->cn_proc; + struct componentname cn; + + /* + * policy: when creating the shadow directory in the + * upper layer, create it owned by the user who did + * the mount, group from parent directory, and mode + * 777 modified by umask (ie mostly identical to the + * mkdir syscall). (jsp, kb) + */ + + /* + * A new componentname structure must be faked up because + * there is no way to know where the upper level cnp came + * from or what it is being used for. This must duplicate + * some of the work done by NDINIT, some of the work done + * by namei, some of the work done by lookup and some of + * the work done by VOP_LOOKUP when given a CREATE flag. + * Conclusion: Horrible. + * + * The pathname buffer will be FREEed by VOP_MKDIR. 
+ */ + cn.cn_pnbuf = malloc(cnp->cn_namelen+1, M_NAMEI, M_WAITOK); + bcopy(cnp->cn_nameptr, cn.cn_pnbuf, cnp->cn_namelen); + cn.cn_pnbuf[cnp->cn_namelen] = '\0'; + + cn.cn_nameiop = CREATE; + cn.cn_flags = (LOCKPARENT|HASBUF|SAVENAME|SAVESTART|ISLASTCN); + cn.cn_proc = cnp->cn_proc; + if (um->um_op == UNMNT_ABOVE) + cn.cn_cred = cnp->cn_cred; + else + cn.cn_cred = um->um_cred; + cn.cn_nameptr = cn.cn_pnbuf; + cn.cn_namelen = cnp->cn_namelen; + cn.cn_hash = cnp->cn_hash; + cn.cn_consume = cnp->cn_consume; + + VREF(dvp); + if (error = relookup(dvp, vpp, &cn)) + return (error); + vrele(dvp); + + if (*vpp) { + VOP_ABORTOP(dvp, &cn); + VOP_UNLOCK(dvp); + vrele(*vpp); + *vpp = NULLVP; + return (EEXIST); + } + + VATTR_NULL(&va); + va.va_type = VDIR; + va.va_mode = um->um_cmode; + + /* LEASE_CHECK: dvp is locked */ + LEASE_CHECK(dvp, p, p->p_ucred, LEASE_WRITE); + + error = VOP_MKDIR(dvp, vpp, &cn, &va); + return (error); +} + +/* + * union_vn_create: creates and opens a new shadow file + * on the upper union layer. this function is similar + * in spirit to calling vn_open but it avoids calling namei(). + * the problem with calling namei is that a) it locks too many + * things, and b) it doesn't start at the "right" directory, + * whereas relookup is told where to start. + */ +int +union_vn_create(vpp, un, p) + struct vnode **vpp; + struct union_node *un; + struct proc *p; +{ + struct vnode *vp; + struct ucred *cred = p->p_ucred; + struct vattr vat; + struct vattr *vap = &vat; + int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL); + int error; + int cmode = UN_FILEMODE & ~p->p_fd->fd_cmask; + char *cp; + struct componentname cn; + + *vpp = NULLVP; + + /* + * Build a new componentname structure (for the same + * reasons outlines in union_mkshadow). 
+ * The difference here is that the file is owned by + * the current user, rather than by the person who + * did the mount, since the current user needs to be + * able to write the file (that's why it is being + * copied in the first place). + */ + cn.cn_namelen = strlen(un->un_path); + cn.cn_pnbuf = (caddr_t) malloc(cn.cn_namelen, M_NAMEI, M_WAITOK); + bcopy(un->un_path, cn.cn_pnbuf, cn.cn_namelen+1); + cn.cn_nameiop = CREATE; + cn.cn_flags = (LOCKPARENT|HASBUF|SAVENAME|SAVESTART|ISLASTCN); + cn.cn_proc = p; + cn.cn_cred = p->p_ucred; + cn.cn_nameptr = cn.cn_pnbuf; + cn.cn_hash = un->un_hash; + cn.cn_consume = 0; + + VREF(un->un_dirvp); + if (error = relookup(un->un_dirvp, &vp, &cn)) + return (error); + vrele(un->un_dirvp); + + if (vp) { + VOP_ABORTOP(un->un_dirvp, &cn); + if (un->un_dirvp == vp) + vrele(un->un_dirvp); + else + vput(un->un_dirvp); + vrele(vp); + return (EEXIST); + } + + /* + * Good - there was no race to create the file + * so go ahead and create it. The permissions + * on the file will be 0666 modified by the + * current user's umask. Access to the file, while + * it is unioned, will require access to the top *and* + * bottom files. Access when not unioned will simply + * require access to the top-level file. + * TODO: confirm choice of access permissions. 
+ */ + VATTR_NULL(vap); + vap->va_type = VREG; + vap->va_mode = cmode; + LEASE_CHECK(un->un_dirvp, p, cred, LEASE_WRITE); + if (error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap)) + return (error); + + if (error = VOP_OPEN(vp, fmode, cred, p)) { + vput(vp); + return (error); + } + + vp->v_writecount++; + *vpp = vp; + return (0); +} + +int +union_vn_close(vp, fmode, cred, p) + struct vnode *vp; + int fmode; + struct ucred *cred; + struct proc *p; +{ + if (fmode & FWRITE) + --vp->v_writecount; + return (VOP_CLOSE(vp, fmode)); +} + +void +union_removed_upper(un) + struct union_node *un; +{ + if (un->un_flags & UN_ULOCK) { + un->un_flags &= ~UN_ULOCK; + VOP_UNLOCK(un->un_uppervp); + } + + union_newupper(un, NULLVP); +} + +struct vnode * +union_lowervp(vp) + struct vnode *vp; +{ + struct union_node *un = VTOUNION(vp); + + if (un->un_lowervp && (vp->v_type == un->un_lowervp->v_type)) { + if (vget(un->un_lowervp, 0)) + return (NULLVP); + } + + return (un->un_lowervp); +} diff --git a/sys/fs/unionfs/union_vfsops.c b/sys/fs/unionfs/union_vfsops.c new file mode 100644 index 00000000000..9fa27460e3d --- /dev/null +++ b/sys/fs/unionfs/union_vfsops.c @@ -0,0 +1,550 @@ +/* + * Copyright (c) 1994 The Regents of the University of California. + * Copyright (c) 1994 Jan-Simon Pendry. + * All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)union_vfsops.c 8.7 (Berkeley) 3/5/94 + */ + +/* + * Union Layer + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Mount union filesystem + */ +int +union_mount(mp, path, data, ndp, p) + struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + int error = 0; + struct union_args args; + struct vnode *lowerrootvp = NULLVP; + struct vnode *upperrootvp = NULLVP; + struct union_mount *um; + struct ucred *cred = 0; + struct ucred *scred; + struct vattr va; + char *cp; + int len; + u_int size; + +#ifdef UNION_DIAGNOSTIC + printf("union_mount(mp = %x)\n", mp); +#endif + + /* + * Update is a no-op + */ + if (mp->mnt_flag & MNT_UPDATE) { + /* + * Need to provide. + * 1. a way to convert between rdonly and rdwr mounts. + * 2. support for nfs exports. + */ + error = EOPNOTSUPP; + goto bad; + } + + /* + * Take a copy of the process's credentials. This isn't + * quite right since the euid will always be zero and we + * want to get the "real" users credentials. So fix up + * the uid field after taking the copy. + */ + cred = crdup(p->p_ucred); + cred->cr_uid = p->p_cred->p_ruid; + + /* + * Ensure the *real* user has write permission on the + * mounted-on directory. This allows the mount_union + * command to be made setuid root so allowing anyone + * to do union mounts onto any directory on which they + * have write permission and which they also own. + */ + error = VOP_GETATTR(mp->mnt_vnodecovered, &va, cred, p); + if (error) + goto bad; + if ((va.va_uid != cred->cr_uid) && + (cred->cr_uid != 0)) { + error = EACCES; + goto bad; + } + error = VOP_ACCESS(mp->mnt_vnodecovered, VWRITE, cred, p); + if (error) + goto bad; + + /* + * Get argument + */ + if (error = copyin(data, (caddr_t)&args, sizeof(struct union_args))) + goto bad; + + lowerrootvp = mp->mnt_vnodecovered; + VREF(lowerrootvp); + + /* + * Find upper node. 
Use the real process credentials, + * not the effective ones since this will have come + * through a setuid process (mount_union). All this + * messing around with permissions is entirely bogus + * and should be removed by allowing any user straight + * past the mount system call. + */ + scred = p->p_ucred; + p->p_ucred = cred; + NDINIT(ndp, LOOKUP, FOLLOW|WANTPARENT, + UIO_USERSPACE, args.target, p); + p->p_ucred = scred; + + if (error = namei(ndp)) + goto bad; + + upperrootvp = ndp->ni_vp; + vrele(ndp->ni_dvp); + ndp->ni_dvp = NULL; + + if (upperrootvp->v_type != VDIR) { + error = EINVAL; + goto bad; + } + + um = (struct union_mount *) malloc(sizeof(struct union_mount), + M_UFSMNT, M_WAITOK); /* XXX */ + + /* + * Keep a held reference to the target vnodes. + * They are vrele'd in union_unmount. + * + * Depending on the _BELOW flag, the filesystems are + * viewed in a different order. In effect, this is the + * same as providing a mount under option to the mount syscall. + */ + + um->um_op = args.mntflags & UNMNT_OPMASK; + switch (um->um_op) { + case UNMNT_ABOVE: + um->um_lowervp = lowerrootvp; + um->um_uppervp = upperrootvp; + break; + + case UNMNT_BELOW: + um->um_lowervp = upperrootvp; + um->um_uppervp = lowerrootvp; + break; + + case UNMNT_REPLACE: + vrele(lowerrootvp); + lowerrootvp = NULLVP; + um->um_uppervp = upperrootvp; + um->um_lowervp = lowerrootvp; + break; + + default: + error = EINVAL; + goto bad; + } + + um->um_cred = cred; + um->um_cmode = UN_DIRMODE &~ p->p_fd->fd_cmask; + + /* + * Depending on what you think the MNT_LOCAL flag might mean, + * you may want the && to be || on the conditional below. + * At the moment it has been defined that the filesystem is + * only local if it is all local, ie the MNT_LOCAL flag implies + * that the entire namespace is local. If you think the MNT_LOCAL + * flag implies that some of the files might be stored locally + * then you will want to change the conditional. 
+ */ + if (um->um_op == UNMNT_ABOVE) { + if (((um->um_lowervp == NULLVP) || + (um->um_lowervp->v_mount->mnt_flag & MNT_LOCAL)) && + (um->um_uppervp->v_mount->mnt_flag & MNT_LOCAL)) + mp->mnt_flag |= MNT_LOCAL; + } + + /* + * Copy in the upper layer's RDONLY flag. This is for the benefit + * of lookup() which explicitly checks the flag, rather than asking + * the filesystem for it's own opinion. This means, that an update + * mount of the underlying filesystem to go from rdonly to rdwr + * will leave the unioned view as read-only. + */ + mp->mnt_flag |= (um->um_uppervp->v_mount->mnt_flag & MNT_RDONLY); + + /* + * This is a user mount. Privilege check for unmount + * will be done in union_unmount. + */ + mp->mnt_flag |= MNT_USER; + + mp->mnt_data = (qaddr_t) um; + getnewfsid(mp, MOUNT_UNION); + + (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); + bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); + + switch (um->um_op) { + case UNMNT_ABOVE: + cp = ""; + break; + case UNMNT_BELOW: + cp = ""; + break; + case UNMNT_REPLACE: + cp = ""; + break; + } + len = strlen(cp); + bcopy(cp, mp->mnt_stat.f_mntfromname, len); + + cp = mp->mnt_stat.f_mntfromname + len; + len = MNAMELEN - len; + + (void) copyinstr(args.target, cp, len - 1, &size); + bzero(cp + size, len - size); + +#ifdef UNION_DIAGNOSTIC + printf("union_mount: from %s, on %s\n", + mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname); +#endif + return (0); + +bad: + if (cred) + crfree(cred); + if (upperrootvp) + vrele(upperrootvp); + if (lowerrootvp) + vrele(lowerrootvp); + return (error); +} + +/* + * VFS start. Nothing needed here - the start routine + * on the underlying filesystem(s) will have been called + * when that filesystem was mounted. 
+ */ +int +union_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + + return (0); +} + +/* + * Free reference to union layer + */ +int +union_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + struct union_mount *um = MOUNTTOUNIONMOUNT(mp); + struct vnode *um_rootvp; + int error; + int flags = 0; + extern int doforce; + +#ifdef UNION_DIAGNOSTIC + printf("union_unmount(mp = %x)\n", mp); +#endif + + /* only the mounter, or superuser can unmount */ + if ((p->p_cred->p_ruid != um->um_cred->cr_uid) && + (error = suser(p->p_ucred, &p->p_acflag))) + return (error); + + if (mntflags & MNT_FORCE) { + /* union can never be rootfs so don't check for it */ + if (!doforce) + return (EINVAL); + flags |= FORCECLOSE; + } + + if (error = union_root(mp, &um_rootvp)) + return (error); + if (um_rootvp->v_usecount > 1) { + vput(um_rootvp); + return (EBUSY); + } + if (error = vflush(mp, um_rootvp, flags)) { + vput(um_rootvp); + return (error); + } + +#ifdef UNION_DIAGNOSTIC + vprint("alias root of lower", um_rootvp); +#endif + /* + * Discard references to upper and lower target vnodes. + */ + if (um->um_lowervp) + vrele(um->um_lowervp); + vrele(um->um_uppervp); + crfree(um->um_cred); + /* + * Release reference on underlying root vnode + */ + vput(um_rootvp); + /* + * And blow it away for future re-use + */ + vgone(um_rootvp); + /* + * Finally, throw away the union_mount structure + */ + free(mp->mnt_data, M_UFSMNT); /* XXX */ + mp->mnt_data = 0; + return (0); +} + +int +union_root(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + struct union_mount *um = MOUNTTOUNIONMOUNT(mp); + int error; + int loselock; + +#ifdef UNION_DIAGNOSTIC + printf("union_root(mp = %x, lvp = %x, uvp = %x)\n", mp, + um->um_lowervp, + um->um_uppervp); +#endif + + /* + * Return locked reference to root. 
+ */ + VREF(um->um_uppervp); + if ((um->um_op == UNMNT_BELOW) && + VOP_ISLOCKED(um->um_uppervp)) { + loselock = 1; + } else { + VOP_LOCK(um->um_uppervp); + loselock = 0; + } + if (um->um_lowervp) + VREF(um->um_lowervp); + error = union_allocvp(vpp, mp, + (struct vnode *) 0, + (struct vnode *) 0, + (struct componentname *) 0, + um->um_uppervp, + um->um_lowervp); + + if (error) { + if (!loselock) + VOP_UNLOCK(um->um_uppervp); + vrele(um->um_uppervp); + if (um->um_lowervp) + vrele(um->um_lowervp); + } else { + (*vpp)->v_flag |= VROOT; + if (loselock) + VTOUNION(*vpp)->un_flags &= ~UN_ULOCK; + } + + return (error); +} + +int +union_quotactl(mp, cmd, uid, arg, p) + struct mount *mp; + int cmd; + uid_t uid; + caddr_t arg; + struct proc *p; +{ + + return (EOPNOTSUPP); +} + +int +union_statfs(mp, sbp, p) + struct mount *mp; + struct statfs *sbp; + struct proc *p; +{ + int error; + struct union_mount *um = MOUNTTOUNIONMOUNT(mp); + struct statfs mstat; + int lbsize; + +#ifdef UNION_DIAGNOSTIC + printf("union_statfs(mp = %x, lvp = %x, uvp = %x)\n", mp, + um->um_lowervp, + um->um_uppervp); +#endif + + bzero(&mstat, sizeof(mstat)); + + if (um->um_lowervp) { + error = VFS_STATFS(um->um_lowervp->v_mount, &mstat, p); + if (error) + return (error); + } + + /* now copy across the "interesting" information and fake the rest */ +#if 0 + sbp->f_type = mstat.f_type; + sbp->f_flags = mstat.f_flags; + sbp->f_bsize = mstat.f_bsize; + sbp->f_iosize = mstat.f_iosize; +#endif + lbsize = mstat.f_bsize; + sbp->f_blocks = mstat.f_blocks; + sbp->f_bfree = mstat.f_bfree; + sbp->f_bavail = mstat.f_bavail; + sbp->f_files = mstat.f_files; + sbp->f_ffree = mstat.f_ffree; + + error = VFS_STATFS(um->um_uppervp->v_mount, &mstat, p); + if (error) + return (error); + + sbp->f_type = MOUNT_UNION; + sbp->f_flags = mstat.f_flags; + sbp->f_bsize = mstat.f_bsize; + sbp->f_iosize = mstat.f_iosize; + + /* + * if the lower and upper blocksizes differ, then frig the + * block counts so that the sizes reported by df 
make some + * kind of sense. none of this makes sense though. + */ + + if (mstat.f_bsize != lbsize) { + sbp->f_blocks = sbp->f_blocks * lbsize / mstat.f_bsize; + sbp->f_bfree = sbp->f_bfree * lbsize / mstat.f_bsize; + sbp->f_bavail = sbp->f_bavail * lbsize / mstat.f_bsize; + } + sbp->f_blocks += mstat.f_blocks; + sbp->f_bfree += mstat.f_bfree; + sbp->f_bavail += mstat.f_bavail; + sbp->f_files += mstat.f_files; + sbp->f_ffree += mstat.f_ffree; + + if (sbp != &mp->mnt_stat) { + bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); + bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); + bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); + } + return (0); +} + +int +union_sync(mp, waitfor, cred, p) + struct mount *mp; + int waitfor; + struct ucred *cred; + struct proc *p; +{ + + /* + * XXX - Assumes no data cached at union layer. + */ + return (0); +} + +int +union_vget(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + + return (EOPNOTSUPP); +} + +int +union_fhtovp(mp, fidp, nam, vpp, exflagsp, credanonp) + struct mount *mp; + struct fid *fidp; + struct mbuf *nam; + struct vnode **vpp; + int *exflagsp; + struct ucred **credanonp; +{ + + return (EOPNOTSUPP); +} + +int +union_vptofh(vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + + return (EOPNOTSUPP); +} + +int union_init __P((void)); + +struct vfsops union_vfsops = { + union_mount, + union_start, + union_unmount, + union_root, + union_quotactl, + union_statfs, + union_sync, + union_vget, + union_fhtovp, + union_vptofh, + union_init, +}; diff --git a/sys/fs/unionfs/union_vnops.c b/sys/fs/unionfs/union_vnops.c new file mode 100644 index 00000000000..96327b0922d --- /dev/null +++ b/sys/fs/unionfs/union_vnops.c @@ -0,0 +1,1495 @@ +/* + * Copyright (c) 1992, 1993, 1994 The Regents of the University of California. + * Copyright (c) 1992, 1993, 1994 Jan-Simon Pendry. + * All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)union_vnops.c 8.6 (Berkeley) 2/17/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define FIXUP(un) { \ + if (((un)->un_flags & UN_ULOCK) == 0) { \ + union_fixup(un); \ + } \ +} + +static void +union_fixup(un) + struct union_node *un; +{ + + VOP_LOCK(un->un_uppervp); + un->un_flags |= UN_ULOCK; +} + +static int +union_lookup1(udvp, dvp, vpp, cnp) + struct vnode *udvp; + struct vnode *dvp; + struct vnode **vpp; + struct componentname *cnp; +{ + int error; + struct vnode *tdvp; + struct mount *mp; + + /* + * If stepping up the directory tree, check for going + * back across the mount point, in which case do what + * lookup would do by stepping back down the mount + * hierarchy. + */ + if (cnp->cn_flags & ISDOTDOT) { + for (;;) { + /* + * Don't do the NOCROSSMOUNT check + * at this level. By definition, + * union fs deals with namespaces, not + * filesystems. + */ + if ((dvp->v_flag & VROOT) == 0) + break; + + tdvp = dvp; + dvp = dvp->v_mount->mnt_vnodecovered; + vput(tdvp); + VREF(dvp); + VOP_LOCK(dvp); + } + } + + error = VOP_LOOKUP(dvp, &tdvp, cnp); + if (error) + return (error); + + /* + * The parent directory will have been unlocked, unless lookup + * found the last component. In which case, re-lock the node + * here to allow it to be unlocked again (phew) in union_lookup. + */ + if (dvp != tdvp && !(cnp->cn_flags & ISLASTCN)) + VOP_LOCK(dvp); + + dvp = tdvp; + + /* + * Lastly check if the current node is a mount point in + * which case walk up the mount hierarchy making sure not to + * bump into the root of the mount tree (ie. dvp != udvp). 
+ */ + while (dvp != udvp && (dvp->v_type == VDIR) && + (mp = dvp->v_mountedhere)) { + + if (mp->mnt_flag & MNT_MLOCK) { + mp->mnt_flag |= MNT_MWAIT; + sleep((caddr_t) mp, PVFS); + continue; + } + + if (error = VFS_ROOT(mp, &tdvp)) { + vput(dvp); + return (error); + } + + vput(dvp); + dvp = tdvp; + } + + *vpp = dvp; + return (0); +} + +int +union_lookup(ap) + struct vop_lookup_args /* { + struct vnodeop_desc *a_desc; + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + int error; + int uerror, lerror; + struct vnode *uppervp, *lowervp; + struct vnode *upperdvp, *lowerdvp; + struct vnode *dvp = ap->a_dvp; + struct union_node *dun = VTOUNION(dvp); + struct componentname *cnp = ap->a_cnp; + int lockparent = cnp->cn_flags & LOCKPARENT; + int rdonly = cnp->cn_flags & RDONLY; + struct union_mount *um = MOUNTTOUNIONMOUNT(dvp->v_mount); + struct ucred *saved_cred; + + cnp->cn_flags |= LOCKPARENT; + + upperdvp = dun->un_uppervp; + lowerdvp = dun->un_lowervp; + uppervp = NULLVP; + lowervp = NULLVP; + + /* + * do the lookup in the upper level. + * if that level comsumes additional pathnames, + * then assume that something special is going + * on and just return that vnode. + */ + if (upperdvp) { + FIXUP(dun); + uerror = union_lookup1(um->um_uppervp, upperdvp, + &uppervp, cnp); + /*if (uppervp == upperdvp) + dun->un_flags |= UN_KLOCK;*/ + + if (cnp->cn_consume != 0) { + *ap->a_vpp = uppervp; + if (!lockparent) + cnp->cn_flags &= ~LOCKPARENT; + return (uerror); + } + } else { + uerror = ENOENT; + } + + /* + * in a similar way to the upper layer, do the lookup + * in the lower layer. this time, if there is some + * component magic going on, then vput whatever we got + * back from the upper layer and return the lower vnode + * instead. + */ + if (lowerdvp) { + int nameiop; + + VOP_LOCK(lowerdvp); + + /* + * Only do a LOOKUP on the bottom node, since + * we won't be making changes to it anyway. 
+ */ + nameiop = cnp->cn_nameiop; + cnp->cn_nameiop = LOOKUP; + if (um->um_op == UNMNT_BELOW) { + saved_cred = cnp->cn_cred; + cnp->cn_cred = um->um_cred; + } + lerror = union_lookup1(um->um_lowervp, lowerdvp, + &lowervp, cnp); + if (um->um_op == UNMNT_BELOW) + cnp->cn_cred = saved_cred; + cnp->cn_nameiop = nameiop; + + if (lowervp != lowerdvp) + VOP_UNLOCK(lowerdvp); + + if (cnp->cn_consume != 0) { + if (uppervp) { + if (uppervp == upperdvp) + vrele(uppervp); + else + vput(uppervp); + uppervp = NULLVP; + } + *ap->a_vpp = lowervp; + if (!lockparent) + cnp->cn_flags &= ~LOCKPARENT; + return (lerror); + } + } else { + lerror = ENOENT; + } + + if (!lockparent) + cnp->cn_flags &= ~LOCKPARENT; + + /* + * at this point, we have uerror and lerror indicating + * possible errors with the lookups in the upper and lower + * layers. additionally, uppervp and lowervp are (locked) + * references to existing vnodes in the upper and lower layers. + * + * there are now three cases to consider. + * 1. if both layers returned an error, then return whatever + * error the upper layer generated. + * + * 2. if the top layer failed and the bottom layer succeeded + * then two subcases occur. + * a. the bottom vnode is not a directory, in which + * case just return a new union vnode referencing + * an empty top layer and the existing bottom layer. + * b. the bottom vnode is a directory, in which case + * create a new directory in the top-level and + * continue as in case 3. + * + * 3. if the top layer succeeded then return a new union + * vnode referencing whatever the new top layer and + * whatever the bottom layer returned. + */ + + *ap->a_vpp = NULLVP; + + /* case 1. */ + if ((uerror != 0) && (lerror != 0)) { + return (uerror); + } + + /* case 2. */ + if (uerror != 0 /* && (lerror == 0) */ ) { + if (lowervp->v_type == VDIR) { /* case 2b. 
*/ + dun->un_flags &= ~UN_ULOCK; + VOP_UNLOCK(upperdvp); + uerror = union_mkshadow(um, upperdvp, cnp, &uppervp); + VOP_LOCK(upperdvp); + dun->un_flags |= UN_ULOCK; + + if (uerror) { + if (lowervp) { + vput(lowervp); + lowervp = NULLVP; + } + return (uerror); + } + } + } + + if (lowervp) + VOP_UNLOCK(lowervp); + + error = union_allocvp(ap->a_vpp, dvp->v_mount, dvp, upperdvp, cnp, + uppervp, lowervp); + + if (error) { + if (uppervp) + vput(uppervp); + if (lowervp) + vrele(lowervp); + } else { + if (*ap->a_vpp != dvp) + if (!lockparent || !(cnp->cn_flags & ISLASTCN)) + VOP_UNLOCK(dvp); + } + + return (error); +} + +int +union_create(ap) + struct vop_create_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + struct union_node *un = VTOUNION(ap->a_dvp); + struct vnode *dvp = un->un_uppervp; + + if (dvp) { + int error; + struct vnode *vp; + + FIXUP(un); + + VREF(dvp); + un->un_flags |= UN_KLOCK; + vput(ap->a_dvp); + error = VOP_CREATE(dvp, &vp, ap->a_cnp, ap->a_vap); + if (error) + return (error); + + error = union_allocvp( + ap->a_vpp, + ap->a_dvp->v_mount, + ap->a_dvp, + NULLVP, + ap->a_cnp, + vp, + NULLVP); + if (error) + vput(vp); + return (error); + } + + vput(ap->a_dvp); + return (EROFS); +} + +int +union_mknod(ap) + struct vop_mknod_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + struct union_node *un = VTOUNION(ap->a_dvp); + struct vnode *dvp = un->un_uppervp; + + if (dvp) { + int error; + struct vnode *vp; + + FIXUP(un); + + VREF(dvp); + un->un_flags |= UN_KLOCK; + vput(ap->a_dvp); + error = VOP_MKNOD(dvp, &vp, ap->a_cnp, ap->a_vap); + if (error) + return (error); + + if (vp) { + error = union_allocvp( + ap->a_vpp, + ap->a_dvp->v_mount, + ap->a_dvp, + NULLVP, + ap->a_cnp, + vp, + NULLVP); + if (error) + vput(vp); + } + return (error); + } + + vput(ap->a_dvp); + return (EROFS); +} + +int +union_open(ap) + 
struct vop_open_args /* { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct union_node *un = VTOUNION(ap->a_vp); + struct vnode *tvp; + int mode = ap->a_mode; + struct ucred *cred = ap->a_cred; + struct proc *p = ap->a_p; + int error; + + /* + * If there is an existing upper vp then simply open that. + */ + tvp = un->un_uppervp; + if (tvp == NULLVP) { + /* + * If the lower vnode is being opened for writing, then + * copy the file contents to the upper vnode and open that, + * otherwise can simply open the lower vnode. + */ + tvp = un->un_lowervp; + if ((ap->a_mode & FWRITE) && (tvp->v_type == VREG)) { + struct vnode *vp; + int i; + + /* + * Open the named file in the upper layer. Note that + * the file may have come into existence *since* the + * lookup was done, since the upper layer may really + * be a loopback mount of some other filesystem... + * so open the file with exclusive create and barf if + * it already exists. + * XXX - perhaps should re-lookup the node (once more + * with feeling) and simply open that. Who knows. + */ + error = union_vn_create(&vp, un, p); + if (error) + return (error); + + /* at this point, uppervp is locked */ + union_newupper(un, vp); + un->un_flags |= UN_ULOCK; + + /* + * Now, if the file is being opened with truncation, + * then the (new) upper vnode is ready to fly, + * otherwise the data from the lower vnode must be + * copied to the upper layer first. This only works + * for regular files (check is made above). 
+ */ + if ((mode & O_TRUNC) == 0) { + /* + * XXX - should not ignore errors + * from VOP_CLOSE + */ + VOP_LOCK(tvp); + error = VOP_OPEN(tvp, FREAD, cred, p); + if (error == 0) { + error = union_copyfile(p, cred, + tvp, un->un_uppervp); + VOP_UNLOCK(tvp); + (void) VOP_CLOSE(tvp, FREAD); + } else { + VOP_UNLOCK(tvp); + } + +#ifdef UNION_DIAGNOSTIC + if (!error) + uprintf("union: copied up %s\n", + un->un_path); +#endif + } + + un->un_flags &= ~UN_ULOCK; + VOP_UNLOCK(un->un_uppervp); + union_vn_close(un->un_uppervp, FWRITE, cred, p); + VOP_LOCK(un->un_uppervp); + un->un_flags |= UN_ULOCK; + + /* + * Subsequent IOs will go to the top layer, so + * call close on the lower vnode and open on the + * upper vnode to ensure that the filesystem keeps + * its references counts right. This doesn't do + * the right thing with (cred) and (FREAD) though. + * Ignoring error returns is not righ, either. + */ + for (i = 0; i < un->un_openl; i++) { + (void) VOP_CLOSE(tvp, FREAD); + (void) VOP_OPEN(un->un_uppervp, FREAD, cred, p); + } + un->un_openl = 0; + + if (error == 0) + error = VOP_OPEN(un->un_uppervp, mode, cred, p); + return (error); + } + + /* + * Just open the lower vnode + */ + un->un_openl++; + VOP_LOCK(tvp); + error = VOP_OPEN(tvp, mode, cred, p); + VOP_UNLOCK(tvp); + + return (error); + } + + FIXUP(un); + + error = VOP_OPEN(tvp, mode, cred, p); + + return (error); +} + +int +union_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct union_node *un = VTOUNION(ap->a_vp); + struct vnode *vp; + + if (un->un_uppervp) { + vp = un->un_uppervp; + } else { +#ifdef UNION_DIAGNOSTIC + if (un->un_openl <= 0) + panic("union: un_openl cnt"); +#endif + --un->un_openl; + vp = un->un_lowervp; + } + + return (VOP_CLOSE(vp, ap->a_fflag, ap->a_cred, ap->a_p)); +} + +/* + * Check access permission on the union vnode. 
+ * The access check being enforced is to check + * against both the underlying vnode, and any + * copied vnode. This ensures that no additional + * file permissions are given away simply because + * the user caused an implicit file copy. + */ +int +union_access(ap) + struct vop_access_args /* { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct union_node *un = VTOUNION(ap->a_vp); + int error = EACCES; + struct vnode *vp; + + if (vp = un->un_uppervp) { + FIXUP(un); + return (VOP_ACCESS(vp, ap->a_mode, ap->a_cred, ap->a_p)); + } + + if (vp = un->un_lowervp) { + VOP_LOCK(vp); + error = VOP_ACCESS(vp, ap->a_mode, ap->a_cred, ap->a_p); + if (error == 0) { + struct union_mount *um = MOUNTTOUNIONMOUNT(vp->v_mount); + + if (um->um_op == UNMNT_BELOW) + error = VOP_ACCESS(vp, ap->a_mode, + um->um_cred, ap->a_p); + } + VOP_UNLOCK(vp); + if (error) + return (error); + } + + return (error); +} + +/* + * We handle getattr only to change the fsid. + */ +int +union_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + int error; + struct union_node *un = VTOUNION(ap->a_vp); + struct vnode *vp = un->un_uppervp; + struct vattr *vap; + struct vattr va; + + + /* + * Some programs walk the filesystem hierarchy by counting + * links to directories to avoid stat'ing all the time. + * This means the link count on directories needs to be "correct". + * The only way to do that is to call getattr on both layers + * and fix up the link count. The link count will not necessarily + * be accurate but will be large enough to defeat the tree walkers. 
+ */ + + vap = ap->a_vap; + + vp = un->un_uppervp; + if (vp != NULLVP) { + FIXUP(un); + error = VOP_GETATTR(vp, vap, ap->a_cred, ap->a_p); + if (error) + return (error); + } + + if (vp == NULLVP) { + vp = un->un_lowervp; + } else if (vp->v_type == VDIR) { + vp = un->un_lowervp; + vap = &va; + } else { + vp = NULLVP; + } + + if (vp != NULLVP) { + VOP_LOCK(vp); + error = VOP_GETATTR(vp, vap, ap->a_cred, ap->a_p); + VOP_UNLOCK(vp); + if (error) + return (error); + } + + if ((vap != ap->a_vap) && (vap->va_type == VDIR)) + ap->a_vap->va_nlink += vap->va_nlink; + + vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; + return (0); +} + +int +union_setattr(ap) + struct vop_setattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct union_node *un = VTOUNION(ap->a_vp); + int error; + + /* + * Handle case of truncating lower object to zero size, + * by creating a zero length upper object. This is to + * handle the case of open with O_TRUNC and O_CREAT. + */ + if ((un->un_uppervp == NULLVP) && + /* assert(un->un_lowervp != NULLVP) */ + (un->un_lowervp->v_type == VREG) && + (ap->a_vap->va_size == 0)) { + struct vnode *vp; + + error = union_vn_create(&vp, un, ap->a_p); + if (error) + return (error); + + /* at this point, uppervp is locked */ + union_newupper(un, vp); + + VOP_UNLOCK(vp); + union_vn_close(un->un_uppervp, FWRITE, ap->a_cred, ap->a_p); + VOP_LOCK(vp); + un->un_flags |= UN_ULOCK; + } + + /* + * Try to set attributes in upper layer, + * otherwise return read-only filesystem error. 
+ */ + if (un->un_uppervp != NULLVP) { + FIXUP(un); + error = VOP_SETATTR(un->un_uppervp, ap->a_vap, + ap->a_cred, ap->a_p); + } else { + error = EROFS; + } + + return (error); +} + +int +union_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + int error; + struct vnode *vp = OTHERVP(ap->a_vp); + int dolock = (vp == LOWERVP(ap->a_vp)); + + if (dolock) + VOP_LOCK(vp); + else + FIXUP(VTOUNION(ap->a_vp)); + error = VOP_READ(vp, ap->a_uio, ap->a_ioflag, ap->a_cred); + if (dolock) + VOP_UNLOCK(vp); + + return (error); +} + +int +union_write(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + int error; + struct vnode *vp = OTHERVP(ap->a_vp); + int dolock = (vp == LOWERVP(ap->a_vp)); + + if (dolock) + VOP_LOCK(vp); + else + FIXUP(VTOUNION(ap->a_vp)); + error = VOP_WRITE(vp, ap->a_uio, ap->a_ioflag, ap->a_cred); + if (dolock) + VOP_UNLOCK(vp); + + return (error); +} + +int +union_ioctl(ap) + struct vop_ioctl_args /* { + struct vnode *a_vp; + int a_command; + caddr_t a_data; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + return (VOP_IOCTL(OTHERVP(ap->a_vp), ap->a_command, ap->a_data, + ap->a_fflag, ap->a_cred, ap->a_p)); +} + +int +union_select(ap) + struct vop_select_args /* { + struct vnode *a_vp; + int a_which; + int a_fflags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + return (VOP_SELECT(OTHERVP(ap->a_vp), ap->a_which, ap->a_fflags, + ap->a_cred, ap->a_p)); +} + +int +union_mmap(ap) + struct vop_mmap_args /* { + struct vnode *a_vp; + int a_fflags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + return (VOP_MMAP(OTHERVP(ap->a_vp), ap->a_fflags, + ap->a_cred, ap->a_p)); +} + +int +union_fsync(ap) + struct vop_fsync_args /* { + struct vnode *a_vp; + struct ucred *a_cred; + int a_waitfor; + struct proc *a_p; + } */ *ap; +{ + int error = 0; + 
struct vnode *targetvp = OTHERVP(ap->a_vp); + + if (targetvp) { + int dolock = (targetvp == LOWERVP(ap->a_vp)); + + if (dolock) + VOP_LOCK(targetvp); + else + FIXUP(VTOUNION(ap->a_vp)); + error = VOP_FSYNC(targetvp, ap->a_cred, + ap->a_waitfor, ap->a_p); + if (dolock) + VOP_UNLOCK(targetvp); + } + + return (error); +} + +int +union_seek(ap) + struct vop_seek_args /* { + struct vnode *a_vp; + off_t a_oldoff; + off_t a_newoff; + struct ucred *a_cred; + } */ *ap; +{ + + return (VOP_SEEK(OTHERVP(ap->a_vp), ap->a_oldoff, ap->a_newoff, ap->a_cred)); +} + +int +union_remove(ap) + struct vop_remove_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + int error; + struct union_node *dun = VTOUNION(ap->a_dvp); + struct union_node *un = VTOUNION(ap->a_vp); + + if (dun->un_uppervp && un->un_uppervp) { + struct vnode *dvp = dun->un_uppervp; + struct vnode *vp = un->un_uppervp; + + FIXUP(dun); + VREF(dvp); + dun->un_flags |= UN_KLOCK; + vput(ap->a_dvp); + FIXUP(un); + VREF(vp); + un->un_flags |= UN_KLOCK; + vput(ap->a_vp); + + error = VOP_REMOVE(dvp, vp, ap->a_cnp); + if (!error) + union_removed_upper(un); + + /* + * XXX: should create a whiteout here + */ + } else { + /* + * XXX: should create a whiteout here + */ + vput(ap->a_dvp); + vput(ap->a_vp); + error = EROFS; + } + + return (error); +} + +int +union_link(ap) + struct vop_link_args /* { + struct vnode *a_vp; + struct vnode *a_tdvp; + struct componentname *a_cnp; + } */ *ap; +{ + int error; + struct union_node *dun = VTOUNION(ap->a_vp); + struct union_node *un = VTOUNION(ap->a_tdvp); + + if (dun->un_uppervp && un->un_uppervp) { + struct vnode *dvp = dun->un_uppervp; + struct vnode *vp = un->un_uppervp; + + FIXUP(dun); + VREF(dvp); + dun->un_flags |= UN_KLOCK; + vput(ap->a_vp); + FIXUP(un); + VREF(vp); + vrele(ap->a_tdvp); + + error = VOP_LINK(dvp, vp, ap->a_cnp); + } else { + /* + * XXX: need to copy to upper layer + * and do the link there. 
+ */ + vput(ap->a_vp); + vrele(ap->a_tdvp); + error = EROFS; + } + + return (error); +} + +int +union_rename(ap) + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap; +{ + int error; + + struct vnode *fdvp = ap->a_fdvp; + struct vnode *fvp = ap->a_fvp; + struct vnode *tdvp = ap->a_tdvp; + struct vnode *tvp = ap->a_tvp; + + if (fdvp->v_op == union_vnodeop_p) { /* always true */ + struct union_node *un = VTOUNION(fdvp); + if (un->un_uppervp == NULLVP) { + error = EROFS; + goto bad; + } + + FIXUP(un); + fdvp = un->un_uppervp; + VREF(fdvp); + vrele(ap->a_fdvp); + } + + if (fvp->v_op == union_vnodeop_p) { /* always true */ + struct union_node *un = VTOUNION(fvp); + if (un->un_uppervp == NULLVP) { + error = EROFS; + goto bad; + } + + FIXUP(un); + fvp = un->un_uppervp; + VREF(fvp); + vrele(ap->a_fvp); + } + + if (tdvp->v_op == union_vnodeop_p) { + struct union_node *un = VTOUNION(tdvp); + if (un->un_uppervp == NULLVP) { + error = EROFS; + goto bad; + } + + tdvp = un->un_uppervp; + VREF(tdvp); + un->un_flags |= UN_KLOCK; + vput(ap->a_tdvp); + } + + if (tvp && tvp->v_op == union_vnodeop_p) { + struct union_node *un = VTOUNION(tvp); + if (un->un_uppervp == NULLVP) { + error = EROFS; + goto bad; + } + + tvp = un->un_uppervp; + VREF(tvp); + un->un_flags |= UN_KLOCK; + vput(ap->a_tvp); + } + + return (VOP_RENAME(fdvp, fvp, ap->a_fcnp, tdvp, tvp, ap->a_tcnp)); + +bad: + vrele(fdvp); + vrele(fvp); + vput(tdvp); + if (tvp) + vput(tvp); + + return (error); +} + +int +union_mkdir(ap) + struct vop_mkdir_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + struct union_node *un = VTOUNION(ap->a_dvp); + struct vnode *dvp = un->un_uppervp; + + if (dvp) { + int error; + struct vnode *vp; + + FIXUP(un); + VREF(dvp); + un->un_flags |= UN_KLOCK; + vput(ap->a_dvp); + error = 
VOP_MKDIR(dvp, &vp, ap->a_cnp, ap->a_vap); + if (error) + return (error); + + error = union_allocvp( + ap->a_vpp, + ap->a_dvp->v_mount, + ap->a_dvp, + NULLVP, + ap->a_cnp, + vp, + NULLVP); + if (error) + vput(vp); + return (error); + } + + vput(ap->a_dvp); + return (EROFS); +} + +int +union_rmdir(ap) + struct vop_rmdir_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + int error; + struct union_node *dun = VTOUNION(ap->a_dvp); + struct union_node *un = VTOUNION(ap->a_vp); + + if (dun->un_uppervp && un->un_uppervp) { + struct vnode *dvp = dun->un_uppervp; + struct vnode *vp = un->un_uppervp; + + FIXUP(dun); + VREF(dvp); + dun->un_flags |= UN_KLOCK; + vput(ap->a_dvp); + FIXUP(un); + VREF(vp); + un->un_flags |= UN_KLOCK; + vput(ap->a_vp); + + error = VOP_RMDIR(dvp, vp, ap->a_cnp); + if (!error) + union_removed_upper(un); + + /* + * XXX: should create a whiteout here + */ + } else { + /* + * XXX: should create a whiteout here + */ + vput(ap->a_dvp); + vput(ap->a_vp); + error = EROFS; + } + + return (error); +} + +int +union_symlink(ap) + struct vop_symlink_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + char *a_target; + } */ *ap; +{ + struct union_node *un = VTOUNION(ap->a_dvp); + struct vnode *dvp = un->un_uppervp; + + if (dvp) { + int error; + struct vnode *vp; + struct mount *mp = ap->a_dvp->v_mount; + + FIXUP(un); + VREF(dvp); + un->un_flags |= UN_KLOCK; + vput(ap->a_dvp); + error = VOP_SYMLINK(dvp, &vp, ap->a_cnp, + ap->a_vap, ap->a_target); + *ap->a_vpp = NULLVP; + return (error); + } + + vput(ap->a_dvp); + return (EROFS); +} + +/* + * union_readdir works in concert with getdirentries and + * readdir(3) to provide a list of entries in the unioned + * directories. getdirentries is responsible for walking + * down the union stack. readdir(3) is responsible for + * eliminating duplicate names from the returned data stream. 
+ */ +int +union_readdir(ap) + struct vop_readdir_args /* { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + int error = 0; + struct union_node *un = VTOUNION(ap->a_vp); + + if (un->un_uppervp) { + FIXUP(un); + error = VOP_READDIR(un->un_uppervp, ap->a_uio, ap->a_cred); + } + + return (error); +} + +int +union_readlink(ap) + struct vop_readlink_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + int error; + struct vnode *vp = OTHERVP(ap->a_vp); + int dolock = (vp == LOWERVP(ap->a_vp)); + + if (dolock) + VOP_LOCK(vp); + else + FIXUP(VTOUNION(ap->a_vp)); + error = VOP_READLINK(vp, ap->a_uio, ap->a_cred); + if (dolock) + VOP_UNLOCK(vp); + + return (error); +} + +int +union_abortop(ap) + struct vop_abortop_args /* { + struct vnode *a_dvp; + struct componentname *a_cnp; + } */ *ap; +{ + int error; + struct vnode *vp = OTHERVP(ap->a_dvp); + struct union_node *un = VTOUNION(ap->a_dvp); + int islocked = un->un_flags & UN_LOCKED; + int dolock = (vp == LOWERVP(ap->a_dvp)); + + if (islocked) { + if (dolock) + VOP_LOCK(vp); + else + FIXUP(VTOUNION(ap->a_dvp)); + } + error = VOP_ABORTOP(vp, ap->a_cnp); + if (islocked && dolock) + VOP_UNLOCK(vp); + + return (error); +} + +int +union_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + /* + * Do nothing (and _don't_ bypass). + * Wait to vrele lowervp until reclaim, + * so that until then our union_node is in the + * cache and reusable. + * + * NEEDSWORK: Someday, consider inactive'ing + * the lowervp and then trying to reactivate it + * with capabilities (v_id) + * like they do in the name lookup cache code. + * That's too much work for now. 
+ */ + +#ifdef UNION_DIAGNOSTIC + struct union_node *un = VTOUNION(ap->a_vp); + + if (un->un_flags & UN_LOCKED) + panic("union: inactivating locked node"); +#endif + + return (0); +} + +int +union_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + union_freevp(ap->a_vp); + + return (0); +} + +int +union_lock(ap) + struct vop_lock_args *ap; +{ + struct vnode *vp = ap->a_vp; + struct union_node *un; + +start: + while (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + sleep((caddr_t)vp, PINOD); + } + + un = VTOUNION(vp); + + if (un->un_uppervp) { + if ((un->un_flags & UN_ULOCK) == 0) { + un->un_flags |= UN_ULOCK; + VOP_LOCK(un->un_uppervp); + } +#ifdef DIAGNOSTIC + if (un->un_flags & UN_KLOCK) + panic("union: dangling upper lock"); +#endif + } + + if (un->un_flags & UN_LOCKED) { +#ifdef DIAGNOSTIC + if (curproc && un->un_pid == curproc->p_pid && + un->un_pid > -1 && curproc->p_pid > -1) + panic("union: locking against myself"); +#endif + un->un_flags |= UN_WANT; + sleep((caddr_t) &un->un_flags, PINOD); + goto start; + } + +#ifdef DIAGNOSTIC + if (curproc) + un->un_pid = curproc->p_pid; + else + un->un_pid = -1; +#endif + + un->un_flags |= UN_LOCKED; + return (0); +} + +int +union_unlock(ap) + struct vop_lock_args *ap; +{ + struct union_node *un = VTOUNION(ap->a_vp); + +#ifdef DIAGNOSTIC + if ((un->un_flags & UN_LOCKED) == 0) + panic("union: unlock unlocked node"); + if (curproc && un->un_pid != curproc->p_pid && + curproc->p_pid > -1 && un->un_pid > -1) + panic("union: unlocking other process's union node"); +#endif + + un->un_flags &= ~UN_LOCKED; + + if ((un->un_flags & (UN_ULOCK|UN_KLOCK)) == UN_ULOCK) + VOP_UNLOCK(un->un_uppervp); + + un->un_flags &= ~(UN_ULOCK|UN_KLOCK); + + if (un->un_flags & UN_WANT) { + un->un_flags &= ~UN_WANT; + wakeup((caddr_t) &un->un_flags); + } + +#ifdef DIAGNOSTIC + un->un_pid = 0; +#endif + + return (0); +} + +int +union_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct 
vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + } */ *ap; +{ + int error; + struct vnode *vp = OTHERVP(ap->a_vp); + int dolock = (vp == LOWERVP(ap->a_vp)); + + if (dolock) + VOP_LOCK(vp); + else + FIXUP(VTOUNION(ap->a_vp)); + error = VOP_BMAP(vp, ap->a_bn, ap->a_vpp, ap->a_bnp, ap->a_runp); + if (dolock) + VOP_UNLOCK(vp); + + return (error); +} + +int +union_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + printf("\ttag VT_UNION, vp=%x, uppervp=%x, lowervp=%x\n", + vp, UPPERVP(vp), LOWERVP(vp)); + return (0); +} + +int +union_islocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + return ((VTOUNION(ap->a_vp)->un_flags & UN_LOCKED) ? 1 : 0); +} + +int +union_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + int error; + struct vnode *vp = OTHERVP(ap->a_vp); + int dolock = (vp == LOWERVP(ap->a_vp)); + + if (dolock) + VOP_LOCK(vp); + else + FIXUP(VTOUNION(ap->a_vp)); + error = VOP_PATHCONF(vp, ap->a_name, ap->a_retval); + if (dolock) + VOP_UNLOCK(vp); + + return (error); +} + +int +union_advlock(ap) + struct vop_advlock_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap; +{ + + return (VOP_ADVLOCK(OTHERVP(ap->a_vp), ap->a_id, ap->a_op, + ap->a_fl, ap->a_flags)); +} + + +/* + * XXX - vop_strategy must be hand coded because it has no + * vnode in its arguments. + * This goes away with a merged VM/buffer cache. 
+ */ +int +union_strategy(ap) + struct vop_strategy_args /* { + struct buf *a_bp; + } */ *ap; +{ + struct buf *bp = ap->a_bp; + int error; + struct vnode *savedvp; + + savedvp = bp->b_vp; + bp->b_vp = OTHERVP(bp->b_vp); + +#ifdef DIAGNOSTIC + if (bp->b_vp == NULLVP) + panic("union_strategy: nil vp"); + if (((bp->b_flags & B_READ) == 0) && + (bp->b_vp == LOWERVP(savedvp))) + panic("union_strategy: writing to lowervp"); +#endif + + error = VOP_STRATEGY(bp); + bp->b_vp = savedvp; + + return (error); +} + +/* + * Global vfs data structures + */ +int (**union_vnodeop_p)(); +struct vnodeopv_entry_desc union_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, union_lookup }, /* lookup */ + { &vop_create_desc, union_create }, /* create */ + { &vop_mknod_desc, union_mknod }, /* mknod */ + { &vop_open_desc, union_open }, /* open */ + { &vop_close_desc, union_close }, /* close */ + { &vop_access_desc, union_access }, /* access */ + { &vop_getattr_desc, union_getattr }, /* getattr */ + { &vop_setattr_desc, union_setattr }, /* setattr */ + { &vop_read_desc, union_read }, /* read */ + { &vop_write_desc, union_write }, /* write */ + { &vop_ioctl_desc, union_ioctl }, /* ioctl */ + { &vop_select_desc, union_select }, /* select */ + { &vop_mmap_desc, union_mmap }, /* mmap */ + { &vop_fsync_desc, union_fsync }, /* fsync */ + { &vop_seek_desc, union_seek }, /* seek */ + { &vop_remove_desc, union_remove }, /* remove */ + { &vop_link_desc, union_link }, /* link */ + { &vop_rename_desc, union_rename }, /* rename */ + { &vop_mkdir_desc, union_mkdir }, /* mkdir */ + { &vop_rmdir_desc, union_rmdir }, /* rmdir */ + { &vop_symlink_desc, union_symlink }, /* symlink */ + { &vop_readdir_desc, union_readdir }, /* readdir */ + { &vop_readlink_desc, union_readlink }, /* readlink */ + { &vop_abortop_desc, union_abortop }, /* abortop */ + { &vop_inactive_desc, union_inactive }, /* inactive */ + { &vop_reclaim_desc, union_reclaim }, /* reclaim */ + { &vop_lock_desc, 
union_lock }, /* lock */ + { &vop_unlock_desc, union_unlock }, /* unlock */ + { &vop_bmap_desc, union_bmap }, /* bmap */ + { &vop_strategy_desc, union_strategy }, /* strategy */ + { &vop_print_desc, union_print }, /* print */ + { &vop_islocked_desc, union_islocked }, /* islocked */ + { &vop_pathconf_desc, union_pathconf }, /* pathconf */ + { &vop_advlock_desc, union_advlock }, /* advlock */ +#ifdef notdef + { &vop_blkatoff_desc, union_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, union_valloc }, /* valloc */ + { &vop_vfree_desc, union_vfree }, /* vfree */ + { &vop_truncate_desc, union_truncate }, /* truncate */ + { &vop_update_desc, union_update }, /* update */ + { &vop_bwrite_desc, union_bwrite }, /* bwrite */ +#endif + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc union_vnodeop_opv_desc = + { &union_vnodeop_p, union_vnodeop_entries }; diff --git a/sys/gnu/ext2fs/ext2_bmap.c b/sys/gnu/ext2fs/ext2_bmap.c new file mode 100644 index 00000000000..bcd838d036a --- /dev/null +++ b/sys/gnu/ext2fs/ext2_bmap.c @@ -0,0 +1,294 @@ +/* + * Copyright (c) 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_bmap.c 8.6 (Berkeley) 1/21/94 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +/* + * Bmap converts a the logical block number of a file to its physical block + * number on the disk. The conversion is done by using the logical block + * number to index into the array of block pointers described by the dinode. + */ +int +ufs_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + } */ *ap; +{ + /* + * Check for underlying vnode requests and ensure that logical + * to physical mapping is requested. 
+ */ + if (ap->a_vpp != NULL) + *ap->a_vpp = VTOI(ap->a_vp)->i_devvp; + if (ap->a_bnp == NULL) + return (0); + + return (ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL, + ap->a_runp)); +} + +/* + * Indirect blocks are now on the vnode for the file. They are given negative + * logical block numbers. Indirect blocks are addressed by the negative + * address of the first data block to which they point. Double indirect blocks + * are addressed by one less than the address of the first indirect block to + * which they point. Triple indirect blocks are addressed by one less than + * the address of the first double indirect block to which they point. + * + * ufs_bmaparray does the bmap conversion, and if requested returns the + * array of logical blocks which must be traversed to get to a block. + * Each entry contains the offset into that block that gets you to the + * next block and the disk address of the block (if it is assigned). + */ + +int +ufs_bmaparray(vp, bn, bnp, ap, nump, runp) + struct vnode *vp; + register daddr_t bn; + daddr_t *bnp; + struct indir *ap; + int *nump; + int *runp; +{ + register struct inode *ip; + struct buf *bp; + struct ufsmount *ump; + struct mount *mp; + struct vnode *devvp; + struct indir a[NIADDR], *xap; + daddr_t daddr; + long metalbn; + int error, maxrun, num; + + ip = VTOI(vp); + mp = vp->v_mount; + ump = VFSTOUFS(mp); +#ifdef DIAGNOSTIC + if (ap != NULL && nump == NULL || ap == NULL && nump != NULL) + panic("ufs_bmaparray: invalid arguments"); +#endif + + if (runp) { + /* + * XXX + * If MAXBSIZE is the largest transfer the disks can handle, + * we probably want maxrun to be 1 block less so that we + * don't create a block larger than the device can handle. + */ + *runp = 0; + maxrun = MAXBSIZE / mp->mnt_stat.f_iosize - 1; + } + + xap = ap == NULL ? 
a : ap; + if (!nump) + nump = # + if (error = ufs_getlbns(vp, bn, xap, nump)) + return (error); + + num = *nump; + if (num == 0) { + *bnp = blkptrtodb(ump, ip->i_db[bn]); + if (*bnp == 0) + *bnp = -1; + else if (runp) + for (++bn; bn < NDADDR && *runp < maxrun && + is_sequential(ump, ip->i_db[bn - 1], ip->i_db[bn]); + ++bn, ++*runp); + return (0); + } + + + /* Get disk address out of indirect block array */ + daddr = ip->i_ib[xap->in_off]; + + devvp = VFSTOUFS(vp->v_mount)->um_devvp; + for (bp = NULL, ++xap; --num; ++xap) { + /* + * Exit the loop if there is no disk address assigned yet and + * the indirect block isn't in the cache, or if we were + * looking for an indirect block and we've found it. + */ + + metalbn = xap->in_lbn; + if (daddr == 0 && !incore(vp, metalbn) || metalbn == bn) + break; + /* + * If we get here, we've either got the block in the cache + * or we have a disk address for it, go fetch it. + */ + if (bp) + brelse(bp); + + xap->in_exists = 1; + bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0); + if (bp->b_flags & (B_DONE | B_DELWRI)) { + trace(TR_BREADHIT, pack(vp, size), metalbn); + } +#ifdef DIAGNOSTIC + else if (!daddr) + panic("ufs_bmaparry: indirect block not in cache"); +#endif + else { + trace(TR_BREADMISS, pack(vp, size), metalbn); + bp->b_blkno = blkptrtodb(ump, daddr); + bp->b_flags |= B_READ; + VOP_STRATEGY(bp); + curproc->p_stats->p_ru.ru_inblock++; /* XXX */ + if (error = biowait(bp)) { + brelse(bp); + return (error); + } + } + + daddr = ((daddr_t *)bp->b_data)[xap->in_off]; + if (num == 1 && daddr && runp) + for (bn = xap->in_off + 1; + bn < MNINDIR(ump) && *runp < maxrun && + is_sequential(ump, ((daddr_t *)bp->b_data)[bn - 1], + ((daddr_t *)bp->b_data)[bn]); + ++bn, ++*runp); + } + if (bp) + brelse(bp); + + daddr = blkptrtodb(ump, daddr); + *bnp = daddr == 0 ? 
-1 : daddr; + return (0); +} + +/* + * Create an array of logical block number/offset pairs which represent the + * path of indirect blocks required to access a data block. The first "pair" + * contains the logical block number of the appropriate single, double or + * triple indirect block and the offset into the inode indirect block array. + * Note, the logical block number of the inode single/double/triple indirect + * block appears twice in the array, once with the offset into the i_ib and + * once with the offset into the page itself. + */ +int +ufs_getlbns(vp, bn, ap, nump) + struct vnode *vp; + register daddr_t bn; + struct indir *ap; + int *nump; +{ + long metalbn, realbn; + struct ufsmount *ump; + int blockcnt, i, numlevels, off; + + ump = VFSTOUFS(vp->v_mount); + if (nump) + *nump = 0; + numlevels = 0; + realbn = bn; + if ((long)bn < 0) + bn = -(long)bn; + + /* The first NDADDR blocks are direct blocks. */ + if (bn < NDADDR) + return (0); + + /* + * Determine the number of levels of indirection. After this loop + * is done, blockcnt indicates the number of data blocks possible + * at the given level of indirection, and NIADDR - i is the number + * of levels of indirection needed to locate the requested block. + */ + for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) { + if (i == 0) + return (EFBIG); + blockcnt *= MNINDIR(ump); + if (bn < blockcnt) + break; + } + + /* Calculate the address of the first meta-block. */ + if (realbn >= 0) + metalbn = -(realbn - bn + NIADDR - i); + else + metalbn = -(-realbn - bn + NIADDR - i); + + /* + * At each iteration, off is the offset into the bap array which is + * an array of disk addresses at the current level of indirection. + * The logical block number and the offset in that block are stored + * into the argument array. 
+ */ + ap->in_lbn = metalbn; + ap->in_off = off = NIADDR - i; + ap->in_exists = 0; + ap++; + for (++numlevels; i <= NIADDR; i++) { + /* If searching for a meta-data block, quit when found. */ + if (metalbn == realbn) + break; + + blockcnt /= MNINDIR(ump); + off = (bn / blockcnt) % MNINDIR(ump); + + ++numlevels; + ap->in_lbn = metalbn; + ap->in_off = off; + ap->in_exists = 0; + ++ap; + + metalbn -= -1 + off * blockcnt; + } + if (nump) + *nump = numlevels; + return (0); +} diff --git a/sys/gnu/ext2fs/ext2_ihash.c b/sys/gnu/ext2fs/ext2_ihash.c new file mode 100644 index 00000000000..4a37c907ef6 --- /dev/null +++ b/sys/gnu/ext2fs/ext2_ihash.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_ihash.c 8.4 (Berkeley) 12/30/93 + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Structures associated with inode caching. + */ +struct inode **ihashtbl; +u_long ihash; /* size of hash table - 1 */ +#define INOHASH(device, inum) (((device) + (inum)) & ihash) + +/* + * Initialize inode hash table; hashinit() is handed &ihash, which INOHASH then uses as its mask. + */ +void +ufs_ihashinit() +{ + + ihashtbl = hashinit(desiredvnodes, M_UFSMNT, &ihash); +} + +/* + * Use the device/inum pair to find the incore inode, and return a pointer + * to its vnode, or NULL if no inode matches. If it is in core, return it, even if it is locked. + */ +struct vnode * +ufs_ihashlookup(device, inum) + dev_t device; + ino_t inum; +{ + register struct inode *ip; + + for (ip = ihashtbl[INOHASH(device, inum)];; ip = ip->i_next) { + if (ip == NULL) + return (NULL); + if (inum == ip->i_number && device == ip->i_dev) + return (ITOV(ip)); + } + /* NOTREACHED */ +} + +/* + * Use the device/inum pair to find the incore inode, and return a pointer + * to its vnode, or NULL if no inode matches. If it is in core, but locked, wait for it and rescan the chain; a failed vget() also restarts the scan. 
+ */ +struct vnode * +ufs_ihashget(device, inum) + dev_t device; + ino_t inum; +{ + register struct inode *ip; + struct vnode *vp; + + for (;;) + for (ip = ihashtbl[INOHASH(device, inum)];; ip = ip->i_next) { + if (ip == NULL) + return (NULL); + if (inum == ip->i_number && device == ip->i_dev) { + if (ip->i_flag & IN_LOCKED) { + ip->i_flag |= IN_WANTED; + sleep(ip, PINOD); + break; + } + vp = ITOV(ip); + if (!vget(vp, 1)) + return (vp); + break; + } + } + /* NOTREACHED */ +} + +/* + * Insert the inode at the head of its hash chain, and return it locked; panics if it is already locked. + */ +void +ufs_ihashins(ip) + struct inode *ip; +{ + struct inode **ipp, *iq; + + ipp = &ihashtbl[INOHASH(ip->i_dev, ip->i_number)]; + if (iq = *ipp) + iq->i_prev = &ip->i_next; + ip->i_next = iq; + ip->i_prev = ipp; + *ipp = ip; + if (ip->i_flag & IN_LOCKED) + panic("ufs_ihashins: already locked"); + if (curproc) + ip->i_lockholder = curproc->p_pid; + else + ip->i_lockholder = -1; + ip->i_flag |= IN_LOCKED; +} + +/* + * Remove the inode from the hash table by unlinking it through i_prev; under DIAGNOSTIC its chain links are then cleared. + */ +void +ufs_ihashrem(ip) + register struct inode *ip; +{ + register struct inode *iq; + + if (iq = ip->i_next) + iq->i_prev = ip->i_prev; + *ip->i_prev = iq; +#ifdef DIAGNOSTIC + ip->i_next = NULL; + ip->i_prev = NULL; +#endif +} diff --git a/sys/gnu/ext2fs/ext2_mount.h b/sys/gnu/ext2fs/ext2_mount.h new file mode 100644 index 00000000000..237871fdaac --- /dev/null +++ b/sys/gnu/ext2fs/ext2_mount.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufsmount.h 8.2 (Berkeley) 1/12/94 + */ + +struct buf; +struct inode; +struct nameidata; +struct timeval; +struct ucred; +struct uio; +struct vnode; +struct netexport; + +/* This structure describes the UFS specific mount structure data. 
*/ +struct ufsmount { + struct mount *um_mountp; /* filesystem vfs structure */ + dev_t um_dev; /* device mounted */ + struct vnode *um_devvp; /* block device mounted vnode */ + union { /* pointer to superblock */ + struct lfs *lfs; /* LFS */ + struct fs *fs; /* FFS */ + } ufsmount_u; +#define um_fs ufsmount_u.fs +#define um_lfs ufsmount_u.lfs + struct vnode *um_quotas[MAXQUOTAS]; /* pointer to quota files */ + struct ucred *um_cred[MAXQUOTAS]; /* quota file access cred */ + u_long um_nindir; /* indirect ptrs per block */ + u_long um_bptrtodb; /* indir ptr to disk block */ + u_long um_seqinc; /* inc between seq blocks */ + time_t um_btime[MAXQUOTAS]; /* block quota time limit */ + time_t um_itime[MAXQUOTAS]; /* inode quota time limit */ + char um_qflags[MAXQUOTAS]; /* quota specific flags */ + struct netexport um_export; /* export information */ +}; +/* + * Flags describing the state of quotas. + */ +#define QTF_OPENING 0x01 /* Q_QUOTAON in progress */ +#define QTF_CLOSING 0x02 /* Q_QUOTAOFF in progress */ + +/* Convert mount ptr to ufsmount ptr. */ +#define VFSTOUFS(mp) ((struct ufsmount *)((mp)->mnt_data)) + +/* + * Macros to access file system parameters in the ufsmount structure. + * Used by ufs_bmap. Each argument is parenthesized in the expansion. + */ +#define blkptrtodb(ump, b) ((b) << (ump)->um_bptrtodb) +#define is_sequential(ump, a, b) ((b) == (a) + (ump)->um_seqinc) +#define MNINDIR(ump) ((ump)->um_nindir) + + diff --git a/sys/gnu/ext2fs/inode.h b/sys/gnu/ext2fs/inode.h new file mode 100644 index 00000000000..df155967a7d --- /dev/null +++ b/sys/gnu/ext2fs/inode.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 1982, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)inode.h 8.4 (Berkeley) 1/21/94 + */ + +#include + +/* + * Theoretically, directories can be more than 2Gb in length, however, in + * practice this seems unlikely. 
So, we define the type doff_t as a long + * to keep down the cost of doing lookup on a 32-bit machine. If you are + * porting to a 64-bit architecture, you should make doff_t the same as off_t. + */ +#define doff_t long + +/* + * The inode is used to describe each active (or recently active) + * file in the UFS filesystem. It is composed of two types of + * information. The first part is the information that is needed + * only while the file is active (such as the identity of the file + * and linkage to speed its lookup). The second part is the + * permanent meta-data associated with the file which is read + * in from the permanent dinode from long term storage when the + * file becomes active, and is put back when the file is no longer + * being used. + */ +struct inode { + struct inode *i_next; /* Hash chain forward. */ + struct inode **i_prev; /* Hash chain back. */ + struct vnode *i_vnode; /* Vnode associated with this inode. */ + struct vnode *i_devvp; /* Vnode for block I/O. */ + u_long i_flag; /* IN_* flags. */ + dev_t i_dev; /* Device associated with the inode. */ + ino_t i_number; /* The identity of the inode. */ + union { /* Associated filesystem. */ + struct fs *fs; /* FFS */ + struct lfs *lfs; /* LFS */ + } inode_u; +#define i_fs inode_u.fs +#define i_lfs inode_u.lfs + struct dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */ + u_quad_t i_modrev; /* Revision level for lease. */ + struct lockf *i_lockf; /* Head of byte-level lock list. */ + pid_t i_lockholder; /* DEBUG: holder of inode lock. */ + pid_t i_lockwaiter; /* DEBUG: latest blocked for inode lock. */ + /* + * Side effects; used during directory lookup. + */ + long i_count; /* Size of free slot in directory. */ + doff_t i_endoff; /* End of useful stuff in directory. */ + doff_t i_diroff; /* Offset in dir, where we found last entry. */ + doff_t i_offset; /* Offset of free space in directory. */ + ino_t i_ino; /* Inode number of found directory. 
*/ + u_long i_reclen; /* Size of found directory entry. */ + long i_spare[11]; /* Spares to round up to 128 bytes. */ + /* + * The on-disk dinode itself. + */ + struct dinode i_din; /* 128 bytes of the on-disk dinode. */ +}; + +#define i_atime i_din.di_atime +#define i_blocks i_din.di_blocks +#define i_ctime i_din.di_ctime +#define i_db i_din.di_db +#define i_flags i_din.di_flags +#define i_gen i_din.di_gen +#define i_gid i_din.di_gid +#define i_ib i_din.di_ib +#define i_mode i_din.di_mode +#define i_mtime i_din.di_mtime +#define i_nlink i_din.di_nlink +#define i_rdev i_din.di_rdev +#define i_shortlink i_din.di_shortlink +#define i_size i_din.di_size +#define i_uid i_din.di_uid + +/* These flags are kept in i_flag. */ +#define IN_ACCESS 0x0001 /* Access time update request. */ +#define IN_CHANGE 0x0002 /* Inode change time update request. */ +#define IN_EXLOCK 0x0004 /* File has exclusive lock. */ +#define IN_LOCKED 0x0008 /* Inode lock. */ +#define IN_LWAIT 0x0010 /* Process waiting on file lock. */ +#define IN_MODIFIED 0x0020 /* Inode has been modified. */ +#define IN_RENAME 0x0040 /* Inode is being renamed. */ +#define IN_SHLOCK 0x0080 /* File has shared lock. */ +#define IN_UPDATE 0x0100 /* Modification time update request. */ +#define IN_WANTED 0x0200 /* Inode is wanted by a process. */ + +#ifdef KERNEL +/* + * Structure used to pass around logical block paths generated by + * ufs_getlbns and used by truncate and bmap code. + */ +struct indir { + daddr_t in_lbn; /* Logical block number. */ + int in_off; /* Offset in buffer. */ + int in_exists; /* Flag if the block exists. */ +}; + +/* Convert between inode pointers and vnode pointers. 
*/ +#define VTOI(vp) ((struct inode *)(vp)->v_data) +#define ITOV(ip) ((ip)->i_vnode) +/* Apply pending IN_ACCESS, IN_UPDATE and IN_CHANGE requests: stamp atime from t1, mtime from t2 (bumping i_modrev) and ctime from the global time, set IN_MODIFIED, then clear the three request flags. Expands to a bare { } block. */ +#define ITIMES(ip, t1, t2) { \ + if ((ip)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) { \ + (ip)->i_flag |= IN_MODIFIED; \ + if ((ip)->i_flag & IN_ACCESS) \ + (ip)->i_atime.ts_sec = (t1)->tv_sec; \ + if ((ip)->i_flag & IN_UPDATE) { \ + (ip)->i_mtime.ts_sec = (t2)->tv_sec; \ + (ip)->i_modrev++; \ + } \ + if ((ip)->i_flag & IN_CHANGE) \ + (ip)->i_ctime.ts_sec = time.tv_sec; \ + (ip)->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); \ + } \ +} + +/* This overlays the fid structure (see mount.h). */ +struct ufid { + u_short ufid_len; /* Length of structure. */ + u_short ufid_pad; /* Force long alignment. */ + ino_t ufid_ino; /* File number (ino). */ + long ufid_gen; /* Generation number. */ +}; +#endif /* KERNEL */ diff --git a/sys/gnu/fs/ext2fs/ext2_bmap.c b/sys/gnu/fs/ext2fs/ext2_bmap.c new file mode 100644 index 00000000000..bcd838d036a --- /dev/null +++ b/sys/gnu/fs/ext2fs/ext2_bmap.c @@ -0,0 +1,294 @@ +/* + * Copyright (c) 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_bmap.c 8.6 (Berkeley) 1/21/94 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +/* + * Bmap converts the logical block number of a file to its physical block + * number on the disk. The conversion is done by using the logical block + * number to index into the array of block pointers described by the dinode. + */ +int +ufs_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + } */ *ap; +{ + /* + * Check for underlying vnode requests and ensure that logical + * to physical mapping is requested. 
+ */ + if (ap->a_vpp != NULL) + *ap->a_vpp = VTOI(ap->a_vp)->i_devvp; + if (ap->a_bnp == NULL) + return (0); + + return (ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL, + ap->a_runp)); +} + +/* + * Indirect blocks are now on the vnode for the file. They are given negative + * logical block numbers. Indirect blocks are addressed by the negative + * address of the first data block to which they point. Double indirect blocks + * are addressed by one less than the address of the first indirect block to + * which they point. Triple indirect blocks are addressed by one less than + * the address of the first double indirect block to which they point. + * + * ufs_bmaparray does the bmap conversion, and if requested returns the + * array of logical blocks which must be traversed to get to a block. + * Each entry contains the offset into that block that gets you to the + * next block and the disk address of the block (if it is assigned). The result in *bnp is -1 when no disk block is assigned; *runp, if supplied, counts the sequential blocks that follow. + */ + +int +ufs_bmaparray(vp, bn, bnp, ap, nump, runp) + struct vnode *vp; + register daddr_t bn; + daddr_t *bnp; + struct indir *ap; + int *nump; + int *runp; +{ + register struct inode *ip; + struct buf *bp; + struct ufsmount *ump; + struct mount *mp; + struct vnode *devvp; + struct indir a[NIADDR+1], *xap; /* ufs_getlbns stores up to NIADDR + 1 entries */ + daddr_t daddr; + long metalbn; + int error, maxrun, num; + + ip = VTOI(vp); + mp = vp->v_mount; + ump = VFSTOUFS(mp); +#ifdef DIAGNOSTIC + if (ap != NULL && nump == NULL || ap == NULL && nump != NULL) + panic("ufs_bmaparray: invalid arguments"); +#endif + + if (runp) { + /* + * XXX + * If MAXBSIZE is the largest transfer the disks can handle, + * we probably want maxrun to be 1 block less so that we + * don't create a block larger than the device can handle. + */ + *runp = 0; + maxrun = MAXBSIZE / mp->mnt_stat.f_iosize - 1; + } + + xap = ap == NULL ? 
a : ap; + if (!nump) + nump = &num; + if (error = ufs_getlbns(vp, bn, xap, nump)) + return (error); + + num = *nump; + if (num == 0) { + *bnp = blkptrtodb(ump, ip->i_db[bn]); + if (*bnp == 0) + *bnp = -1; + else if (runp) + for (++bn; bn < NDADDR && *runp < maxrun && + is_sequential(ump, ip->i_db[bn - 1], ip->i_db[bn]); + ++bn, ++*runp); + return (0); + } + + + /* Get disk address out of indirect block array */ + daddr = ip->i_ib[xap->in_off]; + + devvp = VFSTOUFS(vp->v_mount)->um_devvp; + for (bp = NULL, ++xap; --num; ++xap) { + /* + * Exit the loop if there is no disk address assigned yet and + * the indirect block isn't in the cache, or if we were + * looking for an indirect block and we've found it. + */ + + metalbn = xap->in_lbn; + if (daddr == 0 && !incore(vp, metalbn) || metalbn == bn) + break; + /* + * If we get here, we've either got the block in the cache + * or we have a disk address for it, go fetch it. + */ + if (bp) + brelse(bp); + + xap->in_exists = 1; + bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0); + if (bp->b_flags & (B_DONE | B_DELWRI)) { + trace(TR_BREADHIT, pack(vp, size), metalbn); + } +#ifdef DIAGNOSTIC + else if (!daddr) + panic("ufs_bmaparray: indirect block not in cache"); +#endif + else { + trace(TR_BREADMISS, pack(vp, size), metalbn); + bp->b_blkno = blkptrtodb(ump, daddr); + bp->b_flags |= B_READ; + VOP_STRATEGY(bp); + curproc->p_stats->p_ru.ru_inblock++; /* XXX */ + if (error = biowait(bp)) { + brelse(bp); + return (error); + } + } + + daddr = ((daddr_t *)bp->b_data)[xap->in_off]; + if (num == 1 && daddr && runp) + for (bn = xap->in_off + 1; + bn < MNINDIR(ump) && *runp < maxrun && + is_sequential(ump, ((daddr_t *)bp->b_data)[bn - 1], + ((daddr_t *)bp->b_data)[bn]); + ++bn, ++*runp); + } + if (bp) + brelse(bp); + + daddr = blkptrtodb(ump, daddr); + *bnp = daddr == 0 ? 
-1 : daddr; + return (0); +} + +/* + * Create an array of logical block number/offset pairs which represent the + * path of indirect blocks required to access a data block. The first "pair" + * contains the logical block number of the appropriate single, double or + * triple indirect block and the offset into the inode indirect block array. + * Note, the logical block number of the inode single/double/triple indirect + * block appears twice in the array, once with the offset into the i_ib and + * once with the offset into the page itself. Returns EFBIG if bn needs more than NIADDR levels, else 0 with the pair count stored through nump when it is non-NULL. + */ +int +ufs_getlbns(vp, bn, ap, nump) + struct vnode *vp; + register daddr_t bn; + struct indir *ap; + int *nump; +{ + long metalbn, realbn; + struct ufsmount *ump; + int blockcnt, i, numlevels, off; + + ump = VFSTOUFS(vp->v_mount); + if (nump) + *nump = 0; + numlevels = 0; + realbn = bn; + if ((long)bn < 0) + bn = -(long)bn; + + /* The first NDADDR blocks are direct blocks. */ + if (bn < NDADDR) + return (0); + + /* + * Determine the number of levels of indirection. After this loop + * is done, blockcnt indicates the number of data blocks possible + * at the given level of indirection, and NIADDR - i is the offset into + * i_ib of the top indirect block (one less than the levels needed). + */ + for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) { + if (i == 0) + return (EFBIG); + blockcnt *= MNINDIR(ump); + if (bn < blockcnt) + break; + } + + /* Calculate the address of the first meta-block. */ + if (realbn >= 0) + metalbn = -(realbn - bn + NIADDR - i); + else + metalbn = -(-realbn - bn + NIADDR - i); + + /* + * At each iteration, off is the offset into the bap array which is + * an array of disk addresses at the current level of indirection. + * The logical block number and the offset in that block are stored + * into the argument array. 
+ */ + ap->in_lbn = metalbn; + ap->in_off = off = NIADDR - i; + ap->in_exists = 0; + ap++; + for (++numlevels; i <= NIADDR; i++) { + /* If searching for a meta-data block, quit when found. */ + if (metalbn == realbn) + break; + + blockcnt /= MNINDIR(ump); + off = (bn / blockcnt) % MNINDIR(ump); + + ++numlevels; + ap->in_lbn = metalbn; + ap->in_off = off; + ap->in_exists = 0; + ++ap; + + metalbn -= -1 + off * blockcnt; + } + if (nump) + *nump = numlevels; + return (0); +} diff --git a/sys/gnu/fs/ext2fs/ext2_mount.h b/sys/gnu/fs/ext2fs/ext2_mount.h new file mode 100644 index 00000000000..237871fdaac --- /dev/null +++ b/sys/gnu/fs/ext2fs/ext2_mount.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufsmount.h 8.2 (Berkeley) 1/12/94 + */ + +struct buf; +struct inode; +struct nameidata; +struct timeval; +struct ucred; +struct uio; +struct vnode; +struct netexport; + +/* This structure describes the UFS specific mount structure data. */ +struct ufsmount { + struct mount *um_mountp; /* filesystem vfs structure */ + dev_t um_dev; /* device mounted */ + struct vnode *um_devvp; /* block device mounted vnode */ + union { /* pointer to superblock */ + struct lfs *lfs; /* LFS */ + struct fs *fs; /* FFS */ + } ufsmount_u; +#define um_fs ufsmount_u.fs +#define um_lfs ufsmount_u.lfs + struct vnode *um_quotas[MAXQUOTAS]; /* pointer to quota files */ + struct ucred *um_cred[MAXQUOTAS]; /* quota file access cred */ + u_long um_nindir; /* indirect ptrs per block */ + u_long um_bptrtodb; /* indir ptr to disk block */ + u_long um_seqinc; /* inc between seq blocks */ + time_t um_btime[MAXQUOTAS]; /* block quota time limit */ + time_t um_itime[MAXQUOTAS]; /* inode quota time limit */ + char um_qflags[MAXQUOTAS]; /* quota specific flags */ + struct netexport um_export; /* export information */ +}; +/* + * Flags describing the state of quotas. + */ +#define QTF_OPENING 0x01 /* Q_QUOTAON in progress */ +#define QTF_CLOSING 0x02 /* Q_QUOTAOFF in progress */ + +/* Convert mount ptr to ufsmount ptr. */ +#define VFSTOUFS(mp) ((struct ufsmount *)((mp)->mnt_data)) + +/* + * Macros to access file system parameters in the ufsmount structure. 
+ * Used by ufs_bmap. Each argument is parenthesized in the expansion. + */ +#define blkptrtodb(ump, b) ((b) << (ump)->um_bptrtodb) +#define is_sequential(ump, a, b) ((b) == (a) + (ump)->um_seqinc) +#define MNINDIR(ump) ((ump)->um_nindir) + + diff --git a/sys/gnu/fs/ext2fs/inode.h b/sys/gnu/fs/ext2fs/inode.h new file mode 100644 index 00000000000..df155967a7d --- /dev/null +++ b/sys/gnu/fs/ext2fs/inode.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 1982, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)inode.h 8.4 (Berkeley) 1/21/94 + */ + +#include + +/* + * Theoretically, directories can be more than 2Gb in length, however, in + * practice this seems unlikely. So, we define the type doff_t as a long + * to keep down the cost of doing lookup on a 32-bit machine. If you are + * porting to a 64-bit architecture, you should make doff_t the same as off_t. + */ +#define doff_t long + +/* + * The inode is used to describe each active (or recently active) + * file in the UFS filesystem. It is composed of two types of + * information. The first part is the information that is needed + * only while the file is active (such as the identity of the file + * and linkage to speed its lookup). The second part is the + * permanent meta-data associated with the file which is read + * in from the permanent dinode from long term storage when the + * file becomes active, and is put back when the file is no longer + * being used. + */ +struct inode { + struct inode *i_next; /* Hash chain forward. */ + struct inode **i_prev; /* Hash chain back. */ + struct vnode *i_vnode; /* Vnode associated with this inode. */ + struct vnode *i_devvp; /* Vnode for block I/O. */ + u_long i_flag; /* IN_* flags. */ + dev_t i_dev; /* Device associated with the inode. */ + ino_t i_number; /* The identity of the inode. */ + union { /* Associated filesystem. 
*/ + struct fs *fs; /* FFS */ + struct lfs *lfs; /* LFS */ + } inode_u; +#define i_fs inode_u.fs +#define i_lfs inode_u.lfs + struct dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */ + u_quad_t i_modrev; /* Revision level for lease. */ + struct lockf *i_lockf; /* Head of byte-level lock list. */ + pid_t i_lockholder; /* DEBUG: holder of inode lock. */ + pid_t i_lockwaiter; /* DEBUG: latest blocked for inode lock. */ + /* + * Side effects; used during directory lookup. + */ + long i_count; /* Size of free slot in directory. */ + doff_t i_endoff; /* End of useful stuff in directory. */ + doff_t i_diroff; /* Offset in dir, where we found last entry. */ + doff_t i_offset; /* Offset of free space in directory. */ + ino_t i_ino; /* Inode number of found directory. */ + u_long i_reclen; /* Size of found directory entry. */ + long i_spare[11]; /* Spares to round up to 128 bytes. */ + /* + * The on-disk dinode itself. + */ + struct dinode i_din; /* 128 bytes of the on-disk dinode. */ +}; + +#define i_atime i_din.di_atime +#define i_blocks i_din.di_blocks +#define i_ctime i_din.di_ctime +#define i_db i_din.di_db +#define i_flags i_din.di_flags +#define i_gen i_din.di_gen +#define i_gid i_din.di_gid +#define i_ib i_din.di_ib +#define i_mode i_din.di_mode +#define i_mtime i_din.di_mtime +#define i_nlink i_din.di_nlink +#define i_rdev i_din.di_rdev +#define i_shortlink i_din.di_shortlink +#define i_size i_din.di_size +#define i_uid i_din.di_uid + +/* These flags are kept in i_flag. */ +#define IN_ACCESS 0x0001 /* Access time update request. */ +#define IN_CHANGE 0x0002 /* Inode change time update request. */ +#define IN_EXLOCK 0x0004 /* File has exclusive lock. */ +#define IN_LOCKED 0x0008 /* Inode lock. */ +#define IN_LWAIT 0x0010 /* Process waiting on file lock. */ +#define IN_MODIFIED 0x0020 /* Inode has been modified. */ +#define IN_RENAME 0x0040 /* Inode is being renamed. */ +#define IN_SHLOCK 0x0080 /* File has shared lock. 
*/ +#define IN_UPDATE 0x0100 /* Modification time update request. */ +#define IN_WANTED 0x0200 /* Inode is wanted by a process. */ + +#ifdef KERNEL +/* + * Structure used to pass around logical block paths generated by + * ufs_getlbns and used by truncate and bmap code. + */ +struct indir { + daddr_t in_lbn; /* Logical block number. */ + int in_off; /* Offset in buffer. */ + int in_exists; /* Flag if the block exists. */ +}; + +/* Convert between inode pointers and vnode pointers. */ +#define VTOI(vp) ((struct inode *)(vp)->v_data) +#define ITOV(ip) ((ip)->i_vnode) +/* Apply pending IN_ACCESS, IN_UPDATE and IN_CHANGE requests: stamp atime from t1, mtime from t2 (bumping i_modrev) and ctime from the global time, set IN_MODIFIED, then clear the three request flags. Expands to a bare { } block. */ +#define ITIMES(ip, t1, t2) { \ + if ((ip)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) { \ + (ip)->i_flag |= IN_MODIFIED; \ + if ((ip)->i_flag & IN_ACCESS) \ + (ip)->i_atime.ts_sec = (t1)->tv_sec; \ + if ((ip)->i_flag & IN_UPDATE) { \ + (ip)->i_mtime.ts_sec = (t2)->tv_sec; \ + (ip)->i_modrev++; \ + } \ + if ((ip)->i_flag & IN_CHANGE) \ + (ip)->i_ctime.ts_sec = time.tv_sec; \ + (ip)->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); \ + } \ +} + +/* This overlays the fid structure (see mount.h). */ +struct ufid { + u_short ufid_len; /* Length of structure. */ + u_short ufid_pad; /* Force long alignment. */ + ino_t ufid_ino; /* File number (ino). */ + long ufid_gen; /* Generation number. */ +}; +#endif /* KERNEL */ diff --git a/sys/isofs/cd9660/TODO b/sys/isofs/cd9660/TODO new file mode 100644 index 00000000000..555d26ad7d1 --- /dev/null +++ b/sys/isofs/cd9660/TODO @@ -0,0 +1,77 @@ +# $Id: TODO,v 1.4 1993/09/07 15:40:51 ws Exp $ + + 1) should understand "older", original High Sierra ("CDROM001") type + + Not yet. ( I don't have this technical information, yet. ) + + 2) should understand Rock Ridge + + Yes, we have follows function. 
+ + o Symbolic Link + o Real Name(long name) + o File Attribute + o Time stamp + o uid, gid + o Devices + o Relocated directories + + Except for the following: + + o POSIX device number mapping + + There is some preliminary stuff in there that (ab-)uses the mknod + system call, but this needs a writable filesystem + + 3) should be called cdfs, as other ISO file systems may soon be possible + + Not yet. Probably we should make another file system when the ECMA draft + is valid and do it. To do Rock Ridge support, I can use almost the same + code. So I just use the same file system interface... + + 4) should have file handles implemented for use with NFS, etc + + Yes. We already have this one, and I based this release on it. + + 5) should have name translation enabled by mount flag + + Yes. We can disable the Rock Ridge Extension with the following option: + + "mount -t isofs -o -norrip /dev/cd0d /cdrom" + + 6) should run as a user process, and not take up kernel space (cdroms + are slow) + + Not yet. + + 7) ECMA support. + + Not yet. We need not only a technical spec but also an ECMA format + cd-rom itself! + + 8) Character set change by SVD ( multi SVD support ) + + Not yet. We should also make the other parts of the system 8 bit + clean. As far as I know, if you export the cdrom by NFS, the client + can access it 8 bit clean (i.e. Solaris Japanese with EUC code ) + + 9) Access checks in isofs_access + + Not yet. + + 10) Support for generation numbers + + Yes. Default is to list only the last file (the one with the highest + generation number). If you mount with -gen, all files are shown with + their generation numbers. In both cases you can specify the generation + number on opening files (if you happen to know it) or leave it off, + when it will again find the last file. + + 11) Support for extended attributes + + Yes. Since this requires an extra block buffer for the attributes + this must be enabled on mounting with the option -extattr.
+ +---------- +Last update July 19, '93 by Atsushi Murai. (amurai@spec.co.jp) +Last update August 19, '93 by Wolfgang Solfrank. (ws@tools.de) diff --git a/sys/isofs/cd9660/TODO.hibler b/sys/isofs/cd9660/TODO.hibler new file mode 100644 index 00000000000..3501aa296cd --- /dev/null +++ b/sys/isofs/cd9660/TODO.hibler @@ -0,0 +1,22 @@ +1. Investigate making ISOFS another UFS shared filesystem (ala FFS/MFS/LFS). + Since it was modelled after the inode code, we might be able to merge + them back. It looks like a separate (but very similar) lookup routine + will be needed due to the associated file stuff. + +2. Make filesystem exportable. This comes for free if stacked with UFS. + Otherwise, the ufs_export routines need to be elevated to vfs_* routines. + [ DONE - hibler ] + +3. If it can't be merged with UFS, at least get them in sync. For example, + it could use the same style hashing routines as in ufs/ufs_ihash.c + +4. It would be nice to be able to use the vfs_cluster code. + Unfortunately, if the logical block size is smaller than the page size, + it won't work. Also, if throughput is relatively constant for any + block size (as it is for the HP drive--150kbs) then clustering may not + buy much (or may even hurt when vfs_cluster comes up with a large sync + cluster). + +5. Seems like there should be a "notrans" or some such mount option to show + filenames as they really are without lower-casing, stripping of version + numbers, etc. Does this make sense? diff --git a/sys/isofs/cd9660/cd9660_bmap.c b/sys/isofs/cd9660/cd9660_bmap.c new file mode 100644 index 00000000000..911eedfd06a --- /dev/null +++ b/sys/isofs/cd9660/cd9660_bmap.c @@ -0,0 +1,102 @@ +/*- + * Copyright (c) 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley + * by Pace Willisson (pace@blitz.com).
The Rock Ridge Extension + * Support code is derived from software contributed to Berkeley + * by Atsushi Murai (amurai@spec.co.jp). + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)cd9660_bmap.c 8.3 (Berkeley) 1/23/94 + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * Bmap converts the logical block number of a file to its physical block + * number on the disk. The conversion is done by using the logical block + * number to index into the data block (extent) for the file. Always returns 0; when a_bnp is NULL only *a_vpp (if a_vpp is non-NULL) is filled in. + */ +int +cd9660_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + } */ *ap; +{ + struct iso_node *ip = VTOI(ap->a_vp); + daddr_t lblkno = ap->a_bn; + long bsize; + + /* + * Check for underlying vnode requests and ensure that logical + * to physical mapping is requested. + */ + if (ap->a_vpp != NULL) + *ap->a_vpp = ip->i_devvp; + if (ap->a_bnp == NULL) + return (0); + + /* + * Compute the requested block number: (iso_start + a_bn) scaled by btodb(bsize) + */ + bsize = ip->i_mnt->logical_block_size; + *ap->a_bnp = (ip->iso_start + lblkno) * btodb(bsize); + + /* + * Determine maximum number of readahead blocks following the + * requested block, clamped to the range 0 .. MAXBSIZE/bsize - 1. + */ + if (ap->a_runp) { + int nblk; + + nblk = (ip->i_size - (lblkno + 1) * bsize) / bsize; + if (nblk <= 0) + *ap->a_runp = 0; + else if (nblk >= MAXBSIZE/bsize) + *ap->a_runp = MAXBSIZE/bsize - 1; + else + *ap->a_runp = nblk; + } + + return 0; +} diff --git a/sys/isofs/cd9660/cd9660_lookup.c b/sys/isofs/cd9660/cd9660_lookup.c new file mode 100644 index 00000000000..62d1d3fc791 --- /dev/null +++ b/sys/isofs/cd9660/cd9660_lookup.c @@ -0,0 +1,465 @@ +/*- + * Copyright (c) 1989, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley + * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension + * Support code is derived from software contributed to Berkeley + * by Atsushi Murai (amurai@spec.co.jp).
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: @(#)ufs_lookup.c 7.33 (Berkeley) 5/19/91 + * + * @(#)cd9660_lookup.c 8.2 (Berkeley) 1/23/94 + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +struct nchstats iso_nchstats; + +/* + * Convert a component of a pathname into a pointer to a locked inode. + * This is a very central and rather complicated routine. + * If the file system is not maintained in a strict tree hierarchy, + * this can result in a deadlock situation (see comments in code below). + * + * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on + * whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * If flag has LOCKPARENT or'ed into it and the target of the pathname + * exists, lookup returns both the target and its parent directory locked. + * When creating or renaming and LOCKPARENT is specified, the target may + * not be ".". When deleting and LOCKPARENT is specified, the target may + * be "."., but the caller must check to ensure it does an vrele and iput + * instead of two iputs. + * + * Overall outline of ufs_lookup: + * + * check accessibility of directory + * look for name in cache, if found, then if at end of path + * and deleting or creating, drop it, else return name + * search for name in directory, to found or notfound + * notfound: + * if creating, return locked directory, leaving info on available slots + * else return error + * found: + * if at end of path and deleting, return information to allow delete + * if at end of path and rewriting (RENAME and LOCKPARENT), lock target + * inode and return info to allow rewrite + * if not at end, add name to cache; if at end and neither creating + * nor deleting, add name to cache + * + * NOTE: (LOOKUP | LOCKPARENT) currently returns the parent inode unlocked. 
+ */ +cd9660_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + register struct vnode *vdp; /* vnode for directory being searched */ + register struct iso_node *dp; /* inode for directory being searched */ + register struct iso_mnt *imp; /* file system that directory is in */ + struct buf *bp; /* a buffer of directory entries */ + struct iso_directory_record *ep;/* the current directory entry */ + int entryoffsetinblock; /* offset of ep in bp's buffer */ + int saveoffset; /* offset of last directory entry in dir */ + int numdirpasses; /* strategy for directory search */ + doff_t endsearch; /* offset to end directory search */ + struct iso_node *pdp; /* saved dp during symlink work */ + struct iso_node *tdp; /* returned by iget */ + int lockparent; /* 1 => lockparent flag is set */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int error; + ino_t ino = 0; + int reclen; + u_short namelen; + char altname[NAME_MAX]; + int res; + int assoc, len; + char *name; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + struct ucred *cred = cnp->cn_cred; + int flags = cnp->cn_flags; + int nameiop = cnp->cn_nameiop; + + bp = NULL; + *vpp = NULL; + vdp = ap->a_dvp; + dp = VTOI(vdp); + imp = dp->i_mnt; + lockparent = flags & LOCKPARENT; + wantparent = flags & (LOCKPARENT|WANTPARENT); + + /* + * Check accessiblity of directory. + */ + if (vdp->v_type != VDIR) + return (ENOTDIR); + if (error = VOP_ACCESS(vdp, VEXEC, cred, cnp->cn_proc)) + return (error); + + /* + * We now have a segment name to search for, and a directory to search. + * + * Before tediously performing a linear scan of the directory, + * check the name cache to see if the directory/name pair + * we are looking for is known already. 
+ */ + if (error = cache_lookup(vdp, vpp, cnp)) { + int vpid; /* capability number of vnode */ + + if (error == ENOENT) + return (error); +#ifdef PARANOID + if ((vdp->v_flag & VROOT) && (flags & ISDOTDOT)) + panic("ufs_lookup: .. through root"); +#endif + /* + * Get the next vnode in the path. + * See comment below starting `Step through' for + * an explaination of the locking protocol. + */ + pdp = dp; + dp = VTOI(*vpp); + vdp = *vpp; + vpid = vdp->v_id; + if (pdp == dp) { + VREF(vdp); + error = 0; + } else if (flags & ISDOTDOT) { + ISO_IUNLOCK(pdp); + error = vget(vdp, 1); + if (!error && lockparent && (flags & ISLASTCN)) + ISO_ILOCK(pdp); + } else { + error = vget(vdp, 1); + if (!lockparent || error || !(flags & ISLASTCN)) + ISO_IUNLOCK(pdp); + } + /* + * Check that the capability number did not change + * while we were waiting for the lock. + */ + if (!error) { + if (vpid == vdp->v_id) + return (0); + iso_iput(dp); + if (lockparent && pdp != dp && (flags & ISLASTCN)) + ISO_IUNLOCK(pdp); + } + ISO_ILOCK(pdp); + dp = pdp; + vdp = ITOV(dp); + *vpp = NULL; + } + + len = cnp->cn_namelen; + name = cnp->cn_nameptr; + /* + * A leading `=' means, we are looking for an associated file + */ + if (assoc = (imp->iso_ftype != ISO_FTYPE_RRIP && *name == ASSOCCHAR)) { + len--; + name++; + } + + /* + * If there is cached information on a previous search of + * this directory, pick up where we last left off. + * We cache only lookups as these are the most common + * and have the greatest payoff. Caching CREATE has little + * benefit as it usually must search the entire directory + * to determine that the entry does not exist. Caching the + * location of the last DELETE or RENAME has not reduced + * profiling time and hence has been removed in the interest + * of simplicity. 
+ */ + if (nameiop != LOOKUP || dp->i_diroff == 0 || + dp->i_diroff > dp->i_size) { + entryoffsetinblock = 0; + dp->i_offset = 0; + numdirpasses = 1; + } else { + dp->i_offset = dp->i_diroff; + entryoffsetinblock = iso_blkoff(imp, dp->i_offset); + if (entryoffsetinblock != 0) { + if (error = iso_blkatoff(dp, dp->i_offset, &bp)) + return (error); + } + numdirpasses = 2; + iso_nchstats.ncs_2passes++; + } + endsearch = roundup(dp->i_size, imp->logical_block_size); + +searchloop: + while (dp->i_offset < endsearch) { + /* + * If offset is on a block boundary, + * read the next directory block. + * Release previous if it exists. + */ + if (iso_blkoff(imp, dp->i_offset) == 0) { + if (bp != NULL) + brelse(bp); + if (error = iso_blkatoff(dp, dp->i_offset, &bp)) + return (error); + entryoffsetinblock = 0; + } + /* + * Get pointer to next entry. + */ + ep = (struct iso_directory_record *) + (bp->b_un.b_addr + entryoffsetinblock); + + reclen = isonum_711 (ep->length); + if (reclen == 0) { + /* skip to next block, if any */ + dp->i_offset = + roundup(dp->i_offset, imp->logical_block_size); + continue; + } + + if (reclen < ISO_DIRECTORY_RECORD_SIZE) + /* illegal entry, stop */ + break; + + if (entryoffsetinblock + reclen > imp->logical_block_size) + /* entries are not allowed to cross boundaries */ + break; + + /* + * Check for a name match. + */ + namelen = isonum_711(ep->name_len); + + if (reclen < ISO_DIRECTORY_RECORD_SIZE + namelen) + /* illegal entry, stop */ + break; + + switch (imp->iso_ftype) { + default: + if ((!(isonum_711(ep->flags)&4)) == !assoc) { + if ((len == 1 + && *name == '.') + || (flags & ISDOTDOT)) { + if (namelen == 1 + && ep->name[0] == ((flags & ISDOTDOT) ? 1 : 0)) { + /* + * Save directory entry's inode number and + * reclen in ndp->ni_ufs area, and release + * directory buffer. 
+ */ + isodirino(&dp->i_ino,ep,imp); + goto found; + } + if (namelen != 1 + || ep->name[0] != 0) + goto notfound; + } else if (!(res = isofncmp(name,len, + ep->name,namelen))) { + if (isonum_711(ep->flags)&2) + isodirino(&ino,ep,imp); + else + ino = dbtob(bp->b_blkno) + + entryoffsetinblock; + saveoffset = dp->i_offset; + } else if (ino) + goto foundino; +#ifdef NOSORTBUG /* On some CDs directory entries are not sorted correctly */ + else if (res < 0) + goto notfound; + else if (res > 0 && numdirpasses == 2) + numdirpasses++; +#endif + } + break; + case ISO_FTYPE_RRIP: + if (isonum_711(ep->flags)&2) + isodirino(&ino,ep,imp); + else + ino = dbtob(bp->b_blkno) + entryoffsetinblock; + dp->i_ino = ino; + cd9660_rrip_getname(ep,altname,&namelen,&dp->i_ino,imp); + if (namelen == cnp->cn_namelen + && !bcmp(name,altname,namelen)) + goto found; + ino = 0; + break; + } + dp->i_offset += reclen; + entryoffsetinblock += reclen; + } + if (ino) { +foundino: + dp->i_ino = ino; + if (saveoffset != dp->i_offset) { + if (iso_lblkno(imp,dp->i_offset) + != iso_lblkno(imp,saveoffset)) { + if (bp != NULL) + brelse(bp); + if (error = iso_blkatoff(dp, saveoffset, &bp)) + return (error); + } + ep = (struct iso_directory_record *)(bp->b_un.b_addr + + iso_blkoff(imp,saveoffset)); + dp->i_offset = saveoffset; + } + goto found; + } +notfound: + /* + * If we started in the middle of the directory and failed + * to find our target, we must check the beginning as well. + */ + if (numdirpasses == 2) { + numdirpasses--; + dp->i_offset = 0; + endsearch = dp->i_diroff; + goto searchloop; + } + if (bp != NULL) + brelse(bp); + /* + * Insert name into cache (as non-existent) if appropriate. + */ + if (cnp->cn_flags & MAKEENTRY) + cache_enter(vdp, *vpp, cnp); + if (nameiop == CREATE || nameiop == RENAME) + return (EJUSTRETURN); + return (ENOENT); + +found: + if (numdirpasses == 2) + iso_nchstats.ncs_pass2++; + if (bp != NULL) + brelse(bp); + + /* + * Found component in pathname. 
+ * If the final component of path name, save information + * in the cache as to where the entry was found. + */ + if ((flags & ISLASTCN) && nameiop == LOOKUP) + dp->i_diroff = dp->i_offset; + + /* + * Step through the translation in the name. We do not `iput' the + * directory because we may need it again if a symbolic link + * is relative to the current directory. Instead we save it + * unlocked as "pdp". We must get the target inode before unlocking + * the directory to insure that the inode will not be removed + * before we get it. We prevent deadlock by always fetching + * inodes from the root, moving down the directory tree. Thus + * when following backward pointers ".." we must unlock the + * parent directory before getting the requested directory. + * There is a potential race condition here if both the current + * and parent directories are removed before the `iget' for the + * inode associated with ".." returns. We hope that this occurs + * infrequently since we cannot avoid this race condition without + * implementing a sophisticated deadlock detection algorithm. + * Note also that this simple deadlock detection scheme will not + * work if the file system has any hard links other than ".." + * that point backwards in the directory structure. + */ + pdp = dp; + /* + * If ino is different from dp->i_ino, + * it's a relocated directory. + */ + if (flags & ISDOTDOT) { + ISO_IUNLOCK(pdp); /* race to get the inode */ + if (error = iso_iget(dp,dp->i_ino, + dp->i_ino != ino, + &tdp,ep)) { + ISO_ILOCK(pdp); + return (error); + } + if (lockparent && (flags & ISLASTCN)) + ISO_ILOCK(pdp); + *vpp = ITOV(tdp); + } else if (dp->i_number == dp->i_ino) { + VREF(vdp); /* we want ourself, ie "." */ + *vpp = vdp; + } else { + if (error = iso_iget(dp,dp->i_ino,dp->i_ino!=ino,&tdp,ep)) + return (error); + if (!lockparent || !(flags & ISLASTCN)) + ISO_IUNLOCK(pdp); + *vpp = ITOV(tdp); + } + + /* + * Insert name into cache if appropriate. 
+ */ + if (cnp->cn_flags & MAKEENTRY) + cache_enter(vdp, *vpp, cnp); + return (0); +} + +/* + * Return buffer with contents of block "offset" + * from the beginning of directory "ip". If "res" + * is non-zero, fill it in with a pointer to the + * remaining space in the directory. + */ +iso_blkatoff(ip, offset, bpp) + struct iso_node *ip; + doff_t offset; + struct buf **bpp; +{ + register struct iso_mnt *imp = ip->i_mnt; + daddr_t lbn = iso_lblkno(imp,offset); + int bsize = iso_blksize(imp,ip,lbn); + struct buf *bp; + int error; + + if (error = bread(ITOV(ip),lbn,bsize,NOCRED,&bp)) { + brelse(bp); + *bpp = 0; + return (error); + } + *bpp = bp; + + return (0); +} diff --git a/sys/isofs/cd9660/cd9660_node.c b/sys/isofs/cd9660/cd9660_node.c new file mode 100644 index 00000000000..d83a7a6f126 --- /dev/null +++ b/sys/isofs/cd9660/cd9660_node.c @@ -0,0 +1,648 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley + * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension + * Support code is derived from software contributed to Berkeley + * by Atsushi Murai (amurai@spec.co.jp). + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)cd9660_node.c 8.2 (Berkeley) 1/23/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define INOHSZ 512 +#if ((INOHSZ&(INOHSZ-1)) == 0) +#define INOHASH(dev,ino) (((dev)+((ino)>>12))&(INOHSZ-1)) +#else +#define INOHASH(dev,ino) (((unsigned)((dev)+((ino)>>12)))%INOHSZ) +#endif + +union iso_ihead { + union iso_ihead *ih_head[2]; + struct iso_node *ih_chain[2]; +} iso_ihead[INOHSZ]; + +#ifdef ISODEVMAP +#define DNOHSZ 64 +#if ((DNOHSZ&(DNOHSZ-1)) == 0) +#define DNOHASH(dev,ino) (((dev)+((ino)>>12))&(DNOHSZ-1)) +#else +#define DNOHASH(dev,ino) (((unsigned)((dev)+((ino)>>12)))%DNOHSZ) +#endif + +union iso_dhead { + union iso_dhead *dh_head[2]; + struct iso_dnode *dh_chain[2]; +} iso_dhead[DNOHSZ]; +#endif + +int prtactive; /* 1 => print out reclaim of active vnodes */ + +/* + * Initialize hash links for inodes and dnodes. 
+ */ +cd9660_init() +{ + register int i; + register union iso_ihead *ih = iso_ihead; +#ifdef ISODEVMAP + register union iso_dhead *dh = iso_dhead; +#endif + + for (i = INOHSZ; --i >= 0; ih++) { /* a head linked to itself is an empty chain */ + ih->ih_head[0] = ih; + ih->ih_head[1] = ih; + } +#ifdef ISODEVMAP + for (i = DNOHSZ; --i >= 0; dh++) { + dh->dh_head[0] = dh; + dh->dh_head[1] = dh; + } +#endif +} + +#ifdef ISODEVMAP +/* + * Find the (dev,ino) node in the device hash list; if it is absent, allocate and enter a new node when "create" is non-zero, otherwise return a null pointer + */ +struct iso_dnode * +iso_dmap(dev,ino,create) + dev_t dev; + ino_t ino; + int create; +{ + struct iso_dnode *dp; + union iso_dhead *dh; + + dh = &iso_dhead[DNOHASH(dev, ino)]; + for (dp = dh->dh_chain[0]; + dp != (struct iso_dnode *)dh; + dp = dp->d_forw) + if (ino == dp->i_number && dev == dp->i_dev) + return dp; + + if (!create) + return (struct iso_dnode *)0; + + MALLOC(dp,struct iso_dnode *,sizeof(struct iso_dnode),M_CACHE,M_WAITOK); + dp->i_dev = dev; + dp->i_number = ino; + insque(dp,dh); + + return dp; +} + +void +iso_dunmap(dev) /* remove and free every device-map node recorded for dev */ + dev_t dev; +{ + struct iso_dnode *dp, *dq; + union iso_dhead *dh; + + for (dh = iso_dhead; dh < iso_dhead + DNOHSZ; dh++) { + for (dp = dh->dh_chain[0]; + dp != (struct iso_dnode *)dh; + dp = dq) { + dq = dp->d_forw; /* fetch the successor before dp may be freed */ + if (dev == dp->i_dev) { + remque(dp); + FREE(dp,M_CACHE); + } + } + } +} +#endif + +/* + * Look up an ISOFS dinode number to find its incore vnode. + * If it is not in core, read it in from the specified device. + * If it is in core, wait for the lock bit to clear, then + * return the inode locked. Detection and handling of mount + * points must be done by the calling routine.
+ */ +iso_iget(xp, ino, relocated, ipp, isodir) + struct iso_node *xp; + ino_t ino; + struct iso_node **ipp; + struct iso_directory_record *isodir; +{ + dev_t dev = xp->i_dev; + struct mount *mntp = ITOV(xp)->v_mount; + register struct iso_node *ip, *iq; + register struct vnode *vp; + register struct iso_dnode *dp; + struct vnode *nvp; + struct buf *bp = NULL, *bp2 = NULL; + union iso_ihead *ih; + union iso_dhead *dh; + int i, error, result; + struct iso_mnt *imp; + ino_t defino; + + ih = &iso_ihead[INOHASH(dev, ino)]; +loop: + for (ip = ih->ih_chain[0]; + ip != (struct iso_node *)ih; + ip = ip->i_forw) { + if (ino != ip->i_number || dev != ip->i_dev) + continue; + if ((ip->i_flag&ILOCKED) != 0) { + ip->i_flag |= IWANT; + sleep((caddr_t)ip, PINOD); + goto loop; + } + if (vget(ITOV(ip), 1)) + goto loop; + *ipp = ip; + return 0; + } + /* + * Allocate a new vnode/iso_node. + */ + if (error = getnewvnode(VT_ISOFS, mntp, cd9660_vnodeop_p, &nvp)) { + *ipp = 0; + return error; + } + MALLOC(ip, struct iso_node *, sizeof(struct iso_node), + M_ISOFSNODE, M_WAITOK); + bzero((caddr_t)ip, sizeof(struct iso_node)); + nvp->v_data = ip; + ip->i_vnode = nvp; + ip->i_flag = 0; + ip->i_devvp = 0; + ip->i_diroff = 0; + ip->i_lockf = 0; + + /* + * Put it onto its hash chain and lock it so that other requests for + * this inode will block if they arrive while we are sleeping waiting + * for old data structures to be purged or for the contents of the + * disk portion of this inode to be read. + */ + ip->i_dev = dev; + ip->i_number = ino; + insque(ip, ih); + ISO_ILOCK(ip); + + imp = VFSTOISOFS (mntp); + ip->i_mnt = imp; + ip->i_devvp = imp->im_devvp; + VREF(ip->i_devvp); + + if (relocated) { + /* + * On relocated directories we must + * read the `.' entry out of a dir.
 + */ + ip->iso_start = ino >> imp->im_bshift; + if (error = iso_blkatoff(ip,0,&bp)) { + vrele(ip->i_devvp); ip->i_devvp = 0; /* cleared so cd9660_reclaim does not vrele it a second time */ + remque(ip); + ip->i_forw = ip; + ip->i_back = ip; + iso_iput(ip); + *ipp = 0; + return error; + } + isodir = (struct iso_directory_record *)bp->b_un.b_addr; + } + + ip->iso_extent = isonum_733(isodir->extent); + ip->i_size = isonum_733(isodir->size); + ip->iso_start = isonum_711(isodir->ext_attr_length) + ip->iso_extent; + + vp = ITOV(ip); + + /* + * Set up time stamps and attributes + */ + vp->v_type = VNON; + switch (imp->iso_ftype) { + default: /* ISO_FTYPE_9660 */ + if ((imp->im_flags&ISOFSMNT_EXTATT) + && isonum_711(isodir->ext_attr_length)) + iso_blkatoff(ip,-isonum_711(isodir->ext_attr_length) * imp->logical_block_size, /* byte offset, scaled as in cd9660_defattr */ + &bp2); + cd9660_defattr(isodir,ip,bp2 ); + cd9660_deftstamp(isodir,ip,bp2 ); + break; + case ISO_FTYPE_RRIP: + result = cd9660_rrip_analyze(isodir,ip,imp); + break; + } + if (bp2) + brelse(bp2); + if (bp) + brelse(bp); + + /* + * Initialize the associated vnode + */ + vp->v_type = IFTOVT(ip->inode.iso_mode); + + if ( vp->v_type == VFIFO ) { +#ifdef FIFO + extern int (**cd9660_fifoop_p)(); + vp->v_op = cd9660_fifoop_p; +#else + iso_iput(ip); + *ipp = 0; + return EOPNOTSUPP; +#endif /* FIFO */ + } else if ( vp->v_type == VCHR || vp->v_type == VBLK ) { + extern int (**cd9660_specop_p)(); + + /* + * if device, look at device number table for translation + */ +#ifdef ISODEVMAP + if (dp = iso_dmap(dev,ino,0)) + ip->inode.iso_rdev = dp->d_dev; +#endif + vp->v_op = cd9660_specop_p; + if (nvp = checkalias(vp, ip->inode.iso_rdev, mntp)) { + /* + * Reinitialize aliased inode.
 + */ + vp = nvp; + iq = VTOI(vp); + iq->i_vnode = vp; + iq->i_flag = 0; + ISO_ILOCK(iq); + iq->i_dev = dev; + iq->i_number = ino; + iq->i_mnt = ip->i_mnt; + bcopy(&ip->iso_extent,&iq->iso_extent, + (char *)(ip + 1) - (char *)&ip->iso_extent); + insque(iq, ih); + /* + * Discard unneeded vnode + * (This introduces the need of INACTIVE modification) + */ + ip->inode.iso_mode = 0; + iso_iput(ip); + ip = iq; + } + } + + if (ip->iso_extent == imp->root_extent) + vp->v_flag |= VROOT; + + *ipp = ip; + return 0; +} + +/* + * Unlock and decrement the reference count of an inode structure. + */ +iso_iput(ip) + register struct iso_node *ip; +{ + + if ((ip->i_flag & ILOCKED) == 0) + panic("iso_iput"); + ISO_IUNLOCK(ip); + vrele(ITOV(ip)); +} + +/* + * Last reference to an inode. Nothing is written back here: i_flag is cleared + * and the vnode is reclaimed at once when it is unused and its mode is zero. + */ +int +cd9660_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + register struct iso_node *ip = VTOI(vp); + int mode, error = 0; + + if (prtactive && vp->v_usecount != 0) + vprint("cd9660_inactive: pushing active", vp); + + ip->i_flag = 0; + /* + * If we are done with the inode, reclaim it + * so that it can be reused immediately. + */ + if (vp->v_usecount == 0 && ip->inode.iso_mode == 0) + vgone(vp); + return error; +} + +/* + * Reclaim an inode so that it can be used for other purposes. + */ +int +cd9660_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct iso_node *ip = VTOI(vp); + int i; + + if (prtactive && vp->v_usecount != 0) + vprint("cd9660_reclaim: pushing active", vp); + /* + * Remove the inode from its hash chain. + */ + remque(ip); + ip->i_forw = ip; + ip->i_back = ip; + /* + * Purge old data structures associated with the inode.
+ */ + cache_purge(vp); + if (ip->i_devvp) { + vrele(ip->i_devvp); + ip->i_devvp = 0; + } + FREE(vp->v_data, M_ISOFSNODE); + vp->v_data = NULL; + return 0; +} + +/* + * Lock an inode. If its already locked, set the WANT bit and sleep. + */ +iso_ilock(ip) + register struct iso_node *ip; +{ + + while (ip->i_flag & ILOCKED) { + ip->i_flag |= IWANT; + if (ip->i_spare0 == curproc->p_pid) + panic("locking against myself"); + ip->i_spare1 = curproc->p_pid; + (void) sleep((caddr_t)ip, PINOD); + } + ip->i_spare1 = 0; + ip->i_spare0 = curproc->p_pid; + ip->i_flag |= ILOCKED; +} + +/* + * Unlock an inode. If WANT bit is on, wakeup. + */ +iso_iunlock(ip) + register struct iso_node *ip; +{ + + if ((ip->i_flag & ILOCKED) == 0) + vprint("iso_iunlock: unlocked inode", ITOV(ip)); + ip->i_spare0 = 0; + ip->i_flag &= ~ILOCKED; + if (ip->i_flag&IWANT) { + ip->i_flag &= ~IWANT; + wakeup((caddr_t)ip); + } +} + +/* + * File attributes + */ +void +cd9660_defattr(isodir,inop,bp) + struct iso_directory_record *isodir; + struct iso_node *inop; + struct buf *bp; +{ + struct buf *bp2 = NULL; + struct iso_mnt *imp; + struct iso_extended_attributes *ap = NULL; + int off; + + if (isonum_711(isodir->flags)&2) { + inop->inode.iso_mode = S_IFDIR; + /* + * If we return 2, fts() will assume there are no subdirectories + * (just links for the path and .), so instead we return 1. 
+ */ + inop->inode.iso_links = 1; + } else { + inop->inode.iso_mode = S_IFREG; + inop->inode.iso_links = 1; + } + if (!bp + && ((imp = inop->i_mnt)->im_flags&ISOFSMNT_EXTATT) + && (off = isonum_711(isodir->ext_attr_length))) { + iso_blkatoff(inop,-off * imp->logical_block_size,&bp2); + bp = bp2; + } + if (bp) { + ap = (struct iso_extended_attributes *)bp->b_un.b_addr; + + if (isonum_711(ap->version) == 1) { + if (!(ap->perm[0]&0x40)) + inop->inode.iso_mode |= VEXEC >> 6; + if (!(ap->perm[0]&0x10)) + inop->inode.iso_mode |= VREAD >> 6; + if (!(ap->perm[0]&4)) + inop->inode.iso_mode |= VEXEC >> 3; + if (!(ap->perm[0]&1)) + inop->inode.iso_mode |= VREAD >> 3; + if (!(ap->perm[1]&0x40)) + inop->inode.iso_mode |= VEXEC; + if (!(ap->perm[1]&0x10)) + inop->inode.iso_mode |= VREAD; + inop->inode.iso_uid = isonum_723(ap->owner); /* what about 0? */ + inop->inode.iso_gid = isonum_723(ap->group); /* what about 0? */ + } else + ap = NULL; + } + if (!ap) { + inop->inode.iso_mode |= VREAD|VEXEC|(VREAD|VEXEC)>>3|(VREAD|VEXEC)>>6; + inop->inode.iso_uid = (uid_t)0; + inop->inode.iso_gid = (gid_t)0; + } + if (bp2) + brelse(bp2); +} + +/* + * Time stamps + */ +void +cd9660_deftstamp(isodir,inop,bp) + struct iso_directory_record *isodir; + struct iso_node *inop; + struct buf *bp; +{ + struct buf *bp2 = NULL; + struct iso_mnt *imp; + struct iso_extended_attributes *ap = NULL; + int off; + + if (!bp + && ((imp = inop->i_mnt)->im_flags&ISOFSMNT_EXTATT) + && (off = isonum_711(isodir->ext_attr_length))) { + iso_blkatoff(inop,-off * imp->logical_block_size,&bp2); + bp = bp2; + } + if (bp) { + ap = (struct iso_extended_attributes *)bp->b_un.b_addr; + + if (isonum_711(ap->version) == 1) { + if (!cd9660_tstamp_conv17(ap->ftime,&inop->inode.iso_atime)) + cd9660_tstamp_conv17(ap->ctime,&inop->inode.iso_atime); + if (!cd9660_tstamp_conv17(ap->ctime,&inop->inode.iso_ctime)) + inop->inode.iso_ctime = inop->inode.iso_atime; + if (!cd9660_tstamp_conv17(ap->mtime,&inop->inode.iso_mtime)) + 
inop->inode.iso_mtime = inop->inode.iso_ctime; + } else + ap = NULL; + } + if (!ap) { + cd9660_tstamp_conv7(isodir->date,&inop->inode.iso_ctime); + inop->inode.iso_atime = inop->inode.iso_ctime; + inop->inode.iso_mtime = inop->inode.iso_ctime; + } + if (bp2) + brelse(bp2); +} + +int +cd9660_tstamp_conv7(pi,pu) +char *pi; +struct timeval *pu; +{ + int i; + int crtime, days; + int y, m, d, hour, minute, second, tz; + + y = pi[0] + 1900; + m = pi[1]; + d = pi[2]; + hour = pi[3]; + minute = pi[4]; + second = pi[5]; + tz = pi[6]; + + if (y < 1970) { + pu->tv_sec = 0; + pu->tv_usec = 0; + return 0; + } else { +#ifdef ORIGINAL + /* computes day number relative to Sept. 19th,1989 */ + /* don't even *THINK* about changing formula. It works! */ + days = 367*(y-1980)-7*(y+(m+9)/12)/4-3*((y+(m-9)/7)/100+1)/4+275*m/9+d-100; +#else + /* + * Changed :-) to make it relative to Jan. 1st, 1970 + * and to disambiguate negative division + */ + days = 367*(y-1960)-7*(y+(m+9)/12)/4-3*((y+(m+9)/12-1)/100+1)/4+275*m/9+d-239; +#endif + crtime = ((((days * 24) + hour) * 60 + minute) * 60) + second; + + /* timezone offset is unreliable on some disks */ + if (-48 <= tz && tz <= 52) + crtime += tz * 15 * 60; + } + pu->tv_sec = crtime; + pu->tv_usec = 0; + return 1; +} + +static unsigned +cd9660_chars2ui(begin,len) + unsigned char *begin; + int len; +{ + unsigned rc; + + for (rc = 0; --len >= 0;) { + rc *= 10; + rc += *begin++ - '0'; + } + return rc; +} + +int +cd9660_tstamp_conv17(pi,pu) + unsigned char *pi; + struct timeval *pu; +{ + unsigned char buf[7]; + + /* year:"0001"-"9999" -> -1900 */ + buf[0] = cd9660_chars2ui(pi,4) - 1900; + + /* month: " 1"-"12" -> 1 - 12 */ + buf[1] = cd9660_chars2ui(pi + 4,2); + + /* day: " 1"-"31" -> 1 - 31 */ + buf[2] = cd9660_chars2ui(pi + 6,2); + + /* hour: " 0"-"23" -> 0 - 23 */ + buf[3] = cd9660_chars2ui(pi + 8,2); + + /* minute:" 0"-"59" -> 0 - 59 */ + buf[4] = cd9660_chars2ui(pi + 10,2); + + /* second:" 0"-"59" -> 0 - 59 */ + buf[5] = cd9660_chars2ui(pi + 
12,2); + + /* difference of GMT */ + buf[6] = pi[16]; + + return cd9660_tstamp_conv7(buf,pu); +} + +void +isodirino(inump,isodir,imp) + ino_t *inump; + struct iso_directory_record *isodir; + struct iso_mnt *imp; +{ + *inump = (isonum_733(isodir->extent) + isonum_711(isodir->ext_attr_length)) + * imp->logical_block_size; +} diff --git a/sys/isofs/cd9660/cd9660_node.h b/sys/isofs/cd9660/cd9660_node.h new file mode 100644 index 00000000000..45de67f1a6b --- /dev/null +++ b/sys/isofs/cd9660/cd9660_node.h @@ -0,0 +1,143 @@ +/*- + * Copyright (c) 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley + * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension + * Support code is derived from software contributed to Berkeley + * by Atsushi Murai (amurai@spec.co.jp). + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)cd9660_node.h 8.2 (Berkeley) 1/23/94 + */ + +/* + * Theoretically, directories can be more than 2Gb in length, + * however, in practice this seems unlikely. So, we define + * the type doff_t as a long to keep down the cost of doing + * lookup on a 32-bit machine. If you are porting to a 64-bit + * architecture, you should make doff_t the same as off_t. 
+ */ +#define doff_t long + +typedef struct { + struct timespec iso_atime; /* time of last access */ + struct timespec iso_mtime; /* time of last modification */ + struct timespec iso_ctime; /* time file changed */ + u_short iso_mode; /* files access mode and type */ + uid_t iso_uid; /* owner user id */ + gid_t iso_gid; /* owner group id */ + short iso_links; /* links of file */ + dev_t iso_rdev; /* Major/Minor number for special */ +} ISO_RRIP_INODE; + +#ifdef ISODEVMAP +/* + * FOr device# (major,minor) translation table + */ +struct iso_dnode { + struct iso_dnode *d_chain[2]; /* hash chain, MUST be first */ + dev_t i_dev; /* device where dnode resides */ + ino_t i_number; /* the identity of the inode */ + dev_t d_dev; /* device # for translation */ +}; +#define d_forw d_chain[0] +#define d_back d_chain[1] +#endif + +struct iso_node { + struct iso_node *i_chain[2]; /* hash chain, MUST be first */ + struct vnode *i_vnode; /* vnode associated with this inode */ + struct vnode *i_devvp; /* vnode for block I/O */ + u_long i_flag; /* see below */ + dev_t i_dev; /* device where inode resides */ + ino_t i_number; /* the identity of the inode */ + /* we use the actual starting block of the file */ + struct iso_mnt *i_mnt; /* filesystem associated with this inode */ + struct lockf *i_lockf; /* head of byte-level lock list */ + doff_t i_endoff; /* end of useful stuff in directory */ + doff_t i_diroff; /* offset in dir, where we found last entry */ + doff_t i_offset; /* offset of free space in directory */ + ino_t i_ino; /* inode number of found directory */ + long i_spare0; + long i_spare1; + + long iso_extent; /* extent of file */ + long i_size; + long iso_start; /* actual start of data of file (may be different */ + /* from iso_extent, if file has extended attributes) */ + ISO_RRIP_INODE inode; +}; + +#define i_forw i_chain[0] +#define i_back i_chain[1] + +/* flags */ +#define ILOCKED 0x0001 /* inode is locked */ +#define IWANT 0x0002 /* some process waiting on lock */ 
+#define IACC 0x0020 /* inode access time to be updated */ + +#define VTOI(vp) ((struct iso_node *)(vp)->v_data) +#define ITOV(ip) ((ip)->i_vnode) + +#define ISO_ILOCK(ip) iso_ilock(ip) +#define ISO_IUNLOCK(ip) iso_iunlock(ip) + +/* + * Prototypes for ISOFS vnode operations + */ +int cd9660_lookup __P((struct vop_lookup_args *)); +int cd9660_open __P((struct vop_open_args *)); +int cd9660_close __P((struct vop_close_args *)); +int cd9660_access __P((struct vop_access_args *)); +int cd9660_getattr __P((struct vop_getattr_args *)); +int cd9660_read __P((struct vop_read_args *)); +int cd9660_ioctl __P((struct vop_ioctl_args *)); +int cd9660_select __P((struct vop_select_args *)); +int cd9660_mmap __P((struct vop_mmap_args *)); +int cd9660_seek __P((struct vop_seek_args *)); +int cd9660_readdir __P((struct vop_readdir_args *)); +int cd9660_abortop __P((struct vop_abortop_args *)); +int cd9660_inactive __P((struct vop_inactive_args *)); +int cd9660_reclaim __P((struct vop_reclaim_args *)); +int cd9660_bmap __P((struct vop_bmap_args *)); +int cd9660_lock __P((struct vop_lock_args *)); +int cd9660_unlock __P((struct vop_unlock_args *)); +int cd9660_strategy __P((struct vop_strategy_args *)); +int cd9660_print __P((struct vop_print_args *)); +int cd9660_islocked __P((struct vop_islocked_args *)); +void cd9660_defattr __P((struct iso_directory_record *, + struct iso_node *, struct buf *)); +void cd9660_deftstamp __P((struct iso_directory_record *, + struct iso_node *, struct buf *)); +#ifdef ISODEVMAP +struct iso_dnode *iso_dmap __P((dev_t, ino_t, int)); +void iso_dunmap __P((dev_t)); +#endif diff --git a/sys/isofs/cd9660/cd9660_rrip.c b/sys/isofs/cd9660/cd9660_rrip.c new file mode 100644 index 00000000000..0923fa01477 --- /dev/null +++ b/sys/isofs/cd9660/cd9660_rrip.c @@ -0,0 +1,685 @@ +/*- + * Copyright (c) 1993, 1994 + * The Regents of the University of California. All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley + * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension + * Support code is derived from software contributed to Berkeley + * by Atsushi Murai (amurai@spec.co.jp). + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)cd9660_rrip.c 8.2 (Berkeley) 1/23/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +/* + * POSIX file attribute + */ +static int +cd9660_rrip_attr(p,ana) + ISO_RRIP_ATTR *p; + ISO_RRIP_ANALYZE *ana; +{ + ana->inop->inode.iso_mode = isonum_731(p->mode_l); + ana->inop->inode.iso_uid = (uid_t)isonum_731(p->uid_l); + ana->inop->inode.iso_gid = (gid_t)isonum_731(p->gid_l); + ana->inop->inode.iso_links = isonum_731(p->links_l); + ana->fields &= ~ISO_SUSP_ATTR; + return ISO_SUSP_ATTR; +} + +static void +cd9660_rrip_defattr(isodir,ana) + struct iso_directory_record *isodir; + ISO_RRIP_ANALYZE *ana; +{ + /* But this is a required field! */ + printf("RRIP without PX field?\n"); + cd9660_defattr(isodir,ana->inop,NULL); +} + +/* + * Symbolic Links + */ +static int +cd9660_rrip_slink(p,ana) + ISO_RRIP_SLINK *p; + ISO_RRIP_ANALYZE *ana; +{ + register ISO_RRIP_SLINK_COMPONENT *pcomp; + register ISO_RRIP_SLINK_COMPONENT *pcompe; + int len, wlen, cont; + char *outbuf, *inbuf; + + pcomp = (ISO_RRIP_SLINK_COMPONENT *)p->component; + pcompe = (ISO_RRIP_SLINK_COMPONENT *)((char *)p + isonum_711(p->h.length)); + len = *ana->outlen; + outbuf = ana->outbuf; + cont = ana->cont; + + /* + * Gathering a Symbolic name from each component with path + */ + for (; + pcomp < pcompe; + pcomp = (ISO_RRIP_SLINK_COMPONENT *)((char *)pcomp + ISO_RRIP_SLSIZ + + isonum_711(pcomp->clen))) { + + if (!cont) { + if (len < ana->maxlen) { + len++; + *outbuf++ = '/'; + } + } + cont = 0; + + inbuf = ".."; + wlen = 0; + + switch (*pcomp->cflag) { + + case ISO_SUSP_CFLAG_CURRENT: + /* Inserting Current */ + wlen = 1; + break; + + case ISO_SUSP_CFLAG_PARENT: + /* Inserting Parent */ + wlen = 2; + break; + + case ISO_SUSP_CFLAG_ROOT: + /* Inserting slash for ROOT */ + /* start over from beginning(?) 
*/ + outbuf -= len; + len = 0; + break; + + case ISO_SUSP_CFLAG_VOLROOT: + /* Inserting a mount point i.e. "/cdrom" */ + /* same as above */ + outbuf -= len; + len = 0; + inbuf = ana->imp->im_mountp->mnt_stat.f_mntonname; + wlen = strlen(inbuf); + break; + + case ISO_SUSP_CFLAG_HOST: + /* Inserting hostname i.e. "kurt.tools.de" */ + inbuf = hostname; + wlen = hostnamelen; + break; + + case ISO_SUSP_CFLAG_CONTINUE: + cont = 1; + /* fall thru */ + case 0: + /* Inserting component */ + wlen = isonum_711(pcomp->clen); + inbuf = pcomp->name; + break; + default: + printf("RRIP with incorrect flags?"); + wlen = ana->maxlen + 1; + break; + } + + if (len + wlen > ana->maxlen) { + /* indicate error to caller */ + ana->cont = 1; + ana->fields = 0; + ana->outbuf -= *ana->outlen; + *ana->outlen = 0; + return 0; + } + + bcopy(inbuf,outbuf,wlen); + outbuf += wlen; + len += wlen; + + } + ana->outbuf = outbuf; + *ana->outlen = len; + ana->cont = cont; + + if (!isonum_711(p->flags)) { + ana->fields &= ~ISO_SUSP_SLINK; + return ISO_SUSP_SLINK; + } + return 0; +} + +/* + * Alternate name + */ +static int +cd9660_rrip_altname(p,ana) + ISO_RRIP_ALTNAME *p; + ISO_RRIP_ANALYZE *ana; +{ + char *inbuf; + int wlen; + int cont; + + inbuf = ".."; + wlen = 0; + cont = 0; + + switch (*p->flags) { + case ISO_SUSP_CFLAG_CURRENT: + /* Inserting Current */ + wlen = 1; + break; + + case ISO_SUSP_CFLAG_PARENT: + /* Inserting Parent */ + wlen = 2; + break; + + case ISO_SUSP_CFLAG_HOST: + /* Inserting hostname i.e. 
"kurt.tools.de" */ + inbuf = hostname; + wlen = hostnamelen; + break; + + case ISO_SUSP_CFLAG_CONTINUE: + cont = 1; + /* fall thru */ + case 0: + /* Inserting component */ + wlen = isonum_711(p->h.length) - 5; + inbuf = (char *)p + 5; + break; + + default: + printf("RRIP with incorrect NM flags?\n"); + wlen = ana->maxlen + 1; + break; + } + + if ((*ana->outlen += wlen) > ana->maxlen) { + /* treat as no name field */ + ana->fields &= ~ISO_SUSP_ALTNAME; + ana->outbuf -= *ana->outlen - wlen; + *ana->outlen = 0; + return 0; + } + + bcopy(inbuf,ana->outbuf,wlen); + ana->outbuf += wlen; + + if (!cont) { + ana->fields &= ~ISO_SUSP_ALTNAME; + return ISO_SUSP_ALTNAME; + } + return 0; +} + +static void +cd9660_rrip_defname(isodir,ana) + struct iso_directory_record *isodir; + ISO_RRIP_ANALYZE *ana; +{ + strcpy(ana->outbuf,".."); + switch (*isodir->name) { + default: + isofntrans(isodir->name,isonum_711(isodir->name_len), + ana->outbuf,ana->outlen, + 1,isonum_711(isodir->flags)&4); + break; + case 0: + *ana->outlen = 1; + break; + case 1: + *ana->outlen = 2; + break; + } +} + +/* + * Parent or Child Link + */ +static int +cd9660_rrip_pclink(p,ana) + ISO_RRIP_CLINK *p; + ISO_RRIP_ANALYZE *ana; +{ + *ana->inump = isonum_733(p->dir_loc) << ana->imp->im_bshift; + ana->fields &= ~(ISO_SUSP_CLINK|ISO_SUSP_PLINK); + return *p->h.type == 'C' ? 
ISO_SUSP_CLINK : ISO_SUSP_PLINK; +} + +/* + * Relocated directory + */ +static int +cd9660_rrip_reldir(p,ana) + ISO_RRIP_RELDIR *p; + ISO_RRIP_ANALYZE *ana; +{ + /* special hack to make caller aware of RE field */ + *ana->outlen = 0; + ana->fields = 0; + return ISO_SUSP_RELDIR|ISO_SUSP_ALTNAME|ISO_SUSP_CLINK|ISO_SUSP_PLINK; +} + +static int +cd9660_rrip_tstamp(p,ana) + ISO_RRIP_TSTAMP *p; + ISO_RRIP_ANALYZE *ana; +{ + unsigned char *ptime; + + ptime = p->time; + + /* Check a format of time stamp (7bytes/17bytes) */ + if (!(*p->flags&ISO_SUSP_TSTAMP_FORM17)) { + if (*p->flags&ISO_SUSP_TSTAMP_CREAT) + ptime += 7; + + if (*p->flags&ISO_SUSP_TSTAMP_MODIFY) { + cd9660_tstamp_conv7(ptime,&ana->inop->inode.iso_mtime); + ptime += 7; + } else + bzero(&ana->inop->inode.iso_mtime,sizeof(struct timeval)); + + if (*p->flags&ISO_SUSP_TSTAMP_ACCESS) { + cd9660_tstamp_conv7(ptime,&ana->inop->inode.iso_atime); + ptime += 7; + } else + ana->inop->inode.iso_atime = ana->inop->inode.iso_mtime; + + if (*p->flags&ISO_SUSP_TSTAMP_ATTR) + cd9660_tstamp_conv7(ptime,&ana->inop->inode.iso_ctime); + else + ana->inop->inode.iso_ctime = ana->inop->inode.iso_mtime; + + } else { + if (*p->flags&ISO_SUSP_TSTAMP_CREAT) + ptime += 17; + + if (*p->flags&ISO_SUSP_TSTAMP_MODIFY) { + cd9660_tstamp_conv17(ptime,&ana->inop->inode.iso_mtime); + ptime += 17; + } else + bzero(&ana->inop->inode.iso_mtime,sizeof(struct timeval)); + + if (*p->flags&ISO_SUSP_TSTAMP_ACCESS) { + cd9660_tstamp_conv17(ptime,&ana->inop->inode.iso_atime); + ptime += 17; + } else + ana->inop->inode.iso_atime = ana->inop->inode.iso_mtime; + + if (*p->flags&ISO_SUSP_TSTAMP_ATTR) + cd9660_tstamp_conv17(ptime,&ana->inop->inode.iso_ctime); + else + ana->inop->inode.iso_ctime = ana->inop->inode.iso_mtime; + + } + ana->fields &= ~ISO_SUSP_TSTAMP; + return ISO_SUSP_TSTAMP; +} + +static void +cd9660_rrip_deftstamp(isodir,ana) + struct iso_directory_record *isodir; + ISO_RRIP_ANALYZE *ana; +{ + cd9660_deftstamp(isodir,ana->inop,NULL); +} + +/* + 
* POSIX device modes + */ +static int +cd9660_rrip_device(p,ana) + ISO_RRIP_DEVICE *p; + ISO_RRIP_ANALYZE *ana; +{ + unsigned high, low; + + high = isonum_733(p->dev_t_high_l); + low = isonum_733(p->dev_t_low_l); + + if ( high == 0 ) { + ana->inop->inode.iso_rdev = makedev( major(low), minor(low) ); + } else { + ana->inop->inode.iso_rdev = makedev( high, minor(low) ); + } + ana->fields &= ~ISO_SUSP_DEVICE; + return ISO_SUSP_DEVICE; +} + +/* + * Flag indicating + */ +static int +cd9660_rrip_idflag(p,ana) + ISO_RRIP_IDFLAG *p; + ISO_RRIP_ANALYZE *ana; +{ + ana->fields &= isonum_711(p->flags)|~0xff; /* don't touch high bits */ + /* special handling of RE field */ + if (ana->fields&ISO_SUSP_RELDIR) + return cd9660_rrip_reldir(p,ana); + + return ISO_SUSP_IDFLAG; +} + +/* + * Continuation pointer + */ +static int +cd9660_rrip_cont(p,ana) + ISO_RRIP_CONT *p; + ISO_RRIP_ANALYZE *ana; +{ + ana->iso_ce_blk = isonum_733(p->location); + ana->iso_ce_off = isonum_733(p->offset); + ana->iso_ce_len = isonum_733(p->length); + return ISO_SUSP_CONT; +} + +/* + * System Use end + */ +static int +cd9660_rrip_stop(p,ana) + ISO_SUSP_HEADER *p; + ISO_RRIP_ANALYZE *ana; +{ + /* stop analyzing */ + ana->fields = 0; + return ISO_SUSP_STOP; +} + +/* + * Extension reference + */ +static int +cd9660_rrip_extref(p,ana) + ISO_RRIP_EXTREF *p; + ISO_RRIP_ANALYZE *ana; +{ + if (isonum_711(p->len_id) != 10 + || bcmp((char *)p + 8,"RRIP_1991A",10) + || isonum_711(p->version) != 1) + return 0; + ana->fields &= ~ISO_SUSP_EXTREF; + return ISO_SUSP_EXTREF; +} + +typedef struct { + char type[2]; + int (*func)(); + void (*func2)(); + int result; +} RRIP_TABLE; + +static int +cd9660_rrip_loop(isodir,ana,table) + struct iso_directory_record *isodir; + ISO_RRIP_ANALYZE *ana; + RRIP_TABLE *table; +{ + register RRIP_TABLE *ptable; + register ISO_SUSP_HEADER *phead; + register ISO_SUSP_HEADER *pend; + struct buf *bp = NULL; + int i; + char *pwhead; + int result; + + /* + * Note: If name length is odd, + * it will 
be padding 1 byte after the name + */ + pwhead = isodir->name + isonum_711(isodir->name_len); + if (!(isonum_711(isodir->name_len)&1)) + pwhead++; + + /* If it's not the '.' entry of the root dir obey SP field */ + if (*isodir->name != 0 + || isonum_733(isodir->extent) != ana->imp->root_extent) + pwhead += ana->imp->rr_skip; + else + pwhead += ana->imp->rr_skip0; + + phead = (ISO_SUSP_HEADER *)pwhead; + pend = (ISO_SUSP_HEADER *)((char *)isodir + isonum_711(isodir->length)); + + result = 0; + while (1) { + ana->iso_ce_len = 0; + /* + * Note: "pend" should be more than one SUSP header + */ + while (pend >= phead + 1) { + if (isonum_711(phead->version) == 1) { + for (ptable = table; ptable->func; ptable++) { + if (*phead->type == *ptable->type + && phead->type[1] == ptable->type[1]) { + result |= ptable->func(phead,ana); + break; + } + } + if (!ana->fields) + break; + } + /* + * move to next SUSP + * Hopefully this works with newer versions, too + */ + phead = (ISO_SUSP_HEADER *)((char *)phead + isonum_711(phead->length)); + } + + if ( ana->fields && ana->iso_ce_len ) { + if (ana->iso_ce_blk >= ana->imp->volume_space_size + || ana->iso_ce_off + ana->iso_ce_len > ana->imp->logical_block_size + || bread(ana->imp->im_devvp, + ana->iso_ce_blk * ana->imp->logical_block_size / DEV_BSIZE, + ana->imp->logical_block_size,NOCRED,&bp)) + /* what to do now? 
*/ + break; + phead = (ISO_SUSP_HEADER *)(bp->b_un.b_addr + ana->iso_ce_off); + pend = (ISO_SUSP_HEADER *) ((char *)phead + ana->iso_ce_len); + } else + break; + } + if (bp) + brelse(bp); + /* + * If we don't find the Basic SUSP stuffs, just set default value + * ( attribute/time stamp ) + */ + for (ptable = table; ptable->func2; ptable++) + if (!(ptable->result&result)) + ptable->func2(isodir,ana); + + return result; +} + +static RRIP_TABLE rrip_table_analyze[] = { + { "PX", cd9660_rrip_attr, cd9660_rrip_defattr, ISO_SUSP_ATTR }, + { "TF", cd9660_rrip_tstamp, cd9660_rrip_deftstamp, ISO_SUSP_TSTAMP }, + { "PN", cd9660_rrip_device, 0, ISO_SUSP_DEVICE }, + { "RR", cd9660_rrip_idflag, 0, ISO_SUSP_IDFLAG }, + { "CE", cd9660_rrip_cont, 0, ISO_SUSP_CONT }, + { "ST", cd9660_rrip_stop, 0, ISO_SUSP_STOP }, + { "", 0, 0, 0 } +}; + +int +cd9660_rrip_analyze(isodir,inop,imp) + struct iso_directory_record *isodir; + struct iso_node *inop; + struct iso_mnt *imp; +{ + ISO_RRIP_ANALYZE analyze; + + analyze.inop = inop; + analyze.imp = imp; + analyze.fields = ISO_SUSP_ATTR|ISO_SUSP_TSTAMP|ISO_SUSP_DEVICE; + + return cd9660_rrip_loop(isodir,&analyze,rrip_table_analyze); +} + +/* + * Get Alternate Name from 'AL' record + * If either no AL record or 0 length, + * it will be return the translated ISO9660 name, + */ +static RRIP_TABLE rrip_table_getname[] = { + { "NM", cd9660_rrip_altname, cd9660_rrip_defname, ISO_SUSP_ALTNAME }, + { "CL", cd9660_rrip_pclink, 0, ISO_SUSP_CLINK|ISO_SUSP_PLINK }, + { "PL", cd9660_rrip_pclink, 0, ISO_SUSP_CLINK|ISO_SUSP_PLINK }, + { "RE", cd9660_rrip_reldir, 0, ISO_SUSP_RELDIR }, + { "RR", cd9660_rrip_idflag, 0, ISO_SUSP_IDFLAG }, + { "CE", cd9660_rrip_cont, 0, ISO_SUSP_CONT }, + { "ST", cd9660_rrip_stop, 0, ISO_SUSP_STOP }, + { "", 0, 0, 0 } +}; + +int +cd9660_rrip_getname(isodir,outbuf,outlen,inump,imp) + struct iso_directory_record *isodir; + char *outbuf; + u_short *outlen; + ino_t *inump; + struct iso_mnt *imp; +{ + ISO_RRIP_ANALYZE analyze; + 
RRIP_TABLE *tab; + + analyze.outbuf = outbuf; + analyze.outlen = outlen; + analyze.maxlen = NAME_MAX; + analyze.inump = inump; + analyze.imp = imp; + analyze.fields = ISO_SUSP_ALTNAME|ISO_SUSP_RELDIR|ISO_SUSP_CLINK|ISO_SUSP_PLINK; + *outlen = 0; + + tab = rrip_table_getname; + if (*isodir->name == 0 + || *isodir->name == 1) { + cd9660_rrip_defname(isodir,&analyze); + + analyze.fields &= ~ISO_SUSP_ALTNAME; + tab++; + } + + return cd9660_rrip_loop(isodir,&analyze,tab); +} + +/* + * Get Symbolic Name from 'SL' record + * + * Note: isodir should contains SL record! + */ +static RRIP_TABLE rrip_table_getsymname[] = { + { "SL", cd9660_rrip_slink, 0, ISO_SUSP_SLINK }, + { "RR", cd9660_rrip_idflag, 0, ISO_SUSP_IDFLAG }, + { "CE", cd9660_rrip_cont, 0, ISO_SUSP_CONT }, + { "ST", cd9660_rrip_stop, 0, ISO_SUSP_STOP }, + { "", 0, 0, 0 } +}; + +int +cd9660_rrip_getsymname(isodir,outbuf,outlen,imp) + struct iso_directory_record *isodir; + char *outbuf; + u_short *outlen; + struct iso_mnt *imp; +{ + ISO_RRIP_ANALYZE analyze; + + analyze.outbuf = outbuf; + analyze.outlen = outlen; + *outlen = 0; + analyze.maxlen = MAXPATHLEN; + analyze.cont = 1; /* don't start with a slash */ + analyze.imp = imp; + analyze.fields = ISO_SUSP_SLINK; + + return (cd9660_rrip_loop(isodir,&analyze,rrip_table_getsymname)&ISO_SUSP_SLINK); +} + +static RRIP_TABLE rrip_table_extref[] = { + { "ER", cd9660_rrip_extref, 0, ISO_SUSP_EXTREF }, + { "CE", cd9660_rrip_cont, 0, ISO_SUSP_CONT }, + { "ST", cd9660_rrip_stop, 0, ISO_SUSP_STOP }, + { "", 0, 0, 0 } +}; + +/* + * Check for Rock Ridge Extension and return offset of its fields. + * Note: We require the ER field. + */ +int +cd9660_rrip_offset(isodir,imp) + struct iso_directory_record *isodir; + struct iso_mnt *imp; +{ + ISO_RRIP_OFFSET *p; + ISO_RRIP_ANALYZE analyze; + + imp->rr_skip0 = 0; + p = (ISO_RRIP_OFFSET *)(isodir->name + 1); + if (bcmp(p,"SP\7\1\276\357",6)) { + /* Maybe, it's a CDROM XA disc? 
*/ + imp->rr_skip0 = 15; + p = (ISO_RRIP_OFFSET *)((char *)p + 15); + if (bcmp(p,"SP\7\1\276\357",6)) + return -1; + } + + analyze.imp = imp; + analyze.fields = ISO_SUSP_EXTREF; + if (!(cd9660_rrip_loop(isodir,&analyze,rrip_table_extref)&ISO_SUSP_EXTREF)) + return -1; + + return isonum_711(p->skip); +} diff --git a/sys/isofs/cd9660/cd9660_rrip.h b/sys/isofs/cd9660/cd9660_rrip.h new file mode 100644 index 00000000000..b4017281f06 --- /dev/null +++ b/sys/isofs/cd9660/cd9660_rrip.h @@ -0,0 +1,146 @@ +/*- + * Copyright (c) 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley + * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension + * Support code is derived from software contributed to Berkeley + * by Atsushi Murai (amurai@spec.co.jp). + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)cd9660_rrip.h 8.1 (Berkeley) 1/21/94
+ */
+/*
+ * Header embedded as member h at the start of most records declared below:
+ * a type field followed by a length and a version field.
+ * ISODCL is defined outside this chunk; presumably ISODCL(a, b) sizes a
+ * field that occupies bytes a through b of the record -- TODO confirm.
+ * The numeric trailing comments (711, 731, 732, 733) appear to name the
+ * isonum_NNN decoder meant for the field -- TODO confirm.
+ */
+typedef struct {
+	char type [ISODCL ( 0, 1)];
+	unsigned char length [ISODCL ( 2, 2)]; /* 711 */
+	unsigned char version [ISODCL ( 3, 3)];
+} ISO_SUSP_HEADER;
+/*
+ * Attribute record: mode, link count, uid and gid, each stored twice
+ * (an _l field tagged 731 and an _m field tagged 732).
+ */
+typedef struct {
+	ISO_SUSP_HEADER h;
+	char mode_l [ISODCL ( 4, 7)]; /* 731 */
+	char mode_m [ISODCL ( 8, 11)]; /* 732 */
+	char links_l [ISODCL ( 12, 15)]; /* 731 */
+	char links_m [ISODCL ( 16, 19)]; /* 732 */
+	char uid_l [ISODCL ( 20, 23)]; /* 731 */
+	char uid_m [ISODCL ( 24, 27)]; /* 732 */
+	char gid_l [ISODCL ( 28, 31)]; /* 731 */
+	char gid_m [ISODCL ( 32, 35)]; /* 732 */
+} ISO_RRIP_ATTR;
+/*
+ * Device record: high and low halves of a device number, each stored
+ * twice in the same _l / _m layout as above.
+ */
+typedef struct {
+	ISO_SUSP_HEADER h;
+	char dev_t_high_l [ISODCL ( 4, 7)]; /* 731 */
+	char dev_t_high_m [ISODCL ( 8, 11)]; /* 732 */
+	char dev_t_low_l [ISODCL ( 12, 15)]; /* 731 */
+	char dev_t_low_m [ISODCL ( 16, 19)]; /* 732 */
+} ISO_RRIP_DEVICE;
+/* Bit values, presumably for the cflag byte of a symlink component. */
+#define ISO_SUSP_CFLAG_CONTINUE 0x01
+#define ISO_SUSP_CFLAG_CURRENT 0x02
+#define ISO_SUSP_CFLAG_PARENT 0x04
+#define ISO_SUSP_CFLAG_ROOT 0x08
+#define ISO_SUSP_CFLAG_VOLROOT 0x10
+#define ISO_SUSP_CFLAG_HOST 0x20
+/*
+ * One symlink component: flag byte, length byte, then the name bytes.
+ * name is declared with zero elements, so it only marks where the
+ * variable-length data starts; ISO_RRIP_SLSIZ below is 2, matching the
+ * two fixed bytes in front of it.
+ */
+typedef struct {
+	u_char cflag [ISODCL ( 1, 1)];
+	u_char clen [ISODCL ( 2, 2)];
+	u_char name [0];
+} ISO_RRIP_SLINK_COMPONENT;
+#define ISO_RRIP_SLSIZ 2
+/* Symlink record: flags byte, then the first byte of the component area. */
+typedef struct {
+	ISO_SUSP_HEADER h;
+	u_char flags [ISODCL ( 4, 4)];
+	u_char component [ISODCL ( 5, 5)];
+} ISO_RRIP_SLINK;
+/* Alternate-name record: only the flags byte is declared here. */
+typedef struct {
+	ISO_SUSP_HEADER h;
+	char flags [ISODCL ( 4, 4)];
+} ISO_RRIP_ALTNAME;
+/* CLINK and PLINK share one layout: a single dir_loc field tagged 733. */
+typedef struct {
+	ISO_SUSP_HEADER h;
+	char dir_loc [ISODCL ( 4, 11)]; /* 733 */
+} ISO_RRIP_CLINK;
+
+typedef struct {
+	ISO_SUSP_HEADER h;
+	char dir_loc [ISODCL ( 4, 11)]; /* 733 */
+} ISO_RRIP_PLINK;
+/* Record that consists of the common header only. */
+typedef struct {
+	ISO_SUSP_HEADER h;
+} ISO_RRIP_RELDIR;
+/* Bit values, presumably for the flags byte of ISO_RRIP_TSTAMP. */
+#define ISO_SUSP_TSTAMP_FORM17 0x80
+#define ISO_SUSP_TSTAMP_FORM7 0x00
+#define ISO_SUSP_TSTAMP_CREAT 0x01
+#define ISO_SUSP_TSTAMP_MODIFY 0x02
+#define ISO_SUSP_TSTAMP_ACCESS 0x04
+#define ISO_SUSP_TSTAMP_ATTR 0x08
+#define ISO_SUSP_TSTAMP_BACKUP 0x10
+#define ISO_SUSP_TSTAMP_EXPIRE 0x20
+#define ISO_SUSP_TSTAMP_EFFECT 0x40
+/* Timestamp record: flags byte, then the first byte of the time data. */
+typedef struct {
+	ISO_SUSP_HEADER h;
+	unsigned char flags [ISODCL ( 4, 4)];
+	unsigned char time [ISODCL ( 5, 5)];
+} ISO_RRIP_TSTAMP;
+
+typedef struct {
+	ISO_SUSP_HEADER h;
+	unsigned char flags [ISODCL ( 4, 4)];
+} ISO_RRIP_IDFLAG;
+/* Extension-reference record: three length bytes and a version byte. */
+typedef struct {
+	ISO_SUSP_HEADER h;
+	char len_id [ISODCL ( 4, 4)];
+	char len_des [ISODCL ( 5, 5)];
+	char len_src [ISODCL ( 6, 6)];
+	char version [ISODCL ( 7, 7)];
+} ISO_RRIP_EXTREF;
+
+typedef struct {
+	ISO_SUSP_HEADER h;
+	char check [ISODCL ( 4, 5)];
+	char skip [ISODCL ( 6, 6)];
+} ISO_RRIP_OFFSET;
+/* Continuation record: location, offset and length, eight bytes each. */
+typedef struct {
+	ISO_SUSP_HEADER h;
+	char location [ISODCL ( 4, 11)];
+	char offset [ISODCL ( 12, 19)];
+	char length [ISODCL ( 20, 27)];
+} ISO_RRIP_CONT;
diff --git a/sys/isofs/cd9660/cd9660_util.c b/sys/isofs/cd9660/cd9660_util.c
new file mode 100644
index 00000000000..f74f0515ff7
--- /dev/null
+++ b/sys/isofs/cd9660/cd9660_util.c
@@ -0,0 +1,236 @@
+/*-
+ * Copyright (c) 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley
+ * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension
+ * Support code is derived from software contributed to Berkeley
+ * by Atsushi Murai (amurai@spec.co.jp).
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)cd9660_util.c 8.1 (Berkeley) 1/21/94
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include /* XXX */
+#include /* XXX */
+#include
+#include
+
+#include
+/*
+ * The isonum_NNN helpers below are compiled only when __notanymore__ is
+ * defined, so in a normal build this file does not provide them.
+ *
+ *   711 / 712  return the single (unsigned / signed) byte at p.
+ *   721 / 722  return *(short *)p; each prints a complaint when
+ *              BYTE_ORDER is not the order named in its comment, and
+ *              still returns the raw load.
+ *   731 / 732  same pattern with *(long *)p.
+ *   723 / 733  call the 722/732 helper on the second half of the field
+ *              (p + 2 or p + 4) when BYTE_ORDER is BIG_ENDIAN, or the
+ *              721/731 helper on p when it is LITTLE_ENDIAN; otherwise
+ *              they print a message and return 0.
+ *
+ * NOTE(review): the pointer casts assume p is suitably aligned and that
+ * short and long have the width the on-disk field has -- confirm before
+ * re-enabling this code.
+ */
+#ifdef __notanymore__
+int
+isonum_711 (p)
+unsigned char *p;
+{
+	return (*p);
+}
+
+int
+isonum_712 (p)
+signed char *p;
+{
+	return (*p);
+}
+
+int
+isonum_721 (p)
+unsigned char *p;
+{
+	/* little endian short */
+#if BYTE_ORDER != LITTLE_ENDIAN
+	printf ("isonum_721 called on non little-endian machine!\n");
+#endif
+
+	return *(short *)p;
+}
+
+int
+isonum_722 (p)
+unsigned char *p;
+{
+	/* big endian short */
+#if BYTE_ORDER != BIG_ENDIAN
+	printf ("isonum_722 called on non big-endian machine!\n");
+#endif
+
+	return *(short *)p;
+}
+
+int
+isonum_723 (p)
+unsigned char *p;
+{
+#if BYTE_ORDER == BIG_ENDIAN
+	return isonum_722 (p + 2);
+#elif BYTE_ORDER == LITTLE_ENDIAN
+	return isonum_721 (p);
+#else
+	printf ("isonum_723 unsupported byte order!\n");
+	return 0;
+#endif
+}
+
+int
+isonum_731 (p)
+unsigned char *p;
+{
+	/* little endian long */
+#if BYTE_ORDER != LITTLE_ENDIAN
+	printf ("isonum_731 called on non little-endian machine!\n");
+#endif
+
+	return *(long *)p;
+}
+
+int
+isonum_732 (p)
+unsigned char *p;
+{
+	/* big endian long */
+#if BYTE_ORDER != BIG_ENDIAN
+	printf ("isonum_732 called on non big-endian machine!\n");
+#endif
+
+	return *(long *)p;
+}
+
+int
+isonum_733 (p)
+unsigned char *p;
+{
+#if BYTE_ORDER == BIG_ENDIAN
+	return isonum_732 (p + 4);
+#elif BYTE_ORDER == LITTLE_ENDIAN
+	return isonum_731 (p);
+#else
+	printf ("isonum_733 unsupported byte order!\n");
+	return 0;
+#endif
+}
+#endif /* __notanymore__ */
+
+/*
+ * translate and compare a filename
+ * Note: Version number plus ';' may be omitted.
+ */ +int +isofncmp(unsigned char *fn,int fnlen,unsigned char *isofn,int isolen) +{ + int i, j; + char c; + + while (--fnlen >= 0) { + if (--isolen < 0) + return *fn; + if ((c = *isofn++) == ';') { + switch (*fn++) { + default: + return *--fn; + case 0: + return 0; + case ';': + break; + } + for (i = 0; --fnlen >= 0; i = i * 10 + *fn++ - '0') { + if (*fn < '0' || *fn > '9') { + return -1; + } + } + for (j = 0; --isolen >= 0; j = j * 10 + *isofn++ - '0'); + return i - j; + } + if (c != *fn) { + if (c >= 'A' && c <= 'Z') { + if (c + ('a' - 'A') != *fn) { + if (*fn >= 'a' && *fn <= 'z') + return *fn - ('a' - 'A') - c; + else + return *fn - c; + } + } else + return *fn - c; + } + fn++; + } + if (isolen > 0) { + switch (*isofn) { + default: + return -1; + case '.': + if (isofn[1] != ';') + return -1; + case ';': + return 0; + } + } + return 0; +} + +/* + * translate a filename + */ +void +isofntrans(unsigned char *infn,int infnlen, + unsigned char *outfn,unsigned short *outfnlen, + int original,int assoc) +{ + int fnidx = 0; + + if (assoc) { + *outfn++ = ASSOCCHAR; + fnidx++; + } + for (; fnidx < infnlen; fnidx++) { + char c = *infn++; + + if (!original && c >= 'A' && c <= 'Z') + *outfn++ = c + ('a' - 'A'); + else if (!original && c == '.' && *infn == ';') + break; + else if (!original && c == ';') + break; + else + *outfn++ = c; + } + *outfnlen = fnidx; +} diff --git a/sys/isofs/cd9660/cd9660_vfsops.c b/sys/isofs/cd9660/cd9660_vfsops.c new file mode 100644 index 00000000000..02dd92af66f --- /dev/null +++ b/sys/isofs/cd9660/cd9660_vfsops.c @@ -0,0 +1,681 @@ +/*- + * Copyright (c) 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley + * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension + * Support code is derived from software contributed to Berkeley + * by Atsushi Murai (amurai@spec.co.jp). 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)cd9660_vfsops.c 8.3 (Berkeley) 1/31/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +extern int enodev (); + +struct vfsops cd9660_vfsops = { + cd9660_mount, + cd9660_start, + cd9660_unmount, + cd9660_root, + cd9660_quotactl, + cd9660_statfs, + cd9660_sync, + cd9660_vget, + cd9660_fhtovp, + cd9660_vptofh, + cd9660_init, +}; + +/* + * Called by vfs_mountroot when iso is going to be mounted as root. + * + * Name is updated by mount(8) after booting. + */ +#define ROOTNAME "root_device" + +static iso_mountfs(); + +cd9660_mountroot() +{ + register struct mount *mp; + extern struct vnode *rootvp; + struct proc *p = curproc; /* XXX */ + struct iso_mnt *imp; + register struct fs *fs; + u_int size; + int error; + struct iso_args args; + + /* + * Get vnodes for swapdev and rootdev. + */ + if (bdevvp(swapdev, &swapdev_vp) || bdevvp(rootdev, &rootvp)) + panic("cd9660_mountroot: can't setup bdevvp's"); + + mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + mp->mnt_op = &cd9660_vfsops; + mp->mnt_flag = MNT_RDONLY; + args.flags = ISOFSMNT_ROOT; + if (error = iso_mountfs(rootvp, mp, p, &args)) { + free(mp, M_MOUNT); + return (error); + } + if (error = vfs_lock(mp)) { + (void)cd9660_unmount(mp, 0, p); + free(mp, M_MOUNT); + return (error); + } + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mp->mnt_flag |= MNT_ROOTFS; + mp->mnt_vnodecovered = NULLVP; + imp = VFSTOISOFS(mp); + bzero(imp->im_fsmnt, sizeof(imp->im_fsmnt)); + imp->im_fsmnt[0] = '/'; + bcopy((caddr_t)imp->im_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, + MNAMELEN); + (void) copystr(ROOTNAME, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + (void) cd9660_statfs(mp, &mp->mnt_stat, p); + vfs_unlock(mp); + return (0); +} + +/* + * Flag to allow 
forcible unmounting. + */ +int iso_doforce = 1; + +/* + * VFS Operations. + * + * mount system call + */ +cd9660_mount(mp, path, data, ndp, p) + register struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + struct vnode *devvp; + struct iso_args args; + u_int size; + int error; + struct iso_mnt *imp; + + if (error = copyin(data, (caddr_t)&args, sizeof (struct iso_args))) + return (error); + + if ((mp->mnt_flag & MNT_RDONLY) == 0) + return (EROFS); + + /* + * If updating, check whether changing from read-only to + * read/write; if there is no device name, that's all we do. + */ + if (mp->mnt_flag & MNT_UPDATE) { + imp = VFSTOISOFS(mp); + if (args.fspec == 0) + return (vfs_export(mp, &imp->im_export, &args.export)); + } + /* + * Not an update, or updating the name: look up the name + * and verify that it refers to a sensible block device. + */ + NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); + if (error = namei(ndp)) + return (error); + devvp = ndp->ni_vp; + + if (devvp->v_type != VBLK) { + vrele(devvp); + return ENOTBLK; + } + if (major(devvp->v_rdev) >= nblkdev) { + vrele(devvp); + return ENXIO; + } + if ((mp->mnt_flag & MNT_UPDATE) == 0) + error = iso_mountfs(devvp, mp, p, &args); + else { + if (devvp != imp->im_devvp) + error = EINVAL; /* needs translation */ + else + vrele(devvp); + } + if (error) { + vrele(devvp); + return error; + } + imp = VFSTOISOFS(mp); + (void) copyinstr(path, imp->im_fsmnt, sizeof(imp->im_fsmnt)-1, &size); + bzero(imp->im_fsmnt + size, sizeof(imp->im_fsmnt) - size); + bcopy((caddr_t)imp->im_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, + MNAMELEN); + (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + (void) cd9660_statfs(mp, &mp->mnt_stat, p); + return 0; +} + +/* + * Common code for mount and mountroot + */ +static iso_mountfs(devvp, mp, p, argp) + register struct vnode *devvp; + struct mount 
*mp; + struct proc *p; + struct iso_args *argp; +{ + register struct iso_mnt *isomp = (struct iso_mnt *)0; + struct buf *bp = NULL; + dev_t dev = devvp->v_rdev; + caddr_t base, space; + int havepart = 0, blks; + int error = EINVAL, i, size; + int needclose = 0; + int ronly = (mp->mnt_flag & MNT_RDONLY) != 0; + extern struct vnode *rootvp; + int j; + int iso_bsize; + int iso_blknum; + struct iso_volume_descriptor *vdp; + struct iso_primary_descriptor *pri; + struct iso_directory_record *rootp; + int logical_block_size; + + if (!ronly) + return EROFS; + + /* + * Disallow multiple mounts of the same device. + * Disallow mounting of a device that is currently in use + * (except for root, which might share swap device for miniroot). + * Flush out any old buffers remaining from a previous use. + */ + if (error = vfs_mountedon(devvp)) + return error; + if (vcount(devvp) > 1 && devvp != rootvp) + return EBUSY; + if (error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0)) + return (error); + + if (error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p)) + return error; + needclose = 1; + + /* This is the "logical sector size". The standard says this + * should be 2048 or the physical sector size on the device, + * whichever is greater. For now, we'll just use a constant. 
+ */ + iso_bsize = ISO_DEFAULT_BLOCK_SIZE; + + for (iso_blknum = 16; iso_blknum < 100; iso_blknum++) { + if (error = bread (devvp, btodb(iso_blknum * iso_bsize), + iso_bsize, NOCRED, &bp)) + goto out; + + vdp = (struct iso_volume_descriptor *)bp->b_un.b_addr; + if (bcmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) != 0) { + error = EINVAL; + goto out; + } + + if (isonum_711 (vdp->type) == ISO_VD_END) { + error = EINVAL; + goto out; + } + + if (isonum_711 (vdp->type) == ISO_VD_PRIMARY) + break; + brelse(bp); + } + + if (isonum_711 (vdp->type) != ISO_VD_PRIMARY) { + error = EINVAL; + goto out; + } + + pri = (struct iso_primary_descriptor *)vdp; + + logical_block_size = isonum_723 (pri->logical_block_size); + + if (logical_block_size < DEV_BSIZE || logical_block_size > MAXBSIZE + || (logical_block_size & (logical_block_size - 1)) != 0) { + error = EINVAL; + goto out; + } + + rootp = (struct iso_directory_record *)pri->root_directory_record; + + isomp = malloc(sizeof *isomp, M_ISOFSMNT, M_WAITOK); + bzero((caddr_t)isomp, sizeof *isomp); + isomp->logical_block_size = logical_block_size; + isomp->volume_space_size = isonum_733 (pri->volume_space_size); + bcopy (rootp, isomp->root, sizeof isomp->root); + isomp->root_extent = isonum_733 (rootp->extent); + isomp->root_size = isonum_733 (rootp->size); + + isomp->im_bmask = logical_block_size - 1; + isomp->im_bshift = 0; + while ((1 << isomp->im_bshift) < isomp->logical_block_size) + isomp->im_bshift++; + + bp->b_flags |= B_AGE; + brelse(bp); + bp = NULL; + + mp->mnt_data = (qaddr_t)isomp; + mp->mnt_stat.f_fsid.val[0] = (long)dev; + mp->mnt_stat.f_fsid.val[1] = MOUNT_CD9660; + mp->mnt_maxsymlinklen = 0; + mp->mnt_flag |= MNT_LOCAL; + isomp->im_mountp = mp; + isomp->im_dev = dev; + isomp->im_devvp = devvp; + + devvp->v_specflags |= SI_MOUNTEDON; + + /* Check the Rock Ridge Extention support */ + if (!(argp->flags & ISOFSMNT_NORRIP)) { + if (error = bread (isomp->im_devvp, + (isomp->root_extent + 
isonum_711(rootp->ext_attr_length)) + * isomp->logical_block_size / DEV_BSIZE, + isomp->logical_block_size,NOCRED,&bp)) + goto out; + + rootp = (struct iso_directory_record *)bp->b_un.b_addr; + + if ((isomp->rr_skip = cd9660_rrip_offset(rootp,isomp)) < 0) { + argp->flags |= ISOFSMNT_NORRIP; + } else { + argp->flags &= ~ISOFSMNT_GENS; + } + + /* + * The contents are valid, + * but they will get reread as part of another vnode, so... + */ + bp->b_flags |= B_AGE; + brelse(bp); + bp = NULL; + } + isomp->im_flags = argp->flags&(ISOFSMNT_NORRIP|ISOFSMNT_GENS|ISOFSMNT_EXTATT); + switch (isomp->im_flags&(ISOFSMNT_NORRIP|ISOFSMNT_GENS)) { + default: + isomp->iso_ftype = ISO_FTYPE_DEFAULT; + break; + case ISOFSMNT_GENS|ISOFSMNT_NORRIP: + isomp->iso_ftype = ISO_FTYPE_9660; + break; + case 0: + isomp->iso_ftype = ISO_FTYPE_RRIP; + break; + } + + return 0; +out: + if (bp) + brelse(bp); + if (needclose) + (void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, NOCRED, p); + if (isomp) { + free((caddr_t)isomp, M_ISOFSMNT); + mp->mnt_data = (qaddr_t)0; + } + return error; +} + +/* + * Make a filesystem operational. + * Nothing to do at the moment. 
+ */
+/* ARGSUSED */
+cd9660_start(mp, flags, p)
+	struct mount *mp;
+	int flags;
+	struct proc *p;
+{
+	return 0;
+}
+
+/*
+ * unmount system call
+ *
+ * A forced unmount (MNT_FORCE) is refused with EINVAL when iso_doforce
+ * is zero or the mount is the root file system; otherwise it adds
+ * FORCECLOSE to the vflush flags.  After vflush succeeds the device
+ * vnode is unmarked (SI_MOUNTEDON), closed and released, and the
+ * iso_mnt is freed.  Returns the vflush error, or the VOP_CLOSE result.
+ */
+int
+cd9660_unmount(mp, mntflags, p)
+	struct mount *mp;
+	int mntflags;
+	struct proc *p;
+{
+	register struct iso_mnt *isomp;
+	int i, error, ronly, flags = 0;
+
+	if (mntflags & MNT_FORCE) {
+		if (!iso_doforce || (mp->mnt_flag & MNT_ROOTFS))
+			return (EINVAL);
+		flags |= FORCECLOSE;
+	}
+#if 0
+	mntflushbuf(mp, 0);
+	if (mntinvalbuf(mp))
+		return EBUSY;
+#endif
+	if (error = vflush(mp, NULLVP, flags))
+		return (error);
+
+	isomp = VFSTOISOFS(mp);
+
+#ifdef ISODEVMAP
+	if (isomp->iso_ftype == ISO_FTYPE_RRIP)
+		iso_dunmap(isomp->im_dev);
+#endif
+
+	isomp->im_devvp->v_specflags &= ~SI_MOUNTEDON;
+	error = VOP_CLOSE(isomp->im_devvp, FREAD, NOCRED, p);
+	vrele(isomp->im_devvp);
+	free((caddr_t)isomp, M_ISOFSMNT);
+	mp->mnt_data = (qaddr_t)0;
+	mp->mnt_flag &= ~MNT_LOCAL;
+	return (error);
+}
+
+/*
+ * Return root of a filesystem
+ *
+ * A vnode and an iso_node on the stack (tvp, tip) are wired together
+ * only to carry the mount, the device and the root inode number into
+ * iso_iget; the vnode stored through vpp is the one belonging to the
+ * node iso_iget hands back.  Returns 0 or the iso_iget error.
+ */
+cd9660_root(mp, vpp)
+	struct mount *mp;
+	struct vnode **vpp;
+{
+	register struct iso_node *ip;
+	struct iso_node tip, *nip;
+	struct vnode tvp;
+	int error;
+	struct iso_mnt *imp = VFSTOISOFS (mp);
+	struct iso_directory_record *dp;
+
+	tvp.v_mount = mp;
+	tvp.v_data = &tip;
+	ip = VTOI(&tvp);
+	ip->i_vnode = &tvp;
+	ip->i_dev = imp->im_dev;
+	ip->i_diroff = 0;
+	dp = (struct iso_directory_record *)imp->root;
+	isodirino(&ip->i_number,dp,imp);
+
+	/*
+	 * With RRIP we must use the `.' entry of the root directory.
+	 * Simply tell iget, that it's a relocated directory.
+	 */
+	error = iso_iget(ip,ip->i_number,
+			 imp->iso_ftype == ISO_FTYPE_RRIP,
+			 &nip,dp);
+	if (error)
+		return error;
+	*vpp = ITOV(nip);
+	return 0;
+}
+
+/*
+ * Do operations associated with quotas, not supported
+ * (always returns EOPNOTSUPP)
+ */
+/* ARGSUSED */
+int
+cd9660_quotactl(mp, cmd, uid, arg, p)
+	struct mount *mp;
+	int cmd;
+	uid_t uid;
+	caddr_t arg;
+	struct proc *p;
+{
+
+	return (EOPNOTSUPP);
+}
+
+/*
+ * Get file system statistics.
+ *
+ * Block size and block count come from the iso_mnt; every free-space
+ * and file count is reported as 0.  The mount names are copied from
+ * mp->mnt_stat unless sbp already is that structure, and f_spare[0]
+ * carries the mount im_flags.  Always returns 0.
+ */
+cd9660_statfs(mp, sbp, p)
+	struct mount *mp;
+	register struct statfs *sbp;
+	struct proc *p;
+{
+	register struct iso_mnt *isomp;
+	register struct fs *fs;
+
+	isomp = VFSTOISOFS(mp);
+
+	sbp->f_type = MOUNT_CD9660;
+	sbp->f_bsize = isomp->logical_block_size;
+	sbp->f_iosize = sbp->f_bsize;	/* XXX */
+	sbp->f_blocks = isomp->volume_space_size;
+	sbp->f_bfree = 0; /* total free blocks */
+	sbp->f_bavail = 0; /* blocks free for non superuser */
+	sbp->f_files = 0; /* total files */
+	sbp->f_ffree = 0; /* free file nodes */
+	if (sbp != &mp->mnt_stat) {
+		bcopy((caddr_t)mp->mnt_stat.f_mntonname,
+			(caddr_t)&sbp->f_mntonname[0], MNAMELEN);
+		bcopy((caddr_t)mp->mnt_stat.f_mntfromname,
+			(caddr_t)&sbp->f_mntfromname[0], MNAMELEN);
+	}
+	/* Use the first spare for flags: */
+	sbp->f_spare[0] = isomp->im_flags;
+	return 0;
+}
+/*
+ * Sync entry point: does nothing and returns 0.
+ */
+/* ARGSUSED */
+int
+cd9660_sync(mp, waitfor, cred, p)
+	struct mount *mp;
+	int waitfor;
+	struct ucred *cred;
+	struct proc *p;
+{
+	return (0);
+}
+
+/*
+ * Flat namespace lookup.
+ * Currently unsupported.
+ */
+/* ARGSUSED */
+int
+cd9660_vget(mp, ino, vpp)
+	struct mount *mp;
+	ino_t ino;
+	struct vnode **vpp;
+{
+
+	return (EOPNOTSUPP);
+}
+
+/*
+ * File handle to vnode
+ *
+ * Have to be really careful about stale file handles:
+ * - check that the inode number is in range
+ * - call iget() to get the locked inode
+ * - check for an unallocated inode (i_mode == 0)
+ * - check that the generation number matches
+ *
+ * What cd9660_fhtovp below actually checks, in order: the client is in
+ * the export list (EACCES otherwise); the block holding ifid_ino lies
+ * inside the volume; a minimum-size and then the actual directory
+ * record fit inside that block; extent + ext_attr_length of the record
+ * equals ifid_start; iso_iget succeeds; iso_mode is non-zero.  Every
+ * failed consistency check returns ESTALE.  No generation number is
+ * compared (see the XXX in the body).
+ *
+ * struct ifid is the layout this file system overlays on struct fid.
+ *
+ * NOTE(review): dirp points into the buffer bp and is handed to
+ * iso_iget after brelse(bp) -- confirm that iso_iget does not rely on
+ * the buffer contents still being intact.
+ * NOTE(review): ifid_start is a long but is printed with %d in the
+ * "file start miss" message.
+ */
+
+struct ifid {
+	ushort ifid_len;
+	ushort ifid_pad;
+	int ifid_ino;
+	long ifid_start;
+};
+
+/* ARGSUSED */
+int
+cd9660_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp)
+	register struct mount *mp;
+	struct fid *fhp;
+	struct mbuf *nam;
+	struct vnode **vpp;
+	int *exflagsp;
+	struct ucred **credanonp;
+{
+	struct vnode tvp;
+	int error;
+	int lbn, off;
+	struct ifid *ifhp;
+	struct iso_mnt *imp;
+	struct buf *bp;
+	struct iso_directory_record *dirp;
+	struct iso_node tip, *ip, *nip;
+	struct netcred *np;
+
+	imp = VFSTOISOFS (mp);
+	ifhp = (struct ifid *)fhp;
+
+#ifdef ISOFS_DBG
+	printf("fhtovp: ino %d, start %ld\n",
+	       ifhp->ifid_ino, ifhp->ifid_start);
+#endif
+
+	np = vfs_export_lookup(mp, &imp->im_export, nam);
+	if (np == NULL)
+		return (EACCES);
+
+	lbn = iso_lblkno(imp, ifhp->ifid_ino);
+	if (lbn >= imp->volume_space_size) {
+		printf("fhtovp: lbn exceed volume space %d\n", lbn);
+		return (ESTALE);
+	}
+
+	off = iso_blkoff(imp, ifhp->ifid_ino);
+	if (off + ISO_DIRECTORY_RECORD_SIZE > imp->logical_block_size) {
+		printf("fhtovp: crosses block boundary %d\n",
+		       off + ISO_DIRECTORY_RECORD_SIZE);
+		return (ESTALE);
+	}
+
+	error = bread(imp->im_devvp, btodb(lbn * imp->logical_block_size),
+		      imp->logical_block_size, NOCRED, &bp);
+	if (error) {
+		printf("fhtovp: bread error %d\n",error);
+		brelse(bp);
+		return (error);
+	}
+
+	dirp = (struct iso_directory_record *)(bp->b_un.b_addr + off);
+	if (off + isonum_711(dirp->length) > imp->logical_block_size) {
+		brelse(bp);
+		printf("fhtovp: directory crosses block boundary %d[off=%d/len=%d]\n",
+		       off+isonum_711(dirp->length), off,
+		       isonum_711(dirp->length));
+		return (ESTALE);
+	}
+
+	if (isonum_733(dirp->extent) + isonum_711(dirp->ext_attr_length) !=
+	    ifhp->ifid_start) {
+		brelse(bp);
+		printf("fhtovp: file start miss %d vs %d\n",
+		       isonum_733(dirp->extent)+isonum_711(dirp->ext_attr_length),
+		       ifhp->ifid_start);
+		return (ESTALE);
+	}
+	brelse(bp);
+
+	ip = &tip;
+	tvp.v_mount = mp;
+	tvp.v_data = ip;
+	ip->i_vnode = &tvp;
+	ip->i_dev = imp->im_dev;
+	if (error = iso_iget(ip, ifhp->ifid_ino, 0, &nip, dirp)) {
+		*vpp = NULLVP;
+		printf("fhtovp: failed to get inode\n");
+		return (error);
+	}
+	ip = nip;
+	/*
+	 * XXX need generation number?
+	 */
+	if (ip->inode.iso_mode == 0) {
+		iso_iput(ip);
+		*vpp = NULLVP;
+		printf("fhtovp: inode mode == 0\n");
+		return (ESTALE);
+	}
+	*vpp = ITOV(ip);
+	*exflagsp = np->netc_exflags;
+	*credanonp = &np->netc_anon;
+	return 0;
+}
+
+/*
+ * Vnode pointer to File handle
+ *
+ * Fills the ifid overlay of fhp: ifid_len is sizeof(struct ifid),
+ * ifid_ino the node i_number and ifid_start the node iso_start.
+ * ifid_pad is not written.  Always returns 0.
+ */
+/* ARGSUSED */
+cd9660_vptofh(vp, fhp)
+	struct vnode *vp;
+	struct fid *fhp;
+{
+	register struct iso_node *ip = VTOI(vp);
+	register struct ifid *ifhp;
+	register struct iso_mnt *mp = ip->i_mnt;
+
+	ifhp = (struct ifid *)fhp;
+	ifhp->ifid_len = sizeof(struct ifid);
+
+	ifhp->ifid_ino = ip->i_number;
+	ifhp->ifid_start = ip->iso_start;
+
+#ifdef ISOFS_DBG
+	printf("vptofh: ino %d, start %ld\n",
+	       ifhp->ifid_ino,ifhp->ifid_start);
+#endif
+	return 0;
+}
diff --git a/sys/isofs/cd9660/cd9660_vnops.c b/sys/isofs/cd9660/cd9660_vnops.c
new file mode 100644
index 00000000000..59f5a73f5c8
--- /dev/null
+++ b/sys/isofs/cd9660/cd9660_vnops.c
@@ -0,0 +1,1038 @@
+/*-
+ * Copyright (c) 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley
+ * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension
+ * Support code is derived from software contributed to Berkeley
+ * by Atsushi Murai (amurai@spec.co.jp).
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * @(#)cd9660_vnops.c 8.3 (Berkeley) 1/23/94
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#if 0
+/*
+ * Mknod vnode call
+ * Actually remap the device number
+ *
+ * The whole function is compiled out by the surrounding #if 0.
+ * Without ISODEVMAP it only releases the lookup state and returns
+ * EINVAL.  With ISODEVMAP it accepts only VCHR/VBLK nodes on a Rock
+ * Ridge mount whose type matches vap->va_type, then either deletes the
+ * device mapping (new rdev equals the on-disc one, or is VNOVAL) or
+ * records vap->va_rdev in it, and finally discards the vnode so that it
+ * is reloaded.
+ */
+cd9660_mknod(ndp, vap, cred, p)
+	struct nameidata *ndp;
+	struct ucred *cred;
+	struct vattr *vap;
+	struct proc *p;
+{
+#ifndef ISODEVMAP
+	free(ndp->ni_pnbuf, M_NAMEI);
+	vput(ndp->ni_dvp);
+	vput(ndp->ni_vp);
+	return EINVAL;
+#else
+	register struct vnode *vp;
+	struct iso_node *ip;
+	struct iso_dnode *dp;
+	int error;
+
+	vp = ndp->ni_vp;
+	ip = VTOI(vp);
+
+	if (ip->i_mnt->iso_ftype != ISO_FTYPE_RRIP
+	    || vap->va_type != vp->v_type
+	    || (vap->va_type != VCHR && vap->va_type != VBLK)) {
+		free(ndp->ni_pnbuf, M_NAMEI);
+		vput(ndp->ni_dvp);
+		vput(ndp->ni_vp);
+		return EINVAL;
+	}
+
+	dp = iso_dmap(ip->i_dev,ip->i_number,1);
+	if (ip->inode.iso_rdev == vap->va_rdev || vap->va_rdev == VNOVAL) {
+		/* same as the unmapped one, delete the mapping */
+		remque(dp);
+		FREE(dp,M_CACHE);
+	} else
+		/* enter new mapping */
+		dp->d_dev = vap->va_rdev;
+
+	/*
+	 * Remove inode so that it will be reloaded by iget and
+	 * checked to see if it is an alias of an existing entry
+	 * in the inode cache.
+	 */
+	vput(vp);
+	vp->v_type = VNON;
+	vgone(vp);
+	return (0);
+#endif
+}
+#endif
+
+/*
+ * Open called.
+ *
+ * Nothing to do.
+ */
+/* ARGSUSED */
+int
+cd9660_open(ap)
+	struct vop_open_args /* {
+		struct vnode *a_vp;
+		int a_mode;
+		struct ucred *a_cred;
+		struct proc *a_p;
+	} */ *ap;
+{
+	return (0);
+}
+
+/*
+ * Close called
+ *
+ * Update the times on the inode on writeable file systems.
+ *
+ * NOTE: the sentence above does not describe this implementation; the
+ * body performs no update and simply returns 0.
+ */
+/* ARGSUSED */
+int
+cd9660_close(ap)
+	struct vop_close_args /* {
+		struct vnode *a_vp;
+		int a_fflag;
+		struct ucred *a_cred;
+		struct proc *a_p;
+	} */ *ap;
+{
+	return (0);
+}
+
+/*
+ * Check mode permission on inode pointer. Mode is READ, WRITE or EXEC.
+ * The mode is shifted to select the owner/group/other fields. The
+ * super user is granted all permissions.
+ *
+ * NOTE: the description above is not what the body does; as written,
+ * cd9660_access performs no check at all and always returns 0.
+ */
+/* ARGSUSED */
+cd9660_access(ap)
+	struct vop_access_args /* {
+		struct vnode *a_vp;
+		int a_mode;
+		struct ucred *a_cred;
+		struct proc *a_p;
+	} */ *ap;
+{
+	return (0);
+}
+/*
+ * Get attributes: copies mode, link count, owner, times and rdev from
+ * the in-core iso inode into *a_vap.  va_size and va_bytes are both the
+ * node i_size, va_blocksize is the logical block size of the mount,
+ * va_flags is 0 and va_gen is 1.  Always returns 0.
+ */
+cd9660_getattr(ap)
+	struct vop_getattr_args /* {
+		struct vnode *a_vp;
+		struct vattr *a_vap;
+		struct ucred *a_cred;
+		struct proc *a_p;
+	} */ *ap;
+
+{
+	struct vnode *vp = ap->a_vp;
+	register struct vattr *vap = ap->a_vap;
+	register struct iso_node *ip = VTOI(vp);
+	int i;
+
+	vap->va_fsid = ip->i_dev;
+	vap->va_fileid = ip->i_number;
+
+	vap->va_mode = ip->inode.iso_mode;
+	vap->va_nlink = ip->inode.iso_links;
+	vap->va_uid = ip->inode.iso_uid;
+	vap->va_gid = ip->inode.iso_gid;
+	vap->va_atime = ip->inode.iso_atime;
+	vap->va_mtime = ip->inode.iso_mtime;
+	vap->va_ctime = ip->inode.iso_ctime;
+	vap->va_rdev = ip->inode.iso_rdev;
+
+	vap->va_size = (u_quad_t) ip->i_size;
+	vap->va_flags = 0;
+	vap->va_gen = 1;
+	vap->va_blocksize = ip->i_mnt->logical_block_size;
+	vap->va_bytes = (u_quad_t) ip->i_size;
+	vap->va_type = vp->v_type;
+	return (0);
+}
+/*
+ * doclusterread selects the read path used by cd9660_read: it is the
+ * constant 0 when ISO_DEFAULT_BLOCK_SIZE is smaller than NBPG, otherwise
+ * the constant 1, or an external variable in DEBUG kernels.
+ */
+#if ISO_DEFAULT_BLOCK_SIZE >= NBPG
+#ifdef DEBUG
+extern int doclusterread;
+#else
+#define doclusterread 1
+#endif
+#else
+/* XXX until cluster routines can handle block sizes less than one page */
+#define doclusterread 0
+#endif
+
+/*
+ * Vnode op for reading.
+ */
+cd9660_read(ap)
+	struct vop_read_args /* {
+		struct vnode *a_vp;
+		struct uio *a_uio;
+		int a_ioflag;
+		struct ucred *a_cred;
+	} */ *ap;
+{
+	struct vnode *vp = ap->a_vp;
+	register struct uio *uio = ap->a_uio;
+	register struct iso_node *ip = VTOI(vp);
+	register struct iso_mnt *imp;
+	struct buf *bp;
+	daddr_t lbn, bn, rablock;
+	off_t diff;
+	int rasize, error = 0;
+	long size, n, on;
+
+	if (uio->uio_resid == 0)
+		return (0);
+	if (uio->uio_offset < 0)
+		return (EINVAL);
+	ip->i_flag |= IACC;
+	imp = ip->i_mnt;
+	/* one pass per logical block touched by the request */
+	do {
+		lbn = iso_lblkno(imp, uio->uio_offset);
+		on = iso_blkoff(imp, uio->uio_offset);
+		n = min((unsigned)(imp->logical_block_size - on),
+			uio->uio_resid);
+		diff = (off_t)ip->i_size - uio->uio_offset;
+		if (diff <= 0)	/* at or past end of file */
+			return (0);
+		if (diff < n)	/* do not copy past end of file */
+			n = diff;
+		size = iso_blksize(imp, ip, lbn);
+		rablock = lbn + 1;
+		if (doclusterread) {
+			if (iso_lblktosize(imp, rablock) <= ip->i_size)
+				error = cluster_read(vp, (off_t)ip->i_size,
+						     lbn, size, NOCRED, &bp);
+			else
+				error = bread(vp, lbn, size, NOCRED, &bp);
+		} else {
+			/* read ahead one block when the access is sequential */
+			if (vp->v_lastr + 1 == lbn &&
+			    iso_lblktosize(imp, rablock) < ip->i_size) {
+				rasize = iso_blksize(imp, ip, rablock);
+				error = breadn(vp, lbn, size, &rablock,
+					       &rasize, 1, NOCRED, &bp);
+			} else
+				error = bread(vp, lbn, size, NOCRED, &bp);
+		}
+		vp->v_lastr = lbn;
+		n = min(n, size - bp->b_resid);	/* bp is used before error is tested */
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+
+		error = uiomove(bp->b_un.b_addr + on, (int)n, uio);
+		if (n + on == imp->logical_block_size ||
+		    uio->uio_offset == (off_t)ip->i_size)
+			bp->b_flags |= B_AGE;
+		brelse(bp);
+	} while (error == 0 && uio->uio_resid > 0 && n != 0);
+	return (error);
+}
+
+/* ARGSUSED */
+int
+cd9660_ioctl(ap)
+	struct vop_ioctl_args /* {
+		struct vnode *a_vp;
+		int a_command;
+		caddr_t a_data;
+		int a_fflag;
+		struct ucred *a_cred;
+		struct proc *a_p;
+	} */ *ap;
+{
+	printf("You did ioctl for isofs !!\n");
+	return (ENOTTY);
+}
+
+/* ARGSUSED */
+int
+cd9660_select(ap)
+	struct vop_select_args /* {
+		struct vnode *a_vp;
+		int a_which;
+		int a_fflags;
+		struct ucred *a_cred;
+		struct proc *a_p;
+	} */ *ap;
+{
+
+	/*
+	 * We should really check to see if I/O is possible.
+	 */
+	return (1);
+}
+
+/*
+ * Mmap a file
+ *
+ * NB Currently unsupported.
+ */
+/* ARGSUSED */
+int
+cd9660_mmap(ap)
+	struct vop_mmap_args /* {
+		struct vnode *a_vp;
+		int a_fflags;
+		struct ucred *a_cred;
+		struct proc *a_p;
+	} */ *ap;
+{
+
+	return (EINVAL);
+}
+
+/*
+ * Seek on a file
+ *
+ * Nothing to do, so just return.
+ */
+/* ARGSUSED */
+int
+cd9660_seek(ap)
+	struct vop_seek_args /* {
+		struct vnode *a_vp;
+		off_t a_oldoff;
+		off_t a_newoff;
+		struct ucred *a_cred;
+	} */ *ap;
+{
+
+	return (0);
+}
+
+/*
+ * Structure for reading directories
+ *
+ * saveent / assocent hold back one regular and one associated entry
+ * (see iso_shipdir); current is the entry being assembled.  The three
+ * off_t members next to them record the directory offset that goes
+ * with each of those entries.  uio is the destination, uio_off the
+ * offset passed with the last entry that was copied out, cookiep and
+ * ncookies an optional offset-cookie array, and eof is cleared by
+ * iso_uiodir when an entry could not be delivered.
+ */
+struct isoreaddir {
+	struct dirent saveent;
+	struct dirent assocent;
+	struct dirent current;
+	off_t saveoff;
+	off_t assocoff;
+	off_t curroff;
+	struct uio *uio;
+	off_t uio_off;
+	u_int *cookiep;
+	int ncookies;
+	int eof;
+};
+/*
+ * Copy one directory entry out to the caller.  The name is NUL
+ * terminated and d_reclen computed first.  Returns -1 (with eof
+ * cleared) when the entry does not fit in the remaining uio space or
+ * when cookies are in use and none are left; otherwise stores off as a
+ * cookie if requested, moves the entry with uiomove and returns its
+ * error, or records off in uio_off and returns 0.
+ */
+static int
+iso_uiodir(idp,dp,off)
+	struct isoreaddir *idp;
+	struct dirent *dp;
+	off_t off;
+{
+	int error;
+
+	dp->d_name[dp->d_namlen] = 0;
+	dp->d_reclen = DIRSIZ(dp);
+
+	if (idp->uio->uio_resid < dp->d_reclen) {
+		idp->eof = 0;
+		return -1;
+	}
+
+	if (idp->cookiep) {
+		if (idp->ncookies <= 0) {
+			idp->eof = 0;
+			return -1;
+		}
+
+		*idp->cookiep++ = off;
+		--idp->ncookies;
+	}
+
+	if (error = uiomove(dp,dp->d_reclen,idp->uio))
+		return error;
+	idp->uio_off = off;
+	return 0;
+}
+/*
+ * Queue the entry in idp->current.  The name of the held entry
+ * (saveent, or assocent without its leading ASSOCCHAR when saveent is
+ * empty) is compared with the name of current (also without a leading
+ * ASSOCCHAR).  When they differ, the held associated and regular
+ * entries are copied out through iso_uiodir and marked empty.  Then
+ * current is stored in assocent if its name started with ASSOCCHAR,
+ * otherwise in saveent, together with curroff.  Returns 0, or the
+ * first non-zero result from iso_uiodir.
+ */
+static int
+iso_shipdir(idp)
+	struct isoreaddir *idp;
+{
+	struct dirent *dp;
+	int cl, sl, assoc;
+	int error;
+	char *cname, *sname;
+
+	cl = idp->current.d_namlen;
+	cname = idp->current.d_name;
+	if (assoc = cl > 1 && *cname == ASSOCCHAR) {
+		cl--;
+		cname++;
+	}
+
+	dp = &idp->saveent;
+	sname = dp->d_name;
+	if (!(sl = dp->d_namlen)) {
+		dp = &idp->assocent;
+		sname = dp->d_name + 1;
+		sl = dp->d_namlen - 1;
+	}
+	if (sl > 0) {
+		if (sl != cl
+		    || bcmp(sname,cname,sl)) {
+			if (idp->assocent.d_namlen) {
+				if (error = iso_uiodir(idp,&idp->assocent,idp->assocoff))
+					return error;
+				idp->assocent.d_namlen = 0;
+			}
+			if (idp->saveent.d_namlen) {
+				if (error = iso_uiodir(idp,&idp->saveent,idp->saveoff))
+					return error;
+				idp->saveent.d_namlen = 0;
+			}
+		}
+	}
+	idp->current.d_reclen = DIRSIZ(&idp->current);
+	if (assoc) {
+		idp->assocoff = idp->curroff;
+		bcopy(&idp->current,&idp->assocent,idp->current.d_reclen);
+	} else {
+		idp->saveoff = idp->curroff;
+		bcopy(&idp->current,&idp->saveent,idp->current.d_reclen);
+	}
+	return 0;
+}
+
+/*
+ * Vnode op for readdir
+ * XXX make sure everything still works now that eofflagp and cookiep
+ * are no longer args.
+ */
+int
+cd9660_readdir(ap)
+	struct vop_readdir_args /* {
+		struct vnode *a_vp;
+		struct uio *a_uio;
+		struct ucred *a_cred;
+	} */ *ap;
+{
+	register struct uio *uio = ap->a_uio;
+	struct isoreaddir *idp;
+	int entryoffsetinblock;
+	int error = 0;
+	int endsearch;
+	struct iso_directory_record *ep;
+	u_short elen;
+	int reclen;
+	struct iso_mnt *imp;
+	struct iso_node *ip;
+	struct buf *bp = NULL;
+
+	ip = VTOI(ap->a_vp);
+	imp = ip->i_mnt;
+
+	MALLOC(idp,struct isoreaddir *,sizeof(*idp),M_TEMP,M_WAITOK);
+	idp->saveent.d_namlen = 0;
+	idp->assocent.d_namlen = 0;
+	idp->uio = uio;
+#if 0
+	idp->cookiep = cookies;
+	idp->ncookies = ncookies;
+	idp->eof = 1;
+#else
+	idp->cookiep = 0;
+#endif
+	idp->curroff = uio->uio_offset;
+
+	entryoffsetinblock = iso_blkoff(imp, idp->curroff);
+	if (entryoffsetinblock != 0) {
+		if (error = iso_blkatoff(ip, idp->curroff, &bp)) {
+			FREE(idp,M_TEMP);
+			return (error);
+		}
+	}
+
+	endsearch = ip->i_size;
+
+	while (idp->curroff < endsearch) {
+		/*
+		 * If offset is on a block boundary,
+		 * read the next directory block.
+		 * Release previous if it exists.
+ */ + + if (iso_blkoff(imp, idp->curroff) == 0) { + if (bp != NULL) + brelse(bp); + if (error = iso_blkatoff(ip, idp->curroff, &bp)) + break; + entryoffsetinblock = 0; + } + /* + * Get pointer to next entry. + */ + + ep = (struct iso_directory_record *) + (bp->b_un.b_addr + entryoffsetinblock); + + reclen = isonum_711 (ep->length); + if (reclen == 0) { + /* skip to next block, if any */ + idp->curroff = roundup (idp->curroff, + imp->logical_block_size); + continue; + } + + if (reclen < ISO_DIRECTORY_RECORD_SIZE) { + error = EINVAL; + /* illegal entry, stop */ + break; + } + + if (entryoffsetinblock + reclen > imp->logical_block_size) { + error = EINVAL; + /* illegal directory, so stop looking */ + break; + } + + idp->current.d_namlen = isonum_711 (ep->name_len); + if (isonum_711(ep->flags)&2) + isodirino(&idp->current.d_fileno,ep,imp); + else + idp->current.d_fileno = dbtob(bp->b_blkno) + + idp->curroff; + + if (reclen < ISO_DIRECTORY_RECORD_SIZE + idp->current.d_namlen) { + error = EINVAL; + /* illegal entry, stop */ + break; + } + + idp->curroff += reclen; + /* + * + */ + switch (imp->iso_ftype) { + case ISO_FTYPE_RRIP: + cd9660_rrip_getname(ep,idp->current.d_name, + (u_short *)&idp->current.d_namlen, + &idp->current.d_fileno,imp); + if (idp->current.d_namlen) + error = iso_uiodir(idp,&idp->current,idp->curroff); + break; + default: /* ISO_FTYPE_DEFAULT || ISO_FTYPE_9660 */ + strcpy(idp->current.d_name,".."); + switch (ep->name[0]) { + case 0: + idp->current.d_namlen = 1; + error = iso_uiodir(idp,&idp->current,idp->curroff); + break; + case 1: + idp->current.d_namlen = 2; + error = iso_uiodir(idp,&idp->current,idp->curroff); + break; + default: + isofntrans(ep->name,idp->current.d_namlen, + idp->current.d_name, &elen, + imp->iso_ftype == ISO_FTYPE_9660, + isonum_711(ep->flags)&4); + idp->current.d_namlen = (u_char)elen; + if (imp->iso_ftype == ISO_FTYPE_DEFAULT) + error = iso_shipdir(idp); + else + error = iso_uiodir(idp,&idp->current,idp->curroff); + break; + } 
+ } + if (error) + break; + + entryoffsetinblock += reclen; + } + + if (!error && imp->iso_ftype == ISO_FTYPE_DEFAULT) { + idp->current.d_namlen = 0; + error = iso_shipdir(idp); + } + if (error < 0) + error = 0; + + if (bp) + brelse (bp); + + uio->uio_offset = idp->uio_off; +#if 0 + *eofflagp = idp->eof; +#endif + + FREE(idp,M_TEMP); + + return (error); +} + +/* + * Return target name of a symbolic link + * Shouldn't we get the parent vnode and read the data from there? + * This could eventually result in deadlocks in cd9660_lookup. + * But otherwise the block read here is in the block buffer two times. + */ +typedef struct iso_directory_record ISODIR; +typedef struct iso_node ISONODE; +typedef struct iso_mnt ISOMNT; +int +cd9660_readlink(ap) + struct vop_readlink_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + ISONODE *ip; + ISODIR *dirp; + ISOMNT *imp; + struct buf *bp; + u_short symlen; + int error; + char *symname; + ino_t ino; + + ip = VTOI(ap->a_vp); + imp = ip->i_mnt; + + if (imp->iso_ftype != ISO_FTYPE_RRIP) + return EINVAL; + + /* + * Get parents directory record block that this inode included. + */ + error = bread(imp->im_devvp, + (daddr_t)(ip->i_number / DEV_BSIZE), + imp->logical_block_size, + NOCRED, + &bp); + if (error) { + brelse(bp); + return EINVAL; + } + + /* + * Setup the directory pointer for this inode + */ + dirp = (ISODIR *)(bp->b_un.b_addr + (ip->i_number & imp->im_bmask)); +#ifdef DEBUG + printf("lbn=%d,off=%d,bsize=%d,DEV_BSIZE=%d, dirp= %08x, b_addr=%08x, offset=%08x(%08x)\n", + (daddr_t)(ip->i_number >> imp->im_bshift), + ip->i_number & imp->im_bmask, + imp->logical_block_size, + DEV_BSIZE, + dirp, + bp->b_un.b_addr, + ip->i_number, + ip->i_number & imp->im_bmask ); +#endif + + /* + * Just make sure, we have a right one.... 
+ * 1: Check not cross boundary on block + */ + if ((ip->i_number & imp->im_bmask) + isonum_711(dirp->length) + > imp->logical_block_size) { + brelse(bp); + return EINVAL; + } + + /* + * Now get a buffer + * Abuse a namei buffer for now. + */ + MALLOC(symname,char *,MAXPATHLEN,M_NAMEI,M_WAITOK); + + /* + * Ok, we just gathering a symbolic name in SL record. + */ + if (cd9660_rrip_getsymname(dirp,symname,&symlen,imp) == 0) { + FREE(symname,M_NAMEI); + brelse(bp); + return EINVAL; + } + /* + * Don't forget before you leave from home ;-) + */ + brelse(bp); + + /* + * return with the symbolic name to caller's. + */ + error = uiomove(symname,symlen,ap->a_uio); + + FREE(symname,M_NAMEI); + + return error; +} + +/* + * Ufs abort op, called after namei() when a CREATE/DELETE isn't actually + * done. If a buffer has been saved in anticipation of a CREATE, delete it. + */ +int +cd9660_abortop(ap) + struct vop_abortop_args /* { + struct vnode *a_dvp; + struct componentname *a_cnp; + } */ *ap; +{ + if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) + FREE(ap->a_cnp->cn_pnbuf, M_NAMEI); + return 0; +} + +/* + * Lock an inode. + */ +int +cd9660_lock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct iso_node *ip = VTOI(ap->a_vp); + + ISO_ILOCK(ip); + return 0; +} + +/* + * Unlock an inode. + */ +int +cd9660_unlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct iso_node *ip = VTOI(ap->a_vp); + + if (!(ip->i_flag & ILOCKED)) + panic("cd9660_unlock NOT LOCKED"); + ISO_IUNLOCK(ip); + return 0; +} + +/* + * Check for a locked inode. + */ +int +cd9660_islocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + if (VTOI(ap->a_vp)->i_flag & ILOCKED) + return 1; + return 0; +} + +/* + * Calculate the logical to physical mapping if not done already, + * then call the device strategy routine. 
+ */ +int +cd9660_strategy(ap) + struct vop_strategy_args /* { + struct buf *a_bp; + } */ *ap; +{ + register struct buf *bp = ap->a_bp; + register struct vnode *vp = bp->b_vp; + register struct iso_node *ip; + int error; + + ip = VTOI(vp); + if (vp->v_type == VBLK || vp->v_type == VCHR) + panic("cd9660_strategy: spec"); + if (bp->b_blkno == bp->b_lblkno) { + if (error = + VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL)) { + bp->b_error = error; + bp->b_flags |= B_ERROR; + biodone(bp); + return (error); + } + if ((long)bp->b_blkno == -1) + clrbuf(bp); + } + if ((long)bp->b_blkno == -1) { + biodone(bp); + return (0); + } + vp = ip->i_devvp; + bp->b_dev = vp->v_rdev; + VOCALL (vp->v_op, VOFFSET(vop_strategy), ap); + return (0); +} + +/* + * Print out the contents of an inode. + */ +int +cd9660_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + printf("tag VT_ISOFS, isofs vnode\n"); + return 0; +} + +/* + * Unsupported operation + */ +int +cd9660_enotsupp() +{ + + return (EOPNOTSUPP); +} + +/* + * Global vfs data structures for isofs + */ +#define cd9660_create \ + ((int (*) __P((struct vop_create_args *)))cd9660_enotsupp) +#define cd9660_mknod ((int (*) __P((struct vop_mknod_args *)))cd9660_enotsupp) +#define cd9660_setattr \ + ((int (*) __P((struct vop_setattr_args *)))cd9660_enotsupp) +#define cd9660_write ((int (*) __P((struct vop_write_args *)))cd9660_enotsupp) +#define cd9660_fsync ((int (*) __P((struct vop_fsync_args *)))nullop) +#define cd9660_remove \ + ((int (*) __P((struct vop_remove_args *)))cd9660_enotsupp) +#define cd9660_link ((int (*) __P((struct vop_link_args *)))cd9660_enotsupp) +#define cd9660_rename \ + ((int (*) __P((struct vop_rename_args *)))cd9660_enotsupp) +#define cd9660_mkdir ((int (*) __P((struct vop_mkdir_args *)))cd9660_enotsupp) +#define cd9660_rmdir ((int (*) __P((struct vop_rmdir_args *)))cd9660_enotsupp) +#define cd9660_symlink \ + ((int (*) __P((struct vop_symlink_args *)))cd9660_enotsupp) +#define 
cd9660_pathconf \ + ((int (*) __P((struct vop_pathconf_args *)))cd9660_enotsupp) +#define cd9660_advlock \ + ((int (*) __P((struct vop_advlock_args *)))cd9660_enotsupp) +#define cd9660_blkatoff \ + ((int (*) __P((struct vop_blkatoff_args *)))cd9660_enotsupp) +#define cd9660_valloc ((int(*) __P(( \ + struct vnode *pvp, \ + int mode, \ + struct ucred *cred, \ + struct vnode **vpp))) cd9660_enotsupp) +#define cd9660_vfree ((int (*) __P((struct vop_vfree_args *)))cd9660_enotsupp) +#define cd9660_truncate \ + ((int (*) __P((struct vop_truncate_args *)))cd9660_enotsupp) +#define cd9660_update \ + ((int (*) __P((struct vop_update_args *)))cd9660_enotsupp) +#define cd9660_bwrite \ + ((int (*) __P((struct vop_bwrite_args *)))cd9660_enotsupp) + +/* + * Global vfs data structures for nfs + */ +int (**cd9660_vnodeop_p)(); +struct vnodeopv_entry_desc cd9660_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, cd9660_lookup }, /* lookup */ + { &vop_create_desc, cd9660_create }, /* create */ + { &vop_mknod_desc, cd9660_mknod }, /* mknod */ + { &vop_open_desc, cd9660_open }, /* open */ + { &vop_close_desc, cd9660_close }, /* close */ + { &vop_access_desc, cd9660_access }, /* access */ + { &vop_getattr_desc, cd9660_getattr }, /* getattr */ + { &vop_setattr_desc, cd9660_setattr }, /* setattr */ + { &vop_read_desc, cd9660_read }, /* read */ + { &vop_write_desc, cd9660_write }, /* write */ + { &vop_ioctl_desc, cd9660_ioctl }, /* ioctl */ + { &vop_select_desc, cd9660_select }, /* select */ + { &vop_mmap_desc, cd9660_mmap }, /* mmap */ + { &vop_fsync_desc, cd9660_fsync }, /* fsync */ + { &vop_seek_desc, cd9660_seek }, /* seek */ + { &vop_remove_desc, cd9660_remove }, /* remove */ + { &vop_link_desc, cd9660_link }, /* link */ + { &vop_rename_desc, cd9660_rename }, /* rename */ + { &vop_mkdir_desc, cd9660_mkdir }, /* mkdir */ + { &vop_rmdir_desc, cd9660_rmdir }, /* rmdir */ + { &vop_symlink_desc, cd9660_symlink }, /* symlink */ + { &vop_readdir_desc, 
cd9660_readdir }, /* readdir */ + { &vop_readlink_desc, cd9660_readlink },/* readlink */ + { &vop_abortop_desc, cd9660_abortop }, /* abortop */ + { &vop_inactive_desc, cd9660_inactive },/* inactive */ + { &vop_reclaim_desc, cd9660_reclaim }, /* reclaim */ + { &vop_lock_desc, cd9660_lock }, /* lock */ + { &vop_unlock_desc, cd9660_unlock }, /* unlock */ + { &vop_bmap_desc, cd9660_bmap }, /* bmap */ + { &vop_strategy_desc, cd9660_strategy },/* strategy */ + { &vop_print_desc, cd9660_print }, /* print */ + { &vop_islocked_desc, cd9660_islocked },/* islocked */ + { &vop_pathconf_desc, cd9660_pathconf },/* pathconf */ + { &vop_advlock_desc, cd9660_advlock }, /* advlock */ + { &vop_blkatoff_desc, cd9660_blkatoff },/* blkatoff */ + { &vop_valloc_desc, cd9660_valloc }, /* valloc */ + { &vop_vfree_desc, cd9660_vfree }, /* vfree */ + { &vop_truncate_desc, cd9660_truncate },/* truncate */ + { &vop_update_desc, cd9660_update }, /* update */ + { &vop_bwrite_desc, vn_bwrite }, + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc cd9660_vnodeop_opv_desc = + { &cd9660_vnodeop_p, cd9660_vnodeop_entries }; + +/* + * Special device vnode ops + */ +int (**cd9660_specop_p)(); +struct vnodeopv_entry_desc cd9660_specop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, spec_lookup }, /* lookup */ + { &vop_create_desc, cd9660_create }, /* create */ + { &vop_mknod_desc, cd9660_mknod }, /* mknod */ + { &vop_open_desc, spec_open }, /* open */ + { &vop_close_desc, spec_close }, /* close */ + { &vop_access_desc, cd9660_access }, /* access */ + { &vop_getattr_desc, cd9660_getattr }, /* getattr */ + { &vop_setattr_desc, cd9660_setattr }, /* setattr */ + { &vop_read_desc, spec_read }, /* read */ + { &vop_write_desc, spec_write }, /* write */ + { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ + { &vop_select_desc, spec_select }, /* select */ + { &vop_mmap_desc, spec_mmap }, /* mmap */ + { &vop_fsync_desc, spec_fsync }, /* fsync */ + { &vop_seek_desc, 
spec_seek }, /* seek */ + { &vop_remove_desc, cd9660_remove }, /* remove */ + { &vop_link_desc, cd9660_link }, /* link */ + { &vop_rename_desc, cd9660_rename }, /* rename */ + { &vop_mkdir_desc, cd9660_mkdir }, /* mkdir */ + { &vop_rmdir_desc, cd9660_rmdir }, /* rmdir */ + { &vop_symlink_desc, cd9660_symlink }, /* symlink */ + { &vop_readdir_desc, spec_readdir }, /* readdir */ + { &vop_readlink_desc, spec_readlink }, /* readlink */ + { &vop_abortop_desc, spec_abortop }, /* abortop */ + { &vop_inactive_desc, cd9660_inactive },/* inactive */ + { &vop_reclaim_desc, cd9660_reclaim }, /* reclaim */ + { &vop_lock_desc, cd9660_lock }, /* lock */ + { &vop_unlock_desc, cd9660_unlock }, /* unlock */ + { &vop_bmap_desc, spec_bmap }, /* bmap */ + /* XXX strategy: panics, should be notsupp instead? */ + { &vop_strategy_desc, cd9660_strategy },/* strategy */ + { &vop_print_desc, cd9660_print }, /* print */ + { &vop_islocked_desc, cd9660_islocked },/* islocked */ + { &vop_pathconf_desc, spec_pathconf }, /* pathconf */ + { &vop_advlock_desc, spec_advlock }, /* advlock */ + { &vop_blkatoff_desc, spec_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, spec_valloc }, /* valloc */ + { &vop_vfree_desc, spec_vfree }, /* vfree */ + { &vop_truncate_desc, spec_truncate }, /* truncate */ + { &vop_update_desc, cd9660_update }, /* update */ + { &vop_bwrite_desc, vn_bwrite }, + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc cd9660_specop_opv_desc = + { &cd9660_specop_p, cd9660_specop_entries }; + +#ifdef FIFO +int (**cd9660_fifoop_p)(); +struct vnodeopv_entry_desc cd9660_fifoop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, fifo_lookup }, /* lookup */ + { &vop_create_desc, cd9660_create }, /* create */ + { &vop_mknod_desc, cd9660_mknod }, /* mknod */ + { &vop_open_desc, fifo_open }, /* open */ + { &vop_close_desc, fifo_close }, /* close */ + { &vop_access_desc, cd9660_access }, /* access */ + { &vop_getattr_desc, cd9660_getattr }, /* 
getattr */ + { &vop_setattr_desc, cd9660_setattr }, /* setattr */ + { &vop_read_desc, fifo_read }, /* read */ + { &vop_write_desc, fifo_write }, /* write */ + { &vop_ioctl_desc, fifo_ioctl }, /* ioctl */ + { &vop_select_desc, fifo_select }, /* select */ + { &vop_mmap_desc, fifo_mmap }, /* mmap */ + { &vop_fsync_desc, fifo_fsync }, /* fsync */ + { &vop_seek_desc, fifo_seek }, /* seek */ + { &vop_remove_desc, cd9660_remove }, /* remove */ + { &vop_link_desc, cd9660_link }, /* link */ + { &vop_rename_desc, cd9660_rename }, /* rename */ + { &vop_mkdir_desc, cd9660_mkdir }, /* mkdir */ + { &vop_rmdir_desc, cd9660_rmdir }, /* rmdir */ + { &vop_symlink_desc, cd9660_symlink }, /* symlink */ + { &vop_readdir_desc, fifo_readdir }, /* readdir */ + { &vop_readlink_desc, fifo_readlink }, /* readlink */ + { &vop_abortop_desc, fifo_abortop }, /* abortop */ + { &vop_inactive_desc, cd9660_inactive },/* inactive */ + { &vop_reclaim_desc, cd9660_reclaim }, /* reclaim */ + { &vop_lock_desc, cd9660_lock }, /* lock */ + { &vop_unlock_desc, cd9660_unlock }, /* unlock */ + { &vop_bmap_desc, fifo_bmap }, /* bmap */ + { &vop_strategy_desc, fifo_badop }, /* strategy */ + { &vop_print_desc, cd9660_print }, /* print */ + { &vop_islocked_desc, cd9660_islocked },/* islocked */ + { &vop_pathconf_desc, fifo_pathconf }, /* pathconf */ + { &vop_advlock_desc, fifo_advlock }, /* advlock */ + { &vop_blkatoff_desc, fifo_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, fifo_valloc }, /* valloc */ + { &vop_vfree_desc, fifo_vfree }, /* vfree */ + { &vop_truncate_desc, fifo_truncate }, /* truncate */ + { &vop_update_desc, cd9660_update }, /* update */ + { &vop_bwrite_desc, vn_bwrite }, + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc cd9660_fifoop_opv_desc = + { &cd9660_fifoop_p, cd9660_fifoop_entries }; +#endif /* FIFO */ diff --git a/sys/isofs/cd9660/iso.h b/sys/isofs/cd9660/iso.h new file mode 100644 index 00000000000..e3567066e1c --- /dev/null +++ b/sys/isofs/cd9660/iso.h @@ 
-0,0 +1,256 @@ +/*- + * Copyright (c) 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley + * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension + * Support code is derived from software contributed to Berkeley + * by Atsushi Murai (amurai@spec.co.jp). + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)iso.h 8.2 (Berkeley) 1/23/94 + */ + +#define ISODCL(from, to) (to - from + 1) + +struct iso_volume_descriptor { + char type[ISODCL(1,1)]; /* 711 */ + char id[ISODCL(2,6)]; + char version[ISODCL(7,7)]; + char data[ISODCL(8,2048)]; +}; + +/* volume descriptor types */ +#define ISO_VD_PRIMARY 1 +#define ISO_VD_END 255 + +#define ISO_STANDARD_ID "CD001" +#define ISO_ECMA_ID "CDW01" + +struct iso_primary_descriptor { + char type [ISODCL ( 1, 1)]; /* 711 */ + char id [ISODCL ( 2, 6)]; + char version [ISODCL ( 7, 7)]; /* 711 */ + char unused1 [ISODCL ( 8, 8)]; + char system_id [ISODCL ( 9, 40)]; /* achars */ + char volume_id [ISODCL ( 41, 72)]; /* dchars */ + char unused2 [ISODCL ( 73, 80)]; + char volume_space_size [ISODCL ( 81, 88)]; /* 733 */ + char unused3 [ISODCL ( 89, 120)]; + char volume_set_size [ISODCL (121, 124)]; /* 723 */ + char volume_sequence_number [ISODCL (125, 128)]; /* 723 */ + char logical_block_size [ISODCL (129, 132)]; /* 723 */ + char path_table_size [ISODCL (133, 140)]; /* 733 */ + char type_l_path_table [ISODCL (141, 144)]; /* 731 */ + char opt_type_l_path_table [ISODCL (145, 148)]; /* 731 */ + char type_m_path_table [ISODCL (149, 152)]; /* 732 */ + char opt_type_m_path_table [ISODCL (153, 156)]; /* 732 */ + char root_directory_record [ISODCL (157, 190)]; /* 9.1 */ + char volume_set_id [ISODCL (191, 318)]; /* dchars */ + char publisher_id [ISODCL (319, 446)]; /* achars */ + char preparer_id [ISODCL (447, 574)]; 
/* achars */ + char application_id [ISODCL (575, 702)]; /* achars */ + char copyright_file_id [ISODCL (703, 739)]; /* 7.5 dchars */ + char abstract_file_id [ISODCL (740, 776)]; /* 7.5 dchars */ + char bibliographic_file_id [ISODCL (777, 813)]; /* 7.5 dchars */ + char creation_date [ISODCL (814, 830)]; /* 8.4.26.1 */ + char modification_date [ISODCL (831, 847)]; /* 8.4.26.1 */ + char expiration_date [ISODCL (848, 864)]; /* 8.4.26.1 */ + char effective_date [ISODCL (865, 881)]; /* 8.4.26.1 */ + char file_structure_version [ISODCL (882, 882)]; /* 711 */ + char unused4 [ISODCL (883, 883)]; + char application_data [ISODCL (884, 1395)]; + char unused5 [ISODCL (1396, 2048)]; +}; +#define ISO_DEFAULT_BLOCK_SIZE 2048 + +struct iso_directory_record { + char length [ISODCL (1, 1)]; /* 711 */ + char ext_attr_length [ISODCL (2, 2)]; /* 711 */ + unsigned char extent [ISODCL (3, 10)]; /* 733 */ + unsigned char size [ISODCL (11, 18)]; /* 733 */ + char date [ISODCL (19, 25)]; /* 7 by 711 */ + char flags [ISODCL (26, 26)]; + char file_unit_size [ISODCL (27, 27)]; /* 711 */ + char interleave [ISODCL (28, 28)]; /* 711 */ + char volume_sequence_number [ISODCL (29, 32)]; /* 723 */ + char name_len [ISODCL (33, 33)]; /* 711 */ + char name [0]; +}; +/* can't take sizeof(iso_directory_record), because of possible alignment + of the last entry (34 instead of 33) */ +#define ISO_DIRECTORY_RECORD_SIZE 33 + +struct iso_extended_attributes { + unsigned char owner [ISODCL (1, 4)]; /* 723 */ + unsigned char group [ISODCL (5, 8)]; /* 723 */ + unsigned char perm [ISODCL (9, 10)]; /* 9.5.3 */ + char ctime [ISODCL (11, 27)]; /* 8.4.26.1 */ + char mtime [ISODCL (28, 44)]; /* 8.4.26.1 */ + char xtime [ISODCL (45, 61)]; /* 8.4.26.1 */ + char ftime [ISODCL (62, 78)]; /* 8.4.26.1 */ + char recfmt [ISODCL (79, 79)]; /* 711 */ + char recattr [ISODCL (80, 80)]; /* 711 */ + unsigned char reclen [ISODCL (81, 84)]; /* 723 */ + char system_id [ISODCL (85, 116)]; /* achars */ + char system_use [ISODCL (117, 180)]; 
+ char version [ISODCL (181, 181)]; /* 711 */ + char len_esc [ISODCL (182, 182)]; /* 711 */ + char reserved [ISODCL (183, 246)]; + unsigned char len_au [ISODCL (247, 250)]; /* 723 */ +}; + +/* CD-ROM Format type */ +enum ISO_FTYPE { ISO_FTYPE_DEFAULT, ISO_FTYPE_9660, ISO_FTYPE_RRIP, ISO_FTYPE_ECMA }; + +#ifndef ISOFSMNT_ROOT +#define ISOFSMNT_ROOT 0 +#endif + +struct iso_mnt { + int im_flags; + + struct mount *im_mountp; + dev_t im_dev; + struct vnode *im_devvp; + + int logical_block_size; + int im_bshift; + int im_bmask; + + int volume_space_size; + char im_fsmnt[50]; + struct netexport im_export; + + char root[ISODCL (157, 190)]; + int root_extent; + int root_size; + enum ISO_FTYPE iso_ftype; + + int rr_skip; + int rr_skip0; +}; + +#define VFSTOISOFS(mp) ((struct iso_mnt *)((mp)->mnt_data)) + +#define iso_blkoff(imp, loc) ((loc) & (imp)->im_bmask) +#define iso_lblkno(imp, loc) ((loc) >> (imp)->im_bshift) +#define iso_blksize(imp, ip, lbn) ((imp)->logical_block_size) +#define iso_lblktosize(imp, blk) ((blk) << (imp)->im_bshift) + +int cd9660_mount __P((struct mount *, + char *, caddr_t, struct nameidata *, struct proc *)); +int cd9660_start __P((struct mount *, int, struct proc *)); +int cd9660_unmount __P((struct mount *, int, struct proc *)); +int cd9660_root __P((struct mount *, struct vnode **)); +int cd9660_quotactl __P((struct mount *, int, uid_t, caddr_t, struct proc *)); +int cd9660_statfs __P((struct mount *, struct statfs *, struct proc *)); +int cd9660_sync __P((struct mount *, int, struct ucred *, struct proc *)); +int cd9660_vget __P((struct mount *, ino_t, struct vnode **)); +int cd9660_fhtovp __P((struct mount *, struct fid *, struct mbuf *, + struct vnode **, int *, struct ucred **)); +int cd9660_vptofh __P((struct vnode *, struct fid *)); +int cd9660_init __P(()); + +struct iso_node; +int iso_blkatoff __P((struct iso_node *ip, long offset, struct buf **bpp)); +int iso_iget __P((struct iso_node *xp, ino_t ino, int relocated, + struct iso_node 
/*
 * Decoders for the numeric field encodings used in ISO 9660 structures.
 * The suffix is the encoding tag used in the field comments of this
 * header (e.g. "711", "723", "733").  Each routine takes a pointer to the
 * first byte of the field and returns the decoded value as an int.
 *
 * All multi-byte values are assembled from unsigned bytes so that a byte
 * with its top bit set is never sign-extended into the result and no
 * negative value is ever left-shifted.
 */

/* 711: one unsigned byte. */
extern inline int
isonum_711(p)
	unsigned char *p;
{
	return *p;
}

/* 712: one signed byte (explicitly signed; plain char may be unsigned). */
extern inline int
isonum_712(p)
	char *p;
{
	return (signed char)*p;
}

/* 721: 16 bits, least significant byte first; result is 0..65535. */
extern inline int
isonum_721(p)
	unsigned char *p;
{
	return p[0] | (p[1] << 8);
}

/* 722: 16 bits, most significant byte first; result is 0..65535. */
extern inline int
isonum_722(p)
	unsigned char *p;
{
	return (p[0] << 8) | p[1];
}

/* 723: decoded from its leading least-significant-first copy. */
extern inline int
isonum_723(p)
	unsigned char *p;
{
	return isonum_721(p);
}

/*
 * 731: 32 bits, least significant byte first.  Assembled in unsigned
 * arithmetic so a top byte >= 0x80 does not overflow a signed shift.
 */
extern inline int
isonum_731(p)
	unsigned char *p;
{
	unsigned int v;

	v = (unsigned int)p[0] | ((unsigned int)p[1] << 8) |
	    ((unsigned int)p[2] << 16) | ((unsigned int)p[3] << 24);
	return (int)v;
}

/* 732: 32 bits, most significant byte first. */
extern inline int
isonum_732(p)
	unsigned char *p;
{
	unsigned int v;

	v = ((unsigned int)p[0] << 24) | ((unsigned int)p[1] << 16) |
	    ((unsigned int)p[2] << 8) | (unsigned int)p[3];
	return (int)v;
}

/* 733: decoded from its leading least-significant-first copy. */
extern inline int
isonum_733(p)
	unsigned char *p;
{
	return isonum_731(p);
}
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)iso_rrip.h 8.2 (Berkeley) 1/23/94 + */ + + +/* + * Analyze function flag (similar to RR field bits) + */ +#define ISO_SUSP_ATTR 0x0001 +#define ISO_SUSP_DEVICE 0x0002 +#define ISO_SUSP_SLINK 0x0004 +#define ISO_SUSP_ALTNAME 0x0008 +#define ISO_SUSP_CLINK 0x0010 +#define ISO_SUSP_PLINK 0x0020 +#define ISO_SUSP_RELDIR 0x0040 +#define ISO_SUSP_TSTAMP 0x0080 +#define ISO_SUSP_IDFLAG 0x0100 +#define ISO_SUSP_EXTREF 0x0200 +#define ISO_SUSP_CONT 0x0400 +#define ISO_SUSP_OFFSET 0x0800 +#define ISO_SUSP_STOP 0x1000 +#define ISO_SUSP_UNKNOWN 0x8000 + +typedef struct { + struct iso_node *inop; + int fields; /* interesting fields in this analysis */ + daddr_t iso_ce_blk; /* block of continuation area */ + off_t iso_ce_off; /* offset of continuation area */ + int iso_ce_len; /* length of continuation area */ + struct iso_mnt *imp; /* mount structure */ + ino_t *inump; /* inode number pointer */ + char *outbuf; /* name/symbolic link output area */ + u_short *outlen; /* length of above */ + u_short maxlen; /* maximum length of above */ + int cont; /* continuation of above */ +} ISO_RRIP_ANALYZE; + +int cd9660_rrip_analyze __P((struct iso_directory_record *isodir, + struct iso_node *inop, struct iso_mnt *imp)); +int cd9660_rrip_getname __P((struct iso_directory_record *isodir, + char *outbuf, u_short *outlen, + ino_t *inump, struct iso_mnt *imp)); +int cd9660_rrip_getsymname __P((struct iso_directory_record *isodir, + char *outbuf, u_short *outlen, + struct iso_mnt *imp)); +int cd9660_rrip_offset __P((struct iso_directory_record *isodir, + struct iso_mnt *imp)); diff --git a/sys/kern/Make.tags.inc b/sys/kern/Make.tags.inc new file mode 100644 index 00000000000..1563c4165f1 --- /dev/null +++ b/sys/kern/Make.tags.inc @@ -0,0 +1,18 @@ +# @(#)Make.tags.inc 8.1 (Berkeley) 6/11/93 + +# Common files for "make tags". +# Included by the Makefile for each architecture. 
+ +# Put the ../sys stuff near the end so that subroutine definitions win when +# there is a struct tag with the same name (eg., vmmeter). The real +# solution would probably be for ctags to generate "struct vmmeter" tags. + +COMM= /sys/conf/*.[ch] \ + /sys/dev/*.[ch] /sys/dev/scsi/*.[ch] \ + /sys/kern/*.[ch] /sys/libkern/*.[ch] \ + /sys/miscfs/*/*.[ch] \ + /sys/net/*.[ch] /sys/netccitt/*.[ch] /sys/netinet/*.[ch] \ + /sys/netiso/*.[ch] /sys/netns/*.[ch] \ + /sys/nfs/*.[ch] /sys/sys/*.[ch] \ + /sys/ufs/*/*.[ch] \ + /sys/vm/*.[ch] diff --git a/sys/kern/Makefile b/sys/kern/Makefile new file mode 100644 index 00000000000..cfe962a9a66 --- /dev/null +++ b/sys/kern/Makefile @@ -0,0 +1,50 @@ +# @(#)Makefile 8.2 (Berkeley) 3/21/94 + +# Makefile for kernel tags files, init_sysent, etc. + +ARCH= hp300 i386 luna68k news3400 pmax sparc tahoe vax + +all: + @echo "make tags, make links or init_sysent.c only" + +init_sysent.c syscalls.c ../sys/syscall.h: makesyscalls.sh syscalls.master + -mv -f init_sysent.c init_sysent.c.bak + -mv -f syscalls.c syscalls.c.bak + -mv -f ../sys/syscall.h ../sys/syscall.h.bak + sh makesyscalls.sh syscalls.master + +# Kernel tags: +# Tags files are built in the top-level directory for each architecture, +# with a makefile listing the architecture-dependent files, etc. The list +# of common files is in ./Make.tags.inc. Links to the correct tags file +# are placed in each source directory. We need to have links to tags files +# from the generic directories that are relative to the machine type, even +# via remote mounts; therefore we use symlinks to $SYSTAGS, which points at +# ${SYSDIR}/${MACHINE}/tags. 
+ +SYSTAGS=/var/db/sys_tags +SYSDIR=/sys + +# Directories in which to place tags links (other than machine-dependent) +DGEN= conf \ + dev dev/scsi \ + hp hp/dev hp/hpux \ + kern libkern \ + miscfs miscfs/deadfs miscfs/fdesc miscfs/fifofs miscfs/kernfs \ + miscfs/lofs miscfs/nullfs miscfs/portal miscfs/procfs \ + miscfs/specfs miscfs/umapfs miscfs/union \ + net netccitt netinet netiso netns nfs scripts sys \ + ufs ufs/ffs ufs/lfs ufs/mfs ufs/ufs \ + vm + +tags:: + -for i in ${ARCH}; do \ + (cd ../$$i && make ${MFLAGS} tags); done + +links:: + rm -f ${SYSTAGS} + ln -s ${SYSDIR}/${MACHINE}/tags ${SYSTAGS} + -for i in ${DGEN}; do \ + (cd ../$$i && { rm -f tags; ln -s ${SYSTAGS} tags; }) done + -for i in ${ARCH}; do \ + (cd ../$$i && make ${MFLAGS} SYSTAGS=${SYSTAGS} links); done diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c new file mode 100644 index 00000000000..c6497153a69 --- /dev/null +++ b/sys/kern/init_main.c @@ -0,0 +1,394 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)init_main.c 8.9 (Berkeley) 1/21/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#ifdef HPFPLIB +char copyright[] = +"Copyright (c) 1982, 1986, 1989, 1991, 1993\n\tThe Regents of the University of California.\nCopyright (c) 1992 Hewlett-Packard Company\nCopyright (c) 1992 Motorola Inc.\nAll rights reserved.\n\n"; +#else +char copyright[] = +"Copyright (c) 1982, 1986, 1989, 1991, 1993\n\tThe Regents of the University of California. All rights reserved.\n\n"; +#endif + +/* Components of the first process -- never freed. 
*/ +struct session session0; +struct pgrp pgrp0; +struct proc proc0; +struct pcred cred0; +struct filedesc0 filedesc0; +struct plimit limit0; +struct vmspace vmspace0; +struct proc *curproc = &proc0; +struct proc *initproc, *pageproc; + +int cmask = CMASK; +extern struct user *proc0paddr; + +struct vnode *rootvp, *swapdev_vp; +int boothowto; +struct timeval boottime; +struct timeval runtime; + +static void start_init __P((struct proc *p, void *framep)); + +/* + * System startup; initialize the world, create process 0, mount root + * filesystem, and fork to create init and pagedaemon. Most of the + * hard work is done in the lower-level initialization routines including + * startup(), which does memory initialization and autoconfiguration. + */ +main(framep) + void *framep; +{ + register struct proc *p; + register struct filedesc0 *fdp; + register struct pdevinit *pdev; + register int i; + int s, rval[2]; + extern int (*mountroot) __P((void)); + extern struct pdevinit pdevinit[]; + extern void roundrobin __P((void *)); + extern void schedcpu __P((void *)); + + /* + * Initialize the current process pointer (curproc) before + * any possible traps/probes to simplify trap processing. + */ + p = &proc0; + curproc = p; + /* + * Attempt to find console and initialize + * in case of early panic or other messages. + */ + consinit(); + printf("%s", copyright); /* data, never the format string */ + + vm_mem_init(); + kmeminit(); + cpu_startup(); + + /* + * Create process 0 (the swapper). + */ + allproc = (volatile struct proc *)p; + p->p_prev = (struct proc **)&allproc; + p->p_pgrp = &pgrp0; + pgrphash[0] = &pgrp0; + pgrp0.pg_mem = p; + pgrp0.pg_session = &session0; + session0.s_count = 1; + session0.s_leader = p; + + p->p_flag = P_INMEM | P_SYSTEM; + p->p_stat = SRUN; + p->p_nice = NZERO; + bcopy("swapper", p->p_comm, sizeof ("swapper")); + + /* Create credentials.
*/ + cred0.p_refcnt = 1; + p->p_cred = &cred0; + p->p_ucred = crget(); + p->p_ucred->cr_ngroups = 1; /* group 0 */ + + /* Create the file descriptor table. */ + fdp = &filedesc0; + p->p_fd = &fdp->fd_fd; + fdp->fd_fd.fd_refcnt = 1; + fdp->fd_fd.fd_cmask = cmask; + fdp->fd_fd.fd_ofiles = fdp->fd_dfiles; + fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags; + fdp->fd_fd.fd_nfiles = NDFILE; + + /* Create the limits structures. */ + p->p_limit = &limit0; + for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++) + limit0.pl_rlimit[i].rlim_cur = + limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY; + limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = NOFILE; + limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = MAXUPRC; + i = ptoa(cnt.v_free_count); + limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i; + limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i; + limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3; + limit0.p_refcnt = 1; + + /* Allocate a prototype map so we have something to fork. */ + p->p_vmspace = &vmspace0; + vmspace0.vm_refcnt = 1; + pmap_pinit(&vmspace0.vm_pmap); + vm_map_init(&p->p_vmspace->vm_map, round_page(VM_MIN_ADDRESS), + trunc_page(VM_MAX_ADDRESS), TRUE); + vmspace0.vm_map.pmap = &vmspace0.vm_pmap; + p->p_addr = proc0paddr; /* XXX */ + + /* + * We continue to place resource usage info and signal + * actions in the user struct so they're pageable. + */ + p->p_stats = &p->p_addr->u_stats; + p->p_sigacts = &p->p_addr->u_sigacts; + + /* + * Initialize per uid information structure and charge + * root for one process. + */ + usrinfoinit(); + (void)chgproccnt(0, 1); + + rqinit(); + + /* Configure virtual memory system, set vm rlimits. */ + vm_init_limits(p); + + /* Initialize the file systems. */ + vfsinit(); + + /* Start real time and statistics clocks. */ + initclocks(); + + /* Initialize mbuf's. */ + mbinit(); + + /* Initialize clists. */ + clist_init(); + +#ifdef SYSVSHM + /* Initialize System V style shared memory. */ + shminit(); +#endif + + /* Attach pseudo-devices.
*/ + for (pdev = pdevinit; pdev->pdev_attach != NULL; pdev++) + (*pdev->pdev_attach)(pdev->pdev_count); + + /* + * Initialize protocols. Block reception of incoming packets + * until everything is ready. + */ + s = splimp(); + ifinit(); + domaininit(); + splx(s); + +#ifdef GPROF + /* Initialize kernel profiling. */ + kmstartup(); +#endif + + /* Kick off timeout driven events by calling first time. */ + roundrobin(NULL); + schedcpu(NULL); + + /* Mount the root file system. */ + if ((*mountroot)()) + panic("cannot mount root"); + + /* Get the vnode for '/'. Set fdp->fd_fd.fd_cdir to reference it. */ + if (VFS_ROOT(mountlist.tqh_first, &rootvnode)) + panic("cannot find root vnode"); + fdp->fd_fd.fd_cdir = rootvnode; + VREF(fdp->fd_fd.fd_cdir); + VOP_UNLOCK(rootvnode); + fdp->fd_fd.fd_rdir = NULL; + swapinit(); + + /* + * Now can look at time, having had a chance to verify the time + * from the file system. Reset p->p_rtime as it may have been + * munched in mi_switch() after the time got set. + */ + p->p_stats->p_start = runtime = mono_time = boottime = time; + p->p_rtime.tv_sec = p->p_rtime.tv_usec = 0; + + /* Initialize signal state for process 0. */ + siginit(p); + + /* Create process 1 (init(8)). */ + if (fork(p, NULL, rval)) + panic("fork init"); + if (rval[1]) { + start_init(curproc, framep); + return; + } + + /* Create process 2 (the pageout daemon). */ + if (fork(p, NULL, rval)) + panic("fork pager"); + if (rval[1]) { + /* + * Now in process 2. + */ + p = curproc; + pageproc = p; + p->p_flag |= P_INMEM | P_SYSTEM; /* XXX */ + bcopy("pagedaemon", curproc->p_comm, sizeof ("pagedaemon")); + vm_pageout(); + /* NOTREACHED */ + } + + /* The scheduler is an infinite loop. */ + scheduler(); + /* NOTREACHED */ +} + +/* + * List of paths to try when searching for "init". + */ +static char *initpaths[] = { + "/sbin/init", + "/sbin/oinit", + "/sbin/init.bak", + NULL, +}; + +/* + * Start the initial user process; try exec'ing each pathname in "initpaths".
+ * The program is invoked with one argument containing the boot flags. + */ +static void +start_init(p, framep) + struct proc *p; + void *framep; +{ + vm_offset_t addr; + struct execve_args args; + int options, i, retval[2], error; + char **pathp, *path, *ucp, **uap, *arg0, *arg1; + + initproc = p; + + /* + * We need to set the system call frame as if we were entered through + * a syscall() so that when we call execve() below, it will be able + * to set the entry point (see setregs) when it tries to exec. The + * startup code in "locore.s" has allocated space for the frame and + * passed a pointer to that space as main's argument. + */ + cpu_set_init_frame(p, framep); + + /* + * Need just enough stack to hold the faked-up "execve()" arguments. + */ + addr = trunc_page(VM_MAX_ADDRESS - PAGE_SIZE); + if (vm_allocate(&p->p_vmspace->vm_map, &addr, PAGE_SIZE, FALSE) != 0) + panic("init: couldn't allocate argument space"); + p->p_vmspace->vm_maxsaddr = (caddr_t)addr; + + for (pathp = &initpaths[0]; (path = *pathp) != NULL; pathp++) { + /* + * Move out the boot flag argument. + */ + options = 0; + ucp = (char *)USRSTACK; + (void)subyte(--ucp, 0); /* trailing zero */ + if (boothowto & RB_SINGLE) { + (void)subyte(--ucp, 's'); + options = 1; + } +#ifdef notyet + if (boothowto & RB_FASTBOOT) { + (void)subyte(--ucp, 'f'); + options = 1; + } +#endif + if (options == 0) + (void)subyte(--ucp, '-'); + (void)subyte(--ucp, '-'); /* leading hyphen */ + arg1 = ucp; + + /* + * Move out the file name (also arg 0), terminating NUL first. + */ + for (i = strlen(path); i >= 0; i--) + (void)subyte(--ucp, path[i]); + arg0 = ucp; + + /* + * Move out the arg pointers. + */ + uap = (char **)((int)ucp & ~(NBPW-1)); + (void)suword((caddr_t)--uap, 0); /* terminator */ + (void)suword((caddr_t)--uap, (int)arg1); + (void)suword((caddr_t)--uap, (int)arg0); + + /* + * Point at the arguments. + */ + args.fname = arg0; + args.argp = uap; + args.envp = NULL; + + /* + * Now try to exec the program.
If can't for any reason + * other than it doesn't exist, complain. + */ + if ((error = execve(p, &args, retval)) == 0) + return; + if (error != ENOENT) + printf("exec %s: error %d\n", path, error); + } + printf("init: not found\n"); + panic("no init"); +} diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c new file mode 100644 index 00000000000..4b25c0695cf --- /dev/null +++ b/sys/kern/init_sysent.c @@ -0,0 +1,480 @@ +/* + * System call switch table. + * + * DO NOT EDIT-- this file is automatically generated. + * created from @(#)syscalls.master 8.2 (Berkeley) 1/13/94 + */ + +#include +#include + +int nosys(); + +int nosys(); +int exit(); +int fork(); +int read(); +int write(); +int open(); +int close(); +int wait4(); +int link(); +int unlink(); +int chdir(); +int fchdir(); +int mknod(); +int chmod(); +int chown(); +int obreak(); +int getfsstat(); +int getpid(); +int mount(); +int unmount(); +int setuid(); +int getuid(); +int geteuid(); +int ptrace(); +int recvmsg(); +int sendmsg(); +int recvfrom(); +int accept(); +int getpeername(); +int getsockname(); +int access(); +int chflags(); +int fchflags(); +int sync(); +int kill(); +int getppid(); +int dup(); +int pipe(); +int getegid(); +int profil(); +#ifdef KTRACE +int ktrace(); +#else +#endif +int sigaction(); +int getgid(); +int sigprocmask(); +int getlogin(); +int setlogin(); +int acct(); +int sigpending(); +int sigaltstack(); +int ioctl(); +int reboot(); +int revoke(); +int symlink(); +int readlink(); +int execve(); +int umask(); +int chroot(); +int msync(); +int vfork(); +int sbrk(); +int sstk(); +int ovadvise(); +int munmap(); +int mprotect(); +int madvise(); +int mincore(); +int getgroups(); +int setgroups(); +int getpgrp(); +int setpgid(); +int setitimer(); +int swapon(); +int getitimer(); +int getdtablesize(); +int dup2(); +int fcntl(); +int select(); +int fsync(); +int setpriority(); +int socket(); +int connect(); +int getpriority(); +int sigreturn(); +int bind(); +int setsockopt(); +int listen();
+int sigsuspend(); +#ifdef TRACE +int vtrace(); +#else +#endif +int gettimeofday(); +int getrusage(); +int getsockopt(); +#ifdef vax +int resuba(); +#else +#endif +int readv(); +int writev(); +int settimeofday(); +int fchown(); +int fchmod(); +int rename(); +int flock(); +int mkfifo(); +int sendto(); +int shutdown(); +int socketpair(); +int mkdir(); +int rmdir(); +int utimes(); +int adjtime(); +int setsid(); +int quotactl(); +#ifdef NFS +int nfssvc(); +#else +#endif +int statfs(); +int fstatfs(); +#ifdef NFS +int getfh(); +#else +#endif +#ifdef SYSVSHM +int shmsys(); +#else +#endif +int setgid(); +int setegid(); +int seteuid(); +#ifdef LFS +int lfs_bmapv(); +int lfs_markv(); +int lfs_segclean(); +int lfs_segwait(); +#else +#endif +int stat(); +int fstat(); +int lstat(); +int pathconf(); +int fpathconf(); +int getrlimit(); +int setrlimit(); +int getdirentries(); +int mmap(); +int nosys(); +int lseek(); +int truncate(); +int ftruncate(); +int __sysctl(); +int mlock(); +int munlock(); + +#ifdef COMPAT_43 +#define compat(n, name) n, __CONCAT(o,name) + +int ocreat(); +int olseek(); +int ostat(); +int olstat(); +#ifdef KTRACE +#else +#endif +int ofstat(); +int ogetkerninfo(); +int ogetpagesize(); +int ommap(); +int owait(); +int ogethostname(); +int osethostname(); +int oaccept(); +int osend(); +int orecv(); +int osigvec(); +int osigblock(); +int osigsetmask(); +int osigstack(); +int orecvmsg(); +int osendmsg(); +#ifdef TRACE +#else +#endif +#ifdef vax +#else +#endif +int orecvfrom(); +int osetreuid(); +int osetregid(); +int otruncate(); +int oftruncate(); +int ogetpeername(); +int ogethostid(); +int osethostid(); +int ogetrlimit(); +int osetrlimit(); +int okillpg(); +int oquota(); +int ogetsockname(); +#ifdef NFS +#else +#endif +int ogetdirentries(); +#ifdef NFS +#else +#endif +#ifdef SYSVSHM +#else +#endif +#ifdef LFS +#else +#endif + +#else /* COMPAT_43 */ +#define compat(n, name) 0, nosys +#endif /* COMPAT_43 */ + +struct sysent sysent[] = { + { 0, nosys }, /* 0 = 
syscall */ + { 1, exit }, /* 1 = exit */ + { 0, fork }, /* 2 = fork */ + { 3, read }, /* 3 = read */ + { 3, write }, /* 4 = write */ + { 3, open }, /* 5 = open */ + { 1, close }, /* 6 = close */ + { 4, wait4 }, /* 7 = wait4 */ + { compat(2,creat) }, /* 8 = old creat */ + { 2, link }, /* 9 = link */ + { 1, unlink }, /* 10 = unlink */ + { 0, nosys }, /* 11 = obsolete execv */ + { 1, chdir }, /* 12 = chdir */ + { 1, fchdir }, /* 13 = fchdir */ + { 3, mknod }, /* 14 = mknod */ + { 2, chmod }, /* 15 = chmod */ + { 3, chown }, /* 16 = chown */ + { 1, obreak }, /* 17 = break */ + { 3, getfsstat }, /* 18 = getfsstat */ + { compat(3,lseek) }, /* 19 = old lseek */ + { 0, getpid }, /* 20 = getpid */ + { 4, mount }, /* 21 = mount */ + { 2, unmount }, /* 22 = unmount */ + { 1, setuid }, /* 23 = setuid */ + { 0, getuid }, /* 24 = getuid */ + { 0, geteuid }, /* 25 = geteuid */ + { 4, ptrace }, /* 26 = ptrace */ + { 3, recvmsg }, /* 27 = recvmsg */ + { 3, sendmsg }, /* 28 = sendmsg */ + { 6, recvfrom }, /* 29 = recvfrom */ + { 3, accept }, /* 30 = accept */ + { 3, getpeername }, /* 31 = getpeername */ + { 3, getsockname }, /* 32 = getsockname */ + { 2, access }, /* 33 = access */ + { 2, chflags }, /* 34 = chflags */ + { 2, fchflags }, /* 35 = fchflags */ + { 0, sync }, /* 36 = sync */ + { 2, kill }, /* 37 = kill */ + { compat(2,stat) }, /* 38 = old stat */ + { 0, getppid }, /* 39 = getppid */ + { compat(2,lstat) }, /* 40 = old lstat */ + { 2, dup }, /* 41 = dup */ + { 0, pipe }, /* 42 = pipe */ + { 0, getegid }, /* 43 = getegid */ + { 4, profil }, /* 44 = profil */ +#ifdef KTRACE + { 4, ktrace }, /* 45 = ktrace */ +#else + { 0, nosys }, /* 45 = ktrace */ +#endif + { 3, sigaction }, /* 46 = sigaction */ + { 0, getgid }, /* 47 = getgid */ + { 2, sigprocmask }, /* 48 = sigprocmask */ + { 2, getlogin }, /* 49 = getlogin */ + { 1, setlogin }, /* 50 = setlogin */ + { 1, acct }, /* 51 = acct */ + { 0, sigpending }, /* 52 = sigpending */ + { 2, sigaltstack }, /* 53 = sigaltstack */ + { 3, 
ioctl }, /* 54 = ioctl */ + { 1, reboot }, /* 55 = reboot */ + { 1, revoke }, /* 56 = revoke */ + { 2, symlink }, /* 57 = symlink */ + { 3, readlink }, /* 58 = readlink */ + { 3, execve }, /* 59 = execve */ + { 1, umask }, /* 60 = umask */ + { 1, chroot }, /* 61 = chroot */ + { compat(2,fstat) }, /* 62 = old fstat */ + { compat(4,getkerninfo) }, /* 63 = old getkerninfo */ + { compat(0,getpagesize) }, /* 64 = old getpagesize */ + { 2, msync }, /* 65 = msync */ + { 0, vfork }, /* 66 = vfork */ + { 0, nosys }, /* 67 = obsolete vread */ + { 0, nosys }, /* 68 = obsolete vwrite */ + { 1, sbrk }, /* 69 = sbrk */ + { 1, sstk }, /* 70 = sstk */ + { compat(7,mmap) }, /* 71 = old mmap */ + { 1, ovadvise }, /* 72 = vadvise */ + { 2, munmap }, /* 73 = munmap */ + { 3, mprotect }, /* 74 = mprotect */ + { 3, madvise }, /* 75 = madvise */ + { 0, nosys }, /* 76 = obsolete vhangup */ + { 0, nosys }, /* 77 = obsolete vlimit */ + { 3, mincore }, /* 78 = mincore */ + { 2, getgroups }, /* 79 = getgroups */ + { 2, setgroups }, /* 80 = setgroups */ + { 0, getpgrp }, /* 81 = getpgrp */ + { 2, setpgid }, /* 82 = setpgid */ + { 3, setitimer }, /* 83 = setitimer */ + { compat(0,wait) }, /* 84 = old wait */ + { 1, swapon }, /* 85 = swapon */ + { 2, getitimer }, /* 86 = getitimer */ + { compat(2,gethostname) }, /* 87 = old gethostname */ + { compat(2,sethostname) }, /* 88 = old sethostname */ + { 0, getdtablesize }, /* 89 = getdtablesize */ + { 2, dup2 }, /* 90 = dup2 */ + { 0, nosys }, /* 91 = getdopt */ + { 3, fcntl }, /* 92 = fcntl */ + { 5, select }, /* 93 = select */ + { 0, nosys }, /* 94 = setdopt */ + { 1, fsync }, /* 95 = fsync */ + { 3, setpriority }, /* 96 = setpriority */ + { 3, socket }, /* 97 = socket */ + { 3, connect }, /* 98 = connect */ + { compat(3,accept) }, /* 99 = old accept */ + { 2, getpriority }, /* 100 = getpriority */ + { compat(4,send) }, /* 101 = old send */ + { compat(4,recv) }, /* 102 = old recv */ + { 1, sigreturn }, /* 103 = sigreturn */ + { 3, bind }, /* 104 = 
bind */ + { 5, setsockopt }, /* 105 = setsockopt */ + { 2, listen }, /* 106 = listen */ + { 0, nosys }, /* 107 = obsolete vtimes */ + { compat(3,sigvec) }, /* 108 = old sigvec */ + { compat(1,sigblock) }, /* 109 = old sigblock */ + { compat(1,sigsetmask) }, /* 110 = old sigsetmask */ + { 1, sigsuspend }, /* 111 = sigsuspend */ + { compat(2,sigstack) }, /* 112 = old sigstack */ + { compat(3,recvmsg) }, /* 113 = old recvmsg */ + { compat(3,sendmsg) }, /* 114 = old sendmsg */ +#ifdef TRACE + { 2, vtrace }, /* 115 = vtrace */ +#else + { 0, nosys }, /* 115 = obsolete vtrace */ +#endif + { 2, gettimeofday }, /* 116 = gettimeofday */ + { 2, getrusage }, /* 117 = getrusage */ + { 5, getsockopt }, /* 118 = getsockopt */ +#ifdef vax + { 1, resuba }, /* 119 = resuba */ +#else + { 0, nosys }, /* 119 = nosys */ +#endif + { 3, readv }, /* 120 = readv */ + { 3, writev }, /* 121 = writev */ + { 2, settimeofday }, /* 122 = settimeofday */ + { 3, fchown }, /* 123 = fchown */ + { 2, fchmod }, /* 124 = fchmod */ + { compat(6,recvfrom) }, /* 125 = old recvfrom */ + { compat(2,setreuid) }, /* 126 = old setreuid */ + { compat(2,setregid) }, /* 127 = old setregid */ + { 2, rename }, /* 128 = rename */ + { compat(2,truncate) }, /* 129 = old truncate */ + { compat(2,ftruncate) }, /* 130 = old ftruncate */ + { 2, flock }, /* 131 = flock */ + { 2, mkfifo }, /* 132 = mkfifo */ + { 6, sendto }, /* 133 = sendto */ + { 2, shutdown }, /* 134 = shutdown */ + { 5, socketpair }, /* 135 = socketpair */ + { 2, mkdir }, /* 136 = mkdir */ + { 1, rmdir }, /* 137 = rmdir */ + { 2, utimes }, /* 138 = utimes */ + { 0, nosys }, /* 139 = obsolete 4.2 sigreturn */ + { 2, adjtime }, /* 140 = adjtime */ + { compat(3,getpeername) }, /* 141 = old getpeername */ + { compat(0,gethostid) }, /* 142 = old gethostid */ + { compat(1,sethostid) }, /* 143 = old sethostid */ + { compat(2,getrlimit) }, /* 144 = old getrlimit */ + { compat(2,setrlimit) }, /* 145 = old setrlimit */ + { compat(2,killpg) }, /* 146 = old killpg */ 
+ { 0, setsid }, /* 147 = setsid */ + { 4, quotactl }, /* 148 = quotactl */ + { compat(4,quota) }, /* 149 = old quota */ + { compat(3,getsockname) }, /* 150 = old getsockname */ + { 0, nosys }, /* 151 = nosys */ + { 0, nosys }, /* 152 = nosys */ + { 0, nosys }, /* 153 = nosys */ + { 0, nosys }, /* 154 = nosys */ +#ifdef NFS + { 2, nfssvc }, /* 155 = nfssvc */ +#else + { 0, nosys }, /* 155 = nosys */ +#endif + { compat(4,getdirentries) }, /* 156 = old getdirentries */ + { 2, statfs }, /* 157 = statfs */ + { 2, fstatfs }, /* 158 = fstatfs */ + { 0, nosys }, /* 159 = nosys */ + { 0, nosys }, /* 160 = nosys */ +#ifdef NFS + { 2, getfh }, /* 161 = getfh */ +#else + { 0, nosys }, /* 161 = nosys */ +#endif + { 0, nosys }, /* 162 = nosys */ + { 0, nosys }, /* 163 = nosys */ + { 0, nosys }, /* 164 = nosys */ + { 0, nosys }, /* 165 = nosys */ + { 0, nosys }, /* 166 = nosys */ + { 0, nosys }, /* 167 = nosys */ + { 0, nosys }, /* 168 = nosys */ + { 0, nosys }, /* 169 = nosys */ + { 0, nosys }, /* 170 = nosys */ +#ifdef SYSVSHM + { 4, shmsys }, /* 171 = shmsys */ +#else + { 0, nosys }, /* 171 = nosys */ +#endif + { 0, nosys }, /* 172 = nosys */ + { 0, nosys }, /* 173 = nosys */ + { 0, nosys }, /* 174 = nosys */ + { 0, nosys }, /* 175 = nosys */ + { 0, nosys }, /* 176 = nosys */ + { 0, nosys }, /* 177 = nosys */ + { 0, nosys }, /* 178 = nosys */ + { 0, nosys }, /* 179 = nosys */ + { 0, nosys }, /* 180 = nosys */ + { 1, setgid }, /* 181 = setgid */ + { 1, setegid }, /* 182 = setegid */ + { 1, seteuid }, /* 183 = seteuid */ +#ifdef LFS + { 3, lfs_bmapv }, /* 184 = lfs_bmapv */ + { 3, lfs_markv }, /* 185 = lfs_markv */ + { 2, lfs_segclean }, /* 186 = lfs_segclean */ + { 2, lfs_segwait }, /* 187 = lfs_segwait */ +#else + { 0, nosys }, /* 184 = nosys */ + { 0, nosys }, /* 185 = nosys */ + { 0, nosys }, /* 186 = nosys */ + { 0, nosys }, /* 187 = nosys */ +#endif + { 2, stat }, /* 188 = stat */ + { 2, fstat }, /* 189 = fstat */ + { 2, lstat }, /* 190 = lstat */ + { 2, pathconf }, /* 
191 = pathconf */ + { 2, fpathconf }, /* 192 = fpathconf */ + { 0, nosys }, /* 193 = nosys */ + { 2, getrlimit }, /* 194 = getrlimit */ + { 2, setrlimit }, /* 195 = setrlimit */ + { 4, getdirentries }, /* 196 = getdirentries */ + { 8, mmap }, /* 197 = mmap */ + { 0, nosys }, /* 198 = __syscall */ + { 5, lseek }, /* 199 = lseek */ + { 4, truncate }, /* 200 = truncate */ + { 4, ftruncate }, /* 201 = ftruncate */ + { 6, __sysctl }, /* 202 = __sysctl */ + { 2, mlock }, /* 203 = mlock */ + { 2, munlock }, /* 204 = munlock */ + { 0, nosys }, /* 205 = nosys */ + { 0, nosys }, /* 206 = nosys */ + { 0, nosys }, /* 207 = nosys */ + { 0, nosys }, /* 208 = nosys */ + { 0, nosys }, /* 209 = nosys */ + { 0, nosys }, /* 210 = nosys */ +}; + +int nsysent = sizeof(sysent) / sizeof(sysent[0]); diff --git a/sys/kern/kern_acct.c b/sys/kern/kern_acct.c new file mode 100644 index 00000000000..b752279d120 --- /dev/null +++ b/sys/kern/kern_acct.c @@ -0,0 +1,116 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)kern_acct.c 8.1 (Berkeley) 6/14/93 + */ + +#include +#include +#include +#include +#include +#include +#include + +struct acct_args { + char *fname; +}; +acct(a1, a2, a3) + struct proc *a1; + struct acct_args *a2; + int *a3; +{ + /* + * Body deleted. + */ + return (ENOSYS); +} + +acct_process(a1) + struct proc *a1; +{ + + /* + * Body deleted. + */ + return; +} + +/* + * Periodically check the file system to see if accounting + * should be turned on or off. 
+ */ + +/* + * Values associated with enabling and disabling accounting + */ +int acctsuspend = 2; /* stop accounting when < 2% free space left */ +int acctresume = 4; /* resume when free space risen to > 4% */ +int acctchkfreq = 15; /* frequency (in seconds) to check space */ + +/* + * SHOULD REPLACE THIS WITH A DRIVER THAT CAN BE READ TO SIMPLIFY. + */ +struct vnode *acctp; +struct vnode *savacctp; + +/* ARGSUSED */ +void +acctwatch(a) + void *a; +{ + struct statfs sb; + + if (savacctp) { + (void)VFS_STATFS(savacctp->v_mount, &sb, (struct proc *)0); + if (sb.f_bavail > acctresume * sb.f_blocks / 100) { + acctp = savacctp; + savacctp = NULL; + log(LOG_NOTICE, "Accounting resumed\n"); + } + } else { + if (acctp == NULL) + return; + (void)VFS_STATFS(acctp->v_mount, &sb, (struct proc *)0); + if (sb.f_bavail <= acctsuspend * sb.f_blocks / 100) { + savacctp = acctp; + acctp = NULL; + log(LOG_NOTICE, "Accounting suspended\n"); + } + } + timeout(acctwatch, NULL, acctchkfreq * hz); +} diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c new file mode 100644 index 00000000000..f42900cb75d --- /dev/null +++ b/sys/kern/kern_clock.c @@ -0,0 +1,528 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef GPROF +#include +#endif + +/* + * Clock handling routines. + * + * This code is written to operate with two timers that run independently of + * each other. The main clock, running hz times per second, is used to keep + * track of real time. The second timer handles kernel and user profiling, + * and does resource use estimation. 
If the second timer is programmable, + * it is randomized to avoid aliasing between the two clocks. For example, + * the randomization prevents an adversary from always giving up the cpu + * just before its quantum expires. Otherwise, it would never accumulate + * cpu ticks. The mean frequency of the second timer is stathz. + * + * If no second timer exists, stathz will be zero; in this case we drive + * profiling and statistics off the main clock. This WILL NOT be accurate; + * do not do it unless absolutely necessary. + * + * The statistics clock may (or may not) be run at a higher rate while + * profiling. This profile clock runs at profhz. We require that profhz + * be an integral multiple of stathz. + * + * If the statistics clock is running fast, it must be divided by the ratio + * profhz/stathz for statistics. (For profiling, every tick counts.) + */ + +/* + * TODO: + * allocate more timeout table slots when table overflows. + */ + +/* + * Bump a timeval by a small number of usec's. + */ +#define BUMPTIME(t, usec) { \ + register volatile struct timeval *tp = (t); \ + register long us; \ + \ + tp->tv_usec = us = tp->tv_usec + (usec); \ + if (us >= 1000000) { \ + tp->tv_usec = us - 1000000; \ + tp->tv_sec++; \ + } \ +} + +int stathz; +int profhz; +int profprocs; +int ticks; +static int psdiv, pscnt; /* prof => stat divider */ +int psratio; /* ratio: prof / stat */ + +volatile struct timeval time; +volatile struct timeval mono_time; + +/* + * Initialize clock frequencies and start both clocks running. + */ +void +initclocks() +{ + register int i; + + /* + * Set divisors to 1 (normal case) and let the machine-specific + * code do its bit. + */ + psdiv = pscnt = 1; + cpu_initclocks(); + + /* + * Compute profhz/stathz, and fix profhz if needed. + */ + i = stathz ? stathz : hz; + if (profhz == 0) + profhz = i; + psratio = profhz / i; +} + +/* + * The real-time timer, interrupting hz times per second. 
+ */ +void +hardclock(frame) + register struct clockframe *frame; +{ + register struct callout *p1; + register struct proc *p; + register int delta, needsoft; + extern int tickdelta; + extern long timedelta; + + /* + * Update real-time timeout queue. + * At front of queue are some number of events which are ``due''. + * The time to these is <= 0 and if negative represents the + * number of ticks which have passed since it was supposed to happen. + * The rest of the q elements (times > 0) are events yet to happen, + * where the time for each is given as a delta from the previous. + * Decrementing just the first of these serves to decrement the time + * to all events. needsoft records that at least one event is due. + */ + needsoft = 0; + for (p1 = calltodo.c_next; p1 != NULL; p1 = p1->c_next) { + if (--p1->c_time > 0) + break; + needsoft = 1; + if (p1->c_time == 0) + break; + } + + p = curproc; + if (p) { + register struct pstats *pstats; + + /* + * Run current process's virtual and profile time, as needed. + */ + pstats = p->p_stats; + if (CLKF_USERMODE(frame) && + timerisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) + psignal(p, SIGVTALRM); + if (timerisset(&pstats->p_timer[ITIMER_PROF].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) + psignal(p, SIGPROF); + } + + /* + * If no separate statistics clock is available, run it from here. + */ + if (stathz == 0) + statclock(frame); + + /* + * Increment the time-of-day. The increment is just ``tick'' unless + * we are still adjusting the clock; see adjtime(). + */ + ticks++; + if (timedelta == 0) + delta = tick; + else { + delta = tick + tickdelta; + timedelta -= tickdelta; + } + BUMPTIME(&time, delta); + BUMPTIME(&mono_time, delta); + + /* + * Process callouts at a very low cpu priority, so we don't keep the + * relatively high clock interrupt priority any longer than necessary.
+ */ + if (needsoft) { + if (CLKF_BASEPRI(frame)) { + /* + * Save the overhead of a software interrupt; + * it will happen as soon as we return, so do it now. + */ + (void)splsoftclock(); + softclock(); + } else + setsoftclock(); + } +} + +/* + * Software (low priority) clock interrupt. + * Run due events (c_time <= 0) from the head of the timeout queue. + */ +/*ARGSUSED*/ +void +softclock() +{ + register struct callout *c; + register void *arg; + register void (*func) __P((void *)); + register int s; + + s = splhigh(); + while ((c = calltodo.c_next) != NULL && c->c_time <= 0) { + func = c->c_func; + arg = c->c_arg; + calltodo.c_next = c->c_next; + c->c_next = callfree; + callfree = c; + splx(s); + (*func)(arg); + (void) splhigh(); + } + splx(s); +} + +/* + * timeout -- + * Execute a function after a specified length of time, given in ticks (values <= 0 are treated as 1). + * + * untimeout -- + * Cancel previous timeout function call (the first queue entry matching both function and argument). + * + * See AT&T BCI Driver Reference Manual for specification. This + * implementation differs from that one in that no identification + * value is returned from timeout, rather, the original arguments + * to timeout are used to identify entries for untimeout. + */ +void +timeout(ftn, arg, ticks) + void (*ftn) __P((void *)); + void *arg; + register int ticks; +{ + register struct callout *new, *p, *t; + register int s; + + if (ticks <= 0) + ticks = 1; + + /* Lock out the clock. */ + s = splhigh(); + + /* Fill in the next free callout structure; panic if none is left. */ + if (callfree == NULL) + panic("timeout table full"); + new = callfree; + callfree = new->c_next; + new->c_arg = arg; + new->c_func = ftn; + + /* + * The time for each event is stored as a difference from the time + * of the previous event on the queue. Walk the queue, correcting + * the ticks argument for queue entries passed. Correct the ticks + * value for the queue entry immediately after the insertion point + * as well. Watch out for negative c_time values; these represent + * overdue events.
+ */ + for (p = &calltodo; + (t = p->c_next) != NULL && ticks > t->c_time; p = t) + if (t->c_time > 0) + ticks -= t->c_time; + new->c_time = ticks; + if (t != NULL) + t->c_time -= ticks; + + /* Insert the new entry into the queue. */ + p->c_next = new; + new->c_next = t; + splx(s); +} + +void +untimeout(ftn, arg) + void (*ftn) __P((void *)); + void *arg; +{ + register struct callout *p, *t; + register int s; + + s = splhigh(); + for (p = &calltodo; (t = p->c_next) != NULL; p = t) + if (t->c_func == ftn && t->c_arg == arg) { + /* Give the removed entry's remaining (positive) delta to the next entry. */ + if (t->c_next && t->c_time > 0) + t->c_next->c_time += t->c_time; + + /* Move entry from callout queue to callfree queue. */ + p->c_next = t->c_next; + t->c_next = callfree; + callfree = t; + break; + } + splx(s); +} + +/* + * Compute number of hz until specified time. Used to + * compute third argument to timeout() from an absolute time. + */ +int +hzto(tv) + struct timeval *tv; +{ + register long ticks, sec; + int s; + + /* + * If number of milliseconds will fit in 32 bit arithmetic, + * then compute number of milliseconds to time and scale to + * ticks. Otherwise just compute number of hz in time, rounding + * times greater than representable to maximum value. + * + * Delta times less than 25 days can be computed ``exactly''. + * Maximum value for any timeout in 10ms ticks is 250 days. + */ + s = splhigh(); + sec = tv->tv_sec - time.tv_sec; + if (sec <= 0x7fffffff / 1000 - 1000) + ticks = ((tv->tv_sec - time.tv_sec) * 1000 + + (tv->tv_usec - time.tv_usec) / 1000) / (tick / 1000); /* NOTE(review): divides by zero if tick < 1000 -- confirm tick's range */ + else if (sec <= 0x7fffffff / hz) + ticks = sec * hz; + else + ticks = 0x7fffffff; + splx(s); + return (ticks); +} + +/* + * Start profiling on a process. + * + * Kernel profiling passes proc0 which never exits and hence + * keeps the profile clock running constantly.
+ */ +void +startprofclock(p) + register struct proc *p; +{ + int s; + + if ((p->p_flag & P_PROFIL) == 0) { + p->p_flag |= P_PROFIL; + if (++profprocs == 1 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = psratio; + setstatclockrate(profhz); + splx(s); + } + } +} + +/* + * Stop profiling on a process; when the last one stops, return the statistics clock to stathz. + */ +void +stopprofclock(p) + register struct proc *p; +{ + int s; + + if (p->p_flag & P_PROFIL) { + p->p_flag &= ~P_PROFIL; + if (--profprocs == 0 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = 1; + setstatclockrate(stathz); + splx(s); + } + } +} + +int dk_ndrive = DK_NDRIVE; + +/* + * Statistics clock. Grab profile sample, and if divider reaches 0, + * do process and kernel statistics. + */ +void +statclock(frame) + register struct clockframe *frame; +{ +#ifdef GPROF + register struct gmonparam *g; +#endif + register struct proc *p; + register int i; + + if (CLKF_USERMODE(frame)) { + p = curproc; + if (p->p_flag & P_PROFIL) + addupc_intr(p, CLKF_PC(frame), 1); + if (--pscnt > 0) + return; + /* + * Came from user mode; CPU was in user state. + * Charge the tick to the process (p_uticks) and to CP_NICE or CP_USER. + */ + p->p_uticks++; + if (p->p_nice > NZERO) + cp_time[CP_NICE]++; + else + cp_time[CP_USER]++; + } else { +#ifdef GPROF + /* + * Kernel statistics are just like addupc_intr, only easier. + */ + g = &_gmonparam; + if (g->state == GMON_PROF_ON) { + i = CLKF_PC(frame) - g->lowpc; + if (i < g->textsize) { + i /= HISTFRACTION * sizeof(*g->kcount); + g->kcount[i]++; + } + } +#endif + if (--pscnt > 0) + return; + /* + * Came from kernel mode, so we were: + * - handling an interrupt, + * - doing syscall or trap work on behalf of the current + * user process, or + * - spinning in the idle loop. + * Whichever it is, charge the time as appropriate. + * Note that we charge interrupts to the current process, + * regardless of whether they are ``for'' that process, + * so that we know how much of its real time was spent + * in ``non-process'' (i.e., interrupt) work.
+ */ + p = curproc; + if (CLKF_INTR(frame)) { + if (p != NULL) + p->p_iticks++; + cp_time[CP_INTR]++; + } else if (p != NULL) { + p->p_sticks++; + cp_time[CP_SYS]++; + } else + cp_time[CP_IDLE]++; + } + pscnt = psdiv; + + /* + * We maintain statistics shown by user-level statistics + * programs: the amount of time in each cpu state, and + * the amount of time each of DK_NDRIVE ``drives'' is busy. + * + * XXX should either run linked list of drives, or (better) + * grab timestamps in the start & done code. + */ + for (i = 0; i < DK_NDRIVE; i++) + if (dk_busy & (1 << i)) + dk_time[i]++; + + /* + * We adjust the priority of the current process. The priority of + * a process gets worse as it accumulates CPU time. The cpu usage + * estimator (p_estcpu) is increased here. The formula for computing + * priorities (in kern_synch.c) will compute a different value each + * time p_estcpu increases by 4. The cpu usage estimator ramps up + * quite quickly when the process is running (linearly), and decays + * away exponentially, at a rate which is proportionally slower when + * the system is busy. The basic principle is that the system will + * 90% forget that the process used a lot of CPU time in 5 * loadav + * seconds. This causes the system to favor processes which haven't + * run much recently, and to round-robin among other processes. + */ + if (p != NULL) { + p->p_cpticks++; + if (++p->p_estcpu == 0) + p->p_estcpu--; /* undo a wrap to zero */ + if ((p->p_estcpu & 3) == 0) { + resetpriority(p); + if (p->p_priority >= PUSER) + p->p_priority = p->p_usrpri; + } + } +} + +/* + * Return information about system clocks (hz, tick, profhz, stathz) as a struct clockinfo. + */ +sysctl_clockrate(where, sizep) + register char *where; + size_t *sizep; +{ + struct clockinfo clkinfo; + + /* + * Construct clockinfo structure; a zero stathz is reported as hz. + */ + clkinfo.hz = hz; + clkinfo.tick = tick; + clkinfo.profhz = profhz; + clkinfo.stathz = stathz ?
stathz : hz; + return (sysctl_rdstruct(where, sizep, NULL, &clkinfo, sizeof(clkinfo))); +} diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c new file mode 100644 index 00000000000..543946d3f8f --- /dev/null +++ b/sys/kern/kern_descrip.c @@ -0,0 +1,914 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Descriptor management. + */ +struct file *filehead; /* head of list of open files */ +int nfiles; /* actual number of open files */ + +/* + * System calls on descriptors. getdtablesize reports the smaller of the RLIMIT_NOFILE soft limit and maxfiles. + */ +struct getdtablesize_args { + int dummy; +}; +/* ARGSUSED */ +getdtablesize(p, uap, retval) + struct proc *p; + struct getdtablesize_args *uap; + int *retval; +{ + + *retval = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles); + return (0); +} + +/* + * Duplicate a file descriptor onto a free descriptor chosen by fdalloc(). + */ +struct dup_args { + u_int fd; +}; +/* ARGSUSED */ +dup(p, uap, retval) + struct proc *p; + struct dup_args *uap; + int *retval; +{ + register struct filedesc *fdp; + u_int old; + int new, error; + + old = uap->fd; + /* + * XXX Compatibility: a descriptor with bits above 077 set is masked and handed to dup2(). + */ + if (old &~ 077) { uap->fd &= 077; return (dup2(p, uap, retval)); } /* NOTE(review): dup2() also reads uap->to, which struct dup_args does not declare -- confirm the argument layout */ + + fdp = p->p_fd; + if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) + return (EBADF); + if (error = fdalloc(p, 0, &new)) + return (error); + return (finishdup(fdp, (int)old, new, retval)); +} + +/* + * Duplicate a file descriptor to a particular value.
+ */ +struct dup2_args { + u_int from; + u_int to; +}; +/* ARGSUSED */ +dup2(p, uap, retval) + struct proc *p; + struct dup2_args *uap; + int *retval; +{ + register struct filedesc *fdp = p->p_fd; + register u_int old = uap->from, new = uap->to; + int i, error; + + if (old >= fdp->fd_nfiles || + fdp->fd_ofiles[old] == NULL || + new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || + new >= maxfiles) + return (EBADF); + if (old == new) { + *retval = new; + return (0); + } + if (new >= fdp->fd_nfiles) { + if (error = fdalloc(p, new, &i)) + return (error); + if (new != i) + panic("dup2: fdalloc"); + } else if (fdp->fd_ofiles[new]) { + if (fdp->fd_ofileflags[new] & UF_MAPPED) + (void) munmapfd(p, new); + /* + * dup2() must succeed even if the close has an error. + */ + (void) closef(fdp->fd_ofiles[new], p); + } + return (finishdup(fdp, (int)old, (int)new, retval)); +} + +/* + * The file control system call; dispatches on uap->cmd and returns EINVAL for an unknown command. + */ +struct fcntl_args { + int fd; + int cmd; + int arg; +}; +/* ARGSUSED */ +fcntl(p, uap, retval) + struct proc *p; + register struct fcntl_args *uap; + int *retval; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + register char *pop; + struct vnode *vp; + int i, tmp, error, flg = F_POSIX; + struct flock fl; + u_int newmin; + + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + pop = &fdp->fd_ofileflags[uap->fd]; + switch (uap->cmd) { + + case F_DUPFD: + newmin = uap->arg; + if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || + newmin >= maxfiles) + return (EINVAL); + if (error = fdalloc(p, newmin, &i)) + return (error); + return (finishdup(fdp, uap->fd, i, retval)); + + case F_GETFD: + *retval = *pop & 1; + return (0); + + case F_SETFD: + *pop = (*pop &~ 1) | (uap->arg & 1); + return (0); + + case F_GETFL: + *retval = OFLAGS(fp->f_flag); + return (0); + + case F_SETFL: + fp->f_flag &= ~FCNTLFLAGS; + fp->f_flag |= FFLAGS(uap->arg) & FCNTLFLAGS; + tmp = fp->f_flag & FNONBLOCK; + error =
(*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p); + if (error) + return (error); + tmp = fp->f_flag & FASYNC; + error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p); + if (!error) + return (0); + fp->f_flag &= ~FNONBLOCK; /* FIOASYNC failed: turn non-blocking mode back off */ + tmp = 0; + (void) (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p); + return (error); + + case F_GETOWN: + if (fp->f_type == DTYPE_SOCKET) { + *retval = ((struct socket *)fp->f_data)->so_pgid; + return (0); + } + error = (*fp->f_ops->fo_ioctl) + (fp, (int)TIOCGPGRP, (caddr_t)retval, p); + *retval = -*retval; /* NOTE(review): negated even when the ioctl returned an error */ + return (error); + + case F_SETOWN: + if (fp->f_type == DTYPE_SOCKET) { + ((struct socket *)fp->f_data)->so_pgid = uap->arg; + return (0); + } + if (uap->arg <= 0) { + uap->arg = -uap->arg; + } else { + struct proc *p1 = pfind(uap->arg); + if (p1 == 0) + return (ESRCH); + uap->arg = p1->p_pgrp->pg_id; + } + return ((*fp->f_ops->fo_ioctl) + (fp, (int)TIOCSPGRP, (caddr_t)&uap->arg, p)); + + case F_SETLKW: + flg |= F_WAIT; + /* Fall into F_SETLK */ + + case F_SETLK: + if (fp->f_type != DTYPE_VNODE) + return (EBADF); + vp = (struct vnode *)fp->f_data; + /* Copy in the lock structure */ + error = copyin((caddr_t)uap->arg, (caddr_t)&fl, sizeof (fl)); + if (error) + return (error); + if (fl.l_whence == SEEK_CUR) + fl.l_start += fp->f_offset; + switch (fl.l_type) { + + case F_RDLCK: + if ((fp->f_flag & FREAD) == 0) + return (EBADF); + p->p_flag |= P_ADVLOCK; + return (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &fl, flg)); + + case F_WRLCK: + if ((fp->f_flag & FWRITE) == 0) + return (EBADF); + p->p_flag |= P_ADVLOCK; + return (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &fl, flg)); + + case F_UNLCK: + return (VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &fl, + F_POSIX)); + + default: + return (EINVAL); + } + + case F_GETLK: + if (fp->f_type != DTYPE_VNODE) + return (EBADF); + vp = (struct vnode *)fp->f_data; + /* Copy in the lock structure */ + error = copyin((caddr_t)uap->arg, (caddr_t)&fl, sizeof (fl)); + if (error) + return (error); + if
(fl.l_whence == SEEK_CUR) + fl.l_start += fp->f_offset; + if (error = VOP_ADVLOCK(vp, (caddr_t)p, F_GETLK, &fl, F_POSIX)) + return (error); + return (copyout((caddr_t)&fl, (caddr_t)uap->arg, sizeof (fl))); + + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Common code for dup, dup2, and fcntl(F_DUPFD): share the file, clear UF_EXCLOSE on the copy, return new in *retval. + */ +int +finishdup(fdp, old, new, retval) + register struct filedesc *fdp; + register int old, new, *retval; +{ + register struct file *fp; + + fp = fdp->fd_ofiles[old]; + fdp->fd_ofiles[new] = fp; + fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE; + fp->f_count++; + if (new > fdp->fd_lastfile) + fdp->fd_lastfile = new; + *retval = new; + return (0); +} + +/* + * Close a file descriptor, updating the fd_lastfile and fd_freefile hints. + */ +struct close_args { + int fd; +}; +/* ARGSUSED */ +close(p, uap, retval) + struct proc *p; + struct close_args *uap; + int *retval; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + register int fd = uap->fd; + register u_char *pf; + + if ((unsigned)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + return (EBADF); + pf = (u_char *)&fdp->fd_ofileflags[fd]; + if (*pf & UF_MAPPED) + (void) munmapfd(p, fd); + fdp->fd_ofiles[fd] = NULL; + while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) + fdp->fd_lastfile--; + if (fd < fdp->fd_freefile) + fdp->fd_freefile = fd; + *pf = 0; + return (closef(fp, p)); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Return status information about a file descriptor.
+ */ +struct ofstat_args { + int fd; + struct ostat *sb; +}; +/* ARGSUSED */ +ofstat(p, uap, retval) + struct proc *p; + register struct ofstat_args *uap; + int *retval; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct stat ub; + struct ostat oub; + int error; + + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + switch (fp->f_type) { + + case DTYPE_VNODE: + error = vn_stat((struct vnode *)fp->f_data, &ub, p); + break; + + case DTYPE_SOCKET: + error = soo_stat((struct socket *)fp->f_data, &ub); + break; + + default: + panic("ofstat"); + /*NOTREACHED*/ + } + cvtstat(&ub, &oub); /* NOTE(review): runs even when error != 0; the result is only copied out on success */ + if (error == 0) + error = copyout((caddr_t)&oub, (caddr_t)uap->sb, sizeof (oub)); + return (error); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Return status information about a file descriptor (vnode or socket), copied out to uap->sb. + */ +struct fstat_args { + int fd; + struct stat *sb; +}; +/* ARGSUSED */ +fstat(p, uap, retval) + struct proc *p; + register struct fstat_args *uap; + int *retval; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct stat ub; + int error; + + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + switch (fp->f_type) { + + case DTYPE_VNODE: + error = vn_stat((struct vnode *)fp->f_data, &ub, p); + break; + + case DTYPE_SOCKET: + error = soo_stat((struct socket *)fp->f_data, &ub); + break; + + default: + panic("fstat"); + /*NOTREACHED*/ + } + if (error == 0) + error = copyout((caddr_t)&ub, (caddr_t)uap->sb, sizeof (ub)); + return (error); +} + +/* + * Return pathconf information about a file descriptor.
+ */ +struct fpathconf_args { + int fd; + int name; +}; +/* ARGSUSED */ +fpathconf(p, uap, retval) + struct proc *p; + register struct fpathconf_args *uap; + int *retval; +{ + struct filedesc *fdp = p->p_fd; + struct file *fp; + struct vnode *vp; + + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + switch (fp->f_type) { + + case DTYPE_SOCKET: + if (uap->name != _PC_PIPE_BUF) + return (EINVAL); + *retval = PIPE_BUF; + return (0); + + case DTYPE_VNODE: + vp = (struct vnode *)fp->f_data; + return (VOP_PATHCONF(vp, uap->name, retval)); + + default: + panic("fpathconf"); + } + /*NOTREACHED*/ +} + +/* + * Allocate a file descriptor for the process: the index goes in *result; EMFILE if the limit is reached. + */ +int fdexpand; /* number of times fdalloc has expanded an ofile array */ + +fdalloc(p, want, result) + struct proc *p; + int want; + int *result; +{ + register struct filedesc *fdp = p->p_fd; + register int i; + int lim, last, nfiles; /* this nfiles shadows the global count of open files */ + struct file **newofile; + char *newofileflags; + + /* + * Search for a free descriptor starting at the higher + * of want or fd_freefile. If that fails, consider + * expanding the ofile array. + */ + lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles); + for (;;) { + last = min(fdp->fd_nfiles, lim); + if ((i = want) < fdp->fd_freefile) + i = fdp->fd_freefile; + for (; i < last; i++) { + if (fdp->fd_ofiles[i] == NULL) { + fdp->fd_ofileflags[i] = 0; + if (i > fdp->fd_lastfile) + fdp->fd_lastfile = i; + if (want <= fdp->fd_freefile) + fdp->fd_freefile = i; + *result = i; + return (0); + } + } + + /* + * No space in current array. Expand to NDEXTENT entries, or double, unless already at lim. + */ + if (fdp->fd_nfiles >= lim) + return (EMFILE); + if (fdp->fd_nfiles < NDEXTENT) + nfiles = NDEXTENT; + else + nfiles = 2 * fdp->fd_nfiles; + MALLOC(newofile, struct file **, nfiles * OFILESIZE, + M_FILEDESC, M_WAITOK); + newofileflags = (char *) &newofile[nfiles]; + /* + * Copy the existing ofile and ofileflags arrays + * and zero the new portion of each array.
+ */ + bcopy(fdp->fd_ofiles, newofile, + (i = sizeof(struct file *) * fdp->fd_nfiles)); + bzero((char *)newofile + i, nfiles * sizeof(struct file *) - i); + bcopy(fdp->fd_ofileflags, newofileflags, + (i = sizeof(char) * fdp->fd_nfiles)); + bzero(newofileflags + i, nfiles * sizeof(char) - i); + if (fdp->fd_nfiles > NDFILE) + FREE(fdp->fd_ofiles, M_FILEDESC); + fdp->fd_ofiles = newofile; + fdp->fd_ofileflags = newofileflags; + fdp->fd_nfiles = nfiles; + fdexpand++; + } +} + +/* + * Check to see whether n user file descriptors + * are available to the process p; returns 1 if so, 0 if not. + */ +fdavail(p, n) + struct proc *p; + register int n; +{ + register struct filedesc *fdp = p->p_fd; + register struct file **fpp; + register int i, lim; + + lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles); + if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) + return (1); + fpp = &fdp->fd_ofiles[fdp->fd_freefile]; + for (i = fdp->fd_nfiles - fdp->fd_freefile; --i >= 0; fpp++) + if (*fpp == NULL && --n <= 0) + return (1); + return (0); +} + +/* + * Create a new open file structure and allocate + * a file descriptor for the process that refers to it. + */ +falloc(p, resultfp, resultfd) + register struct proc *p; + struct file **resultfp; + int *resultfd; +{ + register struct file *fp, *fq, **fpp; + int error, i; + + if (error = fdalloc(p, 0, &i)) + return (error); + if (nfiles >= maxfiles) { + tablefull("file"); + return (ENFILE); + } + /* + * Allocate a new file structure. + * If the process has file descriptor zero open, add to the list + * of open files at that point, otherwise put it at the front of + * the list of open files.
+ */ + nfiles++; + MALLOC(fp, struct file *, sizeof(struct file), M_FILE, M_WAITOK); + bzero(fp, sizeof(struct file)); + if (fq = p->p_fd->fd_ofiles[0]) + fpp = &fq->f_filef; + else + fpp = &filehead; + p->p_fd->fd_ofiles[i] = fp; + if (fq = *fpp) + fq->f_fileb = &fp->f_filef; + fp->f_filef = fq; + fp->f_fileb = fpp; + *fpp = fp; + fp->f_count = 1; + fp->f_cred = p->p_ucred; + crhold(fp->f_cred); + if (resultfp) + *resultfp = fp; + if (resultfd) + *resultfd = i; + return (0); +} + +/* + * Free a file structure: unlink it from the open file list, release its credentials, and free it. + */ +ffree(fp) + register struct file *fp; +{ + register struct file *fq; + + if (fq = fp->f_filef) + fq->f_fileb = fp->f_fileb; + *fp->f_fileb = fq; + crfree(fp->f_cred); +#ifdef DIAGNOSTIC + fp->f_filef = NULL; + fp->f_fileb = NULL; + fp->f_count = 0; +#endif + nfiles--; + FREE(fp, M_FILE); +} + +/* + * Copy a filedesc structure; each open file in the copy gains a reference (f_count++). + */ +struct filedesc * +fdcopy(p) + struct proc *p; +{ + register struct filedesc *newfdp, *fdp = p->p_fd; + register struct file **fpp; + register int i; + + MALLOC(newfdp, struct filedesc *, sizeof(struct filedesc0), + M_FILEDESC, M_WAITOK); + bcopy(fdp, newfdp, sizeof(struct filedesc)); + VREF(newfdp->fd_cdir); + if (newfdp->fd_rdir) + VREF(newfdp->fd_rdir); + newfdp->fd_refcnt = 1; + + /* + * If the number of open files fits in the internal arrays + * of the open file structure, use them, otherwise allocate + * additional memory for the number of descriptors currently + * in use. + */ + if (newfdp->fd_lastfile < NDFILE) { + newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles; + newfdp->fd_ofileflags = + ((struct filedesc0 *) newfdp)->fd_dfileflags; + i = NDFILE; + } else { + /* + * Compute the smallest multiple of NDEXTENT needed + * for the file descriptors currently in use, + * allowing the table to shrink.
+ */ + i = newfdp->fd_nfiles; + while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2) + i /= 2; + MALLOC(newfdp->fd_ofiles, struct file **, i * OFILESIZE, + M_FILEDESC, M_WAITOK); + newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i]; + } + newfdp->fd_nfiles = i; + bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **)); + bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char)); + fpp = newfdp->fd_ofiles; + for (i = newfdp->fd_lastfile; i-- >= 0; fpp++) /* visits slots 0 through fd_lastfile */ + if (*fpp != NULL) + (*fpp)->f_count++; + return (newfdp); +} + +/* + * Release a filedesc structure; nothing is closed or freed until the last reference is dropped. + */ +void +fdfree(p) + struct proc *p; +{ + register struct filedesc *fdp = p->p_fd; + struct file **fpp; + register int i; + + if (--fdp->fd_refcnt > 0) + return; + fpp = fdp->fd_ofiles; + for (i = fdp->fd_lastfile; i-- >= 0; fpp++) + if (*fpp) + (void) closef(*fpp, p); + if (fdp->fd_nfiles > NDFILE) + FREE(fdp->fd_ofiles, M_FILEDESC); + vrele(fdp->fd_cdir); + if (fdp->fd_rdir) + vrele(fdp->fd_rdir); + FREE(fdp, M_FILEDESC); +} + +/* + * Internal form of close. + * Decrement reference count on file structure; the file is closed and freed when the count reaches zero. + * Note: p may be NULL when closing a file + * that was being passed in a message. + */ +closef(fp, p) + register struct file *fp; + register struct proc *p; +{ + struct vnode *vp; + struct flock lf; + int error; + + if (fp == NULL) + return (0); + /* + * POSIX record locking dictates that any close releases ALL + * locks owned by this process. This is handled by setting + * a flag in the unlock to free ONLY locks obeying POSIX + * semantics, and not to free BSD-style file locks. + * If the descriptor was in a message, POSIX-style locks + * aren't passed with the descriptor.
+ */ + if (p && (p->p_flag & P_ADVLOCK) && fp->f_type == DTYPE_VNODE) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = (struct vnode *)fp->f_data; + (void) VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_POSIX); + } + if (--fp->f_count > 0) + return (0); + if (fp->f_count < 0) + panic("closef: count < 0"); + if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = (struct vnode *)fp->f_data; + (void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); + } + if (fp->f_ops) + error = (*fp->f_ops->fo_close)(fp, p); + else + error = 0; + ffree(fp); + return (error); +} + +/* + * Apply an advisory lock on a file descriptor. + * + * Just attempt to get a record lock of the requested type on + * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0). + */ +struct flock_args { + int fd; + int how; +}; +/* ARGSUSED */ +flock(p, uap, retval) + struct proc *p; + register struct flock_args *uap; + int *retval; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct vnode *vp; + struct flock lf; + + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE) + return (EOPNOTSUPP); + vp = (struct vnode *)fp->f_data; + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (uap->how & LOCK_UN) { + lf.l_type = F_UNLCK; + fp->f_flag &= ~FHASLOCK; + return (VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK)); + } + if (uap->how & LOCK_EX) + lf.l_type = F_WRLCK; + else if (uap->how & LOCK_SH) + lf.l_type = F_RDLCK; + else + return (EBADF); + fp->f_flag |= FHASLOCK; /* NOTE(review): set before the lock request below is known to have succeeded */ + if (uap->how & LOCK_NB) + return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK)); + return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK|F_WAIT)); +} + +/* + * File Descriptor pseudo-device driver (/dev/fd/).
+ * + * Opening minor device N dup()s the file (if any) connected to file + * descriptor N belonging to the calling process. Note that this driver + * consists of only the ``open()'' routine, because all subsequent + * references to this file will be direct to the other driver. + */ +/* ARGSUSED */ +fdopen(dev, mode, type, p) + dev_t dev; + int mode, type; + struct proc *p; +{ + + /* + * XXX Kludge: set p->p_dupfd to contain the value of the + * file descriptor being sought for duplication. The error + * return ensures that the vnode for this device will be released + * by vn_open. Open will detect this special error and take the + * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN + * will simply report the error. + */ + p->p_dupfd = minor(dev); + return (ENODEV); +} + +/* + * Duplicate the specified descriptor (dfd) onto descriptor indx, as directed by error (ENODEV or ENXIO). + */ +dupfdopen(fdp, indx, dfd, mode, error) + register struct filedesc *fdp; + register int indx, dfd; + int mode; + int error; +{ + register struct file *wfp; + struct file *fp; + + /* + * If the to-be-dup'd fd number is greater than the allowed number + * of file descriptors, or the fd to be dup'd has already been + * closed, reject. Note, check for new == old is necessary as + * falloc could allocate an already closed to-be-dup'd descriptor + * as the new descriptor. + */ + fp = fdp->fd_ofiles[indx]; + if ((u_int)dfd >= fdp->fd_nfiles || + (wfp = fdp->fd_ofiles[dfd]) == NULL || fp == wfp) + return (EBADF); + + /* + * There are two cases of interest here. + * + * For ENODEV simply dup (dfd) to file descriptor + * (indx) and return. + * + * For ENXIO steal away the file structure from (dfd) and + * store it in (indx). (dfd) is effectively closed by + * this operation. + * + * Any other error code is just returned. + */ + switch (error) { + case ENODEV: + /* + * Check that the mode the file is being opened for is a + * subset of the mode of the existing descriptor.
+ */ + if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) + return (EACCES); + fdp->fd_ofiles[indx] = wfp; + fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; + wfp->f_count++; + if (indx > fdp->fd_lastfile) + fdp->fd_lastfile = indx; + return (0); + + case ENXIO: + /* + * Steal away the file pointer from dfd, and stuff it into indx. + */ + fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; + fdp->fd_ofiles[dfd] = NULL; + fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; + fdp->fd_ofileflags[dfd] = 0; + /* + * Complete the clean up of the filedesc structure by + * recomputing the various hints. + */ + if (indx > fdp->fd_lastfile) + fdp->fd_lastfile = indx; + else + while (fdp->fd_lastfile > 0 && + fdp->fd_ofiles[fdp->fd_lastfile] == NULL) + fdp->fd_lastfile--; + if (dfd < fdp->fd_freefile) + fdp->fd_freefile = dfd; + return (0); + + default: + return (error); + } + /* NOTREACHED */ +} diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c new file mode 100644 index 00000000000..fbb4444d52b --- /dev/null +++ b/sys/kern/kern_exec.c @@ -0,0 +1,64 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)kern_exec.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include + +/* + * exec system call (stub: the body was deleted, so it always returns ENOSYS) + */ +struct execve_args { + char *fname; + char **argp; + char **envp; +}; +/* ARGSUSED */ +execve(a1, a2, a3) + struct proc *a1; + struct execve_args *a2; + int *a3; +{ + + /* + * Body deleted.
+ */ + return (ENOSYS); +} diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c new file mode 100644 index 00000000000..03353c72d1d --- /dev/null +++ b/sys/kern/kern_exit.c @@ -0,0 +1,492 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)kern_exit.c	8.7 (Berkeley) 2/12/94
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#ifdef COMPAT_43
+#include
+#include
+#endif
+
+#include
+#include
+
+__dead void cpu_exit __P((struct proc *));
+__dead void exit1 __P((struct proc *, int));
+
+/*
+ * exit --
+ *	Death of process.
+ */
+struct rexit_args {
+	int	rval;
+};
+__dead void
+exit(p, uap, retval)	/* wraps uap->rval with W_EXITCODE(rval, 0) and hands it to exit1(); retval is unused */
+	struct proc *p;
+	struct rexit_args *uap;
+	int *retval;
+{
+
+	exit1(p, W_EXITCODE(uap->rval, 0));
+	/* NOTREACHED */
+}
+
+/*
+ * Exit: deallocate address space and other resources, change proc state
+ * to zombie, and unlink proc from allproc and parent's lists.  Save exit
+ * status and rusage for wait().  Check for child processes and orphan them.
+ */
+__dead void
+exit1(p, rv)	/* rv is stored verbatim in p->p_xstat; ends in cpu_exit(p) */
+	register struct proc *p;
+	int rv;
+{
+	register struct proc *q, *nq;
+	register struct proc **pp;
+	register struct vmspace *vm;
+
+	if (p->p_pid == 1)
+		panic("init died (signal %d, exit %d)",
+		    WTERMSIG(rv), WEXITSTATUS(rv));
+#ifdef PGINPROF
+	vmsizmon();
+#endif
+	if (p->p_flag & P_PROFIL)
+		stopprofclock(p);
+	MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage),
+		M_ZOMBIE, M_WAITOK);	/* filled in below; released by wait1() with FREE(p->p_ru, M_ZOMBIE) */
+	/*
+	 * If parent is waiting for us to exit or exec,
+	 * P_PPWAIT is set; we will wakeup the parent below.
+	 */
+	p->p_flag &= ~(P_TRACED | P_PPWAIT);
+	p->p_flag |= P_WEXIT;
+	p->p_sigignore = ~0;
+	p->p_siglist = 0;
+	untimeout(realitexpire, (caddr_t)p);
+
+	/*
+	 * Close open files and release open-file table.
+	 * This may block!
+	 */
+	fdfree(p);
+
+	/* The next two chunks should probably be moved to vmspace_exit. */
+	vm = p->p_vmspace;
+#ifdef SYSVSHM
+	if (vm->vm_shm)
+		shmexit(p);
+#endif
+	/*
+	 * Release user portion of address space.
+	 * This releases references to vnodes,
+	 * which could cause I/O if the file has been unlinked.
+	 * Need to do this early enough that we can still sleep.
+	 * Can't free the entire vmspace as the kernel stack
+	 * may be mapped within that space also.
+	 */
+	if (vm->vm_refcnt == 1)	/* only when this process holds the sole reference */
+		(void) vm_map_remove(&vm->vm_map, VM_MIN_ADDRESS,
+		    VM_MAXUSER_ADDRESS);
+
+	if (SESS_LEADER(p)) {
+		register struct session *sp = p->p_session;
+
+		if (sp->s_ttyvp) {
+			/*
+			 * Controlling process.
+			 * Signal foreground pgrp,
+			 * drain controlling terminal
+			 * and revoke access to controlling terminal.
+			 */
+			if (sp->s_ttyp->t_session == sp) {
+				if (sp->s_ttyp->t_pgrp)
+					pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1);
+				(void) ttywait(sp->s_ttyp);
+				/*
+				 * The tty could have been revoked
+				 * if we blocked.
+				 */
+				if (sp->s_ttyvp)
+					vgoneall(sp->s_ttyvp);
+			}
+			if (sp->s_ttyvp)
+				vrele(sp->s_ttyvp);
+			sp->s_ttyvp = NULL;
+			/*
+			 * s_ttyp is not zero'd; we use this to indicate
+			 * that the session once had a controlling terminal.
+			 * (for logging and informational purposes)
+			 */
+		}
+		sp->s_leader = NULL;
+	}
+	fixjobc(p, p->p_pgrp, 0);
+	p->p_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;	/* lifted just before acct_process() runs */
+	(void)acct_process(p);
+#ifdef KTRACE
+	/* 
+	 * release trace file
+	 */
+	p->p_traceflag = 0;	/* don't trace the vrele() */
+	if (p->p_tracep)
+		vrele(p->p_tracep);
+#endif
+	/*
+	 * Remove proc from allproc queue and pidhash chain.
+	 * Place onto zombproc.  Unlink from parent's child list.
+	 */
+	if (*p->p_prev = p->p_next)	/* assignment intended: unlink, then fix the successor's back pointer */
+		p->p_next->p_prev = p->p_prev;
+	if (p->p_next = zombproc)	/* assignment intended: push onto the head of zombproc */
+		p->p_next->p_prev = &p->p_next;
+	p->p_prev = &zombproc;
+	zombproc = p;
+	p->p_stat = SZOMB;
+
+	for (pp = &pidhash[PIDHASH(p->p_pid)]; *pp; pp = &(*pp)->p_hash)
+		if (*pp == p) {
+			*pp = p->p_hash;
+			goto done;
+		}
+	panic("exit");	/* reached only if p was not on its pid hash chain */
+done:
+
+	if (p->p_cptr)		/* only need this if any child is SZOMB */
+		wakeup((caddr_t) initproc);
+	for (q = p->p_cptr; q != NULL; q = nq) {	/* move each child to the head of initproc's child list */
+		nq = q->p_osptr;
+		if (nq != NULL)
+			nq->p_ysptr = NULL;
+		if (initproc->p_cptr)
+			initproc->p_cptr->p_ysptr = q;
+		q->p_osptr = initproc->p_cptr;
+		q->p_ysptr = NULL;
+		initproc->p_cptr = q;
+
+		q->p_pptr = initproc;
+		/*
+		 * Traced processes are killed
+		 * since their existence means someone is screwing up.
+		 */
+		if (q->p_flag & P_TRACED) {
+			q->p_flag &= ~P_TRACED;
+			psignal(q, SIGKILL);
+		}
+	}
+	p->p_cptr = NULL;
+
+	/*
+	 * Save exit status and final rusage info, adding in child rusage
+	 * info and self times.
+	 */
+	p->p_xstat = rv;
+	*p->p_ru = p->p_stats->p_ru;
+	calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL);
+	ruadd(p->p_ru, &p->p_stats->p_cru);
+
+	/*
+	 * Notify parent that we're gone.
+	 */
+	psignal(p->p_pptr, SIGCHLD);
+	wakeup((caddr_t)p->p_pptr);	/* wait1() sleeps on the parent's proc address */
+#if defined(tahoe)
+	/* move this to cpu_exit */
+	p->p_addr->u_pcb.pcb_savacc.faddr = (float *)NULL;
+#endif
+	/*
+	 * Clear curproc after we've done all operations
+	 * that could block, and before tearing down the rest
+	 * of the process state that might be used from clock, etc.
+	 * Also, can't clear curproc while we're still runnable,
+	 * as we're not on a run queue (we are current, just not
+	 * a proper proc any longer!).
+	 *
+	 * Other substructures are freed from wait().
+	 */
+	curproc = NULL;
+	if (--p->p_limit->p_refcnt == 0)
+		FREE(p->p_limit, M_SUBPROC);
+
+	/*
+	 * Finally, call machine-dependent code to release the remaining
+	 * resources including address space, the kernel stack and pcb.
+	 * The address space is released by "vmspace_free(p->p_vmspace)";
+	 * This is machine-dependent, as we may have to change stacks
+	 * or ensure that the current one isn't reallocated before we
+	 * finish.  cpu_exit will end with a call to cpu_swtch(), finishing
+	 * our execution (pun intended).
+	 */
+	cpu_exit(p);
+}
+
+struct wait_args {
+	int	pid;
+	int	*status;
+	int	options;
+	struct	rusage *rusage;
+#ifdef COMPAT_43
+	int	compat;		/* pseudo */
+#endif
+};
+
+#ifdef COMPAT_43
+#if defined(hp300) || defined(luna68k)
+#include
+#define GETPS(rp)	((struct frame *)(rp))->f_sr
+#else
+#define	GETPS(rp)	(rp)[PS]
+#endif
+
+owait(p, uap, retval)	/* COMPAT_43 entry: fills in uap (pid = WAIT_ANY, status = NULL, compat = 1) and calls wait1() */
+	struct proc *p;
+	register struct wait_args *uap;
+	int *retval;
+{
+
+#ifdef PSL_ALLCC
+	if ((GETPS(p->p_md.md_regs) & PSL_ALLCC) != PSL_ALLCC) {
+		uap->options = 0;
+		uap->rusage = NULL;
+	} else {
+		uap->options = p->p_md.md_regs[R0];	/* options and rusage pointer come from saved registers R0/R1 */
+		uap->rusage = (struct rusage *)p->p_md.md_regs[R1];
+	}
+#else
+	uap->options = 0;
+	uap->rusage = NULL;
+#endif
+	uap->pid = WAIT_ANY;
+	uap->status = NULL;
+	uap->compat = 1;	/* makes wait1() place the status in retval[1] instead of copying it out */
+	return (wait1(p, uap, retval));
+}
+
+wait4(p, uap, retval)	/* clears uap->compat and shares wait1() */
+	struct proc *p;
+	struct wait_args *uap;
+	int *retval;
+{
+
+	uap->compat = 0;
+	return (wait1(p, uap, retval));
+}
+#else
+#define	wait1	wait4
+#endif
+
+int
+wait1(q, uap, retval)	/* q is the waiting parent; returns 0 or an errno (ECHILD, copyout or tsleep error) */
+	register struct proc *q;
+	register struct wait_args *uap;
+	int retval[];
+{
+	register int nfound;
+	register struct proc *p, *t;
+	int status, error;
+
+	if (uap->pid == 0)
+		uap->pid = -q->p_pgid;	/* pid 0 selects children in the caller's own process group */
+#ifdef notyet
+	if (uap->options &~ (WUNTRACED|WNOHANG))
+		return (EINVAL);
+#endif
+loop:
+	nfound = 0;	/* counts children matching uap->pid on this pass */
+	for (p = q->p_cptr; p; p = p->p_osptr) {
+		if (uap->pid != WAIT_ANY &&
+		    p->p_pid != uap->pid && p->p_pgid != -uap->pid)
+			continue;
+		nfound++;
+		if (p->p_stat == SZOMB) {
+			retval[0] = p->p_pid;
+#ifdef COMPAT_43
+			if (uap->compat)
+				retval[1] = p->p_xstat;
+			else
+#endif
+			if (uap->status) {
+				status = p->p_xstat;	/* convert to int */
+				if (error = copyout((caddr_t)&status,
+				    (caddr_t)uap->status, sizeof(status)))
+					return (error);
+			}
+			if (uap->rusage && (error = copyout((caddr_t)p->p_ru,
+			    (caddr_t)uap->rusage, sizeof (struct rusage))))
+				return (error);
+			/*
+			 * If we got the child via a ptrace 'attach',
+			 * we need to give it back to the old parent.
+			 */
+			if (p->p_oppid && (t = pfind(p->p_oppid))) {
+				p->p_oppid = 0;
+				proc_reparent(p, t);
+				psignal(t, SIGCHLD);
+				wakeup((caddr_t)t);
+				return (0);	/* the zombie is not freed here; it stays for t to collect */
+			}
+			p->p_xstat = 0;
+			ruadd(&q->p_stats->p_cru, p->p_ru);
+			FREE(p->p_ru, M_ZOMBIE);
+
+			/*
+			 * Decrement the count of procs running with this uid.
+			 */
+			(void)chgproccnt(p->p_cred->p_ruid, -1);
+
+			/*
+			 * Free up credentials.
+			 */
+			if (--p->p_cred->p_refcnt == 0) {
+				crfree(p->p_cred->pc_ucred);
+				FREE(p->p_cred, M_SUBPROC);
+			}
+
+			/*
+			 * Release reference to text vnode
+			 */
+			if (p->p_textvp)
+				vrele(p->p_textvp);
+
+			/*
+			 * Finally finished with old proc entry.
+			 * Unlink it from its process group and free it.
+			 */
+			leavepgrp(p);
+			if (*p->p_prev = p->p_next)	/* off zombproc */
+				p->p_next->p_prev = p->p_prev;
+			if (q = p->p_ysptr)	/* NOTE: from here on q is scratch, no longer the caller's proc */
+				q->p_osptr = p->p_osptr;
+			if (q = p->p_osptr)
+				q->p_ysptr = p->p_ysptr;
+			if ((q = p->p_pptr)->p_cptr == p)
+				q->p_cptr = p->p_osptr;
+
+			/*
+			 * Give machine-dependent layer a chance
+			 * to free anything that cpu_exit couldn't
+			 * release while still running in process context.
+			 */
+			cpu_wait(p);
+			FREE(p, M_PROC);
+			nprocs--;
+			return (0);
+		}
+		if (p->p_stat == SSTOP && (p->p_flag & P_WAITED) == 0 &&
+		    (p->p_flag & P_TRACED || uap->options & WUNTRACED)) {
+			p->p_flag |= P_WAITED;	/* so this stop is reported only once */
+			retval[0] = p->p_pid;
+#ifdef COMPAT_43
+			if (uap->compat) {
+				retval[1] = W_STOPCODE(p->p_xstat);
+				error = 0;
+			} else
+#endif
+			if (uap->status) {
+				status = W_STOPCODE(p->p_xstat);
+				error = copyout((caddr_t)&status,
+					(caddr_t)uap->status, sizeof(status));
+			} else
+				error = 0;
+			return (error);
+		}
+	}
+	if (nfound == 0)
+		return (ECHILD);
+	if (uap->options & WNOHANG) {
+		retval[0] = 0;
+		return (0);
+	}
+	if (error = tsleep((caddr_t)q, PWAIT | PCATCH, "wait", 0))	/* same channel exit1() wakes via wakeup(p->p_pptr) */
+		return (error);
+	goto loop;
+}
+
+/*
+ * make process 'parent' the new parent of process 'child'.
+ */
+void
+proc_reparent(child, parent)	/* no-op when parent already is child's p_pptr */
+	register struct proc *child;
+	register struct proc *parent;
+{
+	register struct proc *o;
+	register struct proc *y;
+
+	if (child->p_pptr == parent)
+		return;
+
+	/* fix up the child linkage for the old parent */
+	o = child->p_osptr;
+	y = child->p_ysptr;
+	if (y)
+		y->p_osptr = o;
+	if (o)
+		o->p_ysptr = y;
+	if (child->p_pptr->p_cptr == child)
+		child->p_pptr->p_cptr = o;
+
+	/* fix up child linkage for new parent */
+	o = parent->p_cptr;
+	if (o)
+		o->p_ysptr = child;
+	child->p_osptr = o;
+	child->p_ysptr = NULL;
+	parent->p_cptr = child;
+	child->p_pptr = parent;
+}
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
new file mode 100644
index 00000000000..8bec2fa5d5f
--- /dev/null
+++ b/sys/kern/kern_fork.c
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc.
and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ *	@(#)kern_fork.c	8.6 (Berkeley) 4/8/94
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct fork_args {
+	int	dummy;
+};
+/* ARGSUSED */
+fork(p, uap, retval)	/* fork1() with isvfork = 0; uap is unused */
+	struct proc *p;
+	struct fork_args *uap;
+	int retval[];
+{
+
+	return (fork1(p, 0, retval));
+}
+
+/* ARGSUSED */
+vfork(p, uap, retval)	/* fork1() with isvfork = 1: child gets P_PPWAIT and the parent sleeps until it clears */
+	struct proc *p;
+	struct fork_args *uap;
+	int retval[];
+{
+
+	return (fork1(p, 1, retval));
+}
+
+int	nprocs = 1;		/* process 0 */
+
+fork1(p1, isvfork, retval)	/* p1 is the parent; returns 0 (in both parent and child) or EAGAIN when a process limit is hit */
+	register struct proc *p1;
+	int isvfork, retval[];
+{
+	register struct proc *p2;
+	register uid_t uid;
+	struct proc *newproc;
+	struct proc **hash;
+	int count;
+	static int nextpid, pidchecked = 0;	/* static: the pid search state persists across calls */
+
+	/*
+	 * Although process entries are dynamically created, we still keep
+	 * a global limit on the maximum number we will create.  Don't allow
+	 * a nonprivileged user to use the last process; don't let root
+	 * exceed the limit. The variable nprocs is the current number of
+	 * processes, maxproc is the limit.
+	 */
+	uid = p1->p_cred->p_ruid;
+	if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) {
+		tablefull("proc");
+		return (EAGAIN);
+	}
+	/*
+	 * Increment the count of procs running with this uid. Don't allow
+	 * a nonprivileged user to exceed their current limit.
+	 */
+	count = chgproccnt(uid, 1);
+	if (uid != 0 && count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur) {
+		(void)chgproccnt(uid, -1);	/* undo the increment before failing */
+		return (EAGAIN);
+	}
+
+	/* Allocate new proc. */
+	MALLOC(newproc, struct proc *, sizeof(struct proc), M_PROC, M_WAITOK);
+
+	/*
+	 * Find an unused process ID.  We remember a range of unused IDs
+	 * ready to use (from nextpid+1 through pidchecked-1).
+	 */
+	nextpid++;
+retry:
+	/*
+	 * If the process ID prototype has wrapped around,
+	 * restart somewhat above 0, as the low-numbered procs
+	 * tend to include daemons that don't exit.
+	 */
+	if (nextpid >= PID_MAX) {
+		nextpid = 100;
+		pidchecked = 0;	/* forces the rescan below */
+	}
+	if (nextpid >= pidchecked) {
+		int doingzomb = 0;
+
+		pidchecked = PID_MAX;
+		/*
+		 * Scan the active and zombie procs to check whether this pid
+		 * is in use.  Remember the lowest pid that's greater
+		 * than nextpid, so we can avoid checking for a while.
+		 */
+		p2 = (struct proc *)allproc;
+again:
+		for (; p2 != NULL; p2 = p2->p_next) {
+			while (p2->p_pid == nextpid ||
+			    p2->p_pgrp->pg_id == nextpid) {	/* a pid also may not collide with a process group id */
+				nextpid++;
+				if (nextpid >= pidchecked)
+					goto retry;
+			}
+			if (p2->p_pid > nextpid && pidchecked > p2->p_pid)
+				pidchecked = p2->p_pid;
+			if (p2->p_pgrp->pg_id > nextpid && 
+			    pidchecked > p2->p_pgrp->pg_id)
+				pidchecked = p2->p_pgrp->pg_id;
+		}
+		if (!doingzomb) {	/* second pass walks the zombproc list */
+			doingzomb = 1;
+			p2 = zombproc;
+			goto again;
+		}
+	}
+
+
+	/*
+	 * Link onto allproc (this should probably be delayed).
+	 * Heavy use of volatile here to prevent the compiler from
+	 * rearranging code.  Yes, it *is* terribly ugly, but at least
+	 * it works.
+	 */
+	nprocs++;
+	p2 = newproc;
+#define	Vp2 ((volatile struct proc *)p2)
+	Vp2->p_stat = SIDL;			/* protect against others */
+	Vp2->p_pid = nextpid;
+	/*
+	 * This is really:
+	 *	p2->p_next = allproc;
+	 *	allproc->p_prev = &p2->p_next;
+	 *	p2->p_prev = &allproc;
+	 *	allproc = p2;
+	 * The assignment via allproc is legal since it is never NULL.
+	 */
+	*(volatile struct proc **)&Vp2->p_next = allproc;
+	*(volatile struct proc ***)&allproc->p_prev =
+	    (volatile struct proc **)&Vp2->p_next;
+	*(volatile struct proc ***)&Vp2->p_prev = &allproc;
+	allproc = Vp2;
+#undef Vp2
+	p2->p_forw = p2->p_back = NULL;	/* shouldn't be necessary */
+
+	/* Insert on the hash chain. */
+	hash = &pidhash[PIDHASH(p2->p_pid)];
+	p2->p_hash = *hash;
+	*hash = p2;
+
+	/*
+	 * Make a proc table entry for the new process.
+	 * Start by zeroing the section of proc that is zero-initialized,
+	 * then copy the section that is copied directly from the parent.
+	 */
+	bzero(&p2->p_startzero,
+	    (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero));
+	bcopy(&p1->p_startcopy, &p2->p_startcopy,
+	    (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));
+
+	/*
+	 * Duplicate sub-structures as needed.
+	 * Increase reference counts on shared objects.
+	 * The p_stats and p_sigacts substructs are set in vm_fork.
+	 */
+	p2->p_flag = P_INMEM;
+	if (p1->p_flag & P_PROFIL)
+		startprofclock(p2);
+	MALLOC(p2->p_cred, struct pcred *, sizeof(struct pcred),
+	    M_SUBPROC, M_WAITOK);
+	bcopy(p1->p_cred, p2->p_cred, sizeof(*p2->p_cred));
+	p2->p_cred->p_refcnt = 1;	/* the copy starts with a single reference, not the parent's count */
+	crhold(p1->p_ucred);
+
+	/* bump references to the text vnode (for procfs) */
+	p2->p_textvp = p1->p_textvp;
+	if (p2->p_textvp)
+		VREF(p2->p_textvp);
+
+	p2->p_fd = fdcopy(p1);
+	/*
+	 * If p_limit is still copy-on-write, bump refcnt,
+	 * otherwise get a copy that won't be modified.
+	 * (If PL_SHAREMOD is clear, the structure is shared
+	 * copy-on-write.)
+	 */
+	if (p1->p_limit->p_lflags & PL_SHAREMOD)
+		p2->p_limit = limcopy(p1->p_limit);
+	else {
+		p2->p_limit = p1->p_limit;
+		p2->p_limit->p_refcnt++;
+	}
+
+	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
+		p2->p_flag |= P_CONTROLT;
+	if (isvfork)
+		p2->p_flag |= P_PPWAIT;
+	p2->p_pgrpnxt = p1->p_pgrpnxt;	/* insert p2 right after p1 on the process group member chain */
+	p1->p_pgrpnxt = p2;
+	p2->p_pptr = p1;
+	p2->p_osptr = p1->p_cptr;	/* p2 becomes the first entry of p1's child list */
+	if (p1->p_cptr)
+		p1->p_cptr->p_ysptr = p2;
+	p1->p_cptr = p2;
+#ifdef KTRACE
+	/*
+	 * Copy traceflag and tracefile if enabled.
+	 * If not inherited, these were zeroed above.
+	 */
+	if (p1->p_traceflag&KTRFAC_INHERIT) {
+		p2->p_traceflag = p1->p_traceflag;
+		if ((p2->p_tracep = p1->p_tracep) != NULL)
+			VREF(p2->p_tracep);
+	}
+#endif
+
+	/*
+	 * This begins the section where we must prevent the parent
+	 * from being swapped.
+	 */
+	p1->p_flag |= P_NOSWAP;
+	/*
+	 * Set return values for child before vm_fork,
+	 * so they can be copied to child stack.
+	 * We return parent pid, and mark as child in retval[1].
+	 * NOTE: the kernel stack may be at a different location in the child
+	 * process, and thus addresses of automatic variables (including retval)
+	 * may be invalid after vm_fork returns in the child process.
+	 */
+	retval[0] = p1->p_pid;
+	retval[1] = 1;
+	if (vm_fork(p1, p2, isvfork)) {	/* nonzero return means we are running as the child */
+		/*
+		 * Child process.  Set start time and get to work.
+		 */
+		(void) splclock();
+		p2->p_stats->p_start = time;
+		(void) spl0();
+		p2->p_acflag = AFORK;
+		return (0);
+	}
+
+	/*
+	 * Make child runnable and add to run queue.
+	 */
+	(void) splhigh();
+	p2->p_stat = SRUN;
+	setrunqueue(p2);
+	(void) spl0();
+
+	/*
+	 * Now can be swapped.
+	 */
+	p1->p_flag &= ~P_NOSWAP;
+
+	/*
+	 * Preserve synchronization semantics of vfork.  If waiting for
+	 * child to exec or exit, set P_PPWAIT on child, and sleep on our
+	 * proc (in case of exit).
+	 */
+	if (isvfork)
+		while (p2->p_flag & P_PPWAIT)	/* re-tested after every wakeup */
+			tsleep(p1, PWAIT, "ppwait", 0);
+
+	/*
+	 * Return child pid to parent process,
+	 * marking us as parent via retval[1].
+	 */
+	retval[0] = p2->p_pid;
+	retval[1] = 0;
+	return (0);
+}
diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c
new file mode 100644
index 00000000000..763cfb257ff
--- /dev/null
+++ b/sys/kern/kern_ktrace.c
@@ -0,0 +1,466 @@
+/*
+ * Copyright (c) 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93 + */ + +#ifdef KTRACE + +#include +#include +#include +#include +#include +#include +#include +#include + +struct ktr_header * +ktrgetheader(type) + int type; +{ + register struct ktr_header *kth; + struct proc *p = curproc; /* XXX */ + + MALLOC(kth, struct ktr_header *, sizeof (struct ktr_header), + M_TEMP, M_WAITOK); + kth->ktr_type = type; + microtime(&kth->ktr_time); + kth->ktr_pid = p->p_pid; + bcopy(p->p_comm, kth->ktr_comm, MAXCOMLEN); + return (kth); +} + +ktrsyscall(vp, code, narg, args) + struct vnode *vp; + int code, narg, args[]; +{ + struct ktr_header *kth; + struct ktr_syscall *ktp; + register len = sizeof(struct ktr_syscall) + (narg * sizeof(int)); + struct proc *p = curproc; /* XXX */ + int *argp, i; + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_SYSCALL); + MALLOC(ktp, struct ktr_syscall *, len, M_TEMP, M_WAITOK); + ktp->ktr_code = code; + ktp->ktr_narg = narg; + argp = (int *)((char *)ktp + sizeof(struct ktr_syscall)); + for (i = 0; i < narg; i++) + *argp++ = args[i]; + kth->ktr_buf = (caddr_t)ktp; + kth->ktr_len = len; + ktrwrite(vp, kth); + FREE(ktp, M_TEMP); + FREE(kth, M_TEMP); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +ktrsysret(vp, code, error, retval) + struct vnode *vp; + int code, error, retval; +{ + struct ktr_header *kth; + struct ktr_sysret ktp; + struct proc *p = curproc; /* XXX */ + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_SYSRET); + ktp.ktr_code = code; + ktp.ktr_error = error; + ktp.ktr_retval = retval; /* what about val2 ? 
*/ + + kth->ktr_buf = (caddr_t)&ktp; + kth->ktr_len = sizeof(struct ktr_sysret); + + ktrwrite(vp, kth); + FREE(kth, M_TEMP); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +ktrnamei(vp, path) + struct vnode *vp; + char *path; +{ + struct ktr_header *kth; + struct proc *p = curproc; /* XXX */ + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_NAMEI); + kth->ktr_len = strlen(path); + kth->ktr_buf = path; + + ktrwrite(vp, kth); + FREE(kth, M_TEMP); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +ktrgenio(vp, fd, rw, iov, len, error) + struct vnode *vp; + int fd; + enum uio_rw rw; + register struct iovec *iov; + int len, error; +{ + struct ktr_header *kth; + register struct ktr_genio *ktp; + register caddr_t cp; + register int resid = len, cnt; + struct proc *p = curproc; /* XXX */ + + if (error) + return; + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_GENIO); + MALLOC(ktp, struct ktr_genio *, sizeof(struct ktr_genio) + len, + M_TEMP, M_WAITOK); + ktp->ktr_fd = fd; + ktp->ktr_rw = rw; + cp = (caddr_t)((char *)ktp + sizeof (struct ktr_genio)); + while (resid > 0) { + if ((cnt = iov->iov_len) > resid) + cnt = resid; + if (copyin(iov->iov_base, cp, (unsigned)cnt)) + goto done; + cp += cnt; + resid -= cnt; + iov++; + } + kth->ktr_buf = (caddr_t)ktp; + kth->ktr_len = sizeof (struct ktr_genio) + len; + + ktrwrite(vp, kth); +done: + FREE(kth, M_TEMP); + FREE(ktp, M_TEMP); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +ktrpsig(vp, sig, action, mask, code) + struct vnode *vp; + int sig; + sig_t action; + int mask, code; +{ + struct ktr_header *kth; + struct ktr_psig kp; + struct proc *p = curproc; /* XXX */ + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_PSIG); + kp.signo = (char)sig; + kp.action = action; + kp.mask = mask; + kp.code = code; + kth->ktr_buf = (caddr_t)&kp; + kth->ktr_len = sizeof (struct ktr_psig); + + ktrwrite(vp, kth); + FREE(kth, M_TEMP); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +ktrcsw(vp, out, user) + struct vnode *vp; + int out, user; 
+{ + struct ktr_header *kth; + struct ktr_csw kc; + struct proc *p = curproc; /* XXX */ + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_CSW); + kc.out = out; + kc.user = user; + kth->ktr_buf = (caddr_t)&kc; + kth->ktr_len = sizeof (struct ktr_csw); + + ktrwrite(vp, kth); + FREE(kth, M_TEMP); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +/* Interface and common routines */ + +/* + * ktrace system call + */ +struct ktrace_args { + char *fname; + int ops; + int facs; + int pid; +}; +/* ARGSUSED */ +ktrace(curp, uap, retval) + struct proc *curp; + register struct ktrace_args *uap; + int *retval; +{ + register struct vnode *vp = NULL; + register struct proc *p; + struct pgrp *pg; + int facs = uap->facs & ~KTRFAC_ROOT; + int ops = KTROP(uap->ops); + int descend = uap->ops & KTRFLAG_DESCEND; + int ret = 0; + int error = 0; + struct nameidata nd; + + curp->p_traceflag |= KTRFAC_ACTIVE; + if (ops != KTROP_CLEAR) { + /* + * an operation which requires a file argument. + */ + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->fname, curp); + if (error = vn_open(&nd, FREAD|FWRITE, 0)) { + curp->p_traceflag &= ~KTRFAC_ACTIVE; + return (error); + } + vp = nd.ni_vp; + VOP_UNLOCK(vp); + if (vp->v_type != VREG) { + (void) vn_close(vp, FREAD|FWRITE, curp->p_ucred, curp); + curp->p_traceflag &= ~KTRFAC_ACTIVE; + return (EACCES); + } + } + /* + * Clear all uses of the tracefile + */ + if (ops == KTROP_CLEARFILE) { + for (p = (struct proc *)allproc; p != NULL; p = p->p_next) { + if (p->p_tracep == vp) { + if (ktrcanset(curp, p)) { + p->p_tracep = NULL; + p->p_traceflag = 0; + (void) vn_close(vp, FREAD|FWRITE, + p->p_ucred, p); + } else + error = EPERM; + } + } + goto done; + } + /* + * need something to (un)trace (XXX - why is this here?) 
+ */ + if (!facs) { + error = EINVAL; + goto done; + } + /* + * do it + */ + if (uap->pid < 0) { + /* + * by process group + */ + pg = pgfind(-uap->pid); + if (pg == NULL) { + error = ESRCH; + goto done; + } + for (p = pg->pg_mem; p != NULL; p = p->p_pgrpnxt) + if (descend) + ret |= ktrsetchildren(curp, p, ops, facs, vp); + else + ret |= ktrops(curp, p, ops, facs, vp); + + } else { + /* + * by pid + */ + p = pfind(uap->pid); + if (p == NULL) { + error = ESRCH; + goto done; + } + if (descend) + ret |= ktrsetchildren(curp, p, ops, facs, vp); + else + ret |= ktrops(curp, p, ops, facs, vp); + } + if (!ret) + error = EPERM; +done: + if (vp != NULL) + (void) vn_close(vp, FWRITE, curp->p_ucred, curp); + curp->p_traceflag &= ~KTRFAC_ACTIVE; + return (error); +} + +int +ktrops(curp, p, ops, facs, vp) + struct proc *p, *curp; + int ops, facs; + struct vnode *vp; +{ + + if (!ktrcanset(curp, p)) + return (0); + if (ops == KTROP_SET) { + if (p->p_tracep != vp) { + /* + * if trace file already in use, relinquish + */ + if (p->p_tracep != NULL) + vrele(p->p_tracep); + VREF(vp); + p->p_tracep = vp; + } + p->p_traceflag |= facs; + if (curp->p_ucred->cr_uid == 0) + p->p_traceflag |= KTRFAC_ROOT; + } else { + /* KTROP_CLEAR */ + if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) { + /* no more tracing */ + p->p_traceflag = 0; + if (p->p_tracep != NULL) { + vrele(p->p_tracep); + p->p_tracep = NULL; + } + } + } + + return (1); +} + +ktrsetchildren(curp, top, ops, facs, vp) + struct proc *curp, *top; + int ops, facs; + struct vnode *vp; +{ + register struct proc *p; + register int ret = 0; + + p = top; + for (;;) { + ret |= ktrops(curp, p, ops, facs, vp); + /* + * If this process has children, descend to them next, + * otherwise do any siblings, and if done with this level, + * follow back up the tree (but not past top). 
+ */ + if (p->p_cptr) + p = p->p_cptr; + else if (p == top) + return (ret); + else if (p->p_osptr) + p = p->p_osptr; + else for (;;) { + p = p->p_pptr; + if (p == top) + return (ret); + if (p->p_osptr) { + p = p->p_osptr; + break; + } + } + } + /*NOTREACHED*/ +} + +ktrwrite(vp, kth) + struct vnode *vp; + register struct ktr_header *kth; +{ + struct uio auio; + struct iovec aiov[2]; + register struct proc *p = curproc; /* XXX */ + int error; + + if (vp == NULL) + return; + auio.uio_iov = &aiov[0]; + auio.uio_offset = 0; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + aiov[0].iov_base = (caddr_t)kth; + aiov[0].iov_len = sizeof(struct ktr_header); + auio.uio_resid = sizeof(struct ktr_header); + auio.uio_iovcnt = 1; + auio.uio_procp = (struct proc *)0; + if (kth->ktr_len > 0) { + auio.uio_iovcnt++; + aiov[1].iov_base = kth->ktr_buf; + aiov[1].iov_len = kth->ktr_len; + auio.uio_resid += kth->ktr_len; + } + VOP_LOCK(vp); + error = VOP_WRITE(vp, &auio, IO_UNIT|IO_APPEND, p->p_ucred); + VOP_UNLOCK(vp); + if (!error) + return; + /* + * If error encountered, give up tracing on this vnode. + */ + log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n", + error); + for (p = (struct proc *)allproc; p != NULL; p = p->p_next) { + if (p->p_tracep == vp) { + p->p_tracep = NULL; + p->p_traceflag = 0; + vrele(vp); + } + } +} + +/* + * Return true if caller has permission to set the ktracing state + * of target. Essentially, the target can't possess any + * more permissions than the caller. KTRFAC_ROOT signifies that + * root previously set the tracing status on the target process, and + * so, only root may further change it. + * + * TODO: check groups. use caller effective gid. 
+ */ +ktrcanset(callp, targetp) + struct proc *callp, *targetp; +{ + register struct pcred *caller = callp->p_cred; + register struct pcred *target = targetp->p_cred; + + if ((caller->pc_ucred->cr_uid == target->p_ruid && + target->p_ruid == target->p_svuid && + caller->p_rgid == target->p_rgid && /* XXX */ + target->p_rgid == target->p_svgid && + (targetp->p_traceflag & KTRFAC_ROOT) == 0) || + caller->pc_ucred->cr_uid == 0) + return (1); + + return (0); +} + +#endif diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c new file mode 100644 index 00000000000..c6276bc73cf --- /dev/null +++ b/sys/kern/kern_malloc.c @@ -0,0 +1,381 @@ +/* + * Copyright (c) 1987, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include + +#include +#include + +struct kmembuckets bucket[MINBUCKET + 16]; +struct kmemstats kmemstats[M_LAST]; +struct kmemusage *kmemusage; +char *kmembase, *kmemlimit; +char *memname[] = INITKMEMNAMES; + +#ifdef DIAGNOSTIC +/* + * This structure provides a set of masks to catch unaligned frees. + */ +long addrmask[] = { 0, + 0x00000001, 0x00000003, 0x00000007, 0x0000000f, + 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff, + 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, + 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, +}; + +/* + * The WEIRD_ADDR is used as known text to copy into free objects so + * that modifications after frees can be detected. + */ +#define WEIRD_ADDR 0xdeadbeef +#define MAX_COPY 32 + +/* + * Normally the first word of the structure is used to hold the list + * pointer for free objects. However, when running with diagnostics, + * we use the third and fourth fields, so as to catch modifications + * in the most commonly trashed first two words. 
+ */ +struct freelist { + long spare0; + short type; + long spare1; + caddr_t next; +}; +#else /* !DIAGNOSTIC */ +struct freelist { + caddr_t next; +}; +#endif /* DIAGNOSTIC */ + +/* + * Allocate a block of memory + */ +void * +malloc(size, type, flags) + unsigned long size; + int type, flags; +{ + register struct kmembuckets *kbp; + register struct kmemusage *kup; + register struct freelist *freep; + long indx, npg, allocsize; + int s; + caddr_t va, cp, savedlist; +#ifdef DIAGNOSTIC + long *end, *lp; + int copysize; + char *savedtype; +#endif +#ifdef KMEMSTATS + register struct kmemstats *ksp = &kmemstats[type]; + + if (((unsigned long)type) > M_LAST) + panic("malloc - bogus type"); +#endif + indx = BUCKETINDX(size); + kbp = &bucket[indx]; + s = splimp(); +#ifdef KMEMSTATS + while (ksp->ks_memuse >= ksp->ks_limit) { + if (flags & M_NOWAIT) { + splx(s); + return ((void *) NULL); + } + if (ksp->ks_limblocks < 65535) + ksp->ks_limblocks++; + tsleep((caddr_t)ksp, PSWP+2, memname[type], 0); + } + ksp->ks_size |= 1 << indx; +#endif +#ifdef DIAGNOSTIC + copysize = 1 << indx < MAX_COPY ? 
1 << indx : MAX_COPY; +#endif + if (kbp->kb_next == NULL) { + kbp->kb_last = NULL; + if (size > MAXALLOCSAVE) + allocsize = roundup(size, CLBYTES); + else + allocsize = 1 << indx; + npg = clrnd(btoc(allocsize)); + va = (caddr_t) kmem_malloc(kmem_map, (vm_size_t)ctob(npg), + !(flags & M_NOWAIT)); + if (va == NULL) { + splx(s); + return ((void *) NULL); + } +#ifdef KMEMSTATS + kbp->kb_total += kbp->kb_elmpercl; +#endif + kup = btokup(va); + kup->ku_indx = indx; + if (allocsize > MAXALLOCSAVE) { + if (npg > 65535) + panic("malloc: allocation too large"); + kup->ku_pagecnt = npg; +#ifdef KMEMSTATS + ksp->ks_memuse += allocsize; +#endif + goto out; + } +#ifdef KMEMSTATS + kup->ku_freecnt = kbp->kb_elmpercl; + kbp->kb_totalfree += kbp->kb_elmpercl; +#endif + /* + * Just in case we blocked while allocating memory, + * and someone else also allocated memory for this + * bucket, don't assume the list is still empty. + */ + savedlist = kbp->kb_next; + kbp->kb_next = cp = va + (npg * NBPG) - allocsize; + for (;;) { + freep = (struct freelist *)cp; +#ifdef DIAGNOSTIC + /* + * Copy in known text to detect modification + * after freeing. + */ + end = (long *)&cp[copysize]; + for (lp = (long *)cp; lp < end; lp++) + *lp = WEIRD_ADDR; + freep->type = M_FREE; +#endif /* DIAGNOSTIC */ + if (cp <= va) + break; + cp -= allocsize; + freep->next = cp; + } + freep->next = savedlist; + if (kbp->kb_last == NULL) + kbp->kb_last = (caddr_t)freep; + } + va = kbp->kb_next; + kbp->kb_next = ((struct freelist *)va)->next; +#ifdef DIAGNOSTIC + freep = (struct freelist *)va; + savedtype = (unsigned)freep->type < M_LAST ? 
+ memname[freep->type] : "???"; + if (kbp->kb_next && + !kernacc(kbp->kb_next, sizeof(struct freelist), 0)) { + printf("%s of object 0x%x size %d %s %s (invalid addr 0x%x)\n", + "Data modified on freelist: word 2.5", va, size, + "previous type", savedtype, kbp->kb_next); + kbp->kb_next = NULL; + } +#if BYTE_ORDER == BIG_ENDIAN + freep->type = WEIRD_ADDR >> 16; +#endif +#if BYTE_ORDER == LITTLE_ENDIAN + freep->type = (short)WEIRD_ADDR; +#endif + if (((long)(&freep->next)) & 0x2) + freep->next = (caddr_t)((WEIRD_ADDR >> 16)|(WEIRD_ADDR << 16)); + else + freep->next = (caddr_t)WEIRD_ADDR; + end = (long *)&va[copysize]; + for (lp = (long *)va; lp < end; lp++) { + if (*lp == WEIRD_ADDR) + continue; + printf("%s %d of object 0x%x size %d %s %s (0x%x != 0x%x)\n", + "Data modified on freelist: word", lp - (long *)va, + va, size, "previous type", savedtype, *lp, WEIRD_ADDR); + break; + } + freep->spare0 = 0; +#endif /* DIAGNOSTIC */ +#ifdef KMEMSTATS + kup = btokup(va); + if (kup->ku_indx != indx) + panic("malloc: wrong bucket"); + if (kup->ku_freecnt == 0) + panic("malloc: lost data"); + kup->ku_freecnt--; + kbp->kb_totalfree--; + ksp->ks_memuse += 1 << indx; +out: + kbp->kb_calls++; + ksp->ks_inuse++; + ksp->ks_calls++; + if (ksp->ks_memuse > ksp->ks_maxused) + ksp->ks_maxused = ksp->ks_memuse; +#else +out: +#endif + splx(s); + return ((void *) va); +} + +/* + * Free a block of memory allocated by malloc. + */ +void +free(addr, type) + void *addr; + int type; +{ + register struct kmembuckets *kbp; + register struct kmemusage *kup; + register struct freelist *freep; + long size; + int s; +#ifdef DIAGNOSTIC + caddr_t cp; + long *end, *lp, alloc, copysize; +#endif +#ifdef KMEMSTATS + register struct kmemstats *ksp = &kmemstats[type]; +#endif + + kup = btokup(addr); + size = 1 << kup->ku_indx; + kbp = &bucket[kup->ku_indx]; + s = splimp(); +#ifdef DIAGNOSTIC + /* + * Check for returns of data that do not point to the + * beginning of the allocation. 
+ */ + if (size > NBPG * CLSIZE) + alloc = addrmask[BUCKETINDX(NBPG * CLSIZE)]; + else + alloc = addrmask[kup->ku_indx]; + if (((u_long)addr & alloc) != 0) + panic("free: unaligned addr 0x%x, size %d, type %s, mask %d\n", + addr, size, memname[type], alloc); +#endif /* DIAGNOSTIC */ + if (size > MAXALLOCSAVE) { + kmem_free(kmem_map, (vm_offset_t)addr, ctob(kup->ku_pagecnt)); +#ifdef KMEMSTATS + size = kup->ku_pagecnt << PGSHIFT; + ksp->ks_memuse -= size; + kup->ku_indx = 0; + kup->ku_pagecnt = 0; + if (ksp->ks_memuse + size >= ksp->ks_limit && + ksp->ks_memuse < ksp->ks_limit) + wakeup((caddr_t)ksp); + ksp->ks_inuse--; + kbp->kb_total -= 1; +#endif + splx(s); + return; + } + freep = (struct freelist *)addr; +#ifdef DIAGNOSTIC + /* + * Check for multiple frees. Use a quick check to see if + * it looks free before laboriously searching the freelist. + */ + if (freep->spare0 == WEIRD_ADDR) { + for (cp = kbp->kb_next; cp; cp = *(caddr_t *)cp) { + if (addr != cp) + continue; + printf("multiply freed item 0x%x\n", addr); + panic("free: duplicated free"); + } + } + /* + * Copy in known text to detect modification after freeing + * and to make it look free. Also, save the type being freed + * so we can list likely culprit if modification is detected + * when the object is reallocated. + */ + copysize = size < MAX_COPY ? 
size : MAX_COPY; + end = (long *)&((caddr_t)addr)[copysize]; + for (lp = (long *)addr; lp < end; lp++) + *lp = WEIRD_ADDR; + freep->type = type; +#endif /* DIAGNOSTIC */ +#ifdef KMEMSTATS + kup->ku_freecnt++; + if (kup->ku_freecnt >= kbp->kb_elmpercl) + if (kup->ku_freecnt > kbp->kb_elmpercl) + panic("free: multiple frees"); + else if (kbp->kb_totalfree > kbp->kb_highwat) + kbp->kb_couldfree++; + kbp->kb_totalfree++; + ksp->ks_memuse -= size; + if (ksp->ks_memuse + size >= ksp->ks_limit && + ksp->ks_memuse < ksp->ks_limit) + wakeup((caddr_t)ksp); + ksp->ks_inuse--; +#endif + if (kbp->kb_next == NULL) + kbp->kb_next = addr; + else + ((struct freelist *)kbp->kb_last)->next = addr; + freep->next = NULL; + kbp->kb_last = addr; + splx(s); +} + +/* + * Initialize the kernel memory allocator + */ +kmeminit() +{ + register long indx; + int npg; + +#if ((MAXALLOCSAVE & (MAXALLOCSAVE - 1)) != 0) + ERROR!_kmeminit:_MAXALLOCSAVE_not_power_of_2 +#endif +#if (MAXALLOCSAVE > MINALLOCSIZE * 32768) + ERROR!_kmeminit:_MAXALLOCSAVE_too_big +#endif +#if (MAXALLOCSAVE < CLBYTES) + ERROR!_kmeminit:_MAXALLOCSAVE_too_small +#endif + npg = VM_KMEM_SIZE/ NBPG; + kmemusage = (struct kmemusage *) kmem_alloc(kernel_map, + (vm_size_t)(npg * sizeof(struct kmemusage))); + kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase, + (vm_offset_t *)&kmemlimit, (vm_size_t)(npg * NBPG), FALSE); +#ifdef KMEMSTATS + for (indx = 0; indx < MINBUCKET + 16; indx++) { + if (1 << indx >= CLBYTES) + bucket[indx].kb_elmpercl = 1; + else + bucket[indx].kb_elmpercl = CLBYTES / (1 << indx); + bucket[indx].kb_highwat = 5 * bucket[indx].kb_elmpercl; + } + for (indx = 0; indx < M_LAST; indx++) + kmemstats[indx].ks_limit = npg * NBPG * 6 / 10; +#endif +} diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c new file mode 100644 index 00000000000..1eaae3599de --- /dev/null +++ b/sys/kern/kern_physio.c @@ -0,0 +1,93 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of 
California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)kern_physio.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include + +physio(a1, a2, a3, a4, a5, a6) + int (*a1)(); + struct buf *a2; + dev_t a3; + int a4; + u_int (*a5)(); + struct uio *a6; +{ + + /* + * Body deleted. + */ + return (EIO); +} + +u_int +minphys(a1) + struct buf *a1; +{ + + /* + * Body deleted. + */ + return (0); +} + +/* + * Do a read on a device for a user process. + */ +rawread(dev, uio) + dev_t dev; + struct uio *uio; +{ + return (physio(cdevsw[major(dev)].d_strategy, (struct buf *)NULL, + dev, B_READ, minphys, uio)); +} + +/* + * Do a write on a device for a user process. + */ +rawwrite(dev, uio) + dev_t dev; + struct uio *uio; +{ + return (physio(cdevsw[major(dev)].d_strategy, (struct buf *)NULL, + dev, B_WRITE, minphys, uio)); +} diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c new file mode 100644 index 00000000000..91d9e212d38 --- /dev/null +++ b/sys/kern/kern_proc.c @@ -0,0 +1,401 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_proc.c 8.4 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Structure associated with user cacheing. + */ +struct uidinfo { + struct uidinfo *ui_next; + struct uidinfo **ui_prev; + uid_t ui_uid; + long ui_proccnt; +} **uihashtbl; +u_long uihash; /* size of hash table - 1 */ +#define UIHASH(uid) ((uid) & uihash) + +/* + * Allocate a hash table. 
+ */
+usrinfoinit()
+{
+
+	/* Table is sized from maxproc / 16; hashinit() fills in uihash. */
+	uihashtbl = hashinit(maxproc / 16, M_PROC, &uihash);
+}
+
+/*
+ * Adjust, by diff, the number of processes charged to user uid and
+ * return the resulting count.  A record is created the first time a
+ * uid is charged and released again when its count returns to zero.
+ */
+int
+chgproccnt(uid, diff)
+	uid_t	uid;
+	int	diff;
+{
+	register struct uidinfo **head, *ent, *nxt;
+
+	/* Search this uid's hash chain for an existing record. */
+	head = &uihashtbl[UIHASH(uid)];
+	ent = *head;
+	while (ent != NULL && ent->ui_uid != uid)
+		ent = ent->ui_next;
+
+	if (ent == NULL) {
+		/* No record yet: only a positive count may create one. */
+		if (diff == 0)
+			return(0);
+		if (diff < 0)
+			panic("chgproccnt: lost user");
+		MALLOC(ent, struct uidinfo *, sizeof(*ent), M_PROC, M_WAITOK);
+		/* Push the new record on the front of the chain. */
+		nxt = *head;
+		if (nxt != NULL)
+			nxt->ui_prev = &ent->ui_next;
+		ent->ui_next = nxt;
+		ent->ui_prev = head;
+		*head = ent;
+		ent->ui_uid = uid;
+		ent->ui_proccnt = diff;
+		return (diff);
+	}
+
+	ent->ui_proccnt += diff;
+	if (ent->ui_proccnt > 0)
+		return (ent->ui_proccnt);
+	if (ent->ui_proccnt < 0)
+		panic("chgproccnt: procs < 0");
+	/* The count reached zero: unlink the record and release it. */
+	nxt = ent->ui_next;
+	if (nxt != NULL)
+		nxt->ui_prev = ent->ui_prev;
+	*ent->ui_prev = nxt;
+	FREE(ent, M_PROC);
+	return (0);
+}
+
+/*
+ * Is p an inferior of the current process?
+ */ +inferior(p) + register struct proc *p; +{ + + for (; p != curproc; p = p->p_pptr) + if (p->p_pid == 0) + return (0); + return (1); +} + +/* + * Locate a process by number + */ +struct proc * +pfind(pid) + register pid_t pid; +{ + register struct proc *p; + + for (p = pidhash[PIDHASH(pid)]; p != NULL; p = p->p_hash) + if (p->p_pid == pid) + return (p); + return (NULL); +} + +/* + * Locate a process group by number + */ +struct pgrp * +pgfind(pgid) + register pid_t pgid; +{ + register struct pgrp *pgrp; + + for (pgrp = pgrphash[PIDHASH(pgid)]; + pgrp != NULL; pgrp = pgrp->pg_hforw) + if (pgrp->pg_id == pgid) + return (pgrp); + return (NULL); +} + +/* + * Move p to a new or existing process group (and session) + */ +enterpgrp(p, pgid, mksess) + register struct proc *p; + pid_t pgid; + int mksess; +{ + register struct pgrp *pgrp = pgfind(pgid); + register struct proc **pp; + int n; + +#ifdef DIAGNOSTIC + if (pgrp != NULL && mksess) /* firewalls */ + panic("enterpgrp: setsid into non-empty pgrp"); + if (SESS_LEADER(p)) + panic("enterpgrp: session leader attempted setpgrp"); +#endif + if (pgrp == NULL) { + pid_t savepid = p->p_pid; + struct proc *np; + /* + * new process group + */ +#ifdef DIAGNOSTIC + if (p->p_pid != pgid) + panic("enterpgrp: new pgrp and pid != pgid"); +#endif + MALLOC(pgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, + M_WAITOK); + if ((np = pfind(savepid)) == NULL || np != p) + return (ESRCH); + if (mksess) { + register struct session *sess; + + /* + * new session + */ + MALLOC(sess, struct session *, sizeof(struct session), + M_SESSION, M_WAITOK); + sess->s_leader = p; + sess->s_count = 1; + sess->s_ttyvp = NULL; + sess->s_ttyp = NULL; + bcopy(p->p_session->s_login, sess->s_login, + sizeof(sess->s_login)); + p->p_flag &= ~P_CONTROLT; + pgrp->pg_session = sess; +#ifdef DIAGNOSTIC + if (p != curproc) + panic("enterpgrp: mksession and p != curproc"); +#endif + } else { + pgrp->pg_session = p->p_session; + pgrp->pg_session->s_count++; + } + 
pgrp->pg_id = pgid; + pgrp->pg_hforw = pgrphash[n = PIDHASH(pgid)]; + pgrphash[n] = pgrp; + pgrp->pg_jobc = 0; + pgrp->pg_mem = NULL; + } else if (pgrp == p->p_pgrp) + return (0); + + /* + * Adjust eligibility of affected pgrps to participate in job control. + * Increment eligibility counts before decrementing, otherwise we + * could reach 0 spuriously during the first call. + */ + fixjobc(p, pgrp, 1); + fixjobc(p, p->p_pgrp, 0); + + /* + * unlink p from old process group + */ + for (pp = &p->p_pgrp->pg_mem; *pp; pp = &(*pp)->p_pgrpnxt) { + if (*pp == p) { + *pp = p->p_pgrpnxt; + break; + } + } +#ifdef DIAGNOSTIC + if (pp == NULL) + panic("enterpgrp: can't find p on old pgrp"); +#endif + /* + * delete old if empty + */ + if (p->p_pgrp->pg_mem == 0) + pgdelete(p->p_pgrp); + /* + * link into new one + */ + p->p_pgrp = pgrp; + p->p_pgrpnxt = pgrp->pg_mem; + pgrp->pg_mem = p; + return (0); +} + +/* + * remove process from process group + */ +leavepgrp(p) + register struct proc *p; +{ + register struct proc **pp = &p->p_pgrp->pg_mem; + + for (; *pp; pp = &(*pp)->p_pgrpnxt) { + if (*pp == p) { + *pp = p->p_pgrpnxt; + break; + } + } +#ifdef DIAGNOSTIC + if (pp == NULL) + panic("leavepgrp: can't find p in pgrp"); +#endif + if (!p->p_pgrp->pg_mem) + pgdelete(p->p_pgrp); + p->p_pgrp = 0; + return (0); +} + +/* + * delete a process group + */ +pgdelete(pgrp) + register struct pgrp *pgrp; +{ + register struct pgrp **pgp = &pgrphash[PIDHASH(pgrp->pg_id)]; + + if (pgrp->pg_session->s_ttyp != NULL && + pgrp->pg_session->s_ttyp->t_pgrp == pgrp) + pgrp->pg_session->s_ttyp->t_pgrp = NULL; + for (; *pgp; pgp = &(*pgp)->pg_hforw) { + if (*pgp == pgrp) { + *pgp = pgrp->pg_hforw; + break; + } + } +#ifdef DIAGNOSTIC + if (pgp == NULL) + panic("pgdelete: can't find pgrp on hash chain"); +#endif + if (--pgrp->pg_session->s_count == 0) + FREE(pgrp->pg_session, M_SESSION); + FREE(pgrp, M_PGRP); +} + +static void orphanpg(); + +/* + * Adjust pgrp jobc counters when specified process changes 
process group. + * We count the number of processes in each process group that "qualify" + * the group for terminal job control (those with a parent in a different + * process group of the same session). If that count reaches zero, the + * process group becomes orphaned. Check both the specified process' + * process group and that of its children. + * entering == 0 => p is leaving specified group. + * entering == 1 => p is entering specified group. + */ +fixjobc(p, pgrp, entering) + register struct proc *p; + register struct pgrp *pgrp; + int entering; +{ + register struct pgrp *hispgrp; + register struct session *mysession = pgrp->pg_session; + + /* + * Check p's parent to see whether p qualifies its own process + * group; if so, adjust count for p's process group. + */ + if ((hispgrp = p->p_pptr->p_pgrp) != pgrp && + hispgrp->pg_session == mysession) + if (entering) + pgrp->pg_jobc++; + else if (--pgrp->pg_jobc == 0) + orphanpg(pgrp); + + /* + * Check this process' children to see whether they qualify + * their process groups; if so, adjust counts for children's + * process groups. + */ + for (p = p->p_cptr; p; p = p->p_osptr) + if ((hispgrp = p->p_pgrp) != pgrp && + hispgrp->pg_session == mysession && + p->p_stat != SZOMB) + if (entering) + hispgrp->pg_jobc++; + else if (--hispgrp->pg_jobc == 0) + orphanpg(hispgrp); +} + +/* + * A process group has become orphaned; + * if there are any stopped processes in the group, + * hang-up all process in that group. 
+ */ +static void +orphanpg(pg) + struct pgrp *pg; +{ + register struct proc *p; + + for (p = pg->pg_mem; p; p = p->p_pgrpnxt) { + if (p->p_stat == SSTOP) { + for (p = pg->pg_mem; p; p = p->p_pgrpnxt) { + psignal(p, SIGHUP); + psignal(p, SIGCONT); + } + return; + } + } +} + +#ifdef debug +/* DEBUG */ +pgrpdump() +{ + register struct pgrp *pgrp; + register struct proc *p; + register i; + + for (i=0; ipg_hforw) { + printf("\tpgrp %x, pgid %d, sess %x, sesscnt %d, mem %x\n", + pgrp, pgrp->pg_id, pgrp->pg_session, + pgrp->pg_session->s_count, pgrp->pg_mem); + for (p=pgrp->pg_mem; p; p=p->p_pgrpnxt) { + printf("\t\tpid %d addr %x pgrp %x\n", + p->p_pid, p, p->p_pgrp); + } + } + + } + } +} +#endif /* debug */ diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c new file mode 100644 index 00000000000..ef400770e20 --- /dev/null +++ b/sys/kern/kern_prot.c @@ -0,0 +1,566 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_prot.c 8.6 (Berkeley) 1/21/94 + */ + +/* + * System calls related to processes and protection + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct args { + int dummy; +}; + +/* ARGSUSED */ +getpid(p, uap, retval) + struct proc *p; + struct args *uap; + int *retval; +{ + + *retval = p->p_pid; +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + retval[1] = p->p_pptr->p_pid; +#endif + return (0); +} + +/* ARGSUSED */ +getppid(p, uap, retval) + struct proc *p; + struct args *uap; + int *retval; +{ + + *retval = p->p_pptr->p_pid; + return (0); +} + +/* Get process group ID; note that POSIX getpgrp takes no parameter */ +getpgrp(p, uap, retval) + struct proc *p; + struct args *uap; + int *retval; +{ + + *retval = p->p_pgrp->pg_id; + return (0); +} + +/* ARGSUSED */ +getuid(p, uap, retval) + struct proc *p; + struct args *uap; + int *retval; +{ + + *retval = p->p_cred->p_ruid; +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + retval[1] = p->p_ucred->cr_uid; +#endif + return (0); +} + +/* ARGSUSED */ +geteuid(p, uap, retval) + struct proc *p; + struct args *uap; + int *retval; +{ + + *retval = p->p_ucred->cr_uid; + return (0); +} + +/* ARGSUSED */ +getgid(p, uap, retval) + struct proc *p; + struct args *uap; + int *retval; +{ + + *retval = p->p_cred->p_rgid; +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + retval[1] = p->p_ucred->cr_groups[0]; +#endif + return (0); +} + +/* + * Get effective group ID. The "egid" is groups[0], and could be obtained + * via getgroups. This syscall exists because it is somewhat painful to do + * correctly in a library function. 
+ */ +/* ARGSUSED */ +getegid(p, uap, retval) + struct proc *p; + struct args *uap; + int *retval; +{ + + *retval = p->p_ucred->cr_groups[0]; + return (0); +} + +struct getgroups_args { + u_int gidsetsize; + gid_t *gidset; +}; +getgroups(p, uap, retval) + struct proc *p; + register struct getgroups_args *uap; + int *retval; +{ + register struct pcred *pc = p->p_cred; + register u_int ngrp; + int error; + + if ((ngrp = uap->gidsetsize) == 0) { + *retval = pc->pc_ucred->cr_ngroups; + return (0); + } + if (ngrp < pc->pc_ucred->cr_ngroups) + return (EINVAL); + ngrp = pc->pc_ucred->cr_ngroups; + if (error = copyout((caddr_t)pc->pc_ucred->cr_groups, + (caddr_t)uap->gidset, ngrp * sizeof(gid_t))) + return (error); + *retval = ngrp; + return (0); +} + +/* ARGSUSED */ +setsid(p, uap, retval) + register struct proc *p; + struct args *uap; + int *retval; +{ + + if (p->p_pgid == p->p_pid || pgfind(p->p_pid)) { + return (EPERM); + } else { + (void)enterpgrp(p, p->p_pid, 1); + *retval = p->p_pid; + return (0); + } +} + +/* + * set process group (setpgid/old setpgrp) + * + * caller does setpgid(targpid, targpgid) + * + * pid must be caller or child of caller (ESRCH) + * if a child + * pid must be in same session (EPERM) + * pid can't have done an exec (EACCES) + * if pgid != pid + * there must exist some pid in same session having pgid (EPERM) + * pid must not be session leader (EPERM) + */ +struct setpgid_args { + int pid; /* target process id */ + int pgid; /* target pgrp id */ +}; +/* ARGSUSED */ +setpgid(curp, uap, retval) + struct proc *curp; + register struct setpgid_args *uap; + int *retval; +{ + register struct proc *targp; /* target process */ + register struct pgrp *pgrp; /* target pgrp */ + + if (uap->pid != 0 && uap->pid != curp->p_pid) { + if ((targp = pfind(uap->pid)) == 0 || !inferior(targp)) + return (ESRCH); + if (targp->p_session != curp->p_session) + return (EPERM); + if (targp->p_flag & P_EXEC) + return (EACCES); + } else + targp = curp; + if 
(SESS_LEADER(targp)) + return (EPERM); + if (uap->pgid == 0) + uap->pgid = targp->p_pid; + else if (uap->pgid != targp->p_pid) + if ((pgrp = pgfind(uap->pgid)) == 0 || + pgrp->pg_session != curp->p_session) + return (EPERM); + return (enterpgrp(targp, uap->pgid, 0)); +} + +struct setuid_args { + uid_t uid; +}; +/* ARGSUSED */ +setuid(p, uap, retval) + struct proc *p; + struct setuid_args *uap; + int *retval; +{ + register struct pcred *pc = p->p_cred; + register uid_t uid; + int error; + + uid = uap->uid; + if (uid != pc->p_ruid && + (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + /* + * Everything's okay, do it. + * Transfer proc count to new user. + * Copy credentials so other references do not see our changes. + */ + (void)chgproccnt(pc->p_ruid, -1); + (void)chgproccnt(uid, 1); + pc->pc_ucred = crcopy(pc->pc_ucred); + pc->pc_ucred->cr_uid = uid; + pc->p_ruid = uid; + pc->p_svuid = uid; + p->p_flag |= P_SUGID; + return (0); +} + +struct seteuid_args { + uid_t euid; +}; +/* ARGSUSED */ +seteuid(p, uap, retval) + struct proc *p; + struct seteuid_args *uap; + int *retval; +{ + register struct pcred *pc = p->p_cred; + register uid_t euid; + int error; + + euid = uap->euid; + if (euid != pc->p_ruid && euid != pc->p_svuid && + (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + /* + * Everything's okay, do it. Copy credentials so other references do + * not see our changes. + */ + pc->pc_ucred = crcopy(pc->pc_ucred); + pc->pc_ucred->cr_uid = euid; + p->p_flag |= P_SUGID; + return (0); +} + +struct setgid_args { + gid_t gid; +}; +/* ARGSUSED */ +setgid(p, uap, retval) + struct proc *p; + struct setgid_args *uap; + int *retval; +{ + register struct pcred *pc = p->p_cred; + register gid_t gid; + int error; + + gid = uap->gid; + if (gid != pc->p_rgid && (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + pc->pc_ucred = crcopy(pc->pc_ucred); + pc->pc_ucred->cr_groups[0] = gid; + pc->p_rgid = gid; + pc->p_svgid = gid; /* ??? 
*/ + p->p_flag |= P_SUGID; + return (0); +} + +struct setegid_args { + gid_t egid; +}; +/* ARGSUSED */ +setegid(p, uap, retval) + struct proc *p; + struct setegid_args *uap; + int *retval; +{ + register struct pcred *pc = p->p_cred; + register gid_t egid; + int error; + + egid = uap->egid; + if (egid != pc->p_rgid && egid != pc->p_svgid && + (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + pc->pc_ucred = crcopy(pc->pc_ucred); + pc->pc_ucred->cr_groups[0] = egid; + p->p_flag |= P_SUGID; + return (0); +} + +struct setgroups_args { + u_int gidsetsize; + gid_t *gidset; +}; +/* ARGSUSED */ +setgroups(p, uap, retval) + struct proc *p; + struct setgroups_args *uap; + int *retval; +{ + register struct pcred *pc = p->p_cred; + register u_int ngrp; + int error; + + if (error = suser(pc->pc_ucred, &p->p_acflag)) + return (error); + if ((ngrp = uap->gidsetsize) > NGROUPS) + return (EINVAL); + pc->pc_ucred = crcopy(pc->pc_ucred); + if (error = copyin((caddr_t)uap->gidset, + (caddr_t)pc->pc_ucred->cr_groups, ngrp * sizeof(gid_t))) + return (error); + pc->pc_ucred->cr_ngroups = ngrp; + p->p_flag |= P_SUGID; + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +struct setreuid_args { + int ruid; + int euid; +}; +/* ARGSUSED */ +osetreuid(p, uap, retval) + register struct proc *p; + struct setreuid_args *uap; + int *retval; +{ + register struct pcred *pc = p->p_cred; + struct seteuid_args args; + + /* + * we assume that the intent of setting ruid is to be able to get + * back ruid priviledge. So we make sure that we will be able to + * do so, but do not actually set the ruid. 
+	 */
+	if (uap->ruid != (uid_t)-1 && uap->ruid != pc->p_ruid &&
+	    uap->ruid != pc->p_svuid)
+		return (EPERM);
+	if (uap->euid == (uid_t)-1)
+		return (0);
+	args.euid = uap->euid;
+	return (seteuid(p, &args, retval));
+}
+
+struct setregid_args {
+	int	rgid;
+	int	egid;
+};
+/* ARGSUSED */
+osetregid(p, uap, retval)
+	register struct proc *p;
+	struct setregid_args *uap;
+	int *retval;
+{
+	register struct pcred *pc = p->p_cred;
+	struct setegid_args args;
+
+	/*
+	 * We assume that the intent of setting rgid is to be able to get
+	 * back rgid privilege.  So we make sure that we will be able to
+	 * do so, but do not actually set the rgid.
+	 *
+	 * An rgid of -1 skips the check; any other value must equal the
+	 * current real or saved gid, else EPERM.  An egid of -1 returns
+	 * success without changing anything; otherwise the request is
+	 * passed on to setegid().
+	 */
+	if (uap->rgid != (gid_t)-1 && uap->rgid != pc->p_rgid &&
+	    uap->rgid != pc->p_svgid)
+		return (EPERM);
+	if (uap->egid == (gid_t)-1)
+		return (0);
+	args.egid = uap->egid;
+	return (setegid(p, &args, retval));
+}
+#endif /* defined(COMPAT_43) || defined(COMPAT_SUNOS) */
+
+/*
+ * Check if gid is a member of the group set.
+ * Scans the first cr_ngroups entries of cred->cr_groups;
+ * returns 1 on the first match and 0 if there is none.
+ */
+groupmember(gid, cred)
+	gid_t gid;
+	register struct ucred *cred;
+{
+	register gid_t *gp;
+	gid_t *egp;
+
+	egp = &(cred->cr_groups[cred->cr_ngroups]);
+	for (gp = cred->cr_groups; gp < egp; gp++)
+		if (*gp == gid)
+			return (1);
+	return (0);
+}
+
+/*
+ * Test whether the specified credentials imply "super-user"
+ * privilege; if so, and we have accounting info, set the flag
+ * indicating use of super-powers.
+ * Returns 0 when cred->cr_uid is 0 (ASU is or-ed into *acflag if
+ * acflag is non-null), and EPERM otherwise.
+ */
+suser(cred, acflag)
+	struct ucred *cred;
+	short *acflag;
+{
+	if (cred->cr_uid == 0) {
+		if (acflag)
+			*acflag |= ASU;
+		return (0);
+	}
+	return (EPERM);
+}
+
+/*
+ * Allocate a zeroed cred structure.
+ * The structure is returned with its reference count set to 1.
+ */
+struct ucred *
+crget()
+{
+	register struct ucred *cr;
+
+	MALLOC(cr, struct ucred *, sizeof(*cr), M_CRED, M_WAITOK);
+	bzero((caddr_t)cr, sizeof(*cr));
+	cr->cr_ref = 1;
+	return (cr);
+}
+
+/*
+ * Free a cred structure.
+ * Throws away space when ref count gets to 0.
+ */
+crfree(cr)
+	struct ucred *cr;
+{
+	int s;
+
+	s = splimp();				/* ???
*/ + if (--cr->cr_ref == 0) + FREE((caddr_t)cr, M_CRED); + (void) splx(s); +} + +/* + * Copy cred structure to a new one and free the old one. + */ +struct ucred * +crcopy(cr) + struct ucred *cr; +{ + struct ucred *newcr; + + if (cr->cr_ref == 1) + return (cr); + newcr = crget(); + *newcr = *cr; + crfree(cr); + newcr->cr_ref = 1; + return (newcr); +} + +/* + * Dup cred struct to a new held one. + */ +struct ucred * +crdup(cr) + struct ucred *cr; +{ + struct ucred *newcr; + + newcr = crget(); + *newcr = *cr; + newcr->cr_ref = 1; + return (newcr); +} + +/* + * Get login name, if available. + */ +struct getlogin_args { + char *namebuf; + u_int namelen; +}; +/* ARGSUSED */ +getlogin(p, uap, retval) + struct proc *p; + struct getlogin_args *uap; + int *retval; +{ + + if (uap->namelen > sizeof (p->p_pgrp->pg_session->s_login)) + uap->namelen = sizeof (p->p_pgrp->pg_session->s_login); + return (copyout((caddr_t) p->p_pgrp->pg_session->s_login, + (caddr_t) uap->namebuf, uap->namelen)); +} + +/* + * Set login name. + */ +struct setlogin_args { + char *namebuf; +}; +/* ARGSUSED */ +setlogin(p, uap, retval) + struct proc *p; + struct setlogin_args *uap; + int *retval; +{ + int error; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + error = copyinstr((caddr_t) uap->namebuf, + (caddr_t) p->p_pgrp->pg_session->s_login, + sizeof (p->p_pgrp->pg_session->s_login) - 1, (u_int *)0); + if (error == ENAMETOOLONG) + error = EINVAL; + return (error); +} diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c new file mode 100644 index 00000000000..68e9dfbc86d --- /dev/null +++ b/sys/kern/kern_resource.c @@ -0,0 +1,476 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. 
and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 + */ + +#include +#include +#include +#include +#include +#include + +#include + +/* + * Resource controls and accounting. 
+ */ + +struct getpriority_args { + int which; + int who; +}; +getpriority(curp, uap, retval) + struct proc *curp; + register struct getpriority_args *uap; + int *retval; +{ + register struct proc *p; + register int low = PRIO_MAX + 1; + + switch (uap->which) { + + case PRIO_PROCESS: + if (uap->who == 0) + p = curp; + else + p = pfind(uap->who); + if (p == 0) + break; + low = p->p_nice; + break; + + case PRIO_PGRP: { + register struct pgrp *pg; + + if (uap->who == 0) + pg = curp->p_pgrp; + else if ((pg = pgfind(uap->who)) == NULL) + break; + for (p = pg->pg_mem; p != NULL; p = p->p_pgrpnxt) { + if (p->p_nice < low) + low = p->p_nice; + } + break; + } + + case PRIO_USER: + if (uap->who == 0) + uap->who = curp->p_ucred->cr_uid; + for (p = (struct proc *)allproc; p != NULL; p = p->p_next) { + if (p->p_ucred->cr_uid == uap->who && + p->p_nice < low) + low = p->p_nice; + } + break; + + default: + return (EINVAL); + } + if (low == PRIO_MAX + 1) + return (ESRCH); + *retval = low; + return (0); +} + +struct setpriority_args { + int which; + int who; + int prio; +}; +/* ARGSUSED */ +setpriority(curp, uap, retval) + struct proc *curp; + register struct setpriority_args *uap; + int *retval; +{ + register struct proc *p; + int found = 0, error = 0; + + switch (uap->which) { + + case PRIO_PROCESS: + if (uap->who == 0) + p = curp; + else + p = pfind(uap->who); + if (p == 0) + break; + error = donice(curp, p, uap->prio); + found++; + break; + + case PRIO_PGRP: { + register struct pgrp *pg; + + if (uap->who == 0) + pg = curp->p_pgrp; + else if ((pg = pgfind(uap->who)) == NULL) + break; + for (p = pg->pg_mem; p != NULL; p = p->p_pgrpnxt) { + error = donice(curp, p, uap->prio); + found++; + } + break; + } + + case PRIO_USER: + if (uap->who == 0) + uap->who = curp->p_ucred->cr_uid; + for (p = (struct proc *)allproc; p != NULL; p = p->p_next) + if (p->p_ucred->cr_uid == uap->who) { + error = donice(curp, p, uap->prio); + found++; + } + break; + + default: + return (EINVAL); + } + if 
(found == 0) + return (ESRCH); + return (error); +} + +donice(curp, chgp, n) + register struct proc *curp, *chgp; + register int n; +{ + register struct pcred *pcred = curp->p_cred; + + if (pcred->pc_ucred->cr_uid && pcred->p_ruid && + pcred->pc_ucred->cr_uid != chgp->p_ucred->cr_uid && + pcred->p_ruid != chgp->p_ucred->cr_uid) + return (EPERM); + if (n > PRIO_MAX) + n = PRIO_MAX; + if (n < PRIO_MIN) + n = PRIO_MIN; + if (n < chgp->p_nice && suser(pcred->pc_ucred, &curp->p_acflag)) + return (EACCES); + chgp->p_nice = n; + (void)resetpriority(chgp); + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +struct setrlimit_args { + u_int which; + struct orlimit *lim; +}; +/* ARGSUSED */ +osetrlimit(p, uap, retval) + struct proc *p; + register struct setrlimit_args *uap; + int *retval; +{ + struct orlimit olim; + struct rlimit lim; + int error; + + if (error = + copyin((caddr_t)uap->lim, (caddr_t)&olim, sizeof (struct orlimit))) + return (error); + lim.rlim_cur = olim.rlim_cur; + lim.rlim_max = olim.rlim_max; + return (dosetrlimit(p, uap->which, &lim)); +} + +struct getrlimit_args { + u_int which; + struct orlimit *rlp; +}; +/* ARGSUSED */ +ogetrlimit(p, uap, retval) + struct proc *p; + register struct getrlimit_args *uap; + int *retval; +{ + struct orlimit olim; + + if (uap->which >= RLIM_NLIMITS) + return (EINVAL); + olim.rlim_cur = p->p_rlimit[uap->which].rlim_cur; + if (olim.rlim_cur == -1) + olim.rlim_cur = 0x7fffffff; + olim.rlim_max = p->p_rlimit[uap->which].rlim_max; + if (olim.rlim_max == -1) + olim.rlim_max = 0x7fffffff; + return (copyout((caddr_t)&olim, (caddr_t)uap->rlp, sizeof(olim))); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +struct __setrlimit_args { + u_int which; + struct rlimit *lim; +}; +/* ARGSUSED */ +setrlimit(p, uap, retval) + struct proc *p; + register struct __setrlimit_args *uap; + int *retval; +{ + struct rlimit alim; + int error; + + if (error = + copyin((caddr_t)uap->lim, (caddr_t)&alim, sizeof (struct rlimit))) + return 
(error); + return (dosetrlimit(p, uap->which, &alim)); +} + +dosetrlimit(p, which, limp) + struct proc *p; + u_int which; + struct rlimit *limp; +{ + register struct rlimit *alimp; + extern unsigned maxdmap; + int error; + + if (which >= RLIM_NLIMITS) + return (EINVAL); + alimp = &p->p_rlimit[which]; + if (limp->rlim_cur > alimp->rlim_max || + limp->rlim_max > alimp->rlim_max) + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + if (limp->rlim_cur > limp->rlim_max) + limp->rlim_cur = limp->rlim_max; + if (p->p_limit->p_refcnt > 1 && + (p->p_limit->p_lflags & PL_SHAREMOD) == 0) { + p->p_limit->p_refcnt--; + p->p_limit = limcopy(p->p_limit); + alimp = &p->p_rlimit[which]; + } + + switch (which) { + + case RLIMIT_DATA: + if (limp->rlim_cur > maxdmap) + limp->rlim_cur = maxdmap; + if (limp->rlim_max > maxdmap) + limp->rlim_max = maxdmap; + break; + + case RLIMIT_STACK: + if (limp->rlim_cur > maxdmap) + limp->rlim_cur = maxdmap; + if (limp->rlim_max > maxdmap) + limp->rlim_max = maxdmap; + /* + * Stack is allocated to the max at exec time with only + * "rlim_cur" bytes accessible. If stack limit is going + * up make more accessible, if going down make inaccessible. 
+ */ + if (limp->rlim_cur != alimp->rlim_cur) { + vm_offset_t addr; + vm_size_t size; + vm_prot_t prot; + + if (limp->rlim_cur > alimp->rlim_cur) { + prot = VM_PROT_ALL; + size = limp->rlim_cur - alimp->rlim_cur; + addr = USRSTACK - limp->rlim_cur; + } else { + prot = VM_PROT_NONE; + size = alimp->rlim_cur - limp->rlim_cur; + addr = USRSTACK - alimp->rlim_cur; + } + addr = trunc_page(addr); + size = round_page(size); + (void) vm_map_protect(&p->p_vmspace->vm_map, + addr, addr+size, prot, FALSE); + } + break; + + case RLIMIT_NOFILE: + if (limp->rlim_cur > maxfiles) + limp->rlim_cur = maxfiles; + if (limp->rlim_max > maxfiles) + limp->rlim_max = maxfiles; + break; + + case RLIMIT_NPROC: + if (limp->rlim_cur > maxproc) + limp->rlim_cur = maxproc; + if (limp->rlim_max > maxproc) + limp->rlim_max = maxproc; + break; + } + *alimp = *limp; + return (0); +} + +struct __getrlimit_args { + u_int which; + struct rlimit *rlp; +}; +/* ARGSUSED */ +getrlimit(p, uap, retval) + struct proc *p; + register struct __getrlimit_args *uap; + int *retval; +{ + + if (uap->which >= RLIM_NLIMITS) + return (EINVAL); + return (copyout((caddr_t)&p->p_rlimit[uap->which], (caddr_t)uap->rlp, + sizeof (struct rlimit))); +} + +/* + * Transform the running time and tick information in proc p into user, + * system, and interrupt time usage. + */ +calcru(p, up, sp, ip) + register struct proc *p; + register struct timeval *up; + register struct timeval *sp; + register struct timeval *ip; +{ + register u_quad_t u, st, ut, it, tot; + register u_long sec, usec; + register int s; + struct timeval tv; + + s = splstatclock(); + st = p->p_sticks; + ut = p->p_uticks; + it = p->p_iticks; + splx(s); + + tot = st + ut + it; + if (tot == 0) { + up->tv_sec = up->tv_usec = 0; + sp->tv_sec = sp->tv_usec = 0; + if (ip != NULL) + ip->tv_sec = ip->tv_usec = 0; + return; + } + + sec = p->p_rtime.tv_sec; + usec = p->p_rtime.tv_usec; + if (p == curproc) { + /* + * Adjust for the current time slice. 
This is actually fairly + * important since the error here is on the order of a time + * quantum, which is much greater than the sampling error. + */ + microtime(&tv); + sec += tv.tv_sec - runtime.tv_sec; + usec += tv.tv_usec - runtime.tv_usec; + } + u = sec * 1000000 + usec; + st = (u * st) / tot; + sp->tv_sec = st / 1000000; + sp->tv_usec = st % 1000000; + ut = (u * ut) / tot; + up->tv_sec = ut / 1000000; + up->tv_usec = ut % 1000000; + if (ip != NULL) { + it = (u * it) / tot; + ip->tv_sec = it / 1000000; + ip->tv_usec = it % 1000000; + } +} + +struct getrusage_args { + int who; + struct rusage *rusage; +}; +/* ARGSUSED */ +getrusage(p, uap, retval) + register struct proc *p; + register struct getrusage_args *uap; + int *retval; +{ + register struct rusage *rup; + + switch (uap->who) { + + case RUSAGE_SELF: + rup = &p->p_stats->p_ru; + calcru(p, &rup->ru_utime, &rup->ru_stime, NULL); + break; + + case RUSAGE_CHILDREN: + rup = &p->p_stats->p_cru; + break; + + default: + return (EINVAL); + } + return (copyout((caddr_t)rup, (caddr_t)uap->rusage, + sizeof (struct rusage))); +} + +ruadd(ru, ru2) + register struct rusage *ru, *ru2; +{ + register long *ip, *ip2; + register int i; + + timevaladd(&ru->ru_utime, &ru2->ru_utime); + timevaladd(&ru->ru_stime, &ru2->ru_stime); + if (ru->ru_maxrss < ru2->ru_maxrss) + ru->ru_maxrss = ru2->ru_maxrss; + ip = &ru->ru_first; ip2 = &ru2->ru_first; + for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--) + *ip++ += *ip2++; +} + +/* + * Make a copy of the plimit structure. + * We share these structures copy-on-write after fork, + * and copy when a limit is changed. 
+ */ +struct plimit * +limcopy(lim) + struct plimit *lim; +{ + register struct plimit *copy; + + MALLOC(copy, struct plimit *, sizeof(struct plimit), + M_SUBPROC, M_WAITOK); + bcopy(lim->pl_rlimit, copy->pl_rlimit, + sizeof(struct rlimit) * RLIM_NLIMITS); + copy->p_lflags = 0; + copy->p_refcnt = 1; + return (copy); +} diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c new file mode 100644 index 00000000000..3dcff922c39 --- /dev/null +++ b/sys/kern/kern_sig.c @@ -0,0 +1,1197 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 + */ + +#define SIGPROP /* include signal properties table */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include /* for coredump */ + +/* + * Can process p, with pcred pc, send the signal signum to process q? 
+ */ +#define CANSIGNAL(p, pc, q, signum) \ + ((pc)->pc_ucred->cr_uid == 0 || \ + (pc)->p_ruid == (q)->p_cred->p_ruid || \ + (pc)->pc_ucred->cr_uid == (q)->p_cred->p_ruid || \ + (pc)->p_ruid == (q)->p_ucred->cr_uid || \ + (pc)->pc_ucred->cr_uid == (q)->p_ucred->cr_uid || \ + ((signum) == SIGCONT && (q)->p_session == (p)->p_session)) + +struct sigaction_args { + int signum; + struct sigaction *nsa; + struct sigaction *osa; +}; +/* ARGSUSED */ +sigaction(p, uap, retval) + struct proc *p; + register struct sigaction_args *uap; + int *retval; +{ + struct sigaction vec; + register struct sigaction *sa; + register struct sigacts *ps = p->p_sigacts; + register int signum; + int bit, error; + + signum = uap->signum; + if (signum <= 0 || signum >= NSIG || + signum == SIGKILL || signum == SIGSTOP) + return (EINVAL); + sa = &vec; + if (uap->osa) { + sa->sa_handler = ps->ps_sigact[signum]; + sa->sa_mask = ps->ps_catchmask[signum]; + bit = sigmask(signum); + sa->sa_flags = 0; + if ((ps->ps_sigonstack & bit) != 0) + sa->sa_flags |= SA_ONSTACK; + if ((ps->ps_sigintr & bit) == 0) + sa->sa_flags |= SA_RESTART; + if (p->p_flag & P_NOCLDSTOP) + sa->sa_flags |= SA_NOCLDSTOP; + if (error = copyout((caddr_t)sa, (caddr_t)uap->osa, + sizeof (vec))) + return (error); + } + if (uap->nsa) { + if (error = copyin((caddr_t)uap->nsa, (caddr_t)sa, + sizeof (vec))) + return (error); + setsigvec(p, signum, sa); + } + return (0); +} + +setsigvec(p, signum, sa) + register struct proc *p; + int signum; + register struct sigaction *sa; +{ + register struct sigacts *ps = p->p_sigacts; + register int bit; + + bit = sigmask(signum); + /* + * Change setting atomically. 
+ */ + (void) splhigh(); + ps->ps_sigact[signum] = sa->sa_handler; + ps->ps_catchmask[signum] = sa->sa_mask &~ sigcantmask; + if ((sa->sa_flags & SA_RESTART) == 0) + ps->ps_sigintr |= bit; + else + ps->ps_sigintr &= ~bit; + if (sa->sa_flags & SA_ONSTACK) + ps->ps_sigonstack |= bit; + else + ps->ps_sigonstack &= ~bit; +#ifdef COMPAT_SUNOS + if (sa->sa_flags & SA_USERTRAMP) + ps->ps_usertramp |= bit; + else + ps->ps_usertramp &= ~bit; +#endif + if (signum == SIGCHLD) { + if (sa->sa_flags & SA_NOCLDSTOP) + p->p_flag |= P_NOCLDSTOP; + else + p->p_flag &= ~P_NOCLDSTOP; + } + /* + * Set bit in p_sigignore for signals that are set to SIG_IGN, + * and for signals set to SIG_DFL where the default is to ignore. + * However, don't put SIGCONT in p_sigignore, + * as we have to restart the process. + */ + if (sa->sa_handler == SIG_IGN || + (sigprop[signum] & SA_IGNORE && sa->sa_handler == SIG_DFL)) { + p->p_siglist &= ~bit; /* never to be seen again */ + if (signum != SIGCONT) + p->p_sigignore |= bit; /* easier in psignal */ + p->p_sigcatch &= ~bit; + } else { + p->p_sigignore &= ~bit; + if (sa->sa_handler == SIG_DFL) + p->p_sigcatch &= ~bit; + else + p->p_sigcatch |= bit; + } + (void) spl0(); +} + +/* + * Initialize signal state for process 0; + * set to ignore signals that are ignored by default. + */ +void +siginit(p) + struct proc *p; +{ + register int i; + + for (i = 0; i < NSIG; i++) + if (sigprop[i] & SA_IGNORE && i != SIGCONT) + p->p_sigignore |= sigmask(i); +} + +/* + * Reset signals for an exec of the specified process. + */ +void +execsigs(p) + register struct proc *p; +{ + register struct sigacts *ps = p->p_sigacts; + register int nc, mask; + + /* + * Reset caught signals. Held signals remain held + * through p_sigmask (unless they were caught, + * and are now ignored by default). 
+ */ + while (p->p_sigcatch) { + nc = ffs((long)p->p_sigcatch); + mask = sigmask(nc); + p->p_sigcatch &= ~mask; + if (sigprop[nc] & SA_IGNORE) { + if (nc != SIGCONT) + p->p_sigignore |= mask; + p->p_siglist &= ~mask; + } + ps->ps_sigact[nc] = SIG_DFL; + } + /* + * Reset stack state to the user stack. + * Clear set of signals caught on the signal stack. + */ + ps->ps_sigstk.ss_flags = SA_DISABLE; + ps->ps_sigstk.ss_size = 0; + ps->ps_sigstk.ss_base = 0; + ps->ps_flags = 0; +} + +/* + * Manipulate signal mask. + * Note that we receive new mask, not pointer, + * and return old mask as return value; + * the library stub does the rest. + */ +struct sigprocmask_args { + int how; + sigset_t mask; +}; +sigprocmask(p, uap, retval) + register struct proc *p; + struct sigprocmask_args *uap; + int *retval; +{ + int error = 0; + + *retval = p->p_sigmask; + (void) splhigh(); + + switch (uap->how) { + case SIG_BLOCK: + p->p_sigmask |= uap->mask &~ sigcantmask; + break; + + case SIG_UNBLOCK: + p->p_sigmask &= ~uap->mask; + break; + + case SIG_SETMASK: + p->p_sigmask = uap->mask &~ sigcantmask; + break; + + default: + error = EINVAL; + break; + } + (void) spl0(); + return (error); +} + +struct sigpending_args { + int dummy; +}; +/* ARGSUSED */ +sigpending(p, uap, retval) + struct proc *p; + struct sigpending_args *uap; + int *retval; +{ + + *retval = p->p_siglist; + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Generalized interface signal handler, 4.3-compatible. 
+ */ +struct osigvec_args { + int signum; + struct sigvec *nsv; + struct sigvec *osv; +}; +/* ARGSUSED */ +osigvec(p, uap, retval) + struct proc *p; + register struct osigvec_args *uap; + int *retval; +{ + struct sigvec vec; + register struct sigacts *ps = p->p_sigacts; + register struct sigvec *sv; + register int signum; + int bit, error; + + signum = uap->signum; + if (signum <= 0 || signum >= NSIG || + signum == SIGKILL || signum == SIGSTOP) + return (EINVAL); + sv = &vec; + if (uap->osv) { + *(sig_t *)&sv->sv_handler = ps->ps_sigact[signum]; + sv->sv_mask = ps->ps_catchmask[signum]; + bit = sigmask(signum); + sv->sv_flags = 0; + if ((ps->ps_sigonstack & bit) != 0) + sv->sv_flags |= SV_ONSTACK; + if ((ps->ps_sigintr & bit) != 0) + sv->sv_flags |= SV_INTERRUPT; +#ifndef COMPAT_SUNOS + if (p->p_flag & P_NOCLDSTOP) + sv->sv_flags |= SA_NOCLDSTOP; +#endif + if (error = copyout((caddr_t)sv, (caddr_t)uap->osv, + sizeof (vec))) + return (error); + } + if (uap->nsv) { + if (error = copyin((caddr_t)uap->nsv, (caddr_t)sv, + sizeof (vec))) + return (error); +#ifdef COMPAT_SUNOS + /* + * SunOS uses this bit (4, aka SA_DISABLE) as SV_RESETHAND, + * `reset to SIG_DFL on delivery'. We have no such option + * now or ever! 
+ */ + if (sv->sv_flags & SA_DISABLE) + return (EINVAL); + sv->sv_flags |= SA_USERTRAMP; +#endif + sv->sv_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ + setsigvec(p, signum, (struct sigaction *)sv); + } + return (0); +} + +struct osigblock_args { + int mask; +}; +osigblock(p, uap, retval) + register struct proc *p; + struct osigblock_args *uap; + int *retval; +{ + + (void) splhigh(); + *retval = p->p_sigmask; + p->p_sigmask |= uap->mask &~ sigcantmask; + (void) spl0(); + return (0); +} + +struct osigsetmask_args { + int mask; +}; +osigsetmask(p, uap, retval) + struct proc *p; + struct osigsetmask_args *uap; + int *retval; +{ + + (void) splhigh(); + *retval = p->p_sigmask; + p->p_sigmask = uap->mask &~ sigcantmask; + (void) spl0(); + return (0); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Suspend process until signal, providing mask to be set + * in the meantime. Note nonstandard calling convention: + * libc stub passes mask, not pointer, to save a copyin. + */ +struct sigsuspend_args { + sigset_t mask; +}; +/* ARGSUSED */ +sigsuspend(p, uap, retval) + register struct proc *p; + struct sigsuspend_args *uap; + int *retval; +{ + register struct sigacts *ps = p->p_sigacts; + + /* + * When returning from sigpause, we want + * the old mask to be restored after the + * signal handler has finished. Thus, we + * save it here and mark the sigacts structure + * to indicate this. + */ + ps->ps_oldmask = p->p_sigmask; + ps->ps_flags |= SAS_OLDMASK; + p->p_sigmask = uap->mask &~ sigcantmask; + while (tsleep((caddr_t) ps, PPAUSE|PCATCH, "pause", 0) == 0) + /* void */; + /* always return EINTR rather than ERESTART... 
*/ + return (EINTR); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +struct osigstack_args { + struct sigstack *nss; + struct sigstack *oss; +}; +/* ARGSUSED */ +osigstack(p, uap, retval) + struct proc *p; + register struct osigstack_args *uap; + int *retval; +{ + struct sigstack ss; + struct sigacts *psp; + int error = 0; + + psp = p->p_sigacts; + ss.ss_sp = psp->ps_sigstk.ss_base; + ss.ss_onstack = psp->ps_sigstk.ss_flags & SA_ONSTACK; + if (uap->oss && (error = copyout((caddr_t)&ss, (caddr_t)uap->oss, + sizeof (struct sigstack)))) + return (error); + if (uap->nss && (error = copyin((caddr_t)uap->nss, (caddr_t)&ss, + sizeof (ss))) == 0) { + psp->ps_sigstk.ss_base = ss.ss_sp; + psp->ps_sigstk.ss_size = 0; + psp->ps_sigstk.ss_flags |= ss.ss_onstack & SA_ONSTACK; + psp->ps_flags |= SAS_ALTSTACK; + } + return (error); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +struct sigaltstack_args { + struct sigaltstack *nss; + struct sigaltstack *oss; +}; +/* ARGSUSED */ +sigaltstack(p, uap, retval) + struct proc *p; + register struct sigaltstack_args *uap; + int *retval; +{ + struct sigacts *psp; + struct sigaltstack ss; + int error; + + psp = p->p_sigacts; + if ((psp->ps_flags & SAS_ALTSTACK) == 0) + psp->ps_sigstk.ss_flags |= SA_DISABLE; + if (uap->oss && (error = copyout((caddr_t)&psp->ps_sigstk, + (caddr_t)uap->oss, sizeof (struct sigaltstack)))) + return (error); + if (uap->nss == 0) + return (0); + if (error = copyin((caddr_t)uap->nss, (caddr_t)&ss, sizeof (ss))) + return (error); + if (ss.ss_flags & SA_DISABLE) { + if (psp->ps_sigstk.ss_flags & SA_ONSTACK) + return (EINVAL); + psp->ps_flags &= ~SAS_ALTSTACK; + psp->ps_sigstk.ss_flags = ss.ss_flags; + return (0); + } + if (ss.ss_size < MINSIGSTKSZ) + return (ENOMEM); + psp->ps_flags |= SAS_ALTSTACK; + psp->ps_sigstk= ss; + return (0); +} + +struct kill_args { + int pid; + int signum; +}; +/* ARGSUSED */ +kill(cp, uap, retval) + register struct proc *cp; + register struct kill_args *uap; + int *retval; +{ + 
register struct proc *p; + register struct pcred *pc = cp->p_cred; + + if ((u_int)uap->signum >= NSIG) + return (EINVAL); + if (uap->pid > 0) { + /* kill single process */ + if ((p = pfind(uap->pid)) == NULL) + return (ESRCH); + if (!CANSIGNAL(cp, pc, p, uap->signum)) + return (EPERM); + if (uap->signum) + psignal(p, uap->signum); + return (0); + } + switch (uap->pid) { + case -1: /* broadcast signal */ + return (killpg1(cp, uap->signum, 0, 1)); + case 0: /* signal own process group */ + return (killpg1(cp, uap->signum, 0, 0)); + default: /* negative explicit process group */ + return (killpg1(cp, uap->signum, -uap->pid, 0)); + } + /* NOTREACHED */ +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +struct okillpg_args { + int pgid; + int signum; +}; +/* ARGSUSED */ +okillpg(p, uap, retval) + struct proc *p; + register struct okillpg_args *uap; + int *retval; +{ + + if ((u_int)uap->signum >= NSIG) + return (EINVAL); + return (killpg1(p, uap->signum, uap->pgid, 0)); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Common code for kill process group/broadcast kill. + * cp is calling process. + */ +killpg1(cp, signum, pgid, all) + register struct proc *cp; + int signum, pgid, all; +{ + register struct proc *p; + register struct pcred *pc = cp->p_cred; + struct pgrp *pgrp; + int nfound = 0; + + if (all) + /* + * broadcast + */ + for (p = (struct proc *)allproc; p != NULL; p = p->p_next) { + if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || + p == cp || !CANSIGNAL(cp, pc, p, signum)) + continue; + nfound++; + if (signum) + psignal(p, signum); + } + else { + if (pgid == 0) + /* + * zero pgid means send to my process group. + */ + pgrp = cp->p_pgrp; + else { + pgrp = pgfind(pgid); + if (pgrp == NULL) + return (ESRCH); + } + for (p = pgrp->pg_mem; p != NULL; p = p->p_pgrpnxt) { + if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || + p->p_stat == SZOMB || + !CANSIGNAL(cp, pc, p, signum)) + continue; + nfound++; + if (signum) + psignal(p, signum); + } + } + return (nfound ? 
0 : ESRCH);
+}
+
+/*
+ * Send a signal to a process group.
+ * The group is looked up by id with pgfind(); nothing is sent when
+ * pgid is 0 or no such group exists.  Members are signalled whether
+ * or not they have a controlling terminal (checkctty is passed as 0).
+ */
+void
+gsignal(pgid, signum)
+	int pgid, signum;
+{
+	struct pgrp *pgrp;
+
+	if (pgid && (pgrp = pgfind(pgid)))
+		pgsignal(pgrp, signum, 0);
+}
+
+/*
+ * Send a signal to a process group.  If checkctty is nonzero,
+ * limit to members which have a controlling terminal (P_CONTROLT set).
+ * A null pgrp is ignored.
+ */
+void
+pgsignal(pgrp, signum, checkctty)
+	struct pgrp *pgrp;
+	int signum, checkctty;
+{
+	register struct proc *p;
+
+	if (pgrp)
+		for (p = pgrp->pg_mem; p != NULL; p = p->p_pgrpnxt)
+			if (checkctty == 0 || p->p_flag & P_CONTROLT)
+				psignal(p, signum);
+}
+
+/*
+ * Send a signal caused by a trap to the current process.
+ * If it will be caught immediately, deliver it with correct code.
+ * Otherwise, post it normally.
+ *
+ * Immediate delivery requires that the process is not traced, that it
+ * catches the signal, and that the signal is not blocked; the handler's
+ * catch mask and the signal itself are then added to p_sigmask.
+ * In every other case the code is saved in ps_code and the signal is
+ * posted with psignal().
+ */
+void
+trapsignal(p, signum, code)
+	struct proc *p;
+	register int signum;
+	u_int code;
+{
+	register struct sigacts *ps = p->p_sigacts;
+	int mask;
+
+	mask = sigmask(signum);
+	if ((p->p_flag & P_TRACED) == 0 && (p->p_sigcatch & mask) != 0 &&
+	    (p->p_sigmask & mask) == 0) {
+		p->p_stats->p_ru.ru_nsignals++;
+#ifdef KTRACE
+		if (KTRPOINT(p, KTR_PSIG))
+			ktrpsig(p->p_tracep, signum, ps->ps_sigact[signum],
+				p->p_sigmask, code);
+#endif
+		sendsig(ps->ps_sigact[signum], signum, p->p_sigmask, code);
+		p->p_sigmask |= ps->ps_catchmask[signum] | mask;
+	} else {
+		ps->ps_code = code;	/* XXX for core dump/debugger */
+		psignal(p, signum);
+	}
+}
+
+/*
+ * Send the signal to the process.  If the signal has an action, the action
+ * is usually performed by the target process rather than the caller; we add
+ * the signal to the set of pending signals for the process.
+ *
+ * Exceptions:
+ *   o When a stop signal is sent to a sleeping process that takes the
+ *     default action, the process is stopped without awakening it.
+ *   o SIGCONT restarts stopped processes (or puts them back to sleep)
+ *     regardless of the signal action (eg, blocked or ignored).
+ *
+ * Other ignored signals are discarded immediately.
+ */
+void
+psignal(p, signum)
+	register struct proc *p;	/* process receiving the signal */
+	register int signum;	/* must satisfy 0 < signum < NSIG, else panic */
+{
+	register int s, prop;	/* saved spl level; sigprop[] flags of signum */
+	register sig_t action;	/* SIG_DFL, SIG_HOLD or SIG_CATCH, chosen below */
+	int mask;	/* sigmask(signum) */
+
+	if ((u_int)signum >= NSIG || signum == 0)
+		panic("psignal signal number");
+	mask = sigmask(signum);
+	prop = sigprop[signum];
+
+	/*
+	 * If proc is traced, always give parent a chance.
+	 */
+	if (p->p_flag & P_TRACED)
+		action = SIG_DFL;
+	else {
+		/*
+		 * If the signal is being ignored,
+		 * then we forget about it immediately.
+		 * (Note: we don't set SIGCONT in p_sigignore,
+		 * and if it is set to SIG_IGN,
+		 * action will be SIG_DFL here.)
+		 */
+		if (p->p_sigignore & mask)
+			return;
+		if (p->p_sigmask & mask)
+			action = SIG_HOLD;
+		else if (p->p_sigcatch & mask)
+			action = SIG_CATCH;
+		else
+			action = SIG_DFL;
+	}
+
+	if (p->p_nice > NZERO && action == SIG_DFL && (prop & SA_KILL) &&
+	    (p->p_flag & P_TRACED) == 0)
+		p->p_nice = NZERO;	/* reset nice for an untraced process taking a default SA_KILL signal */
+
+	if (prop & SA_CONT)
+		p->p_siglist &= ~stopsigmask;	/* a continue signal cancels pending stop signals */
+
+	if (prop & SA_STOP) {
+		/*
+		 * If sending a tty stop signal to a member of an orphaned
+		 * process group, discard the signal here if the action
+		 * is default; don't stop the process below if sleeping,
+		 * and don't clear any pending SIGCONT.
+		 */
+		if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0 &&
+		    action == SIG_DFL)
+			return;
+		p->p_siglist &= ~contsigmask;	/* a stop signal cancels pending continue signals */
+	}
+	p->p_siglist |= mask;
+
+	/*
+	 * Defer further processing for signals which are held,
+	 * except that stopped processes must be continued by SIGCONT.
+	 */
+	if (action == SIG_HOLD && ((prop & SA_CONT) == 0 || p->p_stat != SSTOP))
+		return;
+	s = splhigh();	/* restored by splx(s) at "out" */
+	switch (p->p_stat) {
+
+	case SSLEEP:
+		/*
+		 * If process is sleeping uninterruptibly
+		 * we can't interrupt the sleep... the signal will
+		 * be noticed when the process returns through
+		 * trap() or syscall().
+		 */
+		if ((p->p_flag & P_SINTR) == 0)
+			goto out;
+		/*
+		 * Process is sleeping and traced... make it runnable
+		 * so it can discover the signal in issignal() and stop
+		 * for the parent.
+		 */
+		if (p->p_flag & P_TRACED)
+			goto run;
+		/*
+		 * If SIGCONT is default (or ignored) and process is
+		 * asleep, we are finished; the process should not
+		 * be awakened.
+		 */
+		if ((prop & SA_CONT) && action == SIG_DFL) {
+			p->p_siglist &= ~mask;
+			goto out;
+		}
+		/*
+		 * When a sleeping process receives a stop
+		 * signal, process immediately if possible.
+		 * All other (caught or default) signals
+		 * cause the process to run.
+		 */
+		if (prop & SA_STOP) {
+			if (action != SIG_DFL)
+				goto runfast;
+			/*
+			 * If a child holding parent blocked,
+			 * stopping could cause deadlock.
+			 */
+			if (p->p_flag & P_PPWAIT)
+				goto out;
+			p->p_siglist &= ~mask;
+			p->p_xstat = signum;
+			if ((p->p_pptr->p_flag & P_NOCLDSTOP) == 0)
+				psignal(p->p_pptr, SIGCHLD);
+			stop(p);
+			goto out;
+		} else
+			goto runfast;
+		/*NOTREACHED*/
+
+	case SSTOP:
+		/*
+		 * If traced process is already stopped,
+		 * then no further action is necessary.
+		 */
+		if (p->p_flag & P_TRACED)
+			goto out;
+
+		/*
+		 * Kill signal always sets processes running.
+		 */
+		if (signum == SIGKILL)
+			goto runfast;
+
+		if (prop & SA_CONT) {
+			/*
+			 * If SIGCONT is default (or ignored), we continue the
+			 * process but don't leave the signal in p_siglist, as
+			 * it has no further action. If SIGCONT is held, we
+			 * continue the process and leave the signal in
+			 * p_siglist. If the process catches SIGCONT, let it
+			 * handle the signal itself. If it isn't waiting on
+			 * an event, then it goes back to run state.
+			 * Otherwise, process goes back to sleep state.
+			 */
+			if (action == SIG_DFL)
+				p->p_siglist &= ~mask;
+			if (action == SIG_CATCH)
+				goto runfast;
+			if (p->p_wchan == 0)
+				goto run;
+			p->p_stat = SSLEEP;
+			goto out;
+		}
+
+		if (prop & SA_STOP) {
+			/*
+			 * Already stopped, don't need to stop again.
+			 * (If we did the shell could get confused.)
+			 */
+			p->p_siglist &= ~mask;		/* take it away */
+			goto out;
+		}
+
+		/*
+		 * If process is sleeping interruptibly, then simulate a
+		 * wakeup so that when it is continued, it will be made
+		 * runnable and can look at the signal. But don't make
+		 * the process runnable, leave it stopped.
+		 */
+		if (p->p_wchan && p->p_flag & P_SINTR)
+			unsleep(p);
+		goto out;
+
+	default:
+		/*
+		 * SRUN, SIDL, SZOMB do nothing with the signal,
+		 * other than kicking ourselves if we are running.
+		 * It will either never be noticed, or noticed very soon.
+		 */
+		if (p == curproc)
+			signotify(p);
+		goto out;
+	}
+	/*NOTREACHED*/
+
+runfast:
+	/*
+	 * Raise priority to at least PUSER.
+	 */
+	if (p->p_priority > PUSER)
+		p->p_priority = PUSER;
+run:
+	setrunnable(p);
+out:
+	splx(s);
+}
+
+/*
+ * If the current process has received a signal (should be caught or cause
+ * termination, should interrupt current syscall), return the signal number.
+ * Stop signals with default action are processed immediately, then cleared;
+ * they aren't returned. This is checked after each entry to the system for
+ * a syscall or trap (though this can usually be done without calling issignal
+ * by checking the pending signal masks in the CURSIG macro.) The normal call
+ * sequence is
+ *
+ *	while (signum = CURSIG(curproc))
+ *		postsig(signum);
+ *
+ * Returns 0 when no pending, unmasked signal remains to be taken.
+ * Signals that turn out to be ignored are removed from p_siglist as
+ * the loop goes along.
+ */
+issignal(p)
+	register struct proc *p;
+{
+	register int signum, mask, prop;
+
+	for (;;) {
+		mask = p->p_siglist & ~p->p_sigmask;	/* pending and not blocked */
+		if (p->p_flag & P_PPWAIT)
+			mask &= ~stopsigmask;	/* stop signals are not considered while P_PPWAIT is set */
+		if (mask == 0)	 	/* no signal to send */
+			return (0);
+		signum = ffs((long)mask);	/* first set bit of the candidate set */
+		mask = sigmask(signum);
+		prop = sigprop[signum];
+		/*
+		 * We should see pending but ignored signals
+		 * only if P_TRACED was on when they were posted.
+		 */
+		if (mask & p->p_sigignore && (p->p_flag & P_TRACED) == 0) {
+			p->p_siglist &= ~mask;
+			continue;
+		}
+		if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) {
+			/*
+			 * If traced, always stop, and stay
+			 * stopped until released by the parent.
+			 */
+			p->p_xstat = signum;
+			psignal(p->p_pptr, SIGCHLD);
+			do {
+				stop(p);
+				mi_switch();
+			} while (!trace_req(p) && p->p_flag & P_TRACED);
+
+			/*
+			 * If the traced bit got turned off, go back up
+			 * to the top to rescan signals. This ensures
+			 * that p_sig* and ps_sigact are consistent.
+			 */
+			if ((p->p_flag & P_TRACED) == 0)
+				continue;
+
+			/*
+			 * If parent wants us to take the signal,
+			 * then it will leave it in p->p_xstat;
+			 * otherwise we just look for signals again.
+			 */
+			p->p_siglist &= ~mask;	/* clear the old signal */
+			signum = p->p_xstat;
+			if (signum == 0)
+				continue;
+
+			/*
+			 * Put the new signal into p_siglist. If the
+			 * signal is being masked, look for other signals.
+			 */
+			mask = sigmask(signum);
+			p->p_siglist |= mask;
+			if (p->p_sigmask & mask)
+				continue;
+		}
+
+		/*
+		 * Decide whether the signal should be returned.
+		 * Return the signal's number, or fall through
+		 * to clear it from the pending mask.
+		 */
+		switch ((int)p->p_sigacts->ps_sigact[signum]) {
+
+		case SIG_DFL:
+			/*
+			 * Don't take default actions on system processes.
+			 */
+			if (p->p_pid <= 1) {
+#ifdef DIAGNOSTIC
+				/*
+				 * Are you sure you want to ignore SIGSEGV
+				 * in init? XXX
+				 */
+				printf("Process (pid %d) got signal %d\n",
+					p->p_pid, signum);
+#endif
+				break;		/* == ignore */
+			}
+			/*
+			 * If there is a pending stop signal to process
+			 * with default action, stop here,
+			 * then clear the signal. However,
+			 * if process is member of an orphaned
+			 * process group, ignore tty stop signals.
+			 */
+			if (prop & SA_STOP) {
+				if (p->p_flag & P_TRACED ||
+		    		    (p->p_pgrp->pg_jobc == 0 &&
+				    prop & SA_TTYSTOP))
+					break;	/* == ignore */
+				p->p_xstat = signum;
+				stop(p);
+				if ((p->p_pptr->p_flag & P_NOCLDSTOP) == 0)
+					psignal(p->p_pptr, SIGCHLD);
+				mi_switch();
+				break;
+			} else if (prop & SA_IGNORE) {
+				/*
+				 * Except for SIGCONT, shouldn't get here.
+				 * Default action is to ignore; drop it.
+				 */
+				break;		/* == ignore */
+			} else
+				return (signum);
+			/*NOTREACHED*/
+
+		case SIG_IGN:
+			/*
+			 * Masking above should prevent us ever trying
+			 * to take action on an ignored signal other
+			 * than SIGCONT, unless process is traced.
+			 */
+			if ((prop & SA_CONT) == 0 &&
+			    (p->p_flag & P_TRACED) == 0)
+				printf("issignal\n");
+			break;		/* == ignore */
+
+		default:
+			/*
+			 * This signal has an action, let
+			 * postsig() process it.
+			 */
+			return (signum);
+		}
+		p->p_siglist &= ~mask;		/* take the signal! */
+	}
+	/* NOTREACHED */
+}
+
+/*
+ * Put the argument process into the stopped state and notify the parent
+ * via wakeup. Signals are handled elsewhere. The process must not be
+ * on the run queue.
+ */
+stop(p)
+	register struct proc *p;
+{
+
+	p->p_stat = SSTOP;
+	p->p_flag &= ~P_WAITED;
+	wakeup((caddr_t)p->p_pptr);	/* the parent's proc address is the wakeup identifier */
+}
+
+/*
+ * Take the action for the specified signal
+ * from the current set of pending signals.
+ *
+ * The signal is first removed from p_siglist. A SIG_DFL action ends
+ * in sigexit(); any other action is handed to sendsig() together with
+ * the mask to restore afterwards and the saved trap code, if any.
+ */
+void
+postsig(signum)
+	register int signum;
+{
+	register struct proc *p = curproc;
+	register struct sigacts *ps = p->p_sigacts;
+	register sig_t action;
+	int code, mask, returnmask;
+
+#ifdef DIAGNOSTIC
+	if (signum == 0)
+		panic("postsig");
+#endif
+	mask = sigmask(signum);
+	p->p_siglist &= ~mask;
+	action = ps->ps_sigact[signum];
+#ifdef KTRACE
+	if (KTRPOINT(p, KTR_PSIG))
+		ktrpsig(p->p_tracep,
+		    signum, action, ps->ps_flags & SAS_OLDMASK ?
+		    ps->ps_oldmask : p->p_sigmask, 0);
+#endif
+	if (action == SIG_DFL) {
+		/*
+		 * Default action, where the default is to kill
+		 * the process. (Other cases were ignored above.)
+		 */
+		sigexit(p, signum);
+		/* NOTREACHED */
+	} else {
+		/*
+		 * If we get here, the signal must be caught.
+		 */
+#ifdef DIAGNOSTIC
+		if (action == SIG_IGN || (p->p_sigmask & mask))
+			panic("postsig action");
+#endif
+		/*
+		 * Set the new mask value and also defer further
+		 * occurrences of this signal.
+		 *
+		 * Special case: user has done a sigpause. Here the
+		 * current mask is not of interest, but rather the
+		 * mask from before the sigpause is what we want
+		 * restored after the signal processing is completed.
+		 */
+		(void) splhigh();
+		if (ps->ps_flags & SAS_OLDMASK) {
+			returnmask = ps->ps_oldmask;
+			ps->ps_flags &= ~SAS_OLDMASK;
+		} else
+			returnmask = p->p_sigmask;
+		p->p_sigmask |= ps->ps_catchmask[signum] | mask;
+		(void) spl0();
+		p->p_stats->p_ru.ru_nsignals++;
+		if (ps->ps_sig != signum) {	/* ps_code is used only when ps_sig matches */
+			code = 0;
+		} else {
+			code = ps->ps_code;
+			ps->ps_code = 0;	/* consume the saved code */
+		}
+		sendsig(action, signum, returnmask, code);
+	}
+}
+
+/*
+ * Kill the current process for stated reason.
+ * The reason is reported through both log() and uprintf() before
+ * SIGKILL is posted to p.
+ */
+killproc(p, why)
+	struct proc *p;
+	char *why;
+{
+
+	log(LOG_ERR, "pid %d was killed: %s\n", p->p_pid, why);
+	uprintf("sorry, pid %d was killed: %s\n", p->p_pid, why);
+	psignal(p, SIGKILL);
+}
+
+/*
+ * Force the current process to exit with the specified signal, dumping core
+ * if appropriate. We bypass the normal tests for masked and caught signals,
+ * allowing unrecoverable failures to terminate the process without changing
+ * signal state. Mark the accounting record with the signal termination.
+ * If dumping core, save the signal number for the debugger. Calls exit and
+ * does not return.
+ */
+sigexit(p, signum)
+	register struct proc *p;
+	int signum;
+{
+
+	p->p_acflag |= AXSIG;
+	if (sigprop[signum] & SA_CORE) {
+		p->p_sigacts->ps_sig = signum;
+		if (coredump(p) == 0)
+			signum |= WCOREFLAG;	/* set only when coredump() returned 0 */
+	}
+	exit1(p, W_EXITCODE(0, signum));
+	/* NOTREACHED */
+}
+
+/*
+ * Dump core, into a file named "progname.core", unless the process was
+ * setuid/setgid.
+ */
+coredump(p)
+	register struct proc *p;
+{
+	struct pcred *pcred = p->p_cred;
+	struct ucred *cred = pcred->pc_ucred;
+	struct vmspace *vm = p->p_vmspace;
+	struct vnode *vp;
+	struct nameidata nd;
+	struct vattr vattr;
+	char corename[MAXCOMLEN+6];		/* progname.core */
+	int error, closeerr;
+
+	/* Refuse when the saved and real ids differ. */
+	if (pcred->p_svuid != pcred->p_ruid)
+		return (EFAULT);
+	if (pcred->p_svgid != pcred->p_rgid)
+		return (EFAULT);
+
+	/* Refuse when UPAGES plus data and stack reach the core limit. */
+	if (ctob(UPAGES + vm->vm_dsize + vm->vm_ssize) >=
+	    p->p_rlimit[RLIMIT_CORE].rlim_cur)
+		return (EFAULT);
+
+	sprintf(corename, "%s.core", p->p_comm);
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, corename, p);
+	error = vn_open(&nd,
+	    O_CREAT | FWRITE, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+	if (error != 0)
+		return (error);
+	vp = nd.ni_vp;
+
+	/* Don't dump to non-regular files or files with links. */
+	if (vp->v_type != VREG) {
+		error = EFAULT;
+		goto out;
+	}
+	if (VOP_GETATTR(vp, &vattr, cred, p) != 0 || vattr.va_nlink != 1) {
+		error = EFAULT;
+		goto out;
+	}
+
+	/* Set the file size to zero before writing; the result is not checked. */
+	VATTR_NULL(&vattr);
+	vattr.va_size = 0;
+	LEASE_CHECK(vp, p, cred, LEASE_WRITE);
+	VOP_SETATTR(vp, &vattr, cred, p);
+
+	/* Snapshot the proc and eproc data into the u-area copy. */
+	p->p_acflag |= ACORE;
+	bcopy(p, &p->p_addr->u_kproc.kp_proc, sizeof(struct proc));
+	fill_eproc(p, &p->p_addr->u_kproc.kp_eproc);
+
+	error = cpu_coredump(p, vp, cred);
+	if (error != 0)
+		goto out;
+
+	/* Data segment, written at file offset ctob(UPAGES). */
+	error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr,
+	    (int)ctob(vm->vm_dsize), (off_t)ctob(UPAGES), UIO_USERSPACE,
+	    IO_NODELOCKED|IO_UNIT, cred, (int *) NULL, p);
+	if (error != 0)
+		goto out;
+
+	/* Stack segment, written directly after the data segment. */
+	error = vn_rdwr(UIO_WRITE, vp,
+	    (caddr_t) trunc_page(USRSTACK - ctob(vm->vm_ssize)),
+	    round_page(ctob(vm->vm_ssize)),
+	    (off_t)ctob(UPAGES) + ctob(vm->vm_dsize), UIO_USERSPACE,
+	    IO_NODELOCKED|IO_UNIT, cred, (int *) NULL, p);
+out:
+	/* An earlier error takes precedence over a close failure. */
+	VOP_UNLOCK(vp);
+	closeerr = vn_close(vp, FWRITE, cred, p);
+	return (error != 0 ? error : closeerr);
+}
+
+/*
+ * Nonexistent system call-- signal process (may want to handle it).
+ * Flag error in case process won't see signal immediately (blocked or ignored).
+ */
+struct nosys_args {
+	int	dummy;	/* not read by nosys() */
+};
+/* ARGSUSED */
+nosys(p, args, retval)
+	struct proc *p;
+	struct nosys_args *args;
+	int *retval;
+{
+
+	psignal(p, SIGSYS);	/* post SIGSYS to the calling process */
+	return (EINVAL);	/* returned unconditionally, whatever the signal's disposition */
+}
diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c
new file mode 100644
index 00000000000..5c12afcba33
--- /dev/null
+++ b/sys/kern/kern_subr.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 1982, 1986, 1991, 1993
+ *	The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94 + */ + +#include +#include +#include +#include +#include + +uiomove(cp, n, uio) + register caddr_t cp; + register int n; + register struct uio *uio; +{ + register struct iovec *iov; + u_int cnt; + int error = 0; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_READ && uio->uio_rw != UIO_WRITE) + panic("uiomove: mode"); + if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) + panic("uiomove proc"); +#endif + while (n > 0 && uio->uio_resid) { + iov = uio->uio_iov; + cnt = iov->iov_len; + if (cnt == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + continue; + } + if (cnt > n) + cnt = n; + switch (uio->uio_segflg) { + + case UIO_USERSPACE: + case UIO_USERISPACE: + if (uio->uio_rw == UIO_READ) + error = copyout(cp, iov->iov_base, cnt); + else + error = copyin(iov->iov_base, cp, cnt); + if (error) + return (error); + break; + + case UIO_SYSSPACE: + if (uio->uio_rw == UIO_READ) + bcopy((caddr_t)cp, iov->iov_base, cnt); + else + bcopy(iov->iov_base, (caddr_t)cp, cnt); + break; + } + iov->iov_base += cnt; + iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; + cp += cnt; + n -= cnt; + } + return (error); +} + +/* + * Give next character to user as result of read. 
+ */ +ureadc(c, uio) + register int c; + register struct uio *uio; +{ + register struct iovec *iov; + +again: + if (uio->uio_iovcnt == 0 || uio->uio_resid == 0) + panic("ureadc"); + iov = uio->uio_iov; + if (iov->iov_len == 0) { + uio->uio_iovcnt--; + uio->uio_iov++; + goto again; + } + switch (uio->uio_segflg) { + + case UIO_USERSPACE: + if (subyte(iov->iov_base, c) < 0) + return (EFAULT); + break; + + case UIO_SYSSPACE: + *iov->iov_base = c; + break; + + case UIO_USERISPACE: + if (suibyte(iov->iov_base, c) < 0) + return (EFAULT); + break; + } + iov->iov_base++; + iov->iov_len--; + uio->uio_resid--; + uio->uio_offset++; + return (0); +} + +#ifdef vax /* unused except by ct.c, other oddities XXX */ +/* + * Get next character written in by user from uio. + */ +uwritec(uio) + struct uio *uio; +{ + register struct iovec *iov; + register int c; + + if (uio->uio_resid <= 0) + return (-1); +again: + if (uio->uio_iovcnt <= 0) + panic("uwritec"); + iov = uio->uio_iov; + if (iov->iov_len == 0) { + uio->uio_iov++; + if (--uio->uio_iovcnt == 0) + return (-1); + goto again; + } + switch (uio->uio_segflg) { + + case UIO_USERSPACE: + c = fubyte(iov->iov_base); + break; + + case UIO_SYSSPACE: + c = *(u_char *) iov->iov_base; + break; + + case UIO_USERISPACE: + c = fuibyte(iov->iov_base); + break; + } + if (c < 0) + return (-1); + iov->iov_base++; + iov->iov_len--; + uio->uio_resid--; + uio->uio_offset++; + return (c); +} +#endif /* vax */ + +/* + * General routine to allocate a hash table. 
+ */ +void * +hashinit(elements, type, hashmask) + int elements, type; + u_long *hashmask; +{ + long hashsize; + LIST_HEAD(generic, generic) *hashtbl; + int i; + + if (elements <= 0) + panic("hashinit: bad cnt"); + for (hashsize = 1; hashsize <= elements; hashsize <<= 1) + continue; + hashsize >>= 1; + hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); + for (i = 0; i < hashsize; i++) + LIST_INIT(&hashtbl[i]); + *hashmask = hashsize - 1; + return (hashtbl); +} diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c new file mode 100644 index 00000000000..1c2a578f303 --- /dev/null +++ b/sys/kern/kern_synch.c @@ -0,0 +1,666 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)kern_synch.c	8.6 (Berkeley) 1/21/94
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef KTRACE
+#include
+#endif
+
+#include
+
+u_char	curpriority;		/* usrpri of curproc */
+int	lbolt;			/* once a second sleep address */
+
+/*
+ * Force switch among equal priority processes every 100ms.
+ * The routine requests a reschedule and then re-arms itself through
+ * timeout(), so one call keeps it running every hz / 10 ticks.
+ */
+/* ARGSUSED */
+void
+roundrobin(arg)
+	void *arg;
+{
+
+	need_resched();
+	timeout(roundrobin, NULL, hz / 10);	/* run again in hz / 10 ticks */
+}
+
+/*
+ * Constants for digital decay and forget:
+ *	90% of (p_estcpu) usage in 5 * loadav time
+ *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
+ *          Note that, as ps(1) mentions, this can let percentages
+ *          total over 100% (I've seen 137.9% for 3 processes).
+ *
+ * Note that hardclock updates p_estcpu and p_cpticks independently.
+ *
+ * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
+ * That is, the system wants to compute a value of decay such + * that the following for loop: + * for (i = 0; i < (5 * loadavg); i++) + * p_estcpu *= decay; + * will compute + * p_estcpu *= 0.1; + * for all values of loadavg: + * + * Mathematically this loop can be expressed by saying: + * decay ** (5 * loadavg) ~= .1 + * + * The system computes decay as: + * decay = (2 * loadavg) / (2 * loadavg + 1) + * + * We wish to prove that the system's computation of decay + * will always fulfill the equation: + * decay ** (5 * loadavg) ~= .1 + * + * If we compute b as: + * b = 2 * loadavg + * then + * decay = b / (b + 1) + * + * We now need to prove two things: + * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) + * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) + * + * Facts: + * For x close to zero, exp(x) =~ 1 + x, since + * exp(x) = 0! + x**1/1! + x**2/2! + ... . + * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. + * For x close to zero, ln(1+x) =~ x, since + * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 + * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). + * ln(.1) =~ -2.30 + * + * Proof of (1): + * Solve (factor)**(power) =~ .1 given power (5*loadav): + * solving for factor, + * ln(factor) =~ (-2.30/5*loadav), or + * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = + * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED + * + * Proof of (2): + * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): + * solving for power, + * power*ln(b/(b+1)) =~ -2.30, or + * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. 
QED
+ *
+ * Actual power values for the implemented algorithm are as follows:
+ *      loadav: 1       2       3       4
+ *      power:  5.68    10.32   14.94   19.55
+ */
+
+/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
+#define	loadfactor(loadav)	(2 * (loadav))
+#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
+
+/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
+fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */
+
+/*
+ * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
+ * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
+ * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
+ *
+ * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
+ *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
+ *
+ * If you don't want to bother with the faster/more-accurate formula, you
+ * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
+ * (more general) method of calculating the %age of CPU used by a process.
+ */
+#define	CCPU_SHIFT	11
+
+/*
+ * Recompute process priorities, every hz ticks.
+ *
+ * Each pass also wakes anything sleeping on &lbolt, ages p_swtime,
+ * p_slptime and p_pctcpu for every process on allproc, calls vmmeter(),
+ * and finally re-arms itself with timeout(schedcpu, ..., hz).
+ */
+/* ARGSUSED */
+void
+schedcpu(arg)
+	void *arg;
+{
+	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
+	register struct proc *p;
+	register int s;
+	register unsigned int newcpu;
+
+	wakeup((caddr_t)&lbolt);
+	for (p = (struct proc *)allproc; p != NULL; p = p->p_next) {
+		/*
+		 * Increment time in/out of memory and sleep time
+		 * (if sleeping). We ignore overflow; with 16-bit int's
+		 * (remember them?) overflow takes 45 days.
+		 */
+		p->p_swtime++;
+		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
+			p->p_slptime++;
+		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
+		/*
+		 * If the process has slept the entire second,
+		 * stop recalculating its priority until it wakes up.
+		 */
+		if (p->p_slptime > 1)
+			continue;
+		s = splstatclock();	/* prevent state changes */
+		/*
+		 * p_pctcpu is only for ps.
+		 */
+#if	(FSHIFT >= CCPU_SHIFT)
+		p->p_pctcpu += (hz == 100)?
+			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
+                	100 * (((fixpt_t) p->p_cpticks)
+				<< (FSHIFT - CCPU_SHIFT)) / hz;
+#else
+		p->p_pctcpu += ((FSCALE - ccpu) *
+			(p->p_cpticks * FSCALE / hz)) >> FSHIFT;
+#endif
+		p->p_cpticks = 0;
+		newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu) + p->p_nice;
+		p->p_estcpu = min(newcpu, UCHAR_MAX);	/* clamp to UCHAR_MAX */
+		resetpriority(p);
+		if (p->p_priority >= PUSER) {
+#define	PPQ	(128 / NQS)		/* priorities per queue */
+			if ((p != curproc) &&
+			    p->p_stat == SRUN &&
+			    (p->p_flag & P_INMEM) &&
+			    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
+				remrq(p);	/* requeue: the new priority maps to a different run queue */
+				p->p_priority = p->p_usrpri;
+				setrunqueue(p);
+			} else
+				p->p_priority = p->p_usrpri;
+		}
+		splx(s);
+	}
+	vmmeter();
+	if (bclnlist != NULL)
+		wakeup((caddr_t)pageproc);
+	timeout(schedcpu, (void *)0, hz);	/* re-arm for hz ticks from now */
+}
+
+/*
+ * Recalculate the priority of a process after it has slept for a while.
+ * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
+ * least six times the loadfactor will decay p_estcpu to zero.
+ *
+ * Note that the decay loop decrements p_slptime as it runs.
+ */
+void
+updatepri(p)
+	register struct proc *p;
+{
+	register unsigned int newcpu = p->p_estcpu;
+	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
+
+	if (p->p_slptime > 5 * loadfac)	/* NOTE(review): decay_cpu() treats loadfac as FSCALE-scaled; confirm the units of this comparison */
+		p->p_estcpu = 0;
+	else {
+		p->p_slptime--;	/* the first time was done in schedcpu */
+		while (newcpu && --p->p_slptime)
+			newcpu = (int) decay_cpu(loadfac, newcpu);
+		p->p_estcpu = min(newcpu, UCHAR_MAX);
+	}
+	resetpriority(p);
+}
+
+/*
+ * We're only looking at 7 bits of the address; everything is
+ * aligned to 4, lots of things are aligned to greater powers
+ * of 2. Shift right by 8, i.e. drop the bottom 256 worth.
+ */
+#define TABLESIZE	128
+#define LOOKUP(x)	(((int)(x) >> 8) & (TABLESIZE - 1))
+struct slpque {
+	struct proc *sq_head;	/* first process queued in this bucket */
+	struct proc **sq_tailp;	/* address of the last queued process's p_forw */
+} slpque[TABLESIZE];
+
+/*
+ * During autoconfiguration or after a panic, a sleep will simply
+ * lower the priority briefly to allow interrupts, then return.
+ * The priority to be used (safepri) is machine-dependent, thus this
+ * value is initialized and maintained in the machine-dependent layers.
+ * This priority will typically be 0, or the lowest priority
+ * that is safe for use on the interrupt stack; it can be made
+ * higher to block network software interrupts after panics.
+ */
+int safepri;
+
+/*
+ * General sleep call. Suspends the current process until a wakeup is
+ * performed on the specified identifier. The process will then be made
+ * runnable with the specified priority. Sleeps at most timo/hz seconds
+ * (0 means no timeout). If priority includes the PCATCH flag, signals are
+ * checked before and after sleeping, else signals are not checked. Returns
+ * 0 if awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
+ * signal needs to be delivered, ERESTART is returned if the current system
+ * call should be restarted if possible, and EINTR is returned if the system
+ * call should be interrupted by the signal (return EINTR).
+ *
+ * While the system is cold or after a panic the call returns 0 at once
+ * without sleeping.
+ */
+int
+tsleep(ident, priority, wmesg, timo)
+	void *ident;
+	int priority, timo;
+	char *wmesg;
+{
+	register struct proc *p = curproc;
+	register struct slpque *qp;
+	register s;
+	int sig, catch = priority & PCATCH;	/* catch: nonzero when signals may end the sleep */
+	extern int cold;
+	void endtsleep __P((void *));
+
+#ifdef KTRACE
+	if (KTRPOINT(p, KTR_CSW))
+		ktrcsw(p->p_tracep, 1, 0);
+#endif
+	s = splhigh();
+	if (cold || panicstr) {
+		/*
+		 * After a panic, or during autoconfiguration,
+		 * just give interrupts a chance, then just return;
+		 * don't run any other procs or panic below,
+		 * in case this is the idle process and already asleep.
+		 */
+		splx(safepri);
+		splx(s);
+		return (0);
+	}
+#ifdef DIAGNOSTIC
+	if (ident == NULL || p->p_stat != SRUN || p->p_back)
+		panic("tsleep");
+#endif
+	p->p_wchan = ident;
+	p->p_wmesg = wmesg;
+	p->p_slptime = 0;
+	p->p_priority = priority & PRIMASK;	/* strip flag bits such as PCATCH */
+	qp = &slpque[LOOKUP(ident)];
+	if (qp->sq_head == 0)
+		qp->sq_head = p;
+	else
+		*qp->sq_tailp = p;
+	*(qp->sq_tailp = &p->p_forw) = 0;	/* p becomes the tail; terminate the list */
+	if (timo)
+		timeout(endtsleep, (void *)p, timo);
+	/*
+	 * We put ourselves on the sleep queue and start our timeout
+	 * before calling CURSIG, as we could stop there, and a wakeup
+	 * or a SIGCONT (or both) could occur while we were stopped.
+	 * A SIGCONT would cause us to be marked as SSLEEP
+	 * without resuming us, thus we must be ready for sleep
+	 * when CURSIG is called. If the wakeup happens while we're
+	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
+	 */
+	if (catch) {
+		p->p_flag |= P_SINTR;
+		if (sig = CURSIG(p)) {
+			if (p->p_wchan)
+				unsleep(p);
+			p->p_stat = SRUN;
+			goto resume;
+		}
+		if (p->p_wchan == 0) {
+			catch = 0;	/* already awakened: skip the signal check after resume */
+			goto resume;
+		}
+	} else
+		sig = 0;
+	p->p_stat = SSLEEP;
+	p->p_stats->p_ru.ru_nvcsw++;
+	mi_switch();
+resume:
+	curpriority = p->p_usrpri;
+	splx(s);
+	p->p_flag &= ~P_SINTR;
+	if (p->p_flag & P_TIMEOUT) {	/* P_TIMEOUT is set by endtsleep() */
+		p->p_flag &= ~P_TIMEOUT;
+		if (sig == 0) {
+#ifdef KTRACE
+			if (KTRPOINT(p, KTR_CSW))
+				ktrcsw(p->p_tracep, 0, 0);
+#endif
+			return (EWOULDBLOCK);
+		}
+	} else if (timo)
+		untimeout(endtsleep, (void *)p);	/* timeout did not fire: cancel it */
+	if (catch && (sig != 0 || (sig = CURSIG(p)))) {
+#ifdef KTRACE
+		if (KTRPOINT(p, KTR_CSW))
+			ktrcsw(p->p_tracep, 0, 0);
+#endif
+		if (p->p_sigacts->ps_sigintr & sigmask(sig))
+			return (EINTR);
+		return (ERESTART);
+	}
+#ifdef KTRACE
+	if (KTRPOINT(p, KTR_CSW))
+		ktrcsw(p->p_tracep, 0, 0);
+#endif
+	return (0);
+}
+
+/*
+ * Implement timeout for tsleep.
+ * If process hasn't been awakened (wchan non-zero),
+ * set timeout flag and undo the sleep. If proc
+ * is stopped, just unsleep so it will remain stopped.
+ */
+void
+endtsleep(arg)
+	void *arg;	/* the struct proc pointer that tsleep() passed to timeout() */
+{
+	register struct proc *p;
+	int s;
+
+	p = (struct proc *)arg;
+	s = splhigh();
+	if (p->p_wchan) {
+		if (p->p_stat == SSLEEP)
+			setrunnable(p);
+		else
+			unsleep(p);
+		p->p_flag |= P_TIMEOUT;	/* tsleep() tests and clears this after resuming */
+	}
+	splx(s);
+}
+
+/*
+ * Short-term, non-interruptible sleep.
+ *
+ * The caller is queued on the slpque bucket for ident with no wait
+ * message and P_SINTR left untouched, then switched out until a
+ * wakeup. While the system is cold or after a panic the call returns
+ * without sleeping.
+ */
+void
+sleep(ident, priority)
+	void *ident;
+	int priority;
+{
+	register struct proc *p = curproc;
+	register struct slpque *qp;
+	register s;
+	extern int cold;
+
+#ifdef DIAGNOSTIC
+	if (priority > PZERO) {
+		printf("sleep called with priority %d > PZERO, wchan: %x\n",
+		    priority, ident);
+		panic("old sleep");
+	}
+#endif
+	s = splhigh();
+	if (cold || panicstr) {
+		/*
+		 * After a panic, or during autoconfiguration,
+		 * just give interrupts a chance, then just return;
+		 * don't run any other procs or panic below,
+		 * in case this is the idle process and already asleep.
+		 */
+		splx(safepri);
+		splx(s);
+		return;
+	}
+#ifdef DIAGNOSTIC
+	if (ident == NULL || p->p_stat != SRUN || p->p_back)
+		panic("sleep");
+#endif
+	p->p_wchan = ident;
+	p->p_wmesg = NULL;
+	p->p_slptime = 0;
+	p->p_priority = priority;
+	qp = &slpque[LOOKUP(ident)];
+	if (qp->sq_head == 0)
+		qp->sq_head = p;
+	else
+		*qp->sq_tailp = p;
+	*(qp->sq_tailp = &p->p_forw) = 0;	/* p becomes the tail; terminate the list */
+	p->p_stat = SSLEEP;
+	p->p_stats->p_ru.ru_nvcsw++;
+#ifdef KTRACE
+	if (KTRPOINT(p, KTR_CSW))
+		ktrcsw(p->p_tracep, 1, 0);
+#endif
+	mi_switch();
+#ifdef KTRACE
+	if (KTRPOINT(p, KTR_CSW))
+		ktrcsw(p->p_tracep, 0, 0);
+#endif
+	curpriority = p->p_usrpri;
+	splx(s);
+}
+
+/*
+ * Remove a process from its wait queue
+ * and clear its p_wchan. Does nothing when p_wchan is already zero.
+ */
+void
+unsleep(p)
+	register struct proc *p;
+{
+	register struct slpque *qp;
+	register struct proc **hp;
+	int s;
+
+	s = splhigh();
+	if (p->p_wchan) {
+		hp = &(qp = &slpque[LOOKUP(p->p_wchan)])->sq_head;
+		while (*hp != p)	/* find the link that points at p */
+			hp = &(*hp)->p_forw;
+		*hp = p->p_forw;
+		if (qp->sq_tailp == &p->p_forw)
+			qp->sq_tailp = hp;	/* p was the tail: pull the tail pointer back */
+		p->p_wchan = 0;
+	}
+	splx(s);
+}
+
+/*
+ * Make all processes sleeping on the specified
identifier runnable. + */ +void +wakeup(ident) + register void *ident; +{ + register struct slpque *qp; + register struct proc *p, **q; + int s; + + s = splhigh(); + qp = &slpque[LOOKUP(ident)]; +restart: + for (q = &qp->sq_head; p = *q; ) { +#ifdef DIAGNOSTIC + if (p->p_back || p->p_stat != SSLEEP && p->p_stat != SSTOP) + panic("wakeup"); +#endif + if (p->p_wchan == ident) { + p->p_wchan = 0; + *q = p->p_forw; + if (qp->sq_tailp == &p->p_forw) + qp->sq_tailp = q; + if (p->p_stat == SSLEEP) { + /* OPTIMIZED EXPANSION OF setrunnable(p); */ + if (p->p_slptime > 1) + updatepri(p); + p->p_slptime = 0; + p->p_stat = SRUN; + if (p->p_flag & P_INMEM) + setrunqueue(p); + /* + * Since curpriority is a user priority, + * p->p_priority is always better than + * curpriority. + */ + if ((p->p_flag & P_INMEM) == 0) + wakeup((caddr_t)&proc0); + else + need_resched(); + /* END INLINE EXPANSION */ + goto restart; + } + } else + q = &p->p_forw; + } + splx(s); +} + +/* + * The machine independent parts of mi_switch(). + * Must be called at splstatclock() or higher. + */ +void +mi_switch() +{ + register struct proc *p = curproc; /* XXX */ + register struct rlimit *rlim; + register long s, u; + struct timeval tv; + + /* + * Compute the amount of time during which the current + * process was running, and add that to its total so far. + */ + microtime(&tv); + u = p->p_rtime.tv_usec + (tv.tv_usec - runtime.tv_usec); + s = p->p_rtime.tv_sec + (tv.tv_sec - runtime.tv_sec); + if (u < 0) { + u += 1000000; + s--; + } else if (u >= 1000000) { + u -= 1000000; + s++; + } + p->p_rtime.tv_usec = u; + p->p_rtime.tv_sec = s; + + /* + * Check if the process exceeds its cpu resource allocation. + * If over max, kill it. In any case, if it has run for more + * than 10 minutes, reduce priority to give others a chance. 
+ */ + rlim = &p->p_rlimit[RLIMIT_CPU]; + if (s >= rlim->rlim_cur) { + if (s >= rlim->rlim_max) + psignal(p, SIGKILL); + else { + psignal(p, SIGXCPU); + if (rlim->rlim_cur < rlim->rlim_max) + rlim->rlim_cur += 5; + } + } + if (s > 10 * 60 && p->p_ucred->cr_uid && p->p_nice == NZERO) { + p->p_nice = NZERO + 4; + resetpriority(p); + } + + /* + * Pick a new current process and record its start time. + */ + cnt.v_swtch++; + cpu_switch(p); + microtime(&runtime); +} + +/* + * Initialize the (doubly-linked) run queues + * to be empty. + */ +rqinit() +{ + register int i; + + for (i = 0; i < NQS; i++) + qs[i].ph_link = qs[i].ph_rlink = (struct proc *)&qs[i]; +} + +/* + * Change process state to be runnable, + * placing it on the run queue if it is in memory, + * and awakening the swapper if it isn't in memory. + */ +void +setrunnable(p) + register struct proc *p; +{ + register int s; + + s = splhigh(); + switch (p->p_stat) { + case 0: + case SRUN: + case SZOMB: + default: + panic("setrunnable"); + case SSTOP: + case SSLEEP: + unsleep(p); /* e.g. when sending signals */ + break; + + case SIDL: + break; + } + p->p_stat = SRUN; + if (p->p_flag & P_INMEM) + setrunqueue(p); + splx(s); + if (p->p_slptime > 1) + updatepri(p); + p->p_slptime = 0; + if ((p->p_flag & P_INMEM) == 0) + wakeup((caddr_t)&proc0); + else if (p->p_priority < curpriority) + need_resched(); +} + +/* + * Compute the priority of a process when running in user mode. + * Arrange to reschedule if the resulting priority is better + * than that of the current process. 
+ */ +void +resetpriority(p) + register struct proc *p; +{ + register unsigned int newpriority; + + newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice; + newpriority = min(newpriority, MAXPRI); + p->p_usrpri = newpriority; + if (newpriority < curpriority) + need_resched(); +} diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c new file mode 100644 index 00000000000..ae16decff81 --- /dev/null +++ b/sys/kern/kern_sysctl.c @@ -0,0 +1,787 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Karels at Berkeley Software Design, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 + */ + +/* + * sysctl system call. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +sysctlfn kern_sysctl; +sysctlfn hw_sysctl; +#ifdef DEBUG +sysctlfn debug_sysctl; +#endif +extern sysctlfn vm_sysctl; +extern sysctlfn fs_sysctl; +extern sysctlfn net_sysctl; +extern sysctlfn cpu_sysctl; + +/* + * Locking and stats + */ +static struct sysctl_lock { + int sl_lock; + int sl_want; + int sl_locked; +} memlock; + +struct sysctl_args { + int *name; + u_int namelen; + void *old; + size_t *oldlenp; + void *new; + size_t newlen; +}; + +int +__sysctl(p, uap, retval) + struct proc *p; + register struct sysctl_args *uap; + int *retval; +{ + int error, dolock = 1; + u_int savelen, oldlen = 0; + sysctlfn *fn; + int name[CTL_MAXNAME]; + + if (uap->new != NULL && (error = suser(p->p_ucred, &p->p_acflag))) + return (error); + /* + * all top-level sysctl names are non-terminal + */ + if (uap->namelen > CTL_MAXNAME || uap->namelen < 2) + return (EINVAL); + if (error = copyin(uap->name, &name, uap->namelen * sizeof(int))) + return (error); + + switch (name[0]) { + case CTL_KERN: + fn = kern_sysctl; + if (name[2] != KERN_VNODE) /* XXX */ + dolock = 0; + break; + case CTL_HW: + fn = hw_sysctl; + break; + case CTL_VM: + fn = vm_sysctl; + break; + case CTL_NET: + fn = net_sysctl; + break; +#ifdef notyet + case CTL_FS: + fn = 
fs_sysctl; + break; +#endif + case CTL_MACHDEP: + fn = cpu_sysctl; + break; +#ifdef DEBUG + case CTL_DEBUG: + fn = debug_sysctl; + break; +#endif + default: + return (EOPNOTSUPP); + } + + if (uap->oldlenp && + (error = copyin(uap->oldlenp, &oldlen, sizeof(oldlen)))) + return (error); + if (uap->old != NULL) { + if (!useracc(uap->old, oldlen, B_WRITE)) + return (EFAULT); + while (memlock.sl_lock) { + memlock.sl_want = 1; + sleep((caddr_t)&memlock, PRIBIO+1); + memlock.sl_locked++; + } + memlock.sl_lock = 1; + if (dolock) + vslock(uap->old, oldlen); + savelen = oldlen; + } + error = (*fn)(name + 1, uap->namelen - 1, uap->old, &oldlen, + uap->new, uap->newlen, p); + if (uap->old != NULL) { + if (dolock) + vsunlock(uap->old, savelen, B_WRITE); + memlock.sl_lock = 0; + if (memlock.sl_want) { + memlock.sl_want = 0; + wakeup((caddr_t)&memlock); + } + } + if (error) + return (error); + if (uap->oldlenp) + error = copyout(&oldlen, uap->oldlenp, sizeof(oldlen)); + *retval = oldlen; + return (0); +} + +/* + * Attributes stored in the kernel. + */ +char hostname[MAXHOSTNAMELEN]; +int hostnamelen; +long hostid; +int securelevel; + +/* + * kernel related system variables. 
+ */ +kern_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) + int *name; + u_int namelen; + void *oldp; + size_t *oldlenp; + void *newp; + size_t newlen; + struct proc *p; +{ + int error, level, inthostid; + extern char ostype[], osrelease[], version[]; + + /* all sysctl names at this level are terminal */ + if (namelen != 1 && !(name[0] == KERN_PROC || name[0] == KERN_PROF)) + return (ENOTDIR); /* overloaded */ + + switch (name[0]) { + case KERN_OSTYPE: + return (sysctl_rdstring(oldp, oldlenp, newp, ostype)); + case KERN_OSRELEASE: + return (sysctl_rdstring(oldp, oldlenp, newp, osrelease)); + case KERN_OSREV: + return (sysctl_rdint(oldp, oldlenp, newp, BSD)); + case KERN_VERSION: + return (sysctl_rdstring(oldp, oldlenp, newp, version)); + case KERN_MAXVNODES: + return(sysctl_int(oldp, oldlenp, newp, newlen, &desiredvnodes)); + case KERN_MAXPROC: + return (sysctl_int(oldp, oldlenp, newp, newlen, &maxproc)); + case KERN_MAXFILES: + return (sysctl_int(oldp, oldlenp, newp, newlen, &maxfiles)); + case KERN_ARGMAX: + return (sysctl_rdint(oldp, oldlenp, newp, ARG_MAX)); + case KERN_SECURELVL: + level = securelevel; + if ((error = sysctl_int(oldp, oldlenp, newp, newlen, &level)) || + newp == NULL) + return (error); + if (level < securelevel && p->p_pid != 1) + return (EPERM); + securelevel = level; + return (0); + case KERN_HOSTNAME: + error = sysctl_string(oldp, oldlenp, newp, newlen, + hostname, sizeof(hostname)); + if (newp && !error) + hostnamelen = newlen; + return (error); + case KERN_HOSTID: + inthostid = hostid; /* XXX assumes sizeof long <= sizeof int */ + error = sysctl_int(oldp, oldlenp, newp, newlen, &inthostid); + hostid = inthostid; + return (error); + case KERN_CLOCKRATE: + return (sysctl_clockrate(oldp, oldlenp)); + case KERN_BOOTTIME: + return (sysctl_rdstruct(oldp, oldlenp, newp, &boottime, + sizeof(struct timeval))); + case KERN_VNODE: + return (sysctl_vnode(oldp, oldlenp)); + case KERN_PROC: + return (sysctl_doproc(name + 1, namelen - 1, oldp, 
oldlenp)); + case KERN_FILE: + return (sysctl_file(oldp, oldlenp)); +#ifdef GPROF + case KERN_PROF: + return (sysctl_doprof(name + 1, namelen - 1, oldp, oldlenp, + newp, newlen)); +#endif + case KERN_POSIX1: + return (sysctl_rdint(oldp, oldlenp, newp, _POSIX_VERSION)); + case KERN_NGROUPS: + return (sysctl_rdint(oldp, oldlenp, newp, NGROUPS_MAX)); + case KERN_JOB_CONTROL: + return (sysctl_rdint(oldp, oldlenp, newp, 1)); + case KERN_SAVED_IDS: +#ifdef _POSIX_SAVED_IDS + return (sysctl_rdint(oldp, oldlenp, newp, 1)); +#else + return (sysctl_rdint(oldp, oldlenp, newp, 0)); +#endif + default: + return (EOPNOTSUPP); + } + /* NOTREACHED */ +} + +/* + * hardware related system variables. + */ +hw_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) + int *name; + u_int namelen; + void *oldp; + size_t *oldlenp; + void *newp; + size_t newlen; + struct proc *p; +{ + extern char machine[], cpu_model[]; + + /* all sysctl names at this level are terminal */ + if (namelen != 1) + return (ENOTDIR); /* overloaded */ + + switch (name[0]) { + case HW_MACHINE: + return (sysctl_rdstring(oldp, oldlenp, newp, machine)); + case HW_MODEL: + return (sysctl_rdstring(oldp, oldlenp, newp, cpu_model)); + case HW_NCPU: + return (sysctl_rdint(oldp, oldlenp, newp, 1)); /* XXX */ + case HW_BYTEORDER: + return (sysctl_rdint(oldp, oldlenp, newp, BYTE_ORDER)); + case HW_PHYSMEM: + return (sysctl_rdint(oldp, oldlenp, newp, ctob(physmem))); + case HW_USERMEM: + return (sysctl_rdint(oldp, oldlenp, newp, + ctob(physmem - cnt.v_wire_count))); + case HW_PAGESIZE: + return (sysctl_rdint(oldp, oldlenp, newp, PAGE_SIZE)); + default: + return (EOPNOTSUPP); + } + /* NOTREACHED */ +} + +#ifdef DEBUG +/* + * Debugging related system variables. 
+ */ +struct ctldebug debug0, debug1, debug2, debug3, debug4; +struct ctldebug debug5, debug6, debug7, debug8, debug9; +struct ctldebug debug10, debug11, debug12, debug13, debug14; +struct ctldebug debug15, debug16, debug17, debug18, debug19; +static struct ctldebug *debugvars[CTL_DEBUG_MAXID] = { + &debug0, &debug1, &debug2, &debug3, &debug4, + &debug5, &debug6, &debug7, &debug8, &debug9, + &debug10, &debug11, &debug12, &debug13, &debug14, + &debug15, &debug16, &debug17, &debug18, &debug19, +}; +int +debug_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) + int *name; + u_int namelen; + void *oldp; + size_t *oldlenp; + void *newp; + size_t newlen; + struct proc *p; +{ + struct ctldebug *cdp; + + /* all sysctl names at this level are name and field */ + if (namelen != 2) + return (ENOTDIR); /* overloaded */ + cdp = debugvars[name[0]]; + if (cdp->debugname == 0) + return (EOPNOTSUPP); + switch (name[1]) { + case CTL_DEBUG_NAME: + return (sysctl_rdstring(oldp, oldlenp, newp, cdp->debugname)); + case CTL_DEBUG_VALUE: + return (sysctl_int(oldp, oldlenp, newp, newlen, cdp->debugvar)); + default: + return (EOPNOTSUPP); + } + /* NOTREACHED */ +} +#endif /* DEBUG */ + +/* + * Validate parameters and get old / set new parameters + * for an integer-valued sysctl function. + */ +sysctl_int(oldp, oldlenp, newp, newlen, valp) + void *oldp; + size_t *oldlenp; + void *newp; + size_t newlen; + int *valp; +{ + int error = 0; + + if (oldp && *oldlenp < sizeof(int)) + return (ENOMEM); + if (newp && newlen != sizeof(int)) + return (EINVAL); + *oldlenp = sizeof(int); + if (oldp) + error = copyout(valp, oldp, sizeof(int)); + if (error == 0 && newp) + error = copyin(newp, valp, sizeof(int)); + return (error); +} + +/* + * As above, but read-only. 
+ */ +sysctl_rdint(oldp, oldlenp, newp, val) + void *oldp; + size_t *oldlenp; + void *newp; + int val; +{ + int error = 0; + + if (oldp && *oldlenp < sizeof(int)) + return (ENOMEM); + if (newp) + return (EPERM); + *oldlenp = sizeof(int); + if (oldp) + error = copyout((caddr_t)&val, oldp, sizeof(int)); + return (error); +} + +/* + * Validate parameters and get old / set new parameters + * for a string-valued sysctl function. + */ +sysctl_string(oldp, oldlenp, newp, newlen, str, maxlen) + void *oldp; + size_t *oldlenp; + void *newp; + size_t newlen; + char *str; + int maxlen; +{ + int len, error = 0; + + len = strlen(str) + 1; + if (oldp && *oldlenp < len) + return (ENOMEM); + if (newp && newlen >= maxlen) + return (EINVAL); + if (oldp) { + *oldlenp = len; + error = copyout(str, oldp, len); + } + if (error == 0 && newp) { + error = copyin(newp, str, newlen); + str[newlen] = 0; + } + return (error); +} + +/* + * As above, but read-only. + */ +sysctl_rdstring(oldp, oldlenp, newp, str) + void *oldp; + size_t *oldlenp; + void *newp; + char *str; +{ + int len, error = 0; + + len = strlen(str) + 1; + if (oldp && *oldlenp < len) + return (ENOMEM); + if (newp) + return (EPERM); + *oldlenp = len; + if (oldp) + error = copyout(str, oldp, len); + return (error); +} + +/* + * Validate parameters and get old / set new parameters + * for a structure oriented sysctl function. + */ +sysctl_struct(oldp, oldlenp, newp, newlen, sp, len) + void *oldp; + size_t *oldlenp; + void *newp; + size_t newlen; + void *sp; + int len; +{ + int error = 0; + + if (oldp && *oldlenp < len) + return (ENOMEM); + if (newp && newlen > len) + return (EINVAL); + if (oldp) { + *oldlenp = len; + error = copyout(sp, oldp, len); + } + if (error == 0 && newp) + error = copyin(newp, sp, len); + return (error); +} + +/* + * Validate parameters and get old parameters + * for a structure oriented sysctl function. 
+ */ +sysctl_rdstruct(oldp, oldlenp, newp, sp, len) + void *oldp; + size_t *oldlenp; + void *newp, *sp; + int len; +{ + int error = 0; + + if (oldp && *oldlenp < len) + return (ENOMEM); + if (newp) + return (EPERM); + *oldlenp = len; + if (oldp) + error = copyout(sp, oldp, len); + return (error); +} + +/* + * Get file structures. + */ +sysctl_file(where, sizep) + char *where; + size_t *sizep; +{ + int buflen, error; + struct file *fp; + char *start = where; + + buflen = *sizep; + if (where == NULL) { + /* + * overestimate by 10 files + */ + *sizep = sizeof(filehead) + (nfiles + 10) * sizeof(struct file); + return (0); + } + + /* + * first copyout filehead + */ + if (buflen < sizeof(filehead)) { + *sizep = 0; + return (0); + } + if (error = copyout((caddr_t)&filehead, where, sizeof(filehead))) + return (error); + buflen -= sizeof(filehead); + where += sizeof(filehead); + + /* + * followed by an array of file structures + */ + for (fp = filehead; fp != NULL; fp = fp->f_filef) { + if (buflen < sizeof(struct file)) { + *sizep = where - start; + return (ENOMEM); + } + if (error = copyout((caddr_t)fp, where, sizeof (struct file))) + return (error); + buflen -= sizeof(struct file); + where += sizeof(struct file); + } + *sizep = where - start; + return (0); +} + +/* + * try over estimating by 5 procs + */ +#define KERN_PROCSLOP (5 * sizeof (struct kinfo_proc)) + +sysctl_doproc(name, namelen, where, sizep) + int *name; + u_int namelen; + char *where; + size_t *sizep; +{ + register struct proc *p; + register struct kinfo_proc *dp = (struct kinfo_proc *)where; + register int needed = 0; + int buflen = where != NULL ? *sizep : 0; + int doingzomb; + struct eproc eproc; + int error = 0; + + if (namelen != 2 && !(namelen == 1 && name[0] == KERN_PROC_ALL)) + return (EINVAL); + p = (struct proc *)allproc; + doingzomb = 0; +again: + for (; p != NULL; p = p->p_next) { + /* + * Skip embryonic processes. 
+ */ + if (p->p_stat == SIDL) + continue; + /* + * TODO - make more efficient (see notes below). + * do by session. + */ + switch (name[0]) { + + case KERN_PROC_PID: + /* could do this with just a lookup */ + if (p->p_pid != (pid_t)name[1]) + continue; + break; + + case KERN_PROC_PGRP: + /* could do this by traversing pgrp */ + if (p->p_pgrp->pg_id != (pid_t)name[1]) + continue; + break; + + case KERN_PROC_TTY: + if ((p->p_flag & P_CONTROLT) == 0 || + p->p_session->s_ttyp == NULL || + p->p_session->s_ttyp->t_dev != (dev_t)name[1]) + continue; + break; + + case KERN_PROC_UID: + if (p->p_ucred->cr_uid != (uid_t)name[1]) + continue; + break; + + case KERN_PROC_RUID: + if (p->p_cred->p_ruid != (uid_t)name[1]) + continue; + break; + } + if (buflen >= sizeof(struct kinfo_proc)) { + fill_eproc(p, &eproc); + if (error = copyout((caddr_t)p, &dp->kp_proc, + sizeof(struct proc))) + return (error); + if (error = copyout((caddr_t)&eproc, &dp->kp_eproc, + sizeof(eproc))) + return (error); + dp++; + buflen -= sizeof(struct kinfo_proc); + } + needed += sizeof(struct kinfo_proc); + } + if (doingzomb == 0) { + p = zombproc; + doingzomb++; + goto again; + } + if (where != NULL) { + *sizep = (caddr_t)dp - where; + if (needed > *sizep) + return (ENOMEM); + } else { + needed += KERN_PROCSLOP; + *sizep = needed; + } + return (0); +} + +/* + * Fill in an eproc structure for the specified process. 
+ */ +void +fill_eproc(p, ep) + register struct proc *p; + register struct eproc *ep; +{ + register struct tty *tp; + + ep->e_paddr = p; + ep->e_sess = p->p_pgrp->pg_session; + ep->e_pcred = *p->p_cred; + ep->e_ucred = *p->p_ucred; + if (p->p_stat == SIDL || p->p_stat == SZOMB) { + ep->e_vm.vm_rssize = 0; + ep->e_vm.vm_tsize = 0; + ep->e_vm.vm_dsize = 0; + ep->e_vm.vm_ssize = 0; +#ifndef sparc + /* ep->e_vm.vm_pmap = XXX; */ +#endif + } else { + register struct vmspace *vm = p->p_vmspace; + +#ifdef pmap_resident_count + ep->e_vm.vm_rssize = pmap_resident_count(&vm->vm_pmap); /*XXX*/ +#else + ep->e_vm.vm_rssize = vm->vm_rssize; +#endif + ep->e_vm.vm_tsize = vm->vm_tsize; + ep->e_vm.vm_dsize = vm->vm_dsize; + ep->e_vm.vm_ssize = vm->vm_ssize; +#ifndef sparc + ep->e_vm.vm_pmap = vm->vm_pmap; +#endif + } + if (p->p_pptr) + ep->e_ppid = p->p_pptr->p_pid; + else + ep->e_ppid = 0; + ep->e_pgid = p->p_pgrp->pg_id; + ep->e_jobc = p->p_pgrp->pg_jobc; + if ((p->p_flag & P_CONTROLT) && + (tp = ep->e_sess->s_ttyp)) { + ep->e_tdev = tp->t_dev; + ep->e_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; + ep->e_tsess = tp->t_session; + } else + ep->e_tdev = NODEV; + ep->e_flag = ep->e_sess->s_ttyvp ? 
EPROC_CTTY : 0; + if (SESS_LEADER(p)) + ep->e_flag |= EPROC_SLEADER; + if (p->p_wmesg) + strncpy(ep->e_wmesg, p->p_wmesg, WMESGLEN); + ep->e_xsize = ep->e_xrssize = 0; + ep->e_xccount = ep->e_xswrss = 0; +} + +#ifdef COMPAT_43 +#include +#define KINFO_PROC (0<<8) +#define KINFO_RT (1<<8) +#define KINFO_VNODE (2<<8) +#define KINFO_FILE (3<<8) +#define KINFO_METER (4<<8) +#define KINFO_LOADAVG (5<<8) +#define KINFO_CLOCKRATE (6<<8) + +struct getkerninfo_args { + int op; + char *where; + int *size; + int arg; +}; + +ogetkerninfo(p, uap, retval) + struct proc *p; + register struct getkerninfo_args *uap; + int *retval; +{ + int error, name[5]; + u_int size; + + if (uap->size && + (error = copyin((caddr_t)uap->size, (caddr_t)&size, sizeof(size)))) + return (error); + + switch (uap->op & 0xff00) { + + case KINFO_RT: + name[0] = PF_ROUTE; + name[1] = 0; + name[2] = (uap->op & 0xff0000) >> 16; + name[3] = uap->op & 0xff; + name[4] = uap->arg; + error = net_sysctl(name, 5, uap->where, &size, NULL, 0, p); + break; + + case KINFO_VNODE: + name[0] = KERN_VNODE; + error = kern_sysctl(name, 1, uap->where, &size, NULL, 0, p); + break; + + case KINFO_PROC: + name[0] = KERN_PROC; + name[1] = uap->op & 0xff; + name[2] = uap->arg; + error = kern_sysctl(name, 3, uap->where, &size, NULL, 0, p); + break; + + case KINFO_FILE: + name[0] = KERN_FILE; + error = kern_sysctl(name, 1, uap->where, &size, NULL, 0, p); + break; + + case KINFO_METER: + name[0] = VM_METER; + error = vm_sysctl(name, 1, uap->where, &size, NULL, 0, p); + break; + + case KINFO_LOADAVG: + name[0] = VM_LOADAVG; + error = vm_sysctl(name, 1, uap->where, &size, NULL, 0, p); + break; + + case KINFO_CLOCKRATE: + name[0] = KERN_CLOCKRATE; + error = kern_sysctl(name, 1, uap->where, &size, NULL, 0, p); + break; + + default: + return (EOPNOTSUPP); + } + if (error) + return (error); + *retval = size; + if (uap->size) + error = copyout((caddr_t)&size, (caddr_t)uap->size, + sizeof(size)); + return (error); +} +#endif /* COMPAT_43 */ 
diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c new file mode 100644 index 00000000000..f42900cb75d --- /dev/null +++ b/sys/kern/kern_tc.c @@ -0,0 +1,528 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef GPROF +#include +#endif + +/* + * Clock handling routines. + * + * This code is written to operate with two timers that run independently of + * each other. The main clock, running hz times per second, is used to keep + * track of real time. The second timer handles kernel and user profiling, + * and does resource use estimation. If the second timer is programmable, + * it is randomized to avoid aliasing between the two clocks. For example, + * the randomization prevents an adversary from always giving up the cpu + * just before its quantum expires. Otherwise, it would never accumulate + * cpu ticks. The mean frequency of the second timer is stathz. + * + * If no second timer exists, stathz will be zero; in this case we drive + * profiling and statistics off the main clock. This WILL NOT be accurate; + * do not do it unless absolutely necessary. + * + * The statistics clock may (or may not) be run at a higher rate while + * profiling. This profile clock runs at profhz. We require that profhz + * be an integral multiple of stathz. + * + * If the statistics clock is running fast, it must be divided by the ratio + * profhz/stathz for statistics. (For profiling, every tick counts.) + */ + +/* + * TODO: + * allocate more timeout table slots when table overflows. 
+ */ + +/* + * Bump a timeval by a small number of usec's. + */ +#define BUMPTIME(t, usec) { \ + register volatile struct timeval *tp = (t); \ + register long us; \ + \ + tp->tv_usec = us = tp->tv_usec + (usec); \ + if (us >= 1000000) { \ + tp->tv_usec = us - 1000000; \ + tp->tv_sec++; \ + } \ +} + +int stathz; +int profhz; +int profprocs; +int ticks; +static int psdiv, pscnt; /* prof => stat divider */ +int psratio; /* ratio: prof / stat */ + +volatile struct timeval time; +volatile struct timeval mono_time; + +/* + * Initialize clock frequencies and start both clocks running. + */ +void +initclocks() +{ + register int i; + + /* + * Set divisors to 1 (normal case) and let the machine-specific + * code do its bit. + */ + psdiv = pscnt = 1; + cpu_initclocks(); + + /* + * Compute profhz/stathz, and fix profhz if needed. + */ + i = stathz ? stathz : hz; + if (profhz == 0) + profhz = i; + psratio = profhz / i; +} + +/* + * The real-time timer, interrupting hz times per second. + */ +void +hardclock(frame) + register struct clockframe *frame; +{ + register struct callout *p1; + register struct proc *p; + register int delta, needsoft; + extern int tickdelta; + extern long timedelta; + + /* + * Update real-time timeout queue. + * At front of queue are some number of events which are ``due''. + * The time to these is <= 0 and if negative represents the + * number of ticks which have passed since it was supposed to happen. + * The rest of the q elements (times > 0) are events yet to happen, + * where the time for each is given as a delta from the previous. + * Decrementing just the first of these serves to decrement the time + * to all events. + */ + needsoft = 0; + for (p1 = calltodo.c_next; p1 != NULL; p1 = p1->c_next) { + if (--p1->c_time > 0) + break; + needsoft = 1; + if (p1->c_time == 0) + break; + } + + p = curproc; + if (p) { + register struct pstats *pstats; + + /* + * Run current process's virtual and profile time, as needed. 
+ */ + pstats = p->p_stats; + if (CLKF_USERMODE(frame) && + timerisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) + psignal(p, SIGVTALRM); + if (timerisset(&pstats->p_timer[ITIMER_PROF].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) + psignal(p, SIGPROF); + } + + /* + * If no separate statistics clock is available, run it from here. + */ + if (stathz == 0) + statclock(frame); + + /* + * Increment the time-of-day. The increment is just ``tick'' unless + * we are still adjusting the clock; see adjtime(). + */ + ticks++; + if (timedelta == 0) + delta = tick; + else { + delta = tick + tickdelta; + timedelta -= tickdelta; + } + BUMPTIME(&time, delta); + BUMPTIME(&mono_time, delta); + + /* + * Process callouts at a very low cpu priority, so we don't keep the + * relatively high clock interrupt priority any longer than necessary. + */ + if (needsoft) { + if (CLKF_BASEPRI(frame)) { + /* + * Save the overhead of a software interrupt; + * it will happen as soon as we return, so do it now. + */ + (void)splsoftclock(); + softclock(); + } else + setsoftclock(); + } +} + +/* + * Software (low priority) clock interrupt. + * Run periodic events from timeout queue. + */ +/*ARGSUSED*/ +void +softclock() +{ + register struct callout *c; + register void *arg; + register void (*func) __P((void *)); + register int s; + + s = splhigh(); + while ((c = calltodo.c_next) != NULL && c->c_time <= 0) { + func = c->c_func; + arg = c->c_arg; + calltodo.c_next = c->c_next; + c->c_next = callfree; + callfree = c; + splx(s); + (*func)(arg); + (void) splhigh(); + } + splx(s); +} + +/* + * timeout -- + * Execute a function after a specified length of time. + * + * untimeout -- + * Cancel previous timeout function call. + * + * See AT&T BCI Driver Reference Manual for specification. 
This + * implementation differs from that one in that no identification + * value is returned from timeout, rather, the original arguments + * to timeout are used to identify entries for untimeout. + */ +void +timeout(ftn, arg, ticks) + void (*ftn) __P((void *)); + void *arg; + register int ticks; +{ + register struct callout *new, *p, *t; + register int s; + + if (ticks <= 0) + ticks = 1; + + /* Lock out the clock. */ + s = splhigh(); + + /* Fill in the next free callout structure. */ + if (callfree == NULL) + panic("timeout table full"); + new = callfree; + callfree = new->c_next; + new->c_arg = arg; + new->c_func = ftn; + + /* + * The time for each event is stored as a difference from the time + * of the previous event on the queue. Walk the queue, correcting + * the ticks argument for queue entries passed. Correct the ticks + * value for the queue entry immediately after the insertion point + * as well. Watch out for negative c_time values; these represent + * overdue events. + */ + for (p = &calltodo; + (t = p->c_next) != NULL && ticks > t->c_time; p = t) + if (t->c_time > 0) + ticks -= t->c_time; + new->c_time = ticks; + if (t != NULL) + t->c_time -= ticks; + + /* Insert the new entry into the queue. */ + p->c_next = new; + new->c_next = t; + splx(s); +} + +void +untimeout(ftn, arg) + void (*ftn) __P((void *)); + void *arg; +{ + register struct callout *p, *t; + register int s; + + s = splhigh(); + for (p = &calltodo; (t = p->c_next) != NULL; p = t) + if (t->c_func == ftn && t->c_arg == arg) { + /* Increment next entry's tick count. */ + if (t->c_next && t->c_time > 0) + t->c_next->c_time += t->c_time; + + /* Move entry from callout queue to callfree queue. */ + p->c_next = t->c_next; + t->c_next = callfree; + callfree = t; + break; + } + splx(s); +} + +/* + * Compute number of hz until specified time. Used to + * compute third argument to timeout() from an absolute time. 
+ */ +int +hzto(tv) + struct timeval *tv; +{ + register long ticks, sec; + int s; + + /* + * If number of milliseconds will fit in 32 bit arithmetic, + * then compute number of milliseconds to time and scale to + * ticks. Otherwise just compute number of hz in time, rounding + * times greater than representible to maximum value. + * + * Delta times less than 25 days can be computed ``exactly''. + * Maximum value for any timeout in 10ms ticks is 250 days. + */ + s = splhigh(); + sec = tv->tv_sec - time.tv_sec; + if (sec <= 0x7fffffff / 1000 - 1000) + ticks = ((tv->tv_sec - time.tv_sec) * 1000 + + (tv->tv_usec - time.tv_usec) / 1000) / (tick / 1000); + else if (sec <= 0x7fffffff / hz) + ticks = sec * hz; + else + ticks = 0x7fffffff; + splx(s); + return (ticks); +} + +/* + * Start profiling on a process. + * + * Kernel profiling passes proc0 which never exits and hence + * keeps the profile clock running constantly. + */ +void +startprofclock(p) + register struct proc *p; +{ + int s; + + if ((p->p_flag & P_PROFIL) == 0) { + p->p_flag |= P_PROFIL; + if (++profprocs == 1 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = psratio; + setstatclockrate(profhz); + splx(s); + } + } +} + +/* + * Stop profiling on a process. + */ +void +stopprofclock(p) + register struct proc *p; +{ + int s; + + if (p->p_flag & P_PROFIL) { + p->p_flag &= ~P_PROFIL; + if (--profprocs == 0 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = 1; + setstatclockrate(stathz); + splx(s); + } + } +} + +int dk_ndrive = DK_NDRIVE; + +/* + * Statistics clock. Grab profile sample, and if divider reaches 0, + * do process and kernel statistics. 
+ */ +void +statclock(frame) + register struct clockframe *frame; +{ +#ifdef GPROF + register struct gmonparam *g; +#endif + register struct proc *p; + register int i; + + if (CLKF_USERMODE(frame)) { + p = curproc; + if (p->p_flag & P_PROFIL) + addupc_intr(p, CLKF_PC(frame), 1); + if (--pscnt > 0) + return; + /* + * Came from user mode; CPU was in user state. + * If this process is being profiled record the tick. + */ + p->p_uticks++; + if (p->p_nice > NZERO) + cp_time[CP_NICE]++; + else + cp_time[CP_USER]++; + } else { +#ifdef GPROF + /* + * Kernel statistics are just like addupc_intr, only easier. + */ + g = &_gmonparam; + if (g->state == GMON_PROF_ON) { + i = CLKF_PC(frame) - g->lowpc; + if (i < g->textsize) { + i /= HISTFRACTION * sizeof(*g->kcount); + g->kcount[i]++; + } + } +#endif + if (--pscnt > 0) + return; + /* + * Came from kernel mode, so we were: + * - handling an interrupt, + * - doing syscall or trap work on behalf of the current + * user process, or + * - spinning in the idle loop. + * Whichever it is, charge the time as appropriate. + * Note that we charge interrupts to the current process, + * regardless of whether they are ``for'' that process, + * so that we know how much of its real time was spent + * in ``non-process'' (i.e., interrupt) work. + */ + p = curproc; + if (CLKF_INTR(frame)) { + if (p != NULL) + p->p_iticks++; + cp_time[CP_INTR]++; + } else if (p != NULL) { + p->p_sticks++; + cp_time[CP_SYS]++; + } else + cp_time[CP_IDLE]++; + } + pscnt = psdiv; + + /* + * We maintain statistics shown by user-level statistics + * programs: the amount of time in each cpu state, and + * the amount of time each of DK_NDRIVE ``drives'' is busy. + * + * XXX should either run linked list of drives, or (better) + * grab timestamps in the start & done code. + */ + for (i = 0; i < DK_NDRIVE; i++) + if (dk_busy & (1 << i)) + dk_time[i]++; + + /* + * We adjust the priority of the current process. 
The priority of + * a process gets worse as it accumulates CPU time. The cpu usage + * estimator (p_estcpu) is increased here. The formula for computing + * priorities (in kern_synch.c) will compute a different value each + * time p_estcpu increases by 4. The cpu usage estimator ramps up + * quite quickly when the process is running (linearly), and decays + * away exponentially, at a rate which is proportionally slower when + * the system is busy. The basic principal is that the system will + * 90% forget that the process used a lot of CPU time in 5 * loadav + * seconds. This causes the system to favor processes which haven't + * run much recently, and to round-robin among other processes. + */ + if (p != NULL) { + p->p_cpticks++; + if (++p->p_estcpu == 0) + p->p_estcpu--; + if ((p->p_estcpu & 3) == 0) { + resetpriority(p); + if (p->p_priority >= PUSER) + p->p_priority = p->p_usrpri; + } + } +} + +/* + * Return information about system clocks. + */ +sysctl_clockrate(where, sizep) + register char *where; + size_t *sizep; +{ + struct clockinfo clkinfo; + + /* + * Construct clockinfo structure. + */ + clkinfo.hz = hz; + clkinfo.tick = tick; + clkinfo.profhz = profhz; + clkinfo.stathz = stathz ? stathz : hz; + return (sysctl_rdstruct(where, sizep, NULL, &clkinfo, sizeof(clkinfo))); +} diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c new file mode 100644 index 00000000000..4dadcb8e0b9 --- /dev/null +++ b/sys/kern/kern_time.c @@ -0,0 +1,416 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_time.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include + +#include + +/* + * Time of day and interval timer support. + * + * These routines provide the kernel entry points to get and set + * the time-of-day and per-process interval timers. Subroutines + * here provide support for adding and subtracting timeval structures + * and decrementing interval timers, optionally reloading the interval + * timers when they expire. 
+ */ + +struct gettimeofday_args { + struct timeval *tp; + struct timezone *tzp; +}; +/* ARGSUSED */ +gettimeofday(p, uap, retval) + struct proc *p; + register struct gettimeofday_args *uap; + int *retval; +{ + struct timeval atv; + int error = 0; + + if (uap->tp) { + microtime(&atv); + if (error = copyout((caddr_t)&atv, (caddr_t)uap->tp, + sizeof (atv))) + return (error); + } + if (uap->tzp) + error = copyout((caddr_t)&tz, (caddr_t)uap->tzp, + sizeof (tz)); + return (error); +} + +struct settimeofday_args { + struct timeval *tv; + struct timezone *tzp; +}; +/* ARGSUSED */ +settimeofday(p, uap, retval) + struct proc *p; + struct settimeofday_args *uap; + int *retval; +{ + struct timeval atv, delta; + struct timezone atz; + int error, s; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + /* Verify all parameters before changing time. */ + if (uap->tv && + (error = copyin((caddr_t)uap->tv, (caddr_t)&atv, sizeof(atv)))) + return (error); + if (uap->tzp && + (error = copyin((caddr_t)uap->tzp, (caddr_t)&atz, sizeof(atz)))) + return (error); + if (uap->tv) { + /* WHAT DO WE DO ABOUT PENDING REAL-TIME TIMEOUTS??? */ + s = splclock(); + /* nb. delta.tv_usec may be < 0, but this is OK here */ + delta.tv_sec = atv.tv_sec - time.tv_sec; + delta.tv_usec = atv.tv_usec - time.tv_usec; + time = atv; + (void) splsoftclock(); + timevaladd(&boottime, &delta); + timevalfix(&boottime); + timevaladd(&runtime, &delta); + timevalfix(&runtime); + LEASE_UPDATETIME(delta.tv_sec); + splx(s); + resettodr(); + } + if (uap->tzp) + tz = atz; + return (0); +} + +extern int tickadj; /* "standard" clock skew, us./tick */ +int tickdelta; /* current clock skew, us. per tick */ +long timedelta; /* unapplied time correction, us. */ +long bigadj = 1000000; /* use 10x skew above bigadj us. 
*/ + +struct adjtime_args { + struct timeval *delta; + struct timeval *olddelta; +}; +/* ARGSUSED */ +adjtime(p, uap, retval) + struct proc *p; + register struct adjtime_args *uap; + int *retval; +{ + struct timeval atv; + register long ndelta, ntickdelta, odelta; + int s, error; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + if (error = + copyin((caddr_t)uap->delta, (caddr_t)&atv, sizeof(struct timeval))) + return (error); + + /* + * Compute the total correction and the rate at which to apply it. + * Round the adjustment down to a whole multiple of the per-tick + * delta, so that after some number of incremental changes in + * hardclock(), tickdelta will become zero, lest the correction + * overshoot and start taking us away from the desired final time. + */ + ndelta = atv.tv_sec * 1000000 + atv.tv_usec; + if (ndelta > bigadj) + ntickdelta = 10 * tickadj; + else + ntickdelta = tickadj; + if (ndelta % ntickdelta) + ndelta = ndelta / ntickdelta * ntickdelta; + + /* + * To make hardclock()'s job easier, make the per-tick delta negative + * if we want time to run slower; then hardclock can simply compute + * tick + tickdelta, and subtract tickdelta from timedelta. + */ + if (ndelta < 0) + ntickdelta = -ntickdelta; + s = splclock(); + odelta = timedelta; + timedelta = ndelta; + tickdelta = ntickdelta; + splx(s); + + if (uap->olddelta) { + atv.tv_sec = odelta / 1000000; + atv.tv_usec = odelta % 1000000; + (void) copyout((caddr_t)&atv, (caddr_t)uap->olddelta, + sizeof(struct timeval)); + } + return (0); +} + +/* + * Get value of an interval timer. The process virtual and + * profiling virtual time timers are kept in the p_stats area, since + * they can be swapped out. These are kept internally in the + * way they are specified externally: in time until they expire. 
+ * + * The real time interval timer is kept in the process table slot + * for the process, and its value (it_value) is kept as an + * absolute time rather than as a delta, so that it is easy to keep + * periodic real-time signals from drifting. + * + * Virtual time timers are processed in the hardclock() routine of + * kern_clock.c. The real time timer is processed by a timeout + * routine, called from the softclock() routine. Since a callout + * may be delayed in real time due to interrupt processing in the system, + * it is possible for the real time timeout routine (realitexpire, given below), + * to be delayed in real time past when it is supposed to occur. It + * does not suffice, therefore, to reload the real timer .it_value from the + * real time timers .it_interval. Rather, we compute the next time in + * absolute time the timer should go off. + */ +struct getitimer_args { + u_int which; + struct itimerval *itv; +}; +/* ARGSUSED */ +getitimer(p, uap, retval) + struct proc *p; + register struct getitimer_args *uap; + int *retval; +{ + struct itimerval aitv; + int s; + + if (uap->which > ITIMER_PROF) + return (EINVAL); + s = splclock(); + if (uap->which == ITIMER_REAL) { + /* + * Convert from absoulte to relative time in .it_value + * part of real time timer. If time for real time timer + * has passed return 0, else return difference between + * current time and time for the timer to go off. 
+ */ + aitv = p->p_realtimer; + if (timerisset(&aitv.it_value)) + if (timercmp(&aitv.it_value, &time, <)) + timerclear(&aitv.it_value); + else + timevalsub(&aitv.it_value, + (struct timeval *)&time); + } else + aitv = p->p_stats->p_timer[uap->which]; + splx(s); + return (copyout((caddr_t)&aitv, (caddr_t)uap->itv, + sizeof (struct itimerval))); +} + +struct setitimer_args { + u_int which; + struct itimerval *itv, *oitv; +}; +/* ARGSUSED */ +setitimer(p, uap, retval) + struct proc *p; + register struct setitimer_args *uap; + int *retval; +{ + struct itimerval aitv; + register struct itimerval *itvp; + int s, error; + + if (uap->which > ITIMER_PROF) + return (EINVAL); + itvp = uap->itv; + if (itvp && (error = copyin((caddr_t)itvp, (caddr_t)&aitv, + sizeof(struct itimerval)))) + return (error); + if ((uap->itv = uap->oitv) && (error = getitimer(p, uap, retval))) + return (error); + if (itvp == 0) + return (0); + if (itimerfix(&aitv.it_value) || itimerfix(&aitv.it_interval)) + return (EINVAL); + s = splclock(); + if (uap->which == ITIMER_REAL) { + untimeout(realitexpire, (caddr_t)p); + if (timerisset(&aitv.it_value)) { + timevaladd(&aitv.it_value, (struct timeval *)&time); + timeout(realitexpire, (caddr_t)p, hzto(&aitv.it_value)); + } + p->p_realtimer = aitv; + } else + p->p_stats->p_timer[uap->which] = aitv; + splx(s); + return (0); +} + +/* + * Real interval timer expired: + * send process whose timer expired an alarm signal. + * If time is not set up to reload, then just return. + * Else compute next time timer should go off which is > current time. + * This is where delay in processing this timeout causes multiple + * SIGALRM calls to be compressed into one. 
+ */ +void +realitexpire(arg) + void *arg; +{ + register struct proc *p; + int s; + + p = (struct proc *)arg; + psignal(p, SIGALRM); + if (!timerisset(&p->p_realtimer.it_interval)) { + timerclear(&p->p_realtimer.it_value); + return; + } + for (;;) { + s = splclock(); + timevaladd(&p->p_realtimer.it_value, + &p->p_realtimer.it_interval); + if (timercmp(&p->p_realtimer.it_value, &time, >)) { + timeout(realitexpire, (caddr_t)p, + hzto(&p->p_realtimer.it_value)); + splx(s); + return; + } + splx(s); + } +} + +/* + * Check that a proposed value to load into the .it_value or + * .it_interval part of an interval timer is acceptable, and + * fix it to have at least minimal value (i.e. if it is less + * than the resolution of the clock, round it up.) + */ +itimerfix(tv) + struct timeval *tv; +{ + + if (tv->tv_sec < 0 || tv->tv_sec > 100000000 || + tv->tv_usec < 0 || tv->tv_usec >= 1000000) + return (EINVAL); + if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick) + tv->tv_usec = tick; + return (0); +} + +/* + * Decrement an interval timer by a specified number + * of microseconds, which must be less than a second, + * i.e. < 1000000. If the timer expires, then reload + * it. In this case, carry over (usec - old value) to + * reduce the value reloaded into the timer so that + * the timer does not drift. This routine assumes + * that it is called in a context where the timers + * on which it is operating cannot change in value. 
+ */ +itimerdecr(itp, usec) + register struct itimerval *itp; + int usec; +{ + + if (itp->it_value.tv_usec < usec) { + if (itp->it_value.tv_sec == 0) { + /* expired, and already in next interval */ + usec -= itp->it_value.tv_usec; + goto expire; + } + itp->it_value.tv_usec += 1000000; + itp->it_value.tv_sec--; + } + itp->it_value.tv_usec -= usec; + usec = 0; + if (timerisset(&itp->it_value)) + return (1); + /* expired, exactly at end of interval */ +expire: + if (timerisset(&itp->it_interval)) { + itp->it_value = itp->it_interval; + itp->it_value.tv_usec -= usec; + if (itp->it_value.tv_usec < 0) { + itp->it_value.tv_usec += 1000000; + itp->it_value.tv_sec--; + } + } else + itp->it_value.tv_usec = 0; /* sec is already 0 */ + return (0); +} + +/* + * Add and subtract routines for timevals. + * N.B.: subtract routine doesn't deal with + * results which are before the beginning, + * it just gets very confused in this case. + * Caveat emptor. + */ +timevaladd(t1, t2) + struct timeval *t1, *t2; +{ + + t1->tv_sec += t2->tv_sec; + t1->tv_usec += t2->tv_usec; + timevalfix(t1); +} + +timevalsub(t1, t2) + struct timeval *t1, *t2; +{ + + t1->tv_sec -= t2->tv_sec; + t1->tv_usec -= t2->tv_usec; + timevalfix(t1); +} + +timevalfix(t1) + struct timeval *t1; +{ + + if (t1->tv_usec < 0) { + t1->tv_sec--; + t1->tv_usec += 1000000; + } + if (t1->tv_usec >= 1000000) { + t1->tv_sec++; + t1->tv_usec -= 1000000; + } +} diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c new file mode 100644 index 00000000000..f42900cb75d --- /dev/null +++ b/sys/kern/kern_timeout.c @@ -0,0 +1,528 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. 
and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef GPROF +#include +#endif + +/* + * Clock handling routines. + * + * This code is written to operate with two timers that run independently of + * each other. The main clock, running hz times per second, is used to keep + * track of real time. The second timer handles kernel and user profiling, + * and does resource use estimation. If the second timer is programmable, + * it is randomized to avoid aliasing between the two clocks. For example, + * the randomization prevents an adversary from always giving up the cpu + * just before its quantum expires. Otherwise, it would never accumulate + * cpu ticks. The mean frequency of the second timer is stathz. + * + * If no second timer exists, stathz will be zero; in this case we drive + * profiling and statistics off the main clock. This WILL NOT be accurate; + * do not do it unless absolutely necessary. + * + * The statistics clock may (or may not) be run at a higher rate while + * profiling. This profile clock runs at profhz. We require that profhz + * be an integral multiple of stathz. + * + * If the statistics clock is running fast, it must be divided by the ratio + * profhz/stathz for statistics. (For profiling, every tick counts.) + */ + +/* + * TODO: + * allocate more timeout table slots when table overflows. + */ + +/* + * Bump a timeval by a small number of usec's. 
+ */ +#define BUMPTIME(t, usec) { \ + register volatile struct timeval *tp = (t); \ + register long us; \ + \ + tp->tv_usec = us = tp->tv_usec + (usec); \ + if (us >= 1000000) { \ + tp->tv_usec = us - 1000000; \ + tp->tv_sec++; \ + } \ +} + +int stathz; +int profhz; +int profprocs; +int ticks; +static int psdiv, pscnt; /* prof => stat divider */ +int psratio; /* ratio: prof / stat */ + +volatile struct timeval time; +volatile struct timeval mono_time; + +/* + * Initialize clock frequencies and start both clocks running. + */ +void +initclocks() +{ + register int i; + + /* + * Set divisors to 1 (normal case) and let the machine-specific + * code do its bit. + */ + psdiv = pscnt = 1; + cpu_initclocks(); + + /* + * Compute profhz/stathz, and fix profhz if needed. + */ + i = stathz ? stathz : hz; + if (profhz == 0) + profhz = i; + psratio = profhz / i; +} + +/* + * The real-time timer, interrupting hz times per second. + */ +void +hardclock(frame) + register struct clockframe *frame; +{ + register struct callout *p1; + register struct proc *p; + register int delta, needsoft; + extern int tickdelta; + extern long timedelta; + + /* + * Update real-time timeout queue. + * At front of queue are some number of events which are ``due''. + * The time to these is <= 0 and if negative represents the + * number of ticks which have passed since it was supposed to happen. + * The rest of the q elements (times > 0) are events yet to happen, + * where the time for each is given as a delta from the previous. + * Decrementing just the first of these serves to decrement the time + * to all events. + */ + needsoft = 0; + for (p1 = calltodo.c_next; p1 != NULL; p1 = p1->c_next) { + if (--p1->c_time > 0) + break; + needsoft = 1; + if (p1->c_time == 0) + break; + } + + p = curproc; + if (p) { + register struct pstats *pstats; + + /* + * Run current process's virtual and profile time, as needed. 
+ */ + pstats = p->p_stats; + if (CLKF_USERMODE(frame) && + timerisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) + psignal(p, SIGVTALRM); + if (timerisset(&pstats->p_timer[ITIMER_PROF].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) + psignal(p, SIGPROF); + } + + /* + * If no separate statistics clock is available, run it from here. + */ + if (stathz == 0) + statclock(frame); + + /* + * Increment the time-of-day. The increment is just ``tick'' unless + * we are still adjusting the clock; see adjtime(). + */ + ticks++; + if (timedelta == 0) + delta = tick; + else { + delta = tick + tickdelta; + timedelta -= tickdelta; + } + BUMPTIME(&time, delta); + BUMPTIME(&mono_time, delta); + + /* + * Process callouts at a very low cpu priority, so we don't keep the + * relatively high clock interrupt priority any longer than necessary. + */ + if (needsoft) { + if (CLKF_BASEPRI(frame)) { + /* + * Save the overhead of a software interrupt; + * it will happen as soon as we return, so do it now. + */ + (void)splsoftclock(); + softclock(); + } else + setsoftclock(); + } +} + +/* + * Software (low priority) clock interrupt. + * Run periodic events from timeout queue. + */ +/*ARGSUSED*/ +void +softclock() +{ + register struct callout *c; + register void *arg; + register void (*func) __P((void *)); + register int s; + + s = splhigh(); + while ((c = calltodo.c_next) != NULL && c->c_time <= 0) { + func = c->c_func; + arg = c->c_arg; + calltodo.c_next = c->c_next; + c->c_next = callfree; + callfree = c; + splx(s); + (*func)(arg); + (void) splhigh(); + } + splx(s); +} + +/* + * timeout -- + * Execute a function after a specified length of time. + * + * untimeout -- + * Cancel previous timeout function call. + * + * See AT&T BCI Driver Reference Manual for specification. 
This + * implementation differs from that one in that no identification + * value is returned from timeout, rather, the original arguments + * to timeout are used to identify entries for untimeout. + */ +void +timeout(ftn, arg, ticks) + void (*ftn) __P((void *)); + void *arg; + register int ticks; +{ + register struct callout *new, *p, *t; + register int s; + + if (ticks <= 0) + ticks = 1; + + /* Lock out the clock. */ + s = splhigh(); + + /* Fill in the next free callout structure. */ + if (callfree == NULL) + panic("timeout table full"); + new = callfree; + callfree = new->c_next; + new->c_arg = arg; + new->c_func = ftn; + + /* + * The time for each event is stored as a difference from the time + * of the previous event on the queue. Walk the queue, correcting + * the ticks argument for queue entries passed. Correct the ticks + * value for the queue entry immediately after the insertion point + * as well. Watch out for negative c_time values; these represent + * overdue events. + */ + for (p = &calltodo; + (t = p->c_next) != NULL && ticks > t->c_time; p = t) + if (t->c_time > 0) + ticks -= t->c_time; + new->c_time = ticks; + if (t != NULL) + t->c_time -= ticks; + + /* Insert the new entry into the queue. */ + p->c_next = new; + new->c_next = t; + splx(s); +} + +void +untimeout(ftn, arg) + void (*ftn) __P((void *)); + void *arg; +{ + register struct callout *p, *t; + register int s; + + s = splhigh(); + for (p = &calltodo; (t = p->c_next) != NULL; p = t) + if (t->c_func == ftn && t->c_arg == arg) { + /* Increment next entry's tick count. */ + if (t->c_next && t->c_time > 0) + t->c_next->c_time += t->c_time; + + /* Move entry from callout queue to callfree queue. */ + p->c_next = t->c_next; + t->c_next = callfree; + callfree = t; + break; + } + splx(s); +} + +/* + * Compute number of hz until specified time. Used to + * compute third argument to timeout() from an absolute time. 
+ */ +int +hzto(tv) + struct timeval *tv; +{ + register long ticks, sec; + int s; + + /* + * If number of milliseconds will fit in 32 bit arithmetic, + * then compute number of milliseconds to time and scale to + * ticks. Otherwise just compute number of hz in time, rounding + * times greater than representible to maximum value. + * + * Delta times less than 25 days can be computed ``exactly''. + * Maximum value for any timeout in 10ms ticks is 250 days. + */ + s = splhigh(); + sec = tv->tv_sec - time.tv_sec; + if (sec <= 0x7fffffff / 1000 - 1000) + ticks = ((tv->tv_sec - time.tv_sec) * 1000 + + (tv->tv_usec - time.tv_usec) / 1000) / (tick / 1000); + else if (sec <= 0x7fffffff / hz) + ticks = sec * hz; + else + ticks = 0x7fffffff; + splx(s); + return (ticks); +} + +/* + * Start profiling on a process. + * + * Kernel profiling passes proc0 which never exits and hence + * keeps the profile clock running constantly. + */ +void +startprofclock(p) + register struct proc *p; +{ + int s; + + if ((p->p_flag & P_PROFIL) == 0) { + p->p_flag |= P_PROFIL; + if (++profprocs == 1 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = psratio; + setstatclockrate(profhz); + splx(s); + } + } +} + +/* + * Stop profiling on a process. + */ +void +stopprofclock(p) + register struct proc *p; +{ + int s; + + if (p->p_flag & P_PROFIL) { + p->p_flag &= ~P_PROFIL; + if (--profprocs == 0 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = 1; + setstatclockrate(stathz); + splx(s); + } + } +} + +int dk_ndrive = DK_NDRIVE; + +/* + * Statistics clock. Grab profile sample, and if divider reaches 0, + * do process and kernel statistics. 
+ */ +void +statclock(frame) + register struct clockframe *frame; +{ +#ifdef GPROF + register struct gmonparam *g; +#endif + register struct proc *p; + register int i; + + if (CLKF_USERMODE(frame)) { + p = curproc; + if (p->p_flag & P_PROFIL) + addupc_intr(p, CLKF_PC(frame), 1); + if (--pscnt > 0) + return; + /* + * Came from user mode; CPU was in user state. + * If this process is being profiled record the tick. + */ + p->p_uticks++; + if (p->p_nice > NZERO) + cp_time[CP_NICE]++; + else + cp_time[CP_USER]++; + } else { +#ifdef GPROF + /* + * Kernel statistics are just like addupc_intr, only easier. + */ + g = &_gmonparam; + if (g->state == GMON_PROF_ON) { + i = CLKF_PC(frame) - g->lowpc; + if (i < g->textsize) { + i /= HISTFRACTION * sizeof(*g->kcount); + g->kcount[i]++; + } + } +#endif + if (--pscnt > 0) + return; + /* + * Came from kernel mode, so we were: + * - handling an interrupt, + * - doing syscall or trap work on behalf of the current + * user process, or + * - spinning in the idle loop. + * Whichever it is, charge the time as appropriate. + * Note that we charge interrupts to the current process, + * regardless of whether they are ``for'' that process, + * so that we know how much of its real time was spent + * in ``non-process'' (i.e., interrupt) work. + */ + p = curproc; + if (CLKF_INTR(frame)) { + if (p != NULL) + p->p_iticks++; + cp_time[CP_INTR]++; + } else if (p != NULL) { + p->p_sticks++; + cp_time[CP_SYS]++; + } else + cp_time[CP_IDLE]++; + } + pscnt = psdiv; + + /* + * We maintain statistics shown by user-level statistics + * programs: the amount of time in each cpu state, and + * the amount of time each of DK_NDRIVE ``drives'' is busy. + * + * XXX should either run linked list of drives, or (better) + * grab timestamps in the start & done code. + */ + for (i = 0; i < DK_NDRIVE; i++) + if (dk_busy & (1 << i)) + dk_time[i]++; + + /* + * We adjust the priority of the current process. 
The priority of + * a process gets worse as it accumulates CPU time. The cpu usage + * estimator (p_estcpu) is increased here. The formula for computing + * priorities (in kern_synch.c) will compute a different value each + * time p_estcpu increases by 4. The cpu usage estimator ramps up + * quite quickly when the process is running (linearly), and decays + * away exponentially, at a rate which is proportionally slower when + * the system is busy. The basic principal is that the system will + * 90% forget that the process used a lot of CPU time in 5 * loadav + * seconds. This causes the system to favor processes which haven't + * run much recently, and to round-robin among other processes. + */ + if (p != NULL) { + p->p_cpticks++; + if (++p->p_estcpu == 0) + p->p_estcpu--; + if ((p->p_estcpu & 3) == 0) { + resetpriority(p); + if (p->p_priority >= PUSER) + p->p_priority = p->p_usrpri; + } + } +} + +/* + * Return information about system clocks. + */ +sysctl_clockrate(where, sizep) + register char *where; + size_t *sizep; +{ + struct clockinfo clkinfo; + + /* + * Construct clockinfo structure. + */ + clkinfo.hz = hz; + clkinfo.tick = tick; + clkinfo.profhz = profhz; + clkinfo.stathz = stathz ? stathz : hz; + return (sysctl_rdstruct(where, sizep, NULL, &clkinfo, sizeof(clkinfo))); +} diff --git a/sys/kern/kern_xxx.c b/sys/kern/kern_xxx.c new file mode 100644 index 00000000000..64fac9105d7 --- /dev/null +++ b/sys/kern/kern_xxx.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_xxx.c 8.2 (Berkeley) 11/14/93 + */ + +#include +#include +#include +#include +#include +#include +#include + +struct reboot_args { + int opt; +}; +/* ARGSUSED */ +reboot(p, uap, retval) + struct proc *p; + struct reboot_args *uap; + int *retval; +{ + int error; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + boot(uap->opt); + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + +struct gethostname_args { + char *hostname; + u_int len; +}; +/* ARGSUSED */ +ogethostname(p, uap, retval) + struct proc *p; + struct gethostname_args *uap; + int *retval; +{ + int name; + + name = KERN_HOSTNAME; + return (kern_sysctl(&name, 1, uap->hostname, &uap->len, 0, 0)); +} + +struct sethostname_args { + char *hostname; + u_int len; +}; +/* ARGSUSED */ +osethostname(p, uap, retval) + struct proc *p; + register struct sethostname_args *uap; + int *retval; +{ + int name; + int error; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + name = KERN_HOSTNAME; + return (kern_sysctl(&name, 1, 0, 0, uap->hostname, uap->len)); +} + +extern long hostid; + +struct gethostid_args { + int dummy; +}; +/* ARGSUSED */ +ogethostid(p, uap, retval) + struct proc *p; + struct gethostid_args *uap; + int *retval; +{ + + *(long *)retval = hostid; + return (0); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +#ifdef COMPAT_43 +struct sethostid_args { + long hostid; +}; +/* ARGSUSED */ +osethostid(p, uap, retval) + struct proc *p; + struct sethostid_args *uap; + int *retval; +{ + int error; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + hostid = uap->hostid; + return (0); +} + +oquota() +{ + + return (ENOSYS); +} +#endif /* COMPAT_43 */ diff --git a/sys/kern/makesyscalls.sh b/sys/kern/makesyscalls.sh new file mode 100644 index 00000000000..0ddea0c28fa --- /dev/null +++ b/sys/kern/makesyscalls.sh @@ -0,0 +1,171 @@ +#! 
/bin/sh -
#	@(#)makesyscalls.sh	8.1 (Berkeley) 6/10/93
#
# Generate the system call switch table, the system call name table and
# the system call number header from a master description file.
# Usage: makesyscalls.sh input-file

set -e

# name of compat option:
compat=COMPAT_43

# output files:
sysnames="syscalls.c"
syshdr="../sys/syscall.h"
syssw="init_sysent.c"

# tmp files:
sysdcl="sysent.dcl"
syscompat="sysent.compat"
sysent="sysent.switch"

# -f: do not complain at exit about temp files that were never created
# (for example when the usage check below fails).
trap "rm -f $sysdcl $syscompat $sysent" 0

case $# in
    0)	echo "Usage: $0 input-file" 1>&2
	exit 1
	;;
esac

# The first (double-quoted) part of the awk program lets the shell
# substitute the file names; the rest is single-quoted.
awk < "$1" "
	BEGIN {
		sysdcl = \"$sysdcl\"
		syscompat = \"$syscompat\"
		sysent = \"$sysent\"
		sysnames = \"$sysnames\"
		syshdr = \"$syshdr\"
		compat = \"$compat\"
		infile = \"$1\"
		"'

		printf "/*\n * System call switch table.\n *\n" > sysdcl
		printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysdcl

		printf "\n#ifdef %s\n", compat > syscompat
		printf "#define compat(n, name) n, __CONCAT(o,name)\n\n" > syscompat

		printf "/*\n * System call names.\n *\n" > sysnames
		printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames

		printf "/*\n * System call numbers.\n *\n" > syshdr
		printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshdr
	}
	# First input line: version string, echoed into each generated header.
	NR == 1 {
		printf " * created from%s\n */\n\n", $0 > sysdcl
		# NOTE(review): the header names in the next two strings were
		# lost in extraction; restored from the 4.4BSD original, confirm.
		printf "#include <sys/param.h>\n" > sysdcl
		printf "#include <sys/systm.h>\n\n" > sysdcl
		printf "int\tnosys();\n\n" > sysdcl

		printf "struct sysent sysent[] = {\n" > sysent

		printf " * created from%s\n */\n\n", $0 > sysnames
		printf "char *syscallnames[] = {\n" > sysnames

		printf " * created from%s\n */\n\n", $0 > syshdr
		next
	}
	# Blank lines and ";" comments are skipped.
	NF == 0 || $1 ~ /^;/ {
		next
	}
	# Preprocessor lines are copied through; #if remembers the current
	# syscall number so that the matching #else can rewind to it.
	$1 ~ /^#[ ]*if/ {
		print > sysent
		print > sysdcl
		print > syscompat
		print > sysnames
		savesyscall = syscall
		next
	}
	$1 ~ /^#[ ]*else/ {
		print > sysent
		print > sysdcl
		print > syscompat
		print > sysnames
		syscall = savesyscall
		next
	}
	$1 ~ /^#/ {
		print > sysent
		print > sysdcl
		print > syscompat
		print > sysnames
		next
	}
	# Every remaining line must carry the expected syscall number.
	syscall != $1 {
		printf "%s: line %d: syscall number out of sync at %d\n", \
		   infile, NR, syscall
		printf "line is:\n"
		print
		exit 1
	}
	# comment = fields 4..NF joined; field 5 defaults to field 4.
	{	comment = $4
		for (i = 5; i <= NF; i++)
			comment = comment " " $i
		if (NF < 5)
			$5 = $4
	}
	$2 == "STD" {
		printf("int\t%s();\n", $4) > sysdcl
		printf("\t{ %d, %s },\t\t\t/* %d = %s */\n", \
		    $3, $4, syscall, $5) > sysent
		printf("\t\"%s\",\t\t\t/* %d = %s */\n", \
		    $5, syscall, $5) > sysnames
		printf("#define\tSYS_%s\t%d\n", \
		    $5, syscall) > syshdr
		syscall++
		next
	}
	$2 == "COMPAT" {
		printf("int\to%s();\n", $4) > syscompat
		printf("\t{ compat(%d,%s) },\t\t/* %d = old %s */\n", \
		    $3, $4, syscall, $5) > sysent
		printf("\t\"old.%s\",\t\t/* %d = old %s */\n", \
		    $5, syscall, $5) > sysnames
		printf("\t\t\t\t/* %d is old %s */\n", \
		    syscall, comment) > syshdr
		syscall++
		next
	}
	$2 == "LIBCOMPAT" {
		printf("int\to%s();\n", $4) > syscompat
		printf("\t{ compat(%d,%s) },\t\t/* %d = old %s */\n", \
		    $3, $4, syscall, $5) > sysent
		printf("\t\"old.%s\",\t\t/* %d = old %s */\n", \
		    $5, syscall, $5) > sysnames
		printf("#define\tSYS_%s\t%d\t/* compatibility; still used by libc */\n", \
		    $5, syscall) > syshdr
		syscall++
		next
	}
	$2 == "OBSOL" {
		printf("\t{ 0, nosys },\t\t\t/* %d = obsolete %s */\n", \
		    syscall, comment) > sysent
		printf("\t\"obs_%s\",\t\t\t/* %d = obsolete %s */\n", \
		    $4, syscall, comment) > sysnames
		printf("\t\t\t\t/* %d is obsolete %s */\n", \
		    syscall, comment) > syshdr
		syscall++
		next
	}
	$2 == "UNIMPL" {
		printf("\t{ 0, nosys },\t\t\t/* %d = %s */\n", \
		    syscall, comment) > sysent
		printf("\t\"#%d\",\t\t\t/* %d = %s */\n", \
		    syscall, syscall, comment) > sysnames
		syscall++
		next
	}
	{
		printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $2
		exit 1
	}
	END {
		printf("\n#else /* %s */\n", compat) > syscompat
		printf("#define compat(n, name) 0, nosys\n") > syscompat
		printf("#endif /* %s */\n\n", compat) > syscompat

		printf("};\n\n") > sysent
		printf("int\tnsysent = sizeof(sysent) / sizeof(sysent[0]);\n") > sysent

		printf("};\n") > sysnames
	} '

cat $sysdcl $syscompat $sysent >$syssw

chmod 444 $sysnames $syshdr $syssw
diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c
new file mode 100644
index 00000000000..af17988c935
--- /dev/null
+++ b/sys/kern/subr_autoconf.c
@@ -0,0 +1,342 @@
/*
 * Copyright (c) 1992, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This software was developed by the Computer Systems Engineering group
 * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
 * contributed to Berkeley.
 *
 * All advertising materials mentioning features or use of this software
 * must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Lawrence Berkeley Laboratories.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_autoconf.c 8.1 (Berkeley) 6/10/93 + * + * from: $Header: subr_autoconf.c,v 1.12 93/02/01 19:31:48 torek Exp $ (LBL) + */ + +#include +#include +#include + +/* + * Autoconfiguration subroutines. + */ + +/* + * ioconf.c exports exactly two names: cfdata and cfroots. All system + * devices and drivers are found via these tables. + */ +extern struct cfdata cfdata[]; +extern short cfroots[]; + +#define ROOT ((struct device *)NULL) + +struct matchinfo { + cfmatch_t fn; + struct device *parent; + void *aux; + struct cfdata *match; + int pri; +}; + +/* + * Apply the matching function and choose the best. This is used + * a few times and we want to keep the code small. + */ +static void +mapply(m, cf) + register struct matchinfo *m; + register struct cfdata *cf; +{ + register int pri; + + if (m->fn != NULL) + pri = (*m->fn)(m->parent, cf, m->aux); + else + pri = (*cf->cf_driver->cd_match)(m->parent, cf, m->aux); + if (pri > m->pri) { + m->match = cf; + m->pri = pri; + } +} + +/* + * Iterate over all potential children of some device, calling the given + * function (default being the child's match function) for each one. 
+ * Nonzero returns are matches; the highest value returned is considered + * the best match. Return the `found child' if we got a match, or NULL + * otherwise. The `aux' pointer is simply passed on through. + * + * Note that this function is designed so that it can be used to apply + * an arbitrary function to all potential children (its return value + * can be ignored). + */ +struct cfdata * +config_search(fn, parent, aux) + cfmatch_t fn; + register struct device *parent; + void *aux; +{ + register struct cfdata *cf; + register short *p; + struct matchinfo m; + + m.fn = fn; + m.parent = parent; + m.aux = aux; + m.match = NULL; + m.pri = 0; + for (cf = cfdata; cf->cf_driver; cf++) { + /* + * Skip cf if no longer eligible, otherwise scan through + * parents for one matching `parent', and try match function. + */ + if (cf->cf_fstate == FSTATE_FOUND) + continue; + for (p = cf->cf_parents; *p >= 0; p++) + if (parent->dv_cfdata == &cfdata[*p]) + mapply(&m, cf); + } + return (m.match); +} + +/* + * Find the given root device. + * This is much like config_search, but there is no parent. + */ +struct cfdata * +config_rootsearch(fn, rootname, aux) + register cfmatch_t fn; + register char *rootname; + register void *aux; +{ + register struct cfdata *cf; + register short *p; + struct matchinfo m; + + m.fn = fn; + m.parent = ROOT; + m.aux = aux; + m.match = NULL; + m.pri = 0; + /* + * Look at root entries for matching name. We do not bother + * with found-state here since only one root should ever be + * searched (and it must be done first). + */ + for (p = cfroots; *p >= 0; p++) { + cf = &cfdata[*p]; + if (strcmp(cf->cf_driver->cd_name, rootname) == 0) + mapply(&m, cf); + } + return (m.match); +} + +static char *msgs[3] = { "", " not configured\n", " unsupported\n" }; + +/* + * The given `aux' argument describes a device that has been found + * on the given parent, but not necessarily configured. 
Locate the + * configuration data for that device (using the cd_match configuration + * driver function) and attach it, and return true. If the device was + * not configured, call the given `print' function and return 0. + */ +int +config_found(parent, aux, print) + struct device *parent; + void *aux; + cfprint_t print; +{ + struct cfdata *cf; + + if ((cf = config_search((cfmatch_t)NULL, parent, aux)) != NULL) { + config_attach(parent, cf, aux, print); + return (1); + } + printf(msgs[(*print)(aux, parent->dv_xname)]); + return (0); +} + +/* + * As above, but for root devices. + */ +int +config_rootfound(rootname, aux) + char *rootname; + void *aux; +{ + struct cfdata *cf; + + if ((cf = config_rootsearch((cfmatch_t)NULL, rootname, aux)) != NULL) { + config_attach(ROOT, cf, aux, (cfprint_t)NULL); + return (1); + } + printf("root device %s not configured\n", rootname); + return (0); +} + +/* just like sprintf(buf, "%d") except that it works from the end */ +static char * +number(ep, n) + register char *ep; + register int n; +{ + + *--ep = 0; + while (n >= 10) { + *--ep = (n % 10) + '0'; + n /= 10; + } + *--ep = n + '0'; + return (ep); +} + +/* + * Attach a found device. Allocates memory for device variables. 
+ */ +void +config_attach(parent, cf, aux, print) + register struct device *parent; + register struct cfdata *cf; + register void *aux; + cfprint_t print; +{ + register struct device *dev; + register struct cfdriver *cd; + register size_t lname, lunit; + register char *xunit; + int myunit; + char num[10]; + static struct device **nextp = &alldevs; + + cd = cf->cf_driver; + if (cd->cd_devsize < sizeof(struct device)) + panic("config_attach"); + myunit = cf->cf_unit; + if (cf->cf_fstate == FSTATE_NOTFOUND) + cf->cf_fstate = FSTATE_FOUND; + else + cf->cf_unit++; + + /* compute length of name and decimal expansion of unit number */ + lname = strlen(cd->cd_name); + xunit = number(&num[sizeof num], myunit); + lunit = &num[sizeof num] - xunit; + if (lname + lunit >= sizeof(dev->dv_xname)) + panic("config_attach: device name too long"); + + /* get memory for all device vars */ + dev = (struct device *)malloc(cd->cd_devsize, M_DEVBUF, M_WAITOK); + /* XXX cannot wait! */ + bzero(dev, cd->cd_devsize); + *nextp = dev; /* link up */ + nextp = &dev->dv_next; + dev->dv_class = cd->cd_class; + dev->dv_cfdata = cf; + dev->dv_unit = myunit; + bcopy(cd->cd_name, dev->dv_xname, lname); + bcopy(xunit, dev->dv_xname + lname, lunit); + dev->dv_parent = parent; + if (parent == ROOT) + printf("%s (root)", dev->dv_xname); + else { + printf("%s at %s", dev->dv_xname, parent->dv_xname); + (void) (*print)(aux, (char *)0); + } + + /* put this device in the devices array */ + if (dev->dv_unit >= cd->cd_ndevs) { + /* + * Need to expand the array. 
+ */ + int old = cd->cd_ndevs, oldbytes, new, newbytes; + void **nsp; + + if (old == 0) { + nsp = malloc(MINALLOCSIZE, M_DEVBUF, M_WAITOK); /*XXX*/ + bzero(nsp, MINALLOCSIZE); + cd->cd_ndevs = MINALLOCSIZE / sizeof(void *); + } else { + new = cd->cd_ndevs; + do { + new *= 2; + } while (new <= dev->dv_unit); + cd->cd_ndevs = new; + oldbytes = old * sizeof(void *); + newbytes = new * sizeof(void *); + nsp = malloc(newbytes, M_DEVBUF, M_WAITOK); /*XXX*/ + bcopy(cd->cd_devs, nsp, oldbytes); + bzero(&nsp[old], newbytes - oldbytes); + free(cd->cd_devs, M_DEVBUF); + } + cd->cd_devs = nsp; + } + if (cd->cd_devs[dev->dv_unit]) + panic("config_attach: duplicate %s", dev->dv_xname); + cd->cd_devs[dev->dv_unit] = dev; + + /* + * Before attaching, clobber any unfound devices that are + * otherwise identical. + */ + for (cf = cfdata; cf->cf_driver; cf++) + if (cf->cf_driver == cd && cf->cf_unit == dev->dv_unit && + cf->cf_fstate == FSTATE_NOTFOUND) + cf->cf_fstate = FSTATE_FOUND; + (*cd->cd_attach)(parent, dev, aux); +} + +/* + * Attach an event. These must come from initially-zero space (see + * commented-out assignments below), but that occurs naturally for + * device instance variables. + */ +void +evcnt_attach(dev, name, ev) + struct device *dev; + const char *name; + struct evcnt *ev; +{ + static struct evcnt **nextp = &allevents; + +#ifdef DIAGNOSTIC + if (strlen(name) >= sizeof(ev->ev_name)) + panic("evcnt_attach"); +#endif + /* ev->ev_next = NULL; */ + ev->ev_dev = dev; + /* ev->ev_count = 0; */ + strcpy(ev->ev_name, name); + *nextp = ev; + nextp = &ev->ev_next; +} diff --git a/sys/kern/subr_clist.c b/sys/kern/subr_clist.c new file mode 100644 index 00000000000..fe8f000f87d --- /dev/null +++ b/sys/kern/subr_clist.c @@ -0,0 +1,159 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. 
+ * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)tty_subr.c 8.2 (Berkeley) 9/5/93 + */ + +#include +#include +#include + +char cwaiting; +struct cblock *cfree, *cfreelist; +int cfreecount, nclist; + +void +clist_init() +{ + + /* + * Body deleted. + */ + return; +} + +getc(a1) + struct clist *a1; +{ + + /* + * Body deleted. + */ + return ((char)0); +} + +q_to_b(a1, a2, a3) + struct clist *a1; + char *a2; + int a3; +{ + + /* + * Body deleted. + */ + return (0); +} + +ndqb(a1, a2) + struct clist *a1; + int a2; +{ + + /* + * Body deleted. + */ + return (0); +} + +void +ndflush(a1, a2) + struct clist *a1; + int a2; +{ + + /* + * Body deleted. + */ + return; +} + +putc(a1, a2) + char a1; + struct clist *a2; +{ + + /* + * Body deleted. + */ + return (0); +} + +b_to_q(a1, a2, a3) + char *a1; + int a2; + struct clist *a3; +{ + + /* + * Body deleted. + */ + return (0); +} + +char * +nextc(a1, a2, a3) + struct clist *a1; + char *a2; + int *a3; +{ + + /* + * Body deleted. + */ + return ((char *)0); +} + +unputc(a1) + struct clist *a1; +{ + + /* + * Body deleted. + */ + return ((char)0); +} + +void +catq(a1, a2) + struct clist *a1, *a2; +{ + + /* + * Body deleted. + */ + return; +} diff --git a/sys/kern/subr_disklabel.c b/sys/kern/subr_disklabel.c new file mode 100644 index 00000000000..78dede4da77 --- /dev/null +++ b/sys/kern/subr_disklabel.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. 
+ * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_disksubr.c	8.5 (Berkeley) 1/21/94
 */

/*
 * NOTE(review): the header names on the #include lines below were lost
 * when this text was extracted; restore them from the original file.
 */
#include
#include
#include
#include
#include

/*
 * Seek sort for disks.  We depend on the driver which calls us using b_resid
 * as the current cylinder number.
 *
 * The argument ap structure holds a b_actf activity chain pointer on which we
 * keep two queues, sorted in ascending cylinder order.  The first queue holds
 * those requests which are positioned after the current cylinder (in the first
 * request); the second holds requests which came in after their cylinder number
 * was passed.  Thus we implement a one way scan, retracting after reaching the
 * end of the drive to the first request on the second queue, at which time it
 * becomes the first queue.
 *
 * A one-way scan is natural because of the way UNIX read-ahead blocks are
 * allocated.
 */

/*
 * For portability with historic industry practice, the
 * cylinder number has to be maintained in the `b_resid'
 * field.
 */
#define	b_cylinder	b_resid

/*
 * Insert request bp into the queue headed by ap->b_actf.
 *
 *	ap	queue head; only its b_actf link is used here
 *	bp	request to insert; its b_cylinder (and b_blkno, used to
 *		order requests on the same cylinder) must already be set
 *
 * The new request is always linked in after some existing element
 * (or as the only element of an empty queue), never ahead of the
 * first request.
 */
void
disksort(ap, bp)
	register struct buf *ap, *bp;
{
	register struct buf *bq;

	/* If the queue is empty, then it's easy. */
	if (ap->b_actf == NULL) {
		bp->b_actf = NULL;
		ap->b_actf = bp;
		return;
	}

	/*
	 * If we lie before the first (currently active) request, then we
	 * must locate the second request list and add ourselves to it.
	 */
	bq = ap->b_actf;
	if (bp->b_cylinder < bq->b_cylinder) {
		while (bq->b_actf) {
			/*
			 * Check for an ``inversion'' in the normally ascending
			 * cylinder numbers, indicating the start of the second
			 * request list.
			 */
			if (bq->b_actf->b_cylinder < bq->b_cylinder) {
				/*
				 * Search the second request list for the first
				 * request at a larger cylinder number.  We go
				 * before that; if there is no such request, we
				 * go at end.
				 */
				do {
					if (bp->b_cylinder <
					    bq->b_actf->b_cylinder)
						goto insert;
					if (bp->b_cylinder ==
					    bq->b_actf->b_cylinder &&
					    bp->b_blkno < bq->b_actf->b_blkno)
						goto insert;
					bq = bq->b_actf;
				} while (bq->b_actf);
				goto insert;		/* after last */
			}
			bq = bq->b_actf;
		}
		/*
		 * No inversions... we will go after the last, and
		 * be the first request in the second request list.
		 */
		goto insert;
	}
	/*
	 * Request is at/after the current request...
	 * sort in the first request list.
	 */
	while (bq->b_actf) {
		/*
		 * We want to go after the current request if there is an
		 * inversion after it (i.e. it is the end of the first
		 * request list), or if the next request is a larger cylinder
		 * than our request.
		 */
		if (bq->b_actf->b_cylinder < bq->b_cylinder ||
		    bp->b_cylinder < bq->b_actf->b_cylinder ||
		    (bp->b_cylinder == bq->b_actf->b_cylinder &&
		    bp->b_blkno < bq->b_actf->b_blkno))
			goto insert;
		bq = bq->b_actf;
	}
	/*
	 * Neither a second list nor a larger request... we go at the end of
	 * the first list, which is the same as the end of the whole shebang.
	 */
insert:	bp->b_actf = bq->b_actf;	/* link bp in directly after bq */
	bq->b_actf = bp;
}

/*
 * Attempt to read a disk label from a device using the indicated strategy
 * routine.  The label must be partly set up before this: secpercyl and
 * anything required in the strategy routine (e.g., sector size) must be
 * filled in before calling us.  Returns NULL on success and an error
 * string on failure.
+ */ +char * +readdisklabel(dev, strat, lp) + dev_t dev; + int (*strat)(); + register struct disklabel *lp; +{ + register struct buf *bp; + struct disklabel *dlp; + char *msg = NULL; + + if (lp->d_secperunit == 0) + lp->d_secperunit = 0x1fffffff; + lp->d_npartitions = 1; + if (lp->d_partitions[0].p_size == 0) + lp->d_partitions[0].p_size = 0x1fffffff; + lp->d_partitions[0].p_offset = 0; + + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dev; + bp->b_blkno = LABELSECTOR; + bp->b_bcount = lp->d_secsize; + bp->b_flags = B_BUSY | B_READ; + bp->b_cylinder = LABELSECTOR / lp->d_secpercyl; + (*strat)(bp); + if (biowait(bp)) + msg = "I/O error"; + else for (dlp = (struct disklabel *)bp->b_data; + dlp <= (struct disklabel *)((char *)bp->b_data + + DEV_BSIZE - sizeof(*dlp)); + dlp = (struct disklabel *)((char *)dlp + sizeof(long))) { + if (dlp->d_magic != DISKMAGIC || dlp->d_magic2 != DISKMAGIC) { + if (msg == NULL) + msg = "no disk label"; + } else if (dlp->d_npartitions > MAXPARTITIONS || + dkcksum(dlp) != 0) + msg = "disk label corrupted"; + else { + *lp = *dlp; + msg = NULL; + break; + } + } + bp->b_flags = B_INVAL | B_AGE; + brelse(bp); + return (msg); +} + +/* + * Check new disk label for sensibility before setting it. + */ +int +setdisklabel(olp, nlp, openmask) + register struct disklabel *olp, *nlp; + u_long openmask; +{ + register i; + register struct partition *opp, *npp; + + if (nlp->d_magic != DISKMAGIC || nlp->d_magic2 != DISKMAGIC || + dkcksum(nlp) != 0) + return (EINVAL); + while ((i = ffs((long)openmask)) != 0) { + i--; + openmask &= ~(1 << i); + if (nlp->d_npartitions <= i) + return (EBUSY); + opp = &olp->d_partitions[i]; + npp = &nlp->d_partitions[i]; + if (npp->p_offset != opp->p_offset || npp->p_size < opp->p_size) + return (EBUSY); + /* + * Copy internally-set partition information + * if new label doesn't include it. 
XXX + */ + if (npp->p_fstype == FS_UNUSED && opp->p_fstype != FS_UNUSED) { + npp->p_fstype = opp->p_fstype; + npp->p_fsize = opp->p_fsize; + npp->p_frag = opp->p_frag; + npp->p_cpg = opp->p_cpg; + } + } + nlp->d_checksum = 0; + nlp->d_checksum = dkcksum(nlp); + *olp = *nlp; + return (0); +} + +/* encoding of disk minor numbers, should be elsewhere... */ +#define dkunit(dev) (minor(dev) >> 3) +#define dkpart(dev) (minor(dev) & 07) +#define dkminor(unit, part) (((unit) << 3) | (part)) + +/* + * Write disk label back to device after modification. + */ +int +writedisklabel(dev, strat, lp) + dev_t dev; + int (*strat)(); + register struct disklabel *lp; +{ + struct buf *bp; + struct disklabel *dlp; + int labelpart; + int error = 0; + + labelpart = dkpart(dev); + if (lp->d_partitions[labelpart].p_offset != 0) { + if (lp->d_partitions[0].p_offset != 0) + return (EXDEV); /* not quite right */ + labelpart = 0; + } + bp = geteblk((int)lp->d_secsize); + bp->b_dev = makedev(major(dev), dkminor(dkunit(dev), labelpart)); + bp->b_blkno = LABELSECTOR; + bp->b_bcount = lp->d_secsize; + bp->b_flags = B_READ; + (*strat)(bp); + if (error = biowait(bp)) + goto done; + for (dlp = (struct disklabel *)bp->b_data; + dlp <= (struct disklabel *) + ((char *)bp->b_data + lp->d_secsize - sizeof(*dlp)); + dlp = (struct disklabel *)((char *)dlp + sizeof(long))) { + if (dlp->d_magic == DISKMAGIC && dlp->d_magic2 == DISKMAGIC && + dkcksum(dlp) == 0) { + *dlp = *lp; + bp->b_flags = B_WRITE; + (*strat)(bp); + error = biowait(bp); + goto done; + } + } + error = ESRCH; +done: + brelse(bp); + return (error); +} + +/* + * Compute checksum for disk label. + */ +dkcksum(lp) + register struct disklabel *lp; +{ + register u_short *start, *end; + register u_short sum = 0; + + start = (u_short *)lp; + end = (u_short *)&lp->d_partitions[lp->d_npartitions]; + while (start < end) + sum ^= *start++; + return (sum); +} + +/* + * Disk error is the preface to plaintive error messages + * about failing disk transfers. 
It prints messages of the form + +hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) + + * if the offset of the error in the transfer and a disk label + * are both available. blkdone should be -1 if the position of the error + * is unknown; the disklabel pointer may be null from drivers that have not + * been converted to use them. The message is printed with printf + * if pri is LOG_PRINTF, otherwise it uses log at the specified priority. + * The message should be completed (with at least a newline) with printf + * or addlog, respectively. There is no trailing space. + */ +void +diskerr(bp, dname, what, pri, blkdone, lp) + register struct buf *bp; + char *dname, *what; + int pri, blkdone; + register struct disklabel *lp; +{ + int unit = dkunit(bp->b_dev), part = dkpart(bp->b_dev); + register void (*pr) __P((const char *, ...)); + char partname = 'a' + part; + int sn; + + if (pri != LOG_PRINTF) { + log(pri, ""); + pr = addlog; + } else + pr = printf; + (*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what, + bp->b_flags & B_READ ? "read" : "writ"); + sn = bp->b_blkno; + if (bp->b_bcount <= DEV_BSIZE) + (*pr)("%d", sn); + else { + if (blkdone >= 0) { + sn += blkdone; + (*pr)("%d of ", sn); + } + (*pr)("%d-%d", bp->b_blkno, + bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE); + } + if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) { +#ifdef tahoe + sn *= DEV_BSIZE / lp->d_secsize; /* XXX */ +#endif + sn += lp->d_partitions[part].p_offset; + (*pr)(" (%s%d bn %d; cn %d", dname, unit, sn, + sn / lp->d_secpercyl); + sn %= lp->d_secpercyl; + (*pr)(" tn %d sn %d)", sn / lp->d_nsectors, sn % lp->d_nsectors); + } +} diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c new file mode 100644 index 00000000000..f065761d756 --- /dev/null +++ b/sys/kern/subr_log.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_log.c 8.1 (Berkeley) 6/10/93 + */ + +/* + * Error log buffer for kernel printf's. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#define LOG_RDPRI (PZERO + 1) + +#define LOG_ASYNC 0x04 +#define LOG_RDWAIT 0x08 + +struct logsoftc { + int sc_state; /* see above for possibilities */ + struct selinfo sc_selp; /* process waiting on select call */ + int sc_pgid; /* process/group for async I/O */ +} logsoftc; + +int log_open; /* also used in log() */ + +/*ARGSUSED*/ +logopen(dev, flags, mode, p) + dev_t dev; + int flags, mode; + struct proc *p; +{ + register struct msgbuf *mbp = msgbufp; + + if (log_open) + return (EBUSY); + log_open = 1; + logsoftc.sc_pgid = p->p_pid; /* signal process only */ + /* + * Potential race here with putchar() but since putchar should be + * called by autoconf, msg_magic should be initialized by the time + * we get here. + */ + if (mbp->msg_magic != MSG_MAGIC) { + register int i; + + mbp->msg_magic = MSG_MAGIC; + mbp->msg_bufx = mbp->msg_bufr = 0; + for (i=0; i < MSG_BSIZE; i++) + mbp->msg_bufc[i] = 0; + } + return (0); +} + +/*ARGSUSED*/ +logclose(dev, flag, mode, p) + dev_t dev; + int flag, mode; + struct proc *p; +{ + + log_open = 0; + logsoftc.sc_state = 0; + return (0); +} + +/*ARGSUSED*/ +logread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + register struct msgbuf *mbp = msgbufp; + register long l; + register int s; + int error = 0; + + s = splhigh(); + while (mbp->msg_bufr == mbp->msg_bufx) { + if (flag & IO_NDELAY) { + splx(s); + return (EWOULDBLOCK); + } + logsoftc.sc_state |= LOG_RDWAIT; + if (error = tsleep((caddr_t)mbp, LOG_RDPRI | PCATCH, + "klog", 0)) { + splx(s); + return (error); + } + } + splx(s); + logsoftc.sc_state &= ~LOG_RDWAIT; + + while (uio->uio_resid > 0) { + l = mbp->msg_bufx - mbp->msg_bufr; + if (l < 0) + l = MSG_BSIZE - mbp->msg_bufr; + l = min(l, uio->uio_resid); + if (l == 0) + break; + error = uiomove((caddr_t)&mbp->msg_bufc[mbp->msg_bufr], + (int)l, uio); + if (error) + break; + mbp->msg_bufr += l; + if (mbp->msg_bufr < 0 || mbp->msg_bufr >= 
MSG_BSIZE) + mbp->msg_bufr = 0; + } + return (error); +} + +/*ARGSUSED*/ +logselect(dev, rw, p) + dev_t dev; + int rw; + struct proc *p; +{ + int s = splhigh(); + + switch (rw) { + + case FREAD: + if (msgbufp->msg_bufr != msgbufp->msg_bufx) { + splx(s); + return (1); + } + selrecord(p, &logsoftc.sc_selp); + break; + } + splx(s); + return (0); +} + +logwakeup() +{ + struct proc *p; + + if (!log_open) + return; + selwakeup(&logsoftc.sc_selp); + if (logsoftc.sc_state & LOG_ASYNC) { + if (logsoftc.sc_pgid < 0) + gsignal(-logsoftc.sc_pgid, SIGIO); + else if (p = pfind(logsoftc.sc_pgid)) + psignal(p, SIGIO); + } + if (logsoftc.sc_state & LOG_RDWAIT) { + wakeup((caddr_t)msgbufp); + logsoftc.sc_state &= ~LOG_RDWAIT; + } +} + +/*ARGSUSED*/ +logioctl(dev, com, data, flag, p) + dev_t dev; + int com; + caddr_t data; + int flag; + struct proc *p; +{ + long l; + int s; + + switch (com) { + + /* return number of characters immediately available */ + case FIONREAD: + s = splhigh(); + l = msgbufp->msg_bufx - msgbufp->msg_bufr; + splx(s); + if (l < 0) + l += MSG_BSIZE; + *(int *)data = l; + break; + + case FIONBIO: + break; + + case FIOASYNC: + if (*(int *)data) + logsoftc.sc_state |= LOG_ASYNC; + else + logsoftc.sc_state &= ~LOG_ASYNC; + break; + + case TIOCSPGRP: + logsoftc.sc_pgid = *(int *)data; + break; + + case TIOCGPGRP: + *(int *)data = logsoftc.sc_pgid; + break; + + default: + return (-1); + } + return (0); +} diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c new file mode 100644 index 00000000000..9f4e2cae857 --- /dev/null +++ b/sys/kern/subr_param.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 1980, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. 
and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * @(#)param.c 8.2 (Berkeley) 1/21/94
+ */
+
+/* NOTE(review): the header names on these #include lines are missing in this copy. */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#ifdef SYSVSHM
+#include
+#include
+#endif
+
+/*
+ * System parameter formulae.
+ *
+ * This file is copied into each directory where we compile
+ * the kernel; it should be modified there to suit local taste
+ * if necessary.
+ *
+ * Compiled with -DHZ=xx -DTIMEZONE=x -DDST=x -DMAXUSERS=xx
+ */
+
+/* HZ defaults to 100 when the build does not supply -DHZ. */
+#ifndef HZ
+#define HZ 100
+#endif
+int	hz = HZ;
+int	tick = 1000000 / HZ;	/* presumably microseconds per clock tick */
+int	tickadj = 30000 / (60 * HZ);	/* can adjust 30ms in 60s */
+struct	timezone tz = { TIMEZONE, DST };
+/* The table sizes below all scale with MAXUSERS, directly or via NPROC. */
+#define	NPROC (20 + 16 * MAXUSERS)
+int	maxproc = NPROC;
+#define	NTEXT (80 + NPROC / 8)	/* actually the object cache */
+#define	NVNODE (NPROC + NTEXT + 100)
+int	desiredvnodes = NVNODE;
+int	maxfiles = 3 * (NPROC + MAXUSERS) + 80;
+int	ncallout = 16 + NPROC;
+int	nclist = 60 + 12 * MAXUSERS;
+int	nmbclusters = NMBCLUSTERS;
+int	fscale = FSCALE;	/* kernel uses `FSCALE', user uses `fscale' */
+
+/*
+ * Values in support of System V compatible shared memory. XXX
+ */
+#ifdef SYSVSHM
+#define	SHMMAX	(SHMMAXPGS*NBPG)
+#define	SHMMIN	1
+#define	SHMMNI	32	/* <= SHMMMNI in shm.h */
+#define	SHMSEG	8
+#define	SHMALL	(SHMMAXPGS/CLSIZE)
+
+/* Initializer order is SHMMAX, SHMMIN, SHMMNI, SHMSEG, SHMALL. */
+struct	shminfo shminfo = {
+	SHMMAX,
+	SHMMIN,
+	SHMMNI,
+	SHMSEG,
+	SHMALL
+};
+#endif
+
+/*
+ * These are initialized at bootstrap time
+ * to values dependent on memory size
+ */
+int	nbuf, nswbuf;
+
+/*
+ * These have to be allocated somewhere; allocating
+ * them here forces loader errors if this file is omitted
+ * (if they've been externed everywhere else; hah!).
+ */
+struct	callout *callout;
+struct	cblock *cfree;
+struct	buf *buf, *swbuf;
+char	*buffers;
+
+/*
+ * Proc/pgrp hashing.
+ * Here so that hash table sizes can depend on MAXUSERS/NPROC.
+ * Hash size must be a power of two.
+ * NOW omission of this file will cause loader errors!
+ */
+
+/*
+ * PIDHSZ is chosen from NPROC: more than 1024 gives 512, more than 512
+ * gives 256, more than 256 gives 128, anything smaller gives 64.
+ */
+#if NPROC > 1024
+#define	PIDHSZ 512
+#else
+#if NPROC > 512
+#define	PIDHSZ 256
+#else
+#if NPROC > 256
+#define	PIDHSZ 128
+#else
+#define	PIDHSZ 64
+#endif
+#endif
+#endif
+
+struct	proc *pidhash[PIDHSZ];
+struct	pgrp *pgrphash[PIDHSZ];
+int	pidhashmask = PIDHSZ - 1;	/* valid as a mask because PIDHSZ is a power of two */
diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c
new file mode 100644
index 00000000000..2adb7793a3c
--- /dev/null
+++ b/sys/kern/subr_prf.c
@@ -0,0 +1,601 @@
+/*-
+ * Copyright (c) 1986, 1988, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_prf.c 8.3 (Berkeley) 1/21/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Note that stdarg.h and the ANSI style va_start macro is used for both + * ANSI and traditional C compilers. + */ +#include + +#ifdef KADB +#include +#endif + +#define TOCONS 0x01 +#define TOTTY 0x02 +#define TOLOG 0x04 + +struct tty *constty; /* pointer to console "window" tty */ + +extern cnputc(); /* standard console putc */ +int (*v_putc)() = cnputc; /* routine to putc on virtual console */ + +void logpri __P((int level)); +static void putchar __P((int ch, int flags, struct tty *tp)); +static char *ksprintn __P((u_long num, int base, int *len)); +void kprintf __P((const char *fmt, int flags, struct tty *tp, va_list ap)); + +int consintr = 1; /* Ok to handle console interrupts? */ + +/* + * Variable panicstr contains argument to first call to panic; used as flag + * to indicate that the kernel has already called panic. + */ +const char *panicstr; + +/* + * Panic is called on unresolvable fatal errors. It prints "panic: mesg", + * and then reboots. 
If we are called twice, then we avoid trying to sync
+ * the disks as this often leads to recursive panics.
+ */
+#ifdef __GNUC__
+volatile void boot(int flags);	/* boot() does not return */
+volatile			/* panic() does not return */
+#endif
+void
+#ifdef __STDC__
+panic(const char *fmt, ...)
+#else
+panic(fmt, va_alist)
+	char *fmt;
+#endif
+{
+	int bootopt;
+	va_list ap;
+
+	bootopt = RB_AUTOBOOT | RB_DUMP;
+	/* panicstr is set only by the first panic; a later one skips the sync */
+	if (panicstr)
+		bootopt |= RB_NOSYNC;
+	else
+		panicstr = fmt;
+
+	va_start(ap, fmt);
+	/* %r hands the caller's format and argument list to kprintf() */
+	printf("panic: %r\n", fmt, ap);
+	va_end(ap);
+
+#ifdef KGDB
+	kgdb_panic();
+#endif
+#ifdef KADB
+	if (boothowto & RB_KDB)
+		kdbpanic();
+#endif
+	boot(bootopt);
+}
+
+/*
+ * Warn that a system table is full.
+ */
+void
+tablefull(tab)
+	const char *tab;
+{
+
+	log(LOG_ERR, "%s: table is full\n", tab);
+}
+
+/*
+ * Uprintf prints to the controlling terminal for the current process.
+ * It may block if the tty queue is overfull. No message is printed if
+ * the queue does not clear in a reasonable time.
+ */
+void
+#ifdef __STDC__
+uprintf(const char *fmt, ...)
+#else
+uprintf(fmt, va_alist)
+	char *fmt;
+#endif
+{
+	register struct proc *p = curproc;
+	va_list ap;
+
+	/* nothing is printed unless P_CONTROLT is set and the session has a tty vnode */
+	if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) {
+		va_start(ap, fmt);
+		kprintf(fmt, TOTTY, p->p_session->s_ttyp, ap);
+		va_end(ap);
+	}
+}
+
+/*
+ * Obtain a handle for tprintf().  If the process has a controlling
+ * terminal, a hold is taken on its session (SESSHOLD) and the session is
+ * returned as the handle; otherwise the handle is NULL.
+ */
+tpr_t
+tprintf_open(p)
+	register struct proc *p;
+{
+
+	if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) {
+		SESSHOLD(p->p_session);
+		return ((tpr_t) p->p_session);
+	}
+	return ((tpr_t) NULL);
+}
+
+/*
+ * Release a handle returned by tprintf_open(); a NULL handle is ignored.
+ */
+void
+tprintf_close(sess)
+	tpr_t sess;
+{
+
+	if (sess)
+		SESSRELE((struct session *) sess);
+}
+
+/*
+ * tprintf prints on the controlling terminal associated
+ * with the given session.
+ */
+void
+#ifdef __STDC__
+tprintf(tpr_t tpr, const char *fmt, ...)
+#else
+tprintf(tpr, fmt, va_alist)
+	tpr_t tpr;
+	char *fmt;
+#endif
+{
+	register struct session *sess = (struct session *)tpr;
+	struct tty *tp = NULL;
+	int flags = TOLOG;
+	va_list ap;
+
+	/* the message always goes to the log, prefixed with a LOG_INFO priority */
+	logpri(LOG_INFO);
+	/* it also goes to the tty when the session has one and ttycheckoutq() is nonzero */
+	if (sess && sess->s_ttyvp && ttycheckoutq(sess->s_ttyp, 0)) {
+		flags |= TOTTY;
+		tp = sess->s_ttyp;
+	}
+	va_start(ap, fmt);
+	kprintf(fmt, flags, tp, ap);
+	va_end(ap);
+	logwakeup();
+}
+
+/*
+ * Ttyprintf displays a message on a tty; it should be used only by
+ * the tty driver, or anything that knows the underlying tty will not
+ * be revoke(2)'d away. Other callers should use tprintf.
+ */
+void
+#ifdef __STDC__
+ttyprintf(struct tty *tp, const char *fmt, ...)
+#else
+ttyprintf(tp, fmt, va_alist)
+	struct tty *tp;
+	char *fmt;
+#endif
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	kprintf(fmt, TOTTY, tp, ap);
+	va_end(ap);
+}
+
+extern int log_open;
+
+/*
+ * Log writes to the log buffer, and guarantees not to sleep (so can be
+ * called by interrupt routines). If there is no process reading the
+ * log yet, it writes to the console also.
+ */
+void
+#ifdef __STDC__
+log(int level, const char *fmt, ...)
+#else
+log(level, fmt, va_alist)
+	int level;
+	char *fmt;
+#endif
+{
+	register int s;
+	va_list ap;
+
+	/* the priority prefix and the message are written at splhigh */
+	s = splhigh();
+	logpri(level);
+	va_start(ap, fmt);
+	kprintf(fmt, TOLOG, NULL, ap);
+	splx(s);
+	va_end(ap);
+	if (!log_open) {
+		/* the argument list is restarted for the second pass */
+		va_start(ap, fmt);
+		kprintf(fmt, TOCONS, NULL, ap);
+		va_end(ap);
+	}
+	logwakeup();
+}
+
+/*
+ * Write the priority prefix "<level>" (level in decimal) to the log.
+ */
+void
+logpri(level)
+	int level;
+{
+	register int ch;
+	register char *p;
+
+	putchar('<', TOLOG, NULL);
+	/* ksprintn() returns the digits reversed; walk back to the leading NUL */
+	for (p = ksprintn((u_long)level, 10, NULL); ch = *p--;)
+		putchar(ch, TOLOG, NULL);
+	putchar('>', TOLOG, NULL);
+}
+
+/*
+ * Like log(), but without the priority prefix: the message goes to the
+ * log buffer, and to the console too while the log device is not open.
+ */
+void
+#ifdef __STDC__
+addlog(const char *fmt, ...)
+#else
+addlog(fmt, va_alist)
+	char *fmt;
+#endif
+{
+	register int s;
+	va_list ap;
+
+	s = splhigh();
+	va_start(ap, fmt);
+	kprintf(fmt, TOLOG, NULL, ap);
+	splx(s);
+	va_end(ap);
+	if (!log_open) {
+		va_start(ap, fmt);
+		kprintf(fmt, TOCONS, NULL, ap);
+		va_end(ap);
+	}
+	logwakeup();
+}
+
+/*
+ * Kernel printf: the message goes to both the console and the log
+ * buffer.  consintr is cleared for the duration and restored afterwards,
+ * and logwakeup() is skipped once panicstr is set.
+ */
+void
+#ifdef __STDC__
+printf(const char *fmt, ...)
+#else
+printf(fmt, va_alist)
+	char *fmt;
+#endif
+{
+	va_list ap;
+	register int savintr;
+
+	savintr = consintr;		/* disable interrupts */
+	consintr = 0;
+	va_start(ap, fmt);
+	kprintf(fmt, TOCONS | TOLOG, NULL, ap);
+	va_end(ap);
+	if (!panicstr)
+		logwakeup();
+	consintr = savintr;		/* reenable interrupts */
+}
+
+/*
+ * Scaled down version of printf(3).
+ *
+ * Two additional formats:
+ *
+ * The format %b is supported to decode error registers.
+ * Its usage is:
+ *
+ *	printf("reg=%b\n", regval, "<base><arg>*");
+ *
+ * where <base> is the output base expressed as a control character, e.g.
+ * \10 gives octal; \20 gives hex. Each <arg> is a sequence of characters,
+ * the first of which gives the bit number to be inspected (origin 1), and
+ * the next characters (up to a control character, i.e. a character <= 32),
+ * give the name of the register. Thus:
+ *
+ *	kprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n");
+ *
+ * would produce output:
+ *
+ *	reg=3<BITTWO,BITONE>
+ *
+ * The format %r passes an additional format string and argument list
+ * recursively. Its usage is:
+ *
+ * fn(char *fmt, ...)
+ * {
+ *	va_list ap;
+ *	va_start(ap, fmt);
+ *	printf("prefix: %r: suffix\n", fmt, ap);
+ *	va_end(ap);
+ * }
+ *
+ * Space or zero padding and a field width are supported for the numeric
+ * formats only.
+ */ +void +kprintf(fmt, flags, tp, ap) + register const char *fmt; + int flags; + struct tty *tp; + va_list ap; +{ + register char *p, *q; + register int ch, n; + u_long ul; + int base, lflag, tmp, width; + char padc; + + for (;;) { + padc = ' '; + width = 0; + while ((ch = *(u_char *)fmt++) != '%') { + if (ch == '\0') + return; + putchar(ch, flags, tp); + } + lflag = 0; +reswitch: switch (ch = *(u_char *)fmt++) { + case '0': + padc = '0'; + goto reswitch; + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + for (width = 0;; ++fmt) { + width = width * 10 + ch - '0'; + ch = *fmt; + if (ch < '0' || ch > '9') + break; + } + goto reswitch; + case 'l': + lflag = 1; + goto reswitch; + case 'b': + ul = va_arg(ap, int); + p = va_arg(ap, char *); + for (q = ksprintn(ul, *p++, NULL); ch = *q--;) + putchar(ch, flags, tp); + + if (!ul) + break; + + for (tmp = 0; n = *p++;) { + if (ul & (1 << (n - 1))) { + putchar(tmp ? ',' : '<', flags, tp); + for (; (n = *p) > ' '; ++p) + putchar(n, flags, tp); + tmp = 1; + } else + for (; *p > ' '; ++p) + continue; + } + if (tmp) + putchar('>', flags, tp); + break; + case 'c': + putchar(va_arg(ap, int), flags, tp); + break; + case 'r': + p = va_arg(ap, char *); + kprintf(p, flags, tp, va_arg(ap, va_list)); + break; + case 's': + p = va_arg(ap, char *); + while (ch = *p++) + putchar(ch, flags, tp); + break; + case 'd': + ul = lflag ? va_arg(ap, long) : va_arg(ap, int); + if ((long)ul < 0) { + putchar('-', flags, tp); + ul = -(long)ul; + } + base = 10; + goto number; + case 'o': + ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); + base = 8; + goto number; + case 'u': + ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); + base = 10; + goto number; + case 'x': + ul = lflag ? 
va_arg(ap, u_long) : va_arg(ap, u_int); + base = 16; +number: p = ksprintn(ul, base, &tmp); + if (width && (width -= tmp) > 0) + while (width--) + putchar(padc, flags, tp); + while (ch = *p--) + putchar(ch, flags, tp); + break; + default: + putchar('%', flags, tp); + if (lflag) + putchar('l', flags, tp); + /* FALLTHROUGH */ + case '%': + putchar(ch, flags, tp); + } + } +} + +/* + * Print a character on console or users terminal. If destination is + * the console then the last MSGBUFS characters are saved in msgbuf for + * inspection later. + */ +static void +putchar(c, flags, tp) + register int c; + int flags; + struct tty *tp; +{ + extern int msgbufmapped; + register struct msgbuf *mbp; + + if (panicstr) + constty = NULL; + if ((flags & TOCONS) && tp == NULL && constty) { + tp = constty; + flags |= TOTTY; + } + if ((flags & TOTTY) && tp && tputchar(c, tp) < 0 && + (flags & TOCONS) && tp == constty) + constty = NULL; + if ((flags & TOLOG) && + c != '\0' && c != '\r' && c != 0177 && msgbufmapped) { + mbp = msgbufp; + if (mbp->msg_magic != MSG_MAGIC) { + bzero((caddr_t)mbp, sizeof(*mbp)); + mbp->msg_magic = MSG_MAGIC; + } + mbp->msg_bufc[mbp->msg_bufx++] = c; + if (mbp->msg_bufx < 0 || mbp->msg_bufx >= MSG_BSIZE) + mbp->msg_bufx = 0; + } + if ((flags & TOCONS) && constty == NULL && c != '\0') + (*v_putc)(c); +} + +/* + * Scaled down version of sprintf(3). + */ +#ifdef __STDC__ +sprintf(char *buf, const char *cfmt, ...) 
+#else +sprintf(buf, cfmt, va_alist) + char *buf, *cfmt; +#endif +{ + register const char *fmt = cfmt; + register char *p, *bp; + register int ch, base; + u_long ul; + int lflag; + va_list ap; + + va_start(ap, cfmt); + for (bp = buf; ; ) { + while ((ch = *(u_char *)fmt++) != '%') + if ((*bp++ = ch) == '\0') + return ((bp - buf) - 1); + + lflag = 0; +reswitch: switch (ch = *(u_char *)fmt++) { + case 'l': + lflag = 1; + goto reswitch; + case 'c': + *bp++ = va_arg(ap, int); + break; + case 's': + p = va_arg(ap, char *); + while (*bp++ = *p++) + continue; + --bp; + break; + case 'd': + ul = lflag ? va_arg(ap, long) : va_arg(ap, int); + if ((long)ul < 0) { + *bp++ = '-'; + ul = -(long)ul; + } + base = 10; + goto number; + break; + case 'o': + ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); + base = 8; + goto number; + break; + case 'u': + ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); + base = 10; + goto number; + break; + case 'x': + ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); + base = 16; +number: for (p = ksprintn(ul, base, NULL); ch = *p--;) + *bp++ = ch; + break; + default: + *bp++ = '%'; + if (lflag) + *bp++ = 'l'; + /* FALLTHROUGH */ + case '%': + *bp++ = ch; + } + } + va_end(ap); +} + +/* + * Put a number (base <= 16) in a buffer in reverse order; return an + * optional length and a pointer to the NULL terminated (preceded?) + * buffer. + */ +static char * +ksprintn(ul, base, lenp) + register u_long ul; + register int base, *lenp; +{ /* A long in base 8, plus NULL. */ + static char buf[sizeof(long) * NBBY / 3 + 2]; + register char *p; + + p = buf; + do { + *++p = "0123456789abcdef"[ul % base]; + } while (ul /= base); + if (lenp) + *lenp = p - buf; + return (p); +} diff --git a/sys/kern/subr_prof.c b/sys/kern/subr_prof.c new file mode 100644 index 00000000000..4fb81d823ca --- /dev/null +++ b/sys/kern/subr_prof.c @@ -0,0 +1,256 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * @(#)subr_prof.c 8.3 (Berkeley) 9/23/93
+ */
+
+/* NOTE(review): the header names on these #include lines are missing in this copy. */
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef GPROF
+#include
+#include
+
+/*
+ * Froms is actually a bunch of unsigned shorts indexing tos
+ */
+struct gmonparam _gmonparam = { GMON_PROF_OFF };
+
+extern char etext[];
+
+/*
+ * Set up kernel profiling: compute the profiled text range and the sizes
+ * of the kcount, froms and tos tables, then allocate and zero all three
+ * in a single block.  If the allocation fails a message is printed and
+ * the table pointers are left unset.
+ */
+kmstartup()
+{
+	char *cp;
+	struct gmonparam *p = &_gmonparam;
+	/*
+	 * Round lowpc and highpc to multiples of the density we're using
+	 * so the rest of the scaling (here and in gprof) stays in ints.
+	 */
+	p->lowpc = ROUNDDOWN(KERNBASE, HISTFRACTION * sizeof(HISTCOUNTER));
+	p->highpc = ROUNDUP((u_long)etext, HISTFRACTION * sizeof(HISTCOUNTER));
+	p->textsize = p->highpc - p->lowpc;
+	printf("Profiling kernel, textsize=%d [%x..%x]\n",
+	    p->textsize, p->lowpc, p->highpc);
+	p->kcountsize = p->textsize / HISTFRACTION;
+	p->hashfraction = HASHFRACTION;
+	p->fromssize = p->textsize / HASHFRACTION;
+	/* tolimit is ARCDENSITY percent of textsize, clamped to [MINARCS, MAXARCS] */
+	p->tolimit = p->textsize * ARCDENSITY / 100;
+	if (p->tolimit < MINARCS)
+		p->tolimit = MINARCS;
+	else if (p->tolimit > MAXARCS)
+		p->tolimit = MAXARCS;
+	p->tossize = p->tolimit * sizeof(struct tostruct);
+	cp = (char *)malloc(p->kcountsize + p->fromssize + p->tossize,
+	    M_GPROF, M_NOWAIT);
+	if (cp == 0) {
+		printf("No memory for profiling.\n");
+		return;
+	}
+	bzero(cp, p->kcountsize + p->tossize + p->fromssize);
+	/* carve the block up in the order tos, kcount, froms */
+	p->tos = (struct tostruct *)cp;
+	cp += p->tossize;
+	p->kcount = (u_short *)cp;
+	cp += p->kcountsize;
+	p->froms = (u_short *)cp;
+}
+
+/*
+ * Return kernel profiling information.
+ */
+/*
+ * name[0] selects the item: the profiling state, one of the kcount,
+ * froms or tos tables, or the gmonparam structure itself (passed to
+ * sysctl_rdstruct()).  Setting the state also starts or stops the
+ * profiling clock on proc0.
+ * Returns ENOTDIR if namelen is not 1 and EOPNOTSUPP for an unknown name.
+ * The trailing parameter p has no declaration and is not used here.
+ */
+sysctl_doprof(name, namelen, oldp, oldlenp, newp, newlen, p)
+	int *name;
+	u_int namelen;
+	void *oldp;
+	size_t *oldlenp;
+	void *newp;
+	size_t newlen;
+{
+	struct gmonparam *gp = &_gmonparam;
+	int error;
+
+	/* all sysctl names at this level are terminal */
+	if (namelen != 1)
+		return (ENOTDIR);		/* overloaded */
+
+	switch (name[0]) {
+	case GPROF_STATE:
+		error = sysctl_int(oldp, oldlenp, newp, newlen, &gp->state);
+		if (error)
+			return (error);
+		/* bring the profiling clock in line with the (possibly new) state */
+		if (gp->state == GMON_PROF_OFF)
+			stopprofclock(&proc0);
+		else
+			startprofclock(&proc0);
+		return (0);
+	case GPROF_COUNT:
+		return (sysctl_struct(oldp, oldlenp, newp, newlen,
+		    gp->kcount, gp->kcountsize));
+	case GPROF_FROMS:
+		return (sysctl_struct(oldp, oldlenp, newp, newlen,
+		    gp->froms, gp->fromssize));
+	case GPROF_TOS:
+		return (sysctl_struct(oldp, oldlenp, newp, newlen,
+		    gp->tos, gp->tossize));
+	case GPROF_GMONPARAM:
+		return (sysctl_rdstruct(oldp, oldlenp, newp, gp, sizeof *gp));
+	default:
+		return (EOPNOTSUPP);
+	}
+	/* NOTREACHED */
+}
+#endif /* GPROF */
+
+/*
+ * Profiling system call.
+ *
+ * The scale factor is a fixed point number with 16 bits of fraction, so that
+ * 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling.
+ */
+struct profil_args {
+	caddr_t	samples;	/* stored as pr_base */
+	u_int	size;		/* stored as pr_size */
+	u_int	offset;		/* stored as pr_off */
+	u_int	scale;		/* stored as pr_scale; 0 stops profiling */
+};
+/* ARGSUSED */
+profil(p, uap, retval)
+	struct proc *p;
+	register struct profil_args *uap;
+	int *retval;
+{
+	register struct uprof *upp;
+	int s;
+
+	/* a scale above 1.0 (0x10000) is rejected */
+	if (uap->scale > (1 << 16))
+		return (EINVAL);
+	if (uap->scale == 0) {
+		stopprofclock(p);
+		return (0);
+	}
+	upp = &p->p_stats->p_prof;
+
+	/* Block profile interrupts while changing state. */
+	s = splstatclock();
+	upp->pr_off = uap->offset;
+	upp->pr_scale = uap->scale;
+	upp->pr_base = uap->samples;
+	upp->pr_size = uap->size;
+	startprofclock(p);
+	splx(s);
+
+	return (0);
+}
+
+/*
+ * Scale is a fixed-point number with the binary point 16 bits
+ * into the value, and is <= 1.0.
pc is at most 32 bits, so the
+ * intermediate result is at most 48 bits.
+ *
+ * The result is ((pc - pr_off) * pr_scale) >> 16 with the low bit
+ * cleared, so the byte offset it yields is always even.
+ */
+#define	PC_TO_INDEX(pc, prof) \
+	((int)(((u_quad_t)((pc) - (prof)->pr_off) * \
+	    (u_quad_t)((prof)->pr_scale)) >> 16) & ~1)
+
+/*
+ * Collect user-level profiling statistics; called on a profiling tick,
+ * when a process is running in user-mode. This routine may be called
+ * from an interrupt context. We try to update the user profiling buffers
+ * cheaply with fuswintr() and suswintr(). If that fails, we revert to
+ * an AST that will vector us to trap() with a context in which copyin
+ * and copyout will work. Trap will then call addupc_task().
+ *
+ * Note that we may (rarely) not get around to the AST soon enough, and
+ * lose profile ticks when the next tick overwrites this one, but in this
+ * case the system is overloaded and the profile is probably already
+ * inaccurate.
+ */
+void
+addupc_intr(p, pc, ticks)
+	register struct proc *p;
+	register u_long pc;
+	u_int ticks;
+{
+	register struct uprof *prof;
+	register caddr_t addr;
+	register u_int i;
+	register int v;
+
+	if (ticks == 0)
+		return;
+	prof = &p->p_stats->p_prof;
+	if (pc < prof->pr_off ||
+	    (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size)
+		return;			/* out of range; ignore */
+
+	addr = prof->pr_base + i;
+	/* on failure, save pc and ticks and request a profiling tick for later */
+	if ((v = fuswintr(addr)) == -1 || suswintr(addr, v + ticks) == -1) {
+		prof->pr_addr = pc;
+		prof->pr_ticks = ticks;
+		need_proftick(p);
+	}
+}
+
+/*
+ * Much like before, but we can afford to take faults here. If the
+ * update fails, we simply turn off profiling.
+ */
+void
+addupc_task(p, pc, ticks)
+	register struct proc *p;
+	register u_long pc;
+	u_int ticks;
+{
+	register struct uprof *prof;
+	register caddr_t addr;
+	register u_int i;
+	u_short v;
+
+	/* Testing P_PROFIL may be unnecessary, but is certainly safe. */
+	if ((p->p_flag & P_PROFIL) == 0 || ticks == 0)
+		return;
+
+	prof = &p->p_stats->p_prof;
+	if (pc < prof->pr_off ||
+	    (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size)
+		return;
+
+	addr = prof->pr_base + i;
+	/* read-modify-write of one u_short counter in the user buffer */
+	if (copyin(addr, (caddr_t)&v, sizeof(v)) == 0) {
+		v += ticks;
+		if (copyout((caddr_t)&v, addr, sizeof(v)) == 0)
+			return;
+	}
+	/* reached only when copyin or copyout failed */
+	stopprofclock(p);
+}
diff --git a/sys/kern/subr_rmap.c b/sys/kern/subr_rmap.c
new file mode 100644
index 00000000000..2f31173321d
--- /dev/null
+++ b/sys/kern/subr_rmap.c
@@ -0,0 +1,81 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)subr_rmap.c 8.1 (Berkeley) 6/10/93
+ */
+
+/* NOTE(review): the header names on these #include lines are missing in this copy. */
+#include
+#include
+#include
+
+/*
+ * Stub: the body is absent from this copy, so the call does nothing.
+ */
+void
+rminit(a1, a2, a3, a4, a5)
+	struct map *a1;
+	long a2, a3;
+	char *a4;
+	int a5;
+{
+
+	/*
+	 * Body deleted.
+	 */
+	return;
+}
+
+/*
+ * Stub: the body is absent from this copy; always returns 0.
+ */
+long
+rmalloc(a1, a2)
+	struct map *a1;
+	long a2;
+{
+
+	/*
+	 * Body deleted.
+	 */
+	return (0);
+}
+
+/*
+ * Stub: the body is absent from this copy, so the call does nothing.
+ */
+void
+rmfree(a1, a2, a3)
+	struct map *a1;
+	long a2, a3;
+{
+
+	/*
+	 * Body deleted.
+	 */
+	return;
+}
diff --git a/sys/kern/subr_xxx.c b/sys/kern/subr_xxx.c
new file mode 100644
index 00000000000..c692ec11a3b
--- /dev/null
+++ b/sys/kern/subr_xxx.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_xxx.c 8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * Miscellaneous trivial functions, including many
+ * that are often inline-expanded or done in assembler.
+ *
+ * Each function below takes no arguments and is defined without a return
+ * type, so it returns int: an errno value, or 0 for nullop().
+ */
+/* NOTE(review): the header names on these #include lines are missing in this copy. */
+#include
+#include
+
+#include
+
+/*
+ * Unsupported device function (e.g. writing to read-only device).
+ */
+enodev()
+{
+
+	return (ENODEV);
+}
+
+/*
+ * Unconfigured device function; driver not configured.
+ */
+enxio()
+{
+
+	return (ENXIO);
+}
+
+/*
+ * Unsupported ioctl function.
+ * Note that the value returned is ENOTTY.
+ */
+enoioctl()
+{
+
+	return (ENOTTY);
+}
+
+/*
+ * Unsupported system function.
+ * This is used for an otherwise-reasonable operation
+ * that is not supported by the current system binary.
+ */
+enosys()
+{
+
+	return (ENOSYS);
+}
+
+/*
+ * Return error for operation not supported
+ * on a specific object or file type.
+ */
+eopnotsupp()
+{
+
+	return (EOPNOTSUPP);
+}
+
+/*
+ * Generic null operation, always returns success.
+ */
+nullop()
+{
+
+	return (0);
+}
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
new file mode 100644
index 00000000000..a121209f9fe
--- /dev/null
+++ b/sys/kern/sys_generic.c
@@ -0,0 +1,683 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef KTRACE +#include +#endif + +/* + * Read system call. 
+ */ +struct read_args { + int fd; + char *buf; + u_int nbyte; +}; +/* ARGSUSED */ +read(p, uap, retval) + struct proc *p; + register struct read_args *uap; + int *retval; +{ + register struct file *fp; + register struct filedesc *fdp = p->p_fd; + struct uio auio; + struct iovec aiov; + long cnt, error = 0; +#ifdef KTRACE + struct iovec ktriov; +#endif + + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || + (fp->f_flag & FREAD) == 0) + return (EBADF); + aiov.iov_base = (caddr_t)uap->buf; + aiov.iov_len = uap->nbyte; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = uap->nbyte; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; +#ifdef KTRACE + /* + * if tracing, save a copy of iovec + */ + if (KTRPOINT(p, KTR_GENIO)) + ktriov = aiov; +#endif + cnt = uap->nbyte; + if (error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred)) + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + cnt -= auio.uio_resid; +#ifdef KTRACE + if (KTRPOINT(p, KTR_GENIO) && error == 0) + ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktriov, cnt, error); +#endif + *retval = cnt; + return (error); +} + +/* + * Scatter read system call. 
+ */ +struct readv_args { + int fdes; + struct iovec *iovp; + u_int iovcnt; +}; +readv(p, uap, retval) + struct proc *p; + register struct readv_args *uap; + int *retval; +{ + register struct file *fp; + register struct filedesc *fdp = p->p_fd; + struct uio auio; + register struct iovec *iov; + struct iovec *needfree; + struct iovec aiov[UIO_SMALLIOV]; + long i, cnt, error = 0; + u_int iovlen; +#ifdef KTRACE + struct iovec *ktriov = NULL; +#endif + + if (((u_int)uap->fdes) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fdes]) == NULL || + (fp->f_flag & FREAD) == 0) + return (EBADF); + /* note: can't use iovlen until iovcnt is validated */ + iovlen = uap->iovcnt * sizeof (struct iovec); + if (uap->iovcnt > UIO_SMALLIOV) { + if (uap->iovcnt > UIO_MAXIOV) + return (EINVAL); + MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); + needfree = iov; + } else { + iov = aiov; + needfree = NULL; + } + auio.uio_iov = iov; + auio.uio_iovcnt = uap->iovcnt; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + if (error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)) + goto done; + auio.uio_resid = 0; + for (i = 0; i < uap->iovcnt; i++) { + if (iov->iov_len < 0) { + error = EINVAL; + goto done; + } + auio.uio_resid += iov->iov_len; + if (auio.uio_resid < 0) { + error = EINVAL; + goto done; + } + iov++; + } +#ifdef KTRACE + /* + * if tracing, save a copy of iovec + */ + if (KTRPOINT(p, KTR_GENIO)) { + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); + } +#endif + cnt = auio.uio_resid; + if (error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred)) + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + cnt -= auio.uio_resid; +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) + ktrgenio(p->p_tracep, uap->fdes, UIO_READ, ktriov, + cnt, error); + FREE(ktriov, M_TEMP); + } +#endif + *retval = cnt; +done: + if (needfree) + 
FREE(needfree, M_IOV); + return (error); +} + +/* + * Write system call + */ +struct write_args { + int fd; + char *buf; + u_int nbyte; +}; +write(p, uap, retval) + struct proc *p; + register struct write_args *uap; + int *retval; +{ + register struct file *fp; + register struct filedesc *fdp = p->p_fd; + struct uio auio; + struct iovec aiov; + long cnt, error = 0; +#ifdef KTRACE + struct iovec ktriov; +#endif + + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || + (fp->f_flag & FWRITE) == 0) + return (EBADF); + aiov.iov_base = (caddr_t)uap->buf; + aiov.iov_len = uap->nbyte; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = uap->nbyte; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; +#ifdef KTRACE + /* + * if tracing, save a copy of iovec + */ + if (KTRPOINT(p, KTR_GENIO)) + ktriov = aiov; +#endif + cnt = uap->nbyte; + if (error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred)) { + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + if (error == EPIPE) + psignal(p, SIGPIPE); + } + cnt -= auio.uio_resid; +#ifdef KTRACE + if (KTRPOINT(p, KTR_GENIO) && error == 0) + ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, + &ktriov, cnt, error); +#endif + *retval = cnt; + return (error); +} + +/* + * Gather write system call + */ +struct writev_args { + int fd; + struct iovec *iovp; + u_int iovcnt; +}; +writev(p, uap, retval) + struct proc *p; + register struct writev_args *uap; + int *retval; +{ + register struct file *fp; + register struct filedesc *fdp = p->p_fd; + struct uio auio; + register struct iovec *iov; + struct iovec *needfree; + struct iovec aiov[UIO_SMALLIOV]; + long i, cnt, error = 0; + u_int iovlen; +#ifdef KTRACE + struct iovec *ktriov = NULL; +#endif + + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || + (fp->f_flag & FWRITE) == 0) + return (EBADF); + /* note: can't use iovlen until 
iovcnt is validated */ + iovlen = uap->iovcnt * sizeof (struct iovec); + if (uap->iovcnt > UIO_SMALLIOV) { + if (uap->iovcnt > UIO_MAXIOV) + return (EINVAL); + MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); + needfree = iov; + } else { + iov = aiov; + needfree = NULL; + } + auio.uio_iov = iov; + auio.uio_iovcnt = uap->iovcnt; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + if (error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)) + goto done; + auio.uio_resid = 0; + for (i = 0; i < uap->iovcnt; i++) { + if (iov->iov_len < 0) { + error = EINVAL; + goto done; + } + auio.uio_resid += iov->iov_len; + if (auio.uio_resid < 0) { + error = EINVAL; + goto done; + } + iov++; + } +#ifdef KTRACE + /* + * if tracing, save a copy of iovec + */ + if (KTRPOINT(p, KTR_GENIO)) { + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); + } +#endif + cnt = auio.uio_resid; + if (error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred)) { + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + if (error == EPIPE) + psignal(p, SIGPIPE); + } + cnt -= auio.uio_resid; +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) + ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, + ktriov, cnt, error); + FREE(ktriov, M_TEMP); + } +#endif + *retval = cnt; +done: + if (needfree) + FREE(needfree, M_IOV); + return (error); +} + +/* + * Ioctl system call + */ +struct ioctl_args { + int fd; + int com; + caddr_t data; +}; +/* ARGSUSED */ +ioctl(p, uap, retval) + struct proc *p; + register struct ioctl_args *uap; + int *retval; +{ + register struct file *fp; + register struct filedesc *fdp; + register int com, error; + register u_int size; + caddr_t data, memp; + int tmp; +#define STK_PARAMS 128 + char stkbuf[STK_PARAMS]; + + fdp = p->p_fd; + if ((u_int)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + + if 
((fp->f_flag & (FREAD | FWRITE)) == 0) + return (EBADF); + + switch (com = uap->com) { + case FIONCLEX: + fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; + return (0); + case FIOCLEX: + fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; + return (0); + } + + /* + * Interpret high order word to find amount of data to be + * copied to/from the user's address space. + */ + size = IOCPARM_LEN(com); + if (size > IOCPARM_MAX) + return (ENOTTY); + memp = NULL; + if (size > sizeof (stkbuf)) { + memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK); + data = memp; + } else + data = stkbuf; + if (com&IOC_IN) { + if (size) { + error = copyin(uap->data, data, (u_int)size); + if (error) { + if (memp) + free(memp, M_IOCTLOPS); + return (error); + } + } else + *(caddr_t *)data = uap->data; + } else if ((com&IOC_OUT) && size) + /* + * Zero the buffer so the user always + * gets back something deterministic. + */ + bzero(data, size); + else if (com&IOC_VOID) + *(caddr_t *)data = uap->data; + + switch (com) { + + case FIONBIO: + if (tmp = *(int *)data) + fp->f_flag |= FNONBLOCK; + else + fp->f_flag &= ~FNONBLOCK; + error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p); + break; + + case FIOASYNC: + if (tmp = *(int *)data) + fp->f_flag |= FASYNC; + else + fp->f_flag &= ~FASYNC; + error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p); + break; + + case FIOSETOWN: + tmp = *(int *)data; + if (fp->f_type == DTYPE_SOCKET) { + ((struct socket *)fp->f_data)->so_pgid = tmp; + error = 0; + break; + } + if (tmp <= 0) { + tmp = -tmp; + } else { + struct proc *p1 = pfind(tmp); + if (p1 == 0) { + error = ESRCH; + break; + } + tmp = p1->p_pgrp->pg_id; + } + error = (*fp->f_ops->fo_ioctl) + (fp, (int)TIOCSPGRP, (caddr_t)&tmp, p); + break; + + case FIOGETOWN: + if (fp->f_type == DTYPE_SOCKET) { + error = 0; + *(int *)data = ((struct socket *)fp->f_data)->so_pgid; + break; + } + error = (*fp->f_ops->fo_ioctl)(fp, (int)TIOCGPGRP, data, p); + *(int *)data = -*(int *)data; + break; + + 
default: + error = (*fp->f_ops->fo_ioctl)(fp, com, data, p); + /* + * Copy any data to user, size was + * already set and checked above. + */ + if (error == 0 && (com&IOC_OUT) && size) + error = copyout(data, uap->data, (u_int)size); + break; + } + if (memp) + free(memp, M_IOCTLOPS); + return (error); +} + +int selwait, nselcoll; + +/* + * Select system call. + */ +struct select_args { + u_int nd; + fd_set *in, *ou, *ex; + struct timeval *tv; +}; +select(p, uap, retval) + register struct proc *p; + register struct select_args *uap; + int *retval; +{ + fd_set ibits[3], obits[3]; + struct timeval atv; + int s, ncoll, error = 0, timo; + u_int ni; + + bzero((caddr_t)ibits, sizeof(ibits)); + bzero((caddr_t)obits, sizeof(obits)); + if (uap->nd > FD_SETSIZE) + return (EINVAL); + if (uap->nd > p->p_fd->fd_nfiles) + uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */ + ni = howmany(uap->nd, NFDBITS) * sizeof(fd_mask); + +#define getbits(name, x) \ + if (uap->name && \ + (error = copyin((caddr_t)uap->name, (caddr_t)&ibits[x], ni))) \ + goto done; + getbits(in, 0); + getbits(ou, 1); + getbits(ex, 2); +#undef getbits + + if (uap->tv) { + error = copyin((caddr_t)uap->tv, (caddr_t)&atv, + sizeof (atv)); + if (error) + goto done; + if (itimerfix(&atv)) { + error = EINVAL; + goto done; + } + s = splclock(); + timevaladd(&atv, (struct timeval *)&time); + timo = hzto(&atv); + /* + * Avoid inadvertently sleeping forever. 
+ */ + if (timo == 0) + timo = 1; + splx(s); + } else + timo = 0; +retry: + ncoll = nselcoll; + p->p_flag |= P_SELECT; + error = selscan(p, ibits, obits, uap->nd, retval); + if (error || *retval) + goto done; + s = splhigh(); + /* this should be timercmp(&time, &atv, >=) */ + if (uap->tv && (time.tv_sec > atv.tv_sec || + time.tv_sec == atv.tv_sec && time.tv_usec >= atv.tv_usec)) { + splx(s); + goto done; + } + if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) { + splx(s); + goto retry; + } + p->p_flag &= ~P_SELECT; + error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo); + splx(s); + if (error == 0) + goto retry; +done: + p->p_flag &= ~P_SELECT; + /* select is not restarted after signals... */ + if (error == ERESTART) + error = EINTR; + if (error == EWOULDBLOCK) + error = 0; +#define putbits(name, x) \ + if (uap->name && \ + (error2 = copyout((caddr_t)&obits[x], (caddr_t)uap->name, ni))) \ + error = error2; + if (error == 0) { + int error2; + + putbits(in, 0); + putbits(ou, 1); + putbits(ex, 2); +#undef putbits + } + return (error); +} + +selscan(p, ibits, obits, nfd, retval) + struct proc *p; + fd_set *ibits, *obits; + int nfd, *retval; +{ + register struct filedesc *fdp = p->p_fd; + register int msk, i, j, fd; + register fd_mask bits; + struct file *fp; + int n = 0; + static int flag[3] = { FREAD, FWRITE, 0 }; + + for (msk = 0; msk < 3; msk++) { + for (i = 0; i < nfd; i += NFDBITS) { + bits = ibits[msk].fds_bits[i/NFDBITS]; + while ((j = ffs(bits)) && (fd = i + --j) < nfd) { + bits &= ~(1 << j); + fp = fdp->fd_ofiles[fd]; + if (fp == NULL) + return (EBADF); + if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) { + FD_SET(fd, &obits[msk]); + n++; + } + } + } + } + *retval = n; + return (0); +} + +/*ARGSUSED*/ +seltrue(dev, flag, p) + dev_t dev; + int flag; + struct proc *p; +{ + + return (1); +} + +/* + * Record a select request. 
+ */ +void +selrecord(selector, sip) + struct proc *selector; + struct selinfo *sip; +{ + struct proc *p; + pid_t mypid; + + mypid = selector->p_pid; + if (sip->si_pid == mypid) + return; + if (sip->si_pid && (p = pfind(sip->si_pid)) && + p->p_wchan == (caddr_t)&selwait) + sip->si_flags |= SI_COLL; + else + sip->si_pid = mypid; +} + +/* + * Do a wakeup when a selectable event occurs. + */ +void +selwakeup(sip) + register struct selinfo *sip; +{ + register struct proc *p; + int s; + + if (sip->si_pid == 0) + return; + if (sip->si_flags & SI_COLL) { + nselcoll++; + sip->si_flags &= ~SI_COLL; + wakeup((caddr_t)&selwait); + } + p = pfind(sip->si_pid); + sip->si_pid = 0; + if (p != NULL) { + s = splhigh(); + if (p->p_wchan == (caddr_t)&selwait) { + if (p->p_stat == SSLEEP) + setrunnable(p); + else + unsleep(p); + } else if (p->p_flag & P_SELECT) + p->p_flag &= ~P_SELECT; + splx(s); + } +} diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c new file mode 100644 index 00000000000..4cc40baf582 --- /dev/null +++ b/sys/kern/sys_process.c @@ -0,0 +1,74 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include + +/* + * Process debugging system call. + */ +struct ptrace_args { + int req; + pid_t pid; + caddr_t addr; + int data; +}; +ptrace(a1, a2, a3) + struct proc *a1; + struct ptrace_args *a2; + int *a3; +{ + + /* + * Body deleted. + */ + return (ENOSYS); +} + +trace_req(a1) + struct proc *a1; +{ + + /* + * Body deleted. 
+ */ + return (0); +} diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c new file mode 100644 index 00000000000..a93ae86df85 --- /dev/null +++ b/sys/kern/sys_socket.c @@ -0,0 +1,197 @@ +/* + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct fileops socketops = + { soo_read, soo_write, soo_ioctl, soo_select, soo_close }; + +/* ARGSUSED */ +soo_read(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + + return (soreceive((struct socket *)fp->f_data, (struct mbuf **)0, + uio, (struct mbuf **)0, (struct mbuf **)0, (int *)0)); +} + +/* ARGSUSED */ +soo_write(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + + return (sosend((struct socket *)fp->f_data, (struct mbuf *)0, + uio, (struct mbuf *)0, (struct mbuf *)0, 0)); +} + +soo_ioctl(fp, cmd, data, p) + struct file *fp; + int cmd; + register caddr_t data; + struct proc *p; +{ + register struct socket *so = (struct socket *)fp->f_data; + + switch (cmd) { + + case FIONBIO: + if (*(int *)data) + so->so_state |= SS_NBIO; + else + so->so_state &= ~SS_NBIO; + return (0); + + case FIOASYNC: + if (*(int *)data) { + so->so_state |= SS_ASYNC; + so->so_rcv.sb_flags |= SB_ASYNC; + so->so_snd.sb_flags |= SB_ASYNC; + } else { + so->so_state &= ~SS_ASYNC; + so->so_rcv.sb_flags &= ~SB_ASYNC; + so->so_snd.sb_flags &= ~SB_ASYNC; + } + return (0); + + case FIONREAD: + *(int *)data = so->so_rcv.sb_cc; + return (0); + + case SIOCSPGRP: + so->so_pgid = *(int *)data; + return (0); + + case SIOCGPGRP: + *(int *)data = 
so->so_pgid; + return (0); + + case SIOCATMARK: + *(int *)data = (so->so_state&SS_RCVATMARK) != 0; + return (0); + } + /* + * Interface/routing/protocol specific ioctls: + * interface and routing ioctls should have a + * different entry since a socket's unnecessary + */ + if (IOCGROUP(cmd) == 'i') + return (ifioctl(so, cmd, data, p)); + if (IOCGROUP(cmd) == 'r') + return (rtioctl(cmd, data, p)); + return ((*so->so_proto->pr_usrreq)(so, PRU_CONTROL, + (struct mbuf *)cmd, (struct mbuf *)data, (struct mbuf *)0)); +} + +soo_select(fp, which, p) + struct file *fp; + int which; + struct proc *p; +{ + register struct socket *so = (struct socket *)fp->f_data; + register int s = splnet(); + + switch (which) { + + case FREAD: + if (soreadable(so)) { + splx(s); + return (1); + } + selrecord(p, &so->so_rcv.sb_sel); + so->so_rcv.sb_flags |= SB_SEL; + break; + + case FWRITE: + if (sowriteable(so)) { + splx(s); + return (1); + } + selrecord(p, &so->so_snd.sb_sel); + so->so_snd.sb_flags |= SB_SEL; + break; + + case 0: + if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) { + splx(s); + return (1); + } + selrecord(p, &so->so_rcv.sb_sel); + so->so_rcv.sb_flags |= SB_SEL; + break; + } + splx(s); + return (0); +} + +soo_stat(so, ub) + register struct socket *so; + register struct stat *ub; +{ + + bzero((caddr_t)ub, sizeof (*ub)); + ub->st_mode = S_IFSOCK; + return ((*so->so_proto->pr_usrreq)(so, PRU_SENSE, + (struct mbuf *)ub, (struct mbuf *)0, + (struct mbuf *)0)); +} + +/* ARGSUSED */ +soo_close(fp, p) + struct file *fp; + struct proc *p; +{ + int error = 0; + + if (fp->f_data) + error = soclose((struct socket *)fp->f_data); + fp->f_data = 0; + return (error); +} diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c new file mode 100644 index 00000000000..1809905a4f6 --- /dev/null +++ b/sys/kern/syscalls.c @@ -0,0 +1,251 @@ +/* + * System call names. + * + * DO NOT EDIT-- this file is automatically generated. 
+ * created from @(#)syscalls.master 8.2 (Berkeley) 1/13/94 + */ + +char *syscallnames[] = { + "syscall", /* 0 = syscall */ + "exit", /* 1 = exit */ + "fork", /* 2 = fork */ + "read", /* 3 = read */ + "write", /* 4 = write */ + "open", /* 5 = open */ + "close", /* 6 = close */ + "wait4", /* 7 = wait4 */ + "old.creat", /* 8 = old creat */ + "link", /* 9 = link */ + "unlink", /* 10 = unlink */ + "obs_execv", /* 11 = obsolete execv */ + "chdir", /* 12 = chdir */ + "fchdir", /* 13 = fchdir */ + "mknod", /* 14 = mknod */ + "chmod", /* 15 = chmod */ + "chown", /* 16 = chown */ + "break", /* 17 = break */ + "getfsstat", /* 18 = getfsstat */ + "old.lseek", /* 19 = old lseek */ + "getpid", /* 20 = getpid */ + "mount", /* 21 = mount */ + "unmount", /* 22 = unmount */ + "setuid", /* 23 = setuid */ + "getuid", /* 24 = getuid */ + "geteuid", /* 25 = geteuid */ + "ptrace", /* 26 = ptrace */ + "recvmsg", /* 27 = recvmsg */ + "sendmsg", /* 28 = sendmsg */ + "recvfrom", /* 29 = recvfrom */ + "accept", /* 30 = accept */ + "getpeername", /* 31 = getpeername */ + "getsockname", /* 32 = getsockname */ + "access", /* 33 = access */ + "chflags", /* 34 = chflags */ + "fchflags", /* 35 = fchflags */ + "sync", /* 36 = sync */ + "kill", /* 37 = kill */ + "old.stat", /* 38 = old stat */ + "getppid", /* 39 = getppid */ + "old.lstat", /* 40 = old lstat */ + "dup", /* 41 = dup */ + "pipe", /* 42 = pipe */ + "getegid", /* 43 = getegid */ + "profil", /* 44 = profil */ +#ifdef KTRACE + "ktrace", /* 45 = ktrace */ +#else + "#45", /* 45 = ktrace */ +#endif + "sigaction", /* 46 = sigaction */ + "getgid", /* 47 = getgid */ + "sigprocmask", /* 48 = sigprocmask */ + "getlogin", /* 49 = getlogin */ + "setlogin", /* 50 = setlogin */ + "acct", /* 51 = acct */ + "sigpending", /* 52 = sigpending */ + "sigaltstack", /* 53 = sigaltstack */ + "ioctl", /* 54 = ioctl */ + "reboot", /* 55 = reboot */ + "revoke", /* 56 = revoke */ + "symlink", /* 57 = symlink */ + "readlink", /* 58 = readlink */ + "execve", /* 59 = 
execve */ + "umask", /* 60 = umask */ + "chroot", /* 61 = chroot */ + "old.fstat", /* 62 = old fstat */ + "old.getkerninfo", /* 63 = old getkerninfo */ + "old.getpagesize", /* 64 = old getpagesize */ + "msync", /* 65 = msync */ + "vfork", /* 66 = vfork */ + "obs_vread", /* 67 = obsolete vread */ + "obs_vwrite", /* 68 = obsolete vwrite */ + "sbrk", /* 69 = sbrk */ + "sstk", /* 70 = sstk */ + "old.mmap", /* 71 = old mmap */ + "vadvise", /* 72 = vadvise */ + "munmap", /* 73 = munmap */ + "mprotect", /* 74 = mprotect */ + "madvise", /* 75 = madvise */ + "obs_vhangup", /* 76 = obsolete vhangup */ + "obs_vlimit", /* 77 = obsolete vlimit */ + "mincore", /* 78 = mincore */ + "getgroups", /* 79 = getgroups */ + "setgroups", /* 80 = setgroups */ + "getpgrp", /* 81 = getpgrp */ + "setpgid", /* 82 = setpgid */ + "setitimer", /* 83 = setitimer */ + "old.wait", /* 84 = old wait */ + "swapon", /* 85 = swapon */ + "getitimer", /* 86 = getitimer */ + "old.gethostname", /* 87 = old gethostname */ + "old.sethostname", /* 88 = old sethostname */ + "getdtablesize", /* 89 = getdtablesize */ + "dup2", /* 90 = dup2 */ + "#91", /* 91 = getdopt */ + "fcntl", /* 92 = fcntl */ + "select", /* 93 = select */ + "#94", /* 94 = setdopt */ + "fsync", /* 95 = fsync */ + "setpriority", /* 96 = setpriority */ + "socket", /* 97 = socket */ + "connect", /* 98 = connect */ + "old.accept", /* 99 = old accept */ + "getpriority", /* 100 = getpriority */ + "old.send", /* 101 = old send */ + "old.recv", /* 102 = old recv */ + "sigreturn", /* 103 = sigreturn */ + "bind", /* 104 = bind */ + "setsockopt", /* 105 = setsockopt */ + "listen", /* 106 = listen */ + "obs_vtimes", /* 107 = obsolete vtimes */ + "old.sigvec", /* 108 = old sigvec */ + "old.sigblock", /* 109 = old sigblock */ + "old.sigsetmask", /* 110 = old sigsetmask */ + "sigsuspend", /* 111 = sigsuspend */ + "old.sigstack", /* 112 = old sigstack */ + "old.recvmsg", /* 113 = old recvmsg */ + "old.sendmsg", /* 114 = old sendmsg */ +#ifdef TRACE + 
"vtrace", /* 115 = vtrace */ +#else + "obs_vtrace", /* 115 = obsolete vtrace */ +#endif + "gettimeofday", /* 116 = gettimeofday */ + "getrusage", /* 117 = getrusage */ + "getsockopt", /* 118 = getsockopt */ +#ifdef vax + "resuba", /* 119 = resuba */ +#else + "#119", /* 119 = nosys */ +#endif + "readv", /* 120 = readv */ + "writev", /* 121 = writev */ + "settimeofday", /* 122 = settimeofday */ + "fchown", /* 123 = fchown */ + "fchmod", /* 124 = fchmod */ + "old.recvfrom", /* 125 = old recvfrom */ + "old.setreuid", /* 126 = old setreuid */ + "old.setregid", /* 127 = old setregid */ + "rename", /* 128 = rename */ + "old.truncate", /* 129 = old truncate */ + "old.ftruncate", /* 130 = old ftruncate */ + "flock", /* 131 = flock */ + "mkfifo", /* 132 = mkfifo */ + "sendto", /* 133 = sendto */ + "shutdown", /* 134 = shutdown */ + "socketpair", /* 135 = socketpair */ + "mkdir", /* 136 = mkdir */ + "rmdir", /* 137 = rmdir */ + "utimes", /* 138 = utimes */ + "obs_4.2", /* 139 = obsolete 4.2 sigreturn */ + "adjtime", /* 140 = adjtime */ + "old.getpeername", /* 141 = old getpeername */ + "old.gethostid", /* 142 = old gethostid */ + "old.sethostid", /* 143 = old sethostid */ + "old.getrlimit", /* 144 = old getrlimit */ + "old.setrlimit", /* 145 = old setrlimit */ + "old.killpg", /* 146 = old killpg */ + "setsid", /* 147 = setsid */ + "quotactl", /* 148 = quotactl */ + "old.quota", /* 149 = old quota */ + "old.getsockname", /* 150 = old getsockname */ + "#151", /* 151 = nosys */ + "#152", /* 152 = nosys */ + "#153", /* 153 = nosys */ + "#154", /* 154 = nosys */ +#ifdef NFS + "nfssvc", /* 155 = nfssvc */ +#else + "#155", /* 155 = nosys */ +#endif + "old.getdirentries", /* 156 = old getdirentries */ + "statfs", /* 157 = statfs */ + "fstatfs", /* 158 = fstatfs */ + "#159", /* 159 = nosys */ + "#160", /* 160 = nosys */ +#ifdef NFS + "getfh", /* 161 = getfh */ +#else + "#161", /* 161 = nosys */ +#endif + "#162", /* 162 = nosys */ + "#163", /* 163 = nosys */ + "#164", /* 164 = nosys */ 
+ "#165", /* 165 = nosys */ + "#166", /* 166 = nosys */ + "#167", /* 167 = nosys */ + "#168", /* 168 = nosys */ + "#169", /* 169 = nosys */ + "#170", /* 170 = nosys */ +#ifdef SYSVSHM + "shmsys", /* 171 = shmsys */ +#else + "#171", /* 171 = nosys */ +#endif + "#172", /* 172 = nosys */ + "#173", /* 173 = nosys */ + "#174", /* 174 = nosys */ + "#175", /* 175 = nosys */ + "#176", /* 176 = nosys */ + "#177", /* 177 = nosys */ + "#178", /* 178 = nosys */ + "#179", /* 179 = nosys */ + "#180", /* 180 = nosys */ + "setgid", /* 181 = setgid */ + "setegid", /* 182 = setegid */ + "seteuid", /* 183 = seteuid */ +#ifdef LFS + "lfs_bmapv", /* 184 = lfs_bmapv */ + "lfs_markv", /* 185 = lfs_markv */ + "lfs_segclean", /* 186 = lfs_segclean */ + "lfs_segwait", /* 187 = lfs_segwait */ +#else + "#184", /* 184 = nosys */ + "#185", /* 185 = nosys */ + "#186", /* 186 = nosys */ + "#187", /* 187 = nosys */ +#endif + "stat", /* 188 = stat */ + "fstat", /* 189 = fstat */ + "lstat", /* 190 = lstat */ + "pathconf", /* 191 = pathconf */ + "fpathconf", /* 192 = fpathconf */ + "#193", /* 193 = nosys */ + "getrlimit", /* 194 = getrlimit */ + "setrlimit", /* 195 = setrlimit */ + "getdirentries", /* 196 = getdirentries */ + "mmap", /* 197 = mmap */ + "__syscall", /* 198 = __syscall */ + "lseek", /* 199 = lseek */ + "truncate", /* 200 = truncate */ + "ftruncate", /* 201 = ftruncate */ + "__sysctl", /* 202 = __sysctl */ + "mlock", /* 203 = mlock */ + "munlock", /* 204 = munlock */ + "#205", /* 205 = nosys */ + "#206", /* 206 = nosys */ + "#207", /* 207 = nosys */ + "#208", /* 208 = nosys */ + "#209", /* 209 = nosys */ + "#210", /* 210 = nosys */ +}; diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master new file mode 100644 index 00000000000..1b8de145fba --- /dev/null +++ b/sys/kern/syscalls.master @@ -0,0 +1,276 @@ + @(#)syscalls.master 8.2 (Berkeley) 1/13/94 +; System call name/number master file. +; Processed to created init_sysent.c, syscalls.c and syscall.h. 
+ +; Columns: number type nargs name altname/comments +; number system call number, must be in order +; type one of STD, OBSOL, UNIMPL, COMPAT +; nargs number of arguments +; name name of syscall routine +; altname name of system call if different +; for UNIMPL/OBSOL, name continues with comments + +; types: +; STD always included +; COMPAT included on COMPAT #ifdef +; LIBCOMPAT included on COMPAT #ifdef, and placed in syscall.h +; OBSOL obsolete, not included in system, only specifies name +; UNIMPL not implemented, placeholder only + +; #ifdef's, etc. may be included, and are copied to the output files. + +; Reserved/unimplemented system calls in the range 0-150 inclusive +; are reserved for use in future Berkeley releases. +; Additional system calls implemented in vendor and other +; redistributions should be placed in the reserved range at the end +; of the current calls. + +0 STD 0 nosys syscall +1 STD 1 exit +2 STD 0 fork +3 STD 3 read +4 STD 3 write +5 STD 3 open +6 STD 1 close +7 STD 4 wait4 +8 COMPAT 2 creat +9 STD 2 link +10 STD 1 unlink +11 OBSOL 2 execv +12 STD 1 chdir +13 STD 1 fchdir +14 STD 3 mknod +15 STD 2 chmod +16 STD 3 chown +17 STD 1 obreak break +18 STD 3 getfsstat +19 COMPAT 3 lseek +20 STD 0 getpid +21 STD 4 mount +22 STD 2 unmount +23 STD 1 setuid +24 STD 0 getuid +25 STD 0 geteuid +26 STD 4 ptrace +27 STD 3 recvmsg +28 STD 3 sendmsg +29 STD 6 recvfrom +30 STD 3 accept +31 STD 3 getpeername +32 STD 3 getsockname +33 STD 2 access +34 STD 2 chflags +35 STD 2 fchflags +36 STD 0 sync +37 STD 2 kill +38 COMPAT 2 stat +39 STD 0 getppid +40 COMPAT 2 lstat +41 STD 2 dup +42 STD 0 pipe +43 STD 0 getegid +44 STD 4 profil +#ifdef KTRACE +45 STD 4 ktrace +#else +45 UNIMPL 0 ktrace +#endif +46 STD 3 sigaction +47 STD 0 getgid +48 STD 2 sigprocmask +49 STD 2 getlogin +50 STD 1 setlogin +51 STD 1 acct +52 STD 0 sigpending +53 STD 2 sigaltstack +54 STD 3 ioctl +55 STD 1 reboot +56 STD 1 revoke +57 STD 2 symlink +58 STD 3 readlink +59 STD 3 execve +60 STD 1 
umask +61 STD 1 chroot +62 COMPAT 2 fstat +63 COMPAT 4 getkerninfo +64 COMPAT 0 getpagesize +65 STD 2 msync +66 STD 0 vfork +67 OBSOL 0 vread +68 OBSOL 0 vwrite +69 STD 1 sbrk +70 STD 1 sstk +71 COMPAT 7 mmap +72 STD 1 ovadvise vadvise +73 STD 2 munmap +74 STD 3 mprotect +75 STD 3 madvise +76 OBSOL 0 vhangup +77 OBSOL 0 vlimit +78 STD 3 mincore +79 STD 2 getgroups +80 STD 2 setgroups +81 STD 0 getpgrp +82 STD 2 setpgid +83 STD 3 setitimer +84 COMPAT 0 wait +85 STD 1 swapon +86 STD 2 getitimer +87 COMPAT 2 gethostname +88 COMPAT 2 sethostname +89 STD 0 getdtablesize +90 STD 2 dup2 +91 UNIMPL 2 getdopt +92 STD 3 fcntl +93 STD 5 select +94 UNIMPL 2 setdopt +95 STD 1 fsync +96 STD 3 setpriority +97 STD 3 socket +98 STD 3 connect +99 COMPAT 3 accept +100 STD 2 getpriority +101 COMPAT 4 send +102 COMPAT 4 recv +103 STD 1 sigreturn +104 STD 3 bind +105 STD 5 setsockopt +106 STD 2 listen +107 OBSOL 0 vtimes +108 COMPAT 3 sigvec +109 COMPAT 1 sigblock +110 COMPAT 1 sigsetmask +111 STD 1 sigsuspend +112 COMPAT 2 sigstack +113 COMPAT 3 recvmsg +114 COMPAT 3 sendmsg +#ifdef TRACE +115 STD 2 vtrace +#else +115 OBSOL 2 vtrace +#endif +116 STD 2 gettimeofday +117 STD 2 getrusage +118 STD 5 getsockopt +#ifdef vax +119 STD 1 resuba +#else +119 UNIMPL 0 nosys +#endif +120 STD 3 readv +121 STD 3 writev +122 STD 2 settimeofday +123 STD 3 fchown +124 STD 2 fchmod +125 COMPAT 6 recvfrom +126 COMPAT 2 setreuid +127 COMPAT 2 setregid +128 STD 2 rename +129 COMPAT 2 truncate +130 COMPAT 2 ftruncate +131 STD 2 flock +132 STD 2 mkfifo +133 STD 6 sendto +134 STD 2 shutdown +135 STD 5 socketpair +136 STD 2 mkdir +137 STD 1 rmdir +138 STD 2 utimes +139 OBSOL 0 4.2 sigreturn +140 STD 2 adjtime +141 COMPAT 3 getpeername +142 COMPAT 0 gethostid +143 COMPAT 1 sethostid +144 COMPAT 2 getrlimit +145 COMPAT 2 setrlimit +146 COMPAT 2 killpg +147 STD 0 setsid +148 STD 4 quotactl +149 COMPAT 4 quota +150 COMPAT 3 getsockname + +; Syscalls 151-180 inclusive are reserved for vendor-specific +; system 
calls. (This includes various calls added for compatibility +; with other Unix variants.) +; Some of these calls are now supported by BSD... +151 UNIMPL 0 nosys +152 UNIMPL 0 nosys +153 UNIMPL 0 nosys +154 UNIMPL 0 nosys +#ifdef NFS +155 STD 2 nfssvc +#else +155 UNIMPL 0 nosys +#endif +156 COMPAT 4 getdirentries +157 STD 2 statfs +158 STD 2 fstatfs +159 UNIMPL 0 nosys +160 UNIMPL 0 nosys +#ifdef NFS +161 STD 2 getfh +#else +161 UNIMPL 0 nosys +#endif +162 UNIMPL 0 nosys +163 UNIMPL 0 nosys +164 UNIMPL 0 nosys +165 UNIMPL 0 nosys +166 UNIMPL 0 nosys +167 UNIMPL 0 nosys +168 UNIMPL 0 nosys +169 UNIMPL 0 nosys +170 UNIMPL 0 nosys +#ifdef SYSVSHM +171 STD 4 shmsys +#else +171 UNIMPL 0 nosys +#endif +172 UNIMPL 0 nosys +173 UNIMPL 0 nosys +174 UNIMPL 0 nosys +175 UNIMPL 0 nosys +176 UNIMPL 0 nosys +177 UNIMPL 0 nosys +178 UNIMPL 0 nosys +179 UNIMPL 0 nosys +180 UNIMPL 0 nosys + +; Syscalls 181-199 are used by/reserved for BSD +181 STD 1 setgid +182 STD 1 setegid +183 STD 1 seteuid +#ifdef LFS +184 STD 3 lfs_bmapv +185 STD 3 lfs_markv +186 STD 2 lfs_segclean +187 STD 2 lfs_segwait +#else +184 UNIMPL 0 nosys +185 UNIMPL 0 nosys +186 UNIMPL 0 nosys +187 UNIMPL 0 nosys +#endif +188 STD 2 stat +189 STD 2 fstat +190 STD 2 lstat +191 STD 2 pathconf +192 STD 2 fpathconf +193 UNIMPL 0 nosys +194 STD 2 getrlimit +195 STD 2 setrlimit +196 STD 4 getdirentries +197 STD 8 mmap +198 STD 0 nosys __syscall +199 STD 5 lseek +200 STD 4 truncate +201 STD 4 ftruncate +202 STD 6 __sysctl +203 STD 2 mlock +204 STD 2 munlock +205 UNIMPL 0 nosys +206 UNIMPL 0 nosys +207 UNIMPL 0 nosys +208 UNIMPL 0 nosys +209 UNIMPL 0 nosys +210 UNIMPL 0 nosys diff --git a/sys/kern/tty.c b/sys/kern/tty.c new file mode 100644 index 00000000000..6cc7be23700 --- /dev/null +++ b/sys/kern/tty.c @@ -0,0 +1,1923 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. 
+ * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty.c 8.8 (Berkeley) 1/21/94 + */ + +#include +#include +#include +#include +#define TTYDEFCHARS +#include +#undef TTYDEFCHARS +#include +#include +#include +#include +#include +#include +#include + +#include + +static int proc_compare __P((struct proc *p1, struct proc *p2)); +static int ttnread __P((struct tty *)); +static void ttyblock __P((struct tty *tp)); +static void ttyecho __P((int, struct tty *tp)); +static void ttyrubo __P((struct tty *, int)); + +/* Symbolic sleep message strings. */ +char ttclos[] = "ttycls"; +char ttopen[] = "ttyopn"; +char ttybg[] = "ttybg"; +char ttybuf[] = "ttybuf"; +char ttyin[] = "ttyin"; +char ttyout[] = "ttyout"; + +/* + * Table with character classes and parity. The 8th bit indicates parity, + * the 7th bit indicates the character is an alphameric or underscore (for + * ALTWERASE), and the low 6 bits indicate delay type. If the low 6 bits + * are 0 then the character needs no special processing on output; classes + * other than 0 might be translated or (not currently) require delays. + */ +#define E 0x00 /* Even parity. */ +#define O 0x80 /* Odd parity. */ +#define PARITY(c) (char_type[c] & O) + +#define ALPHA 0x40 /* Alpha or underscore. 
*/ +#define ISALPHA(c) (char_type[(c) & TTY_CHARMASK] & ALPHA) + +#define CCLASSMASK 0x3f +#define CCLASS(c) (char_type[c] & CCLASSMASK) + +#define BS BACKSPACE +#define CC CONTROL +#define CR RETURN +#define NA ORDINARY | ALPHA +#define NL NEWLINE +#define NO ORDINARY +#define TB TAB +#define VT VTAB + +char const char_type[] = { + E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* nul - bel */ + O|BS, E|TB, E|NL, O|CC, E|VT, O|CR, O|CC, E|CC, /* bs - si */ + O|CC, E|CC, E|CC, O|CC, E|CC, O|CC, O|CC, E|CC, /* dle - etb */ + E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* can - us */ + O|NO, E|NO, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* sp - ' */ + E|NO, O|NO, O|NO, E|NO, O|NO, E|NO, E|NO, O|NO, /* ( - / */ + E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* 0 - 7 */ + O|NA, E|NA, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* 8 - ? */ + O|NO, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* @ - G */ + E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* H - O */ + E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* P - W */ + O|NA, E|NA, E|NA, O|NO, E|NO, O|NO, O|NO, O|NA, /* X - _ */ + E|NO, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* ` - g */ + O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* h - o */ + O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* p - w */ + E|NA, O|NA, O|NA, E|NO, O|NO, E|NO, E|NO, O|CC, /* x - del */ + /* + * Meta chars; should be settable per character set; + * for now, treat them all as normal characters. 
+ */ + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, +}; +#undef BS +#undef CC +#undef CR +#undef NA +#undef NL +#undef NO +#undef TB +#undef VT + +/* Macros to clear/set/test flags. */ +#define SET(t, f) (t) |= (f) +#define CLR(t, f) (t) &= ~(f) +#define ISSET(t, f) ((t) & (f)) + +/* + * Initial open of tty, or (re)entry to standard tty line discipline. + */ +int +ttyopen(device, tp) + dev_t device; + register struct tty *tp; +{ + int s; + + s = spltty(); + tp->t_dev = device; + if (!ISSET(tp->t_state, TS_ISOPEN)) { + SET(tp->t_state, TS_ISOPEN); + bzero(&tp->t_winsize, sizeof(tp->t_winsize)); + } + CLR(tp->t_state, TS_WOPEN); + splx(s); + return (0); +} + +/* + * Handle close() on a tty line: flush and set to initial state, + * bumping generation number so that pending read/write calls + * can detect recycling of the tty. + */ +int +ttyclose(tp) + register struct tty *tp; +{ + extern struct tty *constty; /* Temporary virtual console. */ + + if (constty == tp) + constty = NULL; + + ttyflush(tp, FREAD | FWRITE); + + tp->t_gen++; + tp->t_pgrp = NULL; + tp->t_session = NULL; + tp->t_state = 0; + return (0); +} + +#define FLUSHQ(q) { \ + if ((q)->c_cc) \ + ndflush(q, (q)->c_cc); \ +} + +/* Is 'c' a line delimiter ("break" character)? */ +#define TTBREAKC(c) \ + ((c) == '\n' || ((c) == cc[VEOF] || \ + (c) == cc[VEOL] || (c) == cc[VEOL2]) && (c) != _POSIX_VDISABLE) + + +/* + * Process input of a single character received on a tty. 
+ */ +int +ttyinput(c, tp) + register int c; + register struct tty *tp; +{ + register int iflag, lflag; + register u_char *cc; + int i, err; + + /* + * If input is pending take it first. + */ + lflag = tp->t_lflag; + if (ISSET(lflag, PENDIN)) + ttypend(tp); + /* + * Gather stats. + */ + if (ISSET(lflag, ICANON)) { + ++tk_cancc; + ++tp->t_cancc; + } else { + ++tk_rawcc; + ++tp->t_rawcc; + } + ++tk_nin; + + /* Handle exceptional conditions (break, parity, framing). */ + cc = tp->t_cc; + iflag = tp->t_iflag; + if (err = (ISSET(c, TTY_ERRORMASK))) { + CLR(c, TTY_ERRORMASK); + if (ISSET(err, TTY_FE) && !c) { /* Break. */ + if (ISSET(iflag, IGNBRK)) + goto endcase; + else if (ISSET(iflag, BRKINT) && + ISSET(lflag, ISIG) && + (cc[VINTR] != _POSIX_VDISABLE)) + c = cc[VINTR]; + else if (ISSET(iflag, PARMRK)) + goto parmrk; + } else if (ISSET(err, TTY_PE) && + ISSET(iflag, INPCK) || ISSET(err, TTY_FE)) { + if (ISSET(iflag, IGNPAR)) + goto endcase; + else if (ISSET(iflag, PARMRK)) { +parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); + (void)putc(0 | TTY_QUOTE, &tp->t_rawq); + (void)putc(c | TTY_QUOTE, &tp->t_rawq); + goto endcase; + } else + c = 0; + } + } + /* + * In tandem mode, check high water mark. + */ + if (ISSET(iflag, IXOFF)) + ttyblock(tp); + if (!ISSET(tp->t_state, TS_TYPEN) && ISSET(iflag, ISTRIP)) + CLR(c, 0x80); + if (!ISSET(lflag, EXTPROC)) { + /* + * Check for literal nexting very first + */ + if (ISSET(tp->t_state, TS_LNCH)) { + SET(c, TTY_QUOTE); + CLR(tp->t_state, TS_LNCH); + } + /* + * Scan for special characters. This code + * is really just a big case statement with + * non-constant cases. The bottom of the + * case statement is labeled ``endcase'', so goto + * it after a case match, or similar. + */ + + /* + * Control chars which aren't controlled + * by ICANON, ISIG, or IXON. 
+ */ + if (ISSET(lflag, IEXTEN)) { + if (CCEQ(cc[VLNEXT], c)) { + if (ISSET(lflag, ECHO)) { + if (ISSET(lflag, ECHOE)) { + (void)ttyoutput('^', tp); + (void)ttyoutput('\b', tp); + } else + ttyecho(c, tp); + } + SET(tp->t_state, TS_LNCH); + goto endcase; + } + if (CCEQ(cc[VDISCARD], c)) { + if (ISSET(lflag, FLUSHO)) + CLR(tp->t_lflag, FLUSHO); + else { + ttyflush(tp, FWRITE); + ttyecho(c, tp); + if (tp->t_rawq.c_cc + tp->t_canq.c_cc) + ttyretype(tp); + SET(tp->t_lflag, FLUSHO); + } + goto startoutput; + } + } + /* + * Signals. + */ + if (ISSET(lflag, ISIG)) { + if (CCEQ(cc[VINTR], c) || CCEQ(cc[VQUIT], c)) { + if (!ISSET(lflag, NOFLSH)) + ttyflush(tp, FREAD | FWRITE); + ttyecho(c, tp); + pgsignal(tp->t_pgrp, + CCEQ(cc[VINTR], c) ? SIGINT : SIGQUIT, 1); + goto endcase; + } + if (CCEQ(cc[VSUSP], c)) { + if (!ISSET(lflag, NOFLSH)) + ttyflush(tp, FREAD); + ttyecho(c, tp); + pgsignal(tp->t_pgrp, SIGTSTP, 1); + goto endcase; + } + } + /* + * Handle start/stop characters. + */ + if (ISSET(iflag, IXON)) { + if (CCEQ(cc[VSTOP], c)) { + if (!ISSET(tp->t_state, TS_TTSTOP)) { + SET(tp->t_state, TS_TTSTOP); +#ifdef sun4c /* XXX */ + (*tp->t_stop)(tp, 0); +#else + (*cdevsw[major(tp->t_dev)].d_stop)(tp, + 0); +#endif + return (0); + } + if (!CCEQ(cc[VSTART], c)) + return (0); + /* + * if VSTART == VSTOP then toggle + */ + goto endcase; + } + if (CCEQ(cc[VSTART], c)) + goto restartoutput; + } + /* + * IGNCR, ICRNL, & INLCR + */ + if (c == '\r') { + if (ISSET(iflag, IGNCR)) + goto endcase; + else if (ISSET(iflag, ICRNL)) + c = '\n'; + } else if (c == '\n' && ISSET(iflag, INLCR)) + c = '\r'; + } + if (!ISSET(tp->t_lflag, EXTPROC) && ISSET(lflag, ICANON)) { + /* + * From here on down canonical mode character + * processing takes place. + */ + /* + * erase (^H / ^?) 
+ */ + if (CCEQ(cc[VERASE], c)) { + if (tp->t_rawq.c_cc) + ttyrub(unputc(&tp->t_rawq), tp); + goto endcase; + } + /* + * kill (^U) + */ + if (CCEQ(cc[VKILL], c)) { + if (ISSET(lflag, ECHOKE) && + tp->t_rawq.c_cc == tp->t_rocount && + !ISSET(lflag, ECHOPRT)) + while (tp->t_rawq.c_cc) + ttyrub(unputc(&tp->t_rawq), tp); + else { + ttyecho(c, tp); + if (ISSET(lflag, ECHOK) || + ISSET(lflag, ECHOKE)) + ttyecho('\n', tp); + FLUSHQ(&tp->t_rawq); + tp->t_rocount = 0; + } + CLR(tp->t_state, TS_LOCAL); + goto endcase; + } + /* + * word erase (^W) + */ + if (CCEQ(cc[VWERASE], c)) { + int alt = ISSET(lflag, ALTWERASE); + int ctype; + + /* + * erase whitespace + */ + while ((c = unputc(&tp->t_rawq)) == ' ' || c == '\t') + ttyrub(c, tp); + if (c == -1) + goto endcase; + /* + * erase last char of word and remember the + * next chars type (for ALTWERASE) + */ + ttyrub(c, tp); + c = unputc(&tp->t_rawq); + if (c == -1) + goto endcase; + if (c == ' ' || c == '\t') { + (void)putc(c, &tp->t_rawq); + goto endcase; + } + ctype = ISALPHA(c); + /* + * erase rest of word + */ + do { + ttyrub(c, tp); + c = unputc(&tp->t_rawq); + if (c == -1) + goto endcase; + } while (c != ' ' && c != '\t' && + (alt == 0 || ISALPHA(c) == ctype)); + (void)putc(c, &tp->t_rawq); + goto endcase; + } + /* + * reprint line (^R) + */ + if (CCEQ(cc[VREPRINT], c)) { + ttyretype(tp); + goto endcase; + } + /* + * ^T - kernel info and generate SIGINFO + */ + if (CCEQ(cc[VSTATUS], c)) { + if (ISSET(lflag, ISIG)) + pgsignal(tp->t_pgrp, SIGINFO, 1); + if (!ISSET(lflag, NOKERNINFO)) + ttyinfo(tp); + goto endcase; + } + } + /* + * Check for input buffer overflow + */ + if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= TTYHOG) { + if (ISSET(iflag, IMAXBEL)) { + if (tp->t_outq.c_cc < tp->t_hiwat) + (void)ttyoutput(CTRL('g'), tp); + } else + ttyflush(tp, FREAD | FWRITE); + goto endcase; + } + /* + * Put data char in q for user and + * wakeup on seeing a line delimiter. 
+	 */
+	if (putc(c, &tp->t_rawq) >= 0) {
+		if (!ISSET(lflag, ICANON)) {
+			ttwakeup(tp);
+			ttyecho(c, tp);
+			goto endcase;
+		}
+		if (TTBREAKC(c)) {
+			tp->t_rocount = 0;
+			catq(&tp->t_rawq, &tp->t_canq);
+			ttwakeup(tp);
+		} else if (tp->t_rocount++ == 0)
+			tp->t_rocol = tp->t_column;
+		if (ISSET(tp->t_state, TS_ERASE)) {
+			/*
+			 * end of prterase \.../
+			 */
+			CLR(tp->t_state, TS_ERASE);
+			(void)ttyoutput('/', tp);
+		}
+		i = tp->t_column;
+		ttyecho(c, tp);
+		if (CCEQ(cc[VEOF], c) && ISSET(lflag, ECHO)) {
+			/*
+			 * Place the cursor over the '^' of the ^D.
+			 */
+			i = min(2, tp->t_column - i);
+			while (i > 0) {
+				(void)ttyoutput('\b', tp);
+				i--;
+			}
+		}
+	}
+endcase:
+	/*
+	 * IXANY means allow any character to restart output.
+	 */
+	if (ISSET(tp->t_state, TS_TTSTOP) &&
+	    !ISSET(iflag, IXANY) && cc[VSTART] != cc[VSTOP])
+		return (0);
+restartoutput:
+	CLR(tp->t_lflag, FLUSHO);
+	CLR(tp->t_state, TS_TTSTOP);
+startoutput:
+	return (ttstart(tp));
+}
+
+/*
+ * Output a single character on a tty, doing output processing
+ * as needed (expanding tabs, newline processing, etc.).
+ * Returns < 0 if succeeds, otherwise returns char to resend:
+ * -1 when the character was queued (or deliberately dropped because of
+ * FLUSHO or ONOEOT), the character itself when putc() on the output
+ * queue failed, and '\t' when none of a tab's expansion could be queued.
+ * Updates tk_nout, tp->t_outcc and, with OPOST set, tp->t_column.
+ * Must be recursive.
+ */
+int
+ttyoutput(c, tp)
+	register int c;
+	register struct tty *tp;
+{
+	register long oflag;
+	register int col, s;
+
+	oflag = tp->t_oflag;
+	/* No output processing: queue the character untouched. */
+	if (!ISSET(oflag, OPOST)) {
+		if (ISSET(tp->t_lflag, FLUSHO))
+			return (-1);
+		if (putc(c, &tp->t_outq))
+			return (c);
+		tk_nout++;
+		tp->t_outcc++;
+		return (-1);
+	}
+	/*
+	 * Do tab expansion if OXTABS is set.  Special case if we do external
+	 * processing, we don't do the tab expansion because we'll probably
+	 * get it wrong.  If tab expansion needs to be done, let it happen
+	 * externally.
+	 */
+	CLR(c, ~TTY_CHARMASK);
+	if (c == '\t' &&
+	    ISSET(oflag, OXTABS) && !ISSET(tp->t_lflag, EXTPROC)) {
+		/*
+		 * c becomes the distance to the next 8-column tab stop,
+		 * i.e. 1..8, so the source string handed to b_to_q() must
+		 * hold at least eight blanks.  The subtraction below treats
+		 * b_to_q()'s result as the count it could not queue, leaving
+		 * c as the number of blanks actually output.
+		 */
+		c = 8 - (tp->t_column & 7);
+		if (!ISSET(tp->t_lflag, FLUSHO)) {
+			s = spltty();		/* Don't interrupt tabs. */
+			c -= b_to_q("        ", c, &tp->t_outq);
+			tk_nout += c;
+			tp->t_outcc += c;
+			splx(s);
+		}
+		tp->t_column += c;
+		return (c ? -1 : '\t');
+	}
+	if (c == CEOT && ISSET(oflag, ONOEOT))
+		return (-1);
+
+	/*
+	 * Newline translation: if ONLCR is set,
+	 * translate newline into "\r\n".
+	 */
+	if (c == '\n' && ISSET(tp->t_oflag, ONLCR)) {
+		tk_nout++;
+		tp->t_outcc++;
+		if (putc('\r', &tp->t_outq))
+			return (c);
+	}
+	tk_nout++;
+	tp->t_outcc++;
+	if (!ISSET(tp->t_lflag, FLUSHO) && putc(c, &tp->t_outq))
+		return (c);
+
+	/* Track the output column according to the character's class. */
+	col = tp->t_column;
+	switch (CCLASS(c)) {
+	case BACKSPACE:
+		if (col > 0)
+			--col;
+		break;
+	case CONTROL:
+		break;
+	case NEWLINE:
+	case RETURN:
+		col = 0;
+		break;
+	case ORDINARY:
+		++col;
+		break;
+	case TAB:
+		col = (col + 8) & ~7;
+		break;
+	}
+	tp->t_column = col;
+	return (-1);
+}
+
+/*
+ * Ioctls for all tty devices.  Called after line-discipline specific ioctl
+ * has been called to do discipline-specific functions and/or reject any
+ * of these ioctl commands.
+ */
+/* ARGSUSED */
+int
+ttioctl(tp, cmd, data, flag)
+	register struct tty *tp;
+	int cmd, flag;
+	void *data;
+{
+	extern struct tty *constty;	/* Temporary virtual console. */
+	extern int nlinesw;
+	register struct proc *p;
+	int s, error;
+
+	p = curproc;			/* XXX */
+
+	/* If the ioctl involves modification, hang if in the background. 
*/ + switch (cmd) { + case TIOCFLUSH: + case TIOCSETA: + case TIOCSETD: + case TIOCSETAF: + case TIOCSETAW: +#ifdef notdef + case TIOCSPGRP: +#endif + case TIOCSTI: + case TIOCSWINSZ: +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + case TIOCLBIC: + case TIOCLBIS: + case TIOCLSET: + case TIOCSETC: + case OTIOCSETD: + case TIOCSETN: + case TIOCSETP: + case TIOCSLTC: +#endif + while (isbackground(curproc, tp) && + p->p_pgrp->pg_jobc && (p->p_flag & P_PPWAIT) == 0 && + (p->p_sigignore & sigmask(SIGTTOU)) == 0 && + (p->p_sigmask & sigmask(SIGTTOU)) == 0) { + pgsignal(p->p_pgrp, SIGTTOU, 1); + if (error = ttysleep(tp, + &lbolt, TTOPRI | PCATCH, ttybg, 0)) + return (error); + } + break; + } + + switch (cmd) { /* Process the ioctl. */ + case FIOASYNC: /* set/clear async i/o */ + s = spltty(); + if (*(int *)data) + SET(tp->t_state, TS_ASYNC); + else + CLR(tp->t_state, TS_ASYNC); + splx(s); + break; + case FIONBIO: /* set/clear non-blocking i/o */ + break; /* XXX: delete. */ + case FIONREAD: /* get # bytes to read */ + *(int *)data = ttnread(tp); + break; + case TIOCEXCL: /* set exclusive use of tty */ + s = spltty(); + SET(tp->t_state, TS_XCLUDE); + splx(s); + break; + case TIOCFLUSH: { /* flush buffers */ + register int flags = *(int *)data; + + if (flags == 0) + flags = FREAD | FWRITE; + else + flags &= FREAD | FWRITE; + ttyflush(tp, flags); + break; + } + case TIOCCONS: /* become virtual console */ + if (*(int *)data) { + if (constty && constty != tp && + ISSET(constty->t_state, TS_CARR_ON | TS_ISOPEN) == + (TS_CARR_ON | TS_ISOPEN)) + return (EBUSY); +#ifndef UCONSOLE + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); +#endif + constty = tp; + } else if (tp == constty) + constty = NULL; + break; + case TIOCDRAIN: /* wait till output drained */ + if (error = ttywait(tp)) + return (error); + break; + case TIOCGETA: { /* get termios struct */ + struct termios *t = (struct termios *)data; + + bcopy(&tp->t_termios, t, sizeof(struct termios)); + break; + } + 
case TIOCGETD: /* get line discipline */ + *(int *)data = tp->t_line; + break; + case TIOCGWINSZ: /* get window size */ + *(struct winsize *)data = tp->t_winsize; + break; + case TIOCGPGRP: /* get pgrp of tty */ + if (!isctty(p, tp)) + return (ENOTTY); + *(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; + break; +#ifdef TIOCHPCL + case TIOCHPCL: /* hang up on last close */ + s = spltty(); + SET(tp->t_cflag, HUPCL); + splx(s); + break; +#endif + case TIOCNXCL: /* reset exclusive use of tty */ + s = spltty(); + CLR(tp->t_state, TS_XCLUDE); + splx(s); + break; + case TIOCOUTQ: /* output queue size */ + *(int *)data = tp->t_outq.c_cc; + break; + case TIOCSETA: /* set termios struct */ + case TIOCSETAW: /* drain output, set */ + case TIOCSETAF: { /* drn out, fls in, set */ + register struct termios *t = (struct termios *)data; + + s = spltty(); + if (cmd == TIOCSETAW || cmd == TIOCSETAF) { + if (error = ttywait(tp)) { + splx(s); + return (error); + } + if (cmd == TIOCSETAF) + ttyflush(tp, FREAD); + } + if (!ISSET(t->c_cflag, CIGNORE)) { + /* + * Set device hardware. + */ + if (tp->t_param && (error = (*tp->t_param)(tp, t))) { + splx(s); + return (error); + } else { + if (!ISSET(tp->t_state, TS_CARR_ON) && + ISSET(tp->t_cflag, CLOCAL) && + !ISSET(t->c_cflag, CLOCAL)) { + CLR(tp->t_state, TS_ISOPEN); + SET(tp->t_state, TS_WOPEN); + ttwakeup(tp); + } + tp->t_cflag = t->c_cflag; + tp->t_ispeed = t->c_ispeed; + tp->t_ospeed = t->c_ospeed; + } + ttsetwater(tp); + } + if (cmd != TIOCSETAF) { + if (ISSET(t->c_lflag, ICANON) != + ISSET(tp->t_lflag, ICANON)) + if (ISSET(t->c_lflag, ICANON)) { + SET(tp->t_lflag, PENDIN); + ttwakeup(tp); + } else { + struct clist tq; + + catq(&tp->t_rawq, &tp->t_canq); + tq = tp->t_rawq; + tp->t_rawq = tp->t_canq; + tp->t_canq = tq; + CLR(tp->t_lflag, PENDIN); + } + } + tp->t_iflag = t->c_iflag; + tp->t_oflag = t->c_oflag; + /* + * Make the EXTPROC bit read only. 
+ */ + if (ISSET(tp->t_lflag, EXTPROC)) + SET(t->c_lflag, EXTPROC); + else + CLR(t->c_lflag, EXTPROC); + tp->t_lflag = t->c_lflag | ISSET(tp->t_lflag, PENDIN); + bcopy(t->c_cc, tp->t_cc, sizeof(t->c_cc)); + splx(s); + break; + } + case TIOCSETD: { /* set line discipline */ + register int t = *(int *)data; + dev_t device = tp->t_dev; + + if ((u_int)t >= nlinesw) + return (ENXIO); + if (t != tp->t_line) { + s = spltty(); + (*linesw[tp->t_line].l_close)(tp, flag); + error = (*linesw[t].l_open)(device, tp); + if (error) { + (void)(*linesw[tp->t_line].l_open)(device, tp); + splx(s); + return (error); + } + tp->t_line = t; + splx(s); + } + break; + } + case TIOCSTART: /* start output, like ^Q */ + s = spltty(); + if (ISSET(tp->t_state, TS_TTSTOP) || + ISSET(tp->t_lflag, FLUSHO)) { + CLR(tp->t_lflag, FLUSHO); + CLR(tp->t_state, TS_TTSTOP); + ttstart(tp); + } + splx(s); + break; + case TIOCSTI: /* simulate terminal input */ + if (p->p_ucred->cr_uid && (flag & FREAD) == 0) + return (EPERM); + if (p->p_ucred->cr_uid && !isctty(p, tp)) + return (EACCES); + (*linesw[tp->t_line].l_rint)(*(u_char *)data, tp); + break; + case TIOCSTOP: /* stop output, like ^S */ + s = spltty(); + if (!ISSET(tp->t_state, TS_TTSTOP)) { + SET(tp->t_state, TS_TTSTOP); +#ifdef sun4c /* XXX */ + (*tp->t_stop)(tp, 0); +#else + (*cdevsw[major(tp->t_dev)].d_stop)(tp, 0); +#endif + } + splx(s); + break; + case TIOCSCTTY: /* become controlling tty */ + /* Session ctty vnode pointer set in vnode layer. 
*/ + if (!SESS_LEADER(p) || + (p->p_session->s_ttyvp || tp->t_session) && + (tp->t_session != p->p_session)) + return (EPERM); + tp->t_session = p->p_session; + tp->t_pgrp = p->p_pgrp; + p->p_session->s_ttyp = tp; + p->p_flag |= P_CONTROLT; + break; + case TIOCSPGRP: { /* set pgrp of tty */ + register struct pgrp *pgrp = pgfind(*(int *)data); + + if (!isctty(p, tp)) + return (ENOTTY); + else if (pgrp == NULL || pgrp->pg_session != p->p_session) + return (EPERM); + tp->t_pgrp = pgrp; + break; + } + case TIOCSWINSZ: /* set window size */ + if (bcmp((caddr_t)&tp->t_winsize, data, + sizeof (struct winsize))) { + tp->t_winsize = *(struct winsize *)data; + pgsignal(tp->t_pgrp, SIGWINCH, 1); + } + break; + default: +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + return (ttcompat(tp, cmd, data, flag)); +#else + return (-1); +#endif + } + return (0); +} + +int +ttselect(device, rw, p) + dev_t device; + int rw; + struct proc *p; +{ + register struct tty *tp; + int nread, s; + + tp = &cdevsw[major(device)].d_ttys[minor(device)]; + + s = spltty(); + switch (rw) { + case FREAD: + nread = ttnread(tp); + if (nread > 0 || !ISSET(tp->t_cflag, CLOCAL) && + !ISSET(tp->t_state, TS_CARR_ON)) + goto win; + selrecord(p, &tp->t_rsel); + break; + case FWRITE: + if (tp->t_outq.c_cc <= tp->t_lowat) { +win: splx(s); + return (1); + } + selrecord(p, &tp->t_wsel); + break; + } + splx(s); + return (0); +} + +static int +ttnread(tp) + struct tty *tp; +{ + int nread; + + if (ISSET(tp->t_lflag, PENDIN)) + ttypend(tp); + nread = tp->t_canq.c_cc; + if (!ISSET(tp->t_lflag, ICANON)) + nread += tp->t_rawq.c_cc; + return (nread); +} + +/* + * Wait for output to drain. 
+ */
+int
+ttywait(tp)
+	register struct tty *tp;
+{
+	int error, s;
+
+	error = 0;
+	s = spltty();
+	/*
+	 * Keep waiting while output is queued or the device is busy, but
+	 * only if carrier is up (or CLOCAL is set) and the tty has an
+	 * output routine.  Each pass kicks the output routine and sleeps
+	 * on the output queue; a non-zero ttysleep() result ends the wait
+	 * and is returned to the caller.
+	 */
+	while ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) &&
+	    (ISSET(tp->t_state, TS_CARR_ON) || ISSET(tp->t_cflag, CLOCAL))
+	    && tp->t_oproc) {
+		(*tp->t_oproc)(tp);
+		SET(tp->t_state, TS_ASLEEP);
+		if (error = ttysleep(tp,
+		    &tp->t_outq, TTOPRI | PCATCH, ttyout, 0))
+			break;
+	}
+	splx(s);
+	return (error);
+}
+
+/*
+ * Wait for output to drain and, if the wait succeeded, flush pending
+ * input.  Returns the result of ttywait().
+ */
+int
+ttywflush(tp)
+	struct tty *tp;
+{
+	int error;
+
+	if ((error = ttywait(tp)) == 0)
+		ttyflush(tp, FREAD);
+	return (error);
+}
+
+/*
+ * Flush tty read and/or write queues, notifying anyone waiting.
+ * rw is a mask of FREAD and/or FWRITE.  FREAD empties the canonical and
+ * raw input queues, zeroes t_rocount/t_rocol, clears TS_LOCAL and calls
+ * ttwakeup().  FWRITE clears TS_TTSTOP, calls the device stop routine,
+ * empties the output queue and wakes sleepers and selectors on it.
+ */
+void
+ttyflush(tp, rw)
+	register struct tty *tp;
+	int rw;
+{
+	register int s;
+
+	s = spltty();
+	if (rw & FREAD) {
+		FLUSHQ(&tp->t_canq);
+		FLUSHQ(&tp->t_rawq);
+		tp->t_rocount = 0;
+		tp->t_rocol = 0;
+		CLR(tp->t_state, TS_LOCAL);
+		ttwakeup(tp);
+	}
+	if (rw & FWRITE) {
+		CLR(tp->t_state, TS_TTSTOP);
+#ifdef sun4c						/* XXX */
+		(*tp->t_stop)(tp, rw);
+#else
+		(*cdevsw[major(tp->t_dev)].d_stop)(tp, rw);
+#endif
+		FLUSHQ(&tp->t_outq);
+		wakeup((caddr_t)&tp->t_outq);
+		selwakeup(&tp->t_wsel);
+	}
+	splx(s);
+}
+
+/*
+ * Copy in the default termios characters.
+ */
+void
+ttychars(tp)
+	struct tty *tp;
+{
+
+	bcopy(ttydefchars, tp->t_cc, sizeof(ttydefchars));
+}
+
+/*
+ * Send stop character on input overflow.
+ * If the raw queue alone has grown past TTYHOG, both directions are
+ * flushed and any earlier block is forgotten.  Otherwise VSTOP is queued
+ * on the output side and TS_TBLOCK is set, subject to the test below.
+ */
+static void
+ttyblock(tp)
+	register struct tty *tp;
+{
+	register int total;
+
+	total = tp->t_rawq.c_cc + tp->t_canq.c_cc;
+	if (tp->t_rawq.c_cc > TTYHOG) {
+		ttyflush(tp, FREAD | FWRITE);
+		CLR(tp->t_state, TS_TBLOCK);
+	}
+	/*
+	 * Block further input iff: current input > threshold
+	 * AND input is available to user program.
+	 *
+	 * "Available" means raw mode, or canonical mode with data already
+	 * in the canonical queue.  That alternative must be parenthesized:
+	 * && binds tighter than ||, so without the parentheses the
+	 * threshold and TS_TBLOCK tests were skipped whenever t_canq was
+	 * non-empty, and the VSTOP-disabled test was skipped in raw mode.
+	 */
+	if (total >= TTYHOG / 2 &&
+	    !ISSET(tp->t_state, TS_TBLOCK) &&
+	    (!ISSET(tp->t_lflag, ICANON) || tp->t_canq.c_cc > 0) &&
+	    tp->t_cc[VSTOP] != _POSIX_VDISABLE) {
+		if (putc(tp->t_cc[VSTOP], &tp->t_outq) == 0) {
+			SET(tp->t_state, TS_TBLOCK);
+			ttstart(tp);
+		}
+	}
+}
+
+/*
+ * Clear TS_TIMEOUT and restart output on the tty passed as an untyped
+ * pointer.  NOTE(review): the void * argument suggests this is used as a
+ * timeout-style callback; no caller is visible here -- confirm.
+ */
+void
+ttrstrt(tp_arg)
+	void *tp_arg;
+{
+	struct tty *tp;
+	int s;
+
+#ifdef DIAGNOSTIC
+	if (tp_arg == NULL)
+		panic("ttrstrt");
+#endif
+	tp = tp_arg;
+	s = spltty();
+
+	CLR(tp->t_state, TS_TIMEOUT);
+	ttstart(tp);
+
+	splx(s);
+}
+
+/*
+ * Call the tty's output routine, if it has one.  Always returns 0.
+ */
+int
+ttstart(tp)
+	struct tty *tp;
+{
+
+	if (tp->t_oproc != NULL)	/* XXX: Kludge for pty. */
+		(*tp->t_oproc)(tp);
+	return (0);
+}
+
+/*
+ * "close" a line discipline
+ * With IO_NDELAY set in flag both queues are flushed at once; otherwise
+ * output is drained first and input flushed afterwards.  Always returns
+ * 0; the result of ttywflush() is not propagated.
+ */
+int
+ttylclose(tp, flag)
+	struct tty *tp;
+	int flag;
+{
+
+	if (flag & IO_NDELAY)
+		ttyflush(tp, FREAD | FWRITE);
+	else
+		ttywflush(tp);
+	return (0);
+}
+
+/*
+ * Handle modem control transition on a tty.
+ * Flag indicates new state of carrier.
+ * Returns 0 if the line should be turned off, otherwise 1.
+ */
+int
+ttymodem(tp, flag)
+	register struct tty *tp;
+	int flag;
+{
+
+	if (!ISSET(tp->t_state, TS_WOPEN) && ISSET(tp->t_cflag, MDMBUF)) {
+		/*
+		 * MDMBUF: do flow control according to carrier flag
+		 */
+		if (flag) {
+			CLR(tp->t_state, TS_TTSTOP);
+			ttstart(tp);
+		} else if (!ISSET(tp->t_state, TS_TTSTOP)) {
+			SET(tp->t_state, TS_TTSTOP);
+#ifdef sun4c						/* XXX */
+			(*tp->t_stop)(tp, 0);
+#else
+			(*cdevsw[major(tp->t_dev)].d_stop)(tp, 0);
+#endif
+		}
+	} else if (flag == 0) {
+		/*
+		 * Lost carrier.  On an open, non-CLOCAL line: SIGHUP the
+		 * session leader (if any), flush both queues and return 0.
+		 */
+		CLR(tp->t_state, TS_CARR_ON);
+		if (ISSET(tp->t_state, TS_ISOPEN) &&
+		    !ISSET(tp->t_cflag, CLOCAL)) {
+			if (tp->t_session && tp->t_session->s_leader)
+				psignal(tp->t_session->s_leader, SIGHUP);
+			ttyflush(tp, FREAD | FWRITE);
+			return (0);
+		}
+	} else {
+		/*
+		 * Carrier now on.
+		 */
+		SET(tp->t_state, TS_CARR_ON);
+		ttwakeup(tp);
+	}
+	return (1);
+}
+
+/*
+ * Default modem control routine (for other line disciplines).
+ * Return argument flag, to turn off device on carrier drop.
+ */ +int +nullmodem(tp, flag) + register struct tty *tp; + int flag; +{ + + if (flag) + SET(tp->t_state, TS_CARR_ON); + else { + CLR(tp->t_state, TS_CARR_ON); + if (!ISSET(tp->t_cflag, CLOCAL)) { + if (tp->t_session && tp->t_session->s_leader) + psignal(tp->t_session->s_leader, SIGHUP); + return (0); + } + } + return (1); +} + +/* + * Reinput pending characters after state switch + * call at spltty(). + */ +void +ttypend(tp) + register struct tty *tp; +{ + struct clist tq; + register c; + + CLR(tp->t_lflag, PENDIN); + SET(tp->t_state, TS_TYPEN); + tq = tp->t_rawq; + tp->t_rawq.c_cc = 0; + tp->t_rawq.c_cf = tp->t_rawq.c_cl = 0; + while ((c = getc(&tq)) >= 0) + ttyinput(c, tp); + CLR(tp->t_state, TS_TYPEN); +} + +/* + * Process a read call on a tty device. + */ +int +ttread(tp, uio, flag) + register struct tty *tp; + struct uio *uio; + int flag; +{ + register struct clist *qp; + register int c; + register long lflag; + register u_char *cc = tp->t_cc; + register struct proc *p = curproc; + int s, first, error = 0; + +loop: lflag = tp->t_lflag; + s = spltty(); + /* + * take pending input first + */ + if (ISSET(lflag, PENDIN)) + ttypend(tp); + splx(s); + + /* + * Hang process if it's in the background. + */ + if (isbackground(p, tp)) { + if ((p->p_sigignore & sigmask(SIGTTIN)) || + (p->p_sigmask & sigmask(SIGTTIN)) || + p->p_flag & P_PPWAIT || p->p_pgrp->pg_jobc == 0) + return (EIO); + pgsignal(p->p_pgrp, SIGTTIN, 1); + if (error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, ttybg, 0)) + return (error); + goto loop; + } + + /* + * If canonical, use the canonical queue, + * else use the raw queue. + * + * (should get rid of clists...) + */ + qp = ISSET(lflag, ICANON) ? &tp->t_canq : &tp->t_rawq; + + /* + * If there is no input, sleep on rawq + * awaiting hardware receipt and notification. + * If we have data, we don't need to check for carrier. 
+ */ + s = spltty(); + if (qp->c_cc <= 0) { + int carrier; + + carrier = ISSET(tp->t_state, TS_CARR_ON) || + ISSET(tp->t_cflag, CLOCAL); + if (!carrier && ISSET(tp->t_state, TS_ISOPEN)) { + splx(s); + return (0); /* EOF */ + } + if (flag & IO_NDELAY) { + splx(s); + return (EWOULDBLOCK); + } + error = ttysleep(tp, &tp->t_rawq, TTIPRI | PCATCH, + carrier ? ttyin : ttopen, 0); + splx(s); + if (error) + return (error); + goto loop; + } + splx(s); + + /* + * Input present, check for input mapping and processing. + */ + first = 1; + while ((c = getc(qp)) >= 0) { + /* + * delayed suspend (^Y) + */ + if (CCEQ(cc[VDSUSP], c) && ISSET(lflag, ISIG)) { + pgsignal(tp->t_pgrp, SIGTSTP, 1); + if (first) { + if (error = ttysleep(tp, + &lbolt, TTIPRI | PCATCH, ttybg, 0)) + break; + goto loop; + } + break; + } + /* + * Interpret EOF only in canonical mode. + */ + if (CCEQ(cc[VEOF], c) && ISSET(lflag, ICANON)) + break; + /* + * Give user character. + */ + error = ureadc(c, uio); + if (error) + break; + if (uio->uio_resid == 0) + break; + /* + * In canonical mode check for a "break character" + * marking the end of a "line of input". + */ + if (ISSET(lflag, ICANON) && TTBREAKC(c)) + break; + first = 0; + } + /* + * Look to unblock output now that (presumably) + * the input queue has gone down. + */ + s = spltty(); + if (ISSET(tp->t_state, TS_TBLOCK) && tp->t_rawq.c_cc < TTYHOG/5) { + if (cc[VSTART] != _POSIX_VDISABLE && + putc(cc[VSTART], &tp->t_outq) == 0) { + CLR(tp->t_state, TS_TBLOCK); + ttstart(tp); + } + } + splx(s); + return (error); +} + +/* + * Check the output queue on tp for space for a kernel message (from uprintf + * or tprintf). Allow some space over the normal hiwater mark so we don't + * lose messages due to normal flow control, but don't let the tty run amok. + * Sleeps here are not interruptible, but we return prematurely if new signals + * arrive. 
+ */ +int +ttycheckoutq(tp, wait) + register struct tty *tp; + int wait; +{ + int hiwat, s, oldsig; + + hiwat = tp->t_hiwat; + s = spltty(); + oldsig = wait ? curproc->p_siglist : 0; + if (tp->t_outq.c_cc > hiwat + 200) + while (tp->t_outq.c_cc > hiwat) { + ttstart(tp); + if (wait == 0 || curproc->p_siglist != oldsig) { + splx(s); + return (0); + } + timeout((void (*)__P((void *)))wakeup, + (void *)&tp->t_outq, hz); + SET(tp->t_state, TS_ASLEEP); + sleep((caddr_t)&tp->t_outq, PZERO - 1); + } + splx(s); + return (1); +} + +/* + * Process a write call on a tty device. + */ +int +ttwrite(tp, uio, flag) + register struct tty *tp; + register struct uio *uio; + int flag; +{ + register char *cp; + register int cc, ce; + register struct proc *p; + int i, hiwat, cnt, error, s; + char obuf[OBUFSIZ]; + + hiwat = tp->t_hiwat; + cnt = uio->uio_resid; + error = 0; + cc = 0; +loop: + s = spltty(); + if (!ISSET(tp->t_state, TS_CARR_ON) && + !ISSET(tp->t_cflag, CLOCAL)) { + if (ISSET(tp->t_state, TS_ISOPEN)) { + splx(s); + return (EIO); + } else if (flag & IO_NDELAY) { + splx(s); + error = EWOULDBLOCK; + goto out; + } else { + /* Sleep awaiting carrier. */ + error = ttysleep(tp, + &tp->t_rawq, TTIPRI | PCATCH,ttopen, 0); + splx(s); + if (error) + goto out; + goto loop; + } + } + splx(s); + /* + * Hang the process if it's in the background. + */ + p = curproc; + if (isbackground(p, tp) && + ISSET(tp->t_lflag, TOSTOP) && (p->p_flag & P_PPWAIT) == 0 && + (p->p_sigignore & sigmask(SIGTTOU)) == 0 && + (p->p_sigmask & sigmask(SIGTTOU)) == 0 && + p->p_pgrp->pg_jobc) { + pgsignal(p->p_pgrp, SIGTTOU, 1); + if (error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, ttybg, 0)) + goto out; + goto loop; + } + /* + * Process the user's data in at most OBUFSIZ chunks. Perform any + * output translation. Keep track of high water mark, sleep on + * overflow awaiting device aid in acquiring new space. 
+ */ + while (uio->uio_resid > 0 || cc > 0) { + if (ISSET(tp->t_lflag, FLUSHO)) { + uio->uio_resid = 0; + return (0); + } + if (tp->t_outq.c_cc > hiwat) + goto ovhiwat; + /* + * Grab a hunk of data from the user, unless we have some + * leftover from last time. + */ + if (cc == 0) { + cc = min(uio->uio_resid, OBUFSIZ); + cp = obuf; + error = uiomove(cp, cc, uio); + if (error) { + cc = 0; + break; + } + } + /* + * If nothing fancy need be done, grab those characters we + * can handle without any of ttyoutput's processing and + * just transfer them to the output q. For those chars + * which require special processing (as indicated by the + * bits in char_type), call ttyoutput. After processing + * a hunk of data, look for FLUSHO so ^O's will take effect + * immediately. + */ + while (cc > 0) { + if (!ISSET(tp->t_oflag, OPOST)) + ce = cc; + else { + ce = cc - scanc((u_int)cc, (u_char *)cp, + (u_char *)char_type, CCLASSMASK); + /* + * If ce is zero, then we're processing + * a special character through ttyoutput. + */ + if (ce == 0) { + tp->t_rocount = 0; + if (ttyoutput(*cp, tp) >= 0) { + /* No Clists, wait a bit. */ + ttstart(tp); + if (error = ttysleep(tp, &lbolt, + TTOPRI | PCATCH, ttybuf, 0)) + break; + goto loop; + } + cp++; + cc--; + if (ISSET(tp->t_lflag, FLUSHO) || + tp->t_outq.c_cc > hiwat) + goto ovhiwat; + continue; + } + } + /* + * A bunch of normal characters have been found. + * Transfer them en masse to the output queue and + * continue processing at the top of the loop. + * If there are any further characters in this + * <= OBUFSIZ chunk, the first should be a character + * requiring special handling by ttyoutput. + */ + tp->t_rocount = 0; + i = b_to_q(cp, ce, &tp->t_outq); + ce -= i; + tp->t_column += ce; + cp += ce, cc -= ce, tk_nout += ce; + tp->t_outcc += ce; + if (i > 0) { + /* No Clists, wait a bit. 
*/ + ttstart(tp); + if (error = ttysleep(tp, + &lbolt, TTOPRI | PCATCH, ttybuf, 0)) + break; + goto loop; + } + if (ISSET(tp->t_lflag, FLUSHO) || + tp->t_outq.c_cc > hiwat) + break; + } + ttstart(tp); + } +out: + /* + * If cc is nonzero, we leave the uio structure inconsistent, as the + * offset and iov pointers have moved forward, but it doesn't matter + * (the call will either return short or restart with a new uio). + */ + uio->uio_resid += cc; + return (error); + +ovhiwat: + ttstart(tp); + s = spltty(); + /* + * This can only occur if FLUSHO is set in t_lflag, + * or if ttstart/oproc is synchronous (or very fast). + */ + if (tp->t_outq.c_cc <= hiwat) { + splx(s); + goto loop; + } + if (flag & IO_NDELAY) { + splx(s); + uio->uio_resid += cc; + return (uio->uio_resid == cnt ? EWOULDBLOCK : 0); + } + SET(tp->t_state, TS_ASLEEP); + error = ttysleep(tp, &tp->t_outq, TTOPRI | PCATCH, ttyout, 0); + splx(s); + if (error) + goto out; + goto loop; +} + +/* + * Rubout one character from the rawq of tp + * as cleanly as possible. 
+ */ +void +ttyrub(c, tp) + register int c; + register struct tty *tp; +{ + register char *cp; + register int savecol; + int tabc, s; + + if (!ISSET(tp->t_lflag, ECHO) || ISSET(tp->t_lflag, EXTPROC)) + return; + CLR(tp->t_lflag, FLUSHO); + if (ISSET(tp->t_lflag, ECHOE)) { + if (tp->t_rocount == 0) { + /* + * Screwed by ttwrite; retype + */ + ttyretype(tp); + return; + } + if (c == ('\t' | TTY_QUOTE) || c == ('\n' | TTY_QUOTE)) + ttyrubo(tp, 2); + else { + CLR(c, ~TTY_CHARMASK); + switch (CCLASS(c)) { + case ORDINARY: + ttyrubo(tp, 1); + break; + case BACKSPACE: + case CONTROL: + case NEWLINE: + case RETURN: + case VTAB: + if (ISSET(tp->t_lflag, ECHOCTL)) + ttyrubo(tp, 2); + break; + case TAB: + if (tp->t_rocount < tp->t_rawq.c_cc) { + ttyretype(tp); + return; + } + s = spltty(); + savecol = tp->t_column; + SET(tp->t_state, TS_CNTTB); + SET(tp->t_lflag, FLUSHO); + tp->t_column = tp->t_rocol; + cp = tp->t_rawq.c_cf; + if (cp) + tabc = *cp; /* XXX FIX NEXTC */ + for (; cp; cp = nextc(&tp->t_rawq, cp, &tabc)) + ttyecho(tabc, tp); + CLR(tp->t_lflag, FLUSHO); + CLR(tp->t_state, TS_CNTTB); + splx(s); + + /* savecol will now be length of the tab. */ + savecol -= tp->t_column; + tp->t_column += savecol; + if (savecol > 8) + savecol = 8; /* overflow screw */ + while (--savecol >= 0) + (void)ttyoutput('\b', tp); + break; + default: /* XXX */ +#define PANICSTR "ttyrub: would panic c = %d, val = %d\n" + (void)printf(PANICSTR, c, CCLASS(c)); +#ifdef notdef + panic(PANICSTR, c, CCLASS(c)); +#endif + } + } + } else if (ISSET(tp->t_lflag, ECHOPRT)) { + if (!ISSET(tp->t_state, TS_ERASE)) { + SET(tp->t_state, TS_ERASE); + (void)ttyoutput('\\', tp); + } + ttyecho(c, tp); + } else + ttyecho(tp->t_cc[VERASE], tp); + --tp->t_rocount; +} + +/* + * Back over cnt characters, erasing them. 
+ */ +static void +ttyrubo(tp, cnt) + register struct tty *tp; + int cnt; +{ + + while (cnt-- > 0) { + (void)ttyoutput('\b', tp); + (void)ttyoutput(' ', tp); + (void)ttyoutput('\b', tp); + } +} + +/* + * ttyretype -- + * Reprint the rawq line. Note, it is assumed that c_cc has already + * been checked. + */ +void +ttyretype(tp) + register struct tty *tp; +{ + register char *cp; + int s, c; + + /* Echo the reprint character. */ + if (tp->t_cc[VREPRINT] != _POSIX_VDISABLE) + ttyecho(tp->t_cc[VREPRINT], tp); + + (void)ttyoutput('\n', tp); + + /* + * XXX + * FIX: NEXTC IS BROKEN - DOESN'T CHECK QUOTE + * BIT OF FIRST CHAR. + */ + s = spltty(); + for (cp = tp->t_canq.c_cf, c = (cp != NULL ? *cp : 0); + cp != NULL; cp = nextc(&tp->t_canq, cp, &c)) + ttyecho(c, tp); + for (cp = tp->t_rawq.c_cf, c = (cp != NULL ? *cp : 0); + cp != NULL; cp = nextc(&tp->t_rawq, cp, &c)) + ttyecho(c, tp); + CLR(tp->t_state, TS_ERASE); + splx(s); + + tp->t_rocount = tp->t_rawq.c_cc; + tp->t_rocol = 0; +} + +/* + * Echo a typed character to the terminal. + */ +static void +ttyecho(c, tp) + register int c; + register struct tty *tp; +{ + + if (!ISSET(tp->t_state, TS_CNTTB)) + CLR(tp->t_lflag, FLUSHO); + if ((!ISSET(tp->t_lflag, ECHO) && + (!ISSET(tp->t_lflag, ECHONL) || c == '\n')) || + ISSET(tp->t_lflag, EXTPROC)) + return; + if (ISSET(tp->t_lflag, ECHOCTL) && + (ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n' || + ISSET(c, TTY_CHARMASK) == 0177)) { + (void)ttyoutput('^', tp); + CLR(c, ~TTY_CHARMASK); + if (c == 0177) + c = '?'; + else + c += 'A' - 1; + } + (void)ttyoutput(c, tp); +} + +/* + * Wake up any readers on a tty. + */ +void +ttwakeup(tp) + register struct tty *tp; +{ + + selwakeup(&tp->t_rsel); + if (ISSET(tp->t_state, TS_ASYNC)) + pgsignal(tp->t_pgrp, SIGIO, 1); + wakeup((caddr_t)&tp->t_rawq); +} + +/* + * Look up a code for a specified speed in a conversion table; + * used by drivers to map software speed values to hardware parameters. 
+ */ +int +ttspeedtab(speed, table) + int speed; + register struct speedtab *table; +{ + + for ( ; table->sp_speed != -1; table++) + if (table->sp_speed == speed) + return (table->sp_code); + return (-1); +} + +/* + * Set tty hi and low water marks. + * + * Try to arrange the dynamics so there's about one second + * from hi to low water. + * + */ +void +ttsetwater(tp) + struct tty *tp; +{ + register int cps, x; + +#define CLAMP(x, h, l) ((x) > h ? h : ((x) < l) ? l : (x)) + + cps = tp->t_ospeed / 10; + tp->t_lowat = x = CLAMP(cps / 2, TTMAXLOWAT, TTMINLOWAT); + x += cps; + x = CLAMP(x, TTMAXHIWAT, TTMINHIWAT); + tp->t_hiwat = roundup(x, CBSIZE); +#undef CLAMP +} + +/* + * Report on state of foreground process group. + */ +void +ttyinfo(tp) + register struct tty *tp; +{ + register struct proc *p, *pick; + struct timeval utime, stime; + int tmp; + + if (ttycheckoutq(tp,0) == 0) + return; + + /* Print load average. */ + tmp = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT; + ttyprintf(tp, "load: %d.%02d ", tmp / 100, tmp % 100); + + if (tp->t_session == NULL) + ttyprintf(tp, "not a controlling terminal\n"); + else if (tp->t_pgrp == NULL) + ttyprintf(tp, "no foreground process group\n"); + else if ((p = tp->t_pgrp->pg_mem) == NULL) + ttyprintf(tp, "empty foreground process group\n"); + else { + /* Pick interesting process. */ + for (pick = NULL; p != NULL; p = p->p_pgrpnxt) + if (proc_compare(pick, p)) + pick = p; + + ttyprintf(tp, " cmd: %s %d [%s] ", pick->p_comm, pick->p_pid, + pick->p_stat == SRUN ? "running" : + pick->p_wmesg ? pick->p_wmesg : "iowait"); + + calcru(pick, &utime, &stime, NULL); + + /* Print user time. */ + ttyprintf(tp, "%d.%02du ", + utime.tv_sec, (utime.tv_usec + 5000) / 10000); + + /* Print system time. */ + ttyprintf(tp, "%d.%02ds ", + stime.tv_sec, (stime.tv_usec + 5000) / 10000); + +#define pgtok(a) (((a) * NBPG) / 1024) + /* Print percentage cpu, resident set size. 
*/ + tmp = pick->p_pctcpu * 10000 + FSCALE / 2 >> FSHIFT; + ttyprintf(tp, "%d%% %dk\n", + tmp / 100, + pick->p_stat == SIDL || pick->p_stat == SZOMB ? 0 : +#ifdef pmap_resident_count + pgtok(pmap_resident_count(&pick->p_vmspace->vm_pmap)) +#else + pgtok(pick->p_vmspace->vm_rssize) +#endif + ); + } + tp->t_rocount = 0; /* so pending input will be retyped if BS */ +} + +/* + * Returns 1 if p2 is "better" than p1 + * + * The algorithm for picking the "interesting" process is thus: + * + * 1) Only foreground processes are eligible - implied. + * 2) Runnable processes are favored over anything else. The runner + * with the highest cpu utilization is picked (p_estcpu). Ties are + * broken by picking the highest pid. + * 3) The sleeper with the shortest sleep time is next. With ties, + * we pick out just "short-term" sleepers (P_SINTR == 0). + * 4) Further ties are broken by picking the highest pid. + */ +#define ISRUN(p) (((p)->p_stat == SRUN) || ((p)->p_stat == SIDL)) +#define TESTAB(a, b) ((a)<<1 | (b)) +#define ONLYA 2 +#define ONLYB 1 +#define BOTH 3 + +static int +proc_compare(p1, p2) + register struct proc *p1, *p2; +{ + + if (p1 == NULL) + return (1); + /* + * see if at least one of them is runnable + */ + switch (TESTAB(ISRUN(p1), ISRUN(p2))) { + case ONLYA: + return (0); + case ONLYB: + return (1); + case BOTH: + /* + * tie - favor one with highest recent cpu utilization + */ + if (p2->p_estcpu > p1->p_estcpu) + return (1); + if (p1->p_estcpu > p2->p_estcpu) + return (0); + return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ + } + /* + * weed out zombies + */ + switch (TESTAB(p1->p_stat == SZOMB, p2->p_stat == SZOMB)) { + case ONLYA: + return (1); + case ONLYB: + return (0); + case BOTH: + return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ + } + /* + * pick the one with the smallest sleep time + */ + if (p2->p_slptime > p1->p_slptime) + return (0); + if (p1->p_slptime > p2->p_slptime) + return (1); + /* + * favor one sleeping in a 
non-interruptible sleep + */ + if (p1->p_flag & P_SINTR && (p2->p_flag & P_SINTR) == 0) + return (1); + if (p2->p_flag & P_SINTR && (p1->p_flag & P_SINTR) == 0) + return (0); + return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ +} + +/* + * Output char to tty; console putchar style. + */ +int +tputchar(c, tp) + int c; + struct tty *tp; +{ + register int s; + + s = spltty(); + if (ISSET(tp->t_state, + TS_CARR_ON | TS_ISOPEN) != (TS_CARR_ON | TS_ISOPEN)) { + splx(s); + return (-1); + } + if (c == '\n') + (void)ttyoutput('\r', tp); + (void)ttyoutput(c, tp); + ttstart(tp); + splx(s); + return (0); +} + +/* + * Sleep on chan, returning ERESTART if tty changed while we napped and + * returning any errors (e.g. EINTR/ETIMEDOUT) reported by tsleep. If + * the tty is revoked, restarting a pending call will redo validation done + * at the start of the call. + */ +int +ttysleep(tp, chan, pri, wmesg, timo) + struct tty *tp; + void *chan; + int pri, timo; + char *wmesg; +{ + int error; + short gen; + + gen = tp->t_gen; + if (error = tsleep(chan, pri, wmesg, timo)) + return (error); + return (tp->t_gen == gen ? 0 : ERESTART); +} diff --git a/sys/kern/tty_compat.c b/sys/kern/tty_compat.c new file mode 100644 index 00000000000..a6a39d9d7bf --- /dev/null +++ b/sys/kern/tty_compat.c @@ -0,0 +1,411 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tty_compat.c 8.1 (Berkeley) 6/10/93 + */ + +/* + * mapping routines for old line discipline (yuck) + */ +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int ttydebug = 0; + +static struct speedtab compatspeeds[] = { + { 38400, 15 }, + { 19200, 14 }, + { 9600, 13 }, + { 4800, 12 }, + { 2400, 11 }, + { 1800, 10 }, + { 1200, 9 }, + { 600, 8 }, + { 300, 7 }, + { 200, 6 }, + { 150, 5 }, + { 134, 4 }, + { 110, 3 }, + { 75, 2 }, + { 50, 1 }, + { 0, 0 }, + { -1, -1 }, +}; +static int compatspcodes[16] = { + 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, + 1800, 2400, 4800, 9600, 19200, 38400, +}; + +/*ARGSUSED*/ +ttcompat(tp, com, data, flag) + register struct tty *tp; + int com; + caddr_t data; + int flag; +{ + + switch (com) { + case TIOCGETP: { + register struct sgttyb *sg = (struct sgttyb *)data; + register u_char *cc = tp->t_cc; + register speed; + + speed = ttspeedtab(tp->t_ospeed, compatspeeds); + sg->sg_ospeed = (speed == -1) ? 15 : speed; + if (tp->t_ispeed == 0) + sg->sg_ispeed = sg->sg_ospeed; + else { + speed = ttspeedtab(tp->t_ispeed, compatspeeds); + sg->sg_ispeed = (speed == -1) ? 15 : speed; + } + sg->sg_erase = cc[VERASE]; + sg->sg_kill = cc[VKILL]; + sg->sg_flags = ttcompatgetflags(tp); + break; + } + + case TIOCSETP: + case TIOCSETN: { + register struct sgttyb *sg = (struct sgttyb *)data; + struct termios term; + int speed; + + term = tp->t_termios; + if ((speed = sg->sg_ispeed) > 15 || speed < 0) + term.c_ispeed = speed; + else + term.c_ispeed = compatspcodes[speed]; + if ((speed = sg->sg_ospeed) > 15 || speed < 0) + term.c_ospeed = speed; + else + term.c_ospeed = compatspcodes[speed]; + term.c_cc[VERASE] = sg->sg_erase; + term.c_cc[VKILL] = sg->sg_kill; + tp->t_flags = tp->t_flags&0xffff0000 | sg->sg_flags&0xffff; + ttcompatsetflags(tp, &term); + return (ttioctl(tp, com == TIOCSETP ? 
TIOCSETAF : TIOCSETA, + &term, flag)); + } + + case TIOCGETC: { + struct tchars *tc = (struct tchars *)data; + register u_char *cc = tp->t_cc; + + tc->t_intrc = cc[VINTR]; + tc->t_quitc = cc[VQUIT]; + tc->t_startc = cc[VSTART]; + tc->t_stopc = cc[VSTOP]; + tc->t_eofc = cc[VEOF]; + tc->t_brkc = cc[VEOL]; + break; + } + case TIOCSETC: { + struct tchars *tc = (struct tchars *)data; + register u_char *cc = tp->t_cc; + + cc[VINTR] = tc->t_intrc; + cc[VQUIT] = tc->t_quitc; + cc[VSTART] = tc->t_startc; + cc[VSTOP] = tc->t_stopc; + cc[VEOF] = tc->t_eofc; + cc[VEOL] = tc->t_brkc; + if (tc->t_brkc == -1) + cc[VEOL2] = _POSIX_VDISABLE; + break; + } + case TIOCSLTC: { + struct ltchars *ltc = (struct ltchars *)data; + register u_char *cc = tp->t_cc; + + cc[VSUSP] = ltc->t_suspc; + cc[VDSUSP] = ltc->t_dsuspc; + cc[VREPRINT] = ltc->t_rprntc; + cc[VDISCARD] = ltc->t_flushc; + cc[VWERASE] = ltc->t_werasc; + cc[VLNEXT] = ltc->t_lnextc; + break; + } + case TIOCGLTC: { + struct ltchars *ltc = (struct ltchars *)data; + register u_char *cc = tp->t_cc; + + ltc->t_suspc = cc[VSUSP]; + ltc->t_dsuspc = cc[VDSUSP]; + ltc->t_rprntc = cc[VREPRINT]; + ltc->t_flushc = cc[VDISCARD]; + ltc->t_werasc = cc[VWERASE]; + ltc->t_lnextc = cc[VLNEXT]; + break; + } + case TIOCLBIS: + case TIOCLBIC: + case TIOCLSET: { + struct termios term; + + term = tp->t_termios; + if (com == TIOCLSET) + tp->t_flags = (tp->t_flags&0xffff) | *(int *)data<<16; + else { + tp->t_flags = + (ttcompatgetflags(tp)&0xffff0000)|(tp->t_flags&0xffff); + if (com == TIOCLBIS) + tp->t_flags |= *(int *)data<<16; + else + tp->t_flags &= ~(*(int *)data<<16); + } + ttcompatsetlflags(tp, &term); + return (ttioctl(tp, TIOCSETA, &term, flag)); + } + case TIOCLGET: + *(int *)data = ttcompatgetflags(tp)>>16; + if (ttydebug) + printf("CLGET: returning %x\n", *(int *)data); + break; + + case OTIOCGETD: + *(int *)data = tp->t_line ? 
tp->t_line : 2; + break; + + case OTIOCSETD: { + int ldisczero = 0; + + return (ttioctl(tp, TIOCSETD, + *(int *)data == 2 ? (caddr_t)&ldisczero : data, flag)); + } + + case OTIOCCONS: + *(int *)data = 1; + return (ttioctl(tp, TIOCCONS, data, flag)); + + default: + return (-1); + } + return (0); +} + +ttcompatgetflags(tp) + register struct tty *tp; +{ + register long iflag = tp->t_iflag; + register long lflag = tp->t_lflag; + register long oflag = tp->t_oflag; + register long cflag = tp->t_cflag; + register flags = 0; + + if (iflag&IXOFF) + flags |= TANDEM; + if (iflag&ICRNL || oflag&ONLCR) + flags |= CRMOD; + if (cflag&PARENB) { + if (iflag&INPCK) { + if (cflag&PARODD) + flags |= ODDP; + else + flags |= EVENP; + } else + flags |= EVENP | ODDP; + } else { + if ((tp->t_flags&LITOUT) && !(oflag&OPOST)) + flags |= LITOUT; + if (tp->t_flags&PASS8) + flags |= PASS8; + } + + if ((lflag&ICANON) == 0) { + /* fudge */ + if (iflag&IXON || lflag&ISIG || lflag&IEXTEN || cflag&PARENB) + flags |= CBREAK; + else + flags |= RAW; + } + if (cflag&MDMBUF) + flags |= MDMBUF; + if ((cflag&HUPCL) == 0) + flags |= NOHANG; + if (oflag&OXTABS) + flags |= XTABS; + if (lflag&ECHOE) + flags |= CRTERA|CRTBS; + if (lflag&ECHOKE) + flags |= CRTKIL|CRTBS; + if (lflag&ECHOPRT) + flags |= PRTERA; + if (lflag&ECHOCTL) + flags |= CTLECH; + if ((iflag&IXANY) == 0) + flags |= DECCTQ; + flags |= lflag&(ECHO|TOSTOP|FLUSHO|PENDIN|NOFLSH); +if (ttydebug) + printf("getflags: %x\n", flags); + return (flags); +} + +ttcompatsetflags(tp, t) + register struct tty *tp; + register struct termios *t; +{ + register flags = tp->t_flags; + register long iflag = t->c_iflag; + register long oflag = t->c_oflag; + register long lflag = t->c_lflag; + register long cflag = t->c_cflag; + + if (flags & RAW) { + iflag &= IXOFF; + oflag &= ~OPOST; + lflag &= ~(ECHOCTL|ISIG|ICANON|IEXTEN); + } else { + iflag |= BRKINT|IXON|IMAXBEL; + oflag |= OPOST; + lflag |= ISIG|IEXTEN|ECHOCTL; /* XXX was echoctl on ? 
*/ + if (flags & XTABS) + oflag |= OXTABS; + else + oflag &= ~OXTABS; + if (flags & CBREAK) + lflag &= ~ICANON; + else + lflag |= ICANON; + if (flags&CRMOD) { + iflag |= ICRNL; + oflag |= ONLCR; + } else { + iflag &= ~ICRNL; + oflag &= ~ONLCR; + } + } + if (flags&ECHO) + lflag |= ECHO; + else + lflag &= ~ECHO; + + if (flags&(RAW|LITOUT|PASS8)) { + cflag &= ~(CSIZE|PARENB); + cflag |= CS8; + if ((flags&(RAW|PASS8)) == 0) + iflag |= ISTRIP; + else + iflag &= ~ISTRIP; + } else { + cflag &= ~CSIZE; + cflag |= CS7|PARENB; + iflag |= ISTRIP; + } + if ((flags&(EVENP|ODDP)) == EVENP) { + iflag |= INPCK; + cflag &= ~PARODD; + } else if ((flags&(EVENP|ODDP)) == ODDP) { + iflag |= INPCK; + cflag |= PARODD; + } else + iflag &= ~INPCK; + if (flags&LITOUT) + oflag &= ~OPOST; /* move earlier ? */ + if (flags&TANDEM) + iflag |= IXOFF; + else + iflag &= ~IXOFF; + t->c_iflag = iflag; + t->c_oflag = oflag; + t->c_lflag = lflag; + t->c_cflag = cflag; +} + +ttcompatsetlflags(tp, t) + register struct tty *tp; + register struct termios *t; +{ + register flags = tp->t_flags; + register long iflag = t->c_iflag; + register long oflag = t->c_oflag; + register long lflag = t->c_lflag; + register long cflag = t->c_cflag; + + if (flags&CRTERA) + lflag |= ECHOE; + else + lflag &= ~ECHOE; + if (flags&CRTKIL) + lflag |= ECHOKE; + else + lflag &= ~ECHOKE; + if (flags&PRTERA) + lflag |= ECHOPRT; + else + lflag &= ~ECHOPRT; + if (flags&CTLECH) + lflag |= ECHOCTL; + else + lflag &= ~ECHOCTL; + if ((flags&DECCTQ) == 0) + iflag |= IXANY; + else + iflag &= ~IXANY; + if (flags & MDMBUF) + cflag |= MDMBUF; + else + cflag &= ~MDMBUF; + if (flags&NOHANG) + cflag &= ~HUPCL; + else + cflag |= HUPCL; + lflag &= ~(TOSTOP|FLUSHO|PENDIN|NOFLSH); + lflag |= flags&(TOSTOP|FLUSHO|PENDIN|NOFLSH); + if (flags&(LITOUT|PASS8)) { + iflag &= ~ISTRIP; + cflag &= ~(CSIZE|PARENB); + cflag |= CS8; + if (flags&LITOUT) + oflag &= ~OPOST; + if ((flags&(PASS8|RAW)) == 0) + iflag |= ISTRIP; + } else if ((flags&RAW) == 0) { + cflag 
&= ~CSIZE; + cflag |= CS7|PARENB; + oflag |= OPOST; + } + t->c_iflag = iflag; + t->c_oflag = oflag; + t->c_lflag = lflag; + t->c_cflag = cflag; +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ diff --git a/sys/kern/tty_conf.c b/sys/kern/tty_conf.c new file mode 100644 index 00000000000..b53edb42975 --- /dev/null +++ b/sys/kern/tty_conf.c @@ -0,0 +1,126 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty_conf.c 8.4 (Berkeley) 1/21/94 + */ + +#include +#include +#include +#include +#include +#include +#include + +#define ttynodisc ((int (*) __P((dev_t, struct tty *)))enodev) +#define ttyerrclose ((int (*) __P((struct tty *, int flags)))enodev) +#define ttyerrio ((int (*) __P((struct tty *, struct uio *, int)))enodev) +#define ttyerrinput ((int (*) __P((int c, struct tty *)))enodev) +#define ttyerrstart ((int (*) __P((struct tty *)))enodev) + +int nullioctl __P((struct tty *tp, int cmd, caddr_t data, + int flag, struct proc *p)); + +#include "tb.h" +#if NTB > 0 +int tbopen __P((dev_t dev, struct tty *tp)); +int tbclose __P((struct tty *tp, int flags)); +int tbread __P((struct tty *, struct uio *, int flags)); +int tbioctl __P((struct tty *tp, int cmd, caddr_t data, + int flag, struct proc *p)); +int tbinput __P((int c, struct tty *tp)); +#endif + +#include "sl.h" +#if NSL > 0 +int slopen __P((dev_t dev, struct tty *tp)); +int slclose __P((struct tty *tp, int flags)); +int sltioctl __P((struct tty *tp, int cmd, caddr_t data, + int flag, struct proc *p)); +int slinput __P((int c, struct tty *tp)); +int slstart __P((struct tty *tp)); +#endif + + +struct linesw linesw[] = +{ + { ttyopen, ttylclose, ttread, ttwrite, nullioctl, + ttyinput, ttstart, ttymodem }, /* 0- termios */ + + { ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl, + ttyerrinput, ttyerrstart, nullmodem }, /* 1- defunct */ + + { ttynodisc, ttyerrclose, ttyerrio, 
ttyerrio, nullioctl, + ttyerrinput, ttyerrstart, nullmodem }, /* 2- defunct */ + +#if NTB > 0 + { tbopen, tbclose, tbread, enodev, tbioctl, + tbinput, ttstart, nullmodem }, /* 3- TABLDISC */ +#else + { ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl, + ttyerrinput, ttyerrstart, nullmodem }, +#endif + +#if NSL > 0 + { slopen, slclose, ttyerrio, ttyerrio, sltioctl, + slinput, slstart, nullmodem }, /* 4- SLIPDISC */ +#else + { ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl, + ttyerrinput, ttyerrstart, nullmodem }, +#endif +}; + +int nlinesw = sizeof (linesw) / sizeof (linesw[0]); + +/* + * Do nothing specific version of line + * discipline specific ioctl command. + */ +/*ARGSUSED*/ +nullioctl(tp, cmd, data, flags, p) + struct tty *tp; + int cmd; + char *data; + int flags; + struct proc *p; +{ + +#ifdef lint + tp = tp; data = data; flags = flags; p = p; +#endif + return (-1); +} diff --git a/sys/kern/tty_pty.c b/sys/kern/tty_pty.c new file mode 100644 index 00000000000..0e6911b63e1 --- /dev/null +++ b/sys/kern/tty_pty.c @@ -0,0 +1,691 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)tty_pty.c	8.2 (Berkeley) 9/23/93
+ */
+
+/*
+ * Pseudo-teletype Driver
+ * (Actually two drivers, requiring two entries in 'cdevsw')
+ */
+#include "pty.h"		/* XXX */
+/*
+ * NOTE(review): the header names after each #include below are missing
+ * in this copy of the file.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+/*
+ * A configured count of exactly 1 is replaced by 32, so the static
+ * tables below are sized for 32 ptys in that case.
+ */
+#if NPTY == 1
+#undef NPTY
+#define	NPTY	32		/* crude XXX */
+#endif
+
+#define BUFSIZ 100		/* Chunk size iomoved to/from user */
+
+/*
+ * pts == /dev/tty[pqrs]?
+ * ptc == /dev/pty[pqrs]?
+ *
+ * Both tables are indexed by minor(dev); pt_tty[] holds the tty state and
+ * pt_ioctl[] the extra per-pty state used by the ptc (controller) side.
+ */
+struct	tty pt_tty[NPTY];	/* XXX */
+struct	pt_ioctl {
+	int	pt_flags;			/* PF_* bits, see below */
+	struct	selinfo pt_selr, pt_selw;	/* select records used by ptcselect/ptcwakeup */
+	u_char	pt_send;			/* pending TIOCPKT_* bits, returned by ptcread */
+	u_char	pt_ucntl;			/* pending user-control byte, returned by ptcread */
+} pt_ioctl[NPTY];		/* XXX */
+int	npty = NPTY;		/* for pstat -t */
+/* Bits kept in pt_ioctl.pt_flags. */
+#define	PF_PKT		0x08		/* packet mode */
+#define	PF_STOPPED	0x10		/* user told stopped */
+#define	PF_REMOTE	0x20		/* remote and flow controlled input */
+#define	PF_NOSTOP	0x40		/* set by ptyioctl while IXON with ^S/^Q is not in effect */
+#define PF_UCNTL	0x80		/* user control mode */
+
+void	ptsstop __P((struct tty *, int));
+
+/*
+ * Establish n (or default if n is 1) ptys in the system.
+ * + * XXX cdevsw & pstat require the array `pty[]' to be an array + */ +void +ptyattach(n) + int n; +{ +#ifdef notyet + char *mem; + register u_long ntb; +#define DEFAULT_NPTY 32 + + /* maybe should allow 0 => none? */ + if (n <= 1) + n = DEFAULT_NPTY; + ntb = n * sizeof(struct tty); + mem = malloc(ntb + ALIGNBYTES + n * sizeof(struct pt_ioctl), + M_DEVBUF, M_WAITOK); + pt_tty = (struct tty *)mem; + mem = (char *)ALIGN(mem + ntb); + pt_ioctl = (struct pt_ioctl *)mem; + npty = n; +#endif +} + +/*ARGSUSED*/ +ptsopen(dev, flag, devtype, p) + dev_t dev; + int flag, devtype; + struct proc *p; +{ + register struct tty *tp; + int error; + + if (minor(dev) >= npty) + return (ENXIO); + tp = &pt_tty[minor(dev)]; + if ((tp->t_state & TS_ISOPEN) == 0) { + tp->t_state |= TS_WOPEN; + ttychars(tp); /* Set up default chars */ + tp->t_iflag = TTYDEF_IFLAG; + tp->t_oflag = TTYDEF_OFLAG; + tp->t_lflag = TTYDEF_LFLAG; + tp->t_cflag = TTYDEF_CFLAG; + tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED; + ttsetwater(tp); /* would be done in xxparam() */ + } else if (tp->t_state&TS_XCLUDE && p->p_ucred->cr_uid != 0) + return (EBUSY); + if (tp->t_oproc) /* Ctrlr still around. 
*/ + tp->t_state |= TS_CARR_ON; + while ((tp->t_state & TS_CARR_ON) == 0) { + tp->t_state |= TS_WOPEN; + if (flag&FNONBLOCK) + break; + if (error = ttysleep(tp, (caddr_t)&tp->t_rawq, TTIPRI | PCATCH, + ttopen, 0)) + return (error); + } + error = (*linesw[tp->t_line].l_open)(dev, tp); + ptcwakeup(tp, FREAD|FWRITE); + return (error); +} + +ptsclose(dev, flag, mode, p) + dev_t dev; + int flag, mode; + struct proc *p; +{ + register struct tty *tp; + int err; + + tp = &pt_tty[minor(dev)]; + err = (*linesw[tp->t_line].l_close)(tp, flag); + err |= ttyclose(tp); + ptcwakeup(tp, FREAD|FWRITE); + return (err); +} + +ptsread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + struct proc *p = curproc; + register struct tty *tp = &pt_tty[minor(dev)]; + register struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; + int error = 0; + +again: + if (pti->pt_flags & PF_REMOTE) { + while (isbackground(p, tp)) { + if ((p->p_sigignore & sigmask(SIGTTIN)) || + (p->p_sigmask & sigmask(SIGTTIN)) || + p->p_pgrp->pg_jobc == 0 || + p->p_flag & P_PPWAIT) + return (EIO); + pgsignal(p->p_pgrp, SIGTTIN, 1); + if (error = ttysleep(tp, (caddr_t)&lbolt, + TTIPRI | PCATCH, ttybg, 0)) + return (error); + } + if (tp->t_canq.c_cc == 0) { + if (flag & IO_NDELAY) + return (EWOULDBLOCK); + if (error = ttysleep(tp, (caddr_t)&tp->t_canq, + TTIPRI | PCATCH, ttyin, 0)) + return (error); + goto again; + } + while (tp->t_canq.c_cc > 1 && uio->uio_resid > 0) + if (ureadc(getc(&tp->t_canq), uio) < 0) { + error = EFAULT; + break; + } + if (tp->t_canq.c_cc == 1) + (void) getc(&tp->t_canq); + if (tp->t_canq.c_cc) + return (error); + } else + if (tp->t_oproc) + error = (*linesw[tp->t_line].l_read)(tp, uio, flag); + ptcwakeup(tp, FWRITE); + return (error); +} + +/* + * Write to pseudo-tty. + * Wakeups of controlling tty will happen + * indirectly, when tty driver calls ptsstart. 
+ */ +ptswrite(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + register struct tty *tp; + + tp = &pt_tty[minor(dev)]; + if (tp->t_oproc == 0) + return (EIO); + return ((*linesw[tp->t_line].l_write)(tp, uio, flag)); +} + +/* + * Start output on pseudo-tty. + * Wake up process selecting or sleeping for input from controlling tty. + */ +void +ptsstart(tp) + struct tty *tp; +{ + register struct pt_ioctl *pti = &pt_ioctl[minor(tp->t_dev)]; + + if (tp->t_state & TS_TTSTOP) + return; + if (pti->pt_flags & PF_STOPPED) { + pti->pt_flags &= ~PF_STOPPED; + pti->pt_send = TIOCPKT_START; + } + ptcwakeup(tp, FREAD); +} + +ptcwakeup(tp, flag) + struct tty *tp; + int flag; +{ + struct pt_ioctl *pti = &pt_ioctl[minor(tp->t_dev)]; + + if (flag & FREAD) { + selwakeup(&pti->pt_selr); + wakeup((caddr_t)&tp->t_outq.c_cf); + } + if (flag & FWRITE) { + selwakeup(&pti->pt_selw); + wakeup((caddr_t)&tp->t_rawq.c_cf); + } +} + +/*ARGSUSED*/ +#ifdef __STDC__ +ptcopen(dev_t dev, int flag, int devtype, struct proc *p) +#else +ptcopen(dev, flag, devtype, p) + dev_t dev; + int flag, devtype; + struct proc *p; +#endif +{ + register struct tty *tp; + struct pt_ioctl *pti; + + if (minor(dev) >= npty) + return (ENXIO); + tp = &pt_tty[minor(dev)]; + if (tp->t_oproc) + return (EIO); + tp->t_oproc = ptsstart; +#ifdef sun4c + tp->t_stop = ptsstop; +#endif + (void)(*linesw[tp->t_line].l_modem)(tp, 1); + tp->t_lflag &= ~EXTPROC; + pti = &pt_ioctl[minor(dev)]; + pti->pt_flags = 0; + pti->pt_send = 0; + pti->pt_ucntl = 0; + return (0); +} + +ptcclose(dev) + dev_t dev; +{ + register struct tty *tp; + + tp = &pt_tty[minor(dev)]; + (void)(*linesw[tp->t_line].l_modem)(tp, 0); + tp->t_state &= ~TS_CARR_ON; + tp->t_oproc = 0; /* mark closed */ + tp->t_session = 0; + return (0); +} + +ptcread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + register struct tty *tp = &pt_tty[minor(dev)]; + struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; + char buf[BUFSIZ]; + int error = 0, cc; + + /* + 
* We want to block until the slave + * is open, and there's something to read; + * but if we lost the slave or we're NBIO, + * then return the appropriate error instead. + */ + for (;;) { + if (tp->t_state&TS_ISOPEN) { + if (pti->pt_flags&PF_PKT && pti->pt_send) { + error = ureadc((int)pti->pt_send, uio); + if (error) + return (error); + if (pti->pt_send & TIOCPKT_IOCTL) { + cc = min(uio->uio_resid, + sizeof(tp->t_termios)); + uiomove(&tp->t_termios, cc, uio); + } + pti->pt_send = 0; + return (0); + } + if (pti->pt_flags&PF_UCNTL && pti->pt_ucntl) { + error = ureadc((int)pti->pt_ucntl, uio); + if (error) + return (error); + pti->pt_ucntl = 0; + return (0); + } + if (tp->t_outq.c_cc && (tp->t_state&TS_TTSTOP) == 0) + break; + } + if ((tp->t_state&TS_CARR_ON) == 0) + return (0); /* EOF */ + if (flag & IO_NDELAY) + return (EWOULDBLOCK); + if (error = tsleep((caddr_t)&tp->t_outq.c_cf, TTIPRI | PCATCH, + ttyin, 0)) + return (error); + } + if (pti->pt_flags & (PF_PKT|PF_UCNTL)) + error = ureadc(0, uio); + while (uio->uio_resid > 0 && error == 0) { + cc = q_to_b(&tp->t_outq, buf, min(uio->uio_resid, BUFSIZ)); + if (cc <= 0) + break; + error = uiomove(buf, cc, uio); + } + if (tp->t_outq.c_cc <= tp->t_lowat) { + if (tp->t_state&TS_ASLEEP) { + tp->t_state &= ~TS_ASLEEP; + wakeup((caddr_t)&tp->t_outq); + } + selwakeup(&tp->t_wsel); + } + return (error); +} + +void +ptsstop(tp, flush) + register struct tty *tp; + int flush; +{ + struct pt_ioctl *pti = &pt_ioctl[minor(tp->t_dev)]; + int flag; + + /* note: FLUSHREAD and FLUSHWRITE already ok */ + if (flush == 0) { + flush = TIOCPKT_STOP; + pti->pt_flags |= PF_STOPPED; + } else + pti->pt_flags &= ~PF_STOPPED; + pti->pt_send |= flush; + /* change of perspective */ + flag = 0; + if (flush & FREAD) + flag |= FWRITE; + if (flush & FWRITE) + flag |= FREAD; + ptcwakeup(tp, flag); +} + +ptcselect(dev, rw, p) + dev_t dev; + int rw; + struct proc *p; +{ + register struct tty *tp = &pt_tty[minor(dev)]; + struct pt_ioctl *pti = 
&pt_ioctl[minor(dev)]; + int s; + + if ((tp->t_state&TS_CARR_ON) == 0) + return (1); + switch (rw) { + + case FREAD: + /* + * Need to block timeouts (ttrstart). + */ + s = spltty(); + if ((tp->t_state&TS_ISOPEN) && + tp->t_outq.c_cc && (tp->t_state&TS_TTSTOP) == 0) { + splx(s); + return (1); + } + splx(s); + /* FALLTHROUGH */ + + case 0: /* exceptional */ + if ((tp->t_state&TS_ISOPEN) && + (pti->pt_flags&PF_PKT && pti->pt_send || + pti->pt_flags&PF_UCNTL && pti->pt_ucntl)) + return (1); + selrecord(p, &pti->pt_selr); + break; + + + case FWRITE: + if (tp->t_state&TS_ISOPEN) { + if (pti->pt_flags & PF_REMOTE) { + if (tp->t_canq.c_cc == 0) + return (1); + } else { + if (tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG-2) + return (1); + if (tp->t_canq.c_cc == 0 && (tp->t_iflag&ICANON)) + return (1); + } + } + selrecord(p, &pti->pt_selw); + break; + + } + return (0); +} + +ptcwrite(dev, uio, flag) + dev_t dev; + register struct uio *uio; + int flag; +{ + register struct tty *tp = &pt_tty[minor(dev)]; + register u_char *cp; + register int cc = 0; + u_char locbuf[BUFSIZ]; + int cnt = 0; + struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; + int error = 0; + +again: + if ((tp->t_state&TS_ISOPEN) == 0) + goto block; + if (pti->pt_flags & PF_REMOTE) { + if (tp->t_canq.c_cc) + goto block; + while (uio->uio_resid > 0 && tp->t_canq.c_cc < TTYHOG - 1) { + if (cc == 0) { + cc = min(uio->uio_resid, BUFSIZ); + cc = min(cc, TTYHOG - 1 - tp->t_canq.c_cc); + cp = locbuf; + error = uiomove((caddr_t)cp, cc, uio); + if (error) + return (error); + /* check again for safety */ + if ((tp->t_state&TS_ISOPEN) == 0) + return (EIO); + } + if (cc) + (void) b_to_q((char *)cp, cc, &tp->t_canq); + cc = 0; + } + (void) putc(0, &tp->t_canq); + ttwakeup(tp); + wakeup((caddr_t)&tp->t_canq); + return (0); + } + while (uio->uio_resid > 0) { + if (cc == 0) { + cc = min(uio->uio_resid, BUFSIZ); + cp = locbuf; + error = uiomove((caddr_t)cp, cc, uio); + if (error) + return (error); + /* check again for safety */ + if 
((tp->t_state&TS_ISOPEN) == 0) + return (EIO); + } + while (cc > 0) { + if ((tp->t_rawq.c_cc + tp->t_canq.c_cc) >= TTYHOG - 2 && + (tp->t_canq.c_cc > 0 || !(tp->t_iflag&ICANON))) { + wakeup((caddr_t)&tp->t_rawq); + goto block; + } + (*linesw[tp->t_line].l_rint)(*cp++, tp); + cnt++; + cc--; + } + cc = 0; + } + return (0); +block: + /* + * Come here to wait for slave to open, for space + * in outq, or space in rawq. + */ + if ((tp->t_state&TS_CARR_ON) == 0) + return (EIO); + if (flag & IO_NDELAY) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + if (cnt == 0) + return (EWOULDBLOCK); + return (0); + } + if (error = tsleep((caddr_t)&tp->t_rawq.c_cf, TTOPRI | PCATCH, + ttyout, 0)) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + return (error); + } + goto again; +} + +/*ARGSUSED*/ +ptyioctl(dev, cmd, data, flag, p) + dev_t dev; + int cmd; + caddr_t data; + int flag; + struct proc *p; +{ + register struct tty *tp = &pt_tty[minor(dev)]; + register struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; + register u_char *cc = tp->t_cc; + int stop, error; + + /* + * IF CONTROLLER STTY THEN MUST FLUSH TO PREVENT A HANG. + * ttywflush(tp) will hang if there are characters in the outq. + */ + if (cmd == TIOCEXT) { + /* + * When the EXTPROC bit is being toggled, we need + * to send an TIOCPKT_IOCTL if the packet driver + * is turned on. + */ + if (*(int *)data) { + if (pti->pt_flags & PF_PKT) { + pti->pt_send |= TIOCPKT_IOCTL; + ptcwakeup(tp, FREAD); + } + tp->t_lflag |= EXTPROC; + } else { + if ((tp->t_state & EXTPROC) && + (pti->pt_flags & PF_PKT)) { + pti->pt_send |= TIOCPKT_IOCTL; + ptcwakeup(tp, FREAD); + } + tp->t_lflag &= ~EXTPROC; + } + return(0); + } else + if (cdevsw[major(dev)].d_open == ptcopen) + switch (cmd) { + + case TIOCGPGRP: + /* + * We aviod calling ttioctl on the controller since, + * in that case, tp must be the controlling terminal. + */ + *(int *)data = tp->t_pgrp ? 
tp->t_pgrp->pg_id : 0; + return (0); + + case TIOCPKT: + if (*(int *)data) { + if (pti->pt_flags & PF_UCNTL) + return (EINVAL); + pti->pt_flags |= PF_PKT; + } else + pti->pt_flags &= ~PF_PKT; + return (0); + + case TIOCUCNTL: + if (*(int *)data) { + if (pti->pt_flags & PF_PKT) + return (EINVAL); + pti->pt_flags |= PF_UCNTL; + } else + pti->pt_flags &= ~PF_UCNTL; + return (0); + + case TIOCREMOTE: + if (*(int *)data) + pti->pt_flags |= PF_REMOTE; + else + pti->pt_flags &= ~PF_REMOTE; + ttyflush(tp, FREAD|FWRITE); + return (0); + +#ifdef COMPAT_43 + case TIOCSETP: + case TIOCSETN: +#endif + case TIOCSETD: + case TIOCSETA: + case TIOCSETAW: + case TIOCSETAF: + ndflush(&tp->t_outq, tp->t_outq.c_cc); + break; + + case TIOCSIG: + if (*(unsigned int *)data >= NSIG) + return(EINVAL); + if ((tp->t_lflag&NOFLSH) == 0) + ttyflush(tp, FREAD|FWRITE); + pgsignal(tp->t_pgrp, *(unsigned int *)data, 1); + if ((*(unsigned int *)data == SIGINFO) && + ((tp->t_lflag&NOKERNINFO) == 0)) + ttyinfo(tp); + return(0); + } + error = (*linesw[tp->t_line].l_ioctl)(tp, cmd, data, flag, p); + if (error < 0) + error = ttioctl(tp, cmd, data, flag); + if (error < 0) { + if (pti->pt_flags & PF_UCNTL && + (cmd & ~0xff) == UIOCCMD(0)) { + if (cmd & 0xff) { + pti->pt_ucntl = (u_char)cmd; + ptcwakeup(tp, FREAD); + } + return (0); + } + error = ENOTTY; + } + /* + * If external processing and packet mode send ioctl packet. 
+ */ + if ((tp->t_lflag&EXTPROC) && (pti->pt_flags & PF_PKT)) { + switch(cmd) { + case TIOCSETA: + case TIOCSETAW: + case TIOCSETAF: +#ifdef COMPAT_43 + case TIOCSETP: + case TIOCSETN: +#endif +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + case TIOCSETC: + case TIOCSLTC: + case TIOCLBIS: + case TIOCLBIC: + case TIOCLSET: +#endif + pti->pt_send |= TIOCPKT_IOCTL; + ptcwakeup(tp, FREAD); + default: + break; + } + } + stop = (tp->t_iflag & IXON) && CCEQ(cc[VSTOP], CTRL('s')) + && CCEQ(cc[VSTART], CTRL('q')); + if (pti->pt_flags & PF_NOSTOP) { + if (stop) { + pti->pt_send &= ~TIOCPKT_NOSTOP; + pti->pt_send |= TIOCPKT_DOSTOP; + pti->pt_flags &= ~PF_NOSTOP; + ptcwakeup(tp, FREAD); + } + } else { + if (!stop) { + pti->pt_send &= ~TIOCPKT_DOSTOP; + pti->pt_send |= TIOCPKT_NOSTOP; + pti->pt_flags |= PF_NOSTOP; + ptcwakeup(tp, FREAD); + } + } + return (error); +} diff --git a/sys/kern/tty_subr.c b/sys/kern/tty_subr.c new file mode 100644 index 00000000000..fe8f000f87d --- /dev/null +++ b/sys/kern/tty_subr.c @@ -0,0 +1,159 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	from: @(#)tty_subr.c	8.2 (Berkeley) 9/5/93
+ */
+/*
+ * Every routine in this file is a stub: the bodies were removed ("Body
+ * deleted") and each one only returns a zero value or nothing.  Only the
+ * names, argument lists and return types are left.
+ * NOTE(review): the header names after each #include below are missing
+ * in this copy of the file.
+ */
+#include
+#include
+#include
+
+char cwaiting;
+struct cblock *cfree, *cfreelist;
+int cfreecount, nclist;
+/* Stub: does nothing. */
+void
+clist_init()
+{
+
+	/*
+	 * Body deleted.
+	 */
+	return;
+}
+/* Stub: takes a clist and always returns 0. */
+getc(a1)
+	struct clist *a1;
+{
+
+	/*
+	 * Body deleted.
+	 */
+	return ((char)0);
+}
+/*
+ * Stub: always returns 0.  The pty driver calls it as
+ * q_to_b(queue, buffer, count) and uses the result as a byte count.
+ */
+q_to_b(a1, a2, a3)
+	struct clist *a1;
+	char *a2;
+	int a3;
+{
+
+	/*
+	 * Body deleted.
+	 */
+	return (0);
+}
+/* Stub: always returns 0. */
+ndqb(a1, a2)
+	struct clist *a1;
+	int a2;
+{
+
+	/*
+	 * Body deleted.
+	 */
+	return (0);
+}
+/*
+ * Stub: does nothing.  The pty driver calls it as ndflush(queue, count)
+ * to discard queued output.
+ */
+void
+ndflush(a1, a2)
+	struct clist *a1;
+	int a2;
+{
+
+	/*
+	 * Body deleted.
+	 */
+	return;
+}
+/* Stub: takes a character and a clist and always returns 0. */
+putc(a1, a2)
+	char a1;
+	struct clist *a2;
+{
+
+	/*
+	 * Body deleted.
+	 */
+	return (0);
+}
+/*
+ * Stub: always returns 0.  The pty driver calls it as
+ * b_to_q(buffer, count, queue).
+ */
+b_to_q(a1, a2, a3)
+	char *a1;
+	int a2;
+	struct clist *a3;
+{
+
+	/*
+	 * Body deleted.
+	 */
+	return (0);
+}
+/* Stub: always returns a null pointer. */
+char *
+nextc(a1, a2, a3)
+	struct clist *a1;
+	char *a2;
+	int *a3;
+{
+
+	/*
+	 * Body deleted.
+	 */
+	return ((char *)0);
+}
+/* Stub: takes a clist and always returns 0. */
+unputc(a1)
+	struct clist *a1;
+{
+
+	/*
+	 * Body deleted.
+	 */
+	return ((char)0);
+}
+/* Stub: takes two clists and does nothing. */
+void
+catq(a1, a2)
+	struct clist *a1, *a2;
+{
+
+	/*
+	 * Body deleted.
+	 */
+	return;
+}
diff --git a/sys/kern/tty_tb.c b/sys/kern/tty_tb.c
new file mode 100644
index 00000000000..242301a52e8
--- /dev/null
+++ b/sys/kern/tty_tb.c
@@ -0,0 +1,366 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty_tb.c 8.1 (Berkeley) 6/10/93 + */ + +#include "tb.h" +#if NTB > 0 + +/* + * Line discipline for RS232 tablets; + * supplies binary coordinate data. + */ +#include +#include +#include + +/* + * Tablet configuration table. + */ +struct tbconf { + short tbc_recsize; /* input record size in bytes */ + short tbc_uiosize; /* size of data record returned user */ + int tbc_sync; /* mask for finding sync byte/bit */ + int (*tbc_decode)();/* decoding routine */ + char *tbc_run; /* enter run mode sequence */ + char *tbc_point; /* enter point mode sequence */ + char *tbc_stop; /* stop sequence */ + char *tbc_start; /* start/restart sequence */ + int tbc_flags; +#define TBF_POL 0x1 /* polhemus hack */ +#define TBF_INPROX 0x2 /* tablet has proximity info */ +}; + +static int tbdecode(), gtcodecode(), poldecode(); +static int tblresdecode(), tbhresdecode(); + +struct tbconf tbconf[TBTYPE] = { +{ 0 }, +{ 5, sizeof (struct tbpos), 0200, tbdecode, "6", "4" }, +{ 5, sizeof (struct tbpos), 0200, tbdecode, "\1CN", "\1RT", "\2", "\4" }, +{ 8, sizeof (struct gtcopos), 0200, gtcodecode }, +{17, sizeof (struct polpos), 0200, poldecode, 0, 0, "\21", "\5\22\2\23", + TBF_POL }, +{ 5, sizeof (struct tbpos), 0100, tblresdecode, "\1CN", "\1PT", "\2", "\4", + TBF_INPROX }, +{ 6, sizeof (struct tbpos), 0200, tbhresdecode, "\1CN", "\1PT", "\2", "\4", + TBF_INPROX }, +{ 5, sizeof (struct tbpos), 0100, tblresdecode, "\1CL\33", "\1PT\33", 0, 0}, +{ 6, sizeof 
(struct tbpos), 0200, tbhresdecode, "\1CL\33", "\1PT\33", 0, 0}, +}; + +/* + * Tablet state + */ +struct tb { + int tbflags; /* mode & type bits */ +#define TBMAXREC 17 /* max input record size */ + char cbuf[TBMAXREC]; /* input buffer */ + union { + struct tbpos tbpos; + struct gtcopos gtcopos; + struct polpos polpos; + } rets; /* processed state */ +#define NTBS 16 +} tb[NTBS]; + +/* + * Open as tablet discipline; called on discipline change. + */ +/*ARGSUSED*/ +tbopen(dev, tp) + dev_t dev; + register struct tty *tp; +{ + register struct tb *tbp; + + if (tp->t_line == TABLDISC) + return (ENODEV); + ttywflush(tp); + for (tbp = tb; tbp < &tb[NTBS]; tbp++) + if (tbp->tbflags == 0) + break; + if (tbp >= &tb[NTBS]) + return (EBUSY); + tbp->tbflags = TBTIGER|TBPOINT; /* default */ + tp->t_cp = tbp->cbuf; + tp->t_inbuf = 0; + bzero((caddr_t)&tbp->rets, sizeof (tbp->rets)); + tp->T_LINEP = (caddr_t)tbp; + tp->t_flags |= LITOUT; + return (0); +} + +/* + * Line discipline change or last device close. + */ +tbclose(tp) + register struct tty *tp; +{ + register int s; + int modebits = TBPOINT|TBSTOP; + + tbioctl(tp, BIOSMODE, &modebits, 0); + s = spltty(); + ((struct tb *)tp->T_LINEP)->tbflags = 0; + tp->t_cp = 0; + tp->t_inbuf = 0; + tp->t_rawq.c_cc = 0; /* clear queues -- paranoid */ + tp->t_canq.c_cc = 0; + tp->t_line = 0; /* paranoid: avoid races */ + splx(s); +} + +/* + * Read from a tablet line. + * Characters have been buffered in a buffer and decoded. + */ +tbread(tp, uio) + register struct tty *tp; + struct uio *uio; +{ + register struct tb *tbp = (struct tb *)tp->T_LINEP; + register struct tbconf *tc = &tbconf[tbp->tbflags & TBTYPE]; + int ret; + + if ((tp->t_state&TS_CARR_ON) == 0) + return (EIO); + ret = uiomove(&tbp->rets, tc->tbc_uiosize, uio); + if (tc->tbc_flags&TBF_POL) + tbp->rets.polpos.p_key = ' '; + return (ret); +} + +/* + * Low level character input routine. + * Stuff the character in the buffer, and decode + * if all the chars are there. 
+ * + * This routine could be expanded in-line in the receiver + * interrupt routine to make it run as fast as possible. + */ +tbinput(c, tp) + register int c; + register struct tty *tp; +{ + register struct tb *tbp = (struct tb *)tp->T_LINEP; + register struct tbconf *tc = &tbconf[tbp->tbflags & TBTYPE]; + + if (tc->tbc_recsize == 0 || tc->tbc_decode == 0) /* paranoid? */ + return; + /* + * Locate sync bit/byte or reset input buffer. + */ + if (c&tc->tbc_sync || tp->t_inbuf == tc->tbc_recsize) { + tp->t_cp = tbp->cbuf; + tp->t_inbuf = 0; + } + *tp->t_cp++ = c&0177; + /* + * Call decode routine only if a full record has been collected. + */ + if (++tp->t_inbuf == tc->tbc_recsize) + (*tc->tbc_decode)(tc, tbp->cbuf, &tbp->rets); +} + +/* + * Decode GTCO 8 byte format (high res, tilt, and pressure). + */ +static +gtcodecode(tc, cp, tbpos) + struct tbconf *tc; + register char *cp; + register struct gtcopos *tbpos; +{ + + tbpos->pressure = *cp >> 2; + tbpos->status = (tbpos->pressure > 16) | TBINPROX; /* half way down */ + tbpos->xpos = (*cp++ & 03) << 14; + tbpos->xpos |= *cp++ << 7; + tbpos->xpos |= *cp++; + tbpos->ypos = (*cp++ & 03) << 14; + tbpos->ypos |= *cp++ << 7; + tbpos->ypos |= *cp++; + tbpos->xtilt = *cp++; + tbpos->ytilt = *cp++; + tbpos->scount++; +} + +/* + * Decode old Hitachi 5 byte format (low res). + */ +static +tbdecode(tc, cp, tbpos) + struct tbconf *tc; + register char *cp; + register struct tbpos *tbpos; +{ + register char byte; + + byte = *cp++; + tbpos->status = (byte&0100) ? TBINPROX : 0; + byte &= ~0100; + if (byte > 036) + tbpos->status |= 1 << ((byte-040)/2); + tbpos->xpos = *cp++ << 7; + tbpos->xpos |= *cp++; + if (tbpos->xpos < 256) /* tablet wraps around at 256 */ + tbpos->status &= ~TBINPROX; /* make it out of proximity */ + tbpos->ypos = *cp++ << 7; + tbpos->ypos |= *cp++; + tbpos->scount++; +} + +/* + * Decode new Hitach 5-byte format (low res). 
+ */ +static +tblresdecode(tc, cp, tbpos) + struct tbconf *tc; + register char *cp; + register struct tbpos *tbpos; +{ + + *cp &= ~0100; /* mask sync bit */ + tbpos->status = (*cp++ >> 2) | TBINPROX; + if (tc->tbc_flags&TBF_INPROX && tbpos->status&020) + tbpos->status &= ~(020|TBINPROX); + tbpos->xpos = *cp++; + tbpos->xpos |= *cp++ << 6; + tbpos->ypos = *cp++; + tbpos->ypos |= *cp++ << 6; + tbpos->scount++; +} + +/* + * Decode new Hitach 6-byte format (high res). + */ +static +tbhresdecode(tc, cp, tbpos) + struct tbconf *tc; + register char *cp; + register struct tbpos *tbpos; +{ + char byte; + + byte = *cp++; + tbpos->xpos = (byte & 03) << 14; + tbpos->xpos |= *cp++ << 7; + tbpos->xpos |= *cp++; + tbpos->ypos = *cp++ << 14; + tbpos->ypos |= *cp++ << 7; + tbpos->ypos |= *cp++; + tbpos->status = (byte >> 2) | TBINPROX; + if (tc->tbc_flags&TBF_INPROX && tbpos->status&020) + tbpos->status &= ~(020|TBINPROX); + tbpos->scount++; +} + +/* + * Polhemus decode. + */ +static +poldecode(tc, cp, polpos) + struct tbconf *tc; + register char *cp; + register struct polpos *polpos; +{ + + polpos->p_x = cp[4] | cp[3]<<7 | (cp[9] & 0x03) << 14; + polpos->p_y = cp[6] | cp[5]<<7 | (cp[9] & 0x0c) << 12; + polpos->p_z = cp[8] | cp[7]<<7 | (cp[9] & 0x30) << 10; + polpos->p_azi = cp[11] | cp[10]<<7 | (cp[16] & 0x03) << 14; + polpos->p_pit = cp[13] | cp[12]<<7 | (cp[16] & 0x0c) << 12; + polpos->p_rol = cp[15] | cp[14]<<7 | (cp[16] & 0x30) << 10; + polpos->p_stat = cp[1] | cp[0]<<7; + if (cp[2] != ' ') + polpos->p_key = cp[2]; +} + +/*ARGSUSED*/ +tbioctl(tp, cmd, data, flag) + struct tty *tp; + caddr_t data; +{ + register struct tb *tbp = (struct tb *)tp->T_LINEP; + + switch (cmd) { + + case BIOGMODE: + *(int *)data = tbp->tbflags & TBMODE; + break; + + case BIOSTYPE: + if (tbconf[*(int *)data & TBTYPE].tbc_recsize == 0 || + tbconf[*(int *)data & TBTYPE].tbc_decode == 0) + return (EINVAL); + tbp->tbflags &= ~TBTYPE; + tbp->tbflags |= *(int *)data & TBTYPE; + /* fall thru... 
to set mode bits */ + + case BIOSMODE: { + register struct tbconf *tc; + + tbp->tbflags &= ~TBMODE; + tbp->tbflags |= *(int *)data & TBMODE; + tc = &tbconf[tbp->tbflags & TBTYPE]; + if (tbp->tbflags&TBSTOP) { + if (tc->tbc_stop) + ttyout(tc->tbc_stop, tp); + } else if (tc->tbc_start) + ttyout(tc->tbc_start, tp); + if (tbp->tbflags&TBPOINT) { + if (tc->tbc_point) + ttyout(tc->tbc_point, tp); + } else if (tc->tbc_run) + ttyout(tc->tbc_run, tp); + ttstart(tp); + break; + } + + case BIOGTYPE: + *(int *)data = tbp->tbflags & TBTYPE; + break; + + case TIOCSETD: + case TIOCGETD: + case TIOCGETP: + case TIOCGETC: + return (-1); /* pass thru... */ + + default: + return (ENOTTY); + } + return (0); +} +#endif diff --git a/sys/kern/tty_tty.c b/sys/kern/tty_tty.c new file mode 100644 index 00000000000..964fc6f6d5e --- /dev/null +++ b/sys/kern/tty_tty.c @@ -0,0 +1,147 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)tty_tty.c	8.2 (Berkeley) 9/23/93
+ */
+
+/*
+ * Indirect driver for controlling tty.
+ *
+ * Every entry point looks up the vnode of the calling process's
+ * controlling terminal and forwards the operation to it.
+ * NOTE(review): the header names after each #include below are missing
+ * in this copy of the file.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+/*
+ * Vnode of p's controlling terminal, or NULL when P_CONTROLT is not set
+ * in p->p_flag.
+ */
+#define cttyvp(p) ((p)->p_flag & P_CONTROLT ? (p)->p_session->s_ttyvp : NULL)
+/*
+ * Open the controlling terminal on behalf of p.  Returns ENXIO when the
+ * process has none, otherwise the VOP_OPEN result.  The vnode is locked
+ * around the call.  The access check is compiled only under PARANOID.
+ */
+/*ARGSUSED*/
+cttyopen(dev, flag, mode, p)
+	dev_t dev;
+	int flag, mode;
+	struct proc *p;
+{
+	struct vnode *ttyvp = cttyvp(p);
+	int error;
+
+	if (ttyvp == NULL)
+		return (ENXIO);
+	VOP_LOCK(ttyvp);
+#ifdef PARANOID
+	/*
+	 * Since group is tty and mode is 620 on most terminal lines
+	 * and since sessions protect terminals from processes outside
+	 * your session, this check is probably no longer necessary.
+	 * Since it inhibits setuid root programs that later switch
+	 * to another user from accessing /dev/tty, we have decided
+	 * to delete this test. (mckusick 5/93)
+	 */
+	error = VOP_ACCESS(ttyvp,
+	  (flag&FREAD ? VREAD : 0) | (flag&FWRITE ? VWRITE : 0), p->p_ucred, p);
+	if (!error)
+#endif /* PARANOID */
+		error = VOP_OPEN(ttyvp, flag, NOCRED, p);
+	VOP_UNLOCK(ttyvp);
+	return (error);
+}
+/*
+ * Read from the controlling terminal of the process named in the uio.
+ * Returns EIO when it has none, otherwise the VOP_READ result.
+ */
+/*ARGSUSED*/
+cttyread(dev, uio, flag)
+	dev_t dev;
+	struct uio *uio;
+	int flag;
+{
+	register struct vnode *ttyvp = cttyvp(uio->uio_procp);
+	int error;
+
+	if (ttyvp == NULL)
+		return (EIO);
+	VOP_LOCK(ttyvp);
+	error = VOP_READ(ttyvp, uio, flag, NOCRED);
+	VOP_UNLOCK(ttyvp);
+	return (error);
+}
+/*
+ * Write to the controlling terminal of the process named in the uio.
+ * Returns EIO when it has none, otherwise the VOP_WRITE result.
+ */
+/*ARGSUSED*/
+cttywrite(dev, uio, flag)
+	dev_t dev;
+	struct uio *uio;
+	int flag;
+{
+	register struct vnode *ttyvp = cttyvp(uio->uio_procp);
+	int error;
+
+	if (ttyvp == NULL)
+		return (EIO);
+	VOP_LOCK(ttyvp);
+	error = VOP_WRITE(ttyvp, uio, flag, NOCRED);
+	VOP_UNLOCK(ttyvp);
+	return (error);
+}
+/*
+ * ioctl on the controlling terminal.  TIOCNOTTY is handled here: it
+ * clears P_CONTROLT for a process that is not a session leader and
+ * returns EINVAL for one that is.  Everything else is forwarded with
+ * VOP_IOCTL.  Returns EIO when the process has no controlling terminal.
+ */
+/*ARGSUSED*/
+cttyioctl(dev, cmd, addr, flag, p)
+	dev_t dev;
+	int cmd;
+	caddr_t addr;
+	int flag;
+	struct proc *p;
+{
+	struct vnode *ttyvp = cttyvp(p);
+
+	if (ttyvp == NULL)
+		return (EIO);
+	if (cmd == TIOCNOTTY) {
+		if (!SESS_LEADER(p)) {
+			p->p_flag &= ~P_CONTROLT;
+			return (0);
+		} else
+			return (EINVAL);
+	}
+	return (VOP_IOCTL(ttyvp, cmd, addr, flag, NOCRED, p));
+}
+/*
+ * Select on the controlling terminal.  Reports "ready" (1) when the
+ * process has none, so that the following read or write fails instead
+ * of the select blocking; otherwise returns the VOP_SELECT result.
+ */
+/*ARGSUSED*/
+cttyselect(dev, flag, p)
+	dev_t dev;
+	int flag;
+	struct proc *p;
+{
+	struct vnode *ttyvp = cttyvp(p);
+
+	if (ttyvp == NULL)
+		return (1);	/* try operation to get EOF/failure */
+	return (VOP_SELECT(ttyvp, flag, FREAD|FWRITE, NOCRED, p));
+}
diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c
new file mode 100644
index 00000000000..8834dbf4442
--- /dev/null
+++ b/sys/kern/uipc_domain.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)uipc_domain.c 8.2 (Berkeley) 10/18/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void pffasttimo __P((void *)); +void pfslowtimo __P((void *)); +/* Push the domain structure named <x>domain onto the head of the global domains list. */ +#define ADDDOMAIN(x) { \ + extern struct domain __CONCAT(x,domain); \ + __CONCAT(x,domain.dom_next) = domains; \ + domains = &__CONCAT(x,domain); \ +} +/* Link the configured domains, run each dom_init and each protocol's pr_init, derive max_hdr/max_datalen, and start both protocol timers. */ +domaininit() +{ + register struct domain *dp; + register struct protosw *pr; + +#undef unix +#ifndef lint + ADDDOMAIN(unix); + ADDDOMAIN(route); +#ifdef INET + ADDDOMAIN(inet); +#endif +#ifdef NS + ADDDOMAIN(ns); +#endif +#ifdef ISO + ADDDOMAIN(iso); +#endif +#ifdef CCITT + ADDDOMAIN(ccitt); +#endif +#include "imp.h" +#if NIMP > 0 + ADDDOMAIN(imp); +#endif +#endif + + for (dp = domains; dp; dp = dp->dom_next) { + if (dp->dom_init) + (*dp->dom_init)(); + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_init) + (*pr->pr_init)(); + } + +if (max_linkhdr < 16) /* XXX */ +max_linkhdr = 16; + max_hdr = max_linkhdr + max_protohdr; + max_datalen = MHLEN - max_hdr; + timeout(pffasttimo, (void *)0, 1); + timeout(pfslowtimo, (void *)0, 1); +} +/* Return the first protosw of the given family whose pr_type is non-zero and equals type; 0 if the family or type is not found. */ +struct protosw * +pffindtype(family, type) + int family, type; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + if (dp->dom_family == family) + goto found; + return (0); +found: + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_type && pr->pr_type == type) + return (pr); + return (0); +} +/* Return the protosw of family matching both protocol and type. 
For SOCK_RAW with no exact match, fall back to the first raw entry whose pr_protocol is 0. Returns 0 if family is 0 or nothing matches. */ +struct protosw * +pffindproto(family, protocol, type) + int family, protocol, type; +{ + register struct domain *dp; + register struct protosw *pr; + struct protosw *maybe = 0; + + if (family == 0) + return (0); + for (dp = domains; dp; dp = dp->dom_next) + if (dp->dom_family == family) + goto found; + return (0); +found: + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { + if ((pr->pr_protocol == protocol) && (pr->pr_type == type)) + 
return (pr); + + if (type == SOCK_RAW && pr->pr_type == SOCK_RAW && + pr->pr_protocol == 0 && maybe == (struct protosw *)0) + maybe = pr; + } + return (maybe); +} +/* Hand a sysctl request to the pr_sysctl handler of the protocol selected by name[0] (family) and name[1] (protocol); ENOPROTOOPT if no such handler exists. */ +net_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) + int *name; + u_int namelen; + void *oldp; + size_t *oldlenp; + void *newp; + size_t newlen; + struct proc *p; +{ + register struct domain *dp; + register struct protosw *pr; + int family, protocol; + + /* + * All sysctl names at this level are nonterminal; + * next two components are protocol family and protocol number, + * then at least one additional component. + */ + if (namelen < 3) + return (EISDIR); /* overloaded */ + family = name[0]; + protocol = name[1]; + + if (family == 0) + return (0); + for (dp = domains; dp; dp = dp->dom_next) + if (dp->dom_family == family) + goto found; + return (ENOPROTOOPT); +found: + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_protocol == protocol && pr->pr_sysctl) + return ((*pr->pr_sysctl)(name + 2, namelen - 2, + oldp, oldlenp, newp, newlen)); + return (ENOPROTOOPT); +} +/* Pass (cmd, sa) to the pr_ctlinput handler of every protocol in every domain that has one. */ +pfctlinput(cmd, sa) + int cmd; + struct sockaddr *sa; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_ctlinput) + (*pr->pr_ctlinput)(cmd, sa, (caddr_t)0); +} +/* Slow protocol timer: call every pr_slowtimo handler, then re-arm itself hz/2 ticks ahead. 
*/ +void +pfslowtimo(arg) + void *arg; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_slowtimo) + (*pr->pr_slowtimo)(); + timeout(pfslowtimo, (void *)0, hz/2); +} +/* Fast protocol timer: call every pr_fasttimo handler, then re-arm itself hz/5 ticks ahead. */ +void +pffasttimo(arg) + void *arg; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_fasttimo) + (*pr->pr_fasttimo)(); + timeout(pffasttimo, (void *)0, hz/5); +} diff --git 
a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c new file mode 100644 index 00000000000..b71c6345e36 --- /dev/null +++ b/sys/kern/uipc_mbuf.c @@ -0,0 +1,655 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include +#define MBTYPES +#include +#include +#include +#include +#include + +#include + +extern vm_map_t mb_map; +struct mbuf *mbutl; +char *mclrefcnt; +/* Seed the cluster free list: allocate NCL_INIT clusters at splimp and panic("mbinit") if that fails. */ +mbinit() +{ + int s; + +#if CLBYTES < 4096 +#define NCL_INIT (4096/CLBYTES) +#else +#define NCL_INIT 1 +#endif + s = splimp(); + if (m_clalloc(NCL_INIT, M_DONTWAIT) == 0) + goto bad; + splx(s); + return; +bad: + panic("mbinit"); +} + +/* + * Allocate some number of mbuf clusters + * and place on cluster free list. + * Must be called at splimp. Returns 1 on success, or 0 (logging "mb_map full" the first time only) when kmem_malloc fails. + */ +/* ARGSUSED */ +m_clalloc(ncl, nowait) + register int ncl; + int nowait; +{ + static int logged; + register caddr_t p; + register int i; + int npg; + + npg = ncl * CLSIZE; + p = (caddr_t)kmem_malloc(mb_map, ctob(npg), !nowait); + if (p == NULL) { + if (logged == 0) { + logged++; + log(LOG_ERR, "mb_map full\n"); + } + return (0); + } + ncl = ncl * CLBYTES / MCLBYTES; /* convert the count from CLBYTES units to MCLBYTES-sized clusters */ + for (i = 0; i < ncl; i++) { + ((union mcluster *)p)->mcl_next = mclfree; + mclfree = (union mcluster *)p; + p += MCLBYTES; + mbstat.m_clfree++; + } + mbstat.m_clusters += ncl; + return (1); +} + +/* + * When MGET fails, ask protocols to free space when short of memory, + * then re-attempt to allocate an mbuf. 
+ */ +struct mbuf * +m_retry(i, t) + int i, t; +{ + register struct mbuf *m; + + m_reclaim(); /* presumably MGET invokes m_retry() on failure; the temporary macro below makes that expand to a null pointer so this retry cannot recurse -- confirm against the MGET definition */ +#define m_retry(i, t) (struct mbuf *)0 + MGET(m, i, t); +#undef m_retry + return (m); +} + +/* + * As above; retry an MGETHDR. + */ +struct mbuf * +m_retryhdr(i, t) + int i, t; +{ + register struct mbuf *m; + + m_reclaim(); +#define m_retryhdr(i, t) (struct mbuf *)0 + MGETHDR(m, i, t); +#undef m_retryhdr + return (m); +} +/* Call the pr_drain handler of every protocol at splimp, then count the event in mbstat.m_drain. */ +m_reclaim() +{ + register struct domain *dp; + register struct protosw *pr; + int s = splimp(); + + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_drain) + (*pr->pr_drain)(); + splx(s); + mbstat.m_drain++; +} + +/* + * Space allocation routines. + * These are also available as macros + * for critical paths. + */ +struct mbuf * +m_get(nowait, type) + int nowait, type; +{ + register struct mbuf *m; + + MGET(m, nowait, type); + return (m); +} +/* Function form of MGETHDR. */ +struct mbuf * +m_gethdr(nowait, type) + int nowait, type; +{ + register struct mbuf *m; + + MGETHDR(m, nowait, type); + return (m); +} +/* As m_get(), but zero MLEN bytes of the new mbuf's data area; returns 0 if allocation fails. */ +struct mbuf * +m_getclr(nowait, type) + int nowait, type; +{ + register struct mbuf *m; + + MGET(m, nowait, type); + if (m == 0) + return (0); + bzero(mtod(m, caddr_t), MLEN); + return (m); +} +/* Function form of MFREE: free m and return the mbuf MFREE stores in n (callers here use it as the next mbuf of the chain). */ +struct mbuf * +m_free(m) + struct mbuf *m; +{ + register struct mbuf *n; + + MFREE(m, n); + return (n); +} +/* Free every mbuf of the chain starting at m; a NULL chain is a no-op. 
*/ +void +m_freem(m) + register struct mbuf *m; +{ + register struct mbuf *n; + + if (m == NULL) + return; + do { + MFREE(m, n); + } while (m = n); +} + +/* + * Mbuffer utility routines. + */ + +/* + * Lesser-used path for M_PREPEND: + * allocate new mbuf to prepend to chain, + * copy junk along. 
+ */ +struct mbuf * +m_prepend(m, len, how) + register struct mbuf *m; + int len, how; +{ + struct mbuf *mn; + + MGET(mn, how, m->m_type); + if (mn == (struct mbuf *)NULL) { + m_freem(m); /* on allocation failure the whole original chain is released */ + return ((struct mbuf *)NULL); + } + if (m->m_flags & M_PKTHDR) { + M_COPY_PKTHDR(mn, m); /* the packet header moves to the new first mbuf */ + m->m_flags &= ~M_PKTHDR; + } + mn->m_next = m; + m = mn; + if (len < MHLEN) + MH_ALIGN(m, len); + m->m_len = len; + return (m); +} + +/* + * Make a copy of an mbuf chain starting "off0" bytes from the beginning, + * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. + * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller. + * Mbufs with M_EXT set are not copied: the copy shares the external + * buffer and its mclrefcnt entry is incremented. + * Returns the new chain, or 0 (counted in MCFail) if an mbuf cannot + * be allocated or nothing was copied. + */ +int MCFail; + +struct mbuf * +m_copym(m, off0, len, wait) + register struct mbuf *m; + int off0, wait; + register int len; +{ + register struct mbuf *n, **np; + register int off = off0; + struct mbuf *top; + int copyhdr = 0; + + if (off < 0 || len < 0) + panic("m_copym"); + if (off == 0 && m->m_flags & M_PKTHDR) + copyhdr = 1; + while (off > 0) { + if (m == 0) + panic("m_copym"); + if (off < m->m_len) + break; + off -= m->m_len; + m = m->m_next; + } + np = &top; /* np always addresses the link that receives the next copied mbuf */ + top = 0; + while (len > 0) { + if (m == 0) { + if (len != M_COPYALL) + panic("m_copym"); + break; + } + MGET(n, wait, m->m_type); + *np = n; + if (n == 0) + goto nospace; + if (copyhdr) { + M_COPY_PKTHDR(n, m); + if (len == M_COPYALL) + n->m_pkthdr.len -= off0; + else + n->m_pkthdr.len = len; + copyhdr = 0; + } + n->m_len = min(len, m->m_len - off); + if (m->m_flags & M_EXT) { + n->m_data = m->m_data + off; + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + n->m_ext = m->m_ext; + n->m_flags |= M_EXT; + } else + bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), + (unsigned)n->m_len); + if (len != M_COPYALL) + len -= n->m_len; + off = 0; + m = m->m_next; + np = &n->m_next; + } + if (top == 0) + MCFail++; + return (top); +nospace: + m_freem(top); + MCFail++; + return (0); +} + +/* + * Copy data from an mbuf chain starting "off" bytes from the beginning, + * continuing for "len" bytes, into the 
indicated buffer. + */ +m_copydata(m, off, len, cp) + register struct mbuf *m; + register int off; + register int len; + caddr_t cp; +{ + register unsigned count; + + if (off < 0 || len < 0) + panic("m_copydata"); + while (off > 0) { /* skip whole mbufs that lie before the starting offset */ + if (m == 0) + panic("m_copydata"); + if (off < m->m_len) + break; + off -= m->m_len; + m = m->m_next; + } + while (len > 0) { + if (m == 0) + panic("m_copydata"); + count = min(m->m_len - off, len); + bcopy(mtod(m, caddr_t) + off, cp, count); + len -= count; + cp += count; + off = 0; + m = m->m_next; + } +} + +/* + * Concatenate mbuf chain n to m. + * Both chains must be of the same type (e.g. MT_DATA). + * Any m_pkthdr is not updated. + */ +m_cat(m, n) + register struct mbuf *m, *n; +{ + while (m->m_next) + m = m->m_next; + while (n) { + if (m->m_flags & M_EXT || + m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) { + /* just join the two chains */ + m->m_next = n; + return; + } + /* splat the data from one into the other */ + bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, + (u_int)n->m_len); + m->m_len += n->m_len; + n = m_free(n); + } +} +/* Trim req_len bytes from the head of chain mp (req_len >= 0) or -req_len bytes from its tail (req_len < 0), adjusting m_pkthdr.len when the first mbuf has M_PKTHDR. Emptied mbufs stay on the chain with m_len 0. */ +m_adj(mp, req_len) + struct mbuf *mp; + int req_len; +{ + register int len = req_len; + register struct mbuf *m; + register count; + + if ((m = mp) == NULL) + return; + if (len >= 0) { + /* + * Trim from head. + */ + while (m != NULL && len > 0) { + if (m->m_len <= len) { + len -= m->m_len; + m->m_len = 0; + m = m->m_next; + } else { + m->m_len -= len; + m->m_data += len; + len = 0; + } + } + m = mp; + if (mp->m_flags & M_PKTHDR) + m->m_pkthdr.len -= (req_len - len); + } else { + /* + * Trim from tail. Scan the mbuf chain, + * calculating its length and finding the last mbuf. + * If the adjustment only affects this mbuf, then just + * adjust and return. Otherwise, rescan and truncate + * after the remaining size. 
+ */ + len = -len; + count = 0; + for (;;) { + count += m->m_len; + if (m->m_next == (struct mbuf *)0) + break; + m = m->m_next; + } + if (m->m_len >= len) { + m->m_len -= len; + if (mp->m_flags & M_PKTHDR) + mp->m_pkthdr.len -= len; + return; + } + count -= len; + if (count < 0) + count = 0; + /* + * Correct length for chain is "count". + * Find the mbuf with last data, adjust its length, + * and toss data from remaining mbufs on chain. + */ + m = mp; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len = count; + for (; m; m = m->m_next) { + if (m->m_len >= count) { + m->m_len = count; + break; + } + count -= m->m_len; + } + while (m = m->m_next) + m->m_len = 0; + } +} + +/* + * Rearrange an mbuf chain so that len bytes are contiguous + * and in the data area of an mbuf (so that mtod and dtom + * will work for a structure of size len). Returns the resulting + * mbuf chain on success, frees it and returns null on failure. + * If there is room, it will add up to max_protohdr-len extra bytes to the + * contiguous region in an attempt to avoid being called next time. + * Failures are counted in MPFail. + */ +int MPFail; + +struct mbuf * +m_pullup(n, len) + register struct mbuf *n; + int len; +{ + register struct mbuf *m; + register int count; + int space; + + /* + * If first mbuf has no cluster, and has room for len bytes + * without shifting current data, pullup into it, + * otherwise allocate a new mbuf to prepend to the chain. 
+ */ + if ((n->m_flags & M_EXT) == 0 && + n->m_data + len < &n->m_dat[MLEN] && n->m_next) { + if (n->m_len >= len) + return (n); + m = n; + n = n->m_next; + len -= m->m_len; + } else { + if (len > MHLEN) + goto bad; + MGET(m, M_DONTWAIT, n->m_type); + if (m == 0) + goto bad; + m->m_len = 0; + if (n->m_flags & M_PKTHDR) { + M_COPY_PKTHDR(m, n); + n->m_flags &= ~M_PKTHDR; + } + } + space = &m->m_dat[MLEN] - (m->m_data + m->m_len); + do { /* move bytes from the front of n into the tail of m until len is satisfied or the chain runs out */ + count = min(min(max(len, max_protohdr), space), n->m_len); + bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, + (unsigned)count); + len -= count; + m->m_len += count; + n->m_len -= count; + space -= count; + if (n->m_len) + n->m_data += count; + else + n = m_free(n); + } while (len > 0 && n); + if (len > 0) { + (void) m_free(m); + goto bad; + } + m->m_next = n; + return (m); +bad: + m_freem(n); + MPFail++; + return (0); +} + +/* + * Partition an mbuf chain in two pieces, returning the tail -- + * all but the first len0 bytes. In case of failure, it returns NULL and + * attempts to restore the chain to its original state. 
+ */ +struct mbuf * +m_split(m0, len0, wait) + register struct mbuf *m0; + int len0, wait; +{ + register struct mbuf *m, *n; + unsigned len = len0, remain; + + for (m = m0; m && len > m->m_len; m = m->m_next) + len -= m->m_len; + if (m == 0) + return (0); /* chain is shorter than len0 */ + remain = m->m_len - len; /* bytes of the split mbuf that belong to the tail */ + if (m0->m_flags & M_PKTHDR) { + MGETHDR(n, wait, m0->m_type); + if (n == 0) + return (0); + n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; + n->m_pkthdr.len = m0->m_pkthdr.len - len0; + m0->m_pkthdr.len = len0; + if (m->m_flags & M_EXT) + goto extpacket; + if (remain > MHLEN) { + /* m can't be the lead packet */ + MH_ALIGN(n, 0); + n->m_next = m_split(m, len, wait); + if (n->m_next == 0) { + (void) m_free(n); + return (0); + } else + return (n); + } else + MH_ALIGN(n, remain); + } else if (remain == 0) { /* split falls on an mbuf boundary: just cut the link */ + n = m->m_next; + m->m_next = 0; + return (n); + } else { + MGET(n, wait, m->m_type); + if (n == 0) + return (0); + M_ALIGN(n, remain); + } +extpacket: + if (m->m_flags & M_EXT) { /* share the external buffer instead of copying; its mclrefcnt entry is incremented */ + n->m_flags |= M_EXT; + n->m_ext = m->m_ext; + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + m->m_ext.ext_size = 0; /* For Accounting XXXXXX danger */ + n->m_data = m->m_data + len; + } else { + bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); + } + n->m_len = remain; + m->m_len = len; + n->m_next = m->m_next; + m->m_next = 0; + return (n); +} +/* + * Routine to copy from device local memory into mbufs. 
+ */ +struct mbuf * +m_devget(buf, totlen, off0, ifp, copy) + char *buf; + int totlen, off0; + struct ifnet *ifp; + void (*copy)(); +{ + register struct mbuf *m; + struct mbuf *top = 0, **mp = &top; /* mp always addresses the link that receives the next mbuf */ + register int off = off0, len; + register char *cp; + char *epkt; + + cp = buf; + epkt = cp + totlen; + if (off) { + cp += off + 2 * sizeof(u_short); + totlen -= 2 * sizeof(u_short); + } + /* NOTE(review): if totlen is not positive here the loop below never runs, so this header mbuf is neither linked into top nor freed -- confirm callers never pass that. */ + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == 0) + return (0); + m->m_pkthdr.rcvif = ifp; + m->m_pkthdr.len = totlen; + m->m_len = MHLEN; + + while (totlen > 0) { + if (top) { /* the first pass uses the header mbuf allocated above */ + MGET(m, M_DONTWAIT, MT_DATA); + if (m == 0) { + m_freem(top); + return (0); + } + m->m_len = MLEN; + } + len = min(totlen, epkt - cp); + if (len >= MINCLSIZE) { + MCLGET(m, M_DONTWAIT); + if (m->m_flags & M_EXT) + m->m_len = len = min(len, MCLBYTES); + else + len = m->m_len; /* no cluster obtained: fill the plain mbuf instead */ + } else { + /* + * Place initial small packet/header at end of mbuf. + */ + if (len < m->m_len) { + if (top == 0 && len + max_linkhdr <= m->m_len) + m->m_data += max_linkhdr; + m->m_len = len; + } else + len = m->m_len; + } + if (copy) + copy(cp, mtod(m, caddr_t), (unsigned)len); + else + bcopy(cp, mtod(m, caddr_t), (unsigned)len); + cp += len; + *mp = m; + mp = &m->m_next; + totlen -= len; + if (cp == epkt) + cp = buf; /* reached the end of the packet area: continue from the start of buf */ + } + return (top); +} diff --git a/sys/kern/uipc_proto.c b/sys/kern/uipc_proto.c new file mode 100644 index 00000000000..da9828aa267 --- /dev/null +++ b/sys/kern/uipc_proto.c @@ -0,0 +1,72 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_proto.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include + +/* + * Definitions of protocols supported in the UNIX domain. 
+ */ + +int uipc_usrreq(), raw_usrreq(); +void raw_init(),raw_input(),raw_ctlinput(); +extern struct domain unixdomain; /* or at least forward */ +/* UNIX-domain protocol switch: a SOCK_STREAM and a SOCK_DGRAM entry, both served by uipc_usrreq, plus an untyped entry wired to the raw_* routines. */ +struct protosw unixsw[] = { +{ SOCK_STREAM, &unixdomain, 0, PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS, + 0, 0, 0, 0, + uipc_usrreq, + 0, 0, 0, 0, +}, +{ SOCK_DGRAM, &unixdomain, 0, PR_ATOMIC|PR_ADDR|PR_RIGHTS, + 0, 0, 0, 0, + uipc_usrreq, + 0, 0, 0, 0, +}, +{ 0, 0, 0, 0, + raw_input, 0, raw_ctlinput, 0, + raw_usrreq, + raw_init, 0, 0, 0, +} +}; + +int unp_externalize(), unp_dispose(); +/* The last two initializers are the first element of unixsw and the address one past its last element. */ +struct domain unixdomain = + { AF_UNIX, "unix", 0, unp_externalize, unp_dispose, + unixsw, &unixsw[sizeof(unixsw)/sizeof(unixsw[0])] }; diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c new file mode 100644 index 00000000000..d4af592d79b --- /dev/null +++ b/sys/kern/uipc_sockbuf.c @@ -0,0 +1,755 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Primitive routines for operating on sockets and socket buffers + */ + +/* strings for sleep message (netio is the one passed to tsleep by sbwait() and sb_lock()): */ +char netio[] = "netio"; +char netcon[] = "netcon"; +char netcls[] = "netcls"; + +u_long sb_max = SB_MAX; /* patchable; sbreserve() caps sb_mbmax at this value and rejects requests above sb_max * MCLBYTES / (MSIZE + MCLBYTES) */ + +/* + * Procedures to manipulate state flags of socket + * and do appropriate wakeups. Normal sequence from the + * active (originating) side is that soisconnecting() is + * called during processing of connect() call, + * resulting in an eventual call to soisconnected() if/when the + * connection is established. When the connection is torn down + * soisdisconnecting() is called during processing of disconnect() call, + * and soisdisconnected() is called when the connection to the peer + * is totally severed. The semantics of these routines are such that + * connectionless protocols can call soisconnected() and soisdisconnected() + * only, bypassing the in-progress calls when setting up a ``connection'' + * takes no time. 
+ * + * From the passive side, a socket is created with + * two queues of sockets: so_q0 for connections in progress + * and so_q for connections already made and awaiting user acceptance. + * As a protocol is preparing incoming connections, it creates a socket + * structure queued on so_q0 by calling sonewconn(). When the connection + * is established, soisconnected() is called, and transfers the + * socket structure to so_q, making it available to accept(). + * + * If a socket is closed with sockets on either + * so_q0 or so_q, these sockets are dropped. + * + * If higher level protocols are implemented in + * the kernel, the wakeups done here will sometimes + * cause software-interrupt process scheduling. + */ +/* Mark so as connecting: clear SS_ISCONNECTED and SS_ISDISCONNECTING, set SS_ISCONNECTING. No wakeups. */ +soisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= SS_ISCONNECTING; +} +/* Mark so as connected. If it can be removed from its head's so_q0, move it to the head's so_q and wake the head; otherwise wake sleepers on so itself. */ +soisconnected(so) + register struct socket *so; +{ + register struct socket *head = so->so_head; + + so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); + so->so_state |= SS_ISCONNECTED; + if (head && soqremque(so, 0)) { + soqinsque(head, so, 1); + sorwakeup(head); + wakeup((caddr_t)&head->so_timeo); + } else { + wakeup((caddr_t)&so->so_timeo); + sorwakeup(so); + sowwakeup(so); + } +} +/* Mark so as disconnecting: clear SS_ISCONNECTING, set SS_ISDISCONNECTING plus both SS_CANT*MORE flags, and wake all sleepers on so. */ +soisdisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~SS_ISCONNECTING; + so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); + wakeup((caddr_t)&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} +/* Mark so as fully disconnected: clear the three connection-progress flags, set both SS_CANT*MORE flags, and wake all sleepers on so. */ +soisdisconnected(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE); + wakeup((caddr_t)&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +/* + * When an attempt at a new connection is noted on a socket + * which accepts connections, sonewconn is called. If the + * connection is possible (subject to space constraints, etc.) 
+ * then we allocate a new structure, properly linked into the + * data structure of the original socket, and return this. + * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. + * + * Currently, sonewconn() is defined as sonewconn1() in socketvar.h + * to catch calls that are missing the (new) second parameter. + */ +struct socket * +sonewconn1(head, connstatus) + register struct socket *head; + int connstatus; +{ + register struct socket *so; + int soqueue = connstatus ? 1 : 0; /* a non-zero connstatus queues the new socket on so_q, zero on so_q0 */ + + if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2) + return ((struct socket *)0); + MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_DONTWAIT); + if (so == NULL) + return ((struct socket *)0); + bzero((caddr_t)so, sizeof(*so)); + so->so_type = head->so_type; + so->so_options = head->so_options &~ SO_ACCEPTCONN; + so->so_linger = head->so_linger; + so->so_state = head->so_state | SS_NOFDREF; + so->so_proto = head->so_proto; + so->so_timeo = head->so_timeo; + so->so_pgid = head->so_pgid; + (void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat); + soqinsque(head, so, soqueue); + if ((*so->so_proto->pr_usrreq)(so, PRU_ATTACH, + (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0)) { + (void) soqremque(so, soqueue); + (void) free((caddr_t)so, M_SOCKET); + return ((struct socket *)0); + } + if (connstatus) { + sorwakeup(head); + wakeup((caddr_t)&head->so_timeo); + so->so_state |= connstatus; + } + return (so); +} +/* Append so to the tail of head's so_q0 list (q == 0) or so_q list (q != 0), set so_head, and bump the matching length counter. */ +soqinsque(head, so, q) + register struct socket *head, *so; + int q; +{ + + register struct socket **prev; + so->so_head = head; + if (q == 0) { + head->so_q0len++; + so->so_q0 = 0; + for (prev = &(head->so_q0); *prev; ) + prev = &((*prev)->so_q0); + } else { + head->so_qlen++; + so->so_q = 0; + for (prev = &(head->so_q); *prev; ) + prev = &((*prev)->so_q); + } + *prev = so; +} +/* Unlink so from queue q of its so_head. Returns 1 if it was found and removed, 0 if so is not on that queue. */ +soqremque(so, q) + register struct socket *so; + int q; +{ + register struct socket *head, *prev, *next; + + head = so->so_head; + prev = head; + for (;;) { + next 
= q ? prev->so_q : prev->so_q0; + if (next == so) + break; + if (next == 0) + return (0); + prev = next; + } + if (q == 0) { + prev->so_q0 = next->so_q0; + head->so_q0len--; + } else { + prev->so_q = next->so_q; + head->so_qlen--; + } + next->so_q0 = next->so_q = 0; + next->so_head = 0; + return (1); +} + +/* + * Socantsendmore indicates that no more data will be sent on the + * socket; it would normally be applied to a socket when the user + * informs the system that no more data is to be sent, by the protocol + * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data + * will be received, and will normally be applied to the socket by a + * protocol when it detects that the peer will send no more data. + * Data queued for reading in the socket may yet be read. + */ + +socantsendmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTSENDMORE; + sowwakeup(so); +} + +socantrcvmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTRCVMORE; + sorwakeup(so); +} + +/* + * Wait for data to arrive at/drain from a socket buffer. + * Sleeps on &sb->sb_cc for at most sb_timeo; the sleep is + * interruptible (PCATCH) unless SB_NOINTR is set. Returns tsleep's result. + */ +sbwait(sb) + struct sockbuf *sb; +{ + + sb->sb_flags |= SB_WAIT; + return (tsleep((caddr_t)&sb->sb_cc, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, netio, + sb->sb_timeo)); +} + +/* + * Lock a sockbuf already known to be locked; + * return any error returned from sleep (EINTR). + */ +sb_lock(sb) + register struct sockbuf *sb; +{ + int error; + + while (sb->sb_flags & SB_LOCK) { + sb->sb_flags |= SB_WANT; + if (error = tsleep((caddr_t)&sb->sb_flags, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH, + netio, 0)) + return (error); + } + sb->sb_flags |= SB_LOCK; + return (0); +} + +/* + * Wakeup processes waiting on a socket buffer. + * Do asynchronous notification via SIGIO + * if the socket has the SS_ASYNC flag set. 
+ */ +sowakeup(so, sb) + register struct socket *so; + register struct sockbuf *sb; +{ + struct proc *p; + + selwakeup(&sb->sb_sel); + sb->sb_flags &= ~SB_SEL; + if (sb->sb_flags & SB_WAIT) { /* wake sleepers from sbwait(), which sleep on &sb->sb_cc */ + sb->sb_flags &= ~SB_WAIT; + wakeup((caddr_t)&sb->sb_cc); + } + if (so->so_state & SS_ASYNC) { /* negative so_pgid: signal process group -so_pgid; positive: signal that single process if pfind() locates it */ + if (so->so_pgid < 0) + gsignal(-so->so_pgid, SIGIO); + else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0) + psignal(p, SIGIO); + } +} + +/* + * Socket buffer (struct sockbuf) utility routines. + * + * Each socket contains two socket buffers: one for sending data and + * one for receiving data. Each buffer contains a queue of mbufs, + * information about the number of mbufs and amount of data in the + * queue, and other fields allowing select() statements and notification + * on data availability to be implemented. + * + * Data stored in a socket buffer is maintained as a list of records. + * Each record is a list of mbufs chained together with the m_next + * field. Records are chained together with the m_nextpkt field. The upper + * level routine soreceive() expects the following conventions to be + * observed when placing information in the receive buffer: + * + * 1. If the protocol requires each message be preceded by the sender's + * name, then a record containing that name must be present before + * any associated data (mbuf's must be of type MT_SONAME). + * 2. If the protocol supports the exchange of ``access rights'' (really + * just additional data associated with the message), and there are + * ``rights'' to be received, then a record containing this data + * should be present (mbuf's must be of type MT_RIGHTS). + * 3. If a name or rights record exists, then it must be followed by + * a data record, perhaps of zero length. + * + * Before using a new socket structure it is first necessary to reserve + * buffer space to the socket, by calling sbreserve(). 
This should commit + * some of the available buffer space in the system buffer pool for the + * socket (currently, it does nothing but enforce limits). The space + * should be released by calling sbrelease() when the socket is destroyed. + */ + +soreserve(so, sndcc, rcvcc) + register struct socket *so; + u_long sndcc, rcvcc; +{ + + if (sbreserve(&so->so_snd, sndcc) == 0) + goto bad; + if (sbreserve(&so->so_rcv, rcvcc) == 0) + goto bad2; + if (so->so_rcv.sb_lowat == 0) + so->so_rcv.sb_lowat = 1; + if (so->so_snd.sb_lowat == 0) + so->so_snd.sb_lowat = MCLBYTES; + if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) + so->so_snd.sb_lowat = so->so_snd.sb_hiwat; + return (0); +bad2: + sbrelease(&so->so_snd); +bad: + return (ENOBUFS); +} + +/* + * Allot mbufs to a sockbuf. + * Attempt to scale mbmax so that mbcnt doesn't become limiting + * if buffering efficiency is near the normal case. + */ +sbreserve(sb, cc) + struct sockbuf *sb; + u_long cc; +{ + + if (cc > sb_max * MCLBYTES / (MSIZE + MCLBYTES)) + return (0); + sb->sb_hiwat = cc; + sb->sb_mbmax = min(cc * 2, sb_max); + if (sb->sb_lowat > sb->sb_hiwat) + sb->sb_lowat = sb->sb_hiwat; + return (1); +} + +/* + * Free mbufs held by a socket, and reserved mbuf space. + */ +sbrelease(sb) + struct sockbuf *sb; +{ + + sbflush(sb); + sb->sb_hiwat = sb->sb_mbmax = 0; +} + +/* + * Routines to add and remove + * data from an mbuf queue. + * + * The routines sbappend() or sbappendrecord() are normally called to + * append new mbufs to a socket buffer, after checking that adequate + * space is available, comparing the function sbspace() with the amount + * of data to be added. sbappendrecord() differs from sbappend() in + * that data supplied is treated as the beginning of a new record. + * To place a sender's address, optional access rights, and data in a + * socket receive buffer, sbappendaddr() should be used. To place + * access rights and data in a socket receive buffer, sbappendrights() + * should be used. 
In either case, the new data begins a new record. + * Note that unlike sbappend() and sbappendrecord(), these routines check + * for the caller that there will be enough space to store the data. + * Each fails if there is not enough space, or if it cannot find mbufs + * to store additional information in. + * + * Reliable protocols may use the socket send buffer to hold data + * awaiting acknowledgement. Data is normally copied from a socket + * send buffer in a protocol with m_copy for output to a peer, + * and then removing the data from the socket buffer with sbdrop() + * or sbdroprecord() when the data is acknowledged by the peer. + */ + +/* + * Append mbuf chain m to the last record in the + * socket buffer sb. The additional space associated + * the mbuf chain is recorded in sb. Empty mbufs are + * discarded and mbufs are compacted where possible. + */ +sbappend(sb, m) + struct sockbuf *sb; + struct mbuf *m; +{ + register struct mbuf *n; + + if (m == 0) + return; + if (n = sb->sb_mb) { + while (n->m_nextpkt) + n = n->m_nextpkt; + do { + if (n->m_flags & M_EOR) { + sbappendrecord(sb, m); /* XXXXXX!!!! */ + return; + } + } while (n->m_next && (n = n->m_next)); + } + sbcompress(sb, m, n); +} + +#ifdef SOCKBUF_DEBUG +sbcheck(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m; + register int len = 0, mbcnt = 0; + + for (m = sb->sb_mb; m; m = m->m_next) { + len += m->m_len; + mbcnt += MSIZE; + if (m->m_flags & M_EXT) + mbcnt += m->m_ext.ext_size; + if (m->m_nextpkt) + panic("sbcheck nextpkt"); + } + if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { + printf("cc %d != %d || mbcnt %d != %d\n", len, sb->sb_cc, + mbcnt, sb->sb_mbcnt); + panic("sbcheck"); + } +} +#endif + +/* + * As above, except the mbuf chain + * begins a new record. 
+ */ +sbappendrecord(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + + if (m0 == 0) + return; + if (m = sb->sb_mb) + while (m->m_nextpkt) + m = m->m_nextpkt; + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. + */ + sballoc(sb, m0); + if (m) + m->m_nextpkt = m0; + else + sb->sb_mb = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * As above except that OOB data + * is inserted at the beginning of the sockbuf, + * but after any other OOB data. + */ +sbinsertoob(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + register struct mbuf **mp; + + if (m0 == 0) + return; + for (mp = &sb->sb_mb; m = *mp; mp = &((*mp)->m_nextpkt)) { + again: + switch (m->m_type) { + + case MT_OOBDATA: + continue; /* WANT next train */ + + case MT_CONTROL: + if (m = m->m_next) + goto again; /* inspect THIS train further */ + } + break; + } + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. + */ + sballoc(sb, m0); + m0->m_nextpkt = *mp; + *mp = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * Append address and data, and optionally, control (ancillary) data + * to the receive queue of a socket. If present, + * m0 must include a packet header with total length. + * Returns 0 if no space in sockbuf or insufficient mbufs. 
+ */ +sbappendaddr(sb, asa, m0, control) + register struct sockbuf *sb; + struct sockaddr *asa; + struct mbuf *m0, *control; +{ + register struct mbuf *m, *n; + int space = asa->sa_len; + +if (m0 && (m0->m_flags & M_PKTHDR) == 0) +panic("sbappendaddr"); + if (m0) + space += m0->m_pkthdr.len; + for (n = control; n; n = n->m_next) { + space += n->m_len; + if (n->m_next == 0) /* keep pointer to last control buf */ + break; + } + if (space > sbspace(sb)) + return (0); + if (asa->sa_len > MLEN) + return (0); + MGET(m, M_DONTWAIT, MT_SONAME); + if (m == 0) + return (0); + m->m_len = asa->sa_len; + bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len); + if (n) + n->m_next = m0; /* concatenate data to control */ + else + control = m0; + m->m_next = control; + for (n = m; n; n = n->m_next) + sballoc(sb, n); + if (n = sb->sb_mb) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = m; + } else + sb->sb_mb = m; + return (1); +} + +sbappendcontrol(sb, m0, control) + struct sockbuf *sb; + struct mbuf *control, *m0; +{ + register struct mbuf *m, *n; + int space = 0; + + if (control == 0) + panic("sbappendcontrol"); + for (m = control; ; m = m->m_next) { + space += m->m_len; + if (m->m_next == 0) + break; + } + n = m; /* save pointer to last control buffer */ + for (m = m0; m; m = m->m_next) + space += m->m_len; + if (space > sbspace(sb)) + return (0); + n->m_next = m0; /* concatenate data to control */ + for (m = control; m; m = m->m_next) + sballoc(sb, m); + if (n = sb->sb_mb) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = control; + } else + sb->sb_mb = control; + return (1); +} + +/* + * Compress mbuf chain m into the socket + * buffer sb following mbuf n. If n + * is null, the buffer is presumed empty. 
+ */ +sbcompress(sb, m, n) + register struct sockbuf *sb; + register struct mbuf *m, *n; +{ + register int eor = 0; + register struct mbuf *o; + + while (m) { + eor |= m->m_flags & M_EOR; + if (m->m_len == 0 && + (eor == 0 || + (((o = m->m_next) || (o = n)) && + o->m_type == m->m_type))) { + m = m_free(m); + continue; + } + if (n && (n->m_flags & (M_EXT | M_EOR)) == 0 && + (n->m_data + n->m_len + m->m_len) < &n->m_dat[MLEN] && + n->m_type == m->m_type) { + bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, + (unsigned)m->m_len); + n->m_len += m->m_len; + sb->sb_cc += m->m_len; + m = m_free(m); + continue; + } + if (n) + n->m_next = m; + else + sb->sb_mb = m; + sballoc(sb, m); + n = m; + m->m_flags &= ~M_EOR; + m = m->m_next; + n->m_next = 0; + } + if (eor) { + if (n) + n->m_flags |= eor; + else + printf("semi-panic: sbcompress\n"); + } +} + +/* + * Free all mbufs in a sockbuf. + * Check that all resources are reclaimed. + */ +sbflush(sb) + register struct sockbuf *sb; +{ + + if (sb->sb_flags & SB_LOCK) + panic("sbflush"); + while (sb->sb_mbcnt) + sbdrop(sb, (int)sb->sb_cc); + if (sb->sb_cc || sb->sb_mb) + panic("sbflush 2"); +} + +/* + * Drop data from (the front of) a sockbuf. + */ +sbdrop(sb, len) + register struct sockbuf *sb; + register int len; +{ + register struct mbuf *m, *mn; + struct mbuf *next; + + next = (m = sb->sb_mb) ? m->m_nextpkt : 0; + while (len > 0) { + if (m == 0) { + if (next == 0) + panic("sbdrop"); + m = next; + next = m->m_nextpkt; + continue; + } + if (m->m_len > len) { + m->m_len -= len; + m->m_data += len; + sb->sb_cc -= len; + break; + } + len -= m->m_len; + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } + while (m && m->m_len == 0) { + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } + if (m) { + sb->sb_mb = m; + m->m_nextpkt = next; + } else + sb->sb_mb = next; +} + +/* + * Drop a record off the front of a sockbuf + * and move the next record to the front. 
+ */ +sbdroprecord(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m, *mn; + + m = sb->sb_mb; + if (m) { + sb->sb_mb = m->m_nextpkt; + do { + sbfree(sb, m); + MFREE(m, mn); + } while (m = mn); + } +} diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c new file mode 100644 index 00000000000..ed09ee63b9f --- /dev/null +++ b/sys/kern/uipc_socket.c @@ -0,0 +1,1024 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Socket operation routines. + * These routines are called by the routines in + * sys_socket.c or from a system process, and + * implement the semantics of socket operations by + * switching out to the protocol specific routines. + */ +/*ARGSUSED*/ +socreate(dom, aso, type, proto) + int dom; + struct socket **aso; + register int type; + int proto; +{ + struct proc *p = curproc; /* XXX */ + register struct protosw *prp; + register struct socket *so; + register int error; + + if (proto) + prp = pffindproto(dom, proto, type); + else + prp = pffindtype(dom, type); + if (prp == 0 || prp->pr_usrreq == 0) + return (EPROTONOSUPPORT); + if (prp->pr_type != type) + return (EPROTOTYPE); + MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT); + bzero((caddr_t)so, sizeof(*so)); + so->so_type = type; + if (p->p_ucred->cr_uid == 0) + so->so_state = SS_PRIV; + so->so_proto = prp; + error = + (*prp->pr_usrreq)(so, PRU_ATTACH, + (struct mbuf *)0, (struct mbuf *)proto, (struct mbuf *)0); + if (error) { + so->so_state |= SS_NOFDREF; + sofree(so); + return (error); + } + *aso = so; + return (0); +} + +sobind(so, nam) + struct socket *so; + struct mbuf *nam; +{ + int s = splnet(); + int error; + + error = + (*so->so_proto->pr_usrreq)(so, PRU_BIND, + (struct mbuf *)0, nam, 
(struct mbuf *)0); + splx(s); + return (error); +} + +solisten(so, backlog) + register struct socket *so; + int backlog; +{ + int s = splnet(), error; + + error = + (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, + (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0); + if (error) { + splx(s); + return (error); + } + if (so->so_q == 0) + so->so_options |= SO_ACCEPTCONN; + if (backlog < 0) + backlog = 0; + so->so_qlimit = min(backlog, SOMAXCONN); + splx(s); + return (0); +} + +sofree(so) + register struct socket *so; +{ + + if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) + return; + if (so->so_head) { + if (!soqremque(so, 0) && !soqremque(so, 1)) + panic("sofree dq"); + so->so_head = 0; + } + sbrelease(&so->so_snd); + sorflush(so); + FREE(so, M_SOCKET); +} + +/* + * Close a socket on last file table reference removal. + * Initiate disconnect if connected. + * Free socket when disconnect complete. + */ +soclose(so) + register struct socket *so; +{ + int s = splnet(); /* conservative */ + int error = 0; + + if (so->so_options & SO_ACCEPTCONN) { + while (so->so_q0) + (void) soabort(so->so_q0); + while (so->so_q) + (void) soabort(so->so_q); + } + if (so->so_pcb == 0) + goto discard; + if (so->so_state & SS_ISCONNECTED) { + if ((so->so_state & SS_ISDISCONNECTING) == 0) { + error = sodisconnect(so); + if (error) + goto drop; + } + if (so->so_options & SO_LINGER) { + if ((so->so_state & SS_ISDISCONNECTING) && + (so->so_state & SS_NBIO)) + goto drop; + while (so->so_state & SS_ISCONNECTED) + if (error = tsleep((caddr_t)&so->so_timeo, + PSOCK | PCATCH, netcls, so->so_linger)) + break; + } + } +drop: + if (so->so_pcb) { + int error2 = + (*so->so_proto->pr_usrreq)(so, PRU_DETACH, + (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0); + if (error == 0) + error = error2; + } +discard: + if (so->so_state & SS_NOFDREF) + panic("soclose: NOFDREF"); + so->so_state |= SS_NOFDREF; + sofree(so); + splx(s); + return (error); +} + +/* + * Must be called at splnet... 
+ */ +soabort(so) + struct socket *so; +{ + + return ( + (*so->so_proto->pr_usrreq)(so, PRU_ABORT, + (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0)); +} + +soaccept(so, nam) + register struct socket *so; + struct mbuf *nam; +{ + int s = splnet(); + int error; + + if ((so->so_state & SS_NOFDREF) == 0) + panic("soaccept: !NOFDREF"); + so->so_state &= ~SS_NOFDREF; + error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, + (struct mbuf *)0, nam, (struct mbuf *)0); + splx(s); + return (error); +} + +soconnect(so, nam) + register struct socket *so; + struct mbuf *nam; +{ + int s; + int error; + + if (so->so_options & SO_ACCEPTCONN) + return (EOPNOTSUPP); + s = splnet(); + /* + * If protocol is connection-based, can only connect once. + * Otherwise, if connected, try to disconnect first. + * This allows user to disconnect by connecting to, e.g., + * a null address. + */ + if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && + ((so->so_proto->pr_flags & PR_CONNREQUIRED) || + (error = sodisconnect(so)))) + error = EISCONN; + else + error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT, + (struct mbuf *)0, nam, (struct mbuf *)0); + splx(s); + return (error); +} + +soconnect2(so1, so2) + register struct socket *so1; + struct socket *so2; +{ + int s = splnet(); + int error; + + error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, + (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0); + splx(s); + return (error); +} + +sodisconnect(so) + register struct socket *so; +{ + int s = splnet(); + int error; + + if ((so->so_state & SS_ISCONNECTED) == 0) { + error = ENOTCONN; + goto bad; + } + if (so->so_state & SS_ISDISCONNECTING) { + error = EALREADY; + goto bad; + } + error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, + (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0); +bad: + splx(s); + return (error); +} + +#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) +/* + * Send on a socket. 
+ * If send must go all at once and message is larger than + * send buffering, then hard error. + * Lock against other senders. + * If must go all at once and not enough room now, then + * inform user that this would block and do nothing. + * Otherwise, if nonblocking, send as much as possible. + * The data to be sent is described by "uio" if nonzero, + * otherwise by the mbuf chain "top" (which must be null + * if uio is not). Data provided in mbuf chain must be small + * enough to send all at once. + * + * Returns nonzero on error, timeout or signal; callers + * must check for short counts if EINTR/ERESTART are returned. + * Data and control buffers are freed on return. + */ +sosend(so, addr, uio, top, control, flags) + register struct socket *so; + struct mbuf *addr; + struct uio *uio; + struct mbuf *top; + struct mbuf *control; + int flags; +{ + struct proc *p = curproc; /* XXX */ + struct mbuf **mp; + register struct mbuf *m; + register long space, len, resid; + int clen = 0, error, s, dontroute, mlen; + int atomic = sosendallatonce(so) || top; + + if (uio) + resid = uio->uio_resid; + else + resid = top->m_pkthdr.len; + /* + * In theory resid should be unsigned. + * However, space must be signed, as it might be less than 0 + * if we over-committed, and we must use a signed comparison + * of space and resid. On the other hand, a negative resid + * causes us to loop sending 0-length segments to the protocol. 
+ */ + if (resid < 0) + return (EINVAL); + dontroute = + (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && + (so->so_proto->pr_flags & PR_ATOMIC); + p->p_stats->p_ru.ru_msgsnd++; + if (control) + clen = control->m_len; +#define snderr(errno) { error = errno; splx(s); goto release; } + +restart: + if (error = sblock(&so->so_snd, SBLOCKWAIT(flags))) + goto out; + do { + s = splnet(); + if (so->so_state & SS_CANTSENDMORE) + snderr(EPIPE); + if (so->so_error) + snderr(so->so_error); + if ((so->so_state & SS_ISCONNECTED) == 0) { + if (so->so_proto->pr_flags & PR_CONNREQUIRED) { + if ((so->so_state & SS_ISCONFIRMING) == 0 && + !(resid == 0 && clen != 0)) + snderr(ENOTCONN); + } else if (addr == 0) + snderr(EDESTADDRREQ); + } + space = sbspace(&so->so_snd); + if (flags & MSG_OOB) + space += 1024; + if (atomic && resid > so->so_snd.sb_hiwat || + clen > so->so_snd.sb_hiwat) + snderr(EMSGSIZE); + if (space < resid + clen && uio && + (atomic || space < so->so_snd.sb_lowat || space < clen)) { + if (so->so_state & SS_NBIO) + snderr(EWOULDBLOCK); + sbunlock(&so->so_snd); + error = sbwait(&so->so_snd); + splx(s); + if (error) + goto out; + goto restart; + } + splx(s); + mp = ⊤ + space -= clen; + do { + if (uio == NULL) { + /* + * Data is prepackaged in "top". 
+ */ + resid = 0; + if (flags & MSG_EOR) + top->m_flags |= M_EOR; + } else do { + if (top == 0) { + MGETHDR(m, M_WAIT, MT_DATA); + mlen = MHLEN; + m->m_pkthdr.len = 0; + m->m_pkthdr.rcvif = (struct ifnet *)0; + } else { + MGET(m, M_WAIT, MT_DATA); + mlen = MLEN; + } + if (resid >= MINCLSIZE && space >= MCLBYTES) { + MCLGET(m, M_WAIT); + if ((m->m_flags & M_EXT) == 0) + goto nopages; + mlen = MCLBYTES; +#ifdef MAPPED_MBUFS + len = min(MCLBYTES, resid); +#else + if (atomic && top == 0) { + len = min(MCLBYTES - max_hdr, resid); + m->m_data += max_hdr; + } else + len = min(MCLBYTES, resid); +#endif + space -= MCLBYTES; + } else { +nopages: + len = min(min(mlen, resid), space); + space -= len; + /* + * For datagram protocols, leave room + * for protocol headers in first mbuf. + */ + if (atomic && top == 0 && len < mlen) + MH_ALIGN(m, len); + } + error = uiomove(mtod(m, caddr_t), (int)len, uio); + resid = uio->uio_resid; + m->m_len = len; + *mp = m; + top->m_pkthdr.len += len; + if (error) + goto release; + mp = &m->m_next; + if (resid <= 0) { + if (flags & MSG_EOR) + top->m_flags |= M_EOR; + break; + } + } while (space > 0 && atomic); + if (dontroute) + so->so_options |= SO_DONTROUTE; + s = splnet(); /* XXX */ + error = (*so->so_proto->pr_usrreq)(so, + (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND, + top, addr, control); + splx(s); + if (dontroute) + so->so_options &= ~SO_DONTROUTE; + clen = 0; + control = 0; + top = 0; + mp = ⊤ + if (error) + goto release; + } while (resid && space > 0); + } while (resid); + +release: + sbunlock(&so->so_snd); +out: + if (top) + m_freem(top); + if (control) + m_freem(control); + return (error); +} + +/* + * Implement receive operations on a socket. + * We depend on the way that records are added to the sockbuf + * by sbappend*. 
In particular, each record (mbufs linked through m_next) + * must begin with an address if the protocol so specifies, + * followed by an optional mbuf or mbufs containing ancillary data, + * and then zero or more mbufs of data. + * In order to avoid blocking network interrupts for the entire time here, + * we splx() while doing the actual copy to user space. + * Although the sockbuf is locked, new data may still be appended, + * and thus we must maintain consistency of the sockbuf during that time. + * + * The caller may receive the data as a single mbuf chain by supplying + * an mbuf **mp0 for use in returning the chain. The uio is then used + * only for the count in uio_resid. + */ +soreceive(so, paddr, uio, mp0, controlp, flagsp) + register struct socket *so; + struct mbuf **paddr; + struct uio *uio; + struct mbuf **mp0; + struct mbuf **controlp; + int *flagsp; +{ + register struct mbuf *m, **mp; + register int flags, len, error, s, offset; + struct protosw *pr = so->so_proto; + struct mbuf *nextrecord; + int moff, type; + int orig_resid = uio->uio_resid; + + mp = mp0; + if (paddr) + *paddr = 0; + if (controlp) + *controlp = 0; + if (flagsp) + flags = *flagsp &~ MSG_EOR; + else + flags = 0; + if (flags & MSG_OOB) { + m = m_get(M_WAIT, MT_DATA); + error = (*pr->pr_usrreq)(so, PRU_RCVOOB, + m, (struct mbuf *)(flags & MSG_PEEK), (struct mbuf *)0); + if (error) + goto bad; + do { + error = uiomove(mtod(m, caddr_t), + (int) min(uio->uio_resid, m->m_len), uio); + m = m_free(m); + } while (uio->uio_resid && error == 0 && m); +bad: + if (m) + m_freem(m); + return (error); + } + if (mp) + *mp = (struct mbuf *)0; + if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) + (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, + (struct mbuf *)0, (struct mbuf *)0); + +restart: + if (error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) + return (error); + s = splnet(); + + m = so->so_rcv.sb_mb; + /* + * If we have less data than requested, block awaiting more + * (subject to any 
timeout) if: + * 1. the current count is less than the low water mark, or + * 2. MSG_WAITALL is set, and it is possible to do the entire + * receive operation at once if we block (resid <= hiwat). + * 3. MSG_DONTWAIT is not set + * If MSG_WAITALL is set but resid is larger than the receive buffer, + * we have to do the receive in sections, and thus risk returning + * a short count if a timeout or signal occurs after we start. + */ + if (m == 0 || ((flags & MSG_DONTWAIT) == 0 && + so->so_rcv.sb_cc < uio->uio_resid) && + (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || + ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && + m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0) { +#ifdef DIAGNOSTIC + if (m == 0 && so->so_rcv.sb_cc) + panic("receive 1"); +#endif + if (so->so_error) { + if (m) + goto dontblock; + error = so->so_error; + if ((flags & MSG_PEEK) == 0) + so->so_error = 0; + goto release; + } + if (so->so_state & SS_CANTRCVMORE) { + if (m) + goto dontblock; + else + goto release; + } + for (; m; m = m->m_next) + if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { + m = so->so_rcv.sb_mb; + goto dontblock; + } + if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && + (so->so_proto->pr_flags & PR_CONNREQUIRED)) { + error = ENOTCONN; + goto release; + } + if (uio->uio_resid == 0) + goto release; + if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) { + error = EWOULDBLOCK; + goto release; + } + sbunlock(&so->so_rcv); + error = sbwait(&so->so_rcv); + splx(s); + if (error) + return (error); + goto restart; + } +dontblock: + if (uio->uio_procp) + uio->uio_procp->p_stats->p_ru.ru_msgrcv++; + nextrecord = m->m_nextpkt; + if (pr->pr_flags & PR_ADDR) { +#ifdef DIAGNOSTIC + if (m->m_type != MT_SONAME) + panic("receive 1a"); +#endif + orig_resid = 0; + if (flags & MSG_PEEK) { + if (paddr) + *paddr = m_copy(m, 0, m->m_len); + m = m->m_next; + } else { + sbfree(&so->so_rcv, m); + if (paddr) { + *paddr = m; + so->so_rcv.sb_mb = m->m_next; + 
m->m_next = 0; + m = so->so_rcv.sb_mb; + } else { + MFREE(m, so->so_rcv.sb_mb); + m = so->so_rcv.sb_mb; + } + } + } + while (m && m->m_type == MT_CONTROL && error == 0) { + if (flags & MSG_PEEK) { + if (controlp) + *controlp = m_copy(m, 0, m->m_len); + m = m->m_next; + } else { + sbfree(&so->so_rcv, m); + if (controlp) { + if (pr->pr_domain->dom_externalize && + mtod(m, struct cmsghdr *)->cmsg_type == + SCM_RIGHTS) + error = (*pr->pr_domain->dom_externalize)(m); + *controlp = m; + so->so_rcv.sb_mb = m->m_next; + m->m_next = 0; + m = so->so_rcv.sb_mb; + } else { + MFREE(m, so->so_rcv.sb_mb); + m = so->so_rcv.sb_mb; + } + } + if (controlp) { + orig_resid = 0; + controlp = &(*controlp)->m_next; + } + } + if (m) { + if ((flags & MSG_PEEK) == 0) + m->m_nextpkt = nextrecord; + type = m->m_type; + if (type == MT_OOBDATA) + flags |= MSG_OOB; + } + moff = 0; + offset = 0; + while (m && uio->uio_resid > 0 && error == 0) { + if (m->m_type == MT_OOBDATA) { + if (type != MT_OOBDATA) + break; + } else if (type == MT_OOBDATA) + break; +#ifdef DIAGNOSTIC + else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) + panic("receive 3"); +#endif + so->so_state &= ~SS_RCVATMARK; + len = uio->uio_resid; + if (so->so_oobmark && len > so->so_oobmark - offset) + len = so->so_oobmark - offset; + if (len > m->m_len - moff) + len = m->m_len - moff; + /* + * If mp is set, just pass back the mbufs. + * Otherwise copy them out via the uio, then free. + * Sockbuf must be consistent here (points to current mbuf, + * it points to next record) when we drop priority; + * we must note any additions to the sockbuf when we + * block interrupts again. 
+ */ + if (mp == 0) { + splx(s); + error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio); + s = splnet(); + } else + uio->uio_resid -= len; + if (len == m->m_len - moff) { + if (m->m_flags & M_EOR) + flags |= MSG_EOR; + if (flags & MSG_PEEK) { + m = m->m_next; + moff = 0; + } else { + nextrecord = m->m_nextpkt; + sbfree(&so->so_rcv, m); + if (mp) { + *mp = m; + mp = &m->m_next; + so->so_rcv.sb_mb = m = m->m_next; + *mp = (struct mbuf *)0; + } else { + MFREE(m, so->so_rcv.sb_mb); + m = so->so_rcv.sb_mb; + } + if (m) + m->m_nextpkt = nextrecord; + } + } else { + if (flags & MSG_PEEK) + moff += len; + else { + if (mp) + *mp = m_copym(m, 0, len, M_WAIT); + m->m_data += len; + m->m_len -= len; + so->so_rcv.sb_cc -= len; + } + } + if (so->so_oobmark) { + if ((flags & MSG_PEEK) == 0) { + so->so_oobmark -= len; + if (so->so_oobmark == 0) { + so->so_state |= SS_RCVATMARK; + break; + } + } else { + offset += len; + if (offset == so->so_oobmark) + break; + } + } + if (flags & MSG_EOR) + break; + /* + * If the MSG_WAITALL flag is set (for non-atomic socket), + * we must not quit until "uio->uio_resid == 0" or an error + * termination. If a signal/timeout occurs, return + * with a short count but without error. + * Keep sockbuf locked against other readers. 
+ */ + while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 && + !sosendallatonce(so) && !nextrecord) { + if (so->so_error || so->so_state & SS_CANTRCVMORE) + break; + error = sbwait(&so->so_rcv); + if (error) { + sbunlock(&so->so_rcv); + splx(s); + return (0); + } + if (m = so->so_rcv.sb_mb) + nextrecord = m->m_nextpkt; + } + } + + if (m && pr->pr_flags & PR_ATOMIC) { + flags |= MSG_TRUNC; + if ((flags & MSG_PEEK) == 0) + (void) sbdroprecord(&so->so_rcv); + } + if ((flags & MSG_PEEK) == 0) { + if (m == 0) + so->so_rcv.sb_mb = nextrecord; + if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) + (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, + (struct mbuf *)flags, (struct mbuf *)0, + (struct mbuf *)0); + } + if (orig_resid == uio->uio_resid && orig_resid && + (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { + sbunlock(&so->so_rcv); + splx(s); + goto restart; + } + + if (flagsp) + *flagsp |= flags; +release: + sbunlock(&so->so_rcv); + splx(s); + return (error); +} + +soshutdown(so, how) + register struct socket *so; + register int how; +{ + register struct protosw *pr = so->so_proto; + + how++; + if (how & FREAD) + sorflush(so); + if (how & FWRITE) + return ((*pr->pr_usrreq)(so, PRU_SHUTDOWN, + (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0)); + return (0); +} + +sorflush(so) + register struct socket *so; +{ + register struct sockbuf *sb = &so->so_rcv; + register struct protosw *pr = so->so_proto; + register int s; + struct sockbuf asb; + + sb->sb_flags |= SB_NOINTR; + (void) sblock(sb, M_WAITOK); + s = splimp(); + socantrcvmore(so); + sbunlock(sb); + asb = *sb; + bzero((caddr_t)sb, sizeof (*sb)); + splx(s); + if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) + (*pr->pr_domain->dom_dispose)(asb.sb_mb); + sbrelease(&asb); +} + +sosetopt(so, level, optname, m0) + register struct socket *so; + int level, optname; + struct mbuf *m0; +{ + int error = 0; + register struct mbuf *m = m0; + + if (level != SOL_SOCKET) { + if (so->so_proto && 
so->so_proto->pr_ctloutput) + return ((*so->so_proto->pr_ctloutput) + (PRCO_SETOPT, so, level, optname, &m0)); + error = ENOPROTOOPT; + } else { + switch (optname) { + + case SO_LINGER: + if (m == NULL || m->m_len != sizeof (struct linger)) { + error = EINVAL; + goto bad; + } + so->so_linger = mtod(m, struct linger *)->l_linger; + /* fall thru... */ + + case SO_DEBUG: + case SO_KEEPALIVE: + case SO_DONTROUTE: + case SO_USELOOPBACK: + case SO_BROADCAST: + case SO_REUSEADDR: + case SO_REUSEPORT: + case SO_OOBINLINE: + if (m == NULL || m->m_len < sizeof (int)) { + error = EINVAL; + goto bad; + } + if (*mtod(m, int *)) + so->so_options |= optname; + else + so->so_options &= ~optname; + break; + + case SO_SNDBUF: + case SO_RCVBUF: + case SO_SNDLOWAT: + case SO_RCVLOWAT: + if (m == NULL || m->m_len < sizeof (int)) { + error = EINVAL; + goto bad; + } + switch (optname) { + + case SO_SNDBUF: + case SO_RCVBUF: + if (sbreserve(optname == SO_SNDBUF ? + &so->so_snd : &so->so_rcv, + (u_long) *mtod(m, int *)) == 0) { + error = ENOBUFS; + goto bad; + } + break; + + case SO_SNDLOWAT: + so->so_snd.sb_lowat = *mtod(m, int *); + break; + case SO_RCVLOWAT: + so->so_rcv.sb_lowat = *mtod(m, int *); + break; + } + break; + + case SO_SNDTIMEO: + case SO_RCVTIMEO: + { + struct timeval *tv; + short val; + + if (m == NULL || m->m_len < sizeof (*tv)) { + error = EINVAL; + goto bad; + } + tv = mtod(m, struct timeval *); + if (tv->tv_sec > SHRT_MAX / hz - hz) { + error = EDOM; + goto bad; + } + val = tv->tv_sec * hz + tv->tv_usec / tick; + + switch (optname) { + + case SO_SNDTIMEO: + so->so_snd.sb_timeo = val; + break; + case SO_RCVTIMEO: + so->so_rcv.sb_timeo = val; + break; + } + break; + } + + default: + error = ENOPROTOOPT; + break; + } + if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) { + (void) ((*so->so_proto->pr_ctloutput) + (PRCO_SETOPT, so, level, optname, &m0)); + m = NULL; /* freed by protocol */ + } + } +bad: + if (m) + (void) m_free(m); + return (error); +} + 
+/*
+ * Fetch a socket option.
+ *
+ * Options at a level other than SOL_SOCKET are delegated to the
+ * protocol's pr_ctloutput routine (ENOPROTOOPT if it has none).
+ * For SOL_SOCKET a fresh MT_SOOPTS mbuf holding the value is handed
+ * back through *mp and 0 is returned; an unknown option frees the
+ * mbuf and returns ENOPROTOOPT.  Reading SO_ERROR also clears the
+ * pending error.
+ */
+sogetopt(so, level, optname, mp)
+	register struct socket *so;
+	int level, optname;
+	struct mbuf **mp;
+{
+	register struct mbuf *m;
+
+	if (level != SOL_SOCKET) {
+		if (so->so_proto && so->so_proto->pr_ctloutput) {
+			return ((*so->so_proto->pr_ctloutput)
+				  (PRCO_GETOPT, so, level, optname, mp));
+		} else
+			return (ENOPROTOOPT);
+	} else {
+		m = m_get(M_WAIT, MT_SOOPTS);
+		m->m_len = sizeof (int);	/* default; widened below */
+
+		switch (optname) {
+
+		case SO_LINGER:
+			m->m_len = sizeof (struct linger);
+			mtod(m, struct linger *)->l_onoff =
+				so->so_options & SO_LINGER;
+			mtod(m, struct linger *)->l_linger = so->so_linger;
+			break;
+
+		case SO_USELOOPBACK:
+		case SO_DONTROUTE:
+		case SO_DEBUG:
+		case SO_KEEPALIVE:
+		case SO_REUSEADDR:
+		case SO_REUSEPORT:
+		case SO_BROADCAST:
+		case SO_OOBINLINE:
+			/* the option name doubles as its bit in so_options */
+			*mtod(m, int *) = so->so_options & optname;
+			break;
+
+		case SO_TYPE:
+			*mtod(m, int *) = so->so_type;
+			break;
+
+		case SO_ERROR:
+			*mtod(m, int *) = so->so_error;
+			so->so_error = 0;
+			break;
+
+		case SO_SNDBUF:
+			*mtod(m, int *) = so->so_snd.sb_hiwat;
+			break;
+
+		case SO_RCVBUF:
+			*mtod(m, int *) = so->so_rcv.sb_hiwat;
+			break;
+
+		case SO_SNDLOWAT:
+			*mtod(m, int *) = so->so_snd.sb_lowat;
+			break;
+
+		case SO_RCVLOWAT:
+			*mtod(m, int *) = so->so_rcv.sb_lowat;
+			break;
+
+		case SO_SNDTIMEO:
+		case SO_RCVTIMEO:
+		    {
+			int val = (optname == SO_SNDTIMEO ?
+			     so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
+
+			/*
+			 * sosetopt() stores tv_sec * hz + tv_usec / tick,
+			 * so the ticks left over after the whole seconds
+			 * are turned back into microseconds by multiplying
+			 * by tick, not dividing by it.
+			 */
+			m->m_len = sizeof(struct timeval);
+			mtod(m, struct timeval *)->tv_sec = val / hz;
+			mtod(m, struct timeval *)->tv_usec =
+			    (val % hz) * tick;
+			break;
+		    }
+
+		default:
+			(void)m_free(m);
+			return (ENOPROTOOPT);
+		}
+		*mp = m;
+		return (0);
+	}
+}
+
+/*
+ * Announce out-of-band data on a socket: send SIGURG to the owning
+ * process group (so_pgid negative) or process (so_pgid positive, if
+ * pfind() still finds it), then wake anyone selecting on the receive
+ * buffer.
+ */
+sohasoutofband(so)
+	register struct socket *so;
+{
+	struct proc *p;
+
+	if (so->so_pgid < 0)
+		gsignal(-so->so_pgid, SIGURG);
+	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
+		psignal(p, SIGURG);
+	selwakeup(&so->so_rcv.sb_sel);
+}
diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c
new file mode 100644
index 00000000000..d4af592d79b
--- /dev/null
+++ b/sys/kern/uipc_socket2.c
@@ -0,0 +1,755 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Primitive routines for operating on sockets and socket buffers + */ + +/* strings for sleep message: */ +char netio[] = "netio"; +char netcon[] = "netcon"; +char netcls[] = "netcls"; + +u_long sb_max = SB_MAX; /* patchable */ + +/* + * Procedures to manipulate state flags of socket + * and do appropriate wakeups. Normal sequence from the + * active (originating) side is that soisconnecting() is + * called during processing of connect() call, + * resulting in an eventual call to soisconnected() if/when the + * connection is established. When the connection is torn down + * soisdisconnecting() is called during processing of disconnect() call, + * and soisdisconnected() is called when the connection to the peer + * is totally severed. The semantics of these routines are such that + * connectionless protocols can call soisconnected() and soisdisconnected() + * only, bypassing the in-progress calls when setting up a ``connection'' + * takes no time. 
+ *
+ * From the passive side, a socket is created with
+ * two queues of sockets: so_q0 for connections in progress
+ * and so_q for connections already made and awaiting user acceptance.
+ * As a protocol is preparing incoming connections, it creates a socket
+ * structure queued on so_q0 by calling sonewconn().  When the connection
+ * is established, soisconnected() is called, and transfers the
+ * socket structure to so_q, making it available to accept().
+ *
+ * If a socket is closed with sockets on either
+ * so_q0 or so_q, these sockets are dropped.
+ *
+ * If higher level protocols are implemented in
+ * the kernel, the wakeups done here will sometimes
+ * cause software-interrupt process scheduling.
+ */
+
+/*
+ * Note that a connection attempt is under way: clear the connected
+ * and disconnecting states and set SS_ISCONNECTING.
+ */
+soisconnecting(so)
+	register struct socket *so;
+{
+
+	so->so_state = (so->so_state &
+	    ~(SS_ISCONNECTED|SS_ISDISCONNECTING)) | SS_ISCONNECTING;
+}
+
+/*
+ * Note that the connection is established.  A socket that is still on
+ * its listener's so_q0 queue is moved to so_q and the listener is
+ * woken; otherwise the sleepers on the socket itself are woken.
+ */
+soisconnected(so)
+	register struct socket *so;
+{
+	register struct socket *listener = so->so_head;
+
+	so->so_state = (so->so_state &
+	    ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING)) |
+	    SS_ISCONNECTED;
+	if (listener == NULL || soqremque(so, 0) == 0) {
+		/* not a pending passive connection: wake our own waiters */
+		wakeup((caddr_t)&so->so_timeo);
+		sorwakeup(so);
+		sowwakeup(so);
+		return;
+	}
+	soqinsque(listener, so, 1);
+	sorwakeup(listener);
+	wakeup((caddr_t)&listener->so_timeo);
+}
+
+/*
+ * Note that a disconnect has been started: no more data may be sent
+ * or received, and everyone sleeping on the socket is woken.
+ */
+soisdisconnecting(so)
+	register struct socket *so;
+{
+
+	so->so_state = (so->so_state & ~SS_ISCONNECTING) |
+	    (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
+	wakeup((caddr_t)&so->so_timeo);
+	sowwakeup(so);
+	sorwakeup(so);
+}
+
+/*
+ * Note that the connection is completely gone: clear every connection
+ * state bit, forbid further send and receive, and wake all sleepers.
+ */
+soisdisconnected(so)
+	register struct socket *so;
+{
+
+	so->so_state = (so->so_state &
+	    ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING)) |
+	    (SS_CANTRCVMORE|SS_CANTSENDMORE);
+	wakeup((caddr_t)&so->so_timeo);
+	sowwakeup(so);
+	sorwakeup(so);
+}
+
+/*
+ * When an attempt at a new connection is noted on a socket
+ * which accepts connections, sonewconn is called.  If the
+ * connection is possible (subject to space constraints, etc.)
+ * then we allocate a new structure, properly linked into the
+ * data structure of the original socket, and return this.
+ * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
+ *
+ * Currently, sonewconn() is defined as sonewconn1() in socketvar.h
+ * to catch calls that are missing the (new) second parameter.
+ */
+struct socket *
+sonewconn1(head, connstatus)
+	register struct socket *head;
+	int connstatus;
+{
+	register struct socket *so;
+	/* a non-zero status puts the socket straight onto so_q */
+	int whichq = (connstatus != 0);
+
+	/* refuse once the two queues together exceed 1.5 * so_qlimit */
+	if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2)
+		return ((struct socket *)0);
+	MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_DONTWAIT);
+	if (so == NULL)
+		return ((struct socket *)0);
+	bzero((caddr_t)so, sizeof(*so));
+
+	/* inherit the listener's attributes */
+	so->so_type = head->so_type;
+	so->so_options = head->so_options &~ SO_ACCEPTCONN;
+	so->so_linger = head->so_linger;
+	so->so_state = head->so_state | SS_NOFDREF;
+	so->so_proto = head->so_proto;
+	so->so_timeo = head->so_timeo;
+	so->so_pgid = head->so_pgid;
+	(void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat);
+
+	soqinsque(head, so, whichq);
+	if ((*so->so_proto->pr_usrreq)(so, PRU_ATTACH,
+	    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0)) {
+		/* protocol refused the attach: undo the queueing */
+		(void) soqremque(so, whichq);
+		(void) free((caddr_t)so, M_SOCKET);
+		return ((struct socket *)0);
+	}
+	if (connstatus) {
+		sorwakeup(head);
+		wakeup((caddr_t)&head->so_timeo);
+		so->so_state |= connstatus;
+	}
+	return (so);
+}
+
+/*
+ * Append so to the tail of one of head's queues: so_q0 when q is 0,
+ * so_q otherwise.  The matching length counter is incremented and
+ * so_head is pointed back at head.
+ */
+soqinsque(head, so, q)
+	register struct socket *head, *so;
+	int q;
+{
+	register struct socket **tailp;
+
+	so->so_head = head;
+	if (q == 0) {
+		head->so_q0len++;
+		so->so_q0 = 0;
+		tailp = &head->so_q0;
+		while (*tailp != NULL)
+			tailp = &(*tailp)->so_q0;
+	} else {
+		head->so_qlen++;
+		so->so_q = 0;
+		tailp = &head->so_q;
+		while (*tailp != NULL)
+			tailp = &(*tailp)->so_q;
+	}
+	*tailp = so;
+}
+
+/*
+ * Unlink so from its head's so_q0 (q == 0) or so_q (q != 0) queue.
+ * Returns 1 if the socket was found and removed, in which case its
+ * queue links and so_head are cleared; returns 0 if it was not on
+ * that queue.
+ */
+soqremque(so, q)
+	register struct socket *so;
+	int q;
+{
+	register struct socket *head, *prev, *next;
+
+	prev = head = so->so_head;
+	/* walk the chosen chain until the link that points at so */
+	while ((next = q ? prev->so_q : prev->so_q0) != so) {
+		if (next == NULL)
+			return (0);
+		prev = next;
+	}
+	if (q == 0) {
+		prev->so_q0 = next->so_q0;
+		head->so_q0len--;
+	} else {
+		prev->so_q = next->so_q;
+		head->so_qlen--;
+	}
+	next->so_q0 = 0;
+	next->so_q = 0;
+	next->so_head = 0;
+	return (1);
+}
+
+/*
+ * Socantsendmore indicates that no more data will be sent on the
+ * socket; it would normally be applied to a socket when the user
+ * informs the system that no more data is to be sent, by the protocol
+ * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
+ * will be received, and will normally be applied to the socket by a
+ * protocol when it detects that the peer will send no more data.
+ * Data queued for reading in the socket may yet be read.
+ */
+
+socantsendmore(so)
+	struct socket *so;
+{
+
+	so->so_state |= SS_CANTSENDMORE;
+	sowwakeup(so);
+}
+
+socantrcvmore(so)
+	struct socket *so;
+{
+
+	so->so_state |= SS_CANTRCVMORE;
+	sorwakeup(so);
+}
+
+/*
+ * Wait for data to arrive at/drain from a socket buffer.
+ * Sleeps on sb_cc for at most sb_timeo; the sleep can be interrupted
+ * by a signal unless SB_NOINTR is set.  Returns the tsleep() result.
+ */
+sbwait(sb)
+	struct sockbuf *sb;
+{
+	int pri;
+
+	sb->sb_flags |= SB_WAIT;
+	pri = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;
+	return (tsleep((caddr_t)&sb->sb_cc, pri, netio, sb->sb_timeo));
+}
+
+/*
+ * Acquire the lock on a sockbuf that may currently be held: sleep
+ * (flagging SB_WANT) for as long as SB_LOCK is set, then take it.
+ * Returns 0 with the lock held, or the non-zero tsleep() result if
+ * a sleep ended with an error.
+ */
+sb_lock(sb)
+	register struct sockbuf *sb;
+{
+	int error;
+	int pri;
+
+	while (sb->sb_flags & SB_LOCK) {
+		sb->sb_flags |= SB_WANT;
+		pri = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH;
+		error = tsleep((caddr_t)&sb->sb_flags, pri, netio, 0);
+		if (error)
+			return (error);
+	}
+	sb->sb_flags |= SB_LOCK;
+	return (0);
+}
+
+/*
+ * Wakeup processes waiting on a socket buffer.
+ * Do asynchronous notification via SIGIO
+ * if the socket has the SS_ASYNC flag set.
+ */
+sowakeup(so, sb)
+	register struct socket *so;
+	register struct sockbuf *sb;
+{
+	struct proc *p;
+
+	/* selectors first, then anyone sleeping in sbwait() */
+	selwakeup(&sb->sb_sel);
+	sb->sb_flags &= ~SB_SEL;
+	if (sb->sb_flags & SB_WAIT) {
+		sb->sb_flags &= ~SB_WAIT;
+		wakeup((caddr_t)&sb->sb_cc);
+	}
+	if ((so->so_state & SS_ASYNC) == 0)
+		return;
+	/* negative so_pgid names a process group, positive a process */
+	if (so->so_pgid < 0) {
+		gsignal(-so->so_pgid, SIGIO);
+	} else if (so->so_pgid > 0) {
+		p = pfind(so->so_pgid);
+		if (p != 0)
+			psignal(p, SIGIO);
+	}
+}
+
+/*
+ * Socket buffer (struct sockbuf) utility routines.
+ *
+ * Each socket contains two socket buffers: one for sending data and
+ * one for receiving data.  Each buffer contains a queue of mbufs,
+ * information about the number of mbufs and amount of data in the
+ * queue, and other fields allowing select() statements and notification
+ * on data availability to be implemented.
+ *
+ * Data stored in a socket buffer is maintained as a list of records.
+ * Each record is a list of mbufs chained together with the m_next
+ * field.  Records are chained together with the m_nextpkt field. The upper
+ * level routine soreceive() expects the following conventions to be
+ * observed when placing information in the receive buffer:
+ *
+ * 1. If the protocol requires each message be preceded by the sender's
+ *    name, then a record containing that name must be present before
+ *    any associated data (mbuf's must be of type MT_SONAME).
+ * 2. If the protocol supports the exchange of ``access rights'' (really
+ *    just additional data associated with the message), and there are
+ *    ``rights'' to be received, then a record containing this data
+ *    should be present (mbuf's must be of type MT_RIGHTS).
+ * 3. If a name or rights record exists, then it must be followed by
+ *    a data record, perhaps of zero length.
+ *
+ * Before using a new socket structure it is first necessary to reserve
+ * buffer space to the socket, by calling sbreserve().
This should commit
+ * some of the available buffer space in the system buffer pool for the
+ * socket (currently, it does nothing but enforce limits).  The space
+ * should be released by calling sbrelease() when the socket is destroyed.
+ */
+
+/*
+ * Reserve send and receive buffer space for a socket and give the
+ * low-water marks their defaults (1 for receive, MCLBYTES for send,
+ * the latter clamped to the send high-water mark).
+ * Returns 0 on success or ENOBUFS if either reservation is refused;
+ * a send reservation already made is released again in that case.
+ */
+soreserve(so, sndcc, rcvcc)
+	register struct socket *so;
+	u_long sndcc, rcvcc;
+{
+
+	if (sbreserve(&so->so_snd, sndcc) == 0)
+		return (ENOBUFS);
+	if (sbreserve(&so->so_rcv, rcvcc) == 0) {
+		sbrelease(&so->so_snd);
+		return (ENOBUFS);
+	}
+	if (so->so_rcv.sb_lowat == 0)
+		so->so_rcv.sb_lowat = 1;
+	if (so->so_snd.sb_lowat == 0)
+		so->so_snd.sb_lowat = MCLBYTES;
+	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
+		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
+	return (0);
+}
+
+/*
+ * Allot mbufs to a sockbuf.
+ * Attempt to scale mbmax so that mbcnt doesn't become limiting
+ * if buffering efficiency is near the normal case.
+ * Returns 1 on success, 0 if cc is above the limit derived from sb_max.
+ */
+sbreserve(sb, cc)
+	struct sockbuf *sb;
+	u_long cc;
+{
+
+	if (cc > sb_max * MCLBYTES / (MSIZE + MCLBYTES))
+		return (0);
+	sb->sb_hiwat = cc;
+	sb->sb_mbmax = min(cc * 2, sb_max);
+	if (sb->sb_lowat > sb->sb_hiwat) {
+		/* keep the low-water mark at or below the new limit */
+		sb->sb_lowat = sb->sb_hiwat;
+	}
+	return (1);
+}
+
+/*
+ * Free mbufs held by a socket, and reserved mbuf space.
+ */
+sbrelease(sb)
+	struct sockbuf *sb;
+{
+
+	sbflush(sb);
+	sb->sb_mbmax = 0;
+	sb->sb_hiwat = 0;
+}
+
+/*
+ * Routines to add and remove
+ * data from an mbuf queue.
+ *
+ * The routines sbappend() or sbappendrecord() are normally called to
+ * append new mbufs to a socket buffer, after checking that adequate
+ * space is available, comparing the function sbspace() with the amount
+ * of data to be added.  sbappendrecord() differs from sbappend() in
+ * that data supplied is treated as the beginning of a new record.
+ * To place a sender's address, optional access rights, and data in a
+ * socket receive buffer, sbappendaddr() should be used.  To place
+ * access rights and data in a socket receive buffer, sbappendcontrol()
+ * should be used.
In either case, the new data begins a new record.
+ * Note that unlike sbappend() and sbappendrecord(), these routines check
+ * for the caller that there will be enough space to store the data.
+ * Each fails if there is not enough space, or if it cannot find mbufs
+ * to store additional information in.
+ *
+ * Reliable protocols may use the socket send buffer to hold data
+ * awaiting acknowledgement.  Data is normally copied from a socket
+ * send buffer in a protocol with m_copy for output to a peer,
+ * and then removing the data from the socket buffer with sbdrop()
+ * or sbdroprecord() when the data is acknowledged by the peer.
+ */
+
+/*
+ * Append mbuf chain m to the last record in the
+ * socket buffer sb.  The additional space associated with
+ * the mbuf chain is recorded in sb.  Empty mbufs are
+ * discarded and mbufs are compacted where possible.
+ * If the last record already carries M_EOR, the chain is
+ * started as a new record instead.
+ */
+sbappend(sb, m)
+	struct sockbuf *sb;
+	struct mbuf *m;
+{
+	register struct mbuf *n;
+
+	if (m == 0)
+		return;
+	n = sb->sb_mb;
+	if (n != NULL) {
+		/* find the last record ... */
+		while (n->m_nextpkt != NULL)
+			n = n->m_nextpkt;
+		/* ... and its last mbuf, watching for end-of-record */
+		for (;;) {
+			if (n->m_flags & M_EOR) {
+				sbappendrecord(sb, m); /* XXXXXX!!!! */
+				return;
+			}
+			if (n->m_next == NULL)
+				break;
+			n = n->m_next;
+		}
+	}
+	sbcompress(sb, m, n);
+}
+
+#ifdef SOCKBUF_DEBUG
+/*
+ * Debugging aid: recompute the byte and mbuf-space totals from the
+ * mbufs chained through m_next from sb_mb and panic if they disagree
+ * with sb_cc / sb_mbcnt, or if any of those mbufs has m_nextpkt set.
+ */
+sbcheck(sb)
+	register struct sockbuf *sb;
+{
+	register struct mbuf *m;
+	register int len = 0, mbcnt = 0;
+
+	m = sb->sb_mb;
+	while (m != NULL) {
+		len += m->m_len;
+		mbcnt += MSIZE;
+		if (m->m_flags & M_EXT)
+			mbcnt += m->m_ext.ext_size;
+		if (m->m_nextpkt)
+			panic("sbcheck nextpkt");
+		m = m->m_next;
+	}
+	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
+		printf("cc %d != %d || mbcnt %d != %d\n", len, sb->sb_cc,
+		    mbcnt, sb->sb_mbcnt);
+		panic("sbcheck");
+	}
+}
+#endif
+
+/*
+ * As above, except the mbuf chain
+ * begins a new record.
+ */
+sbappendrecord(sb, m0)
+	register struct sockbuf *sb;
+	register struct mbuf *m0;
+{
+	register struct mbuf *m;
+
+	if (m0 == 0)
+		return;
+	/* locate the last record already queued, if any */
+	m = sb->sb_mb;
+	if (m != NULL) {
+		while (m->m_nextpkt != NULL)
+			m = m->m_nextpkt;
+	}
+	/*
+	 * Put the first mbuf on the queue.
+	 * Note this permits zero length records.
+	 */
+	sballoc(sb, m0);
+	if (m != NULL)
+		m->m_nextpkt = m0;
+	else
+		sb->sb_mb = m0;
+	/* detach the rest of the chain; sbcompress() re-attaches it */
+	m = m0->m_next;
+	m0->m_next = 0;
+	if (m != NULL && (m0->m_flags & M_EOR)) {
+		/* carry the end-of-record mark over to the remainder */
+		m0->m_flags &= ~M_EOR;
+		m->m_flags |= M_EOR;
+	}
+	sbcompress(sb, m, m0);
+}
+
+/*
+ * As above except that OOB data
+ * is inserted at the beginning of the sockbuf,
+ * but after any other OOB data.
+ */
+sbinsertoob(sb, m0)
+	register struct sockbuf *sb;
+	register struct mbuf *m0;
+{
+	register struct mbuf *m;
+	register struct mbuf **mp;
+
+	if (m0 == 0)
+		return;
+	/*
+	 * Skip leading records that are OOB data, looking past any
+	 * MT_CONTROL mbufs at the front of a record to decide.
+	 */
+	for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
+	    again:
+		switch (m->m_type) {
+
+		case MT_OOBDATA:
+			continue;		/* WANT next train */
+
+		case MT_CONTROL:
+			if ((m = m->m_next) != NULL)
+				goto again;	/* inspect THIS train further */
+		}
+		break;
+	}
+	/*
+	 * Put the first mbuf on the queue.
+	 * Note this permits zero length records.
+	 */
+	sballoc(sb, m0);
+	m0->m_nextpkt = *mp;
+	*mp = m0;
+	m = m0->m_next;
+	m0->m_next = 0;
+	if (m != NULL && (m0->m_flags & M_EOR)) {
+		m0->m_flags &= ~M_EOR;
+		m->m_flags |= M_EOR;
+	}
+	sbcompress(sb, m, m0);
+}
+
+/*
+ * Append address and data, and optionally, control (ancillary) data
+ * to the receive queue of a socket.  If present,
+ * m0 must include a packet header with total length.
+ * Returns 0 if no space in sockbuf or insufficient mbufs.
+ */
+sbappendaddr(sb, asa, m0, control)
+	register struct sockbuf *sb;
+	struct sockaddr *asa;
+	struct mbuf *m0, *control;
+{
+	register struct mbuf *m, *n;
+	int space = asa->sa_len;
+
+	if (m0 != NULL) {
+		if ((m0->m_flags & M_PKTHDR) == 0)
+			panic("sbappendaddr");
+		space += m0->m_pkthdr.len;
+	}
+	/* add up the control chain; n ends on its last mbuf (or null) */
+	n = control;
+	if (n != NULL) {
+		space += n->m_len;
+		while (n->m_next != NULL) {
+			n = n->m_next;
+			space += n->m_len;
+		}
+	}
+	if (space > sbspace(sb) || asa->sa_len > MLEN)
+		return (0);
+	MGET(m, M_DONTWAIT, MT_SONAME);
+	if (m == 0)
+		return (0);
+	m->m_len = asa->sa_len;
+	bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
+	/* build the record: address, then control, then data */
+	if (n != NULL)
+		n->m_next = m0;		/* concatenate data to control */
+	else
+		control = m0;
+	m->m_next = control;
+	for (n = m; n != NULL; n = n->m_next)
+		sballoc(sb, n);
+	/* hang the new record after the last one queued */
+	n = sb->sb_mb;
+	if (n != NULL) {
+		while (n->m_nextpkt != NULL)
+			n = n->m_nextpkt;
+		n->m_nextpkt = m;
+	} else
+		sb->sb_mb = m;
+	return (1);
+}
+
+/*
+ * Append control data followed by the data chain m0 (which may be
+ * null) as a new record.  A null control chain is a panic.
+ * Returns 1 on success, 0 if the record does not fit in the sockbuf.
+ */
+sbappendcontrol(sb, m0, control)
+	struct sockbuf *sb;
+	struct mbuf *control, *m0;
+{
+	register struct mbuf *m, *n;
+	int space = 0;
+
+	if (control == 0)
+		panic("sbappendcontrol");
+	/* total the control chain, leaving n on its last mbuf */
+	n = control;
+	space += n->m_len;
+	while (n->m_next != NULL) {
+		n = n->m_next;
+		space += n->m_len;
+	}
+	for (m = m0; m != NULL; m = m->m_next)
+		space += m->m_len;
+	if (space > sbspace(sb))
+		return (0);
+	n->m_next = m0;			/* concatenate data to control */
+	for (m = control; m != NULL; m = m->m_next)
+		sballoc(sb, m);
+	n = sb->sb_mb;
+	if (n != NULL) {
+		while (n->m_nextpkt != NULL)
+			n = n->m_nextpkt;
+		n->m_nextpkt = control;
+	} else
+		sb->sb_mb = control;
+	return (1);
+}
+
+/*
+ * Compress mbuf chain m into the socket
+ * buffer sb following mbuf n.  If n
+ * is null, the buffer is presumed empty.
+ */
+sbcompress(sb, m, n)
+	register struct sockbuf *sb;
+	register struct mbuf *m, *n;
+{
+	register int eor = 0;
+	register struct mbuf *o;
+
+	while (m != NULL) {
+		eor |= m->m_flags & M_EOR;
+		/*
+		 * Drop an empty mbuf, unless an end-of-record mark has
+		 * been seen and no neighbour of the same type exists.
+		 */
+		if (m->m_len == 0 &&
+		    (eor == 0 ||
+		     (((o = m->m_next) || (o = n)) &&
+		      o->m_type == m->m_type))) {
+			m = m_free(m);
+			continue;
+		}
+		/* copy into the spare room of n when the data fits */
+		if (n != NULL && (n->m_flags & (M_EXT | M_EOR)) == 0 &&
+		    (n->m_data + n->m_len + m->m_len) < &n->m_dat[MLEN] &&
+		    n->m_type == m->m_type) {
+			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
+			    (unsigned)m->m_len);
+			n->m_len += m->m_len;
+			sb->sb_cc += m->m_len;
+			m = m_free(m);
+			continue;
+		}
+		/* otherwise link m itself in after n */
+		if (n != NULL)
+			n->m_next = m;
+		else
+			sb->sb_mb = m;
+		sballoc(sb, m);
+		n = m;
+		m->m_flags &= ~M_EOR;
+		m = m->m_next;
+		n->m_next = 0;
+	}
+	if (eor == 0)
+		return;
+	/* re-apply the end-of-record mark to the final mbuf */
+	if (n != NULL)
+		n->m_flags |= eor;
+	else
+		printf("semi-panic: sbcompress\n");
+}
+
+/*
+ * Free all mbufs in a sockbuf.
+ * Check that all resources are reclaimed.
+ */
+sbflush(sb)
+	register struct sockbuf *sb;
+{
+
+	if (sb->sb_flags & SB_LOCK)
+		panic("sbflush");
+	while (sb->sb_mbcnt != 0)
+		sbdrop(sb, (int)sb->sb_cc);
+	if (sb->sb_cc != 0 || sb->sb_mb != NULL)
+		panic("sbflush 2");
+}
+
+/*
+ * Drop data from (the front of) a sockbuf.
+ * Whole mbufs are freed, moving on to later records as needed; a
+ * partly consumed mbuf is trimmed in place.  Empty mbufs left at the
+ * front afterwards are freed as well.  Asking for more than is
+ * queued is a panic.
+ */
+sbdrop(sb, len)
+	register struct sockbuf *sb;
+	register int len;
+{
+	register struct mbuf *m, *mn;
+	struct mbuf *next;
+
+	m = sb->sb_mb;
+	next = (m != NULL) ? m->m_nextpkt : 0;
+	while (len > 0) {
+		if (m == NULL) {
+			/* this record is used up: step to the next one */
+			if (next == NULL)
+				panic("sbdrop");
+			m = next;
+			next = m->m_nextpkt;
+			continue;
+		}
+		if (m->m_len > len) {
+			m->m_len -= len;
+			m->m_data += len;
+			sb->sb_cc -= len;
+			break;
+		}
+		len -= m->m_len;
+		sbfree(sb, m);
+		MFREE(m, mn);
+		m = mn;
+	}
+	while (m != NULL && m->m_len == 0) {
+		sbfree(sb, m);
+		MFREE(m, mn);
+		m = mn;
+	}
+	if (m != NULL) {
+		sb->sb_mb = m;
+		m->m_nextpkt = next;
+	} else
+		sb->sb_mb = next;
+}
+
+/*
+ * Drop a record off the front of a sockbuf
+ * and move the next record to the front.
+ */
+sbdroprecord(sb)
+	register struct sockbuf *sb;
+{
+	register struct mbuf *m, *mn;
+
+	m = sb->sb_mb;
+	if (m == NULL)
+		return;
+	/* unhook the record first, then free each of its mbufs */
+	sb->sb_mb = m->m_nextpkt;
+	while (m != NULL) {
+		sbfree(sb, m);
+		MFREE(m, mn);
+		m = mn;
+	}
+}
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
new file mode 100644
index 00000000000..89b7ffdf196
--- /dev/null
+++ b/sys/kern/uipc_syscalls.c
@@ -0,0 +1,1217 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1990, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef KTRACE +#include +#endif + +/* + * System call interface to the socket abstraction. + */ +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#define COMPAT_OLDSOCK +#endif + +extern struct fileops socketops; + +struct socket_args { + int domain; + int type; + int protocol; +}; +socket(p, uap, retval) + struct proc *p; + register struct socket_args *uap; + int *retval; +{ + struct filedesc *fdp = p->p_fd; + struct socket *so; + struct file *fp; + int fd, error; + + if (error = falloc(p, &fp, &fd)) + return (error); + fp->f_flag = FREAD|FWRITE; + fp->f_type = DTYPE_SOCKET; + fp->f_ops = &socketops; + if (error = socreate(uap->domain, &so, uap->type, uap->protocol)) { + fdp->fd_ofiles[fd] = 0; + ffree(fp); + } else { + fp->f_data = (caddr_t)so; + *retval = fd; + } + return (error); +} + +struct bind_args { + int s; + caddr_t name; + int namelen; +}; +/* ARGSUSED */ +bind(p, uap, retval) + struct proc *p; + register struct bind_args *uap; + int *retval; +{ + struct file *fp; + struct mbuf *nam; + int error; + + if (error = getsock(p->p_fd, uap->s, &fp)) + return (error); + if (error = sockargs(&nam, uap->name, uap->namelen, MT_SONAME)) + return (error); + error = sobind((struct socket *)fp->f_data, nam); + m_freem(nam); + return (error); +} + +struct listen_args { + int s; + int 
backlog; +}; +/* ARGSUSED */ +listen(p, uap, retval) + struct proc *p; + register struct listen_args *uap; + int *retval; +{ + struct file *fp; + int error; + + if (error = getsock(p->p_fd, uap->s, &fp)) + return (error); + return (solisten((struct socket *)fp->f_data, uap->backlog)); +} + +struct accept_args { + int s; + caddr_t name; + int *anamelen; +#ifdef COMPAT_OLDSOCK + int compat_43; /* pseudo */ +#endif +}; + +#ifdef COMPAT_OLDSOCK +accept(p, uap, retval) + struct proc *p; + struct accept_args *uap; + int *retval; +{ + + uap->compat_43 = 0; + return (accept1(p, uap, retval)); +} + +oaccept(p, uap, retval) + struct proc *p; + struct accept_args *uap; + int *retval; +{ + + uap->compat_43 = 1; + return (accept1(p, uap, retval)); +} +#else /* COMPAT_OLDSOCK */ + +#define accept1 accept +#endif + +accept1(p, uap, retval) + struct proc *p; + register struct accept_args *uap; + int *retval; +{ + struct file *fp; + struct mbuf *nam; + int namelen, error, s; + register struct socket *so; + + if (uap->name && (error = copyin((caddr_t)uap->anamelen, + (caddr_t)&namelen, sizeof (namelen)))) + return (error); + if (error = getsock(p->p_fd, uap->s, &fp)) + return (error); + s = splnet(); + so = (struct socket *)fp->f_data; + if ((so->so_options & SO_ACCEPTCONN) == 0) { + splx(s); + return (EINVAL); + } + if ((so->so_state & SS_NBIO) && so->so_qlen == 0) { + splx(s); + return (EWOULDBLOCK); + } + while (so->so_qlen == 0 && so->so_error == 0) { + if (so->so_state & SS_CANTRCVMORE) { + so->so_error = ECONNABORTED; + break; + } + if (error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, + netcon, 0)) { + splx(s); + return (error); + } + } + if (so->so_error) { + error = so->so_error; + so->so_error = 0; + splx(s); + return (error); + } + if (error = falloc(p, &fp, retval)) { + splx(s); + return (error); + } + { struct socket *aso = so->so_q; + if (soqremque(aso, 1) == 0) + panic("accept"); + so = aso; + } + fp->f_type = DTYPE_SOCKET; + fp->f_flag = FREAD|FWRITE; + 
fp->f_ops = &socketops; + fp->f_data = (caddr_t)so; + nam = m_get(M_WAIT, MT_SONAME); + (void) soaccept(so, nam); + if (uap->name) { +#ifdef COMPAT_OLDSOCK + if (uap->compat_43) + mtod(nam, struct osockaddr *)->sa_family = + mtod(nam, struct sockaddr *)->sa_family; +#endif + if (namelen > nam->m_len) + namelen = nam->m_len; + /* SHOULD COPY OUT A CHAIN HERE */ + if ((error = copyout(mtod(nam, caddr_t), (caddr_t)uap->name, + (u_int)namelen)) == 0) + error = copyout((caddr_t)&namelen, + (caddr_t)uap->anamelen, sizeof (*uap->anamelen)); + } + m_freem(nam); + splx(s); + return (error); +} + +struct connect_args { + int s; + caddr_t name; + int namelen; +}; +/* ARGSUSED */ +connect(p, uap, retval) + struct proc *p; + register struct connect_args *uap; + int *retval; +{ + struct file *fp; + register struct socket *so; + struct mbuf *nam; + int error, s; + + if (error = getsock(p->p_fd, uap->s, &fp)) + return (error); + so = (struct socket *)fp->f_data; + if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) + return (EALREADY); + if (error = sockargs(&nam, uap->name, uap->namelen, MT_SONAME)) + return (error); + error = soconnect(so, nam); + if (error) + goto bad; + if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { + m_freem(nam); + return (EINPROGRESS); + } + s = splnet(); + while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) + if (error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, + netcon, 0)) + break; + if (error == 0) { + error = so->so_error; + so->so_error = 0; + } + splx(s); +bad: + so->so_state &= ~SS_ISCONNECTING; + m_freem(nam); + if (error == ERESTART) + error = EINTR; + return (error); +} + +struct socketpair_args { + int domain; + int type; + int protocol; + int *rsv; +}; +socketpair(p, uap, retval) + struct proc *p; + register struct socketpair_args *uap; + int retval[]; +{ + register struct filedesc *fdp = p->p_fd; + struct file *fp1, *fp2; + struct socket *so1, *so2; + int fd, error, sv[2]; + + if (error = 
socreate(uap->domain, &so1, uap->type, uap->protocol)) + return (error); + if (error = socreate(uap->domain, &so2, uap->type, uap->protocol)) + goto free1; + if (error = falloc(p, &fp1, &fd)) + goto free2; + sv[0] = fd; + fp1->f_flag = FREAD|FWRITE; + fp1->f_type = DTYPE_SOCKET; + fp1->f_ops = &socketops; + fp1->f_data = (caddr_t)so1; + if (error = falloc(p, &fp2, &fd)) + goto free3; + fp2->f_flag = FREAD|FWRITE; + fp2->f_type = DTYPE_SOCKET; + fp2->f_ops = &socketops; + fp2->f_data = (caddr_t)so2; + sv[1] = fd; + if (error = soconnect2(so1, so2)) + goto free4; + if (uap->type == SOCK_DGRAM) { + /* + * Datagram socket connection is asymmetric. + */ + if (error = soconnect2(so2, so1)) + goto free4; + } + error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int)); + retval[0] = sv[0]; /* XXX ??? */ + retval[1] = sv[1]; /* XXX ??? */ + return (error); +free4: + ffree(fp2); + fdp->fd_ofiles[sv[1]] = 0; +free3: + ffree(fp1); + fdp->fd_ofiles[sv[0]] = 0; +free2: + (void)soclose(so2); +free1: + (void)soclose(so1); + return (error); +} + +struct sendto_args { + int s; + caddr_t buf; + size_t len; + int flags; + caddr_t to; + int tolen; +}; +sendto(p, uap, retval) + struct proc *p; + register struct sendto_args *uap; + int *retval; +{ + struct msghdr msg; + struct iovec aiov; + + msg.msg_name = uap->to; + msg.msg_namelen = uap->tolen; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + msg.msg_control = 0; +#ifdef COMPAT_OLDSOCK + msg.msg_flags = 0; +#endif + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + return (sendit(p, uap->s, &msg, uap->flags, retval)); +} + +#ifdef COMPAT_OLDSOCK +struct osend_args { + int s; + caddr_t buf; + int len; + int flags; +}; +osend(p, uap, retval) + struct proc *p; + register struct osend_args *uap; + int *retval; +{ + struct msghdr msg; + struct iovec aiov; + + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + msg.msg_control = 0; + 
msg.msg_flags = 0; + return (sendit(p, uap->s, &msg, uap->flags, retval)); +} + +#define MSG_COMPAT 0x8000 +struct osendmsg_args { + int s; + caddr_t msg; + int flags; +}; +osendmsg(p, uap, retval) + struct proc *p; + register struct osendmsg_args *uap; + int *retval; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + if (error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr))) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; + if (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))) + goto done; + msg.msg_flags = MSG_COMPAT; + msg.msg_iov = iov; + error = sendit(p, uap->s, &msg, uap->flags, retval); +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} +#endif + +struct sendmsg_args { + int s; + caddr_t msg; + int flags; +}; +sendmsg(p, uap, retval) + struct proc *p; + register struct sendmsg_args *uap; + int *retval; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + if (error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg))) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; + if (msg.msg_iovlen && + (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))))) + goto done; + msg.msg_iov = iov; +#ifdef COMPAT_OLDSOCK + msg.msg_flags = 0; +#endif + error = sendit(p, uap->s, &msg, uap->flags, retval); +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} + +sendit(p, s, mp, flags, retsize) + register struct proc *p; + int s; + register struct msghdr *mp; + int flags, *retsize; +{ 
+ struct file *fp; + struct uio auio; + register struct iovec *iov; + register int i; + struct mbuf *to, *control; + int len, error; +#ifdef KTRACE + struct iovec *ktriov = NULL; +#endif + + if (error = getsock(p->p_fd, s, &fp)) + return (error); + auio.uio_iov = mp->msg_iov; + auio.uio_iovcnt = mp->msg_iovlen; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_procp = p; + auio.uio_offset = 0; /* XXX */ + auio.uio_resid = 0; + iov = mp->msg_iov; + for (i = 0; i < mp->msg_iovlen; i++, iov++) { + if (iov->iov_len < 0) + return (EINVAL); + if ((auio.uio_resid += iov->iov_len) < 0) + return (EINVAL); + } + if (mp->msg_name) { + if (error = sockargs(&to, mp->msg_name, mp->msg_namelen, + MT_SONAME)) + return (error); + } else + to = 0; + if (mp->msg_control) { + if (mp->msg_controllen < sizeof(struct cmsghdr) +#ifdef COMPAT_OLDSOCK + && mp->msg_flags != MSG_COMPAT +#endif + ) { + error = EINVAL; + goto bad; + } + if (error = sockargs(&control, mp->msg_control, + mp->msg_controllen, MT_CONTROL)) + goto bad; +#ifdef COMPAT_OLDSOCK + if (mp->msg_flags == MSG_COMPAT) { + register struct cmsghdr *cm; + + M_PREPEND(control, sizeof(*cm), M_WAIT); + if (control == 0) { + error = ENOBUFS; + goto bad; + } else { + cm = mtod(control, struct cmsghdr *); + cm->cmsg_len = control->m_len; + cm->cmsg_level = SOL_SOCKET; + cm->cmsg_type = SCM_RIGHTS; + } + } +#endif + } else + control = 0; +#ifdef KTRACE + if (KTRPOINT(p, KTR_GENIO)) { + int iovlen = auio.uio_iovcnt * sizeof (struct iovec); + + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); + } +#endif + len = auio.uio_resid; + if (error = sosend((struct socket *)fp->f_data, to, &auio, + (struct mbuf *)0, control, flags)) { + if (auio.uio_resid != len && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + if (error == EPIPE) + psignal(p, SIGPIPE); + } + if (error == 0) + *retsize = len - auio.uio_resid; +#ifdef KTRACE + 
if (ktriov != NULL) { + if (error == 0) + ktrgenio(p->p_tracep, s, UIO_WRITE, + ktriov, *retsize, error); + FREE(ktriov, M_TEMP); + } +#endif +bad: + if (to) + m_freem(to); + return (error); +} + +struct recvfrom_args { + int s; + caddr_t buf; + size_t len; + int flags; + caddr_t from; + int *fromlenaddr; +}; + +#ifdef COMPAT_OLDSOCK +orecvfrom(p, uap, retval) + struct proc *p; + struct recvfrom_args *uap; + int *retval; +{ + + uap->flags |= MSG_COMPAT; + return (recvfrom(p, uap, retval)); +} +#endif + +recvfrom(p, uap, retval) + struct proc *p; + register struct recvfrom_args *uap; + int *retval; +{ + struct msghdr msg; + struct iovec aiov; + int error; + + if (uap->fromlenaddr) { + if (error = copyin((caddr_t)uap->fromlenaddr, + (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen))) + return (error); + } else + msg.msg_namelen = 0; + msg.msg_name = uap->from; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + msg.msg_control = 0; + msg.msg_flags = uap->flags; + return (recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr, retval)); +} + +#ifdef COMPAT_OLDSOCK +struct orecv_args { + int s; + caddr_t buf; + int len; + int flags; +}; +orecv(p, uap, retval) + struct proc *p; + register struct orecv_args *uap; + int *retval; +{ + struct msghdr msg; + struct iovec aiov; + + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + msg.msg_control = 0; + msg.msg_flags = uap->flags; + return (recvit(p, uap->s, &msg, (caddr_t)0, retval)); +} + +/* + * Old recvmsg. This code takes advantage of the fact that the old msghdr + * overlays the new one, missing only the flags, and with the (old) access + * rights where the control fields are now. 
+ */ +struct orecvmsg_args { + int s; + struct omsghdr *msg; + int flags; +}; +orecvmsg(p, uap, retval) + struct proc *p; + register struct orecvmsg_args *uap; + int *retval; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + if (error = copyin((caddr_t)uap->msg, (caddr_t)&msg, + sizeof (struct omsghdr))) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; + msg.msg_flags = uap->flags | MSG_COMPAT; + if (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))) + goto done; + msg.msg_iov = iov; + error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen, retval); + + if (msg.msg_controllen && error == 0) + error = copyout((caddr_t)&msg.msg_controllen, + (caddr_t)&uap->msg->msg_accrightslen, sizeof (int)); +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} +#endif + +struct recvmsg_args { + int s; + struct msghdr *msg; + int flags; +}; +recvmsg(p, uap, retval) + struct proc *p; + register struct recvmsg_args *uap; + int *retval; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *uiov, *iov; + register int error; + + if (error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg))) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; +#ifdef COMPAT_OLDSOCK + msg.msg_flags = uap->flags &~ MSG_COMPAT; +#else + msg.msg_flags = uap->flags; +#endif + uiov = msg.msg_iov; + msg.msg_iov = iov; + if (error = copyin((caddr_t)uiov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))) + goto done; + if ((error = recvit(p, uap->s, &msg, (caddr_t)0, retval)) == 0) { + 
msg.msg_iov = uiov; + error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg)); + } +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} + +recvit(p, s, mp, namelenp, retsize) + register struct proc *p; + int s; + register struct msghdr *mp; + caddr_t namelenp; + int *retsize; +{ + struct file *fp; + struct uio auio; + register struct iovec *iov; + register int i; + int len, error; + struct mbuf *from = 0, *control = 0; +#ifdef KTRACE + struct iovec *ktriov = NULL; +#endif + + if (error = getsock(p->p_fd, s, &fp)) + return (error); + auio.uio_iov = mp->msg_iov; + auio.uio_iovcnt = mp->msg_iovlen; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_rw = UIO_READ; + auio.uio_procp = p; + auio.uio_offset = 0; /* XXX */ + auio.uio_resid = 0; + iov = mp->msg_iov; + for (i = 0; i < mp->msg_iovlen; i++, iov++) { + if (iov->iov_len < 0) + return (EINVAL); + if ((auio.uio_resid += iov->iov_len) < 0) + return (EINVAL); + } +#ifdef KTRACE + if (KTRPOINT(p, KTR_GENIO)) { + int iovlen = auio.uio_iovcnt * sizeof (struct iovec); + + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); + } +#endif + len = auio.uio_resid; + if (error = soreceive((struct socket *)fp->f_data, &from, &auio, + (struct mbuf **)0, mp->msg_control ? 
&control : (struct mbuf **)0, + &mp->msg_flags)) { + if (auio.uio_resid != len && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + } +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) + ktrgenio(p->p_tracep, s, UIO_READ, + ktriov, len - auio.uio_resid, error); + FREE(ktriov, M_TEMP); + } +#endif + if (error) + goto out; + *retsize = len - auio.uio_resid; + if (mp->msg_name) { + len = mp->msg_namelen; + if (len <= 0 || from == 0) + len = 0; + else { +#ifdef COMPAT_OLDSOCK + if (mp->msg_flags & MSG_COMPAT) + mtod(from, struct osockaddr *)->sa_family = + mtod(from, struct sockaddr *)->sa_family; +#endif + if (len > from->m_len) + len = from->m_len; + /* else if len < from->m_len ??? */ + if (error = copyout(mtod(from, caddr_t), + (caddr_t)mp->msg_name, (unsigned)len)) + goto out; + } + mp->msg_namelen = len; + if (namelenp && + (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) { +#ifdef COMPAT_OLDSOCK + if (mp->msg_flags & MSG_COMPAT) + error = 0; /* old recvfrom didn't check */ + else +#endif + goto out; + } + } + if (mp->msg_control) { +#ifdef COMPAT_OLDSOCK + /* + * We assume that old recvmsg calls won't receive access + * rights and other control info, esp. as control info + * is always optional and those options didn't exist in 4.3. + * If we receive rights, trim the cmsghdr; anything else + * is tossed. 
+ */ + if (control && mp->msg_flags & MSG_COMPAT) { + if (mtod(control, struct cmsghdr *)->cmsg_level != + SOL_SOCKET || + mtod(control, struct cmsghdr *)->cmsg_type != + SCM_RIGHTS) { + mp->msg_controllen = 0; + goto out; + } + control->m_len -= sizeof (struct cmsghdr); + control->m_data += sizeof (struct cmsghdr); + } +#endif + len = mp->msg_controllen; + if (len <= 0 || control == 0) + len = 0; + else { + if (len >= control->m_len) + len = control->m_len; + else + mp->msg_flags |= MSG_CTRUNC; + error = copyout((caddr_t)mtod(control, caddr_t), + (caddr_t)mp->msg_control, (unsigned)len); + } + mp->msg_controllen = len; + } +out: + if (from) + m_freem(from); + if (control) + m_freem(control); + return (error); +} + +struct shutdown_args { + int s; + int how; +}; +/* ARGSUSED */ +shutdown(p, uap, retval) + struct proc *p; + register struct shutdown_args *uap; + int *retval; +{ + struct file *fp; + int error; + + if (error = getsock(p->p_fd, uap->s, &fp)) + return (error); + return (soshutdown((struct socket *)fp->f_data, uap->how)); +} + +struct setsockopt_args { + int s; + int level; + int name; + caddr_t val; + int valsize; +}; +/* ARGSUSED */ +setsockopt(p, uap, retval) + struct proc *p; + register struct setsockopt_args *uap; + int *retval; +{ + struct file *fp; + struct mbuf *m = NULL; + int error; + + if (error = getsock(p->p_fd, uap->s, &fp)) + return (error); + if (uap->valsize > MLEN) + return (EINVAL); + if (uap->val) { + m = m_get(M_WAIT, MT_SOOPTS); + if (m == NULL) + return (ENOBUFS); + if (error = copyin(uap->val, mtod(m, caddr_t), + (u_int)uap->valsize)) { + (void) m_free(m); + return (error); + } + m->m_len = uap->valsize; + } + return (sosetopt((struct socket *)fp->f_data, uap->level, + uap->name, m)); +} + +struct getsockopt_args { + int s; + int level; + int name; + caddr_t val; + int *avalsize; +}; +/* ARGSUSED */ +getsockopt(p, uap, retval) + struct proc *p; + register struct getsockopt_args *uap; + int *retval; +{ + struct file *fp; + struct 
mbuf *m = NULL; + int valsize, error; + + if (error = getsock(p->p_fd, uap->s, &fp)) + return (error); + if (uap->val) { + if (error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize, + sizeof (valsize))) + return (error); + } else + valsize = 0; + if ((error = sogetopt((struct socket *)fp->f_data, uap->level, + uap->name, &m)) == 0 && uap->val && valsize && m != NULL) { + if (valsize > m->m_len) + valsize = m->m_len; + error = copyout(mtod(m, caddr_t), uap->val, (u_int)valsize); + if (error == 0) + error = copyout((caddr_t)&valsize, + (caddr_t)uap->avalsize, sizeof (valsize)); + } + if (m != NULL) + (void) m_free(m); + return (error); +} + +struct pipe_args { + int dummy; +}; +/* ARGSUSED */ +pipe(p, uap, retval) + struct proc *p; + struct pipe_args *uap; + int retval[]; +{ + register struct filedesc *fdp = p->p_fd; + struct file *rf, *wf; + struct socket *rso, *wso; + int fd, error; + + if (error = socreate(AF_UNIX, &rso, SOCK_STREAM, 0)) + return (error); + if (error = socreate(AF_UNIX, &wso, SOCK_STREAM, 0)) + goto free1; + if (error = falloc(p, &rf, &fd)) + goto free2; + retval[0] = fd; + rf->f_flag = FREAD; + rf->f_type = DTYPE_SOCKET; + rf->f_ops = &socketops; + rf->f_data = (caddr_t)rso; + if (error = falloc(p, &wf, &fd)) + goto free3; + wf->f_flag = FWRITE; + wf->f_type = DTYPE_SOCKET; + wf->f_ops = &socketops; + wf->f_data = (caddr_t)wso; + retval[1] = fd; + if (error = unp_connect2(wso, rso)) + goto free4; + return (0); +free4: + ffree(wf); + fdp->fd_ofiles[retval[1]] = 0; +free3: + ffree(rf); + fdp->fd_ofiles[retval[0]] = 0; +free2: + (void)soclose(wso); +free1: + (void)soclose(rso); + return (error); +} + +/* + * Get socket name. 
+ */ +struct getsockname_args { + int fdes; + caddr_t asa; + int *alen; +#ifdef COMPAT_OLDSOCK + int compat_43; /* pseudo */ +#endif +}; +#ifdef COMPAT_OLDSOCK +getsockname(p, uap, retval) + struct proc *p; + struct getsockname_args *uap; + int *retval; +{ + + uap->compat_43 = 0; + return (getsockname1(p, uap, retval)); +} + +ogetsockname(p, uap, retval) + struct proc *p; + struct getsockname_args *uap; + int *retval; +{ + + uap->compat_43 = 1; + return (getsockname1(p, uap, retval)); +} +#else /* COMPAT_OLDSOCK */ + +#define getsockname1 getsockname +#endif + +/* ARGSUSED */ +getsockname1(p, uap, retval) + struct proc *p; + register struct getsockname_args *uap; + int *retval; +{ + struct file *fp; + register struct socket *so; + struct mbuf *m; + int len, error; + + if (error = getsock(p->p_fd, uap->fdes, &fp)) + return (error); + if (error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len))) + return (error); + so = (struct socket *)fp->f_data; + m = m_getclr(M_WAIT, MT_SONAME); + if (m == NULL) + return (ENOBUFS); + if (error = (*so->so_proto->pr_usrreq)(so, PRU_SOCKADDR, 0, m, 0)) + goto bad; + if (len > m->m_len) + len = m->m_len; +#ifdef COMPAT_OLDSOCK + if (uap->compat_43) + mtod(m, struct osockaddr *)->sa_family = + mtod(m, struct sockaddr *)->sa_family; +#endif + error = copyout(mtod(m, caddr_t), (caddr_t)uap->asa, (u_int)len); + if (error == 0) + error = copyout((caddr_t)&len, (caddr_t)uap->alen, + sizeof (len)); +bad: + m_freem(m); + return (error); +} + +/* + * Get name of peer for connected socket. 
+ */ +struct getpeername_args { + int fdes; + caddr_t asa; + int *alen; +#ifdef COMPAT_OLDSOCK + int compat_43; /* pseudo */ +#endif +}; + +#ifdef COMPAT_OLDSOCK +getpeername(p, uap, retval) + struct proc *p; + struct getpeername_args *uap; + int *retval; +{ + + uap->compat_43 = 0; + return (getpeername1(p, uap, retval)); +} + +ogetpeername(p, uap, retval) + struct proc *p; + struct getpeername_args *uap; + int *retval; +{ + + uap->compat_43 = 1; + return (getpeername1(p, uap, retval)); +} +#else /* COMPAT_OLDSOCK */ + +#define getpeername1 getpeername +#endif + +/* ARGSUSED */ +getpeername1(p, uap, retval) + struct proc *p; + register struct getpeername_args *uap; + int *retval; +{ + struct file *fp; + register struct socket *so; + struct mbuf *m; + int len, error; + + if (error = getsock(p->p_fd, uap->fdes, &fp)) + return (error); + so = (struct socket *)fp->f_data; + if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) + return (ENOTCONN); + if (error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len))) + return (error); + m = m_getclr(M_WAIT, MT_SONAME); + if (m == NULL) + return (ENOBUFS); + if (error = (*so->so_proto->pr_usrreq)(so, PRU_PEERADDR, 0, m, 0)) + goto bad; + if (len > m->m_len) + len = m->m_len; +#ifdef COMPAT_OLDSOCK + if (uap->compat_43) + mtod(m, struct osockaddr *)->sa_family = + mtod(m, struct sockaddr *)->sa_family; +#endif + if (error = copyout(mtod(m, caddr_t), (caddr_t)uap->asa, (u_int)len)) + goto bad; + error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len)); +bad: + m_freem(m); + return (error); +} + +sockargs(mp, buf, buflen, type) + struct mbuf **mp; + caddr_t buf; + int buflen, type; +{ + register struct sockaddr *sa; + register struct mbuf *m; + int error; + + if ((u_int)buflen > MLEN) { +#ifdef COMPAT_OLDSOCK + if (type == MT_SONAME && (u_int)buflen <= 112) + buflen = MLEN; /* unix domain compat. 
hack */ + else +#endif + return (EINVAL); + } + m = m_get(M_WAIT, type); + if (m == NULL) + return (ENOBUFS); + m->m_len = buflen; + error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); + if (error) + (void) m_free(m); + else { + *mp = m; + if (type == MT_SONAME) { + sa = mtod(m, struct sockaddr *); + +#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN + if (sa->sa_family == 0 && sa->sa_len < AF_MAX) + sa->sa_family = sa->sa_len; +#endif + sa->sa_len = buflen; + } + } + return (error); +} + +getsock(fdp, fdes, fpp) + struct filedesc *fdp; + int fdes; + struct file **fpp; +{ + register struct file *fp; + + if ((unsigned)fdes >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fdes]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_SOCKET) + return (ENOTSOCK); + *fpp = fp; + return (0); +} diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c new file mode 100644 index 00000000000..94bf8f744c8 --- /dev/null +++ b/sys/kern/uipc_usrreq.c @@ -0,0 +1,823 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Unix communications domain. 
+ * + * TODO: + * SEQPACKET, RDM + * rethink name space problems + * need a proper out-of-band + */ +struct sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX }; +ino_t unp_ino; /* prototype for fake inode numbers */ + +/*ARGSUSED*/ +uipc_usrreq(so, req, m, nam, control) + struct socket *so; + int req; + struct mbuf *m, *nam, *control; +{ + struct unpcb *unp = sotounpcb(so); + register struct socket *so2; + register int error = 0; + struct proc *p = curproc; /* XXX */ + + if (req == PRU_CONTROL) + return (EOPNOTSUPP); + if (req != PRU_SEND && control && control->m_len) { + error = EOPNOTSUPP; + goto release; + } + if (unp == 0 && req != PRU_ATTACH) { + error = EINVAL; + goto release; + } + switch (req) { + + case PRU_ATTACH: + if (unp) { + error = EISCONN; + break; + } + error = unp_attach(so); + break; + + case PRU_DETACH: + unp_detach(unp); + break; + + case PRU_BIND: + error = unp_bind(unp, nam, p); + break; + + case PRU_LISTEN: + if (unp->unp_vnode == 0) + error = EINVAL; + break; + + case PRU_CONNECT: + error = unp_connect(so, nam, p); + break; + + case PRU_CONNECT2: + error = unp_connect2(so, (struct socket *)nam); + break; + + case PRU_DISCONNECT: + unp_disconnect(unp); + break; + + case PRU_ACCEPT: + /* + * Pass back name of connected socket, + * if it was bound and we are still connected + * (our peer may have closed already!). 
+ */ + if (unp->unp_conn && unp->unp_conn->unp_addr) { + nam->m_len = unp->unp_conn->unp_addr->m_len; + bcopy(mtod(unp->unp_conn->unp_addr, caddr_t), + mtod(nam, caddr_t), (unsigned)nam->m_len); + } else { + nam->m_len = sizeof(sun_noname); + *(mtod(nam, struct sockaddr *)) = sun_noname; + } + break; + + case PRU_SHUTDOWN: + socantsendmore(so); + unp_shutdown(unp); + break; + + case PRU_RCVD: + switch (so->so_type) { + + case SOCK_DGRAM: + panic("uipc 1"); + /*NOTREACHED*/ + + case SOCK_STREAM: +#define rcv (&so->so_rcv) +#define snd (&so2->so_snd) + if (unp->unp_conn == 0) + break; + so2 = unp->unp_conn->unp_socket; + /* + * Adjust backpressure on sender + * and wakeup any waiting to write. + */ + snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt; + unp->unp_mbcnt = rcv->sb_mbcnt; + snd->sb_hiwat += unp->unp_cc - rcv->sb_cc; + unp->unp_cc = rcv->sb_cc; + sowwakeup(so2); +#undef snd +#undef rcv + break; + + default: + panic("uipc 2"); + } + break; + + case PRU_SEND: + if (control && (error = unp_internalize(control, p))) + break; + switch (so->so_type) { + + case SOCK_DGRAM: { + struct sockaddr *from; + + if (nam) { + if (unp->unp_conn) { + error = EISCONN; + break; + } + error = unp_connect(so, nam, p); + if (error) + break; + } else { + if (unp->unp_conn == 0) { + error = ENOTCONN; + break; + } + } + so2 = unp->unp_conn->unp_socket; + if (unp->unp_addr) + from = mtod(unp->unp_addr, struct sockaddr *); + else + from = &sun_noname; + if (sbappendaddr(&so2->so_rcv, from, m, control)) { + sorwakeup(so2); + m = 0; + control = 0; + } else + error = ENOBUFS; + if (nam) + unp_disconnect(unp); + break; + } + + case SOCK_STREAM: +#define rcv (&so2->so_rcv) +#define snd (&so->so_snd) + if (so->so_state & SS_CANTSENDMORE) { + error = EPIPE; + break; + } + if (unp->unp_conn == 0) + panic("uipc 3"); + so2 = unp->unp_conn->unp_socket; + /* + * Send to paired receive port, and then reduce + * send buffer hiwater marks to maintain backpressure. + * Wake up readers. 
+ */ + if (control) { + if (sbappendcontrol(rcv, m, control)) + control = 0; + } else + sbappend(rcv, m); + snd->sb_mbmax -= + rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt; + unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt; + snd->sb_hiwat -= rcv->sb_cc - unp->unp_conn->unp_cc; + unp->unp_conn->unp_cc = rcv->sb_cc; + sorwakeup(so2); + m = 0; +#undef snd +#undef rcv + break; + + default: + panic("uipc 4"); + } + break; + + case PRU_ABORT: + unp_drop(unp, ECONNABORTED); + break; + + case PRU_SENSE: + ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat; + if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) { + so2 = unp->unp_conn->unp_socket; + ((struct stat *) m)->st_blksize += so2->so_rcv.sb_cc; + } + ((struct stat *) m)->st_dev = NODEV; + if (unp->unp_ino == 0) + unp->unp_ino = unp_ino++; + ((struct stat *) m)->st_ino = unp->unp_ino; + return (0); + + case PRU_RCVOOB: + return (EOPNOTSUPP); + + case PRU_SENDOOB: + error = EOPNOTSUPP; + break; + + case PRU_SOCKADDR: + if (unp->unp_addr) { + nam->m_len = unp->unp_addr->m_len; + bcopy(mtod(unp->unp_addr, caddr_t), + mtod(nam, caddr_t), (unsigned)nam->m_len); + } else + nam->m_len = 0; + break; + + case PRU_PEERADDR: + if (unp->unp_conn && unp->unp_conn->unp_addr) { + nam->m_len = unp->unp_conn->unp_addr->m_len; + bcopy(mtod(unp->unp_conn->unp_addr, caddr_t), + mtod(nam, caddr_t), (unsigned)nam->m_len); + } else + nam->m_len = 0; + break; + + case PRU_SLOWTIMO: + break; + + default: + panic("piusrreq"); + } +release: + if (control) + m_freem(control); + if (m) + m_freem(m); + return (error); +} + +/* + * Both send and receive buffers are allocated PIPSIZ bytes of buffering + * for stream sockets, although the total for sender and receiver is + * actually only PIPSIZ. + * Datagram sockets really use the sendspace as the maximum datagram size, + * and don't really want to reserve the sendspace. Their recvspace should + * be large enough for at least one max-size datagram plus address. 
+ */ +#define PIPSIZ 4096 +u_long unpst_sendspace = PIPSIZ; +u_long unpst_recvspace = PIPSIZ; +u_long unpdg_sendspace = 2*1024; /* really max datagram size */ +u_long unpdg_recvspace = 4*1024; + +int unp_rights; /* file descriptors in flight */ + +unp_attach(so) + struct socket *so; +{ + register struct mbuf *m; + register struct unpcb *unp; + int error; + + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { + switch (so->so_type) { + + case SOCK_STREAM: + error = soreserve(so, unpst_sendspace, unpst_recvspace); + break; + + case SOCK_DGRAM: + error = soreserve(so, unpdg_sendspace, unpdg_recvspace); + break; + + default: + panic("unp_attach"); + } + if (error) + return (error); + } + m = m_getclr(M_DONTWAIT, MT_PCB); + if (m == NULL) + return (ENOBUFS); + unp = mtod(m, struct unpcb *); + so->so_pcb = (caddr_t)unp; + unp->unp_socket = so; + return (0); +} + +unp_detach(unp) + register struct unpcb *unp; +{ + + if (unp->unp_vnode) { + unp->unp_vnode->v_socket = 0; + vrele(unp->unp_vnode); + unp->unp_vnode = 0; + } + if (unp->unp_conn) + unp_disconnect(unp); + while (unp->unp_refs) + unp_drop(unp->unp_refs, ECONNRESET); + soisdisconnected(unp->unp_socket); + unp->unp_socket->so_pcb = 0; + m_freem(unp->unp_addr); + (void) m_free(dtom(unp)); + if (unp_rights) { + /* + * Normally the receive buffer is flushed later, + * in sofree, but if our receive buffer holds references + * to descriptors that are now garbage, we will dispose + * of those descriptor references after the garbage collector + * gets them (resulting in a "panic: closef: count < 0"). 
+ */ + sorflush(unp->unp_socket); + unp_gc(); + } +} + +unp_bind(unp, nam, p) + struct unpcb *unp; + struct mbuf *nam; + struct proc *p; +{ + struct sockaddr_un *soun = mtod(nam, struct sockaddr_un *); + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT, UIO_SYSSPACE, + soun->sun_path, p); + if (unp->unp_vnode != NULL) + return (EINVAL); + if (nam->m_len == MLEN) { + if (*(mtod(nam, caddr_t) + nam->m_len - 1) != 0) + return (EINVAL); + } else + *(mtod(nam, caddr_t) + nam->m_len) = 0; +/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(vp); + return (EADDRINUSE); + } + VATTR_NULL(&vattr); + vattr.va_type = VSOCK; + vattr.va_mode = ACCESSPERMS; + LEASE_CHECK(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr)) + return (error); + vp = nd.ni_vp; + vp->v_socket = unp->unp_socket; + unp->unp_vnode = vp; + unp->unp_addr = m_copy(nam, 0, (int)M_COPYALL); + VOP_UNLOCK(vp); + return (0); +} + +unp_connect(so, nam, p) + struct socket *so; + struct mbuf *nam; + struct proc *p; +{ + register struct sockaddr_un *soun = mtod(nam, struct sockaddr_un *); + register struct vnode *vp; + register struct socket *so2, *so3; + struct unpcb *unp2, *unp3; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p); + if (nam->m_data + nam->m_len == &nam->m_dat[MLEN]) { /* XXX */ + if (*(mtod(nam, caddr_t) + nam->m_len - 1) != 0) + return (EMSGSIZE); + } else + *(mtod(nam, caddr_t) + nam->m_len) = 0; + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VSOCK) { + error = ENOTSOCK; + goto bad; + } + if (error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) + goto bad; + so2 = 
vp->v_socket; + if (so2 == 0) { + error = ECONNREFUSED; + goto bad; + } + if (so->so_type != so2->so_type) { + error = EPROTOTYPE; + goto bad; + } + if (so->so_proto->pr_flags & PR_CONNREQUIRED) { + if ((so2->so_options & SO_ACCEPTCONN) == 0 || + (so3 = sonewconn(so2, 0)) == 0) { + error = ECONNREFUSED; + goto bad; + } + unp2 = sotounpcb(so2); + unp3 = sotounpcb(so3); + if (unp2->unp_addr) + unp3->unp_addr = + m_copy(unp2->unp_addr, 0, (int)M_COPYALL); + so2 = so3; + } + error = unp_connect2(so, so2); +bad: + vput(vp); + return (error); +} + +unp_connect2(so, so2) + register struct socket *so; + register struct socket *so2; +{ + register struct unpcb *unp = sotounpcb(so); + register struct unpcb *unp2; + + if (so2->so_type != so->so_type) + return (EPROTOTYPE); + unp2 = sotounpcb(so2); + unp->unp_conn = unp2; + switch (so->so_type) { + + case SOCK_DGRAM: + unp->unp_nextref = unp2->unp_refs; + unp2->unp_refs = unp; + soisconnected(so); + break; + + case SOCK_STREAM: + unp2->unp_conn = unp; + soisconnected(so); + soisconnected(so2); + break; + + default: + panic("unp_connect2"); + } + return (0); +} + +unp_disconnect(unp) + struct unpcb *unp; +{ + register struct unpcb *unp2 = unp->unp_conn; + + if (unp2 == 0) + return; + unp->unp_conn = 0; + switch (unp->unp_socket->so_type) { + + case SOCK_DGRAM: + if (unp2->unp_refs == unp) + unp2->unp_refs = unp->unp_nextref; + else { + unp2 = unp2->unp_refs; + for (;;) { + if (unp2 == 0) + panic("unp_disconnect"); + if (unp2->unp_nextref == unp) + break; + unp2 = unp2->unp_nextref; + } + unp2->unp_nextref = unp->unp_nextref; + } + unp->unp_nextref = 0; + unp->unp_socket->so_state &= ~SS_ISCONNECTED; + break; + + case SOCK_STREAM: + soisdisconnected(unp->unp_socket); + unp2->unp_conn = 0; + soisdisconnected(unp2->unp_socket); + break; + } +} + +#ifdef notdef +unp_abort(unp) + struct unpcb *unp; +{ + + unp_detach(unp); +} +#endif + +unp_shutdown(unp) + struct unpcb *unp; +{ + struct socket *so; + + if (unp->unp_socket->so_type == 
SOCK_STREAM && unp->unp_conn && + (so = unp->unp_conn->unp_socket)) + socantrcvmore(so); +} + +unp_drop(unp, errno) + struct unpcb *unp; + int errno; +{ + struct socket *so = unp->unp_socket; + + so->so_error = errno; + unp_disconnect(unp); + if (so->so_head) { + so->so_pcb = (caddr_t) 0; + m_freem(unp->unp_addr); + (void) m_free(dtom(unp)); + sofree(so); + } +} + +#ifdef notdef +unp_drain() +{ + +} +#endif + +unp_externalize(rights) + struct mbuf *rights; +{ + struct proc *p = curproc; /* XXX */ + register int i; + register struct cmsghdr *cm = mtod(rights, struct cmsghdr *); + register struct file **rp = (struct file **)(cm + 1); + register struct file *fp; + int newfds = (cm->cmsg_len - sizeof(*cm)) / sizeof (int); + int f; + + if (!fdavail(p, newfds)) { + for (i = 0; i < newfds; i++) { + fp = *rp; + unp_discard(fp); + *rp++ = 0; + } + return (EMSGSIZE); + } + for (i = 0; i < newfds; i++) { + if (fdalloc(p, 0, &f)) + panic("unp_externalize"); + fp = *rp; + p->p_fd->fd_ofiles[f] = fp; + fp->f_msgcount--; + unp_rights--; + *(int *)rp++ = f; + } + return (0); +} + +unp_internalize(control, p) + struct mbuf *control; + struct proc *p; +{ + struct filedesc *fdp = p->p_fd; + register struct cmsghdr *cm = mtod(control, struct cmsghdr *); + register struct file **rp; + register struct file *fp; + register int i, fd; + int oldfds; + + if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET || + cm->cmsg_len != control->m_len) + return (EINVAL); + oldfds = (cm->cmsg_len - sizeof (*cm)) / sizeof (int); + rp = (struct file **)(cm + 1); + for (i = 0; i < oldfds; i++) { + fd = *(int *)rp++; + if ((unsigned)fd >= fdp->fd_nfiles || + fdp->fd_ofiles[fd] == NULL) + return (EBADF); + } + rp = (struct file **)(cm + 1); + for (i = 0; i < oldfds; i++) { + fp = fdp->fd_ofiles[*(int *)rp]; + *rp++ = fp; + fp->f_count++; + fp->f_msgcount++; + unp_rights++; + } + return (0); +} + +int unp_defer, unp_gcing; +int unp_mark(); +extern struct domain unixdomain; + +unp_gc() +{ + 
register struct file *fp, *nextfp; + register struct socket *so; + struct file **extra_ref, **fpp; + int nunref, i; + + if (unp_gcing) + return; + unp_gcing = 1; + unp_defer = 0; + for (fp = filehead; fp; fp = fp->f_filef) + fp->f_flag &= ~(FMARK|FDEFER); + do { + for (fp = filehead; fp; fp = fp->f_filef) { + if (fp->f_count == 0) + continue; + if (fp->f_flag & FDEFER) { + fp->f_flag &= ~FDEFER; + unp_defer--; + } else { + if (fp->f_flag & FMARK) + continue; + if (fp->f_count == fp->f_msgcount) + continue; + fp->f_flag |= FMARK; + } + if (fp->f_type != DTYPE_SOCKET || + (so = (struct socket *)fp->f_data) == 0) + continue; + if (so->so_proto->pr_domain != &unixdomain || + (so->so_proto->pr_flags&PR_RIGHTS) == 0) + continue; +#ifdef notdef + if (so->so_rcv.sb_flags & SB_LOCK) { + /* + * This is problematical; it's not clear + * we need to wait for the sockbuf to be + * unlocked (on a uniprocessor, at least), + * and it's also not clear what to do + * if sbwait returns an error due to receipt + * of a signal. If sbwait does return + * an error, we'll go into an infinite + * loop. Delete all of this for now. + */ + (void) sbwait(&so->so_rcv); + goto restart; + } +#endif + unp_scan(so->so_rcv.sb_mb, unp_mark); + } + } while (unp_defer); + /* + * We grab an extra reference to each of the file table entries + * that are not otherwise accessible and then free the rights + * that are stored in messages on them. + * + * The bug in the orginal code is a little tricky, so I'll describe + * what's wrong with it here. + * + * It is incorrect to simply unp_discard each entry for f_msgcount + * times -- consider the case of sockets A and B that contain + * references to each other. On a last close of some other socket, + * we trigger a gc since the number of outstanding rights (unp_rights) + * is non-zero. If during the sweep phase the gc code un_discards, + * we end up doing a (full) closef on the descriptor. A closef on A + * results in the following chain. 
Closef calls soo_close, which + * calls soclose. Soclose calls first (through the switch + * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply + * returns because the previous instance had set unp_gcing, and + * we return all the way back to soclose, which marks the socket + * with SS_NOFDREF, and then calls sofree. Sofree calls sorflush + * to free up the rights that are queued in messages on the socket A, + * i.e., the reference on B. The sorflush calls via the dom_dispose + * switch unp_dispose, which unp_scans with unp_discard. This second + * instance of unp_discard just calls closef on B. + * + * Well, a similar chain occurs on B, resulting in a sorflush on B, + * which results in another closef on A. Unfortunately, A is already + * being closed, and the descriptor has already been marked with + * SS_NOFDREF, and soclose panics at this point. + * + * Here, we first take an extra reference to each inaccessible + * descriptor. Then, we call sorflush ourself, since we know + * it is a Unix domain socket anyhow. After we destroy all the + * rights carried in messages, we do a last closef to get rid + * of our extra reference. This is the last close, and the + * unp_detach etc will shut down the socket. 
+ * + * 91/09/19, bsy@cs.cmu.edu + */ + extra_ref = malloc(nfiles * sizeof(struct file *), M_FILE, M_WAITOK); + for (nunref = 0, fp = filehead, fpp = extra_ref; fp; fp = nextfp) { + nextfp = fp->f_filef; + if (fp->f_count == 0) + continue; + if (fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) { + *fpp++ = fp; + nunref++; + fp->f_count++; + } + } + for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) + sorflush((struct socket *)(*fpp)->f_data); + for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) + closef(*fpp); + free((caddr_t)extra_ref, M_FILE); + unp_gcing = 0; +} + +unp_dispose(m) + struct mbuf *m; +{ + int unp_discard(); + + if (m) + unp_scan(m, unp_discard); +} + +unp_scan(m0, op) + register struct mbuf *m0; + int (*op)(); +{ + register struct mbuf *m; + register struct file **rp; + register struct cmsghdr *cm; + register int i; + int qfds; + + while (m0) { + for (m = m0; m; m = m->m_next) + if (m->m_type == MT_CONTROL && + m->m_len >= sizeof(*cm)) { + cm = mtod(m, struct cmsghdr *); + if (cm->cmsg_level != SOL_SOCKET || + cm->cmsg_type != SCM_RIGHTS) + continue; + qfds = (cm->cmsg_len - sizeof *cm) + / sizeof (struct file *); + rp = (struct file **)(cm + 1); + for (i = 0; i < qfds; i++) + (*op)(*rp++); + break; /* XXX, but saves time */ + } + m0 = m0->m_act; + } +} + +unp_mark(fp) + struct file *fp; +{ + + if (fp->f_flag & FMARK) + return; + unp_defer++; + fp->f_flag |= (FMARK|FDEFER); +} + +unp_discard(fp) + struct file *fp; +{ + + fp->f_msgcount--; + unp_rights--; + (void) closef(fp, (struct proc *)NULL); +} diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c new file mode 100644 index 00000000000..ec5c962f7df --- /dev/null +++ b/sys/kern/vfs_bio.c @@ -0,0 +1,339 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. 
+ * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Definitions for the buffer hash lists. + */ +#define BUFHASH(dvp, lbn) \ + (&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash]) +LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash; +u_long bufhash; + +/* + * Insq/Remq for the buffer hash lists. + */ +#define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash) +#define bremhash(bp) LIST_REMOVE(bp, b_hash) + +/* + * Definitions for the buffer free lists. + */ +#define BQUEUES 4 /* number of free buffer queues */ + +#define BQ_LOCKED 0 /* super-blocks &c */ +#define BQ_LRU 1 /* lru, useful buffers */ +#define BQ_AGE 2 /* rubbish */ +#define BQ_EMPTY 3 /* buffer headers with no memory */ + +TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES]; +int needbuffer; + +/* + * Insq/Remq for the buffer free lists. + */ +#define binsheadfree(bp, dp) TAILQ_INSERT_HEAD(dp, bp, b_freelist) +#define binstailfree(bp, dp) TAILQ_INSERT_TAIL(dp, bp, b_freelist) + +void +bremfree(bp) + struct buf *bp; +{ + struct bqueues *dp = NULL; + + /* + * We only calculate the head of the freelist when removing + * the last element of the list as that is the only time that + * it is needed (e.g. to reset the tail pointer). + * + * NB: This makes an assumption about how tailq's are implemented. 
+ */ + if (bp->b_freelist.tqe_next == NULL) { + for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) + if (dp->tqh_last == &bp->b_freelist.tqe_next) + break; + if (dp == &bufqueues[BQUEUES]) + panic("bremfree: lost tail"); + } + TAILQ_REMOVE(dp, bp, b_freelist); +} + +/* + * Initialize buffers and hash links for buffers. + */ +void +bufinit() +{ + register struct buf *bp; + struct bqueues *dp; + register int i; + int base, residual; + + for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) + TAILQ_INIT(dp); + bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash); + base = bufpages / nbuf; + residual = bufpages % nbuf; + for (i = 0; i < nbuf; i++) { + bp = &buf[i]; + bzero((char *)bp, sizeof *bp); + bp->b_dev = NODEV; + bp->b_rcred = NOCRED; + bp->b_wcred = NOCRED; + bp->b_vnbufs.le_next = NOLIST; + bp->b_data = buffers + i * MAXBSIZE; + if (i < residual) + bp->b_bufsize = (base + 1) * CLBYTES; + else + bp->b_bufsize = base * CLBYTES; + bp->b_flags = B_INVAL; + dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY]; + binsheadfree(bp, dp); + binshash(bp, &invalhash); + } +} + +bread(a1, a2, a3, a4, a5) + struct vnode *a1; + daddr_t a2; + int a3; + struct ucred *a4; + struct buf **a5; +{ + + /* + * Body deleted. + */ + return (EIO); +} + +breadn(a1, a2, a3, a4, a5, a6, a7, a8) + struct vnode *a1; + daddr_t a2; int a3; + daddr_t a4[]; int a5[]; + int a6; + struct ucred *a7; + struct buf **a8; +{ + + /* + * Body deleted. + */ + return (EIO); +} + +bwrite(a1) + struct buf *a1; +{ + + /* + * Body deleted. + */ + return (EIO); +} + +int +vn_bwrite(ap) + struct vop_bwrite_args *ap; +{ + return (bwrite(ap->a_bp)); +} + +bdwrite(a1) + struct buf *a1; +{ + + /* + * Body deleted. + */ + return; +} + +bawrite(a1) + struct buf *a1; +{ + + /* + * Body deleted. + */ + return; +} + +brelse(a1) + struct buf *a1; +{ + + /* + * Body deleted. + */ + return; +} + +struct buf * +incore(a1, a2) + struct vnode *a1; + daddr_t a2; +{ + + /* + * Body deleted. 
+ */ + return (0); +} + +struct buf * +getblk(a1, a2, a3, a4, a5) + struct vnode *a1; + daddr_t a2; + int a3, a4, a5; +{ + + /* + * Body deleted. + */ + return ((struct buf *)0); +} + +struct buf * +geteblk(a1) + int a1; +{ + + /* + * Body deleted. + */ + return ((struct buf *)0); +} + +allocbuf(a1, a2) + struct buf *a1; + int a2; +{ + + /* + * Body deleted. + */ + return (0); +} + +struct buf * +getnewbuf(a1, a2) + int a1, a2; +{ + + /* + * Body deleted. + */ + return ((struct buf *)0); +} + +biowait(a1) + struct buf *a1; +{ + + /* + * Body deleted. + */ + return (EIO); +} + +void +biodone(a1) + struct buf *a1; +{ + + /* + * Body deleted. + */ + return; +} + +int +count_lock_queue() +{ + + /* + * Body deleted. + */ + return (0); +} + +#ifdef DIAGNOSTIC +/* + * Print out statistics on the current allocation of the buffer pool. + * Can be enabled to print out on every ``sync'' by setting "syncprt" + * in vfs_syscalls.c using sysctl. + */ +void +vfs_bufstats() +{ + int s, i, j, count; + register struct buf *bp; + register struct bqueues *dp; + int counts[MAXBSIZE/CLBYTES+1]; + static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" }; + + for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) { + count = 0; + for (j = 0; j <= MAXBSIZE/CLBYTES; j++) + counts[j] = 0; + s = splbio(); + for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) { + counts[bp->b_bufsize/CLBYTES]++; + count++; + } + splx(s); + printf("%s: total-%d", bname[i], count); + for (j = 0; j <= MAXBSIZE/CLBYTES; j++) + if (counts[j] != 0) + printf(", %d-%d", j * CLBYTES, counts[j]); + printf("\n"); + } +} +#endif /* DIAGNOSTIC */ diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c new file mode 100644 index 00000000000..4ccfd7289a0 --- /dev/null +++ b/sys/kern/vfs_cache.c @@ -0,0 +1,328 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_cache.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Name caching works as follows: + * + * Names found by directory scans are retained in a cache + * for future reference. 
It is managed LRU, so frequently + * used names will hang around. Cache is indexed by hash value + * obtained from (vp, name) where vp refers to the directory + * containing name. + * + * For simplicity (and economy of storage), names longer than + * a maximum length of NCHNAMLEN are not cached; they occur + * infrequently in any case, and are almost never of interest. + * + * Upon reaching the last segment of a path, if the reference + * is for DELETE, or NOCACHE is set (rewrite), and the + * name is located in the cache, it will be dropped. + */ + +/* + * Structures associated with name caching. + */ +struct namecache **nchashtbl; /* hash chain heads, indexed by hash & nchash */ +u_long nchash; /* size of hash table - 1 */ +long numcache; /* number of cache entries allocated */ +struct namecache *nchhead, **nchtail; /* LRU chain pointers */ +struct nchstats nchstats; /* cache effectiveness statistics */ + +int doingcache = 1; /* 1 => enable the cache */ + +/* + * Look for the name in the cache. We don't do this + * if the segment name is long, simply so the cache can avoid + * holding long names (which would either waste space, or + * add greatly to the complexity). + * + * Lookup is called with dvp pointing to the directory to search, + * cnp->cn_nameptr pointing to the name of the entry being sought, + * cnp->cn_namelen giving the length of the name, and cnp->cn_hash + * containing a hash of the name. If the lookup succeeds, the vnode is returned in *vpp + * and a status of -1 is returned. If the lookup determines that + * the name does not exist (negative caching), a status of ENOENT + * is returned. If the lookup fails, a status of zero is returned. 
+ */ +int +cache_lookup(dvp, vpp, cnp) + struct vnode *dvp; + struct vnode **vpp; + struct componentname *cnp; +{ + register struct namecache *ncp, *ncq, **ncpp; + + if (!doingcache) + return (0); + if (cnp->cn_namelen > NCHNAMLEN) { + nchstats.ncs_long++; + cnp->cn_flags &= ~MAKEENTRY; /* name is too long to be cached */ + return (0); + } + ncpp = &nchashtbl[cnp->cn_hash & nchash]; + for (ncp = *ncpp; ncp; ncp = ncp->nc_forw) { + if (ncp->nc_dvp == dvp && + ncp->nc_dvpid == dvp->v_id && + ncp->nc_nlen == cnp->cn_namelen && + !bcmp(ncp->nc_name, cnp->cn_nameptr, (u_int)ncp->nc_nlen)) + break; + } + if (ncp == NULL) { + nchstats.ncs_miss++; + return (0); + } + if (!(cnp->cn_flags & MAKEENTRY)) { + nchstats.ncs_badhits++; + } else if (ncp->nc_vp == NULL) { /* negative entry */ + if (cnp->cn_nameiop != CREATE) { + nchstats.ncs_neghits++; + /* + * Move this slot to end of LRU chain, + * if not already there. + */ + if (ncp->nc_nxt) { + /* remove from LRU chain */ + *ncp->nc_prev = ncp->nc_nxt; + ncp->nc_nxt->nc_prev = ncp->nc_prev; + /* and replace at end of it */ + ncp->nc_nxt = NULL; + ncp->nc_prev = nchtail; + *nchtail = ncp; + nchtail = &ncp->nc_nxt; + } + return (ENOENT); + } + } else if (ncp->nc_vpid != ncp->nc_vp->v_id) { /* v_id differs from the one recorded in the entry */ + nchstats.ncs_falsehits++; + } else { + nchstats.ncs_goodhits++; + /* + * Move this slot to end of LRU chain, if not already there. + */ + if (ncp->nc_nxt) { + /* remove from LRU chain */ + *ncp->nc_prev = ncp->nc_nxt; + ncp->nc_nxt->nc_prev = ncp->nc_prev; + /* and replace at end of it */ + ncp->nc_nxt = NULL; + ncp->nc_prev = nchtail; + *nchtail = ncp; + nchtail = &ncp->nc_nxt; + } + *vpp = ncp->nc_vp; + return (-1); + } + + /* + * Last component and we are renaming or deleting, + * the cache entry is invalid, or otherwise don't + * want cache entry to exist. 
+ */ + /* remove from LRU chain */ + if (ncq = ncp->nc_nxt) + ncq->nc_prev = ncp->nc_prev; + else + nchtail = ncp->nc_prev; + *ncp->nc_prev = ncq; + /* remove from hash chain */ + if (ncq = ncp->nc_forw) + ncq->nc_back = ncp->nc_back; + *ncp->nc_back = ncq; + /* and make a dummy hash chain */ + ncp->nc_forw = NULL; + ncp->nc_back = NULL; + /* insert at head of LRU list (first to grab) */ + if (ncq = nchhead) + ncq->nc_prev = &ncp->nc_nxt; + else + nchtail = &ncp->nc_nxt; + nchhead = ncp; + ncp->nc_nxt = ncq; + ncp->nc_prev = &nchhead; + return (0); +} + +/* + * Add an entry to the cache + */ +cache_enter(dvp, vp, cnp) + struct vnode *dvp; + struct vnode *vp; + struct componentname *cnp; +{ + register struct namecache *ncp, *ncq, **ncpp; + +#ifdef DIAGNOSTIC + if (cnp->cn_namelen > NCHNAMLEN) + panic("cache_enter: name too long"); +#endif + if (!doingcache) + return; + /* + * Free the cache slot at head of lru chain. + */ + if (numcache < desiredvnodes) { + ncp = (struct namecache *) + malloc((u_long)sizeof *ncp, M_CACHE, M_WAITOK); + bzero((char *)ncp, sizeof *ncp); + numcache++; + } else if (ncp = nchhead) { + /* remove from lru chain */ + if (ncq = ncp->nc_nxt) + ncq->nc_prev = ncp->nc_prev; + else + nchtail = ncp->nc_prev; + *ncp->nc_prev = ncq; + /* remove from old hash chain, if on one */ + if (ncp->nc_back) { + if (ncq = ncp->nc_forw) + ncq->nc_back = ncp->nc_back; + *ncp->nc_back = ncq; + ncp->nc_forw = NULL; + ncp->nc_back = NULL; + } + } else + return; + /* grab the vnode we just found */ + ncp->nc_vp = vp; + if (vp) + ncp->nc_vpid = vp->v_id; + else + ncp->nc_vpid = 0; + /* fill in cache info */ + ncp->nc_dvp = dvp; + ncp->nc_dvpid = dvp->v_id; + ncp->nc_nlen = cnp->cn_namelen; + bcopy(cnp->cn_nameptr, ncp->nc_name, (unsigned)ncp->nc_nlen); + /* link at end of lru chain */ + ncp->nc_nxt = NULL; + ncp->nc_prev = nchtail; + *nchtail = ncp; + nchtail = &ncp->nc_nxt; + /* and insert on hash chain */ + ncpp = &nchashtbl[cnp->cn_hash & nchash]; + if (ncq = 
*ncpp) + ncq->nc_back = &ncp->nc_forw; + ncp->nc_forw = ncq; + ncp->nc_back = ncpp; + *ncpp = ncp; +} + +/* + * Name cache initialization, from vfs_init() when we are booting + */ +nchinit() +{ + + nchtail = &nchhead; + nchashtbl = hashinit(desiredvnodes, M_CACHE, &nchash); +} + +/* + * Cache flush, a particular vnode; called when a vnode is renamed to + * hide entries that would now be invalid + */ +cache_purge(vp) + struct vnode *vp; +{ + struct namecache *ncp, **ncpp; + + vp->v_id = ++nextvnodeid; + if (nextvnodeid != 0) + return; + for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { + for (ncp = *ncpp; ncp; ncp = ncp->nc_forw) { + ncp->nc_vpid = 0; + ncp->nc_dvpid = 0; + } + } + vp->v_id = ++nextvnodeid; +} + +/* + * Cache flush, a whole filesystem; called when filesys is umounted to + * remove entries that would now be invalid + * + * The line "nxtcp = nchhead" near the end is to avoid potential problems + * if the cache lru chain is modified while we are dumping the + * inode. This makes the algorithm O(n^2), but do you think I care? 
+ */ +cache_purgevfs(mp) + struct mount *mp; +{ + register struct namecache *ncp, *nxtcp; + + for (ncp = nchhead; ncp; ncp = nxtcp) { + if (ncp->nc_dvp == NULL || ncp->nc_dvp->v_mount != mp) { + nxtcp = ncp->nc_nxt; + continue; + } + /* free the resources we had */ + ncp->nc_vp = NULL; + ncp->nc_dvp = NULL; + /* remove from old hash chain, if on one */ + if (ncp->nc_back) { + if (nxtcp = ncp->nc_forw) + nxtcp->nc_back = ncp->nc_back; + *ncp->nc_back = nxtcp; + ncp->nc_forw = NULL; + ncp->nc_back = NULL; + } + /* delete this entry from LRU chain */ + if (nxtcp = ncp->nc_nxt) + nxtcp->nc_prev = ncp->nc_prev; + else + nchtail = ncp->nc_prev; + *ncp->nc_prev = nxtcp; + /* cause rescan of list, it may have altered */ + /* also put the now-free entry at head of LRU */ + if (nxtcp = nchhead) + nxtcp->nc_prev = &ncp->nc_nxt; + else + nchtail = &ncp->nc_nxt; + nchhead = ncp; + ncp->nc_nxt = nxtcp; + ncp->nc_prev = &nchhead; + } +} diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c new file mode 100644 index 00000000000..c34fbc34a67 --- /dev/null +++ b/sys/kern/vfs_cluster.c @@ -0,0 +1,746 @@ +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DEBUG +#include +#include +int doreallocblks = 1; +struct ctldebug debug13 = { "doreallocblks", &doreallocblks }; +#else +/* XXX for cluster_write */ +#define doreallocblks 1 +#endif + +/* + * Local declarations + */ +struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t, + daddr_t, long, int)); +struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *, + daddr_t, daddr_t, long, int, long)); +void cluster_wbuild __P((struct vnode *, struct buf *, long, + daddr_t, int, daddr_t)); +struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *)); + +#ifdef DIAGNOSTIC +/* + * Set to 1 if reads of block zero should cause readahead to be done. + * Set to 0 treats a read of block zero as a non-sequential read. 
+ * + * Setting to one assumes that most reads of block zero of files are due to + * sequential passes over the files (e.g. cat, sum) where additional blocks + * will soon be needed. Setting to zero assumes that the majority are + * surgical strikes to get particular info (e.g. size, file) where readahead + * blocks will not be used and, in fact, push out other potentially useful + * blocks from the cache. The former seems intuitive, but some quick tests + * showed that the latter performed better from a system-wide point of view. + */ +int doclusterraz = 0; +#define ISSEQREAD(vp, blk) \ + (((blk) != 0 || doclusterraz) && \ + ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) +#else +#define ISSEQREAD(vp, blk) \ + ((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) +#endif + +/* + * This replaces bread. If this is a bread at the beginning of a file and + * lastr is 0, we assume this is the first read and we'll read up to two + * blocks if they are sequential. After that, we'll do regular read ahead + * in clustered chunks. + * + * There are 4 or 5 cases depending on how you count: + * Desired block is in the cache: + * 1 Not sequential access (0 I/Os). + * 2 Access is sequential, do read-ahead (1 ASYNC). + * Desired block is not in cache: + * 3 Not sequential access (1 SYNC). + * 4 Sequential access, next block is contiguous (1 SYNC). + * 5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC) + * + * There are potentially two buffers that require I/O. + * bp is the block requested. + * rbp is the read-ahead block. + * If either is NULL, then you don't have to do the I/O. 
+ */ +cluster_read(vp, filesize, lblkno, size, cred, bpp) + struct vnode *vp; + u_quad_t filesize; + daddr_t lblkno; + long size; + struct ucred *cred; + struct buf **bpp; +{ + struct buf *bp, *rbp; + daddr_t blkno, ioblkno; + long flags; + int error, num_ra, alreadyincore; + +#ifdef DIAGNOSTIC + if (size == 0) + panic("cluster_read: size = 0"); +#endif + + error = 0; + flags = B_READ; + *bpp = bp = getblk(vp, lblkno, size, 0, 0); + if (bp->b_flags & B_CACHE) { + /* + * Desired block is in cache; do any readahead ASYNC. + * Case 1, 2. + */ + trace(TR_BREADHIT, pack(vp, size), lblkno); + flags |= B_ASYNC; + ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1); + alreadyincore = (int)incore(vp, ioblkno); + bp = NULL; + } else { + /* Block wasn't in cache, case 3, 4, 5. */ + trace(TR_BREADMISS, pack(vp, size), lblkno); + bp->b_flags |= B_READ; + ioblkno = lblkno; + alreadyincore = 0; + curproc->p_stats->p_ru.ru_inblock++; /* XXX */ + } + /* + * XXX + * Replace 1 with a window size based on some permutation of + * maxcontig and rot_delay. This will let you figure out how + * many blocks you should read-ahead (case 2, 4, 5). + * + * If the access isn't sequential, reset the window to 1. + * Note that a read to the same block is considered sequential. + * This catches the case where the file is being read sequentially, + * but at smaller than the filesystem block size. + */ + rbp = NULL; + if (!ISSEQREAD(vp, lblkno)) { + vp->v_ralen = 0; + vp->v_maxra = lblkno; + } else if ((ioblkno + 1) * size <= filesize && !alreadyincore && + !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) && + blkno != -1) { + /* + * Reading sequentially, and the next block is not in the + * cache. We are going to try reading ahead. + */ + if (num_ra) { + /* + * If our desired readahead block had been read + * in a previous readahead but is no longer in + * core, then we may be reading ahead too far + * or are not using our readahead very rapidly. + * In this case we scale back the window. 
+ */ + if (!alreadyincore && ioblkno <= vp->v_maxra) + vp->v_ralen = max(vp->v_ralen >> 1, 1); + /* + * There are more sequential blocks than our current + * window allows, scale up. Ideally we want to get + * in sync with the filesystem maxcontig value. + */ + else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr) + vp->v_ralen = vp->v_ralen ? + min(num_ra, vp->v_ralen << 1) : 1; + + if (num_ra > vp->v_ralen) + num_ra = vp->v_ralen; + } + + if (num_ra) /* case 2, 4 */ + rbp = cluster_rbuild(vp, filesize, + bp, ioblkno, blkno, size, num_ra, flags); + else if (ioblkno == lblkno) { + bp->b_blkno = blkno; + /* Case 5: check how many blocks to read ahead */ + ++ioblkno; + if ((ioblkno + 1) * size > filesize || + incore(vp, ioblkno) || (error = VOP_BMAP(vp, + ioblkno, NULL, &blkno, &num_ra)) || blkno == -1) + goto skip_readahead; + /* + * Adjust readahead as above + */ + if (num_ra) { + if (!alreadyincore && ioblkno <= vp->v_maxra) + vp->v_ralen = max(vp->v_ralen >> 1, 1); + else if (num_ra > vp->v_ralen && + lblkno != vp->v_lastr) + vp->v_ralen = vp->v_ralen ? + min(num_ra,vp->v_ralen<<1) : 1; + if (num_ra > vp->v_ralen) + num_ra = vp->v_ralen; + } + flags |= B_ASYNC; + if (num_ra) + rbp = cluster_rbuild(vp, filesize, + NULL, ioblkno, blkno, size, num_ra, flags); + else { + rbp = getblk(vp, ioblkno, size, 0, 0); + rbp->b_flags |= flags; + rbp->b_blkno = blkno; + } + } else { + /* case 2; read ahead single block */ + rbp = getblk(vp, ioblkno, size, 0, 0); + rbp->b_flags |= flags; + rbp->b_blkno = blkno; + } + + if (rbp == bp) /* case 4 */ + rbp = NULL; + else if (rbp) { /* case 2, 5 */ + trace(TR_BREADMISSRA, + pack(vp, (num_ra + 1) * size), ioblkno); + curproc->p_stats->p_ru.ru_inblock++; /* XXX */ + } + } + + /* XXX Kirk, do we need to make sure the bp has creds? 
*/ +skip_readahead: + if (bp) + if (bp->b_flags & (B_DONE | B_DELWRI)) + panic("cluster_read: DONE bp"); + else + error = VOP_STRATEGY(bp); + + if (rbp) + if (error || rbp->b_flags & (B_DONE | B_DELWRI)) { + rbp->b_flags &= ~(B_ASYNC | B_READ); + brelse(rbp); + } else + (void) VOP_STRATEGY(rbp); + + /* + * Recalculate our maximum readahead + */ + if (rbp == NULL) + rbp = bp; + if (rbp) + vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1; + + if (bp) + return(biowait(bp)); + return(error); +} + +/* + * If blocks are contiguous on disk, use this to provide clustered + * read ahead. We will read as many blocks as possible sequentially + * and then parcel them up into logical blocks in the buffer hash table. + */ +struct buf * +cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags) + struct vnode *vp; + u_quad_t filesize; + struct buf *bp; + daddr_t lbn; + daddr_t blkno; + long size; + int run; + long flags; +{ + struct cluster_save *b_save; + struct buf *tbp; + daddr_t bn; + int i, inc; + +#ifdef DIAGNOSTIC + if (size != vp->v_mount->mnt_stat.f_iosize) + panic("cluster_rbuild: size %d != filesize %d\n", + size, vp->v_mount->mnt_stat.f_iosize); +#endif + if (size * (lbn + run + 1) > filesize) + --run; + if (run == 0) { + if (!bp) { + bp = getblk(vp, lbn, size, 0, 0); + bp->b_blkno = blkno; + bp->b_flags |= flags; + } + return(bp); + } + + bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1); + if (bp->b_flags & (B_DONE | B_DELWRI)) + return (bp); + + b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save), + M_SEGMENT, M_WAITOK); + b_save->bs_bufsize = b_save->bs_bcount = size; + b_save->bs_nchildren = 0; + b_save->bs_children = (struct buf **)(b_save + 1); + b_save->bs_saveaddr = bp->b_saveaddr; + bp->b_saveaddr = (caddr_t) b_save; + + inc = btodb(size); + for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) { + if (incore(vp, lbn + i)) { + if (i == 1) { + bp->b_saveaddr = b_save->bs_saveaddr; + bp->b_flags &= ~B_CALL; + 
bp->b_iodone = NULL; + allocbuf(bp, size); + free(b_save, M_SEGMENT); + } else + allocbuf(bp, size * i); + break; + } + tbp = getblk(vp, lbn + i, 0, 0, 0); + /* + * getblk may return some memory in the buffer if there were + * no empty buffers to shed it to. If there is currently + * memory in the buffer, we move it down size bytes to make + * room for the valid pages that cluster_callback will insert. + * We do this now so we don't have to do it at interrupt time + * in the callback routine. + */ + if (tbp->b_bufsize != 0) { + caddr_t bdata = (char *)tbp->b_data; + + if (tbp->b_bufsize + size > MAXBSIZE) + panic("cluster_rbuild: too much memory"); + if (tbp->b_bufsize > size) { + /* + * XXX if the source and destination regions + * overlap we have to copy backward to avoid + * clobbering any valid pages (i.e. pagemove + * implementations typically can't handle + * overlap). + */ + bdata += tbp->b_bufsize; + while (bdata > (char *)tbp->b_data) { + bdata -= CLBYTES; + pagemove(bdata, bdata + size, CLBYTES); + } + } else + pagemove(bdata, bdata + size, tbp->b_bufsize); + } + tbp->b_blkno = bn; + tbp->b_flags |= flags | B_READ | B_ASYNC; + ++b_save->bs_nchildren; + b_save->bs_children[i - 1] = tbp; + } + return(bp); +} + +/* + * Either get a new buffer or grow the existing one. + */ +struct buf * +cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run) + struct vnode *vp; + struct buf *bp; + long flags; + daddr_t blkno; + daddr_t lblkno; + long size; + int run; +{ + if (!bp) { + bp = getblk(vp, lblkno, size, 0, 0); + if (bp->b_flags & (B_DONE | B_DELWRI)) { + bp->b_blkno = blkno; + return(bp); + } + } + allocbuf(bp, run * size); + bp->b_blkno = blkno; + bp->b_iodone = cluster_callback; + bp->b_flags |= flags | B_CALL; + return(bp); +} + +/* + * Cleanup after a clustered read or write. + * This is complicated by the fact that any of the buffers might have + * extra memory (if there were no empty buffer headers at allocbuf time) + * that we will need to shift around. 
+ *
+ * Installed as b_iodone by cluster_newbuf() and cluster_wbuild().  Each
+ * child recorded in the cluster_save receives bs_bufsize bytes from the
+ * cluster buffer via pagemove(), inherits any B_ERROR, and is completed
+ * with biodone(); the cluster_save is then freed.
+ */
+void
+cluster_callback(bp)
+	struct buf *bp;
+{
+	struct cluster_save *b_save;
+	struct buf **bpp, *tbp;
+	long bsize;
+	caddr_t cp;
+	int error = 0;
+
+	/*
+	 * Must propagate errors to all the components.
+	 */
+	if (bp->b_flags & B_ERROR)
+		error = bp->b_error;
+
+	b_save = (struct cluster_save *)(bp->b_saveaddr);
+	bp->b_saveaddr = b_save->bs_saveaddr;
+
+	bsize = b_save->bs_bufsize;
+	cp = (char *)bp->b_data + bsize;
+	/*
+	 * Move memory from the large cluster buffer into the component
+	 * buffers and mark IO as done on these.
+	 */
+	for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
+		tbp = *bpp;
+		pagemove(cp, tbp->b_data, bsize);
+		tbp->b_bufsize += bsize;
+		tbp->b_bcount = bsize;
+		if (error) {
+			tbp->b_flags |= B_ERROR;
+			tbp->b_error = error;
+		}
+		biodone(tbp);
+		bp->b_bufsize -= bsize;
+		cp += bsize;
+	}
+	/*
+	 * If there was excess memory in the cluster buffer,
+	 * slide it up adjacent to the remaining valid data.
+	 */
+	if (bp->b_bufsize != bsize) {
+		if (bp->b_bufsize < bsize)
+			panic("cluster_callback: too little memory");
+		pagemove(cp, (char *)bp->b_data + bsize, bp->b_bufsize - bsize);
+	}
+	bp->b_bcount = bsize;
+	bp->b_iodone = NULL;
+	free(b_save, M_SEGMENT);
+	if (bp->b_flags & B_ASYNC)
+		brelse(bp);
+	else {
+		bp->b_flags &= ~B_WANTED;
+		wakeup((caddr_t)bp);
+	}
+}
+
+/*
+ * Do clustered write for FFS.
+ *
+ * Four cases:
+ *	1. Write is not sequential (write asynchronously)
+ *	Write is sequential:
+ *	2.	beginning of cluster - begin cluster
+ *	3.	middle of a cluster - add to cluster
+ *	4.	end of a cluster - asynchronously write cluster
+ */
+void
+cluster_write(bp, filesize)
+	struct buf *bp;
+	u_quad_t filesize;
+{
+	struct vnode *vp;
+	daddr_t lbn;
+	int maxclen, cursize;
+
+	vp = bp->b_vp;
+	lbn = bp->b_lblkno;
+
+	/* Initialize vnode to beginning of file.
*/ + if (lbn == 0) + vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; + + if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || + (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) { + maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1; + if (vp->v_clen != 0) { + /* + * Next block is not sequential. + * + * If we are not writing at end of file, the process + * seeked to another point in the file since its + * last write, or we have reached our maximum + * cluster size, then push the previous cluster. + * Otherwise try reallocating to make it sequential. + */ + cursize = vp->v_lastw - vp->v_cstart + 1; + if (!doreallocblks || + (lbn + 1) * bp->b_bcount != filesize || + lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { + cluster_wbuild(vp, NULL, bp->b_bcount, + vp->v_cstart, cursize, lbn); + } else { + struct buf **bpp, **endbp; + struct cluster_save *buflist; + + buflist = cluster_collectbufs(vp, bp); + endbp = &buflist->bs_children + [buflist->bs_nchildren - 1]; + if (VOP_REALLOCBLKS(vp, buflist)) { + /* + * Failed, push the previous cluster. + */ + for (bpp = buflist->bs_children; + bpp < endbp; bpp++) + brelse(*bpp); + free(buflist, M_SEGMENT); + cluster_wbuild(vp, NULL, bp->b_bcount, + vp->v_cstart, cursize, lbn); + } else { + /* + * Succeeded, keep building cluster. + */ + for (bpp = buflist->bs_children; + bpp <= endbp; bpp++) + bdwrite(*bpp); + free(buflist, M_SEGMENT); + vp->v_lastw = lbn; + vp->v_lasta = bp->b_blkno; + return; + } + } + } + /* + * Consider beginning a cluster. + * If at end of file, make cluster as large as possible, + * otherwise find size of existing cluster. 
+		 */
+		if ((lbn + 1) * bp->b_bcount != filesize &&
+		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
+		    bp->b_blkno == -1)) {
+			bawrite(bp);
+			vp->v_clen = 0;
+			vp->v_lasta = bp->b_blkno;
+			vp->v_cstart = lbn + 1;
+			vp->v_lastw = lbn;
+			return;
+		}
+		vp->v_clen = maxclen;
+		if (maxclen == 0) {		/* I/O not contiguous */
+			vp->v_cstart = lbn + 1;
+			bawrite(bp);
+		} else {			/* Wait for rest of cluster */
+			vp->v_cstart = lbn;
+			bdwrite(bp);
+		}
+	} else if (lbn == vp->v_cstart + vp->v_clen) {
+		/*
+		 * At end of cluster, write it out.
+		 */
+		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
+		    vp->v_clen + 1, lbn);
+		vp->v_clen = 0;
+		vp->v_cstart = lbn + 1;
+	} else
+		/*
+		 * In the middle of a cluster, so just delay the
+		 * I/O for now.
+		 */
+		bdwrite(bp);
+	vp->v_lastw = lbn;
+	vp->v_lasta = bp->b_blkno;
+}
+
+
+/*
+ * This is an awful lot like cluster_rbuild...wish they could be combined.
+ * The last lbn argument is the current block on which I/O is being
+ * performed.  Check to see that it doesn't fall in the middle of
+ * the current block (if last_bp == NULL).
+ *
+ * Writes out the len blocks starting at start_lbn.  Blocks that are not
+ * in core, not B_DELWRI, or equal to lbn are skipped at the front of the
+ * range; when at most one block is left it is written on its own with
+ * bawrite() (last_bp if one was supplied).
+ */
+void
+cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
+	struct vnode *vp;
+	struct buf *last_bp;
+	long size;
+	daddr_t start_lbn;
+	int len;
+	daddr_t	lbn;
+{
+	struct cluster_save *b_save;
+	struct buf *bp, *tbp;
+	caddr_t	cp;
+	int i, s;
+
+#ifdef DIAGNOSTIC
+	/*
+	 * The value compared against is the filesystem I/O size, not the
+	 * file size, and both operands are printed as long.
+	 */
+	if (size != vp->v_mount->mnt_stat.f_iosize)
+		panic("cluster_wbuild: size %ld != f_iosize %ld\n",
+			size, (long)vp->v_mount->mnt_stat.f_iosize);
+#endif
+redo:
+	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
+		++start_lbn;
+		--len;
+	}
+
+	/* Get more memory for current buffer */
+	if (len <= 1) {
+		if (last_bp) {
+			bawrite(last_bp);
+		} else if (len) {
+			bp = getblk(vp, start_lbn, size, 0, 0);
+			bawrite(bp);
+		}
+		return;
+	}
+
+	bp = getblk(vp, start_lbn, size, 0, 0);
+	if (!(bp->b_flags & B_DELWRI)) {
+		++start_lbn;
+		--len;
+		brelse(bp);
+		goto redo;
+	}
+
+	/*
+	 * Extra memory in the buffer, punt on this buffer.
+	 * XXX we could handle this in most cases, but we would have to
+	 * push the extra memory down to after our max possible cluster
+	 * size and then potentially pull it back up if the cluster was
+	 * terminated prematurely--too much hassle.
+	 */
+	if (bp->b_bcount != bp->b_bufsize) {
+		++start_lbn;
+		--len;
+		bawrite(bp);
+		goto redo;
+	}
+
+	--len;
+	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
+	    M_SEGMENT, M_WAITOK);
+	b_save->bs_bcount = bp->b_bcount;
+	b_save->bs_bufsize = bp->b_bufsize;
+	b_save->bs_nchildren = 0;
+	b_save->bs_children = (struct buf **)(b_save + 1);
+	b_save->bs_saveaddr = bp->b_saveaddr;
+	bp->b_saveaddr = (caddr_t) b_save;
+
+	bp->b_flags |= B_CALL;
+	bp->b_iodone = cluster_callback;
+	cp = (char *)bp->b_data + size;
+	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
+		/*
+		 * Block is not in core or the non-sequential block
+		 * ending our cluster was part of the cluster (in which
+		 * case we don't want to write it twice).
+		 */
+		if (!incore(vp, start_lbn) ||
+		    last_bp == NULL && start_lbn == lbn)
+			break;
+
+		/*
+		 * Get the desired block buffer (unless it is the final
+		 * sequential block whose buffer was passed in explicitly
+		 * as last_bp).
+ */ + if (last_bp == NULL || start_lbn != lbn) { + tbp = getblk(vp, start_lbn, size, 0, 0); + if (!(tbp->b_flags & B_DELWRI)) { + brelse(tbp); + break; + } + } else + tbp = last_bp; + + ++b_save->bs_nchildren; + + /* Move memory from children to parent */ + if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) { + printf("Clustered Block: %d addr %x bufsize: %d\n", + bp->b_lblkno, bp->b_blkno, bp->b_bufsize); + printf("Child Block: %d addr: %x\n", tbp->b_lblkno, + tbp->b_blkno); + panic("Clustered write to wrong blocks"); + } + + pagemove(tbp->b_data, cp, size); + bp->b_bcount += size; + bp->b_bufsize += size; + + tbp->b_bufsize -= size; + tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); + tbp->b_flags |= (B_ASYNC | B_AGE); + s = splbio(); + reassignbuf(tbp, tbp->b_vp); /* put on clean list */ + ++tbp->b_vp->v_numoutput; + splx(s); + b_save->bs_children[i] = tbp; + + cp += size; + } + + if (i == 0) { + /* None to cluster */ + bp->b_saveaddr = b_save->bs_saveaddr; + bp->b_flags &= ~B_CALL; + bp->b_iodone = NULL; + free(b_save, M_SEGMENT); + } + bawrite(bp); + if (i < len) { + len -= i + 1; + start_lbn += 1; + goto redo; + } +} + +/* + * Collect together all the buffers in a cluster. + * Plus add one additional buffer. 
+ *
+ * The returned cluster_save and its child array are a single M_SEGMENT
+ * allocation.  Children 0 .. len-1 are the buffers bread() hands back
+ * for blocks v_cstart .. v_lastw; the final child is last_bp.
+ * NOTE(review): the bread() return value is discarded, so a read error
+ * is not reported from here -- confirm the callers tolerate that.
+ */
+struct cluster_save *
+cluster_collectbufs(vp, last_bp)
+	struct vnode *vp;
+	struct buf *last_bp;
+{
+	struct cluster_save *buflist;
+	daddr_t	lbn;
+	int i, len;
+
+	len = vp->v_lastw - vp->v_cstart + 1;
+	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
+	    M_SEGMENT, M_WAITOK);
+	buflist->bs_nchildren = 0;
+	buflist->bs_children = (struct buf **)(buflist + 1);
+	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
+		    (void)bread(vp, lbn, last_bp->b_bcount, NOCRED,
+			&buflist->bs_children[i]);
+	buflist->bs_children[i] = last_bp;
+	buflist->bs_nchildren = i + 1;
+	return (buflist);
+}
diff --git a/sys/kern/vfs_conf.c b/sys/kern/vfs_conf.c
new file mode 100644
index 00000000000..2fe39eb674b
--- /dev/null
+++ b/sys/kern/vfs_conf.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_conf.c 8.8 (Berkeley) 3/31/94 + */ + +#include +#include +#include + +#ifdef FFS +#include + +/* + * This specifies the filesystem used to mount the root. + * This specification should be done by /etc/config. + */ +int (*mountroot)() = ffs_mountroot; +#endif + +/* + * These define the root filesystem and device. + */ +struct mount *rootfs; +struct vnode *rootvnode; + +/* + * Set up the filesystem operations for vnodes. + * The types are defined in mount.h. 
+ */ +#ifdef FFS +extern struct vfsops ufs_vfsops; +#define UFS_VFSOPS &ufs_vfsops +#else +#define UFS_VFSOPS NULL +#endif + +#ifdef LFS +extern struct vfsops lfs_vfsops; +#define LFS_VFSOPS &lfs_vfsops +#else +#define LFS_VFSOPS NULL +#endif + +#ifdef MFS +extern struct vfsops mfs_vfsops; +#define MFS_VFSOPS &mfs_vfsops +#else +#define MFS_VFSOPS NULL +#endif + +#ifdef NFS +extern struct vfsops nfs_vfsops; +#define NFS_VFSOPS &nfs_vfsops +#else +#define NFS_VFSOPS NULL +#endif + +#ifdef FDESC +extern struct vfsops fdesc_vfsops; +#define FDESC_VFSOPS &fdesc_vfsops +#else +#define FDESC_VFSOPS NULL +#endif + +#ifdef PORTAL +extern struct vfsops portal_vfsops; +#define PORTAL_VFSOPS &portal_vfsops +#else +#define PORTAL_VFSOPS NULL +#endif + +#ifdef NULLFS +extern struct vfsops null_vfsops; +#define NULL_VFSOPS &null_vfsops +#else +#define NULL_VFSOPS NULL +#endif + +#ifdef UMAPFS +extern struct vfsops umap_vfsops; +#define UMAP_VFSOPS &umap_vfsops +#else +#define UMAP_VFSOPS NULL +#endif + +#ifdef KERNFS +extern struct vfsops kernfs_vfsops; +#define KERNFS_VFSOPS &kernfs_vfsops +#else +#define KERNFS_VFSOPS NULL +#endif + +#ifdef PROCFS +extern struct vfsops procfs_vfsops; +#define PROCFS_VFSOPS &procfs_vfsops +#else +#define PROCFS_VFSOPS NULL +#endif + +#ifdef AFS +extern struct vfsops afs_vfsops; +#define AFS_VFSOPS &afs_vfsops +#else +#define AFS_VFSOPS NULL +#endif + +#ifdef CD9660 +extern struct vfsops cd9660_vfsops; +#define CD9660_VFSOPS &cd9660_vfsops +#else +#define CD9660_VFSOPS NULL +#endif + +#ifdef UNION +extern struct vfsops union_vfsops; +#define UNION_VFSOPS &union_vfsops +#else +#define UNION_VFSOPS NULL +#endif + +struct vfsops *vfssw[] = { + NULL, /* 0 = MOUNT_NONE */ + UFS_VFSOPS, /* 1 = MOUNT_UFS */ + NFS_VFSOPS, /* 2 = MOUNT_NFS */ + MFS_VFSOPS, /* 3 = MOUNT_MFS */ + NULL, /* 4 = MOUNT_PC */ + LFS_VFSOPS, /* 5 = MOUNT_LFS */ + NULL, /* 6 = MOUNT_LOFS */ + FDESC_VFSOPS, /* 7 = MOUNT_FDESC */ + PORTAL_VFSOPS, /* 8 = MOUNT_PORTAL */ + 
NULL_VFSOPS,		/* 9 = MOUNT_NULL */
+	UMAP_VFSOPS,		/* 10 = MOUNT_UMAP */
+	KERNFS_VFSOPS,		/* 11 = MOUNT_KERNFS */
+	PROCFS_VFSOPS,		/* 12 = MOUNT_PROCFS */
+	AFS_VFSOPS,		/* 13 = MOUNT_AFS */
+	CD9660_VFSOPS,		/* 14 = MOUNT_CD9660 */
+	UNION_VFSOPS,		/* 15 = MOUNT_UNION */
+	0
+};
+
+
+/*
+ *
+ * vfs_opv_descs enumerates the list of vnode classes, each with its own
+ * vnode operation vector.  It is consulted at system boot to build operation
+ * vectors.  It is NULL terminated.
+ *
+ * The ffs, dead and spec entries are listed unconditionally; the others
+ * are compiled in only under their filesystem's (and, for fifo vectors,
+ * FIFO's) #ifdef.
+ *
+ */
+extern struct vnodeopv_desc ffs_vnodeop_opv_desc;
+extern struct vnodeopv_desc ffs_specop_opv_desc;
+extern struct vnodeopv_desc ffs_fifoop_opv_desc;
+extern struct vnodeopv_desc lfs_vnodeop_opv_desc;
+extern struct vnodeopv_desc lfs_specop_opv_desc;
+extern struct vnodeopv_desc lfs_fifoop_opv_desc;
+extern struct vnodeopv_desc mfs_vnodeop_opv_desc;
+extern struct vnodeopv_desc dead_vnodeop_opv_desc;
+extern struct vnodeopv_desc fifo_vnodeop_opv_desc;
+extern struct vnodeopv_desc spec_vnodeop_opv_desc;
+extern struct vnodeopv_desc nfsv2_vnodeop_opv_desc;
+extern struct vnodeopv_desc spec_nfsv2nodeop_opv_desc;
+extern struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc;
+extern struct vnodeopv_desc fdesc_vnodeop_opv_desc;
+extern struct vnodeopv_desc portal_vnodeop_opv_desc;
+extern struct vnodeopv_desc null_vnodeop_opv_desc;
+extern struct vnodeopv_desc umap_vnodeop_opv_desc;
+extern struct vnodeopv_desc kernfs_vnodeop_opv_desc;
+extern struct vnodeopv_desc procfs_vnodeop_opv_desc;
+extern struct vnodeopv_desc cd9660_vnodeop_opv_desc;
+extern struct vnodeopv_desc cd9660_specop_opv_desc;
+extern struct vnodeopv_desc cd9660_fifoop_opv_desc;
+extern struct vnodeopv_desc union_vnodeop_opv_desc;
+
+struct vnodeopv_desc *vfs_opv_descs[] = {
+	&ffs_vnodeop_opv_desc,
+	&ffs_specop_opv_desc,
+#ifdef FIFO
+	&ffs_fifoop_opv_desc,
+#endif
+	&dead_vnodeop_opv_desc,
+#ifdef FIFO
+	&fifo_vnodeop_opv_desc,
+#endif
+	&spec_vnodeop_opv_desc,
+#ifdef LFS
+	&lfs_vnodeop_opv_desc,
+	&lfs_specop_opv_desc,
+#ifdef
FIFO + &lfs_fifoop_opv_desc, +#endif +#endif +#ifdef MFS + &mfs_vnodeop_opv_desc, +#endif +#ifdef NFS + &nfsv2_vnodeop_opv_desc, + &spec_nfsv2nodeop_opv_desc, +#ifdef FIFO + &fifo_nfsv2nodeop_opv_desc, +#endif +#endif +#ifdef FDESC + &fdesc_vnodeop_opv_desc, +#endif +#ifdef PORTAL + &portal_vnodeop_opv_desc, +#endif +#ifdef NULLFS + &null_vnodeop_opv_desc, +#endif +#ifdef UMAPFS + &umap_vnodeop_opv_desc, +#endif +#ifdef KERNFS + &kernfs_vnodeop_opv_desc, +#endif +#ifdef PROCFS + &procfs_vnodeop_opv_desc, +#endif +#ifdef CD9660 + &cd9660_vnodeop_opv_desc, + &cd9660_specop_opv_desc, +#ifdef FIFO + &cd9660_fifoop_opv_desc, +#endif +#endif +#ifdef UNION + &union_vnodeop_opv_desc, +#endif + NULL +}; diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c new file mode 100644 index 00000000000..9891fe61c19 --- /dev/null +++ b/sys/kern/vfs_export.c @@ -0,0 +1,1322 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 + */ + +/* + * External virtual filesystem routines + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +enum vtype iftovt_tab[16] = { + VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, + VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, +}; +int vttoif_tab[9] = { + 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, + S_IFSOCK, S_IFIFO, S_IFMT, +}; + +/* + * Insq/Remq for the vnode usage lists. 
+ *
+ * bufremvn() also stamps le_next with NOLIST, the marker that brelvp()
+ * and reassignbuf() test before trying to unlink a buffer.
+ */
+#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
+#define	bufremvn(bp) {  \
+	LIST_REMOVE(bp, b_vnbufs); \
+	(bp)->b_vnbufs.le_next = NOLIST; \
+}
+
+TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
+struct mntlist mountlist;			/* mounted filesystem list */
+
+/*
+ * Initialize the vnode management data structures.
+ * Sets up the heads of the vnode free list and of the mount list.
+ */
+vntblinit()
+{
+
+	TAILQ_INIT(&vnode_free_list);
+	TAILQ_INIT(&mountlist);
+}
+
+/*
+ * Lock a filesystem.
+ * Used to prevent access to it while mounting and unmounting.
+ *
+ * Sleeps on mp (flagging MNT_MWAIT) for as long as MNT_MLOCK is set,
+ * then sets MNT_MLOCK itself.  Always returns 0.
+ */
+vfs_lock(mp)
+	register struct mount *mp;
+{
+
+	while(mp->mnt_flag & MNT_MLOCK) {
+		mp->mnt_flag |= MNT_MWAIT;
+		sleep((caddr_t)mp, PVFS);
+	}
+	mp->mnt_flag |= MNT_MLOCK;
+	return (0);
+}
+
+/*
+ * Unlock a locked filesystem.
+ * Panic if filesystem is not locked.
+ * Wakes the sleepers on mp if any of them set MNT_MWAIT.
+ */
+void
+vfs_unlock(mp)
+	register struct mount *mp;
+{
+
+	if ((mp->mnt_flag & MNT_MLOCK) == 0)
+		panic("vfs_unlock: not locked");
+	mp->mnt_flag &= ~MNT_MLOCK;
+	if (mp->mnt_flag & MNT_MWAIT) {
+		mp->mnt_flag &= ~MNT_MWAIT;
+		wakeup((caddr_t)mp);
+	}
+}
+
+/*
+ * Mark a mount point as busy.
+ * Used to synchronize access and to delay unmounting.
+ *
+ * Sleeps on &mp->mnt_flag (flagging MNT_MPWANT) while MNT_MPBUSY is set.
+ * Returns 1, without marking the mount busy, if MNT_UNMOUNT is set once
+ * the wait is over; otherwise sets MNT_MPBUSY and returns 0.
+ */
+vfs_busy(mp)
+	register struct mount *mp;
+{
+
+	while(mp->mnt_flag & MNT_MPBUSY) {
+		mp->mnt_flag |= MNT_MPWANT;
+		sleep((caddr_t)&mp->mnt_flag, PVFS);
+	}
+	if (mp->mnt_flag & MNT_UNMOUNT)
+		return (1);
+	mp->mnt_flag |= MNT_MPBUSY;
+	return (0);
+}
+
+/*
+ * Free a busy filesystem.
+ * Panic if filesystem is not busy.
+ * Wakes the sleepers on &mp->mnt_flag if any of them set MNT_MPWANT.
+ */
+vfs_unbusy(mp)
+	register struct mount *mp;
+{
+
+	if ((mp->mnt_flag & MNT_MPBUSY) == 0)
+		panic("vfs_unbusy: not busy");
+	mp->mnt_flag &= ~MNT_MPBUSY;
+	if (mp->mnt_flag & MNT_MPWANT) {
+		mp->mnt_flag &= ~MNT_MPWANT;
+		wakeup((caddr_t)&mp->mnt_flag);
+	}
+}
+
+/*
+ * Lookup a mount point by filesystem identifier.
+ *
+ * Walks mountlist comparing both words of f_fsid and returns the first
+ * matching mount, or a null pointer when none matches.
+ */
+struct mount *
+getvfs(fsid)
+	fsid_t *fsid;
+{
+	register struct mount *mp;
+
+	for (mp = mountlist.tqh_first; mp != NULL; mp = mp->mnt_list.tqe_next) {
+		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
+		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
+			return (mp);
+	}
+	return ((struct mount *)0);
+}
+
+/*
+ * Get a new unique fsid
+ *
+ * val[1] is set to mtype.  val[0] starts as
+ * makedev(nblkdev + mtype, xxxfs_mntid) and is incremented, together
+ * with the static counter xxxfs_mntid, until getvfs() finds no mounted
+ * filesystem already using the candidate.  A counter value of 0 is
+ * bumped to 1 on entry.
+ */
+void
+getnewfsid(mp, mtype)
+	struct mount *mp;
+	int mtype;
+{
+static u_short xxxfs_mntid;
+
+	fsid_t tfsid;
+
+	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
+	mp->mnt_stat.f_fsid.val[1] = mtype;
+	if (xxxfs_mntid == 0)
+		++xxxfs_mntid;
+	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
+	tfsid.val[1] = mtype;
+	if (mountlist.tqh_first != NULL) {
+		while (getvfs(&tfsid)) {
+			tfsid.val[0]++;
+			xxxfs_mntid++;
+		}
+	}
+	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
+}
+
+/*
+ * Set vnode attributes to VNOVAL
+ *
+ * The exceptions are va_type, which becomes VNON, and va_vaflags,
+ * which is cleared to 0.
+ */
+void vattr_null(vap)
+	register struct vattr *vap;
+{
+
+	vap->va_type = VNON;
+	vap->va_size = vap->va_bytes = VNOVAL;
+	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
+		vap->va_fsid = vap->va_fileid =
+		vap->va_blocksize = vap->va_rdev =
+		vap->va_atime.ts_sec = vap->va_atime.ts_nsec =
+		vap->va_mtime.ts_sec = vap->va_mtime.ts_nsec =
+		vap->va_ctime.ts_sec = vap->va_ctime.ts_nsec =
+		vap->va_flags = vap->va_gen = VNOVAL;
+	vap->va_vaflags = 0;
+}
+
+/*
+ * Routines having to do with the management of the vnode table.
+ */
+extern int (**dead_vnodeop_p)();
+extern void vclean();
+long numvnodes;
+extern struct vattr va_null;
+
+/*
+ * Return the next vnode from the free list.
+ *
+ * A fresh vnode is malloc'ed and zeroed while numvnodes is below
+ * desiredvnodes, or while the free list is empty and numvnodes is below
+ * 2 * desiredvnodes.  Otherwise the head of the free list is recycled
+ * (vgone'd first unless it is already VBAD).  On success *vpp is the
+ * vnode, with v_usecount 1, and 0 is returned; with nothing to recycle
+ * *vpp is cleared and ENFILE is returned.
+ */
+getnewvnode(tag, mp, vops, vpp)
+	enum vtagtype tag;
+	struct mount *mp;
+	int (**vops)();
+	struct vnode **vpp;
+{
+	register struct vnode *vp;
+	int s;
+
+	if ((vnode_free_list.tqh_first == NULL &&
+	     numvnodes < 2 * desiredvnodes) ||
+	    numvnodes < desiredvnodes) {
+		vp = (struct vnode *)malloc((u_long)sizeof *vp,
+		    M_VNODE, M_WAITOK);
+		bzero((char *)vp, sizeof *vp);
+		numvnodes++;
+	} else {
+		if ((vp = vnode_free_list.tqh_first) == NULL) {
+			tablefull("vnode");
+			*vpp = 0;
+			return (ENFILE);
+		}
+		if (vp->v_usecount)
+			panic("free vnode isn't");
+		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+		/* see comment on why 0xdeadb is set at end of vgone (below) */
+		vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
+		vp->v_lease = NULL;
+		if (vp->v_type != VBAD)
+			vgone(vp);
+#ifdef DIAGNOSTIC
+		if (vp->v_data)
+			panic("cleaned vnode isn't");
+		s = splbio();
+		if (vp->v_numoutput)
+			panic("Clean vnode has pending I/O's");
+		splx(s);
+#endif
+		vp->v_flag = 0;
+		vp->v_lastr = 0;
+		vp->v_ralen = 0;
+		vp->v_maxra = 0;
+		vp->v_lastw = 0;
+		vp->v_lasta = 0;
+		vp->v_cstart = 0;
+		vp->v_clen = 0;
+		vp->v_socket = 0;
+	}
+	vp->v_type = VNON;
+	cache_purge(vp);
+	vp->v_tag = tag;
+	vp->v_op = vops;
+	insmntque(vp, mp);
+	*vpp = vp;
+	vp->v_usecount = 1;
+	vp->v_data = 0;
+	return (0);
+}
+
+/*
+ * Move a vnode from one mount queue to another.
+ * A NULL mp only takes the vnode off its current mount's list.
+ */
+insmntque(vp, mp)
+	register struct vnode *vp;
+	register struct mount *mp;
+{
+
+	/*
+	 * Delete from old mount point vnode list, if on one.
+	 */
+	if (vp->v_mount != NULL)
+		LIST_REMOVE(vp, v_mntvnodes);
+	/*
+	 * Insert into list of vnodes for the new mount point, if available.
+	 */
+	if ((vp->v_mount = mp) == NULL)
+		return;
+	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
+}
+
+/*
+ * Update outstanding I/O count and do wakeup if requested.
+ *
+ * Clears B_WRITEINPROG on the buffer.  If the buffer has a vnode, its
+ * v_numoutput is decremented (a negative result panics) and, when the
+ * vnode is flagged VBWAIT and no output remains, the flag is cleared
+ * and the sleepers on &vp->v_numoutput are woken.
+ */
+vwakeup(bp)
+	register struct buf *bp;
+{
+	register struct vnode *vp;
+
+	bp->b_flags &= ~B_WRITEINPROG;
+	if ((vp = bp->b_vp) != NULL) {
+		vp->v_numoutput--;
+		if (vp->v_numoutput < 0)
+			panic("vwakeup: neg numoutput");
+		/* A negative count has already panicked just above. */
+		if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
+			vp->v_flag &= ~VBWAIT;
+			wakeup((caddr_t)&vp->v_numoutput);
+		}
+	}
+}
+
+/*
+ * Flush out and invalidate all buffers associated with a vnode.
+ * Called with the underlying object locked.
+ *
+ * With V_SAVE the vnode is fsync'ed first and any delayed write found
+ * later is written rather than discarded; with V_SAVEMETA buffers with
+ * a negative logical block number are left alone.  Returns 0, or the
+ * error from VOP_FSYNC() or from an interrupted tsleep().
+ */
+int
+vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
+	register struct vnode *vp;
+	int flags;
+	struct ucred *cred;
+	struct proc *p;
+	int slpflag, slptimeo;
+{
+	register struct buf *bp;
+	struct buf *nbp, *blist;
+	int s, error;
+
+	if (flags & V_SAVE) {
+		if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p))
+			return (error);
+		if (vp->v_dirtyblkhd.lh_first != NULL)
+			panic("vinvalbuf: dirty bufs");
+	}
+	for (;;) {
+		if ((blist = vp->v_cleanblkhd.lh_first) && flags & V_SAVEMETA)
+			while (blist && blist->b_lblkno < 0)
+				blist = blist->b_vnbufs.le_next;
+		if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
+		    (flags & V_SAVEMETA))
+			while (blist && blist->b_lblkno < 0)
+				blist = blist->b_vnbufs.le_next;
+		if (!blist)
+			break;
+
+		for (bp = blist; bp; bp = nbp) {
+			nbp = bp->b_vnbufs.le_next;
+			if (flags & V_SAVEMETA && bp->b_lblkno < 0)
+				continue;
+			s = splbio();
+			if (bp->b_flags & B_BUSY) {
+				bp->b_flags |= B_WANTED;
+				error = tsleep((caddr_t)bp,
+					slpflag | (PRIBIO + 1), "vinvalbuf",
+					slptimeo);
+				splx(s);
+				if (error)
+					return (error);
+				break;
+			}
+			bremfree(bp);
+			bp->b_flags |= B_BUSY;
+			splx(s);
+			/*
+			 * XXX Since there are no node locks for NFS, I believe
+			 * there is a slight chance that a delayed write will
+			 * occur while sleeping just above, so check for it.
+			 */
+			if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
+				(void) VOP_BWRITE(bp);
+				break;
+			}
+			bp->b_flags |= B_INVAL;
+			brelse(bp);
+		}
+	}
+	if (!(flags & V_SAVEMETA) &&
+	    (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
+		panic("vinvalbuf: flush failed");
+	return (0);
+}
+
+/*
+ * Associate a buffer with a vnode.
+ *
+ * Panics if the buffer already has a vnode.  Takes a hold on the vnode,
+ * sets b_dev to the vnode's v_rdev for VBLK/VCHR vnodes (NODEV for all
+ * others) and puts the buffer on the vnode's clean list.
+ */
+bgetvp(vp, bp)
+	register struct vnode *vp;
+	register struct buf *bp;
+{
+
+	if (bp->b_vp)
+		panic("bgetvp: not free");
+	VHOLD(vp);
+	bp->b_vp = vp;
+	if (vp->v_type == VBLK || vp->v_type == VCHR)
+		bp->b_dev = vp->v_rdev;
+	else
+		bp->b_dev = NODEV;
+	/*
+	 * Insert onto list for new vnode.
+	 */
+	bufinsvn(bp, &vp->v_cleanblkhd);
+}
+
+/*
+ * Disassociate a buffer from a vnode.
+ *
+ * Panics if the buffer has no vnode; otherwise unlinks it from the
+ * vnode's buffer list, clears b_vp and drops the hold on the vnode.
+ */
+brelvp(bp)
+	register struct buf *bp;
+{
+	struct vnode *vp;
+
+	if (bp->b_vp == (struct vnode *) 0)
+		panic("brelvp: NULL");
+	/*
+	 * Delete from old vnode list, if on one.
+	 */
+	if (bp->b_vnbufs.le_next != NOLIST)
+		bufremvn(bp);
+	vp = bp->b_vp;
+	bp->b_vp = (struct vnode *) 0;
+	HOLDRELE(vp);
+}
+
+/*
+ * Reassign a buffer from one vnode to another.
+ * Used to assign file specific control information
+ * (indirect blocks) to the vnode to which they belong.
+ *
+ * Only list membership changes here: the buffer goes onto newvp's dirty
+ * list if it is B_DELWRI and onto its clean list otherwise; b_vp itself
+ * is not modified.  A NULL newvp is reported with printf() and ignored.
+ */
+reassignbuf(bp, newvp)
+	register struct buf *bp;
+	register struct vnode *newvp;
+{
+	register struct buflists *listheadp;
+
+	if (newvp == NULL) {
+		printf("reassignbuf: NULL");
+		return;
+	}
+	/*
+	 * Delete from old vnode list, if on one.
+	 */
+	if (bp->b_vnbufs.le_next != NOLIST)
+		bufremvn(bp);
+	/*
+	 * If dirty, put on list of dirty buffers;
+	 * otherwise insert onto list of clean buffers.
+	 */
+	if (bp->b_flags & B_DELWRI)
+		listheadp = &newvp->v_dirtyblkhd;
+	else
+		listheadp = &newvp->v_cleanblkhd;
+	bufinsvn(bp, listheadp);
+}
+
+/*
+ * Create a vnode for a block device.
+ * Used for root filesystem, argdev, and swap areas.
+ * Also used for memory file system special devices.
+ *
+ * Returns 0 with *vpp set to the device vnode (an existing alias found
+ * by checkalias() is used in place of the fresh one), or the error from
+ * getnewvnode() with *vpp cleared.  For NODEV nothing is created: 0 is
+ * returned and *vpp is cleared so the caller never sees a stale pointer.
+ */
+bdevvp(dev, vpp)
+	dev_t dev;
+	struct vnode **vpp;
+{
+	register struct vnode *vp;
+	struct vnode *nvp;
+	int error;
+
+	if (dev == NODEV) {
+		/* Do not leave the caller's pointer uninitialized. */
+		*vpp = NULLVP;
+		return (0);
+	}
+	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
+	if (error) {
+		*vpp = 0;
+		return (error);
+	}
+	vp = nvp;
+	vp->v_type = VBLK;
+	if (nvp = checkalias(vp, dev, (struct mount *)0)) {
+		vput(vp);
+		vp = nvp;
+	}
+	*vpp = vp;
+	return (0);
+}
+
+/*
+ * Check to see if the new vnode represents a special device
+ * for which we already have a vnode (either because of
+ * bdevvp() or because of a different vnode representing
+ * the same block device). If such an alias exists, deallocate
+ * the existing contents and return the aliased vnode. The
+ * caller is responsible for filling it with its new contents.
+ *
+ * NULLVP is returned when nvp is not a block or character device, and
+ * when no reusable alias exists; in the latter case nvp is given a
+ * specinfo and linked onto the hash chain for nvp_rdev, and both nvp
+ * and any in-use alias are flagged VALIASED.
+ */
+struct vnode *
+checkalias(nvp, nvp_rdev, mp)
+	register struct vnode *nvp;
+	dev_t nvp_rdev;
+	struct mount *mp;
+{
+	register struct vnode *vp;
+	struct vnode **vpp;
+
+	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
+		return (NULLVP);
+
+	vpp = &speclisth[SPECHASH(nvp_rdev)];
+loop:
+	for (vp = *vpp; vp; vp = vp->v_specnext) {
+		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
+			continue;
+		/*
+		 * Alias, but not in use, so flush it out.
+		 */
+		if (vp->v_usecount == 0) {
+			vgone(vp);
+			goto loop;
+		}
+		if (vget(vp, 1))
+			goto loop;
+		break;
+	}
+	if (vp == NULL || vp->v_tag != VT_NON) {
+		MALLOC(nvp->v_specinfo, struct specinfo *,
+			sizeof(struct specinfo), M_VNODE, M_WAITOK);
+		nvp->v_rdev = nvp_rdev;
+		nvp->v_hashchain = vpp;
+		nvp->v_specnext = *vpp;
+		nvp->v_specflags = 0;
+		*vpp = nvp;
+		if (vp != NULL) {
+			nvp->v_flag |= VALIASED;
+			vp->v_flag |= VALIASED;
+			vput(vp);
+		}
+		return (NULLVP);
+	}
+	VOP_UNLOCK(vp);
+	vclean(vp, 0);
+	vp->v_op = nvp->v_op;
+	vp->v_tag = nvp->v_tag;
+	nvp->v_type = VNON;
+	insmntque(vp, mp);
+	return (vp);
+}
+
+/*
+ * Grab a particular vnode from the free list, increment its
+ * reference count and lock it.
The vnode lock bit is set if the
+ * vnode is being eliminated in vgone. The process is awakened
+ * when the transition is completed, and an error returned to
+ * indicate that the vnode is no longer usable (possibly having
+ * been changed to a new file system type).
+ *
+ * Returns 1 after waiting out a vnode that is being cleaned, otherwise
+ * 0 with the use count raised; the vnode is locked with VOP_LOCK()
+ * only when lockflag is nonzero.
+ */
+vget(vp, lockflag)
+	register struct vnode *vp;
+	int lockflag;
+{
+
+	/*
+	 * If the vnode is in the process of being cleaned out for
+	 * another use, we wait for the cleaning to finish and then
+	 * return failure. Cleaning is determined either by checking
+	 * that the VXLOCK flag is set, or that the use count is
+	 * zero with the back pointer set to show that it has been
+	 * removed from the free list by getnewvnode. The VXLOCK
+	 * flag may not have been set yet because vclean is blocked in
+	 * the VOP_LOCK call waiting for the VOP_INACTIVE to complete.
+	 */
+	if ((vp->v_flag & VXLOCK) ||
+	    (vp->v_usecount == 0 &&
+	     vp->v_freelist.tqe_prev == (struct vnode **)0xdeadb)) {
+		vp->v_flag |= VXWANT;
+		sleep((caddr_t)vp, PINOD);
+		return (1);
+	}
+	if (vp->v_usecount == 0)
+		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+	vp->v_usecount++;
+	if (lockflag)
+		VOP_LOCK(vp);
+	return (0);
+}
+
+/*
+ * Vnode reference, just increment the count
+ * Panics if the count is not already positive.
+ */
+void vref(vp)
+	struct vnode *vp;
+{
+
+	if (vp->v_usecount <= 0)
+		panic("vref used where vget required");
+	vp->v_usecount++;
+}
+
+/*
+ * vput(), just unlock and vrele()
+ */
+void vput(vp)
+	register struct vnode *vp;
+{
+
+	VOP_UNLOCK(vp);
+	vrele(vp);
+}
+
+/*
+ * Vnode release.
+ * If count drops to zero, call inactive routine and return to freelist.
+ * The vnode is appended to the free list before VOP_INACTIVE() is
+ * called on it.
+ */
+void vrele(vp)
+	register struct vnode *vp;
+{
+
+#ifdef DIAGNOSTIC
+	if (vp == NULL)
+		panic("vrele: null vp");
+#endif
+	vp->v_usecount--;
+	if (vp->v_usecount > 0)
+		return;
+#ifdef DIAGNOSTIC
+	if (vp->v_usecount != 0 || vp->v_writecount != 0) {
+		vprint("vrele: bad ref count", vp);
+		panic("vrele: ref cnt");
+	}
+#endif
+	/*
+	 * insert at tail of LRU list
+	 */
+	TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+	VOP_INACTIVE(vp);
+}
+
+/*
+ * Page or buffer structure gets a reference.
+ */
+void vhold(vp)
+	register struct vnode *vp;
+{
+
+	vp->v_holdcnt++;
+}
+
+/*
+ * Page or buffer structure frees a reference.
+ * Panics if the hold count is not positive.
+ */
+void holdrele(vp)
+	register struct vnode *vp;
+{
+
+	if (vp->v_holdcnt <= 0)
+		panic("holdrele: holdcnt");
+	vp->v_holdcnt--;
+}
+
+/*
+ * Remove any vnodes in the vnode table belonging to mount point mp.
+ *
+ * If MNT_NOFORCE is specified, there should not be any active ones,
+ * return error if any are found (nb: this is a user error, not a
+ * system error). If MNT_FORCE is specified, detach any active vnodes
+ * that are found.
+ *
+ * The flags argument itself is tested against SKIPSYSTEM, WRITECLOSE
+ * and FORCECLOSE.  The mount must already be marked MNT_MPBUSY.
+ */
+#ifdef DIAGNOSTIC
+int busyprt = 0;	/* print out busy vnodes */
+struct ctldebug debug1 = { "busyprt", &busyprt };
+#endif
+
+vflush(mp, skipvp, flags)
+	struct mount *mp;
+	struct vnode *skipvp;
+	int flags;
+{
+	register struct vnode *vp, *nvp;
+	int busy = 0;
+
+	if ((mp->mnt_flag & MNT_MPBUSY) == 0)
+		panic("vflush: not busy");
+loop:
+	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
+		if (vp->v_mount != mp)
+			goto loop;
+		nvp = vp->v_mntvnodes.le_next;
+		/*
+		 * Skip over a selected vnode.
+		 */
+		if (vp == skipvp)
+			continue;
+		/*
+		 * Skip over vnodes marked VSYSTEM.
+		 */
+		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM))
+			continue;
+		/*
+		 * If WRITECLOSE is set, only flush out regular file
+		 * vnodes open for writing.
+ */ + if ((flags & WRITECLOSE) && + (vp->v_writecount == 0 || vp->v_type != VREG)) + continue; + /* + * With v_usecount == 0, all we need to do is clear + * out the vnode data structures and we are done. + */ + if (vp->v_usecount == 0) { + vgone(vp); + continue; + } + /* + * If FORCECLOSE is set, forcibly close the vnode. + * For block or character devices, revert to an + * anonymous device. For all other files, just kill them. + */ + if (flags & FORCECLOSE) { + if (vp->v_type != VBLK && vp->v_type != VCHR) { + vgone(vp); + } else { + vclean(vp, 0); + vp->v_op = spec_vnodeop_p; + insmntque(vp, (struct mount *)0); + } + continue; + } +#ifdef DIAGNOSTIC + if (busyprt) + vprint("vflush: busy vnode", vp); +#endif + busy++; + } + if (busy) + return (EBUSY); + return (0); +} + +/* + * Disassociate the underlying file system from a vnode. + */ +void +vclean(vp, flags) + register struct vnode *vp; + int flags; +{ + int active; + + /* + * Check to see if the vnode is in use. + * If so we have to reference it before we clean it out + * so that its count cannot fall to zero and generate a + * race against ourselves to recycle it. + */ + if (active = vp->v_usecount) + VREF(vp); + /* + * Even if the count is zero, the VOP_INACTIVE routine may still + * have the object locked while it cleans it out. The VOP_LOCK + * ensures that the VOP_INACTIVE routine is done with its work. + * For active vnodes, it ensures that no other activity can + * occur while the underlying object is being cleaned out. + */ + VOP_LOCK(vp); + /* + * Prevent the vnode from being recycled or + * brought into use while we clean it out. + */ + if (vp->v_flag & VXLOCK) + panic("vclean: deadlock"); + vp->v_flag |= VXLOCK; + /* + * Clean out any buffers associated with the vnode. + */ + if (flags & DOCLOSE) + vinvalbuf(vp, V_SAVE, NOCRED, NULL, 0, 0); + /* + * Any other processes trying to obtain this lock must first + * wait for VXLOCK to clear, then call the new lock operation. 
+ */ + VOP_UNLOCK(vp); + /* + * If purging an active vnode, it must be closed and + * deactivated before being reclaimed. + */ + if (active) { + if (flags & DOCLOSE) + VOP_CLOSE(vp, IO_NDELAY, NOCRED, NULL); + VOP_INACTIVE(vp); + } + /* + * Reclaim the vnode. + */ + if (VOP_RECLAIM(vp)) + panic("vclean: cannot reclaim"); + if (active) + vrele(vp); + + /* + * Done with purge, notify sleepers of the grim news. + */ + vp->v_op = dead_vnodeop_p; + vp->v_tag = VT_NON; + vp->v_flag &= ~VXLOCK; + if (vp->v_flag & VXWANT) { + vp->v_flag &= ~VXWANT; + wakeup((caddr_t)vp); + } +} + +/* + * Eliminate all activity associated with the requested vnode + * and with all vnodes aliased to the requested vnode. + */ +void vgoneall(vp) + register struct vnode *vp; +{ + register struct vnode *vq; + + if (vp->v_flag & VALIASED) { + /* + * If a vgone (or vclean) is already in progress, + * wait until it is done and return. + */ + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + sleep((caddr_t)vp, PINOD); + return; + } + /* + * Ensure that vp will not be vgone'd while we + * are eliminating its aliases. + */ + vp->v_flag |= VXLOCK; + while (vp->v_flag & VALIASED) { + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type || vp == vq) + continue; + vgone(vq); + break; + } + } + /* + * Remove the lock so that vgone below will + * really eliminate the vnode after which time + * vgone will awaken any sleepers. + */ + vp->v_flag &= ~VXLOCK; + } + vgone(vp); +} + +/* + * Eliminate all activity associated with a vnode + * in preparation for reuse. + */ +void vgone(vp) + register struct vnode *vp; +{ + register struct vnode *vq; + struct vnode *vx; + + /* + * If a vgone (or vclean) is already in progress, + * wait until it is done and return. + */ + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + sleep((caddr_t)vp, PINOD); + return; + } + /* + * Clean out the filesystem specific data. 
+ */ + vclean(vp, DOCLOSE); + /* + * Delete from old mount point vnode list, if on one. + */ + if (vp->v_mount != NULL) { + LIST_REMOVE(vp, v_mntvnodes); + vp->v_mount = NULL; + } + /* + * If special device, remove it from special device alias list. + */ + if (vp->v_type == VBLK || vp->v_type == VCHR) { + if (*vp->v_hashchain == vp) { + *vp->v_hashchain = vp->v_specnext; + } else { + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_specnext != vp) + continue; + vq->v_specnext = vp->v_specnext; + break; + } + if (vq == NULL) + panic("missing bdev"); + } + if (vp->v_flag & VALIASED) { + vx = NULL; + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type) + continue; + if (vx) + break; + vx = vq; + } + if (vx == NULL) + panic("missing alias"); + if (vq == NULL) + vx->v_flag &= ~VALIASED; + vp->v_flag &= ~VALIASED; + } + FREE(vp->v_specinfo, M_VNODE); + vp->v_specinfo = NULL; + } + /* + * If it is on the freelist and not already at the head, + * move it to the head of the list. The test of the back + * pointer and the reference count of zero is because + * it will be removed from the free list by getnewvnode, + * but will not have its reference count incremented until + * after calling vgone. If the reference count were + * incremented first, vgone would (incorrectly) try to + * close the previous instance of the underlying object. + * So, the back pointer is explicitly set to `0xdeadb' in + * getnewvnode after removing it from the freelist to ensure + * that we do not try to move it here. + */ + if (vp->v_usecount == 0 && + vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb && + vnode_free_list.tqh_first != vp) { + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + } + vp->v_type = VBAD; +} + +/* + * Lookup a vnode by device number. 
+ */ +vfinddev(dev, type, vpp) + dev_t dev; + enum vtype type; + struct vnode **vpp; +{ + register struct vnode *vp; + + for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) { + if (dev != vp->v_rdev || type != vp->v_type) + continue; + *vpp = vp; + return (1); + } + return (0); +} + +/* + * Calculate the total number of references to a special device. + */ +vcount(vp) + register struct vnode *vp; +{ + register struct vnode *vq, *vnext; + int count; + +loop: + if ((vp->v_flag & VALIASED) == 0) + return (vp->v_usecount); + for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) { + vnext = vq->v_specnext; + if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) + continue; + /* + * Alias, but not in use, so flush it out. + */ + if (vq->v_usecount == 0 && vq != vp) { + vgone(vq); + goto loop; + } + count += vq->v_usecount; + } + return (count); +} + +/* + * Print out a description of a vnode. + */ +static char *typename[] = + { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" }; + +vprint(label, vp) + char *label; + register struct vnode *vp; +{ + char buf[64]; + + if (label != NULL) + printf("%s: ", label); + printf("type %s, usecount %d, writecount %d, refcount %d,", + typename[vp->v_type], vp->v_usecount, vp->v_writecount, + vp->v_holdcnt); + buf[0] = '\0'; + if (vp->v_flag & VROOT) + strcat(buf, "|VROOT"); + if (vp->v_flag & VTEXT) + strcat(buf, "|VTEXT"); + if (vp->v_flag & VSYSTEM) + strcat(buf, "|VSYSTEM"); + if (vp->v_flag & VXLOCK) + strcat(buf, "|VXLOCK"); + if (vp->v_flag & VXWANT) + strcat(buf, "|VXWANT"); + if (vp->v_flag & VBWAIT) + strcat(buf, "|VBWAIT"); + if (vp->v_flag & VALIASED) + strcat(buf, "|VALIASED"); + if (buf[0] != '\0') + printf(" flags (%s)", &buf[1]); + if (vp->v_data == NULL) { + printf("\n"); + } else { + printf("\n\t"); + VOP_PRINT(vp); + } +} + +#ifdef DEBUG +/* + * List all of the locked vnodes in the system. + * Called when debugging the kernel. 
+ */ +printlockedvnodes() +{ + register struct mount *mp; + register struct vnode *vp; + + printf("Locked vnodes\n"); + for (mp = mountlist.tqh_first; mp != NULL; mp = mp->mnt_list.tqe_next) { + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = vp->v_mntvnodes.le_next) + if (VOP_ISLOCKED(vp)) + vprint((char *)0, vp); + } +} +#endif + +int kinfo_vdebug = 1; +int kinfo_vgetfailed; +#define KINFO_VNODESLOP 10 +/* + * Dump vnode list (via sysctl). + * Copyout address of vnode followed by vnode. + */ +/* ARGSUSED */ +sysctl_vnode(where, sizep) + char *where; + size_t *sizep; +{ + register struct mount *mp, *nmp; + struct vnode *vp; + register char *bp = where, *savebp; + char *ewhere; + int error; + +#define VPTRSZ sizeof (struct vnode *) +#define VNODESZ sizeof (struct vnode) + if (where == NULL) { + *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ); + return (0); + } + ewhere = where + *sizep; + + for (mp = mountlist.tqh_first; mp != NULL; mp = nmp) { + nmp = mp->mnt_list.tqe_next; + if (vfs_busy(mp)) + continue; + savebp = bp; +again: + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = vp->v_mntvnodes.le_next) { + /* + * Check that the vp is still associated with + * this filesystem. RACE: could have been + * recycled onto the same filesystem. + */ + if (vp->v_mount != mp) { + if (kinfo_vdebug) + printf("kinfo: vp changed\n"); + bp = savebp; + goto again; + } + if (bp + VPTRSZ + VNODESZ > ewhere) { + *sizep = bp - where; + return (ENOMEM); + } + if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) || + (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ))) + return (error); + bp += VPTRSZ + VNODESZ; + } + vfs_unbusy(mp); + } + + *sizep = bp - where; + return (0); +} + +/* + * Check to see if a filesystem is mounted on a block device. 
+ */ +int +vfs_mountedon(vp) + register struct vnode *vp; +{ + register struct vnode *vq; + + if (vp->v_specflags & SI_MOUNTEDON) + return (EBUSY); + if (vp->v_flag & VALIASED) { + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type) + continue; + if (vq->v_specflags & SI_MOUNTEDON) + return (EBUSY); + } + } + return (0); +} + +/* + * Build hash lists of net addresses and hang them off the mount point. + * Called by ufs_mount() to set up the lists of export addresses. + */ +static int +vfs_hang_addrlist(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + register struct netcred *np; + register struct radix_node_head *rnh; + register int i; + struct radix_node *rn; + struct sockaddr *saddr, *smask = 0; + struct domain *dom; + int error; + + if (argp->ex_addrlen == 0) { + if (mp->mnt_flag & MNT_DEFEXPORTED) + return (EPERM); + np = &nep->ne_defexported; + np->netc_exflags = argp->ex_flags; + np->netc_anon = argp->ex_anon; + np->netc_anon.cr_ref = 1; + mp->mnt_flag |= MNT_DEFEXPORTED; + return (0); + } + i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; + np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK); + bzero((caddr_t)np, i); + saddr = (struct sockaddr *)(np + 1); + if (error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen)) + goto out; + if (saddr->sa_len > argp->ex_addrlen) + saddr->sa_len = argp->ex_addrlen; + if (argp->ex_masklen) { + smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen); + error = copyin(argp->ex_addr, (caddr_t)smask, argp->ex_masklen); + if (error) + goto out; + if (smask->sa_len > argp->ex_masklen) + smask->sa_len = argp->ex_masklen; + } + i = saddr->sa_family; + if ((rnh = nep->ne_rtable[i]) == 0) { + /* + * Seems silly to initialize every AF when most are not + * used, do so on demand here + */ + for (dom = domains; dom; dom = dom->dom_next) + if (dom->dom_family == i && dom->dom_rtattach) { + 
dom->dom_rtattach((void **)&nep->ne_rtable[i], + dom->dom_rtoffset); + break; + } + if ((rnh = nep->ne_rtable[i]) == 0) { + error = ENOBUFS; + goto out; + } + } + rn = (*rnh->rnh_addaddr)((caddr_t)saddr, (caddr_t)smask, rnh, + np->netc_rnodes); + if (rn == 0 || np != (struct netcred *)rn) { /* already exists */ + error = EPERM; + goto out; + } + np->netc_exflags = argp->ex_flags; + np->netc_anon = argp->ex_anon; + np->netc_anon.cr_ref = 1; + return (0); +out: + free(np, M_NETADDR); + return (error); +} + +/* ARGSUSED */ +static int +vfs_free_netcred(rn, w) + struct radix_node *rn; + caddr_t w; +{ + register struct radix_node_head *rnh = (struct radix_node_head *)w; + + (*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh); + free((caddr_t)rn, M_NETADDR); + return (0); +} + +/* + * Free the net address hash lists that are hanging off the mount points. + */ +static void +vfs_free_addrlist(nep) + struct netexport *nep; +{ + register int i; + register struct radix_node_head *rnh; + + for (i = 0; i <= AF_MAX; i++) + if (rnh = nep->ne_rtable[i]) { + (*rnh->rnh_walktree)(rnh, vfs_free_netcred, + (caddr_t)rnh); + free((caddr_t)rnh, M_RTABLE); + nep->ne_rtable[i] = 0; + } +} + +int +vfs_export(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + int error; + + if (argp->ex_flags & MNT_DELEXPORT) { + vfs_free_addrlist(nep); + mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); + } + if (argp->ex_flags & MNT_EXPORTED) { + if (error = vfs_hang_addrlist(mp, nep, argp)) + return (error); + mp->mnt_flag |= MNT_EXPORTED; + } + return (0); +} + +struct netcred * +vfs_export_lookup(mp, nep, nam) + register struct mount *mp; + struct netexport *nep; + struct mbuf *nam; +{ + register struct netcred *np; + register struct radix_node_head *rnh; + struct sockaddr *saddr; + + np = NULL; + if (mp->mnt_flag & MNT_EXPORTED) { + /* + * Lookup in the export list first. 
+ */ + if (nam != NULL) { + saddr = mtod(nam, struct sockaddr *); + rnh = nep->ne_rtable[saddr->sa_family]; + if (rnh != NULL) { + np = (struct netcred *) + (*rnh->rnh_matchaddr)((caddr_t)saddr, + rnh); + if (np && np->netc_rnodes->rn_flags & RNF_ROOT) + np = NULL; + } + } + /* + * If no address match, use the default if it exists. + */ + if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) + np = &nep->ne_defexported; + } + return (np); +} diff --git a/sys/kern/vfs_extattr.c b/sys/kern/vfs_extattr.c new file mode 100644 index 00000000000..345c7a79bf2 --- /dev/null +++ b/sys/kern/vfs_extattr.c @@ -0,0 +1,2107 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static int change_dir __P((struct nameidata *ndp, struct proc *p)); + +/* + * Virtual File System System Calls + */ + +/* + * Mount a file system. + */ +struct mount_args { + int type; + char *path; + int flags; + caddr_t data; +}; +/* ARGSUSED */ +mount(p, uap, retval) + struct proc *p; + register struct mount_args *uap; + int *retval; +{ + register struct vnode *vp; + register struct mount *mp; + int error, flag; + struct nameidata nd; + + /* + * Must be super user + */ + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + /* + * Get vnode to be covered + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (uap->flags & MNT_UPDATE) { + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + mp = vp->v_mount; + flag = mp->mnt_flag; + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only. 
+ */ + if ((uap->flags & MNT_RELOAD) && + ((mp->mnt_flag & MNT_RDONLY) == 0)) { + vput(vp); + return (EOPNOTSUPP); /* Needs translation */ + } + mp->mnt_flag |= + uap->flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE); + VOP_UNLOCK(vp); + goto update; + } + if (error = vinvalbuf(vp, V_SAVE, p->p_ucred, p, 0, 0)) + return (error); + if (vp->v_type != VDIR) { + vput(vp); + return (ENOTDIR); + } + if ((u_long)uap->type > MOUNT_MAXTYPE || vfssw[uap->type] == NULL) { + vput(vp); + return (ENODEV); + } + + /* + * Allocate and initialize the file system. + */ + mp = (struct mount *)malloc((u_long)sizeof(struct mount), + M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + mp->mnt_op = vfssw[uap->type]; + if (error = vfs_lock(mp)) { + free((caddr_t)mp, M_MOUNT); + vput(vp); + return (error); + } + if (vp->v_mountedhere != NULL) { + vfs_unlock(mp); + free((caddr_t)mp, M_MOUNT); + vput(vp); + return (EBUSY); + } + vp->v_mountedhere = mp; + mp->mnt_vnodecovered = vp; +update: + /* + * Set the mount level flags. + */ + if (uap->flags & MNT_RDONLY) + mp->mnt_flag |= MNT_RDONLY; + else if (mp->mnt_flag & MNT_RDONLY) + mp->mnt_flag |= MNT_WANTRDWR; + mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | + MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC); + mp->mnt_flag |= uap->flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | + MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC); + /* + * Mount the filesystem. + */ + error = VFS_MOUNT(mp, uap->path, uap->data, &nd, p); + if (mp->mnt_flag & MNT_UPDATE) { + vrele(vp); + if (mp->mnt_flag & MNT_WANTRDWR) + mp->mnt_flag &= ~MNT_RDONLY; + mp->mnt_flag &=~ + (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_WANTRDWR); + if (error) + mp->mnt_flag = flag; + return (error); + } + /* + * Put the new filesystem on the mount list after root. 
+ */ + cache_purge(vp); + if (!error) { + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + VOP_UNLOCK(vp); + vfs_unlock(mp); + error = VFS_START(mp, 0, p); + } else { + mp->mnt_vnodecovered->v_mountedhere = (struct mount *)0; + vfs_unlock(mp); + free((caddr_t)mp, M_MOUNT); + vput(vp); + } + return (error); +} + +/* + * Unmount a file system. + * + * Note: unmount takes a path to the vnode mounted on as argument, + * not special file (as before). + */ +struct unmount_args { + char *path; + int flags; +}; +/* ARGSUSED */ +unmount(p, uap, retval) + struct proc *p; + register struct unmount_args *uap; + int *retval; +{ + register struct vnode *vp; + struct mount *mp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + + /* + * Unless this is a user mount, then must + * have suser privilege. + */ + if (((vp->v_mount->mnt_flag & MNT_USER) == 0) && + (error = suser(p->p_ucred, &p->p_acflag))) { + vput(vp); + return (error); + } + + /* + * Must be the root of the filesystem + */ + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + mp = vp->v_mount; + vput(vp); + return (dounmount(mp, uap->flags, p)); +} + +/* + * Do the actual file system unmount. 
+ */ +dounmount(mp, flags, p) + register struct mount *mp; + int flags; + struct proc *p; +{ + struct vnode *coveredvp; + int error; + + coveredvp = mp->mnt_vnodecovered; + if (vfs_busy(mp)) + return (EBUSY); + mp->mnt_flag |= MNT_UNMOUNT; + if (error = vfs_lock(mp)) + return (error); + + mp->mnt_flag &=~ MNT_ASYNC; + vnode_pager_umount(mp); /* release cached vnodes */ + cache_purgevfs(mp); /* remove cache entries for this file sys */ + if ((error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p)) == 0 || + (flags & MNT_FORCE)) + error = VFS_UNMOUNT(mp, flags, p); + mp->mnt_flag &= ~MNT_UNMOUNT; + vfs_unbusy(mp); + if (error) { + vfs_unlock(mp); + } else { + vrele(coveredvp); + TAILQ_REMOVE(&mountlist, mp, mnt_list); + mp->mnt_vnodecovered->v_mountedhere = (struct mount *)0; + vfs_unlock(mp); + if (mp->mnt_vnodelist.lh_first != NULL) + panic("unmount: dangling vnode"); + free((caddr_t)mp, M_MOUNT); + } + return (error); +} + +/* + * Sync each mounted filesystem. + */ +#ifdef DIAGNOSTIC +int syncprt = 0; +struct ctldebug debug0 = { "syncprt", &syncprt }; +#endif + +struct sync_args { + int dummy; +}; +/* ARGSUSED */ +sync(p, uap, retval) + struct proc *p; + struct sync_args *uap; + int *retval; +{ + register struct mount *mp, *nmp; + int asyncflag; + + for (mp = mountlist.tqh_first; mp != NULL; mp = nmp) { + nmp = mp->mnt_list.tqe_next; + /* + * The lock check below is to avoid races with mount + * and unmount. + */ + if ((mp->mnt_flag & (MNT_MLOCK|MNT_RDONLY|MNT_MPBUSY)) == 0 && + !vfs_busy(mp)) { + asyncflag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &= ~MNT_ASYNC; + VFS_SYNC(mp, MNT_NOWAIT, p->p_ucred, p); + if (asyncflag) + mp->mnt_flag |= MNT_ASYNC; + vfs_unbusy(mp); + } + } +#ifdef DIAGNOSTIC + if (syncprt) + vfs_bufstats(); +#endif /* DIAGNOSTIC */ + return (0); +} + +/* + * Change filesystem quotas. 
+ */ +struct quotactl_args { + char *path; + int cmd; + int uid; + caddr_t arg; +}; +/* ARGSUSED */ +quotactl(p, uap, retval) + struct proc *p; + register struct quotactl_args *uap; + int *retval; +{ + register struct mount *mp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + mp = nd.ni_vp->v_mount; + vrele(nd.ni_vp); + return (VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg, p)); +} + +/* + * Get filesystem statistics. + */ +struct statfs_args { + char *path; + struct statfs *buf; +}; +/* ARGSUSED */ +statfs(p, uap, retval) + struct proc *p; + register struct statfs_args *uap; + int *retval; +{ + register struct mount *mp; + register struct statfs *sp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + mp = nd.ni_vp->v_mount; + sp = &mp->mnt_stat; + vrele(nd.ni_vp); + if (error = VFS_STATFS(mp, sp, p)) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + return (copyout((caddr_t)sp, (caddr_t)uap->buf, sizeof(*sp))); +} + +/* + * Get filesystem statistics. + */ +struct fstatfs_args { + int fd; + struct statfs *buf; +}; +/* ARGSUSED */ +fstatfs(p, uap, retval) + struct proc *p; + register struct fstatfs_args *uap; + int *retval; +{ + struct file *fp; + struct mount *mp; + register struct statfs *sp; + int error; + + if (error = getvnode(p->p_fd, uap->fd, &fp)) + return (error); + mp = ((struct vnode *)fp->f_data)->v_mount; + sp = &mp->mnt_stat; + if (error = VFS_STATFS(mp, sp, p)) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + return (copyout((caddr_t)sp, (caddr_t)uap->buf, sizeof(*sp))); +} + +/* + * Get statistics on all filesystems. 
+ *
+ * Copies one struct statfs per mount point into uap->buf, up to
+ * uap->bufsize bytes.  Mount points with MNT_MLOCK set, or whose
+ * statistics refresh fails, are not copied and not counted.  When
+ * uap->buf is NULL nothing is copied and every mount point is
+ * counted.  *retval is the number of mount points counted, limited to
+ * the buffer capacity when a buffer is given.
+ */
+struct getfsstat_args {
+	struct statfs *buf;
+	long bufsize;
+	int flags;
+};
+getfsstat(p, uap, retval)
+	struct proc *p;
+	register struct getfsstat_args *uap;
+	int *retval;
+{
+	register struct mount *mp, *nmp;
+	register struct statfs *sp;
+	caddr_t sfsp;
+	long count, maxcount, error;
+
+	maxcount = uap->bufsize / sizeof(struct statfs);
+	sfsp = (caddr_t)uap->buf;
+	for (count = 0, mp = mountlist.tqh_first; mp != NULL; mp = nmp) {
+		nmp = mp->mnt_list.tqe_next;
+		if (sfsp && count < maxcount &&
+		    ((mp->mnt_flag & MNT_MLOCK) == 0)) {
+			sp = &mp->mnt_stat;
+			/*
+			 * If MNT_NOWAIT is specified, do not refresh the
+			 * fsstat cache. MNT_WAIT overrides MNT_NOWAIT.
+			 */
+			if (((uap->flags & MNT_NOWAIT) == 0 ||
+			    (uap->flags & MNT_WAIT)) &&
+			    (error = VFS_STATFS(mp, sp, p)))
+				continue;
+			sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+			if (error = copyout((caddr_t)sp, sfsp, sizeof(*sp)))
+				return (error);
+			sfsp += sizeof(*sp);
+		}
+		count++;
+	}
+	if (sfsp && count > maxcount)
+		*retval = maxcount;
+	else
+		*retval = count;
+	return (0);
+}
+
+/*
+ * Change current working directory to a given file descriptor.
+ *
+ * The descriptor must refer to a directory for which the caller has
+ * VEXEC access; otherwise ENOTDIR or the VOP_ACCESS error is returned.
+ */
+struct fchdir_args {
+	int	fd;
+};
+/* ARGSUSED */
+fchdir(p, uap, retval)
+	struct proc *p;
+	struct fchdir_args *uap;
+	int *retval;
+{
+	register struct filedesc *fdp = p->p_fd;
+	register struct vnode *vp;
+	struct file *fp;
+	int error;
+
+	if (error = getvnode(fdp, uap->fd, &fp))
+		return (error);
+	vp = (struct vnode *)fp->f_data;
+	VOP_LOCK(vp);
+	if (vp->v_type != VDIR)
+		error = ENOTDIR;
+	else
+		error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p);
+	VOP_UNLOCK(vp);
+	if (error)
+		return (error);
+	VREF(vp);	/* the new cwd holds its own reference */
+	vrele(fdp->fd_cdir);
+	fdp->fd_cdir = vp;
+	return (0);
+}
+
+/*
+ * Change current working directory (``.'').
+ */ +struct chdir_args { + char *path; +}; +/* ARGSUSED */ +chdir(p, uap, retval) + struct proc *p; + struct chdir_args *uap; + int *retval; +{ + register struct filedesc *fdp = p->p_fd; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p); + if (error = change_dir(&nd, p)) + return (error); + vrele(fdp->fd_cdir); + fdp->fd_cdir = nd.ni_vp; + return (0); +} + +/* + * Change notion of root (``/'') directory. + */ +struct chroot_args { + char *path; +}; +/* ARGSUSED */ +chroot(p, uap, retval) + struct proc *p; + struct chroot_args *uap; + int *retval; +{ + register struct filedesc *fdp = p->p_fd; + int error; + struct nameidata nd; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p); + if (error = change_dir(&nd, p)) + return (error); + if (fdp->fd_rdir != NULL) + vrele(fdp->fd_rdir); + fdp->fd_rdir = nd.ni_vp; + return (0); +} + +/* + * Common routine for chroot and chdir. + */ +static int +change_dir(ndp, p) + register struct nameidata *ndp; + struct proc *p; +{ + struct vnode *vp; + int error; + + if (error = namei(ndp)) + return (error); + vp = ndp->ni_vp; + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + VOP_UNLOCK(vp); + if (error) + vrele(vp); + return (error); +} + +/* + * Check permissions, allocate an open file structure, + * and call the device open routine if any. 
+ *
+ * On success *retval is the new descriptor index.  When vn_open fails
+ * with ENODEV or ENXIO and p->p_dupfd has been set non-negative, the
+ * request is completed by dupfdopen instead.  ERESTART from vn_open
+ * is reported as EINTR.  O_EXLOCK and O_SHLOCK take an advisory lock
+ * on the whole file before returning.
+ */
+struct open_args {
+	char	*path;
+	int	flags;
+	int	mode;
+};
+open(p, uap, retval)
+	struct proc *p;
+	register struct open_args *uap;
+	int *retval;
+{
+	register struct filedesc *fdp = p->p_fd;
+	register struct file *fp;
+	register struct vnode *vp;
+	int flags, cmode;
+	struct file *nfp;
+	int type, indx, error;
+	struct flock lf;
+	struct nameidata nd;
+	extern struct fileops vnops;
+
+	if (error = falloc(p, &nfp, &indx))
+		return (error);
+	fp = nfp;
+	flags = FFLAGS(uap->flags);
+	cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p);
+	p->p_dupfd = -indx - 1;			/* XXX check for fdopen */
+	if (error = vn_open(&nd, flags, cmode)) {
+		ffree(fp);
+		if ((error == ENODEV || error == ENXIO) &&
+		    p->p_dupfd >= 0 && 			/* XXX from fdopen */
+		    (error =
+		        dupfdopen(fdp, indx, p->p_dupfd, flags, error)) == 0) {
+			*retval = indx;
+			return (0);
+		}
+		if (error == ERESTART)
+			error = EINTR;
+		fdp->fd_ofiles[indx] = NULL;	/* give the descriptor slot back */
+		return (error);
+	}
+	p->p_dupfd = 0;
+	vp = nd.ni_vp;
+	fp->f_flag = flags & FMASK;
+	fp->f_type = DTYPE_VNODE;
+	fp->f_ops = &vnops;
+	fp->f_data = (caddr_t)vp;
+	if (flags & (O_EXLOCK | O_SHLOCK)) {
+		lf.l_whence = SEEK_SET;
+		lf.l_start = 0;
+		lf.l_len = 0;
+		if (flags & O_EXLOCK)
+			lf.l_type = F_WRLCK;
+		else
+			lf.l_type = F_RDLCK;
+		type = F_FLOCK;
+		if ((flags & FNONBLOCK) == 0)
+			type |= F_WAIT;
+		VOP_UNLOCK(vp);	/* vnode is not kept locked across VOP_ADVLOCK */
+		if (error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) {
+			(void) vn_close(vp, fp->f_flag, fp->f_cred, p);
+			ffree(fp);
+			fdp->fd_ofiles[indx] = NULL;
+			return (error);
+		}
+		VOP_LOCK(vp);
+		fp->f_flag |= FHASLOCK;
+	}
+	VOP_UNLOCK(vp);
+	*retval = indx;
+	return (0);
+}
+
+#ifdef COMPAT_43
+/*
+ * Create a file.
+ *
+ * Compatibility entry point: calls open with the flags
+ * O_WRONLY | O_CREAT | O_TRUNC and the caller's path and mode.
+ */
+struct ocreat_args {
+	char	*path;
+	int	mode;
+};
+ocreat(p, uap, retval)
+	struct proc *p;
+	register struct ocreat_args *uap;
+	int *retval;
+{
+	struct open_args openuap;
+
+	openuap.path = uap->path;
+	openuap.mode = uap->mode;
+	openuap.flags = O_WRONLY | O_CREAT | O_TRUNC;
+	return (open(p, &openuap, retval));
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Create a special file.
+ *
+ * Super-user only.  Returns EEXIST if the name already exists and
+ * EINVAL if the file type bits of uap->mode are not S_IFCHR, S_IFBLK
+ * or S_IFMT.  On those errors the pending create is aborted and the
+ * vnodes returned by namei are released.
+ */
+struct mknod_args {
+	char	*path;
+	int	mode;
+	int	dev;
+};
+/* ARGSUSED */
+mknod(p, uap, retval)
+	struct proc *p;
+	register struct mknod_args *uap;
+	int *retval;
+{
+	register struct vnode *vp;
+	struct vattr vattr;
+	int error;
+	struct nameidata nd;
+
+	if (error = suser(p->p_ucred, &p->p_acflag))
+		return (error);
+	NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, uap->path, p);
+	if (error = namei(&nd))
+		return (error);
+	vp = nd.ni_vp;
+	if (vp != NULL)
+		error = EEXIST;
+	else {
+		VATTR_NULL(&vattr);
+		vattr.va_mode = (uap->mode & ALLPERMS) &~ p->p_fd->fd_cmask;
+		vattr.va_rdev = uap->dev;
+
+		switch (uap->mode & S_IFMT) {
+		case S_IFMT:	/* used by badsect to flag bad sectors */
+			vattr.va_type = VBAD;
+			break;
+		case S_IFCHR:
+			vattr.va_type = VCHR;
+			break;
+		case S_IFBLK:
+			vattr.va_type = VBLK;
+			break;
+		default:
+			error = EINVAL;
+			break;
+		}
+	}
+	if (!error) {
+		LEASE_CHECK(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+		error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+	} else {
+		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+		if (nd.ni_dvp == vp)
+			vrele(nd.ni_dvp);
+		else
+			vput(nd.ni_dvp);
+		if (vp)
+			vrele(vp);
+	}
+	return (error);
+}
+
+/*
+ * Create named pipe.
+ *
+ * Returns EOPNOTSUPP when the kernel is built without FIFO, and
+ * EEXIST if the name already exists.
+ */
+struct mkfifo_args {
+	char	*path;
+	int	mode;
+};
+/* ARGSUSED */
+mkfifo(p, uap, retval)
+	struct proc *p;
+	register struct mkfifo_args *uap;
+	int *retval;
+{
+	struct vattr vattr;
+	int error;
+	struct nameidata nd;
+
+#ifndef FIFO
+	return (EOPNOTSUPP);
+#else
+	NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, uap->path, p);
+	if (error = namei(&nd))
+		return (error);
+	if (nd.ni_vp != NULL) {
+		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+		if (nd.ni_dvp == nd.ni_vp)
+			vrele(nd.ni_dvp);
+		else
+			vput(nd.ni_dvp);
+		vrele(nd.ni_vp);
+		return (EEXIST);
+	}
+	VATTR_NULL(&vattr);
+	vattr.va_type = VFIFO;
+	vattr.va_mode = (uap->mode & ALLPERMS) &~ p->p_fd->fd_cmask;
+	LEASE_CHECK(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+	return (VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr));
+#endif /* FIFO */
+}
+
+/*
+ * Make a hard file link.
+ *
+ * uap->path names the existing file and uap->link the new name.
+ * Linking to a directory requires super-user privilege.  Returns
+ * EEXIST if the new name already exists.
+ */
+struct link_args {
+	char	*path;
+	char	*link;
+};
+/* ARGSUSED */
+link(p, uap, retval)
+	struct proc *p;
+	register struct link_args *uap;
+	int *retval;
+{
+	register struct vnode *vp;
+	struct nameidata nd;
+	int error;
+
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p);
+	if (error = namei(&nd))
+		return (error);
+	vp = nd.ni_vp;
+	if (vp->v_type != VDIR ||
+	    (error = suser(p->p_ucred, &p->p_acflag)) == 0) {
+		/* Reuse nd for a second lookup, this time of the new name. */
+		nd.ni_cnd.cn_nameiop = CREATE;
+		nd.ni_cnd.cn_flags = LOCKPARENT;
+		nd.ni_dirp = uap->link;
+		if ((error = namei(&nd)) == 0) {
+			if (nd.ni_vp != NULL)
+				error = EEXIST;
+			if (!error) {
+				LEASE_CHECK(nd.ni_dvp,
+				    p, p->p_ucred, LEASE_WRITE);
+				LEASE_CHECK(vp,
+				    p, p->p_ucred, LEASE_WRITE);
+				error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
+			} else {
+				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+				if (nd.ni_dvp == nd.ni_vp)
+					vrele(nd.ni_dvp);
+				else
+					vput(nd.ni_dvp);
+				if (nd.ni_vp)
+					vrele(nd.ni_vp);
+			}
+		}
+	}
+	vrele(vp);
+	return (error);
+}
+
+/*
+ * Make a symbolic link.
+ */ +struct symlink_args { + char *path; + char *link; +}; +/* ARGSUSED */ +symlink(p, uap, retval) + struct proc *p; + register struct symlink_args *uap; + int *retval; +{ + struct vattr vattr; + char *path; + int error; + struct nameidata nd; + + MALLOC(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (error = copyinstr(uap->path, path, MAXPATHLEN, NULL)) + goto out; + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, uap->link, p); + if (error = namei(&nd)) + goto out; + if (nd.ni_vp) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + error = EEXIST; + goto out; + } + VATTR_NULL(&vattr); + vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask; + LEASE_CHECK(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path); +out: + FREE(path, M_NAMEI); + return (error); +} + +/* + * Delete a name from the filesystem. + */ +struct unlink_args { + char *path; +}; +/* ARGSUSED */ +unlink(p, uap, retval) + struct proc *p; + struct unlink_args *uap; + int *retval; +{ + register struct vnode *vp; + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE); + VOP_LOCK(vp); + + if (vp->v_type != VDIR || + (error = suser(p->p_ucred, &p->p_acflag)) == 0) { + /* + * The root of a mounted filesystem cannot be deleted. + */ + if (vp->v_flag & VROOT) + error = EBUSY; + else + (void)vnode_pager_uncache(vp); + } + + if (!error) { + LEASE_CHECK(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vput(vp); + } + return (error); +} + +/* + * Reposition read/write file offset. 
+ */ +struct lseek_args { + int fd; + int pad; + off_t offset; + int whence; +}; +lseek(p, uap, retval) + struct proc *p; + register struct lseek_args *uap; + int *retval; +{ + struct ucred *cred = p->p_ucred; + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct vattr vattr; + int error; + + if ((u_int)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE) + return (ESPIPE); + switch (uap->whence) { + case L_INCR: + fp->f_offset += uap->offset; + break; + case L_XTND: + if (error = + VOP_GETATTR((struct vnode *)fp->f_data, &vattr, cred, p)) + return (error); + fp->f_offset = uap->offset + vattr.va_size; + break; + case L_SET: + fp->f_offset = uap->offset; + break; + default: + return (EINVAL); + } + *(off_t *)retval = fp->f_offset; + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Reposition read/write file offset. + */ +struct olseek_args { + int fd; + long offset; + int whence; +}; +olseek(p, uap, retval) + struct proc *p; + register struct olseek_args *uap; + int *retval; +{ + struct lseek_args nuap; + off_t qret; + int error; + + nuap.fd = uap->fd; + nuap.offset = uap->offset; + nuap.whence = uap->whence; + error = lseek(p, &nuap, &qret); + *(long *)retval = qret; + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Check access permissions. + */ +struct access_args { + char *path; + int flags; +}; +access(p, uap, retval) + struct proc *p; + register struct access_args *uap; + int *retval; +{ + register struct ucred *cred = p->p_ucred; + register struct vnode *vp; + int error, flags, t_gid, t_uid; + struct nameidata nd; + + t_uid = cred->cr_uid; + t_gid = cred->cr_groups[0]; + cred->cr_uid = p->p_cred->p_ruid; + cred->cr_groups[0] = p->p_cred->p_rgid; + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + goto out1; + vp = nd.ni_vp; + + /* Flags == 0 means only check for existence. 
*/ + if (uap->flags) { + flags = 0; + if (uap->flags & R_OK) + flags |= VREAD; + if (uap->flags & W_OK) + flags |= VWRITE; + if (uap->flags & X_OK) + flags |= VEXEC; + if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) + error = VOP_ACCESS(vp, flags, cred, p); + } + vput(vp); +out1: + cred->cr_uid = t_uid; + cred->cr_groups[0] = t_gid; + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Get file status; this version follows links. + */ +struct ostat_args { + char *path; + struct ostat *ub; +}; +/* ARGSUSED */ +ostat(p, uap, retval) + struct proc *p; + register struct ostat_args *uap; + int *retval; +{ + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + error = vn_stat(nd.ni_vp, &sb, p); + vput(nd.ni_vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout((caddr_t)&osb, (caddr_t)uap->ub, sizeof (osb)); + return (error); +} + +/* + * Get file status; this version does not follow links. + */ +struct olstat_args { + char *path; + struct ostat *ub; +}; +/* ARGSUSED */ +olstat(p, uap, retval) + struct proc *p; + register struct olstat_args *uap; + int *retval; +{ + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + error = vn_stat(nd.ni_vp, &sb, p); + vput(nd.ni_vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout((caddr_t)&osb, (caddr_t)uap->ub, sizeof (osb)); + return (error); +} + +/* + * Convert from an old to a new stat structure. 
+ */ +cvtstat(st, ost) + struct stat *st; + struct ostat *ost; +{ + + ost->st_dev = st->st_dev; + ost->st_ino = st->st_ino; + ost->st_mode = st->st_mode; + ost->st_nlink = st->st_nlink; + ost->st_uid = st->st_uid; + ost->st_gid = st->st_gid; + ost->st_rdev = st->st_rdev; + if (st->st_size < (quad_t)1 << 32) + ost->st_size = st->st_size; + else + ost->st_size = -2; + ost->st_atime = st->st_atime; + ost->st_mtime = st->st_mtime; + ost->st_ctime = st->st_ctime; + ost->st_blksize = st->st_blksize; + ost->st_blocks = st->st_blocks; + ost->st_flags = st->st_flags; + ost->st_gen = st->st_gen; +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Get file status; this version follows links. + */ +struct stat_args { + char *path; + struct stat *ub; +}; +/* ARGSUSED */ +stat(p, uap, retval) + struct proc *p; + register struct stat_args *uap; + int *retval; +{ + struct stat sb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + error = vn_stat(nd.ni_vp, &sb, p); + vput(nd.ni_vp); + if (error) + return (error); + error = copyout((caddr_t)&sb, (caddr_t)uap->ub, sizeof (sb)); + return (error); +} + +/* + * Get file status; this version does not follow links. + */ +struct lstat_args { + char *path; + struct stat *ub; +}; +/* ARGSUSED */ +lstat(p, uap, retval) + struct proc *p; + register struct lstat_args *uap; + int *retval; +{ + int error; + struct vnode *vp, *dvp; + struct stat sb, sb1; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKPARENT, UIO_USERSPACE, + uap->path, p); + if (error = namei(&nd)) + return (error); + /* + * For symbolic links, always return the attributes of its + * containing directory, except for mode, size, and links. 
+ */ + vp = nd.ni_vp; + dvp = nd.ni_dvp; + if (vp->v_type != VLNK) { + if (dvp == vp) + vrele(dvp); + else + vput(dvp); + error = vn_stat(vp, &sb, p); + vput(vp); + if (error) + return (error); + } else { + error = vn_stat(dvp, &sb, p); + vput(dvp); + if (error) { + vput(vp); + return (error); + } + error = vn_stat(vp, &sb1, p); + vput(vp); + if (error) + return (error); + sb.st_mode &= ~S_IFDIR; + sb.st_mode |= S_IFLNK; + sb.st_nlink = sb1.st_nlink; + sb.st_size = sb1.st_size; + sb.st_blocks = sb1.st_blocks; + } + error = copyout((caddr_t)&sb, (caddr_t)uap->ub, sizeof (sb)); + return (error); +} + +/* + * Get configurable pathname variables. + */ +struct pathconf_args { + char *path; + int name; +}; +/* ARGSUSED */ +pathconf(p, uap, retval) + struct proc *p; + register struct pathconf_args *uap; + int *retval; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + error = VOP_PATHCONF(nd.ni_vp, uap->name, retval); + vput(nd.ni_vp); + return (error); +} + +/* + * Return target name of a symbolic link. 
+ */ +struct readlink_args { + char *path; + char *buf; + int count; +}; +/* ARGSUSED */ +readlink(p, uap, retval) + struct proc *p; + register struct readlink_args *uap; + int *retval; +{ + register struct vnode *vp; + struct iovec aiov; + struct uio auio; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VLNK) + error = EINVAL; + else { + aiov.iov_base = uap->buf; + aiov.iov_len = uap->count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = uap->count; + error = VOP_READLINK(vp, &auio, p->p_ucred); + } + vput(vp); + *retval = uap->count - auio.uio_resid; + return (error); +} + +/* + * Change flags of a file given a path name. + */ +struct chflags_args { + char *path; + int flags; +}; +/* ARGSUSED */ +chflags(p, uap, retval) + struct proc *p; + register struct chflags_args *uap; + int *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE); + VOP_LOCK(vp); + if (vp->v_mount->mnt_flag & MNT_RDONLY) + error = EROFS; + else { + VATTR_NULL(&vattr); + vattr.va_flags = uap->flags; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + } + vput(vp); + return (error); +} + +/* + * Change flags of a file given a file descriptor. 
+ */ +struct fchflags_args { + int fd; + int flags; +}; +/* ARGSUSED */ +fchflags(p, uap, retval) + struct proc *p; + register struct fchflags_args *uap; + int *retval; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, uap->fd, &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE); + VOP_LOCK(vp); + if (vp->v_mount->mnt_flag & MNT_RDONLY) + error = EROFS; + else { + VATTR_NULL(&vattr); + vattr.va_flags = uap->flags; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + } + VOP_UNLOCK(vp); + return (error); +} + +/* + * Change mode of a file given path name. + */ +struct chmod_args { + char *path; + int mode; +}; +/* ARGSUSED */ +chmod(p, uap, retval) + struct proc *p; + register struct chmod_args *uap; + int *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE); + VOP_LOCK(vp); + if (vp->v_mount->mnt_flag & MNT_RDONLY) + error = EROFS; + else { + VATTR_NULL(&vattr); + vattr.va_mode = uap->mode & ALLPERMS; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + } + vput(vp); + return (error); +} + +/* + * Change mode of a file given a file descriptor. 
+ */ +struct fchmod_args { + int fd; + int mode; +}; +/* ARGSUSED */ +fchmod(p, uap, retval) + struct proc *p; + register struct fchmod_args *uap; + int *retval; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, uap->fd, &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE); + VOP_LOCK(vp); + if (vp->v_mount->mnt_flag & MNT_RDONLY) + error = EROFS; + else { + VATTR_NULL(&vattr); + vattr.va_mode = uap->mode & ALLPERMS; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + } + VOP_UNLOCK(vp); + return (error); +} + +/* + * Set ownership given a path name. + */ +struct chown_args { + char *path; + int uid; + int gid; +}; +/* ARGSUSED */ +chown(p, uap, retval) + struct proc *p; + register struct chown_args *uap; + int *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE); + VOP_LOCK(vp); + if (vp->v_mount->mnt_flag & MNT_RDONLY) + error = EROFS; + else { + VATTR_NULL(&vattr); + vattr.va_uid = uap->uid; + vattr.va_gid = uap->gid; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + } + vput(vp); + return (error); +} + +/* + * Set ownership given a file descriptor. 
+ */ +struct fchown_args { + int fd; + int uid; + int gid; +}; +/* ARGSUSED */ +fchown(p, uap, retval) + struct proc *p; + register struct fchown_args *uap; + int *retval; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, uap->fd, &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE); + VOP_LOCK(vp); + if (vp->v_mount->mnt_flag & MNT_RDONLY) + error = EROFS; + else { + VATTR_NULL(&vattr); + vattr.va_uid = uap->uid; + vattr.va_gid = uap->gid; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + } + VOP_UNLOCK(vp); + return (error); +} + +/* + * Set the access and modification times of a file. + */ +struct utimes_args { + char *path; + struct timeval *tptr; +}; +/* ARGSUSED */ +utimes(p, uap, retval) + struct proc *p; + register struct utimes_args *uap; + int *retval; +{ + register struct vnode *vp; + struct timeval tv[2]; + struct vattr vattr; + int error; + struct nameidata nd; + + VATTR_NULL(&vattr); + if (uap->tptr == NULL) { + microtime(&tv[0]); + tv[1] = tv[0]; + vattr.va_vaflags |= VA_UTIMES_NULL; + } else if (error = copyin((caddr_t)uap->tptr, (caddr_t)tv, sizeof (tv))) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE); + VOP_LOCK(vp); + if (vp->v_mount->mnt_flag & MNT_RDONLY) + error = EROFS; + else { + vattr.va_atime.ts_sec = tv[0].tv_sec; + vattr.va_atime.ts_nsec = tv[0].tv_usec * 1000; + vattr.va_mtime.ts_sec = tv[1].tv_sec; + vattr.va_mtime.ts_nsec = tv[1].tv_usec * 1000; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + } + vput(vp); + return (error); +} + +/* + * Truncate a file given its path name. 
+ */ +struct truncate_args { + char *path; + int pad; + off_t length; +}; +/* ARGSUSED */ +truncate(p, uap, retval) + struct proc *p; + register struct truncate_args *uap; + int *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE); + VOP_LOCK(vp); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0 && + (error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = uap->length; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + } + vput(vp); + return (error); +} + +/* + * Truncate a file given a file descriptor. + */ +struct ftruncate_args { + int fd; + int pad; + off_t length; +}; +/* ARGSUSED */ +ftruncate(p, uap, retval) + struct proc *p; + register struct ftruncate_args *uap; + int *retval; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, uap->fd, &fp)) + return (error); + if ((fp->f_flag & FWRITE) == 0) + return (EINVAL); + vp = (struct vnode *)fp->f_data; + LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE); + VOP_LOCK(vp); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = uap->length; + error = VOP_SETATTR(vp, &vattr, fp->f_cred, p); + } + VOP_UNLOCK(vp); + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Truncate a file given its path name. + */ +struct otruncate_args { + char *path; + long length; +}; +/* ARGSUSED */ +otruncate(p, uap, retval) + struct proc *p; + register struct otruncate_args *uap; + int *retval; +{ + struct truncate_args nuap; + + nuap.path = uap->path; + nuap.length = uap->length; + return (truncate(p, &nuap, retval)); +} + +/* + * Truncate a file given a file descriptor. 
+ */ +struct oftruncate_args { + int fd; + long length; +}; +/* ARGSUSED */ +oftruncate(p, uap, retval) + struct proc *p; + register struct oftruncate_args *uap; + int *retval; +{ + struct ftruncate_args nuap; + + nuap.fd = uap->fd; + nuap.length = uap->length; + return (ftruncate(p, &nuap, retval)); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Sync an open file. + */ +struct fsync_args { + int fd; +}; +/* ARGSUSED */ +fsync(p, uap, retval) + struct proc *p; + struct fsync_args *uap; + int *retval; +{ + register struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, uap->fd, &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + VOP_LOCK(vp); + error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, p); + VOP_UNLOCK(vp); + return (error); +} + +/* + * Rename files. Source and destination must either both be directories, + * or both not be directories. If target is a directory, it must be empty. + */ +struct rename_args { + char *from; + char *to; +}; +/* ARGSUSED */ +rename(p, uap, retval) + struct proc *p; + register struct rename_args *uap; + int *retval; +{ + register struct vnode *tvp, *fvp, *tdvp; + struct nameidata fromnd, tond; + int error; + + NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE, + uap->from, p); + if (error = namei(&fromnd)) + return (error); + fvp = fromnd.ni_vp; + NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART, + UIO_USERSPACE, uap->to, p); + if (error = namei(&tond)) { + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + tdvp = tond.ni_dvp; + tvp = tond.ni_vp; + if (tvp != NULL) { + if (fvp->v_type == VDIR && tvp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { + error = EISDIR; + goto out; + } + } + if (fvp == tdvp) + error = EINVAL; + /* + * If source is the same as the destination (that is the + * same inode number with the same name in the same directory), + * then 
there is nothing to do. + */ + if (fvp == tvp && fromnd.ni_dvp == tdvp && + fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && + !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, + fromnd.ni_cnd.cn_namelen)) + error = -1; +out: + if (!error) { + LEASE_CHECK(tdvp, p, p->p_ucred, LEASE_WRITE); + if (fromnd.ni_dvp != tdvp) + LEASE_CHECK(fromnd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (tvp) + LEASE_CHECK(tvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, + tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); + } else { + VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + } + vrele(tond.ni_startdir); + FREE(tond.ni_cnd.cn_pnbuf, M_NAMEI); +out1: + if (fromnd.ni_startdir) + vrele(fromnd.ni_startdir); + FREE(fromnd.ni_cnd.cn_pnbuf, M_NAMEI); + if (error == -1) + return (0); + return (error); +} + +/* + * Make a directory file. + */ +struct mkdir_args { + char *path; + int mode; +}; +/* ARGSUSED */ +mkdir(p, uap, retval) + struct proc *p; + register struct mkdir_args *uap; + int *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(vp); + return (EEXIST); + } + VATTR_NULL(&vattr); + vattr.va_type = VDIR; + vattr.va_mode = (uap->mode & ACCESSPERMS) &~ p->p_fd->fd_cmask; + LEASE_CHECK(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + if (!error) + vput(nd.ni_vp); + return (error); +} + +/* + * Remove a directory file. 
+ */ +struct rmdir_args { + char *path; +}; +/* ARGSUSED */ +rmdir(p, uap, retval) + struct proc *p; + struct rmdir_args *uap; + int *retval; +{ + register struct vnode *vp; + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + /* + * No rmdir "." please. + */ + if (nd.ni_dvp == vp) { + error = EINVAL; + goto out; + } + /* + * The root of a mounted filesystem cannot be deleted. + */ + if (vp->v_flag & VROOT) + error = EBUSY; +out: + if (!error) { + LEASE_CHECK(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vput(vp); + } + return (error); +} + +#ifdef COMPAT_43 +/* + * Read a block of directory entries in a file system independent format. 
+ */ +struct ogetdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +ogetdirentries(p, uap, retval) + struct proc *p; + register struct ogetdirentries_args *uap; + int *retval; +{ + register struct vnode *vp; + struct file *fp; + struct uio auio, kuio; + struct iovec aiov, kiov; + struct dirent *dp, *edp; + caddr_t dirbuf; + int error, readcnt; + long loff; + + if (error = getvnode(p->p_fd, uap->fd, &fp)) + return (error); + if ((fp->f_flag & FREAD) == 0) + return (EBADF); + vp = (struct vnode *)fp->f_data; + if (vp->v_type != VDIR) + return (EINVAL); + aiov.iov_base = uap->buf; + aiov.iov_len = uap->count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = uap->count; + VOP_LOCK(vp); + loff = auio.uio_offset = fp->f_offset; +# if (BYTE_ORDER != LITTLE_ENDIAN) + if (vp->v_mount->mnt_maxsymlinklen <= 0) { + error = VOP_READDIR(vp, &auio, fp->f_cred); + fp->f_offset = auio.uio_offset; + } else +# endif + { + kuio = auio; + kuio.uio_iov = &kiov; + kuio.uio_segflg = UIO_SYSSPACE; + kiov.iov_len = uap->count; + MALLOC(dirbuf, caddr_t, uap->count, M_TEMP, M_WAITOK); + kiov.iov_base = dirbuf; + error = VOP_READDIR(vp, &kuio, fp->f_cred); + fp->f_offset = kuio.uio_offset; + if (error == 0) { + readcnt = uap->count - kuio.uio_resid; + edp = (struct dirent *)&dirbuf[readcnt]; + for (dp = (struct dirent *)dirbuf; dp < edp; ) { +# if (BYTE_ORDER == LITTLE_ENDIAN) + /* + * The expected low byte of + * dp->d_namlen is our dp->d_type. + * The high MBZ byte of dp->d_namlen + * is our dp->d_namlen. + */ + dp->d_type = dp->d_namlen; + dp->d_namlen = 0; +# else + /* + * The dp->d_type is the high byte + * of the expected dp->d_namlen, + * so must be zero'ed. 
+ */ + dp->d_type = 0; +# endif + if (dp->d_reclen > 0) { + dp = (struct dirent *) + ((char *)dp + dp->d_reclen); + } else { + error = EIO; + break; + } + } + if (dp >= edp) + error = uiomove(dirbuf, readcnt, &auio); + } + FREE(dirbuf, M_TEMP); + } + VOP_UNLOCK(vp); + if (error) + return (error); + error = copyout((caddr_t)&loff, (caddr_t)uap->basep, sizeof(long)); + *retval = uap->count - auio.uio_resid; + return (error); +} +#endif + +/* + * Read a block of directory entries in a file system independent format. + */ +struct getdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +getdirentries(p, uap, retval) + struct proc *p; + register struct getdirentries_args *uap; + int *retval; +{ + register struct vnode *vp; + struct file *fp; + struct uio auio; + struct iovec aiov; + long loff; + int error; + + if (error = getvnode(p->p_fd, uap->fd, &fp)) + return (error); + if ((fp->f_flag & FREAD) == 0) + return (EBADF); + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) + return (EINVAL); + aiov.iov_base = uap->buf; + aiov.iov_len = uap->count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = uap->count; + VOP_LOCK(vp); + loff = auio.uio_offset = fp->f_offset; + error = VOP_READDIR(vp, &auio, fp->f_cred); + fp->f_offset = auio.uio_offset; + VOP_UNLOCK(vp); + if (error) + return (error); + +#ifdef UNION +{ + extern int (**union_vnodeop_p)(); + extern struct vnode *union_lowervp __P((struct vnode *)); + + if ((uap->count == auio.uio_resid) && + (vp->v_op == union_vnodeop_p)) { + struct vnode *tvp = vp; + + vp = union_lowervp(vp); + if (vp != NULLVP) { + VOP_LOCK(vp); + error = VOP_OPEN(vp, FREAD); + VOP_UNLOCK(vp); + + if (error) { + vrele(vp); + return (error); + } + fp->f_data = (caddr_t) vp; + fp->f_offset = 0; + error = vn_close(tvp, FREAD, fp->f_cred, p); + if (error) + return (error); + goto unionread; + } + } +} +#endif + 
+ if ((uap->count == auio.uio_resid) && + (vp->v_flag & VROOT) && + (vp->v_mount->mnt_flag & MNT_UNION)) { + struct vnode *tvp = vp; + vp = vp->v_mount->mnt_vnodecovered; + VREF(vp); + fp->f_data = (caddr_t) vp; + fp->f_offset = 0; + vrele(tvp); + goto unionread; + } + error = copyout((caddr_t)&loff, (caddr_t)uap->basep, sizeof(long)); + *retval = uap->count - auio.uio_resid; + return (error); +} + +/* + * Set the mode mask for creation of filesystem nodes. + */ +struct umask_args { + int newmask; +}; +mode_t /* XXX */ +umask(p, uap, retval) + struct proc *p; + struct umask_args *uap; + int *retval; +{ + register struct filedesc *fdp; + + fdp = p->p_fd; + *retval = fdp->fd_cmask; + fdp->fd_cmask = uap->newmask & ALLPERMS; + return (0); +} + +/* + * Void all references to file by ripping underlying filesystem + * away from vnode. + */ +struct revoke_args { + char *path; +}; +/* ARGSUSED */ +revoke(p, uap, retval) + struct proc *p; + register struct revoke_args *uap; + int *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VCHR && vp->v_type != VBLK) { + error = EINVAL; + goto out; + } + if (error = VOP_GETATTR(vp, &vattr, p->p_ucred, p)) + goto out; + if (p->p_ucred->cr_uid != vattr.va_uid && + (error = suser(p->p_ucred, &p->p_acflag))) + goto out; + if (vp->v_usecount > 1 || (vp->v_flag & VALIASED)) + vgoneall(vp); +out: + vrele(vp); + return (error); +} + +/* + * Convert a user file descriptor to a kernel file entry. 
+ */ +getvnode(fdp, fd, fpp) + struct filedesc *fdp; + struct file **fpp; + int fd; +{ + struct file *fp; + + if ((u_int)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE) + return (EINVAL); + *fpp = fp; + return (0); +} diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c new file mode 100644 index 00000000000..1ce7347bdc8 --- /dev/null +++ b/sys/kern/vfs_init.c @@ -0,0 +1,246 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed + * to Berkeley by John Heidemann of the UCLA Ficus project. + * + * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_init.c 8.3 (Berkeley) 1/4/94 + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Sigh, such primitive tools are these... + */ +#if 0 +#define DODEBUG(A) A +#else +#define DODEBUG(A) +#endif + +extern struct vnodeopv_desc *vfs_opv_descs[]; + /* a list of lists of vnodeops defns */ +extern struct vnodeop_desc *vfs_op_descs[]; + /* and the operations they perform */ +/* + * This code doesn't work if the defn is **vnodop_defns with cc. + * The problem is because of the compiler sometimes putting in an + * extra level of indirection for arrays. It's an interesting + * "feature" of C. + */ +int vfs_opv_numops; + +typedef (*PFI)(); /* the standard Pointer to a Function returning an Int */ + +/* + * A miscellaneous routine. + * A generic "default" routine that just returns an error. + */ +int +vn_default_error() +{ + + return (EOPNOTSUPP); +} + +/* + * vfs_init.c + * + * Allocate and fill in operations vectors. + * + * An undocumented feature of this approach to defining operations is that + * there can be multiple entries in vfs_opv_descs for the same operations + * vector. This allows third parties to extend the set of operations + * supported by another layer in a binary compatibile way. For example, + * assume that NFS needed to be modified to support Ficus. 
NFS has an entry + * (probably nfs_vnopdeop_decls) declaring all the operations NFS supports by + * default. Ficus could add another entry (ficus_nfs_vnodeop_decl_entensions) + * listing those new operations Ficus adds to NFS, all without modifying the + * NFS code. (Of couse, the OTW NFS protocol still needs to be munged, but + * that is a(whole)nother story.) This is a feature. + */ +void +vfs_opv_init() +{ + int i, j, k; + int (***opv_desc_vector_p)(); + int (**opv_desc_vector)(); + struct vnodeopv_entry_desc *opve_descp; + + /* + * Allocate the dynamic vectors and fill them in. + */ + for (i=0; vfs_opv_descs[i]; i++) { + opv_desc_vector_p = vfs_opv_descs[i]->opv_desc_vector_p; + /* + * Allocate and init the vector, if it needs it. + * Also handle backwards compatibility. + */ + if (*opv_desc_vector_p == NULL) { + /* XXX - shouldn't be M_VNODE */ + MALLOC(*opv_desc_vector_p, PFI*, + vfs_opv_numops*sizeof(PFI), M_VNODE, M_WAITOK); + bzero (*opv_desc_vector_p, vfs_opv_numops*sizeof(PFI)); + DODEBUG(printf("vector at %x allocated\n", + opv_desc_vector_p)); + } + opv_desc_vector = *opv_desc_vector_p; + for (j=0; vfs_opv_descs[i]->opv_desc_ops[j].opve_op; j++) { + opve_descp = &(vfs_opv_descs[i]->opv_desc_ops[j]); + + /* + * Sanity check: is this operation listed + * in the list of operations? We check this + * by seeing if its offest is zero. Since + * the default routine should always be listed + * first, it should be the only one with a zero + * offset. Any other operation with a zero + * offset is probably not listed in + * vfs_op_descs, and so is probably an error. + * + * A panic here means the layer programmer + * has committed the all-too common bug + * of adding a new operation to the layer's + * list of vnode operations but + * not adding the operation to the system-wide + * list of supported operations. 
+ */ + if (opve_descp->opve_op->vdesc_offset == 0 && + opve_descp->opve_op->vdesc_offset != + VOFFSET(vop_default)) { + printf("operation %s not listed in %s.\n", + opve_descp->opve_op->vdesc_name, + "vfs_op_descs"); + panic ("vfs_opv_init: bad operation"); + } + /* + * Fill in this entry. + */ + opv_desc_vector[opve_descp->opve_op->vdesc_offset] = + opve_descp->opve_impl; + } + } + /* + * Finally, go back and replace unfilled routines + * with their default. (Sigh, an O(n^3) algorithm. I + * could make it better, but that'd be work, and n is small.) + */ + for (i = 0; vfs_opv_descs[i]; i++) { + opv_desc_vector = *(vfs_opv_descs[i]->opv_desc_vector_p); + /* + * Force every operations vector to have a default routine. + */ + if (opv_desc_vector[VOFFSET(vop_default)]==NULL) { + panic("vfs_opv_init: operation vector without default routine."); + } + for (k = 0; kopv_desc_vector_p) = NULL; + /* + * Figure out how many ops there are by counting the table, + * and assign each its offset. + */ + for (vfs_opv_numops = 0, i = 0; vfs_op_descs[i]; i++) { + vfs_op_descs[i]->vdesc_offset = vfs_opv_numops; + vfs_opv_numops++; + } + DODEBUG(printf ("vfs_opv_numops=%d\n", vfs_opv_numops)); +} + +/* + * Routines having to do with the management of the vnode table. + */ +extern struct vnodeops dead_vnodeops; +extern struct vnodeops spec_vnodeops; +extern void vclean(); +struct vattr va_null; + +/* + * Initialize the vnode structures and initialize each file system type. + */ +vfsinit() +{ + struct vfsops **vfsp; + + /* + * Initialize the vnode table + */ + vntblinit(); + /* + * Initialize the vnode name cache + */ + nchinit(); + /* + * Build vnode operation vectors. + */ + vfs_op_init(); + vfs_opv_init(); /* finish the job */ + /* + * Initialize each file system type. 
+ */ + vattr_null(&va_null); + for (vfsp = &vfssw[0]; vfsp <= &vfssw[MOUNT_MAXTYPE]; vfsp++) { + if (*vfsp == NULL) + continue; + (*(*vfsp)->vfs_init)(); + } +} diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c new file mode 100644 index 00000000000..0fa5aa19b78 --- /dev/null +++ b/sys/kern/vfs_lookup.c @@ -0,0 +1,506 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef KTRACE +#include +#endif + +/* + * Convert a pathname into a pointer to a locked inode. + * + * The FOLLOW flag is set when symbolic links are to be followed + * when they occur at the end of the name translation process. + * Symbolic links are always followed for all other pathname + * components other than the last. + * + * The segflg defines whether the name is to be copied from user + * space or kernel space. + * + * Overall outline of namei: + * + * copy in name + * get starting directory + * while (!done && !error) { + * call lookup to search path. 
+ *	    if symbolic link, massage name in buffer and continue
+ *	}
+ *
+ * Returns 0 on success; otherwise the error from copying the name, from
+ * lookup(), from VOP_READLINK(), or ELOOP / ENAMETOOLONG raised here while
+ * expanding symbolic links.  On success the pathname buffer is freed
+ * unless SAVENAME or SAVESTART was requested, in which case HASBUF is set
+ * in cn_flags and the buffer stays in cn_pnbuf.
+ */
+int
+namei(ndp)
+	register struct nameidata *ndp;
+{
+	register struct filedesc *fdp;	/* pointer to file descriptor state */
+	register char *cp;		/* pointer into pathname argument */
+	register struct vnode *dp;	/* the directory we are searching */
+	struct iovec aiov;		/* uio for reading symbolic links */
+	struct uio auio;
+	int error, linklen;
+	struct componentname *cnp = &ndp->ni_cnd;
+
+	ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_proc->p_ucred;
+#ifdef DIAGNOSTIC
+	if (!cnp->cn_cred || !cnp->cn_proc)
+		panic ("namei: bad cred/proc");
+	if (cnp->cn_nameiop & (~OPMASK))
+		panic ("namei: nameiop contaminated with flags");
+	if (cnp->cn_flags & OPMASK)
+		panic ("namei: flags contaminated with nameiops");
+#endif
+	fdp = cnp->cn_proc->p_fd;
+
+	/*
+	 * Get a buffer for the name to be translated, and copy the
+	 * name into the buffer.
+	 * A caller that already supplies a buffer sets HASBUF, in which
+	 * case no new buffer is allocated here.
+	 */
+	if ((cnp->cn_flags & HASBUF) == 0)
+		MALLOC(cnp->cn_pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK);
+	if (ndp->ni_segflg == UIO_SYSSPACE)
+		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
+			    MAXPATHLEN, &ndp->ni_pathlen);
+	else
+		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
+			    MAXPATHLEN, &ndp->ni_pathlen);
+	if (error) {
+		free(cnp->cn_pnbuf, M_NAMEI);	/* freed even when HASBUF was set */
+		ndp->ni_vp = NULL;
+		return (error);
+	}
+	ndp->ni_loopcnt = 0;	/* counts symbolic links expanded so far */
+#ifdef KTRACE
+	if (KTRPOINT(cnp->cn_proc, KTR_NAMEI))
+		ktrnamei(cnp->cn_proc->p_tracep, cnp->cn_pnbuf);
+#endif
+
+	/*
+	 * Get starting point for the translation.
+	 * The process root is used when set, otherwise the system root;
+	 * the search itself starts at the current directory.
+	 */
+	if ((ndp->ni_rootdir = fdp->fd_rdir) == NULL)
+		ndp->ni_rootdir = rootvnode;
+	dp = fdp->fd_cdir;
+	VREF(dp);
+	for (;;) {
+		/*
+		 * Check if root directory should replace current directory.
+		 * Done at start of translation and after symbolic link.
+		 */
+		cnp->cn_nameptr = cnp->cn_pnbuf;
+		if (*(cnp->cn_nameptr) == '/') {
+			vrele(dp);	/* drop the reference on the old start */
+			while (*(cnp->cn_nameptr) == '/') {
+				cnp->cn_nameptr++;
+				ndp->ni_pathlen--;
+			}
+			dp = ndp->ni_rootdir;
+			VREF(dp);
+		}
+		ndp->ni_startdir = dp;
+		if (error = lookup(ndp)) {
+			FREE(cnp->cn_pnbuf, M_NAMEI);
+			return (error);
+		}
+		/*
+		 * Check for symbolic link
+		 */
+		if ((cnp->cn_flags & ISSYMLINK) == 0) {
+			if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0)
+				FREE(cnp->cn_pnbuf, M_NAMEI);
+			else
+				cnp->cn_flags |= HASBUF;	/* caller now owns cn_pnbuf */
+			return (0);
+		}
+		if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1)
+			VOP_UNLOCK(ndp->ni_dvp);
+		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
+			error = ELOOP;
+			break;
+		}
+		/*
+		 * With more path left after the link, read the link into a
+		 * fresh buffer so the remainder can be appended; otherwise
+		 * the link text overwrites the existing buffer in place.
+		 */
+		if (ndp->ni_pathlen > 1)
+			MALLOC(cp, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
+		else
+			cp = cnp->cn_pnbuf;
+		aiov.iov_base = cp;
+		aiov.iov_len = MAXPATHLEN;
+		auio.uio_iov = &aiov;
+		auio.uio_iovcnt = 1;
+		auio.uio_offset = 0;
+		auio.uio_rw = UIO_READ;
+		auio.uio_segflg = UIO_SYSSPACE;
+		auio.uio_procp = (struct proc *)0;
+		auio.uio_resid = MAXPATHLEN;
+		if (error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred)) {
+			if (ndp->ni_pathlen > 1)
+				free(cp, M_NAMEI);	/* only the extra buffer */
+			break;
+		}
+		linklen = MAXPATHLEN - auio.uio_resid;	/* bytes of link text read */
+		if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
+			if (ndp->ni_pathlen > 1)
+				free(cp, M_NAMEI);
+			error = ENAMETOOLONG;
+			break;
+		}
+		if (ndp->ni_pathlen > 1) {
+			bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
+			FREE(cnp->cn_pnbuf, M_NAMEI);
+			cnp->cn_pnbuf = cp;	/* new buffer replaces the old one */
+		} else
+			cnp->cn_pnbuf[linklen] = '\0';
+		ndp->ni_pathlen += linklen;
+		vput(ndp->ni_vp);
+		dp = ndp->ni_dvp;	/* continue from the link's directory */
+	}
+	/* Reached only by break: a symbolic link could not be expanded. */
+	FREE(cnp->cn_pnbuf, M_NAMEI);
+	vrele(ndp->ni_dvp);
+	vput(ndp->ni_vp);
+	ndp->ni_vp = NULL;
+	return (error);
+}
+
+/*
+ * Search a pathname.
+ * This is a very central and rather complicated routine.
+ *
+ * The pathname is pointed to by cn_nameptr and is of length ni_pathlen.
+ * The starting directory is taken from ni_startdir.
The pathname is
+ * descended until done, or a symbolic link is encountered. The ISSYMLINK
+ * bit in cn_flags is clear if the path is completed; it is set if a
+ * symbolic link needing interpretation is encountered.
+ *
+ * The operation in cn_nameiop is LOOKUP, CREATE, RENAME, or DELETE depending
+ * on whether the name is to be looked up, created, renamed, or deleted.
+ * When CREATE, RENAME, or DELETE is specified, information usable in
+ * creating, renaming, or deleting a directory entry may be calculated.
+ * If cn_flags has LOCKPARENT or'ed into it, the parent directory is returned
+ * locked. If cn_flags has WANTPARENT or'ed into it, the parent directory is
+ * returned unlocked. Otherwise the parent directory is not returned. If
+ * the target of the pathname exists and LOCKLEAF is or'ed into cn_flags
+ * the target is returned locked, otherwise it is returned unlocked.
+ * When creating or renaming and LOCKPARENT is specified, the target may not
+ * be ".".  When deleting and LOCKPARENT is specified, the target may be ".".
+ *
+ * Overall outline of lookup:
+ *
+ * dirloop:
+ *	identify next component of name at cnp->cn_nameptr
+ *	handle degenerate case where name is null string
+ *	if .. and crossing mount points and on mounted filesys, find parent
+ *	call VOP_LOOKUP routine for next component name
+ *	    directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
+ *	    component vnode returned in ni_vp (if it exists), locked.
+ *	if result vnode is mounted on and crossing mount points,
+ *	    find mounted on vnode
+ *	if more components of name, do next level at dirloop
+ *	return the answer in ni_vp, locked if LOCKLEAF set
+ *	    if LOCKPARENT set, return locked parent in ni_dvp
+ *	    if WANTPARENT set, return unlocked parent in ni_dvp
+ *
+ * Returns 0 on success (including the "entry does not exist but may be
+ * created" case, where ni_vp is NULL, and the symbolic-link case, where
+ * ISSYMLINK is set in cn_flags); otherwise an errno value, with ni_vp
+ * set to NULL.
+ */
+int
+lookup(ndp)
+	register struct nameidata *ndp;
+{
+	register char *cp;		/* pointer into pathname argument */
+	register struct vnode *dp = 0;	/* the directory we are searching */
+	struct vnode *tdp;		/* saved dp */
+	struct mount *mp;		/* mount table entry */
+	int docache;			/* == 0 do not cache last component */
+	int wantparent;			/* 1 => wantparent or lockparent flag */
+	int rdonly;			/* lookup read-only flag bit */
+	int error = 0;
+	struct componentname *cnp = &ndp->ni_cnd;
+
+	/*
+	 * Setup: break out flag bits into variables.
+	 */
+	wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
+	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;	/* nonzero unless NOCACHE set */
+	if (cnp->cn_nameiop == DELETE ||
+	    (wantparent && cnp->cn_nameiop != CREATE))
+		docache = 0;
+	rdonly = cnp->cn_flags & RDONLY;
+	ndp->ni_dvp = NULL;
+	cnp->cn_flags &= ~ISSYMLINK;
+	dp = ndp->ni_startdir;
+	ndp->ni_startdir = NULLVP;
+	VOP_LOCK(dp);
+
+dirloop:
+	/*
+	 * Search a new directory.
+	 *
+	 * The cn_hash value is for use by vfs_cache.
+	 * The last component of the filename is left accessible via
+	 * cnp->cn_nameptr for callers that need the name. Callers needing
+	 * the name set the SAVENAME flag. When done, they assume
+	 * responsibility for freeing the pathname buffer.
+	 */
+	cnp->cn_consume = 0;
+	cnp->cn_hash = 0;
+	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
+		cnp->cn_hash += (unsigned char)*cp;	/* hash is the byte sum */
+	cnp->cn_namelen = cp - cnp->cn_nameptr;
+	if (cnp->cn_namelen > NAME_MAX) {
+		error = ENAMETOOLONG;
+		goto bad;
+	}
+#ifdef NAMEI_DIAGNOSTIC
+	{ char c = *cp;
+	*cp = '\0';
+	printf("{%s}: ", cnp->cn_nameptr);
+	*cp = c; }
+#endif
+	ndp->ni_pathlen -= cnp->cn_namelen;
+	ndp->ni_next = cp;	/* first byte after this component */
+	cnp->cn_flags |= MAKEENTRY;
+	if (*cp == '\0' && docache == 0)
+		cnp->cn_flags &= ~MAKEENTRY;	/* last component, caching off */
+	if (cnp->cn_namelen == 2 &&
+	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
+		cnp->cn_flags |= ISDOTDOT;
+	else
+		cnp->cn_flags &= ~ISDOTDOT;
+	if (*ndp->ni_next == 0)
+		cnp->cn_flags |= ISLASTCN;
+	else
+		cnp->cn_flags &= ~ISLASTCN;
+
+
+	/*
+	 * Check for degenerate name (e.g. / or "")
+	 * which is a way of talking about a directory,
+	 * e.g. like "/." or ".".
+	 */
+	if (cnp->cn_nameptr[0] == '\0') {
+		if (cnp->cn_nameiop != LOOKUP) {
+			error = EISDIR;
+			goto bad;
+		}
+		if (dp->v_type != VDIR) {
+			error = ENOTDIR;
+			goto bad;
+		}
+		if (wantparent) {
+			ndp->ni_dvp = dp;	/* directory is its own parent here */
+			VREF(dp);
+		}
+		ndp->ni_vp = dp;
+		if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
+			VOP_UNLOCK(dp);
+		if (cnp->cn_flags & SAVESTART)
+			panic("lookup: SAVESTART");
+		return (0);
+	}
+
+	/*
+	 * Handle "..": two special cases.
+	 * 1. If at root directory (e.g. after chroot)
+	 *    or at absolute root directory
+	 *    then ignore it so can't get out.
+	 * 2. If this vnode is the root of a mounted
+	 *    filesystem, then replace it with the
+	 *    vnode which was mounted on so we take the
+	 *    .. in the other file system.
+	 */
+	if (cnp->cn_flags & ISDOTDOT) {
+		for (;;) {
+			if (dp == ndp->ni_rootdir || dp == rootvnode) {
+				ndp->ni_dvp = dp;
+				ndp->ni_vp = dp;
+				VREF(dp);
+				goto nextname;
+			}
+			if ((dp->v_flag & VROOT) == 0 ||
+			    (cnp->cn_flags & NOCROSSMOUNT))
+				break;
+			tdp = dp;
+			dp = dp->v_mount->mnt_vnodecovered;
+			vput(tdp);	/* swap lock and reference to covered vnode */
+			VREF(dp);
+			VOP_LOCK(dp);
+		}
+	}
+
+	/*
+	 * We now have a segment name to search for, and a directory to search.
+	 */
+unionlookup:
+	ndp->ni_dvp = dp;
+	if (error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) {
+#ifdef DIAGNOSTIC
+		if (ndp->ni_vp != NULL)
+			panic("leaf should be empty");
+#endif
+#ifdef NAMEI_DIAGNOSTIC
+		printf("not found\n");
+#endif
+		/*
+		 * Not found at the root of a union mount: retry the same
+		 * component in the directory the mount covers.
+		 */
+		if ((error == ENOENT) &&
+		    (dp->v_flag & VROOT) &&
+		    (dp->v_mount->mnt_flag & MNT_UNION)) {
+			tdp = dp;
+			dp = dp->v_mount->mnt_vnodecovered;
+			vput(tdp);
+			VREF(dp);
+			VOP_LOCK(dp);
+			goto unionlookup;
+		}
+
+		if (error != EJUSTRETURN)
+			goto bad;
+		/*
+		 * If creating and at end of pathname, then can consider
+		 * allowing file to be created.
+		 */
+		if (rdonly || (ndp->ni_dvp->v_mount->mnt_flag & MNT_RDONLY)) {
+			error = EROFS;
+			goto bad;
+		}
+		/*
+		 * We return with ni_vp NULL to indicate that the entry
+		 * doesn't currently exist, leaving a pointer to the
+		 * (possibly locked) directory inode in ndp->ni_dvp.
+		 */
+		if (cnp->cn_flags & SAVESTART) {
+			ndp->ni_startdir = ndp->ni_dvp;
+			VREF(ndp->ni_startdir);
+		}
+		return (0);
+	}
+#ifdef NAMEI_DIAGNOSTIC
+	printf("found\n");
+#endif
+
+	/*
+	 * Take into account any additional components consumed by
+	 * the underlying filesystem.
+	 */
+	if (cnp->cn_consume > 0) {
+		cnp->cn_nameptr += cnp->cn_consume;
+		ndp->ni_next += cnp->cn_consume;
+		ndp->ni_pathlen -= cnp->cn_consume;
+		cnp->cn_consume = 0;
+	}
+
+	dp = ndp->ni_vp;
+	/*
+	 * Check for symbolic link
+	 * A link is reported to the caller when FOLLOW is set or when
+	 * more path follows it; the caller expands it.
+	 */
+	if ((dp->v_type == VLNK) &&
+	    ((cnp->cn_flags & FOLLOW) || *ndp->ni_next == '/')) {
+		cnp->cn_flags |= ISSYMLINK;
+		return (0);
+	}
+
+	/*
+	 * Check to see if the vnode has been mounted on;
+	 * if so find the root of the mounted file system.
+	 */
+	while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
+	       (cnp->cn_flags & NOCROSSMOUNT) == 0) {
+		if (mp->mnt_flag & MNT_MLOCK) {
+			mp->mnt_flag |= MNT_MWAIT;
+			sleep((caddr_t)mp, PVFS);
+			continue;	/* re-test the mount after waking */
+		}
+		if (error = VFS_ROOT(dp->v_mountedhere, &tdp))
+			goto bad2;
+		vput(dp);
+		ndp->ni_vp = dp = tdp;
+	}
+
+nextname:
+	/*
+	 * Not a symbolic link.  If more pathname,
+	 * continue at next component, else return.
+	 */
+	if (*ndp->ni_next == '/') {
+		cnp->cn_nameptr = ndp->ni_next;
+		while (*cnp->cn_nameptr == '/') {
+			cnp->cn_nameptr++;
+			ndp->ni_pathlen--;
+		}
+		vrele(ndp->ni_dvp);
+		goto dirloop;
+	}
+	/*
+	 * Check for read-only file systems.
+	 */
+	if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) {
+		/*
+		 * Disallow directory write attempts on read-only
+		 * file systems.
+		 */
+		if (rdonly || (dp->v_mount->mnt_flag & MNT_RDONLY) ||
+		    (wantparent &&
+		     (ndp->ni_dvp->v_mount->mnt_flag & MNT_RDONLY))) {
+			error = EROFS;
+			goto bad2;
+		}
+	}
+	if (cnp->cn_flags & SAVESTART) {
+		ndp->ni_startdir = ndp->ni_dvp;
+		VREF(ndp->ni_startdir);
+	}
+	if (!wantparent)
+		vrele(ndp->ni_dvp);
+	if ((cnp->cn_flags & LOCKLEAF) == 0)
+		VOP_UNLOCK(dp);
+	return (0);
+
+bad2:	/* error exit that must also release the parent in ni_dvp */
+	if ((cnp->cn_flags & LOCKPARENT) && *ndp->ni_next == '\0')
+		VOP_UNLOCK(ndp->ni_dvp);
+	vrele(ndp->ni_dvp);
+bad:	/* error exit: unlock and release dp, report no result vnode */
+	vput(dp);
+	ndp->ni_vp = NULL;
+	return (error);
+}
+
+
diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
new file mode 100644
index 00000000000..2fe39eb674b
--- /dev/null
+++ b/sys/kern/vfs_mount.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)vfs_conf.c	8.8 (Berkeley) 3/31/94
+ */
+
+#include
+#include
+#include
+
+#ifdef FFS
+#include
+
+/*
+ * This specifies the filesystem used to mount the root.
+ * This specification should be done by /etc/config.
+ * It is only defined when the kernel is built with FFS.
+ */
+int (*mountroot)() = ffs_mountroot;
+#endif
+
+/*
+ * These define the root filesystem and device.
+ */
+struct mount *rootfs;
+struct vnode *rootvnode;
+
+/*
+ * Set up the filesystem operations for vnodes.
+ * The types are defined in mount.h.
+ *
+ * Each XXX_VFSOPS macro below expands to the address of that
+ * filesystem's vfsops when the matching kernel option is defined and to
+ * NULL otherwise, so every mount type keeps a fixed slot in vfssw[]
+ * whether or not it is configured.
+ */
+#ifdef FFS
+extern	struct vfsops ufs_vfsops;
+#define	UFS_VFSOPS	&ufs_vfsops
+#else
+#define	UFS_VFSOPS	NULL
+#endif
+
+#ifdef LFS
+extern	struct vfsops lfs_vfsops;
+#define	LFS_VFSOPS	&lfs_vfsops
+#else
+#define	LFS_VFSOPS	NULL
+#endif
+
+#ifdef MFS
+extern	struct vfsops mfs_vfsops;
+#define	MFS_VFSOPS	&mfs_vfsops
+#else
+#define	MFS_VFSOPS	NULL
+#endif
+
+#ifdef NFS
+extern	struct vfsops nfs_vfsops;
+#define	NFS_VFSOPS	&nfs_vfsops
+#else
+#define	NFS_VFSOPS	NULL
+#endif
+
+#ifdef FDESC
+extern	struct vfsops fdesc_vfsops;
+#define	FDESC_VFSOPS	&fdesc_vfsops
+#else
+#define	FDESC_VFSOPS	NULL
+#endif
+
+#ifdef PORTAL
+extern	struct vfsops portal_vfsops;
+#define	PORTAL_VFSOPS	&portal_vfsops
+#else
+#define	PORTAL_VFSOPS	NULL
+#endif
+
+#ifdef NULLFS
+extern	struct vfsops null_vfsops;
+#define	NULL_VFSOPS	&null_vfsops
+#else
+#define	NULL_VFSOPS	NULL
+#endif
+
+#ifdef UMAPFS
+extern	struct vfsops umap_vfsops;
+#define	UMAP_VFSOPS	&umap_vfsops
+#else
+#define	UMAP_VFSOPS	NULL
+#endif
+
+#ifdef KERNFS
+extern	struct vfsops kernfs_vfsops;
+#define	KERNFS_VFSOPS	&kernfs_vfsops
+#else
+#define	KERNFS_VFSOPS	NULL
+#endif
+
+#ifdef PROCFS
+extern	struct vfsops procfs_vfsops;
+#define	PROCFS_VFSOPS	&procfs_vfsops
+#else
+#define	PROCFS_VFSOPS	NULL
+#endif
+
+#ifdef AFS
+extern	struct vfsops afs_vfsops;
+#define	AFS_VFSOPS	&afs_vfsops
+#else
+#define	AFS_VFSOPS	NULL
+#endif
+
+#ifdef CD9660
+extern	struct vfsops cd9660_vfsops;
+#define	CD9660_VFSOPS	&cd9660_vfsops
+#else
+#define	CD9660_VFSOPS	NULL
+#endif
+
+#ifdef UNION
+extern	struct vfsops union_vfsops;
+#define	UNION_VFSOPS	&union_vfsops
+#else
+#define	UNION_VFSOPS	NULL
+#endif
+
+struct vfsops *vfssw[] = {
+	NULL,			/* 0 = MOUNT_NONE */
+	UFS_VFSOPS,		/* 1 = MOUNT_UFS */
+	NFS_VFSOPS,		/* 2 = MOUNT_NFS */
+	MFS_VFSOPS,		/* 3 = MOUNT_MFS */
+	NULL,			/* 4 = MOUNT_PC */
+	LFS_VFSOPS,		/* 5 = MOUNT_LFS */
+	NULL,			/* 6 = MOUNT_LOFS */
+	FDESC_VFSOPS,		/* 7 = MOUNT_FDESC */
+	PORTAL_VFSOPS,		/* 8 = MOUNT_PORTAL */
+	NULL_VFSOPS,		/* 9 = MOUNT_NULL */
+	UMAP_VFSOPS,		/* 10 = MOUNT_UMAP */
+	KERNFS_VFSOPS,		/* 11 = MOUNT_KERNFS */
+	PROCFS_VFSOPS,		/* 12 = MOUNT_PROCFS */
+	AFS_VFSOPS,		/* 13 = MOUNT_AFS */
+	CD9660_VFSOPS,		/* 14 = MOUNT_CD9660 */
+	UNION_VFSOPS,		/* 15 = MOUNT_UNION */
+	0			/* trailing null entry */
+};
+
+
+/*
+ *
+ * vfs_opv_descs enumerates the list of vnode classes, each with its own
+ * vnode operation vector.  It is consulted at system boot to build operation
+ * vectors.  It is NULL terminated.
+ *
+ * NOTE(review): the ffs, dead and spec entries in the table are listed
+ * unconditionally, while every other filesystem's entries are guarded
+ * by its kernel option.
+ *
+ */
+extern struct vnodeopv_desc ffs_vnodeop_opv_desc;
+extern struct vnodeopv_desc ffs_specop_opv_desc;
+extern struct vnodeopv_desc ffs_fifoop_opv_desc;
+extern struct vnodeopv_desc lfs_vnodeop_opv_desc;
+extern struct vnodeopv_desc lfs_specop_opv_desc;
+extern struct vnodeopv_desc lfs_fifoop_opv_desc;
+extern struct vnodeopv_desc mfs_vnodeop_opv_desc;
+extern struct vnodeopv_desc dead_vnodeop_opv_desc;
+extern struct vnodeopv_desc fifo_vnodeop_opv_desc;
+extern struct vnodeopv_desc spec_vnodeop_opv_desc;
+extern struct vnodeopv_desc nfsv2_vnodeop_opv_desc;
+extern struct vnodeopv_desc spec_nfsv2nodeop_opv_desc;
+extern struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc;
+extern struct vnodeopv_desc fdesc_vnodeop_opv_desc;
+extern struct vnodeopv_desc portal_vnodeop_opv_desc;
+extern struct vnodeopv_desc null_vnodeop_opv_desc;
+extern struct vnodeopv_desc umap_vnodeop_opv_desc;
+extern struct vnodeopv_desc kernfs_vnodeop_opv_desc;
+extern struct vnodeopv_desc procfs_vnodeop_opv_desc;
+extern struct vnodeopv_desc cd9660_vnodeop_opv_desc;
+extern struct vnodeopv_desc cd9660_specop_opv_desc;
+extern struct vnodeopv_desc cd9660_fifoop_opv_desc;
+extern struct vnodeopv_desc union_vnodeop_opv_desc;
+
+struct vnodeopv_desc *vfs_opv_descs[] = {
+	&ffs_vnodeop_opv_desc,
+	&ffs_specop_opv_desc,
+#ifdef FIFO
+	&ffs_fifoop_opv_desc,
+#endif
+	&dead_vnodeop_opv_desc,
+#ifdef FIFO
+	&fifo_vnodeop_opv_desc,
+#endif
+	&spec_vnodeop_opv_desc,
+#ifdef LFS
+	&lfs_vnodeop_opv_desc,
+	&lfs_specop_opv_desc,
+#ifdef FIFO
+	&lfs_fifoop_opv_desc,
+#endif
+#endif
+#ifdef MFS
+	&mfs_vnodeop_opv_desc,
+#endif
+#ifdef NFS
+	&nfsv2_vnodeop_opv_desc,
+	&spec_nfsv2nodeop_opv_desc,
+#ifdef FIFO
+	&fifo_nfsv2nodeop_opv_desc,
+#endif
+#endif
+#ifdef FDESC
+	&fdesc_vnodeop_opv_desc,
+#endif
+#ifdef PORTAL
+	&portal_vnodeop_opv_desc,
+#endif
+#ifdef NULLFS
+	&null_vnodeop_opv_desc,
+#endif
+#ifdef UMAPFS
+	&umap_vnodeop_opv_desc,
+#endif
+#ifdef KERNFS
+	&kernfs_vnodeop_opv_desc,
+#endif
+#ifdef PROCFS
+	&procfs_vnodeop_opv_desc,
+#endif
+#ifdef CD9660
+	&cd9660_vnodeop_opv_desc,
+	&cd9660_specop_opv_desc,
+#ifdef FIFO
+	&cd9660_fifoop_opv_desc,
+#endif
+#endif
+#ifdef UNION
+	&union_vnodeop_opv_desc,
+#endif
+	NULL
+};
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
new file mode 100644
index 00000000000..9891fe61c19
--- /dev/null
+++ b/sys/kern/vfs_subr.c
@@ -0,0 +1,1322 @@
+/*
+ * Copyright (c) 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3.
All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
+ */
+
+/*
+ * External virtual filesystem routines
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+enum vtype iftovt_tab[16] = {
+	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
+	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
+};	/* presumably indexed by the file-type bits of a mode word -- TODO confirm against its users */
+int	vttoif_tab[9] = {
+	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
+	S_IFSOCK, S_IFIFO, S_IFMT,
+};	/* presumably indexed by enum vtype, giving the S_IF* bits -- TODO confirm against its users */
+
+/*
+ * Insq/Remq for the vnode usage lists.
+ */ +#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) +#define bufremvn(bp) { \ + LIST_REMOVE(bp, b_vnbufs); \ + (bp)->b_vnbufs.le_next = NOLIST; \ +} + +TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ +struct mntlist mountlist; /* mounted filesystem list */ + +/* + * Initialize the vnode management data structures. + */ +vntblinit() +{ + + TAILQ_INIT(&vnode_free_list); + TAILQ_INIT(&mountlist); +} + +/* + * Lock a filesystem. + * Used to prevent access to it while mounting and unmounting. + */ +vfs_lock(mp) + register struct mount *mp; +{ + + while(mp->mnt_flag & MNT_MLOCK) { + mp->mnt_flag |= MNT_MWAIT; + sleep((caddr_t)mp, PVFS); + } + mp->mnt_flag |= MNT_MLOCK; + return (0); +} + +/* + * Unlock a locked filesystem. + * Panic if filesystem is not locked. + */ +void +vfs_unlock(mp) + register struct mount *mp; +{ + + if ((mp->mnt_flag & MNT_MLOCK) == 0) + panic("vfs_unlock: not locked"); + mp->mnt_flag &= ~MNT_MLOCK; + if (mp->mnt_flag & MNT_MWAIT) { + mp->mnt_flag &= ~MNT_MWAIT; + wakeup((caddr_t)mp); + } +} + +/* + * Mark a mount point as busy. + * Used to synchronize access and to delay unmounting. + */ +vfs_busy(mp) + register struct mount *mp; +{ + + while(mp->mnt_flag & MNT_MPBUSY) { + mp->mnt_flag |= MNT_MPWANT; + sleep((caddr_t)&mp->mnt_flag, PVFS); + } + if (mp->mnt_flag & MNT_UNMOUNT) + return (1); + mp->mnt_flag |= MNT_MPBUSY; + return (0); +} + +/* + * Free a busy filesystem. + * Panic if filesystem is not busy. + */ +vfs_unbusy(mp) + register struct mount *mp; +{ + + if ((mp->mnt_flag & MNT_MPBUSY) == 0) + panic("vfs_unbusy: not busy"); + mp->mnt_flag &= ~MNT_MPBUSY; + if (mp->mnt_flag & MNT_MPWANT) { + mp->mnt_flag &= ~MNT_MPWANT; + wakeup((caddr_t)&mp->mnt_flag); + } +} + +/* + * Lookup a mount point by filesystem identifier. 
+ */
+struct mount *
+getvfs(fsid)
+	fsid_t *fsid;
+{
+	register struct mount *mp;
+
+	for (mp = mountlist.tqh_first; mp != NULL; mp = mp->mnt_list.tqe_next) {
+		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
+		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
+			return (mp);	/* both words of the fsid match */
+	}
+	return ((struct mount *)0);	/* no mounted filesystem has this fsid */
+}
+
+/*
+ * Get a new unique fsid
+ *
+ * val[1] is set to mtype.  val[0] starts from
+ * makedev(nblkdev + mtype, xxxfs_mntid) and is incremented, together
+ * with the counter, until getvfs() finds no mounted filesystem using it.
+ */
+void
+getnewfsid(mp, mtype)
+	struct mount *mp;
+	int mtype;
+{
+static u_short xxxfs_mntid;	/* persists across calls; the value 0 is skipped */
+
+	fsid_t tfsid;
+
+	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);	/* provisional; overwritten below */
+	mp->mnt_stat.f_fsid.val[1] = mtype;
+	if (xxxfs_mntid == 0)
+		++xxxfs_mntid;
+	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
+	tfsid.val[1] = mtype;
+	if (mountlist.tqh_first != NULL) {
+		while (getvfs(&tfsid)) {
+			tfsid.val[0]++;
+			xxxfs_mntid++;
+		}
+	}
+	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
+}
+
+/*
+ * Set vnode attributes to VNOVAL
+ *
+ * va_type is set to VNON and va_vaflags to 0; the remaining fields named
+ * below get VNOVAL.
+ * NOTE(review): the long chained assignment converts VNOVAL through the
+ * type of each field in turn, right to left, so splitting or reordering
+ * it can change the values stored; leave it as written.
+ */
+void vattr_null(vap)
+	register struct vattr *vap;
+{
+
+	vap->va_type = VNON;
+	vap->va_size = vap->va_bytes = VNOVAL;
+	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
+		vap->va_fsid = vap->va_fileid =
+		vap->va_blocksize = vap->va_rdev =
+		vap->va_atime.ts_sec = vap->va_atime.ts_nsec =
+		vap->va_mtime.ts_sec = vap->va_mtime.ts_nsec =
+		vap->va_ctime.ts_sec = vap->va_ctime.ts_nsec =
+		vap->va_flags = vap->va_gen = VNOVAL;
+	vap->va_vaflags = 0;
+}
+
+/*
+ * Routines having to do with the management of the vnode table.
+ */
+extern int (**dead_vnodeop_p)();
+extern void vclean();
+long numvnodes;
+extern struct vattr va_null;
+
+/*
+ * Return the next vnode from the free list.
+ */
+getnewvnode(tag, mp, vops, vpp)
+	enum vtagtype tag;
+	struct mount *mp;
+	int (**vops)();
+	struct vnode **vpp;
+{
+	register struct vnode *vp;
+	int s;
+
+	/*
+	 * Allocate a fresh vnode while below desiredvnodes, or while the
+	 * free list is empty and fewer than twice desiredvnodes exist;
+	 * otherwise recycle the head of the free list.
+	 */
+	if ((vnode_free_list.tqh_first == NULL &&
+	     numvnodes < 2 * desiredvnodes) ||
+	    numvnodes < desiredvnodes) {
+		vp = (struct vnode *)malloc((u_long)sizeof *vp,
+		    M_VNODE, M_WAITOK);
+		bzero((char *)vp, sizeof *vp);
+		numvnodes++;
+	} else {
+		if ((vp = vnode_free_list.tqh_first) == NULL) {
+			tablefull("vnode");
+			*vpp = 0;
+			return (ENFILE);	/* nothing to recycle and at the limit */
+		}
+		if (vp->v_usecount)
+			panic("free vnode isn't");
+		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+		/* see comment on why 0xdeadb is set at end of vgone (below) */
+		vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
+		vp->v_lease = NULL;
+		if (vp->v_type != VBAD)
+			vgone(vp);	/* detach the vnode from its previous identity */
+#ifdef DIAGNOSTIC
+		if (vp->v_data)
+			panic("cleaned vnode isn't");
+		s = splbio();
+		if (vp->v_numoutput)
+			panic("Clean vnode has pending I/O's");
+		splx(s);
+#endif
+		vp->v_flag = 0;
+		vp->v_lastr = 0;
+		vp->v_ralen = 0;
+		vp->v_maxra = 0;
+		vp->v_lastw = 0;
+		vp->v_lasta = 0;
+		vp->v_cstart = 0;
+		vp->v_clen = 0;
+		vp->v_socket = 0;
+	}
+	/* Common setup for both a fresh and a recycled vnode. */
+	vp->v_type = VNON;
+	cache_purge(vp);
+	vp->v_tag = tag;
+	vp->v_op = vops;
+	insmntque(vp, mp);
+	*vpp = vp;
+	vp->v_usecount = 1;	/* handed to the caller with one reference */
+	vp->v_data = 0;
+	return (0);
+}
+
+/*
+ * Move a vnode from one mount queue to another.
+ * A NULL mp only removes the vnode from its current queue.
+ * NOTE(review): declared without a return type (implicit int) yet it
+ * returns no value on either path; callers must not use a result.
+ */
+insmntque(vp, mp)
+	register struct vnode *vp;
+	register struct mount *mp;
+{
+
+	/*
+	 * Delete from old mount point vnode list, if on one.
+	 */
+	if (vp->v_mount != NULL)
+		LIST_REMOVE(vp, v_mntvnodes);
+	/*
+	 * Insert into list of vnodes for the new mount point, if available.
+	 */
+	if ((vp->v_mount = mp) == NULL)
+		return;
+	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
+}
+
+/*
+ * Update outstanding I/O count and do wakeup if requested.
+ */ +vwakeup(bp) + register struct buf *bp; +{ + register struct vnode *vp; + + bp->b_flags &= ~B_WRITEINPROG; + if (vp = bp->b_vp) { + vp->v_numoutput--; + if (vp->v_numoutput < 0) + panic("vwakeup: neg numoutput"); + if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) { + if (vp->v_numoutput < 0) + panic("vwakeup: neg numoutput"); + vp->v_flag &= ~VBWAIT; + wakeup((caddr_t)&vp->v_numoutput); + } + } +} + +/* + * Flush out and invalidate all buffers associated with a vnode. + * Called with the underlying object locked. + */ +int +vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) + register struct vnode *vp; + int flags; + struct ucred *cred; + struct proc *p; + int slpflag, slptimeo; +{ + register struct buf *bp; + struct buf *nbp, *blist; + int s, error; + + if (flags & V_SAVE) { + if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) + return (error); + if (vp->v_dirtyblkhd.lh_first != NULL) + panic("vinvalbuf: dirty bufs"); + } + for (;;) { + if ((blist = vp->v_cleanblkhd.lh_first) && flags & V_SAVEMETA) + while (blist && blist->b_lblkno < 0) + blist = blist->b_vnbufs.le_next; + if (!blist && (blist = vp->v_dirtyblkhd.lh_first) && + (flags & V_SAVEMETA)) + while (blist && blist->b_lblkno < 0) + blist = blist->b_vnbufs.le_next; + if (!blist) + break; + + for (bp = blist; bp; bp = nbp) { + nbp = bp->b_vnbufs.le_next; + if (flags & V_SAVEMETA && bp->b_lblkno < 0) + continue; + s = splbio(); + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + error = tsleep((caddr_t)bp, + slpflag | (PRIBIO + 1), "vinvalbuf", + slptimeo); + splx(s); + if (error) + return (error); + break; + } + bremfree(bp); + bp->b_flags |= B_BUSY; + splx(s); + /* + * XXX Since there are no node locks for NFS, I believe + * there is a slight chance that a delayed write will + * occur while sleeping just above, so check for it. 
+ */ + if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) { + (void) VOP_BWRITE(bp); + break; + } + bp->b_flags |= B_INVAL; + brelse(bp); + } + } + if (!(flags & V_SAVEMETA) && + (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first)) + panic("vinvalbuf: flush failed"); + return (0); +} + +/* + * Associate a buffer with a vnode. + */ +bgetvp(vp, bp) + register struct vnode *vp; + register struct buf *bp; +{ + + if (bp->b_vp) + panic("bgetvp: not free"); + VHOLD(vp); + bp->b_vp = vp; + if (vp->v_type == VBLK || vp->v_type == VCHR) + bp->b_dev = vp->v_rdev; + else + bp->b_dev = NODEV; + /* + * Insert onto list for new vnode. + */ + bufinsvn(bp, &vp->v_cleanblkhd); +} + +/* + * Disassociate a buffer from a vnode. + */ +brelvp(bp) + register struct buf *bp; +{ + struct vnode *vp; + + if (bp->b_vp == (struct vnode *) 0) + panic("brelvp: NULL"); + /* + * Delete from old vnode list, if on one. + */ + if (bp->b_vnbufs.le_next != NOLIST) + bufremvn(bp); + vp = bp->b_vp; + bp->b_vp = (struct vnode *) 0; + HOLDRELE(vp); +} + +/* + * Reassign a buffer from one vnode to another. + * Used to assign file specific control information + * (indirect blocks) to the vnode to which they belong. + */ +reassignbuf(bp, newvp) + register struct buf *bp; + register struct vnode *newvp; +{ + register struct buflists *listheadp; + + if (newvp == NULL) { + printf("reassignbuf: NULL"); + return; + } + /* + * Delete from old vnode list, if on one. + */ + if (bp->b_vnbufs.le_next != NOLIST) + bufremvn(bp); + /* + * If dirty, put on list of dirty buffers; + * otherwise insert onto list of clean buffers. + */ + if (bp->b_flags & B_DELWRI) + listheadp = &newvp->v_dirtyblkhd; + else + listheadp = &newvp->v_cleanblkhd; + bufinsvn(bp, listheadp); +} + +/* + * Create a vnode for a block device. + * Used for root filesystem, argdev, and swap areas. + * Also used for memory file system special devices. 
+ */
+bdevvp(dev, vpp)
+	dev_t dev;
+	struct vnode **vpp;
+{
+	register struct vnode *vp;
+	struct vnode *nvp;
+	int error;
+
+	if (dev == NODEV)
+		return (0);		/* NOTE: *vpp is left untouched here */
+	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
+	if (error) {
+		*vpp = 0;
+		return (error);
+	}
+	vp = nvp;
+	vp->v_type = VBLK;
+	if (nvp = checkalias(vp, dev, (struct mount *)0)) {
+		vput(vp);		/* an alias exists: drop the new vnode */
+		vp = nvp;
+	}
+	*vpp = vp;
+	return (0);
+}
+
+/*
+ * Check to see if the new vnode represents a special device
+ * for which we already have a vnode (either because of
+ * bdevvp() or because of a different vnode representing
+ * the same block device). If such an alias exists, deallocate
+ * the existing contents and return the aliased vnode. The
+ * caller is responsible for filling it with its new contents.
+ *
+ * Returns NULLVP when nvp is not a device vnode, or when nvp itself
+ * was entered on the special-device hash chain.
+ */
+struct vnode *
+checkalias(nvp, nvp_rdev, mp)
+	register struct vnode *nvp;
+	dev_t nvp_rdev;
+	struct mount *mp;
+{
+	register struct vnode *vp;
+	struct vnode **vpp;
+
+	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
+		return (NULLVP);
+
+	vpp = &speclisth[SPECHASH(nvp_rdev)];
+loop:
+	for (vp = *vpp; vp; vp = vp->v_specnext) {
+		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
+			continue;
+		/*
+		 * Alias, but not in use, so flush it out.
+		 */
+		if (vp->v_usecount == 0) {
+			vgone(vp);
+			goto loop;	/* chain was modified: rescan */
+		}
+		if (vget(vp, 1))
+			goto loop;	/* vget failed: rescan */
+		break;
+	}
+	if (vp == NULL || vp->v_tag != VT_NON) {
+		MALLOC(nvp->v_specinfo, struct specinfo *,
+			sizeof(struct specinfo), M_VNODE, M_WAITOK);
+		nvp->v_rdev = nvp_rdev;
+		nvp->v_hashchain = vpp;
+		nvp->v_specnext = *vpp;
+		nvp->v_specflags = 0;
+		*vpp = nvp;
+		if (vp != NULL) {
+			nvp->v_flag |= VALIASED;
+			vp->v_flag |= VALIASED;
+			vput(vp);	/* undo the vget above */
+		}
+		return (NULLVP);
+	}
+	VOP_UNLOCK(vp);
+	vclean(vp, 0);
+	vp->v_op = nvp->v_op;
+	vp->v_tag = nvp->v_tag;
+	nvp->v_type = VNON;
+	insmntque(vp, mp);
+	return (vp);
+}
+
+/*
+ * Grab a particular vnode from the free list, increment its
+ * reference count and lock it. The vnode lock bit is set if the
+ * vnode is being eliminated in vgone. The process is awakened
+ * when the transition is completed, and an error returned to
+ * indicate that the vnode is no longer usable (possibly having
+ * been changed to a new file system type).
+ *
+ * Returns 0 on success and 1 if the vnode was being cleaned.
+ * The vnode is locked only when lockflag is non-zero.
+ */
+vget(vp, lockflag)
+	register struct vnode *vp;
+	int lockflag;
+{
+
+	/*
+	 * If the vnode is in the process of being cleaned out for
+	 * another use, we wait for the cleaning to finish and then
+	 * return failure. Cleaning is determined either by checking
+	 * that the VXLOCK flag is set, or that the use count is
+	 * zero with the back pointer set to show that it has been
+	 * removed from the free list by getnewvnode. The VXLOCK
+	 * flag may not have been set yet because vclean is blocked in
+	 * the VOP_LOCK call waiting for the VOP_INACTIVE to complete.
+	 */
+	if ((vp->v_flag & VXLOCK) ||
+	    (vp->v_usecount == 0 &&
+	     vp->v_freelist.tqe_prev == (struct vnode **)0xdeadb)) {
+		vp->v_flag |= VXWANT;
+		sleep((caddr_t)vp, PINOD);
+		return (1);
+	}
+	if (vp->v_usecount == 0)	/* unreferenced, so on the free list */
+		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+	vp->v_usecount++;
+	if (lockflag)
+		VOP_LOCK(vp);
+	return (0);
+}
+
+/*
+ * Vnode reference, just increment the count
+ * Panics if the use count is not already positive.
+ */
+void vref(vp)
+	struct vnode *vp;
+{
+
+	if (vp->v_usecount <= 0)
+		panic("vref used where vget required");
+	vp->v_usecount++;
+}
+
+/*
+ * vput(), just unlock and vrele()
+ */
+void vput(vp)
+	register struct vnode *vp;
+{
+
+	VOP_UNLOCK(vp);
+	vrele(vp);
+}
+
+/*
+ * Vnode release.
+ * If count drops to zero, call inactive routine and return to freelist.
+ */
+void vrele(vp)
+	register struct vnode *vp;
+{
+
+#ifdef DIAGNOSTIC
+	if (vp == NULL)
+		panic("vrele: null vp");
+#endif
+	vp->v_usecount--;
+	if (vp->v_usecount > 0)
+		return;			/* other references remain */
+#ifdef DIAGNOSTIC
+	if (vp->v_usecount != 0 || vp->v_writecount != 0) {
+		vprint("vrele: bad ref count", vp);
+		panic("vrele: ref cnt");
+	}
+#endif
+	/*
+	 * insert at tail of LRU list
+	 */
+	TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+	VOP_INACTIVE(vp);
+}
+
+/*
+ * Page or buffer structure gets a reference.
+ */
+void vhold(vp)
+	register struct vnode *vp;
+{
+
+	vp->v_holdcnt++;
+}
+
+/*
+ * Page or buffer structure frees a reference.
+ * Panics if the hold count is not positive.
+ */
+void holdrele(vp)
+	register struct vnode *vp;
+{
+
+	if (vp->v_holdcnt <= 0)
+		panic("holdrele: holdcnt");
+	vp->v_holdcnt--;
+}
+
+/*
+ * Remove any vnodes in the vnode table belonging to mount point mp.
+ *
+ * If MNT_NOFORCE is specified, there should not be any active ones,
+ * return error if any are found (nb: this is a user error, not a
+ * system error). If MNT_FORCE is specified, detach any active vnodes
+ * that are found.
+ *
+ * The flags tested in the body are SKIPSYSTEM, WRITECLOSE and
+ * FORCECLOSE.  Returns EBUSY if any vnode was left in use, else 0.
+ * The mount point must already be marked MNT_MPBUSY.
+ */
+#ifdef DIAGNOSTIC
+int busyprt = 0;	/* print out busy vnodes */
+struct ctldebug debug1 = { "busyprt", &busyprt };
+#endif
+
+vflush(mp, skipvp, flags)
+	struct mount *mp;
+	struct vnode *skipvp;
+	int flags;
+{
+	register struct vnode *vp, *nvp;
+	int busy = 0;			/* count of vnodes still in use */
+
+	if ((mp->mnt_flag & MNT_MPBUSY) == 0)
+		panic("vflush: not busy");
+loop:
+	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
+		if (vp->v_mount != mp)
+			goto loop;	/* vnode moved off this mount: rescan */
+		nvp = vp->v_mntvnodes.le_next;
+		/*
+		 * Skip over a selected vnode.
+		 */
+		if (vp == skipvp)
+			continue;
+		/*
+		 * Skip over a vnodes marked VSYSTEM.
+		 */
+		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM))
+			continue;
+		/*
+		 * If WRITECLOSE is set, only flush out regular file
+		 * vnodes open for writing.
+		 */
+		if ((flags & WRITECLOSE) &&
+		    (vp->v_writecount == 0 || vp->v_type != VREG))
+			continue;
+		/*
+		 * With v_usecount == 0, all we need to do is clear
+		 * out the vnode data structures and we are done.
+		 */
+		if (vp->v_usecount == 0) {
+			vgone(vp);
+			continue;
+		}
+		/*
+		 * If FORCECLOSE is set, forcibly close the vnode.
+		 * For block or character devices, revert to an
+		 * anonymous device. For all other files, just kill them.
+		 */
+		if (flags & FORCECLOSE) {
+			if (vp->v_type != VBLK && vp->v_type != VCHR) {
+				vgone(vp);
+			} else {
+				vclean(vp, 0);
+				vp->v_op = spec_vnodeop_p;
+				insmntque(vp, (struct mount *)0);
+			}
+			continue;
+		}
+#ifdef DIAGNOSTIC
+		if (busyprt)
+			vprint("vflush: busy vnode", vp);
+#endif
+		busy++;
+	}
+	if (busy)
+		return (EBUSY);
+	return (0);
+}
+
+/*
+ * Disassociate the underlying file system from a vnode.
+ * With DOCLOSE in flags, buffers are flushed and an active vnode
+ * is closed first.  On return the vnode uses dead_vnodeop_p and
+ * has tag VT_NON.
+ */
+void
+vclean(vp, flags)
+	register struct vnode *vp;
+	int flags;
+{
+	int active;			/* use count on entry; non-zero if in use */
+
+	/*
+	 * Check to see if the vnode is in use.
+	 * If so we have to reference it before we clean it out
+	 * so that its count cannot fall to zero and generate a
+	 * race against ourselves to recycle it.
+	 */
+	if (active = vp->v_usecount)
+		VREF(vp);
+	/*
+	 * Even if the count is zero, the VOP_INACTIVE routine may still
+	 * have the object locked while it cleans it out. The VOP_LOCK
+	 * ensures that the VOP_INACTIVE routine is done with its work.
+	 * For active vnodes, it ensures that no other activity can
+	 * occur while the underlying object is being cleaned out.
+	 */
+	VOP_LOCK(vp);
+	/*
+	 * Prevent the vnode from being recycled or
+	 * brought into use while we clean it out.
+	 */
+	if (vp->v_flag & VXLOCK)
+		panic("vclean: deadlock");
+	vp->v_flag |= VXLOCK;
+	/*
+	 * Clean out any buffers associated with the vnode.
+	 */
+	if (flags & DOCLOSE)
+		vinvalbuf(vp, V_SAVE, NOCRED, NULL, 0, 0);
+	/*
+	 * Any other processes trying to obtain this lock must first
+	 * wait for VXLOCK to clear, then call the new lock operation.
+	 */
+	VOP_UNLOCK(vp);
+	/*
+	 * If purging an active vnode, it must be closed and
+	 * deactivated before being reclaimed.
+	 */
+	if (active) {
+		if (flags & DOCLOSE)
+			VOP_CLOSE(vp, IO_NDELAY, NOCRED, NULL);
+		VOP_INACTIVE(vp);
+	}
+	/*
+	 * Reclaim the vnode.
+	 */
+	if (VOP_RECLAIM(vp))
+		panic("vclean: cannot reclaim");
+	if (active)
+		vrele(vp);		/* drop the reference taken above */
+
+	/*
+	 * Done with purge, notify sleepers of the grim news.
+	 */
+	vp->v_op = dead_vnodeop_p;
+	vp->v_tag = VT_NON;
+	vp->v_flag &= ~VXLOCK;
+	if (vp->v_flag & VXWANT) {
+		vp->v_flag &= ~VXWANT;
+		wakeup((caddr_t)vp);
+	}
+}
+
+/*
+ * Eliminate all activity associated with the requested vnode
+ * and with all vnodes aliased to the requested vnode.
+ */
+void vgoneall(vp)
+	register struct vnode *vp;
+{
+	register struct vnode *vq;
+
+	if (vp->v_flag & VALIASED) {
+		/*
+		 * If a vgone (or vclean) is already in progress,
+		 * wait until it is done and return.
+		 */
+		if (vp->v_flag & VXLOCK) {
+			vp->v_flag |= VXWANT;
+			sleep((caddr_t)vp, PINOD);
+			return;
+		}
+		/*
+		 * Ensure that vp will not be vgone'd while we
+		 * are eliminating its aliases.
+		 */
+		vp->v_flag |= VXLOCK;
+		while (vp->v_flag & VALIASED) {
+			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
+				if (vq->v_rdev != vp->v_rdev ||
+				    vq->v_type != vp->v_type || vp == vq)
+					continue;
+				vgone(vq);
+				break;	/* one alias removed; rescan the chain */
+			}
+		}
+		/*
+		 * Remove the lock so that vgone below will
+		 * really eliminate the vnode after which time
+		 * vgone will awaken any sleepers.
+		 */
+		vp->v_flag &= ~VXLOCK;
+	}
+	vgone(vp);
+}
+
+/*
+ * Eliminate all activity associated with a vnode
+ * in preparation for reuse.
+ * The vnode's type is VBAD on return.
+ */
+void vgone(vp)
+	register struct vnode *vp;
+{
+	register struct vnode *vq;
+	struct vnode *vx;		/* first remaining alias found, if any */
+
+	/*
+	 * If a vgone (or vclean) is already in progress,
+	 * wait until it is done and return.
+	 */
+	if (vp->v_flag & VXLOCK) {
+		vp->v_flag |= VXWANT;
+		sleep((caddr_t)vp, PINOD);
+		return;
+	}
+	/*
+	 * Clean out the filesystem specific data.
+	 */
+	vclean(vp, DOCLOSE);
+	/*
+	 * Delete from old mount point vnode list, if on one.
+	 */
+	if (vp->v_mount != NULL) {
+		LIST_REMOVE(vp, v_mntvnodes);
+		vp->v_mount = NULL;
+	}
+	/*
+	 * If special device, remove it from special device alias list.
+	 */
+	if (vp->v_type == VBLK || vp->v_type == VCHR) {
+		if (*vp->v_hashchain == vp) {
+			*vp->v_hashchain = vp->v_specnext;
+		} else {
+			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
+				if (vq->v_specnext != vp)
+					continue;
+				vq->v_specnext = vp->v_specnext;
+				break;
+			}
+			if (vq == NULL)
+				panic("missing bdev");
+		}
+		if (vp->v_flag & VALIASED) {
+			vx = NULL;
+			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
+				if (vq->v_rdev != vp->v_rdev ||
+				    vq->v_type != vp->v_type)
+					continue;
+				if (vx)
+					break;	/* a second alias exists */
+				vx = vq;
+			}
+			if (vx == NULL)
+				panic("missing alias");
+			if (vq == NULL)	/* vx was the only alias left */
+				vx->v_flag &= ~VALIASED;
+			vp->v_flag &= ~VALIASED;
+		}
+		FREE(vp->v_specinfo, M_VNODE);
+		vp->v_specinfo = NULL;
+	}
+	/*
+	 * If it is on the freelist and not already at the head,
+	 * move it to the head of the list. The test of the back
+	 * pointer and the reference count of zero is because
+	 * it will be removed from the free list by getnewvnode,
+	 * but will not have its reference count incremented until
+	 * after calling vgone. If the reference count were
+	 * incremented first, vgone would (incorrectly) try to
+	 * close the previous instance of the underlying object.
+	 * So, the back pointer is explicitly set to `0xdeadb' in
+	 * getnewvnode after removing it from the freelist to ensure
+	 * that we do not try to move it here.
+	 */
+	if (vp->v_usecount == 0 &&
+	    vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb &&
+	    vnode_free_list.tqh_first != vp) {
+		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
+	}
+	vp->v_type = VBAD;
+}
+
+/*
+ * Lookup a vnode by device number.
+ */
+vfinddev(dev, type, vpp)
+	dev_t dev;
+	enum vtype type;
+	struct vnode **vpp;
+{
+	register struct vnode *vp;
+
+	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
+		if (dev != vp->v_rdev || type != vp->v_type)
+			continue;
+		*vpp = vp;
+		return (1);		/* found: *vpp is set */
+	}
+	return (0);			/* not found: *vpp is untouched */
+}
+
+/*
+ * Calculate the total number of references to a special device.
+ * For an unaliased vnode this is just its own use count; otherwise
+ * the use counts of every vnode on the hash chain with the same
+ * device and type are summed.
+ */
+vcount(vp)
+	register struct vnode *vp;
+{
+	register struct vnode *vq, *vnext;
+	int count;
+
+loop:
+	if ((vp->v_flag & VALIASED) == 0)
+		return (vp->v_usecount);
+	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
+		vnext = vq->v_specnext;
+		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
+			continue;
+		/*
+		 * Alias, but not in use, so flush it out.
+		 */
+		if (vq->v_usecount == 0 && vq != vp) {
+			vgone(vq);
+			goto loop;	/* chain changed: start over */
+		}
+		count += vq->v_usecount;
+	}
+	return (count);
+}
+
+/*
+ * Print out a description of a vnode.
+ * The typename table is indexed directly by v_type.
+ */
+static char *typename[] =
+   { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
+
+vprint(label, vp)
+	char *label;
+	register struct vnode *vp;
+{
+	char buf[64];			/* accumulates "|FLAG" names */
+
+	if (label != NULL)
+		printf("%s: ", label);
+	printf("type %s, usecount %d, writecount %d, refcount %d,",
+		typename[vp->v_type], vp->v_usecount, vp->v_writecount,
+		vp->v_holdcnt);
+	buf[0] = '\0';
+	if (vp->v_flag & VROOT)
+		strcat(buf, "|VROOT");
+	if (vp->v_flag & VTEXT)
+		strcat(buf, "|VTEXT");
+	if (vp->v_flag & VSYSTEM)
+		strcat(buf, "|VSYSTEM");
+	if (vp->v_flag & VXLOCK)
+		strcat(buf, "|VXLOCK");
+	if (vp->v_flag & VXWANT)
+		strcat(buf, "|VXWANT");
+	if (vp->v_flag & VBWAIT)
+		strcat(buf, "|VBWAIT");
+	if (vp->v_flag & VALIASED)
+		strcat(buf, "|VALIASED");
+	if (buf[0] != '\0')
+		printf(" flags (%s)", &buf[1]);	/* &buf[1] skips the leading '|' */
+	if (vp->v_data == NULL) {
+		printf("\n");
+	} else {
+		printf("\n\t");
+		VOP_PRINT(vp);
+	}
+}
+
+#ifdef DEBUG
+/*
+ * List all of the locked vnodes in the system.
+ * Called when debugging the kernel.
+ */ +printlockedvnodes() +{ + register struct mount *mp; + register struct vnode *vp; + + printf("Locked vnodes\n"); + for (mp = mountlist.tqh_first; mp != NULL; mp = mp->mnt_list.tqe_next) { + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = vp->v_mntvnodes.le_next) + if (VOP_ISLOCKED(vp)) + vprint((char *)0, vp); + } +} +#endif + +int kinfo_vdebug = 1; +int kinfo_vgetfailed; +#define KINFO_VNODESLOP 10 +/* + * Dump vnode list (via sysctl). + * Copyout address of vnode followed by vnode. + */ +/* ARGSUSED */ +sysctl_vnode(where, sizep) + char *where; + size_t *sizep; +{ + register struct mount *mp, *nmp; + struct vnode *vp; + register char *bp = where, *savebp; + char *ewhere; + int error; + +#define VPTRSZ sizeof (struct vnode *) +#define VNODESZ sizeof (struct vnode) + if (where == NULL) { + *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ); + return (0); + } + ewhere = where + *sizep; + + for (mp = mountlist.tqh_first; mp != NULL; mp = nmp) { + nmp = mp->mnt_list.tqe_next; + if (vfs_busy(mp)) + continue; + savebp = bp; +again: + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = vp->v_mntvnodes.le_next) { + /* + * Check that the vp is still associated with + * this filesystem. RACE: could have been + * recycled onto the same filesystem. + */ + if (vp->v_mount != mp) { + if (kinfo_vdebug) + printf("kinfo: vp changed\n"); + bp = savebp; + goto again; + } + if (bp + VPTRSZ + VNODESZ > ewhere) { + *sizep = bp - where; + return (ENOMEM); + } + if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) || + (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ))) + return (error); + bp += VPTRSZ + VNODESZ; + } + vfs_unbusy(mp); + } + + *sizep = bp - where; + return (0); +} + +/* + * Check to see if a filesystem is mounted on a block device. 
+ */ +int +vfs_mountedon(vp) + register struct vnode *vp; +{ + register struct vnode *vq; + + if (vp->v_specflags & SI_MOUNTEDON) + return (EBUSY); + if (vp->v_flag & VALIASED) { + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type) + continue; + if (vq->v_specflags & SI_MOUNTEDON) + return (EBUSY); + } + } + return (0); +} + +/* + * Build hash lists of net addresses and hang them off the mount point. + * Called by ufs_mount() to set up the lists of export addresses. + */ +static int +vfs_hang_addrlist(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + register struct netcred *np; + register struct radix_node_head *rnh; + register int i; + struct radix_node *rn; + struct sockaddr *saddr, *smask = 0; + struct domain *dom; + int error; + + if (argp->ex_addrlen == 0) { + if (mp->mnt_flag & MNT_DEFEXPORTED) + return (EPERM); + np = &nep->ne_defexported; + np->netc_exflags = argp->ex_flags; + np->netc_anon = argp->ex_anon; + np->netc_anon.cr_ref = 1; + mp->mnt_flag |= MNT_DEFEXPORTED; + return (0); + } + i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; + np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK); + bzero((caddr_t)np, i); + saddr = (struct sockaddr *)(np + 1); + if (error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen)) + goto out; + if (saddr->sa_len > argp->ex_addrlen) + saddr->sa_len = argp->ex_addrlen; + if (argp->ex_masklen) { + smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen); + error = copyin(argp->ex_addr, (caddr_t)smask, argp->ex_masklen); + if (error) + goto out; + if (smask->sa_len > argp->ex_masklen) + smask->sa_len = argp->ex_masklen; + } + i = saddr->sa_family; + if ((rnh = nep->ne_rtable[i]) == 0) { + /* + * Seems silly to initialize every AF when most are not + * used, do so on demand here + */ + for (dom = domains; dom; dom = dom->dom_next) + if (dom->dom_family == i && dom->dom_rtattach) { + 
dom->dom_rtattach((void **)&nep->ne_rtable[i], + dom->dom_rtoffset); + break; + } + if ((rnh = nep->ne_rtable[i]) == 0) { + error = ENOBUFS; + goto out; + } + } + rn = (*rnh->rnh_addaddr)((caddr_t)saddr, (caddr_t)smask, rnh, + np->netc_rnodes); + if (rn == 0 || np != (struct netcred *)rn) { /* already exists */ + error = EPERM; + goto out; + } + np->netc_exflags = argp->ex_flags; + np->netc_anon = argp->ex_anon; + np->netc_anon.cr_ref = 1; + return (0); +out: + free(np, M_NETADDR); + return (error); +} + +/* ARGSUSED */ +static int +vfs_free_netcred(rn, w) + struct radix_node *rn; + caddr_t w; +{ + register struct radix_node_head *rnh = (struct radix_node_head *)w; + + (*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh); + free((caddr_t)rn, M_NETADDR); + return (0); +} + +/* + * Free the net address hash lists that are hanging off the mount points. + */ +static void +vfs_free_addrlist(nep) + struct netexport *nep; +{ + register int i; + register struct radix_node_head *rnh; + + for (i = 0; i <= AF_MAX; i++) + if (rnh = nep->ne_rtable[i]) { + (*rnh->rnh_walktree)(rnh, vfs_free_netcred, + (caddr_t)rnh); + free((caddr_t)rnh, M_RTABLE); + nep->ne_rtable[i] = 0; + } +} + +int +vfs_export(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + int error; + + if (argp->ex_flags & MNT_DELEXPORT) { + vfs_free_addrlist(nep); + mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); + } + if (argp->ex_flags & MNT_EXPORTED) { + if (error = vfs_hang_addrlist(mp, nep, argp)) + return (error); + mp->mnt_flag |= MNT_EXPORTED; + } + return (0); +} + +struct netcred * +vfs_export_lookup(mp, nep, nam) + register struct mount *mp; + struct netexport *nep; + struct mbuf *nam; +{ + register struct netcred *np; + register struct radix_node_head *rnh; + struct sockaddr *saddr; + + np = NULL; + if (mp->mnt_flag & MNT_EXPORTED) { + /* + * Lookup in the export list first. 
+ */ + if (nam != NULL) { + saddr = mtod(nam, struct sockaddr *); + rnh = nep->ne_rtable[saddr->sa_family]; + if (rnh != NULL) { + np = (struct netcred *) + (*rnh->rnh_matchaddr)((caddr_t)saddr, + rnh); + if (np && np->netc_rnodes->rn_flags & RNF_ROOT) + np = NULL; + } + } + /* + * If no address match, use the default if it exists. + */ + if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) + np = &nep->ne_defexported; + } + return (np); +} diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c new file mode 100644 index 00000000000..345c7a79bf2 --- /dev/null +++ b/sys/kern/vfs_syscalls.c @@ -0,0 +1,2107 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static int change_dir __P((struct nameidata *ndp, struct proc *p)); + +/* + * Virtual File System System Calls + */ + +/* + * Mount a file system. + */ +struct mount_args { + int type; + char *path; + int flags; + caddr_t data; +}; +/* ARGSUSED */ +mount(p, uap, retval) + struct proc *p; + register struct mount_args *uap; + int *retval; +{ + register struct vnode *vp; + register struct mount *mp; + int error, flag; + struct nameidata nd; + + /* + * Must be super user + */ + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + /* + * Get vnode to be covered + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (uap->flags & MNT_UPDATE) { + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + mp = vp->v_mount; + flag = mp->mnt_flag; + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only. 
+ */ + if ((uap->flags & MNT_RELOAD) && + ((mp->mnt_flag & MNT_RDONLY) == 0)) { + vput(vp); + return (EOPNOTSUPP); /* Needs translation */ + } + mp->mnt_flag |= + uap->flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE); + VOP_UNLOCK(vp); + goto update; + } + if (error = vinvalbuf(vp, V_SAVE, p->p_ucred, p, 0, 0)) + return (error); + if (vp->v_type != VDIR) { + vput(vp); + return (ENOTDIR); + } + if ((u_long)uap->type > MOUNT_MAXTYPE || vfssw[uap->type] == NULL) { + vput(vp); + return (ENODEV); + } + + /* + * Allocate and initialize the file system. + */ + mp = (struct mount *)malloc((u_long)sizeof(struct mount), + M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + mp->mnt_op = vfssw[uap->type]; + if (error = vfs_lock(mp)) { + free((caddr_t)mp, M_MOUNT); + vput(vp); + return (error); + } + if (vp->v_mountedhere != NULL) { + vfs_unlock(mp); + free((caddr_t)mp, M_MOUNT); + vput(vp); + return (EBUSY); + } + vp->v_mountedhere = mp; + mp->mnt_vnodecovered = vp; +update: + /* + * Set the mount level flags. + */ + if (uap->flags & MNT_RDONLY) + mp->mnt_flag |= MNT_RDONLY; + else if (mp->mnt_flag & MNT_RDONLY) + mp->mnt_flag |= MNT_WANTRDWR; + mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | + MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC); + mp->mnt_flag |= uap->flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | + MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC); + /* + * Mount the filesystem. + */ + error = VFS_MOUNT(mp, uap->path, uap->data, &nd, p); + if (mp->mnt_flag & MNT_UPDATE) { + vrele(vp); + if (mp->mnt_flag & MNT_WANTRDWR) + mp->mnt_flag &= ~MNT_RDONLY; + mp->mnt_flag &=~ + (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_WANTRDWR); + if (error) + mp->mnt_flag = flag; + return (error); + } + /* + * Put the new filesystem on the mount list after root. 
+ */ + cache_purge(vp); + if (!error) { + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + VOP_UNLOCK(vp); + vfs_unlock(mp); + error = VFS_START(mp, 0, p); + } else { + mp->mnt_vnodecovered->v_mountedhere = (struct mount *)0; + vfs_unlock(mp); + free((caddr_t)mp, M_MOUNT); + vput(vp); + } + return (error); +} + +/* + * Unmount a file system. + * + * Note: unmount takes a path to the vnode mounted on as argument, + * not special file (as before). + */ +struct unmount_args { + char *path; + int flags; +}; +/* ARGSUSED */ +unmount(p, uap, retval) + struct proc *p; + register struct unmount_args *uap; + int *retval; +{ + register struct vnode *vp; + struct mount *mp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + + /* + * Unless this is a user mount, then must + * have suser privilege. + */ + if (((vp->v_mount->mnt_flag & MNT_USER) == 0) && + (error = suser(p->p_ucred, &p->p_acflag))) { + vput(vp); + return (error); + } + + /* + * Must be the root of the filesystem + */ + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + mp = vp->v_mount; + vput(vp); + return (dounmount(mp, uap->flags, p)); +} + +/* + * Do the actual file system unmount. 
+ */ +dounmount(mp, flags, p) + register struct mount *mp; + int flags; + struct proc *p; +{ + struct vnode *coveredvp; + int error; + + coveredvp = mp->mnt_vnodecovered; + if (vfs_busy(mp)) + return (EBUSY); + mp->mnt_flag |= MNT_UNMOUNT; + if (error = vfs_lock(mp)) + return (error); + + mp->mnt_flag &=~ MNT_ASYNC; + vnode_pager_umount(mp); /* release cached vnodes */ + cache_purgevfs(mp); /* remove cache entries for this file sys */ + if ((error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p)) == 0 || + (flags & MNT_FORCE)) + error = VFS_UNMOUNT(mp, flags, p); + mp->mnt_flag &= ~MNT_UNMOUNT; + vfs_unbusy(mp); + if (error) { + vfs_unlock(mp); + } else { + vrele(coveredvp); + TAILQ_REMOVE(&mountlist, mp, mnt_list); + mp->mnt_vnodecovered->v_mountedhere = (struct mount *)0; + vfs_unlock(mp); + if (mp->mnt_vnodelist.lh_first != NULL) + panic("unmount: dangling vnode"); + free((caddr_t)mp, M_MOUNT); + } + return (error); +} + +/* + * Sync each mounted filesystem. + */ +#ifdef DIAGNOSTIC +int syncprt = 0; +struct ctldebug debug0 = { "syncprt", &syncprt }; +#endif + +struct sync_args { + int dummy; +}; +/* ARGSUSED */ +sync(p, uap, retval) + struct proc *p; + struct sync_args *uap; + int *retval; +{ + register struct mount *mp, *nmp; + int asyncflag; + + for (mp = mountlist.tqh_first; mp != NULL; mp = nmp) { + nmp = mp->mnt_list.tqe_next; + /* + * The lock check below is to avoid races with mount + * and unmount. + */ + if ((mp->mnt_flag & (MNT_MLOCK|MNT_RDONLY|MNT_MPBUSY)) == 0 && + !vfs_busy(mp)) { + asyncflag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &= ~MNT_ASYNC; + VFS_SYNC(mp, MNT_NOWAIT, p->p_ucred, p); + if (asyncflag) + mp->mnt_flag |= MNT_ASYNC; + vfs_unbusy(mp); + } + } +#ifdef DIAGNOSTIC + if (syncprt) + vfs_bufstats(); +#endif /* DIAGNOSTIC */ + return (0); +} + +/* + * Change filesystem quotas. 
+ */ +struct quotactl_args { + char *path; + int cmd; + int uid; + caddr_t arg; +}; +/* ARGSUSED */ +quotactl(p, uap, retval) + struct proc *p; + register struct quotactl_args *uap; + int *retval; +{ + register struct mount *mp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + mp = nd.ni_vp->v_mount; + vrele(nd.ni_vp); + return (VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg, p)); +} + +/* + * Get filesystem statistics. + */ +struct statfs_args { + char *path; + struct statfs *buf; +}; +/* ARGSUSED */ +statfs(p, uap, retval) + struct proc *p; + register struct statfs_args *uap; + int *retval; +{ + register struct mount *mp; + register struct statfs *sp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p); + if (error = namei(&nd)) + return (error); + mp = nd.ni_vp->v_mount; + sp = &mp->mnt_stat; + vrele(nd.ni_vp); + if (error = VFS_STATFS(mp, sp, p)) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + return (copyout((caddr_t)sp, (caddr_t)uap->buf, sizeof(*sp))); +} + +/* + * Get filesystem statistics. + */ +struct fstatfs_args { + int fd; + struct statfs *buf; +}; +/* ARGSUSED */ +fstatfs(p, uap, retval) + struct proc *p; + register struct fstatfs_args *uap; + int *retval; +{ + struct file *fp; + struct mount *mp; + register struct statfs *sp; + int error; + + if (error = getvnode(p->p_fd, uap->fd, &fp)) + return (error); + mp = ((struct vnode *)fp->f_data)->v_mount; + sp = &mp->mnt_stat; + if (error = VFS_STATFS(mp, sp, p)) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + return (copyout((caddr_t)sp, (caddr_t)uap->buf, sizeof(*sp))); +} + +/* + * Get statistics on all filesystems. 
+ *
+ * Copies one struct statfs per mounted filesystem into uap->buf, up
+ * to the number that fit in uap->bufsize, and returns in *retval the
+ * number of filesystems (capped at that maximum when a buffer was
+ * supplied).  A NULL buffer just counts the mounts.
+ */
+struct getfsstat_args {
+	struct statfs *buf;
+	long bufsize;
+	int flags;
+};
+getfsstat(p, uap, retval)
+	struct proc *p;
+	register struct getfsstat_args *uap;
+	int *retval;
+{
+	struct mount *mp, *nextmp;
+	struct statfs *stp;
+	caddr_t outp;
+	long count, maxcount, error;
+	int refresh;
+
+	outp = (caddr_t)uap->buf;
+	maxcount = uap->bufsize / sizeof(struct statfs);
+	count = 0;
+	for (mp = mountlist.tqh_first; mp != NULL; mp = nextmp) {
+		nextmp = mp->mnt_list.tqe_next;
+		if (outp && count < maxcount &&
+		    (mp->mnt_flag & MNT_MLOCK) == 0) {
+			stp = &mp->mnt_stat;
+			/*
+			 * The cached statistics are refreshed unless the
+			 * caller asked for MNT_NOWAIT without MNT_WAIT.
+			 * A filesystem whose refresh fails is skipped
+			 * and not counted.
+			 */
+			refresh = (uap->flags & MNT_NOWAIT) == 0 ||
+			    (uap->flags & MNT_WAIT);
+			if (refresh && (error = VFS_STATFS(mp, stp, p)))
+				continue;
+			stp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+			error = copyout((caddr_t)stp, outp, sizeof(*stp));
+			if (error)
+				return (error);
+			outp += sizeof(*stp);
+		}
+		count++;
+	}
+	if (outp && count > maxcount)
+		*retval = maxcount;
+	else
+		*retval = count;
+	return (0);
+}
+
+/*
+ * Change current working directory to a given file descriptor.
+ *
+ * The descriptor must refer to a directory the caller may search.
+ */
+struct fchdir_args {
+	int	fd;
+};
+/* ARGSUSED */
+fchdir(p, uap, retval)
+	struct proc *p;
+	struct fchdir_args *uap;
+	int *retval;
+{
+	struct filedesc *fdp;
+	struct vnode *dirvp;
+	struct file *fp;
+	int error;
+
+	fdp = p->p_fd;
+	error = getvnode(fdp, uap->fd, &fp);
+	if (error != 0)
+		return (error);
+	dirvp = (struct vnode *)fp->f_data;
+	VOP_LOCK(dirvp);
+	if (dirvp->v_type == VDIR)
+		error = VOP_ACCESS(dirvp, VEXEC, p->p_ucred, p);
+	else
+		error = ENOTDIR;
+	VOP_UNLOCK(dirvp);
+	if (error != 0)
+		return (error);
+	/* Take the new reference before dropping the old directory. */
+	VREF(dirvp);
+	vrele(fdp->fd_cdir);
+	fdp->fd_cdir = dirvp;
+	return (0);
+}
+
+/*
+ * Change current working directory (``.'').
+ *
+ * change_dir() does the lookup and the permission check; on success
+ * its vnode reference simply replaces the one held in fd_cdir.
+ */
+struct chdir_args {
+	char	*path;
+};
+/* ARGSUSED */
+chdir(p, uap, retval)
+	struct proc *p;
+	struct chdir_args *uap;
+	int *retval;
+{
+	struct nameidata nd;
+	struct filedesc *fdp;
+	int error;
+
+	fdp = p->p_fd;
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p);
+	error = change_dir(&nd, p);
+	if (error != 0)
+		return (error);
+	vrele(fdp->fd_cdir);
+	fdp->fd_cdir = nd.ni_vp;
+	return (0);
+}
+
+/*
+ * Change notion of root (``/'') directory.
+ *
+ * Restricted to the super-user.  A process that has no private root
+ * yet has nothing to release.
+ */
+struct chroot_args {
+	char	*path;
+};
+/* ARGSUSED */
+chroot(p, uap, retval)
+	struct proc *p;
+	struct chroot_args *uap;
+	int *retval;
+{
+	struct nameidata nd;
+	struct filedesc *fdp;
+	int error;
+
+	fdp = p->p_fd;
+	error = suser(p->p_ucred, &p->p_acflag);
+	if (error != 0)
+		return (error);
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p);
+	error = change_dir(&nd, p);
+	if (error != 0)
+		return (error);
+	if (fdp->fd_rdir != NULL)
+		vrele(fdp->fd_rdir);
+	fdp->fd_rdir = nd.ni_vp;
+	return (0);
+}
+
+/*
+ * Common routine for chroot and chdir.
+ *
+ * Looks up the name described by ndp and checks that it is a
+ * directory searchable by p.  The vnode is always unlocked on return;
+ * its reference is kept on success and released on failure.
+ */
+static int
+change_dir(ndp, p)
+	register struct nameidata *ndp;
+	struct proc *p;
+{
+	struct vnode *dirvp;
+	int error;
+
+	error = namei(ndp);
+	if (error != 0)
+		return (error);
+	dirvp = ndp->ni_vp;
+	if (dirvp->v_type == VDIR)
+		error = VOP_ACCESS(dirvp, VEXEC, p->p_ucred, p);
+	else
+		error = ENOTDIR;
+	VOP_UNLOCK(dirvp);
+	if (error != 0)
+		vrele(dirvp);
+	return (error);
+}
+
+/*
+ * Check permissions, allocate an open file structure,
+ * and call the device open routine if any.
+ *
+ * On success the index of the new descriptor is stored in *retval
+ * and 0 is returned; otherwise an errno value is returned and the
+ * descriptor slot reserved by falloc() is cleared again.
+ *
+ * p->p_dupfd is primed with a negative value before vn_open().  When
+ * vn_open() fails with ENODEV or ENXIO and p_dupfd has meanwhile
+ * become non-negative, the request is completed through dupfdopen()
+ * instead (presumably the descriptor-device open routine stores the
+ * descriptor to duplicate there -- TODO confirm against fdopen).
+ *
+ * O_EXLOCK and O_SHLOCK take an flock-style lock on the whole file
+ * before the descriptor is handed back.
+ */
+struct open_args {
+	char	*path;
+	int	flags;
+	int	mode;
+};
+open(p, uap, retval)
+	struct proc *p;
+	register struct open_args *uap;
+	int *retval;
+{
+	register struct filedesc *fdp = p->p_fd;
+	register struct file *fp;
+	register struct vnode *vp;
+	int flags, cmode;
+	struct file *nfp;
+	int type, indx, error;
+	struct flock lf;
+	struct nameidata nd;
+	extern struct fileops vnops;
+
+	if (error = falloc(p, &nfp, &indx))
+		return (error);
+	fp = nfp;
+	flags = FFLAGS(uap->flags);
+	cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;	/* apply umask, never set sticky */
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p);
+	p->p_dupfd = -indx - 1;			/* XXX check for fdopen */
+	if (error = vn_open(&nd, flags, cmode)) {
+		ffree(fp);
+		if ((error == ENODEV || error == ENXIO) &&
+		    p->p_dupfd >= 0 &&			/* XXX from fdopen */
+		    (error =
+			dupfdopen(fdp, indx, p->p_dupfd, flags, error)) == 0) {
+			*retval = indx;
+			return (0);
+		}
+		if (error == ERESTART)
+			error = EINTR;
+		fdp->fd_ofiles[indx] = NULL;	/* give the reserved slot back */
+		return (error);
+	}
+	p->p_dupfd = 0;
+	vp = nd.ni_vp;
+	fp->f_flag = flags & FMASK;
+	fp->f_type = DTYPE_VNODE;
+	fp->f_ops = &vnops;
+	fp->f_data = (caddr_t)vp;
+	if (flags & (O_EXLOCK | O_SHLOCK)) {
+		lf.l_whence = SEEK_SET;
+		lf.l_start = 0;
+		lf.l_len = 0;			/* start 0, length 0 */
+		if (flags & O_EXLOCK)
+			lf.l_type = F_WRLCK;
+		else
+			lf.l_type = F_RDLCK;
+		type = F_FLOCK;
+		if ((flags & FNONBLOCK) == 0)
+			type |= F_WAIT;
+		VOP_UNLOCK(vp);			/* vnode is unlocked across VOP_ADVLOCK */
+		if (error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) {
+			(void) vn_close(vp, fp->f_flag, fp->f_cred, p);
+			ffree(fp);
+			fdp->fd_ofiles[indx] = NULL;
+			return (error);
+		}
+		VOP_LOCK(vp);
+		fp->f_flag |= FHASLOCK;
+	}
+	VOP_UNLOCK(vp);
+	*retval = indx;
+	return (0);
+}
+
+#ifdef COMPAT_43
+/*
+ * Create a file.
+ *
+ * Old creat(2): expressed as an open() for writing with create and
+ * truncate requested.
+ */
+struct ocreat_args {
+	char	*path;
+	int	mode;
+};
+ocreat(p, uap, retval)
+	struct proc *p;
+	register struct ocreat_args *uap;
+	int *retval;
+{
+	struct open_args oargs;
+
+	oargs.flags = O_WRONLY | O_CREAT | O_TRUNC;
+	oargs.mode = uap->mode;
+	oargs.path = uap->path;
+	return (open(p, &oargs, retval));
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Create a special file.
+ *
+ * Super-user only.  The file type bits of uap->mode select a
+ * character device, a block device or a bad-sector marker; any other
+ * type is rejected with EINVAL, an existing name with EEXIST.
+ */
+struct mknod_args {
+	char	*path;
+	int	mode;
+	int	dev;
+};
+/* ARGSUSED */
+mknod(p, uap, retval)
+	struct proc *p;
+	register struct mknod_args *uap;
+	int *retval;
+{
+	struct nameidata nd;
+	struct vattr vattr;
+	struct vnode *existing;
+	int error;
+
+	error = suser(p->p_ucred, &p->p_acflag);
+	if (error != 0)
+		return (error);
+	NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, uap->path, p);
+	error = namei(&nd);
+	if (error != 0)
+		return (error);
+	existing = nd.ni_vp;
+	if (existing != NULL) {
+		error = EEXIST;
+		goto bad;
+	}
+	VATTR_NULL(&vattr);
+	vattr.va_mode = (uap->mode & ALLPERMS) &~ p->p_fd->fd_cmask;
+	vattr.va_rdev = uap->dev;
+	switch (uap->mode & S_IFMT) {
+	case S_IFCHR:
+		vattr.va_type = VCHR;
+		break;
+	case S_IFBLK:
+		vattr.va_type = VBLK;
+		break;
+	case S_IFMT:	/* used by badsect to flag bad sectors */
+		vattr.va_type = VBAD;
+		break;
+	default:
+		error = EINVAL;
+		goto bad;
+	}
+	LEASE_CHECK(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+	return (VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr));
+
+bad:
+	/* Cancel the pending create and drop what the lookup returned. */
+	VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+	if (nd.ni_dvp == existing)
+		vrele(nd.ni_dvp);
+	else
+		vput(nd.ni_dvp);
+	if (existing)
+		vrele(existing);
+	return (error);
+}
+
+/*
+ * Create named pipe.
+ *
+ * Kernels built without FIFO answer EOPNOTSUPP.  Otherwise the name
+ * must not exist yet (EEXIST); the node is created with type VFIFO
+ * and the requested mode filtered through the process umask.
+ */
+struct mkfifo_args {
+	char	*path;
+	int	mode;
+};
+/* ARGSUSED */
+mkfifo(p, uap, retval)
+	struct proc *p;
+	register struct mkfifo_args *uap;
+	int *retval;
+{
+	struct vattr vattr;
+	int error;
+	struct nameidata nd;
+
+#ifndef FIFO
+	return (EOPNOTSUPP);
+#else
+	NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, uap->path, p);
+	if (error = namei(&nd))
+		return (error);
+	if (nd.ni_vp != NULL) {
+		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);	/* name exists: cancel the create */
+		if (nd.ni_dvp == nd.ni_vp)
+			vrele(nd.ni_dvp);
+		else
+			vput(nd.ni_dvp);
+		vrele(nd.ni_vp);
+		return (EEXIST);
+	}
+	VATTR_NULL(&vattr);
+	vattr.va_type = VFIFO;
+	vattr.va_mode = (uap->mode & ALLPERMS) &~ p->p_fd->fd_cmask;
+	LEASE_CHECK(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+	return (VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr));
+#endif /* FIFO */
+}
+
+/*
+ * Make a hard file link.
+ *
+ * uap->path names the existing file and uap->link the new name.
+ * Linking to a directory requires super-user privilege.  The same
+ * nameidata is reused for the second lookup, which is switched to a
+ * CREATE operation on the new name.  EEXIST is returned when the new
+ * name is already present.
+ */
+struct link_args {
+	char	*path;
+	char	*link;
+};
+/* ARGSUSED */
+link(p, uap, retval)
+	struct proc *p;
+	register struct link_args *uap;
+	int *retval;
+{
+	register struct vnode *vp;
+	struct nameidata nd;
+	int error;
+
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p);
+	if (error = namei(&nd))
+		return (error);
+	vp = nd.ni_vp;
+	if (vp->v_type != VDIR ||
+	    (error = suser(p->p_ucred, &p->p_acflag)) == 0) {
+		nd.ni_cnd.cn_nameiop = CREATE;
+		nd.ni_cnd.cn_flags = LOCKPARENT;
+		nd.ni_dirp = uap->link;
+		if ((error = namei(&nd)) == 0) {
+			if (nd.ni_vp != NULL)
+				error = EEXIST;
+			if (!error) {
+				LEASE_CHECK(nd.ni_dvp,
+				    p, p->p_ucred, LEASE_WRITE);
+				LEASE_CHECK(vp,
+				    p, p->p_ucred, LEASE_WRITE);
+				error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
+			} else {
+				VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+				if (nd.ni_dvp == nd.ni_vp)
+					vrele(nd.ni_dvp);
+				else
+					vput(nd.ni_dvp);
+				if (nd.ni_vp)
+					vrele(nd.ni_vp);
+			}
+		}
+	}
+	vrele(vp);	/* reference from the first lookup */
+	return (error);
+}
+
+/*
+ * Make a symbolic link.
+ *
+ * uap->path is the link contents and uap->link the name to create.
+ * The contents are copied into a temporary MAXPATHLEN buffer that is
+ * freed on every exit path.  EEXIST is returned when the name is
+ * already present.
+ */
+struct symlink_args {
+	char	*path;
+	char	*link;
+};
+/* ARGSUSED */
+symlink(p, uap, retval)
+	struct proc *p;
+	register struct symlink_args *uap;
+	int *retval;
+{
+	struct vattr vattr;
+	char *path;
+	int error;
+	struct nameidata nd;
+
+	MALLOC(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
+	if (error = copyinstr(uap->path, path, MAXPATHLEN, NULL))
+		goto out;
+	NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, uap->link, p);
+	if (error = namei(&nd))
+		goto out;
+	if (nd.ni_vp) {
+		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);	/* name exists: cancel the create */
+		if (nd.ni_dvp == nd.ni_vp)
+			vrele(nd.ni_dvp);
+		else
+			vput(nd.ni_dvp);
+		vrele(nd.ni_vp);
+		error = EEXIST;
+		goto out;
+	}
+	VATTR_NULL(&vattr);
+	vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask;
+	LEASE_CHECK(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
+out:
+	FREE(path, M_NAMEI);
+	return (error);
+}
+
+/*
+ * Delete a name from the filesystem.
+ *
+ * Removing a directory this way requires super-user privilege, and
+ * the root of a mounted filesystem is refused with EBUSY.  For any
+ * other vnode the pager's cached copy is dropped before the remove.
+ */
+struct unlink_args {
+	char	*path;
+};
+/* ARGSUSED */
+unlink(p, uap, retval)
+	struct proc *p;
+	struct unlink_args *uap;
+	int *retval;
+{
+	register struct vnode *vp;
+	int error;
+	struct nameidata nd;
+
+	NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, uap->path, p);
+	if (error = namei(&nd))
+		return (error);
+	vp = nd.ni_vp;
+	LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE);
+	VOP_LOCK(vp);
+
+	if (vp->v_type != VDIR ||
+	    (error = suser(p->p_ucred, &p->p_acflag)) == 0) {
+		/*
+		 * The root of a mounted filesystem cannot be deleted.
+		 */
+		if (vp->v_flag & VROOT)
+			error = EBUSY;
+		else
+			(void)vnode_pager_uncache(vp);
+	}
+
+	if (!error) {
+		LEASE_CHECK(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+		error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
+	} else {
+		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+		if (nd.ni_dvp == vp)
+			vrele(nd.ni_dvp);
+		else
+			vput(nd.ni_dvp);
+		vput(vp);	/* unlock and release the target */
+	}
+	return (error);
+}
+
+/*
+ * Reposition read/write file offset.
+ *
+ * Only vnode-backed descriptors can be repositioned (ESPIPE for the
+ * rest).  The resulting offset is returned through retval as an
+ * off_t.
+ */
+struct lseek_args {
+	int	fd;
+	int	pad;
+	off_t	offset;
+	int	whence;
+};
+lseek(p, uap, retval)
+	struct proc *p;
+	register struct lseek_args *uap;
+	int *retval;
+{
+	struct filedesc *fdp;
+	struct file *fp;
+	struct vattr vattr;
+	int error;
+
+	fdp = p->p_fd;
+	if ((u_int)uap->fd >= fdp->fd_nfiles)
+		return (EBADF);
+	fp = fdp->fd_ofiles[uap->fd];
+	if (fp == NULL)
+		return (EBADF);
+	if (fp->f_type != DTYPE_VNODE)
+		return (ESPIPE);
+
+	if (uap->whence == L_SET) {
+		fp->f_offset = uap->offset;
+	} else if (uap->whence == L_INCR) {
+		fp->f_offset += uap->offset;
+	} else if (uap->whence == L_XTND) {
+		/* Relative to end of file: the size comes from getattr. */
+		error = VOP_GETATTR((struct vnode *)fp->f_data, &vattr,
+		    p->p_ucred, p);
+		if (error != 0)
+			return (error);
+		fp->f_offset = uap->offset + vattr.va_size;
+	} else {
+		return (EINVAL);
+	}
+	*(off_t *)retval = fp->f_offset;
+	return (0);
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Reposition read/write file offset.
+ *
+ * Old interface with a long offset; the result is narrowed to long.
+ */
+struct olseek_args {
+	int	fd;
+	long	offset;
+	int	whence;
+};
+olseek(p, uap, retval)
+	struct proc *p;
+	register struct olseek_args *uap;
+	int *retval;
+{
+	struct lseek_args largs;
+	off_t newoff;
+	int error;
+
+	largs.whence = uap->whence;
+	largs.offset = uap->offset;
+	largs.fd = uap->fd;
+	error = lseek(p, &largs, &newoff);
+	*(long *)retval = newoff;
+	return (error);
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Check access permissions.
+ *
+ * The check is made with the real user and group IDs: they are
+ * patched into the process credential for the duration of the call
+ * and the saved values are put back on every exit path.
+ */
+struct access_args {
+	char	*path;
+	int	flags;
+};
+access(p, uap, retval)
+	struct proc *p;
+	register struct access_args *uap;
+	int *retval;
+{
+	struct ucred *cred;
+	struct vnode *vp;
+	struct nameidata nd;
+	int error, vmode, saved_uid, saved_gid;
+
+	cred = p->p_ucred;
+	saved_uid = cred->cr_uid;
+	saved_gid = cred->cr_groups[0];
+	cred->cr_uid = p->p_cred->p_ruid;
+	cred->cr_groups[0] = p->p_cred->p_rgid;
+
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p);
+	error = namei(&nd);
+	if (error == 0) {
+		vp = nd.ni_vp;
+		/* No flags at all: a successful lookup is the answer. */
+		if (uap->flags) {
+			vmode = 0;
+			if (uap->flags & R_OK)
+				vmode |= VREAD;
+			if (uap->flags & W_OK)
+				vmode |= VWRITE;
+			if (uap->flags & X_OK)
+				vmode |= VEXEC;
+			if (vmode & VWRITE)
+				error = vn_writechk(vp);
+			if (error == 0)
+				error = VOP_ACCESS(vp, vmode, cred, p);
+		}
+		vput(vp);
+	}
+	cred->cr_uid = saved_uid;
+	cred->cr_groups[0] = saved_gid;
+	return (error);
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Get file status; this version follows links.
+ *
+ * Old interface: the result is converted to a struct ostat.
+ */
+struct ostat_args {
+	char	*path;
+	struct ostat *ub;
+};
+/* ARGSUSED */
+ostat(p, uap, retval)
+	struct proc *p;
+	register struct ostat_args *uap;
+	int *retval;
+{
+	struct nameidata nd;
+	struct stat newsb;
+	struct ostat oldsb;
+	int error;
+
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p);
+	error = namei(&nd);
+	if (error != 0)
+		return (error);
+	error = vn_stat(nd.ni_vp, &newsb, p);
+	vput(nd.ni_vp);
+	if (error != 0)
+		return (error);
+	cvtstat(&newsb, &oldsb);
+	return (copyout((caddr_t)&oldsb, (caddr_t)uap->ub, sizeof (oldsb)));
+}
+
+/*
+ * Get file status; this version does not follow links.
+ *
+ * Old interface: the result is converted to a struct ostat.
+ */
+struct olstat_args {
+	char	*path;
+	struct ostat *ub;
+};
+/* ARGSUSED */
+olstat(p, uap, retval)
+	struct proc *p;
+	register struct olstat_args *uap;
+	int *retval;
+{
+	struct nameidata nd;
+	struct stat newsb;
+	struct ostat oldsb;
+	int error;
+
+	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p);
+	error = namei(&nd);
+	if (error != 0)
+		return (error);
+	error = vn_stat(nd.ni_vp, &newsb, p);
+	vput(nd.ni_vp);
+	if (error != 0)
+		return (error);
+	cvtstat(&newsb, &oldsb);
+	return (copyout((caddr_t)&oldsb, (caddr_t)uap->ub, sizeof (oldsb)));
+}
+
+/*
+ * Convert from a new to an old stat structure.
+ *
+ * Fields are copied from st into ost one by one.  A size of 2^32 or
+ * more is reported as -2.
+ */
+cvtstat(st, ost)
+	struct stat *st;
+	struct ostat *ost;
+{
+
+	if (st->st_size >= (quad_t)1 << 32)
+		ost->st_size = -2;
+	else
+		ost->st_size = st->st_size;
+	ost->st_dev = st->st_dev;
+	ost->st_ino = st->st_ino;
+	ost->st_mode = st->st_mode;
+	ost->st_nlink = st->st_nlink;
+	ost->st_uid = st->st_uid;
+	ost->st_gid = st->st_gid;
+	ost->st_rdev = st->st_rdev;
+	ost->st_atime = st->st_atime;
+	ost->st_mtime = st->st_mtime;
+	ost->st_ctime = st->st_ctime;
+	ost->st_blksize = st->st_blksize;
+	ost->st_blocks = st->st_blocks;
+	ost->st_flags = st->st_flags;
+	ost->st_gen = st->st_gen;
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Get file status; this version follows links.
+ */
+struct stat_args {
+	char	*path;
+	struct stat *ub;
+};
+/* ARGSUSED */
+stat(p, uap, retval)
+	struct proc *p;
+	register struct stat_args *uap;
+	int *retval;
+{
+	struct nameidata nd;
+	struct stat sbuf;
+	int error;
+
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p);
+	error = namei(&nd);
+	if (error != 0)
+		return (error);
+	error = vn_stat(nd.ni_vp, &sbuf, p);
+	vput(nd.ni_vp);
+	if (error != 0)
+		return (error);
+	return (copyout((caddr_t)&sbuf, (caddr_t)uap->ub, sizeof (sbuf)));
+}
+
+/*
+ * Get file status; this version does not follow links.
+ *
+ * A symbolic link reports the attributes of the directory that
+ * contains it, except for the file type, link count, size and block
+ * count, which are taken from the link itself.
+ */
+struct lstat_args {
+	char	*path;
+	struct stat *ub;
+};
+/* ARGSUSED */
+lstat(p, uap, retval)
+	struct proc *p;
+	register struct lstat_args *uap;
+	int *retval;
+{
+	struct nameidata nd;
+	struct vnode *leafvp, *parentvp;
+	struct stat sbuf, linksb;
+	int error;
+
+	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKPARENT, UIO_USERSPACE,
+	    uap->path, p);
+	error = namei(&nd);
+	if (error != 0)
+		return (error);
+	leafvp = nd.ni_vp;
+	parentvp = nd.ni_dvp;
+	if (leafvp->v_type == VLNK) {
+		/* Parent attributes first, then patch in the link's own. */
+		error = vn_stat(parentvp, &sbuf, p);
+		vput(parentvp);
+		if (error != 0) {
+			vput(leafvp);
+			return (error);
+		}
+		error = vn_stat(leafvp, &linksb, p);
+		vput(leafvp);
+		if (error != 0)
+			return (error);
+		sbuf.st_mode &= ~S_IFDIR;
+		sbuf.st_mode |= S_IFLNK;
+		sbuf.st_nlink = linksb.st_nlink;
+		sbuf.st_size = linksb.st_size;
+		sbuf.st_blocks = linksb.st_blocks;
+	} else {
+		/* Ordinary file: the parent is not needed at all. */
+		if (parentvp == leafvp)
+			vrele(parentvp);
+		else
+			vput(parentvp);
+		error = vn_stat(leafvp, &sbuf, p);
+		vput(leafvp);
+		if (error != 0)
+			return (error);
+	}
+	return (copyout((caddr_t)&sbuf, (caddr_t)uap->ub, sizeof (sbuf)));
+}
+
+/*
+ * Get configurable pathname variables.
+ *
+ * The value of variable uap->name for the file named by uap->path is
+ * stored in *retval by the filesystem.
+ */
+struct pathconf_args {
+	char	*path;
+	int	name;
+};
+/* ARGSUSED */
+pathconf(p, uap, retval)
+	struct proc *p;
+	register struct pathconf_args *uap;
+	int *retval;
+{
+	struct nameidata nd;
+	int error;
+
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p);
+	error = namei(&nd);
+	if (error != 0)
+		return (error);
+	error = VOP_PATHCONF(nd.ni_vp, uap->name, retval);
+	vput(nd.ni_vp);
+	return (error);
+}
+
+/*
+ * Return target name of a symbolic link.
+ *
+ * Reads at most uap->count bytes of the link contents into the user
+ * buffer uap->buf and stores the number of bytes transferred in
+ * *retval.  EINVAL is returned when path is not a symbolic link; in
+ * that case *retval is left untouched.
+ */
+struct readlink_args {
+	char	*path;
+	char	*buf;
+	int	count;
+};
+/* ARGSUSED */
+readlink(p, uap, retval)
+	struct proc *p;
+	register struct readlink_args *uap;
+	int *retval;
+{
+	register struct vnode *vp;
+	struct iovec aiov;
+	struct uio auio;
+	int error;
+	struct nameidata nd;
+
+	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, p);
+	if (error = namei(&nd))
+		return (error);
+	vp = nd.ni_vp;
+	if (vp->v_type != VLNK)
+		error = EINVAL;
+	else {
+		aiov.iov_base = uap->buf;
+		aiov.iov_len = uap->count;
+		auio.uio_iov = &aiov;
+		auio.uio_iovcnt = 1;
+		auio.uio_offset = 0;
+		auio.uio_rw = UIO_READ;
+		auio.uio_segflg = UIO_USERSPACE;
+		auio.uio_procp = p;
+		auio.uio_resid = uap->count;
+		error = VOP_READLINK(vp, &auio, p->p_ucred);
+		/*
+		 * The byte count is computed here, where auio is known to
+		 * be set up; it is never initialized on the EINVAL path.
+		 */
+		*retval = uap->count - auio.uio_resid;
+	}
+	vput(vp);
+	return (error);
+}
+
+/*
+ * Change flags of a file given a path name.
+ *
+ * Fails with EROFS on a read-only filesystem; otherwise only the
+ * flags attribute is handed to VOP_SETATTR.
+ */
+struct chflags_args {
+	char	*path;
+	int	flags;
+};
+/* ARGSUSED */
+chflags(p, uap, retval)
+	struct proc *p;
+	register struct chflags_args *uap;
+	int *retval;
+{
+	register struct vnode *vp;
+	struct vattr vattr;
+	int error;
+	struct nameidata nd;
+
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p);
+	if (error = namei(&nd))
+		return (error);
+	vp = nd.ni_vp;
+	LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE);
+	VOP_LOCK(vp);
+	if (vp->v_mount->mnt_flag & MNT_RDONLY)
+		error = EROFS;
+	else {
+		VATTR_NULL(&vattr);
+		vattr.va_flags = uap->flags;
+		error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+	}
+	vput(vp);
+	return (error);
+}
+
+/*
+ * Change flags of a file given a file descriptor.
+ *
+ * The vnode stays referenced by the open file, so it is only locked
+ * and unlocked around the attribute change.
+ */
+struct fchflags_args {
+	int	fd;
+	int	flags;
+};
+/* ARGSUSED */
+fchflags(p, uap, retval)
+	struct proc *p;
+	register struct fchflags_args *uap;
+	int *retval;
+{
+	struct file *fp;
+	struct vnode *vp;
+	struct vattr va;
+	int error;
+
+	error = getvnode(p->p_fd, uap->fd, &fp);
+	if (error != 0)
+		return (error);
+	vp = (struct vnode *)fp->f_data;
+	LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE);
+	VOP_LOCK(vp);
+	if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
+		VATTR_NULL(&va);
+		va.va_flags = uap->flags;
+		error = VOP_SETATTR(vp, &va, p->p_ucred, p);
+	} else {
+		error = EROFS;
+	}
+	VOP_UNLOCK(vp);
+	return (error);
+}
+
+/*
+ * Change mode of a file given path name.
+ *
+ * Only the permission bits covered by ALLPERMS are passed on.
+ */
+struct chmod_args {
+	char	*path;
+	int	mode;
+};
+/* ARGSUSED */
+chmod(p, uap, retval)
+	struct proc *p;
+	register struct chmod_args *uap;
+	int *retval;
+{
+	struct nameidata nd;
+	struct vnode *vp;
+	struct vattr va;
+	int error;
+
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p);
+	error = namei(&nd);
+	if (error != 0)
+		return (error);
+	vp = nd.ni_vp;
+	LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE);
+	VOP_LOCK(vp);
+	if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
+		VATTR_NULL(&va);
+		va.va_mode = uap->mode & ALLPERMS;
+		error = VOP_SETATTR(vp, &va, p->p_ucred, p);
+	} else {
+		error = EROFS;
+	}
+	vput(vp);
+	return (error);
+}
+
+/*
+ * Change mode of a file given a file descriptor.
+ *
+ * Descriptor counterpart of chmod(): same EROFS check, same
+ * ALLPERMS mask, vnode only locked around the change.
+ */
+struct fchmod_args {
+	int	fd;
+	int	mode;
+};
+/* ARGSUSED */
+fchmod(p, uap, retval)
+	struct proc *p;
+	register struct fchmod_args *uap;
+	int *retval;
+{
+	struct file *fp;
+	struct vnode *vp;
+	struct vattr va;
+	int error;
+
+	error = getvnode(p->p_fd, uap->fd, &fp);
+	if (error != 0)
+		return (error);
+	vp = (struct vnode *)fp->f_data;
+	LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE);
+	VOP_LOCK(vp);
+	if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
+		VATTR_NULL(&va);
+		va.va_mode = uap->mode & ALLPERMS;
+		error = VOP_SETATTR(vp, &va, p->p_ucred, p);
+	} else {
+		error = EROFS;
+	}
+	VOP_UNLOCK(vp);
+	return (error);
+}
+
+/*
+ * Set ownership given a path name.
+ *
+ * Both the user and the group ID are handed to VOP_SETATTR as given.
+ */
+struct chown_args {
+	char	*path;
+	int	uid;
+	int	gid;
+};
+/* ARGSUSED */
+chown(p, uap, retval)
+	struct proc *p;
+	register struct chown_args *uap;
+	int *retval;
+{
+	struct nameidata nd;
+	struct vnode *vp;
+	struct vattr va;
+	int error;
+
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p);
+	error = namei(&nd);
+	if (error != 0)
+		return (error);
+	vp = nd.ni_vp;
+	LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE);
+	VOP_LOCK(vp);
+	if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
+		VATTR_NULL(&va);
+		va.va_uid = uap->uid;
+		va.va_gid = uap->gid;
+		error = VOP_SETATTR(vp, &va, p->p_ucred, p);
+	} else {
+		error = EROFS;
+	}
+	vput(vp);
+	return (error);
+}
+
+/*
+ * Set ownership given a file descriptor.
+ *
+ * Descriptor counterpart of chown().
+ */
+struct fchown_args {
+	int	fd;
+	int	uid;
+	int	gid;
+};
+/* ARGSUSED */
+fchown(p, uap, retval)
+	struct proc *p;
+	register struct fchown_args *uap;
+	int *retval;
+{
+	struct file *fp;
+	struct vnode *vp;
+	struct vattr va;
+	int error;
+
+	error = getvnode(p->p_fd, uap->fd, &fp);
+	if (error != 0)
+		return (error);
+	vp = (struct vnode *)fp->f_data;
+	LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE);
+	VOP_LOCK(vp);
+	if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
+		VATTR_NULL(&va);
+		va.va_uid = uap->uid;
+		va.va_gid = uap->gid;
+		error = VOP_SETATTR(vp, &va, p->p_ucred, p);
+	} else {
+		error = EROFS;
+	}
+	VOP_UNLOCK(vp);
+	return (error);
+}
+
+/*
+ * Set the access and modification times of a file.
+ *
+ * A NULL uap->tptr means "now" for both times and is flagged to the
+ * filesystem with VA_UTIMES_NULL; otherwise two timevals (access,
+ * then modification) are copied in from user space.
+ */
+struct utimes_args {
+	char	*path;
+	struct	timeval *tptr;
+};
+/* ARGSUSED */
+utimes(p, uap, retval)
+	struct proc *p;
+	register struct utimes_args *uap;
+	int *retval;
+{
+	struct nameidata nd;
+	struct vnode *vp;
+	struct vattr va;
+	struct timeval times[2];
+	int error;
+
+	VATTR_NULL(&va);
+	if (uap->tptr != NULL) {
+		error = copyin((caddr_t)uap->tptr, (caddr_t)times,
+		    sizeof (times));
+		if (error != 0)
+			return (error);
+	} else {
+		microtime(&times[0]);
+		times[1] = times[0];
+		va.va_vaflags |= VA_UTIMES_NULL;
+	}
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p);
+	error = namei(&nd);
+	if (error != 0)
+		return (error);
+	vp = nd.ni_vp;
+	LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE);
+	VOP_LOCK(vp);
+	if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
+		/* timeval microseconds become timespec nanoseconds. */
+		va.va_atime.ts_sec = times[0].tv_sec;
+		va.va_atime.ts_nsec = times[0].tv_usec * 1000;
+		va.va_mtime.ts_sec = times[1].tv_sec;
+		va.va_mtime.ts_nsec = times[1].tv_usec * 1000;
+		error = VOP_SETATTR(vp, &va, p->p_ucred, p);
+	} else {
+		error = EROFS;
+	}
+	vput(vp);
+	return (error);
+}
+
+/*
+ * Truncate a file given its path name.
+ *
+ * Directories are refused with EISDIR.  The caller needs write
+ * access, and vn_writechk() must allow writing to the vnode.
+ */
+struct truncate_args {
+	char	*path;
+	int	pad;
+	off_t	length;
+};
+/* ARGSUSED */
+truncate(p, uap, retval)
+	struct proc *p;
+	register struct truncate_args *uap;
+	int *retval;
+{
+	struct nameidata nd;
+	struct vnode *vp;
+	struct vattr va;
+	int error;
+
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p);
+	error = namei(&nd);
+	if (error != 0)
+		return (error);
+	vp = nd.ni_vp;
+	LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE);
+	VOP_LOCK(vp);
+	if (vp->v_type == VDIR) {
+		error = EISDIR;
+	} else {
+		error = vn_writechk(vp);
+		if (error == 0)
+			error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p);
+		if (error == 0) {
+			VATTR_NULL(&va);
+			va.va_size = uap->length;
+			error = VOP_SETATTR(vp, &va, p->p_ucred, p);
+		}
+	}
+	vput(vp);
+	return (error);
+}
+
+/*
+ * Truncate a file given a file descriptor.
+ *
+ * The descriptor must be open for writing (EINVAL otherwise); the
+ * credential saved in the open file is used for the size change.
+ */
+struct ftruncate_args {
+	int	fd;
+	int	pad;
+	off_t	length;
+};
+/* ARGSUSED */
+ftruncate(p, uap, retval)
+	struct proc *p;
+	register struct ftruncate_args *uap;
+	int *retval;
+{
+	struct file *fp;
+	struct vnode *vp;
+	struct vattr va;
+	int error;
+
+	error = getvnode(p->p_fd, uap->fd, &fp);
+	if (error != 0)
+		return (error);
+	if ((fp->f_flag & FWRITE) == 0)
+		return (EINVAL);
+	vp = (struct vnode *)fp->f_data;
+	LEASE_CHECK(vp, p, p->p_ucred, LEASE_WRITE);
+	VOP_LOCK(vp);
+	if (vp->v_type == VDIR) {
+		error = EISDIR;
+	} else {
+		error = vn_writechk(vp);
+		if (error == 0) {
+			VATTR_NULL(&va);
+			va.va_size = uap->length;
+			error = VOP_SETATTR(vp, &va, fp->f_cred, p);
+		}
+	}
+	VOP_UNLOCK(vp);
+	return (error);
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Truncate a file given its path name.
+ *
+ * Old interface with a long length, forwarded to truncate().
+ */
+struct otruncate_args {
+	char	*path;
+	long	length;
+};
+/* ARGSUSED */
+otruncate(p, uap, retval)
+	struct proc *p;
+	register struct otruncate_args *uap;
+	int *retval;
+{
+	struct truncate_args targs;
+
+	targs.length = uap->length;
+	targs.path = uap->path;
+	return (truncate(p, &targs, retval));
+}
+
+/*
+ * Truncate a file given a file descriptor.
+ *
+ * Old interface with a long length, forwarded to ftruncate().
+ */
+struct oftruncate_args {
+	int	fd;
+	long	length;
+};
+/* ARGSUSED */
+oftruncate(p, uap, retval)
+	struct proc *p;
+	register struct oftruncate_args *uap;
+	int *retval;
+{
+	struct ftruncate_args nuap;
+
+	nuap.fd = uap->fd;
+	nuap.length = uap->length;
+	return (ftruncate(p, &nuap, retval));
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Sync an open file.
+ *
+ * Calls VOP_FSYNC with MNT_WAIT and the credential saved in the open
+ * file, holding the vnode lock for the duration.
+ */
+struct fsync_args {
+	int	fd;
+};
+/* ARGSUSED */
+fsync(p, uap, retval)
+	struct proc *p;
+	struct fsync_args *uap;
+	int *retval;
+{
+	register struct vnode *vp;
+	struct file *fp;
+	int error;
+
+	if (error = getvnode(p->p_fd, uap->fd, &fp))
+		return (error);
+	vp = (struct vnode *)fp->f_data;
+	VOP_LOCK(vp);
+	error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, p);
+	VOP_UNLOCK(vp);
+	return (error);
+}
+
+/*
+ * Rename files.  Source and destination must either both be directories,
+ * or both not be directories.  If target is a directory, it must be empty.
+ *
+ * Two lookups are made, a DELETE of the source and a RENAME of the
+ * target.  Any error found before VOP_RENAME is called aborts both
+ * pending operations and releases everything the lookups returned.
+ * The internal value -1 marks "source and target are the same entry";
+ * it skips VOP_RENAME and is turned into a return value of 0.
+ */
+struct rename_args {
+	char	*from;
+	char	*to;
+};
+/* ARGSUSED */
+rename(p, uap, retval)
+	struct proc *p;
+	register struct rename_args *uap;
+	int *retval;
+{
+	register struct vnode *tvp, *fvp, *tdvp;
+	struct nameidata fromnd, tond;
+	int error;
+
+	NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE,
+	    uap->from, p);
+	if (error = namei(&fromnd))
+		return (error);
+	fvp = fromnd.ni_vp;
+	NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART,
+	    UIO_USERSPACE, uap->to, p);
+	if (error = namei(&tond)) {
+		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
+		vrele(fromnd.ni_dvp);
+		vrele(fvp);
+		goto out1;	/* nothing of tond to clean up */
+	}
+	tdvp = tond.ni_dvp;
+	tvp = tond.ni_vp;
+	if (tvp != NULL) {
+		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
+			error = ENOTDIR;
+			goto out;
+		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
+			error = EISDIR;
+			goto out;
+		}
+	}
+	if (fvp == tdvp)
+		error = EINVAL;	/* source is the target's parent directory */
+	/*
+	 * If source is the same as the destination (that is the
+	 * same inode number with the same name in the same directory),
+	 * then there is nothing to do.
+	 *
+	 * NOTE(review): this test also runs when EINVAL was set just
+	 * above and would then replace it with -1 -- confirm whether
+	 * that combination can occur.
+	 */
+	if (fvp == tvp && fromnd.ni_dvp == tdvp &&
+	    fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen &&
+	    !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr,
+	      fromnd.ni_cnd.cn_namelen))
+		error = -1;
+out:
+	if (!error) {
+		LEASE_CHECK(tdvp, p, p->p_ucred, LEASE_WRITE);
+		if (fromnd.ni_dvp != tdvp)
+			LEASE_CHECK(fromnd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+		if (tvp)
+			LEASE_CHECK(tvp, p, p->p_ucred, LEASE_WRITE);
+		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
+				   tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
+	} else {
+		VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd);
+		if (tdvp == tvp)
+			vrele(tdvp);
+		else
+			vput(tdvp);
+		if (tvp)
+			vput(tvp);
+		VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
+		vrele(fromnd.ni_dvp);
+		vrele(fvp);
+	}
+	vrele(tond.ni_startdir);
+	FREE(tond.ni_cnd.cn_pnbuf, M_NAMEI);
+out1:
+	if (fromnd.ni_startdir)
+		vrele(fromnd.ni_startdir);
+	FREE(fromnd.ni_cnd.cn_pnbuf, M_NAMEI);
+	if (error == -1)
+		return (0);
+	return (error);
+}
+
+/*
+ * Make a directory file.
+ *
+ * EEXIST is returned when the name is already present.  The mode is
+ * limited to ACCESSPERMS and filtered through the process umask.  On
+ * success the vnode returned by VOP_MKDIR is released again with
+ * vput().
+ */
+struct mkdir_args {
+	char	*path;
+	int	mode;
+};
+/* ARGSUSED */
+mkdir(p, uap, retval)
+	struct proc *p;
+	register struct mkdir_args *uap;
+	int *retval;
+{
+	register struct vnode *vp;
+	struct vattr vattr;
+	int error;
+	struct nameidata nd;
+
+	NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, uap->path, p);
+	if (error = namei(&nd))
+		return (error);
+	vp = nd.ni_vp;
+	if (vp != NULL) {
+		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);	/* name exists: cancel the create */
+		if (nd.ni_dvp == vp)
+			vrele(nd.ni_dvp);
+		else
+			vput(nd.ni_dvp);
+		vrele(vp);
+		return (EEXIST);
+	}
+	VATTR_NULL(&vattr);
+	vattr.va_type = VDIR;
+	vattr.va_mode = (uap->mode & ACCESSPERMS) &~ p->p_fd->fd_cmask;
+	LEASE_CHECK(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+	if (!error)
+		vput(nd.ni_vp);
+	return (error);
+}
+
+/*
+ * Remove a directory file.
+ *
+ * Refused with ENOTDIR for a non-directory, EINVAL for ".", and
+ * EBUSY for the root of a mounted filesystem, checked in that order.
+ */
+struct rmdir_args {
+	char	*path;
+};
+/* ARGSUSED */
+rmdir(p, uap, retval)
+	struct proc *p;
+	struct rmdir_args *uap;
+	int *retval;
+{
+	struct nameidata nd;
+	struct vnode *dirvp;
+	int error;
+
+	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, uap->path, p);
+	error = namei(&nd);
+	if (error != 0)
+		return (error);
+	dirvp = nd.ni_vp;
+
+	if (dirvp->v_type != VDIR)
+		error = ENOTDIR;
+	else if (nd.ni_dvp == dirvp)
+		error = EINVAL;		/* no rmdir "." please */
+	else if (dirvp->v_flag & VROOT)
+		error = EBUSY;		/* root of a mounted filesystem */
+
+	if (error != 0) {
+		/* Cancel the pending delete and drop both vnodes. */
+		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+		if (nd.ni_dvp == dirvp)
+			vrele(nd.ni_dvp);
+		else
+			vput(nd.ni_dvp);
+		vput(dirvp);
+		return (error);
+	}
+	LEASE_CHECK(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+	LEASE_CHECK(dirvp, p, p->p_ucred, LEASE_WRITE);
+	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
+	return (error);
+}
+
+#ifdef COMPAT_43
+/*
+ * Read a block of directory entries in a file system independent format.
+ *
+ * Old interface: entries are rewritten in place so that the bytes a
+ * current kernel uses for d_type/d_namlen read as the 16-bit name
+ * length old binaries expect.  Unless the directory can be read
+ * straight into the user buffer, it is read into a kernel bounce
+ * buffer, converted, and then copied out.
+ *
+ * The bounce buffer is limited to OGETDIRENTRIES_MAXBUF bytes so that
+ * a caller cannot make the kernel allocate an arbitrary amount of
+ * memory; a larger request simply returns fewer bytes than asked for.
+ * *retval is the number of bytes transferred and *basep receives the
+ * offset the read started at.
+ */
+#define	OGETDIRENTRIES_MAXBUF	(64 * 1024)	/* arbitrary sanity limit */
+
+struct ogetdirentries_args {
+	int	fd;
+	char	*buf;
+	u_int	count;
+	long	*basep;
+};
+ogetdirentries(p, uap, retval)
+	struct proc *p;
+	register struct ogetdirentries_args *uap;
+	int *retval;
+{
+	register struct vnode *vp;
+	struct file *fp;
+	struct uio auio, kuio;
+	struct iovec aiov, kiov;
+	struct dirent *dp, *edp;
+	caddr_t dirbuf;
+	u_int kcount;
+	int error, readcnt;
+	long loff;
+
+	if (error = getvnode(p->p_fd, uap->fd, &fp))
+		return (error);
+	if ((fp->f_flag & FREAD) == 0)
+		return (EBADF);
+	vp = (struct vnode *)fp->f_data;
+	if (vp->v_type != VDIR)
+		return (EINVAL);
+	aiov.iov_base = uap->buf;
+	aiov.iov_len = uap->count;
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_rw = UIO_READ;
+	auio.uio_segflg = UIO_USERSPACE;
+	auio.uio_procp = p;
+	auio.uio_resid = uap->count;
+	VOP_LOCK(vp);
+	loff = auio.uio_offset = fp->f_offset;
+#	if (BYTE_ORDER != LITTLE_ENDIAN)
+		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
+			error = VOP_READDIR(vp, &auio, fp->f_cred);
+			fp->f_offset = auio.uio_offset;
+		} else
+#	endif
+	{
+		/* Never let user space choose the allocation size. */
+		kcount = uap->count;
+		if (kcount > OGETDIRENTRIES_MAXBUF)
+			kcount = OGETDIRENTRIES_MAXBUF;
+		kuio = auio;
+		kuio.uio_iov = &kiov;
+		kuio.uio_segflg = UIO_SYSSPACE;
+		kuio.uio_resid = kcount;
+		kiov.iov_len = kcount;
+		MALLOC(dirbuf, caddr_t, kcount, M_TEMP, M_WAITOK);
+		kiov.iov_base = dirbuf;
+		error = VOP_READDIR(vp, &kuio, fp->f_cred);
+		fp->f_offset = kuio.uio_offset;
+		if (error == 0) {
+			readcnt = kcount - kuio.uio_resid;
+			edp = (struct dirent *)&dirbuf[readcnt];
+			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
+#				if (BYTE_ORDER == LITTLE_ENDIAN)
+					/*
+					 * The expected low byte of
+					 * dp->d_namlen is our dp->d_type.
+					 * The high MBZ byte of dp->d_namlen
+					 * is our dp->d_namlen.
+					 */
+					dp->d_type = dp->d_namlen;
+					dp->d_namlen = 0;
+#				else
+					/*
+					 * The dp->d_type is the high byte
+					 * of the expected dp->d_namlen,
+					 * so must be zero'ed.
+					 */
+					dp->d_type = 0;
+#				endif
+				if (dp->d_reclen > 0) {
+					dp = (struct dirent *)
+					    ((char *)dp + dp->d_reclen);
+				} else {
+					/* A zero-length record would loop. */
+					error = EIO;
+					break;
+				}
+			}
+			if (dp >= edp)
+				error = uiomove(dirbuf, readcnt, &auio);
+		}
+		FREE(dirbuf, M_TEMP);
+	}
+	VOP_UNLOCK(vp);
+	if (error)
+		return (error);
+	error = copyout((caddr_t)&loff, (caddr_t)uap->basep, sizeof(long));
+	*retval = uap->count - auio.uio_resid;
+	return (error);
+}
+#endif
+
+/*
+ * Read a block of directory entries in a file system independent format.
+ *
+ * *retval is the number of bytes transferred and *basep receives the
+ * offset the read started at.  When a read returns nothing, the
+ * descriptor is switched to the directory underneath (the lower layer
+ * of a union vnode, or the vnode covered by a MNT_UNION mount) and
+ * the read is retried from offset 0.
+ */
+struct getdirentries_args {
+	int	fd;
+	char	*buf;
+	u_int	count;
+	long	*basep;
+};
+getdirentries(p, uap, retval)
+	struct proc *p;
+	register struct getdirentries_args *uap;
+	int *retval;
+{
+	register struct vnode *vp;
+	struct file *fp;
+	struct uio auio;
+	struct iovec aiov;
+	long loff;
+	int error;
+
+	if (error = getvnode(p->p_fd, uap->fd, &fp))
+		return (error);
+	if ((fp->f_flag & FREAD) == 0)
+		return (EBADF);
+	vp = (struct vnode *)fp->f_data;
+unionread:
+	if (vp->v_type != VDIR)
+		return (EINVAL);
+	aiov.iov_base = uap->buf;
+	aiov.iov_len = uap->count;
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_rw = UIO_READ;
+	auio.uio_segflg = UIO_USERSPACE;
+	auio.uio_procp = p;
+	auio.uio_resid = uap->count;
+	VOP_LOCK(vp);
+	loff = auio.uio_offset = fp->f_offset;
+	error = VOP_READDIR(vp, &auio, fp->f_cred);
+	fp->f_offset = auio.uio_offset;
+	VOP_UNLOCK(vp);
+	if (error)
+		return (error);
+
+#ifdef UNION
+{
+	extern int (**union_vnodeop_p)();
+	extern struct vnode *union_lowervp __P((struct vnode *));
+
+	if ((uap->count == auio.uio_resid) &&
+	    (vp->v_op == union_vnodeop_p)) {
+		/*
+		 * Keep vp intact until a lower vnode is known to exist:
+		 * the code after this block still dereferences it.
+		 */
+		struct vnode *lvp = union_lowervp(vp);
+
+		if (lvp != NULLVP) {
+			struct vnode *tvp = vp;
+
+			vp = lvp;
+			VOP_LOCK(vp);
+			/* Open with the same credential and process as the read. */
+			error = VOP_OPEN(vp, FREAD, fp->f_cred, p);
+			VOP_UNLOCK(vp);
+
+			if (error) {
+				vrele(vp);
+				return (error);
+			}
+			fp->f_data = (caddr_t) vp;
+			fp->f_offset = 0;
+			error = vn_close(tvp, FREAD, fp->f_cred, p);
+			if (error)
+				return (error);
+			goto unionread;
+		}
+	}
+}
+#endif
+
+	if ((uap->count == auio.uio_resid) &&
+	    (vp->v_flag & VROOT) &&
+	    (vp->v_mount->mnt_flag & MNT_UNION)) {
+		struct vnode *tvp = vp;
+		vp = vp->v_mount->mnt_vnodecovered;
+		VREF(vp);
+		fp->f_data = (caddr_t) vp;
+		fp->f_offset = 0;
+		vrele(tvp);
+		goto unionread;
+	}
+	error = copyout((caddr_t)&loff, (caddr_t)uap->basep, sizeof(long));
+	*retval = uap->count - auio.uio_resid;
+	return (error);
+}
+
+/*
+ * Set the mode mask for creation of filesystem nodes.
+ *
+ * The previous mask is returned in *retval; the new one is limited
+ * to the ALLPERMS bits.
+ */
+struct umask_args {
+	int	newmask;
+};
+mode_t				/* XXX */
+umask(p, uap, retval)
+	struct proc *p;
+	struct umask_args *uap;
+	int *retval;
+{
+	register struct filedesc *fdp;
+
+	fdp = p->p_fd;
+	*retval = fdp->fd_cmask;
+	fdp->fd_cmask = uap->newmask & ALLPERMS;
+	return (0);
+}
+
+/*
+ * Void all references to file by ripping underlying filesystem
+ * away from vnode.
+ *
+ * Only character and block devices qualify (EINVAL otherwise), and
+ * the caller must own the file or be the super-user.  vgoneall() is
+ * called only when the vnode has other users or is aliased.
+ */
+struct revoke_args {
+	char	*path;
+};
+/* ARGSUSED */
+revoke(p, uap, retval)
+	struct proc *p;
+	register struct revoke_args *uap;
+	int *retval;
+{
+	register struct vnode *vp;
+	struct vattr vattr;
+	int error;
+	struct nameidata nd;
+
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, p);
+	if (error = namei(&nd))
+		return (error);
+	vp = nd.ni_vp;
+	if (vp->v_type != VCHR && vp->v_type != VBLK) {
+		error = EINVAL;
+		goto out;
+	}
+	if (error = VOP_GETATTR(vp, &vattr, p->p_ucred, p))
+		goto out;
+	if (p->p_ucred->cr_uid != vattr.va_uid &&
+	    (error = suser(p->p_ucred, &p->p_acflag)))
+		goto out;
+	if (vp->v_usecount > 1 || (vp->v_flag & VALIASED))
+		vgoneall(vp);
+out:
+	vrele(vp);
+	return (error);
+}
+
+/*
+ * Convert a user file descriptor to a kernel file entry.
+ */ +getvnode(fdp, fd, fpp) + struct filedesc *fdp; + struct file **fpp; + int fd; +{ + struct file *fp; + + if ((u_int)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE) + return (EINVAL); + *fpp = fp; + return (0); +} diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c new file mode 100644 index 00000000000..d104bb9de77 --- /dev/null +++ b/sys/kern/vfs_vnops.c @@ -0,0 +1,422 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct fileops vnops = + { vn_read, vn_write, vn_ioctl, vn_select, vn_closefile }; + +/* + * Common code for vnode open operations. + * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. 
+ */
+vn_open(ndp, fmode, cmode)
+	register struct nameidata *ndp;
+	int fmode, cmode;
+{
+	register struct vnode *vp;
+	register struct proc *p = ndp->ni_cnd.cn_proc;
+	register struct ucred *cred = p->p_ucred;
+	struct vattr vat;
+	struct vattr *vap = &vat;
+	int error;
+
+	if (fmode & O_CREAT) {
+		ndp->ni_cnd.cn_nameiop = CREATE;
+		ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF;
+		if ((fmode & O_EXCL) == 0)	/* FOLLOW is requested only without O_EXCL */
+			ndp->ni_cnd.cn_flags |= FOLLOW;
+		if (error = namei(ndp))
+			return (error);
+		if (ndp->ni_vp == NULL) {	/* lookup found no vnode: create a regular file */
+			VATTR_NULL(vap);
+			vap->va_type = VREG;
+			vap->va_mode = cmode;
+			LEASE_CHECK(ndp->ni_dvp, p, cred, LEASE_WRITE);
+			if (error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
+			    &ndp->ni_cnd, vap))
+				return (error);
+			fmode &= ~O_TRUNC;	/* skips the truncate step below for the new file */
+			vp = ndp->ni_vp;
+		} else {			/* lookup found a vnode: abandon the create */
+			VOP_ABORTOP(ndp->ni_dvp, &ndp->ni_cnd);
+			if (ndp->ni_dvp == ndp->ni_vp)
+				vrele(ndp->ni_dvp);
+			else
+				vput(ndp->ni_dvp);
+			ndp->ni_dvp = NULL;
+			vp = ndp->ni_vp;
+			if (fmode & O_EXCL) {
+				error = EEXIST;
+				goto bad;
+			}
+			fmode &= ~O_CREAT;	/* nothing created: the access checks below apply */
+		}
+	} else {
+		ndp->ni_cnd.cn_nameiop = LOOKUP;
+		ndp->ni_cnd.cn_flags = FOLLOW | LOCKLEAF;
+		if (error = namei(ndp))
+			return (error);
+		vp = ndp->ni_vp;
+	}
+	if (vp->v_type == VSOCK) {
+		error = EOPNOTSUPP;
+		goto bad;
+	}
+	if ((fmode & O_CREAT) == 0) {	/* true for every path except a fresh create */
+		if (fmode & FREAD) {
+			if (error = VOP_ACCESS(vp, VREAD, cred, p))
+				goto bad;
+		}
+		if (fmode & (FWRITE | O_TRUNC)) {
+			if (vp->v_type == VDIR) {
+				error = EISDIR;
+				goto bad;
+			}
+			if ((error = vn_writechk(vp)) ||
+			    (error = VOP_ACCESS(vp, VWRITE, cred, p)))
+				goto bad;
+		}
+	}
+	if (fmode & O_TRUNC) {
+		VOP_UNLOCK(vp);				/* XXX */
+		LEASE_CHECK(vp, p, cred, LEASE_WRITE);
+		VOP_LOCK(vp);				/* XXX */
+		VATTR_NULL(vap);
+		vap->va_size = 0;	/* truncation is a setattr of size 0 */
+		if (error = VOP_SETATTR(vp, vap, cred, p))
+			goto bad;
+	}
+	if (error = VOP_OPEN(vp, fmode, cred, p))
+		goto bad;
+	if (fmode & FWRITE)
+		vp->v_writecount++;	/* vn_close() decrements this for FWRITE closes */
+	return (0);
+bad:
+	vput(vp);
+	return (error);
+}
+
+/*
+ * Check for write permissions on the specified vnode.
+ * The read-only status of the file system is checked. + * Also, prototype text segments cannot be written. + */ +vn_writechk(vp) + register struct vnode *vp; +{ + + /* + * Disallow write attempts on read-only file systems; + * unless the file is a socket or a block or character + * device resident on the file system. + */ + if (vp->v_mount->mnt_flag & MNT_RDONLY) { + switch (vp->v_type) { + case VREG: case VDIR: case VLNK: + return (EROFS); + } + } + /* + * If there's shared text associated with + * the vnode, try to free it up once. If + * we fail, we can't allow writing. + */ + if ((vp->v_flag & VTEXT) && !vnode_pager_uncache(vp)) + return (ETXTBSY); + return (0); +} + +/* + * Vnode close call + */ +vn_close(vp, flags, cred, p) + register struct vnode *vp; + int flags; + struct ucred *cred; + struct proc *p; +{ + int error; + + if (flags & FWRITE) + vp->v_writecount--; + error = VOP_CLOSE(vp, flags, cred, p); + vrele(vp); + return (error); +} + +/* + * Package up an I/O request on a vnode into a uio and do it. + */ +vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p) + enum uio_rw rw; + struct vnode *vp; + caddr_t base; + int len; + off_t offset; + enum uio_seg segflg; + int ioflg; + struct ucred *cred; + int *aresid; + struct proc *p; +{ + struct uio auio; + struct iovec aiov; + int error; + + if ((ioflg & IO_NODELOCKED) == 0) + VOP_LOCK(vp); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = base; + aiov.iov_len = len; + auio.uio_resid = len; + auio.uio_offset = offset; + auio.uio_segflg = segflg; + auio.uio_rw = rw; + auio.uio_procp = p; + if (rw == UIO_READ) { + error = VOP_READ(vp, &auio, ioflg, cred); + } else { + error = VOP_WRITE(vp, &auio, ioflg, cred); + } + if (aresid) + *aresid = auio.uio_resid; + else + if (auio.uio_resid && error == 0) + error = EIO; + if ((ioflg & IO_NODELOCKED) == 0) + VOP_UNLOCK(vp); + return (error); +} + +/* + * File table vnode read routine. 
+ */ +vn_read(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + register struct vnode *vp = (struct vnode *)fp->f_data; + int count, error; + + LEASE_CHECK(vp, uio->uio_procp, cred, LEASE_READ); + VOP_LOCK(vp); + uio->uio_offset = fp->f_offset; + count = uio->uio_resid; + error = VOP_READ(vp, uio, (fp->f_flag & FNONBLOCK) ? IO_NDELAY : 0, + cred); + fp->f_offset += count - uio->uio_resid; + VOP_UNLOCK(vp); + return (error); +} + +/* + * File table vnode write routine. + */ +vn_write(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + register struct vnode *vp = (struct vnode *)fp->f_data; + int count, error, ioflag = 0; + + if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) + ioflag |= IO_APPEND; + if (fp->f_flag & FNONBLOCK) + ioflag |= IO_NDELAY; + LEASE_CHECK(vp, uio->uio_procp, cred, LEASE_WRITE); + VOP_LOCK(vp); + uio->uio_offset = fp->f_offset; + count = uio->uio_resid; + error = VOP_WRITE(vp, uio, ioflag, cred); + if (ioflag & IO_APPEND) + fp->f_offset = uio->uio_offset; + else + fp->f_offset += count - uio->uio_resid; + VOP_UNLOCK(vp); + return (error); +} + +/* + * File table vnode stat routine. 
+ */ +vn_stat(vp, sb, p) + struct vnode *vp; + register struct stat *sb; + struct proc *p; +{ + struct vattr vattr; + register struct vattr *vap; + int error; + u_short mode; + + vap = &vattr; + error = VOP_GETATTR(vp, vap, p->p_ucred, p); + if (error) + return (error); + /* + * Copy from vattr table + */ + sb->st_dev = vap->va_fsid; + sb->st_ino = vap->va_fileid; + mode = vap->va_mode; + switch (vp->v_type) { + case VREG: + mode |= S_IFREG; + break; + case VDIR: + mode |= S_IFDIR; + break; + case VBLK: + mode |= S_IFBLK; + break; + case VCHR: + mode |= S_IFCHR; + break; + case VLNK: + mode |= S_IFLNK; + break; + case VSOCK: + mode |= S_IFSOCK; + break; + case VFIFO: + mode |= S_IFIFO; + break; + default: + return (EBADF); + }; + sb->st_mode = mode; + sb->st_nlink = vap->va_nlink; + sb->st_uid = vap->va_uid; + sb->st_gid = vap->va_gid; + sb->st_rdev = vap->va_rdev; + sb->st_size = vap->va_size; + sb->st_atimespec = vap->va_atime; + sb->st_mtimespec= vap->va_mtime; + sb->st_ctimespec = vap->va_ctime; + sb->st_blksize = vap->va_blocksize; + sb->st_flags = vap->va_flags; + sb->st_gen = vap->va_gen; + sb->st_blocks = vap->va_bytes / S_BLKSIZE; + return (0); +} + +/* + * File table vnode ioctl routine. + */ +vn_ioctl(fp, com, data, p) + struct file *fp; + int com; + caddr_t data; + struct proc *p; +{ + register struct vnode *vp = ((struct vnode *)fp->f_data); + struct vattr vattr; + int error; + + switch (vp->v_type) { + + case VREG: + case VDIR: + if (com == FIONREAD) { + if (error = VOP_GETATTR(vp, &vattr, p->p_ucred, p)) + return (error); + *(int *)data = vattr.va_size - fp->f_offset; + return (0); + } + if (com == FIONBIO || com == FIOASYNC) /* XXX */ + return (0); /* XXX */ + /* fall into ... 
*/ + + default: + return (ENOTTY); + + case VFIFO: + case VCHR: + case VBLK: + error = VOP_IOCTL(vp, com, data, fp->f_flag, p->p_ucred, p); + if (error == 0 && com == TIOCSCTTY) { + p->p_session->s_ttyvp = vp; + VREF(vp); + } + return (error); + } +} + +/* + * File table vnode select routine. + */ +vn_select(fp, which, p) + struct file *fp; + int which; + struct proc *p; +{ + + return (VOP_SELECT(((struct vnode *)fp->f_data), which, fp->f_flag, + fp->f_cred, p)); +} + +/* + * File table vnode close routine. + */ +vn_closefile(fp, p) + struct file *fp; + struct proc *p; +{ + + return (vn_close(((struct vnode *)fp->f_data), fp->f_flag, + fp->f_cred, p)); +} diff --git a/sys/kern/vnode_if.pl b/sys/kern/vnode_if.pl new file mode 100644 index 00000000000..e190fa04836 --- /dev/null +++ b/sys/kern/vnode_if.pl @@ -0,0 +1,433 @@ +#!/bin/sh - +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+#
+# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+#	@(#)vnode_if.sh	8.1 (Berkeley) 6/10/93
+#
+
+# Script to produce VFS front-end sugar.
+#
+# usage: vnode_if.sh srcfile
+#	(where srcfile is currently /sys/kern/vnode_if.src)
+#
+# These awk scripts are not particularly well written, specifically they
+# don't use arrays well and figure out the same information repeatedly.
+# Please rewrite them if you actually understand how to use awk. Note,
+# they use nawk extensions and gawk's toupper.
+
+if [ $# -ne 1 ] ; then
+	echo 'usage: vnode_if.sh srcfile'
+	exit 1
+fi
+
+# Name of the source file.
+SRC=$1
+
+# Names of the created files.
+CFILE=vnode_if.c
+HEADER=vnode_if.h
+
+# Awk program (must support nawk extensions and gawk's "toupper")
+# Use "awk" at Berkeley, "gawk" elsewhere.
+AWK=awk
+
+# Print out header information for vnode_if.h.
+cat << END_OF_LEADING_COMMENT > $HEADER
+/*
+ * This file is produced automatically.
+ * Do not modify anything in here by hand.
+ *
+ * Created from @(#)vnode_if.sh	8.1 (Berkeley) 6/10/93
+ */
+
+extern struct vnodeop_desc vop_default_desc;
+END_OF_LEADING_COMMENT
+
+# Awk script to take vnode_if.src and turn it into vnode_if.h.
+# For each operation it prints the vop_F_args structure, an extern
+# declaration of the descriptor, and a static inline VOP_F() wrapper.
+$AWK '
+	NF == 0 || $0 ~ "^#" {
+		next;
+	}
+	{
+		# Get the function name.
+		name = $1;
+		uname = toupper(name);
+
+		# Get the function arguments.
+		for (c1 = 0;; ++c1) {
+			if (getline <= 0)
+				exit
+			if ($0 ~ "^};")
+				break;
+			a[c1] = $0;
+		}
+
+		# Print out the vop_F_args structure.
+		# Field 1 of each argument line is the direction and field 2
+		# may be WILLRELE; the type words follow and the last field
+		# is the (possibly starred) name with its semicolon.
+		printf("struct %s_args {\n\tstruct vnodeop_desc *a_desc;\n",
+		    name);
+		for (c2 = 0; c2 < c1; ++c2) {
+			c3 = split(a[c2], t);
+			printf("\t");
+			if (t[2] ~ "WILLRELE")
+				c4 = 3;
+			else
+				c4 = 2;
+			for (; c4 < c3; ++c4)
+				printf("%s ", t[c4]);
+			# beg is the first character after the leading stars.
+			# String positions start at 1; starting at 0 loses a
+			# star in awks that count strictly from 1.
+			beg = match(t[c3], "[^*]");
+			printf("%sa_%s\n",
+			    substr(t[c4], 1, beg - 1), substr(t[c4], beg));
+		}
+		printf("};\n");
+
+		# Print out extern declaration.
+		printf("extern struct vnodeop_desc %s_desc;\n", name);
+
+		# Print out inline struct.
+		printf("static inline int %s(", uname);
+		sep = ", ";
+		for (c2 = 0; c2 < c1; ++c2) {
+			if (c2 == c1 - 1)
+				sep = ")\n";
+			c3 = split(a[c2], t);
+			beg = match(t[c3], "[^*]");
+			end = match(t[c3], ";");
+			printf("%s%s", substr(t[c3], beg, end - beg), sep);
+		}
+		# Old-style parameter declarations for the wrapper.
+		for (c2 = 0; c2 < c1; ++c2) {
+			c3 = split(a[c2], t);
+			printf("\t");
+			if (t[2] ~ "WILLRELE")
+				c4 = 3;
+			else
+				c4 = 2;
+			for (; c4 < c3; ++c4)
+				printf("%s ", t[c4]);
+			beg = match(t[c3], "[^*]");
+			printf("%s%s\n",
+			    substr(t[c4], 1, beg - 1), substr(t[c4], beg));
+		}
+		printf("{\n\tstruct %s_args a;\n\n", name);
+		printf("\ta.a_desc = VDESC(%s);\n", name);
+		for (c2 = 0; c2 < c1; ++c2) {
+			c3 = split(a[c2], t);
+			printf("\t");
+			beg = match(t[c3], "[^*]");
+			end = match(t[c3], ";");
+			printf("a.a_%s = %s\n",
+			    substr(t[c3], beg, end - beg), substr(t[c3], beg));
+		}
+		# The call is dispatched through the first argument.
+		c1 = split(a[0], t);
+		beg = match(t[c1], "[^*]");
+		end = match(t[c1], ";");
+		printf("\treturn (VCALL(%s, VOFFSET(%s), &a));\n}\n",
+		    substr(t[c1], beg, end - beg), name);
+	}' < $SRC >> $HEADER
+
+# Print out header information for vnode_if.c.
+cat << END_OF_LEADING_COMMENT > $CFILE
+/*
+ * This file is produced automatically.
+ * Do not modify anything in here by hand.
+ *
+ * Created from @(#)vnode_if.sh	8.1 (Berkeley) 6/10/93
+ */
+
+#include
+#include
+#include
+
+struct vnodeop_desc vop_default_desc = {
+	0,
+	"default",
+	0,
+	NULL,
+	VDESC_NO_OFFSET,
+	VDESC_NO_OFFSET,
+	VDESC_NO_OFFSET,
+	VDESC_NO_OFFSET,
+	NULL,
+};
+
+END_OF_LEADING_COMMENT
+
+# Awk script to take vnode_if.src and turn it into vnode_if.c.
+$AWK 'function kill_surrounding_ws (s) {
+		sub (/^[ \t]*/, "", s);
+		sub (/[ \t]*$/, "", s);
+		return s;
+	}
+
+	# Report a malformed source line on standard error and stop.
+	# read_args() calls this; without a definition the awk program
+	# is rejected or aborts with an undefined-function error.
+	function bail(msg) {
+		printf("%s\n", msg) | "cat 1>&2";
+		exit 1;
+	}
+
+	# Read the argument lines of one operation, up to the closing
+	# brace, into dirs[], reles[], types[] and args[] (1..numargs).
+	function read_args() {
+		numargs = 0;
+		# getline returns -1 on a read error; only continue on success.
+		while ((getline ln) > 0) {
+			if (ln ~ /}/) {
+				break;
+			};
+
+			# Delete comments, if any.
+			gsub (/\/\*.*\*\//, "", ln);
+
+			# Delete leading/trailing space.
+			ln = kill_surrounding_ws(ln);
+
+			# Pick off direction.
+			if (1 == sub(/^INOUT[ \t]+/, "", ln))
+				dir = "INOUT";
+			else if (1 == sub(/^IN[ \t]+/, "", ln))
+				dir = "IN";
+			else if (1 == sub(/^OUT[ \t]+/, "", ln))
+				dir = "OUT";
+			else
+				bail("No IN/OUT direction for \"" ln "\".");
+
+			# check for "WILLRELE"
+			if (1 == sub(/^WILLRELE[ \t]+/, "", ln)) {
+				rele = "WILLRELE";
+			} else {
+				rele = "WONTRELE";
+			};
+
+			# kill trailing ;
+			if (1 != sub (/;$/, "", ln)) {
+				bail("Missing end-of-line ; in \"" ln "\".");
+			};
+
+			# pick off variable name
+			if (!(i = match(ln, /[A-Za-z0-9_]+$/))) {
+				bail("Missing var name \"a_foo\" in \"" ln "\".");
+			};
+			arg = substr (ln, i);
+			# Remove the variable name from the end of ln by
+			# keeping only the text in front of it.
+			ln = substr(ln, 1, i-1);
+
+			# what is left must be type
+			# (but clean it up some)
+			type = ln;
+			gsub (/[ \t]+/, " ", type);   # condense whitespace
+			type = kill_surrounding_ws(type);
+
+			# (boy this was easier in Perl)
+
+			numargs++;
+			dirs[numargs] = dir;
+			reles[numargs] = rele;
+			types[numargs] = type;
+			args[numargs] = arg;
+		};
+	}
+
+	# Print the table of offsets of the "struct vnode *" arguments,
+	# terminated by VDESC_NO_OFFSET.
+	function generate_operation_vp_offsets() {
+		printf ("int %s_vp_offsets[] = {\n", name);
+		# as a side effect, figure out the releflags
+		releflags = "";
+		vpnum = 0;
+		for (i=1; i<=numargs; i++) {
+			if (types[i] == "struct vnode *") {
+				printf ("\tVOPARG_OFFSETOF(struct %s_args,a_%s),\n",
+					name, args[i]);
+				if (reles[i] == "WILLRELE") {
+					releflags = releflags "|VDESC_VP" vpnum "_WILLRELE";
+				};
+				vpnum++;
+			};
+		};
+		sub (/^\|/, "", releflags);
+		print "\tVDESC_NO_OFFSET";
+		print "};";
+	}
+
+	# Return the offset expression for the first argument of the
+	# given type, or VDESC_NO_OFFSET when there is none.
+	function find_arg_with_type (type) {
+		for (i=1; i<=numargs; i++) {
+			if (types[i] == type) {
+				return "VOPARG_OFFSETOF(struct " name "_args,a_" args[i] ")";
+			};
+		};
+		return "VDESC_NO_OFFSET";
+	}
+
+	# Print the vnodeop_desc initializer.  It uses the releflags left
+	# behind by generate_operation_vp_offsets(), so call that first.
+	function generate_operation_desc() {
+		printf ("struct vnodeop_desc %s_desc = {\n", name);
+		# offset
+		printf ("\t0,\n");
+		# printable name
+		printf ("\t\"%s\",\n", name);
+		# flags
+		vppwillrele = "";
+		for (i=1; i<=numargs; i++) {
+			if (types[i] == "struct vnode **" &&
+				(reles[i] == "WILLRELE")) {
+				vppwillrele = "|VDESC_VPP_WILLRELE";
+			};
+		};
+		if (releflags == "") {
+			printf ("\t0%s,\n", vppwillrele);
+		} else {
+			printf ("\t%s%s,\n", releflags, vppwillrele);
+		};
+		# vp offsets
+		printf ("\t%s_vp_offsets,\n", name);
+		# vpp (if any)
+		printf ("\t%s,\n", find_arg_with_type("struct vnode **"));
+		# cred (if any)
+		printf ("\t%s,\n", find_arg_with_type("struct ucred *"));
+		# proc (if any)
+		printf ("\t%s,\n", find_arg_with_type("struct proc *"));
+		# componentname
+		printf ("\t%s,\n", find_arg_with_type("struct componentname *"));
+		# transport layer information
+		printf ("\tNULL,\n};\n");
+	}
+
+	NF == 0 || $0 ~ "^#" {
next; + } + { + # get the function name + name = $1; + + # get the function arguments + read_args(); + + # Print out the vop_F_vp_offsets structure. This all depends + # on naming conventions and nothing else. + generate_operation_vp_offsets(); + + # Print out the vnodeop_desc structure. + generate_operation_desc(); + + printf "\n"; + + }' < $SRC >> $CFILE +# THINGS THAT DON'T WORK RIGHT YET. +# +# Two existing BSD vnodeops (bwrite and strategy) don't take any vnodes as +# arguments. This means that these operations can't function successfully +# through a bypass routine. +# +# Bwrite and strategy will be replaced when the VM page/buffer cache +# integration happens. +# +# To get around this problem for now we handle these ops as special cases. + +cat << END_OF_SPECIAL_CASES >> $HEADER +#include +struct vop_strategy_args { + struct vnodeop_desc *a_desc; + struct buf *a_bp; +}; +extern struct vnodeop_desc vop_strategy_desc; +static inline int VOP_STRATEGY(bp) + struct buf *bp; +{ + struct vop_strategy_args a; + + a.a_desc = VDESC(vop_strategy); + a.a_bp = bp; + return (VCALL((bp)->b_vp, VOFFSET(vop_strategy), &a)); +} + +struct vop_bwrite_args { + struct vnodeop_desc *a_desc; + struct buf *a_bp; +}; +extern struct vnodeop_desc vop_bwrite_desc; +static inline int VOP_BWRITE(bp) + struct buf *bp; +{ + struct vop_bwrite_args a; + + a.a_desc = VDESC(vop_bwrite); + a.a_bp = bp; + return (VCALL((bp)->b_vp, VOFFSET(vop_bwrite), &a)); +} +END_OF_SPECIAL_CASES + +cat << END_OF_SPECIAL_CASES >> $CFILE +int vop_strategy_vp_offsets[] = { + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_strategy_desc = { + 0, + "vop_strategy", + 0, + vop_strategy_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; +int vop_bwrite_vp_offsets[] = { + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_bwrite_desc = { + 0, + "vop_bwrite", + 0, + vop_bwrite_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; 
+END_OF_SPECIAL_CASES + +# Add the vfs_op_descs array to the C file. +$AWK ' + BEGIN { + printf("\nstruct vnodeop_desc *vfs_op_descs[] = {\n"); + printf("\t&vop_default_desc, /* MUST BE FIRST */\n"); + printf("\t&vop_strategy_desc, /* XXX: SPECIAL CASE */\n"); + printf("\t&vop_bwrite_desc, /* XXX: SPECIAL CASE */\n"); + } + END { + printf("\tNULL\n};\n"); + } + NF == 0 || $0 ~ "^#" { + next; + } + { + # Get the function name. + printf("\t&%s_desc,\n", $1); + + # Skip the function arguments. + for (;;) { + if (getline <= 0) + exit + if ($0 ~ "^};") + break; + } + }' < $SRC >> $CFILE + diff --git a/sys/kern/vnode_if.sh b/sys/kern/vnode_if.sh new file mode 100644 index 00000000000..e190fa04836 --- /dev/null +++ b/sys/kern/vnode_if.sh @@ -0,0 +1,433 @@ +#!/bin/sh - +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+#
+# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+#	@(#)vnode_if.sh	8.1 (Berkeley) 6/10/93
+#
+
+# Script to produce VFS front-end sugar.
+#
+# usage: vnode_if.sh srcfile
+#	(where srcfile is currently /sys/kern/vnode_if.src)
+#
+# These awk scripts are not particularly well written, specifically they
+# don't use arrays well and figure out the same information repeatedly.
+# Please rewrite them if you actually understand how to use awk. Note,
+# they use nawk extensions and gawk's toupper.
+
+if [ $# -ne 1 ] ; then
+	echo 'usage: vnode_if.sh srcfile'
+	exit 1
+fi
+
+# Name of the source file.
+SRC=$1
+
+# Names of the created files.
+CFILE=vnode_if.c
+HEADER=vnode_if.h
+
+# Awk program (must support nawk extensions and gawk's "toupper")
+# Use "awk" at Berkeley, "gawk" elsewhere.
+AWK=awk
+
+# Print out header information for vnode_if.h.
+cat << END_OF_LEADING_COMMENT > $HEADER
+/*
+ * This file is produced automatically.
+ * Do not modify anything in here by hand.
+ *
+ * Created from @(#)vnode_if.sh	8.1 (Berkeley) 6/10/93
+ */
+
+extern struct vnodeop_desc vop_default_desc;
+END_OF_LEADING_COMMENT
+
+# Awk script to take vnode_if.src and turn it into vnode_if.h.
+# For each operation it prints the vop_F_args structure, an extern
+# declaration of the descriptor, and a static inline VOP_F() wrapper.
+$AWK '
+	NF == 0 || $0 ~ "^#" {
+		next;
+	}
+	{
+		# Get the function name.
+		name = $1;
+		uname = toupper(name);
+
+		# Get the function arguments.
+		for (c1 = 0;; ++c1) {
+			if (getline <= 0)
+				exit
+			if ($0 ~ "^};")
+				break;
+			a[c1] = $0;
+		}
+
+		# Print out the vop_F_args structure.
+		# Field 1 of each argument line is the direction and field 2
+		# may be WILLRELE; the type words follow and the last field
+		# is the (possibly starred) name with its semicolon.
+		printf("struct %s_args {\n\tstruct vnodeop_desc *a_desc;\n",
+		    name);
+		for (c2 = 0; c2 < c1; ++c2) {
+			c3 = split(a[c2], t);
+			printf("\t");
+			if (t[2] ~ "WILLRELE")
+				c4 = 3;
+			else
+				c4 = 2;
+			for (; c4 < c3; ++c4)
+				printf("%s ", t[c4]);
+			# beg is the first character after the leading stars.
+			# String positions start at 1; starting at 0 loses a
+			# star in awks that count strictly from 1.
+			beg = match(t[c3], "[^*]");
+			printf("%sa_%s\n",
+			    substr(t[c4], 1, beg - 1), substr(t[c4], beg));
+		}
+		printf("};\n");
+
+		# Print out extern declaration.
+		printf("extern struct vnodeop_desc %s_desc;\n", name);
+
+		# Print out inline struct.
+		printf("static inline int %s(", uname);
+		sep = ", ";
+		for (c2 = 0; c2 < c1; ++c2) {
+			if (c2 == c1 - 1)
+				sep = ")\n";
+			c3 = split(a[c2], t);
+			beg = match(t[c3], "[^*]");
+			end = match(t[c3], ";");
+			printf("%s%s", substr(t[c3], beg, end - beg), sep);
+		}
+		# Old-style parameter declarations for the wrapper.
+		for (c2 = 0; c2 < c1; ++c2) {
+			c3 = split(a[c2], t);
+			printf("\t");
+			if (t[2] ~ "WILLRELE")
+				c4 = 3;
+			else
+				c4 = 2;
+			for (; c4 < c3; ++c4)
+				printf("%s ", t[c4]);
+			beg = match(t[c3], "[^*]");
+			printf("%s%s\n",
+			    substr(t[c4], 1, beg - 1), substr(t[c4], beg));
+		}
+		printf("{\n\tstruct %s_args a;\n\n", name);
+		printf("\ta.a_desc = VDESC(%s);\n", name);
+		for (c2 = 0; c2 < c1; ++c2) {
+			c3 = split(a[c2], t);
+			printf("\t");
+			beg = match(t[c3], "[^*]");
+			end = match(t[c3], ";");
+			printf("a.a_%s = %s\n",
+			    substr(t[c3], beg, end - beg), substr(t[c3], beg));
+		}
+		# The call is dispatched through the first argument.
+		c1 = split(a[0], t);
+		beg = match(t[c1], "[^*]");
+		end = match(t[c1], ";");
+		printf("\treturn (VCALL(%s, VOFFSET(%s), &a));\n}\n",
+		    substr(t[c1], beg, end - beg), name);
+	}' < $SRC >> $HEADER
+
+# Print out header information for vnode_if.c.
+cat << END_OF_LEADING_COMMENT > $CFILE
+/*
+ * This file is produced automatically.
+ * Do not modify anything in here by hand.
+ *
+ * Created from @(#)vnode_if.sh	8.1 (Berkeley) 6/10/93
+ */
+
+#include
+#include
+#include
+
+struct vnodeop_desc vop_default_desc = {
+	0,
+	"default",
+	0,
+	NULL,
+	VDESC_NO_OFFSET,
+	VDESC_NO_OFFSET,
+	VDESC_NO_OFFSET,
+	VDESC_NO_OFFSET,
+	NULL,
+};
+
+END_OF_LEADING_COMMENT
+
+# Awk script to take vnode_if.src and turn it into vnode_if.c.
+$AWK 'function kill_surrounding_ws (s) {
+		sub (/^[ \t]*/, "", s);
+		sub (/[ \t]*$/, "", s);
+		return s;
+	}
+
+	# Report a malformed source line on standard error and stop.
+	# read_args() calls this; without a definition the awk program
+	# is rejected or aborts with an undefined-function error.
+	function bail(msg) {
+		printf("%s\n", msg) | "cat 1>&2";
+		exit 1;
+	}
+
+	# Read the argument lines of one operation, up to the closing
+	# brace, into dirs[], reles[], types[] and args[] (1..numargs).
+	function read_args() {
+		numargs = 0;
+		# getline returns -1 on a read error; only continue on success.
+		while ((getline ln) > 0) {
+			if (ln ~ /}/) {
+				break;
+			};
+
+			# Delete comments, if any.
+			gsub (/\/\*.*\*\//, "", ln);
+
+			# Delete leading/trailing space.
+			ln = kill_surrounding_ws(ln);
+
+			# Pick off direction.
+			if (1 == sub(/^INOUT[ \t]+/, "", ln))
+				dir = "INOUT";
+			else if (1 == sub(/^IN[ \t]+/, "", ln))
+				dir = "IN";
+			else if (1 == sub(/^OUT[ \t]+/, "", ln))
+				dir = "OUT";
+			else
+				bail("No IN/OUT direction for \"" ln "\".");
+
+			# check for "WILLRELE"
+			if (1 == sub(/^WILLRELE[ \t]+/, "", ln)) {
+				rele = "WILLRELE";
+			} else {
+				rele = "WONTRELE";
+			};
+
+			# kill trailing ;
+			if (1 != sub (/;$/, "", ln)) {
+				bail("Missing end-of-line ; in \"" ln "\".");
+			};
+
+			# pick off variable name
+			if (!(i = match(ln, /[A-Za-z0-9_]+$/))) {
+				bail("Missing var name \"a_foo\" in \"" ln "\".");
+			};
+			arg = substr (ln, i);
+			# Remove the variable name from the end of ln by
+			# keeping only the text in front of it.
+			ln = substr(ln, 1, i-1);
+
+			# what is left must be type
+			# (but clean it up some)
+			type = ln;
+			gsub (/[ \t]+/, " ", type);   # condense whitespace
+			type = kill_surrounding_ws(type);
+
+			# (boy this was easier in Perl)
+
+			numargs++;
+			dirs[numargs] = dir;
+			reles[numargs] = rele;
+			types[numargs] = type;
+			args[numargs] = arg;
+		};
+	}
+
+	# Print the table of offsets of the "struct vnode *" arguments,
+	# terminated by VDESC_NO_OFFSET.
+	function generate_operation_vp_offsets() {
+		printf ("int %s_vp_offsets[] = {\n", name);
+		# as a side effect, figure out the releflags
+		releflags = "";
+		vpnum = 0;
+		for (i=1; i<=numargs; i++) {
+			if (types[i] == "struct vnode *") {
+				printf ("\tVOPARG_OFFSETOF(struct %s_args,a_%s),\n",
+					name, args[i]);
+				if (reles[i] == "WILLRELE") {
+					releflags = releflags "|VDESC_VP" vpnum "_WILLRELE";
+				};
+				vpnum++;
+			};
+		};
+		sub (/^\|/, "", releflags);
+		print "\tVDESC_NO_OFFSET";
+		print "};";
+	}
+
+	# Return the offset expression for the first argument of the
+	# given type, or VDESC_NO_OFFSET when there is none.
+	function find_arg_with_type (type) {
+		for (i=1; i<=numargs; i++) {
+			if (types[i] == type) {
+				return "VOPARG_OFFSETOF(struct " name "_args,a_" args[i] ")";
+			};
+		};
+		return "VDESC_NO_OFFSET";
+	}
+
+	# Print the vnodeop_desc initializer.  It uses the releflags left
+	# behind by generate_operation_vp_offsets(), so call that first.
+	function generate_operation_desc() {
+		printf ("struct vnodeop_desc %s_desc = {\n", name);
+		# offset
+		printf ("\t0,\n");
+		# printable name
+		printf ("\t\"%s\",\n", name);
+		# flags
+		vppwillrele = "";
+		for (i=1; i<=numargs; i++) {
+			if (types[i] == "struct vnode **" &&
+				(reles[i] == "WILLRELE")) {
+				vppwillrele = "|VDESC_VPP_WILLRELE";
+			};
+		};
+		if (releflags == "") {
+			printf ("\t0%s,\n", vppwillrele);
+		} else {
+			printf ("\t%s%s,\n", releflags, vppwillrele);
+		};
+		# vp offsets
+		printf ("\t%s_vp_offsets,\n", name);
+		# vpp (if any)
+		printf ("\t%s,\n", find_arg_with_type("struct vnode **"));
+		# cred (if any)
+		printf ("\t%s,\n", find_arg_with_type("struct ucred *"));
+		# proc (if any)
+		printf ("\t%s,\n", find_arg_with_type("struct proc *"));
+		# componentname
+		printf ("\t%s,\n", find_arg_with_type("struct componentname *"));
+		# transport layer information
+		printf ("\tNULL,\n};\n");
+	}
+
+	NF == 0 || $0 ~ "^#" {
next; + } + { + # get the function name + name = $1; + + # get the function arguments + read_args(); + + # Print out the vop_F_vp_offsets structure. This all depends + # on naming conventions and nothing else. + generate_operation_vp_offsets(); + + # Print out the vnodeop_desc structure. + generate_operation_desc(); + + printf "\n"; + + }' < $SRC >> $CFILE +# THINGS THAT DON'T WORK RIGHT YET. +# +# Two existing BSD vnodeops (bwrite and strategy) don't take any vnodes as +# arguments. This means that these operations can't function successfully +# through a bypass routine. +# +# Bwrite and strategy will be replaced when the VM page/buffer cache +# integration happens. +# +# To get around this problem for now we handle these ops as special cases. + +cat << END_OF_SPECIAL_CASES >> $HEADER +#include +struct vop_strategy_args { + struct vnodeop_desc *a_desc; + struct buf *a_bp; +}; +extern struct vnodeop_desc vop_strategy_desc; +static inline int VOP_STRATEGY(bp) + struct buf *bp; +{ + struct vop_strategy_args a; + + a.a_desc = VDESC(vop_strategy); + a.a_bp = bp; + return (VCALL((bp)->b_vp, VOFFSET(vop_strategy), &a)); +} + +struct vop_bwrite_args { + struct vnodeop_desc *a_desc; + struct buf *a_bp; +}; +extern struct vnodeop_desc vop_bwrite_desc; +static inline int VOP_BWRITE(bp) + struct buf *bp; +{ + struct vop_bwrite_args a; + + a.a_desc = VDESC(vop_bwrite); + a.a_bp = bp; + return (VCALL((bp)->b_vp, VOFFSET(vop_bwrite), &a)); +} +END_OF_SPECIAL_CASES + +cat << END_OF_SPECIAL_CASES >> $CFILE +int vop_strategy_vp_offsets[] = { + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_strategy_desc = { + 0, + "vop_strategy", + 0, + vop_strategy_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; +int vop_bwrite_vp_offsets[] = { + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_bwrite_desc = { + 0, + "vop_bwrite", + 0, + vop_bwrite_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; 
+END_OF_SPECIAL_CASES
+
+# Add the vfs_op_descs array to the C file: the default and the two special-case descriptors first, then one entry per operation found in $SRC, then a NULL terminator.
+$AWK '
+	BEGIN {
+		printf("\nstruct vnodeop_desc *vfs_op_descs[] = {\n");
+		printf("\t&vop_default_desc, /* MUST BE FIRST */\n");
+		printf("\t&vop_strategy_desc, /* XXX: SPECIAL CASE */\n");
+		printf("\t&vop_bwrite_desc, /* XXX: SPECIAL CASE */\n");
+	}
+	END {
+		printf("\tNULL\n};\n");
+	}
+	NF == 0 || $0 ~ "^#" {
+		next;
+	}
+	{
+		# Emit a table entry named after the first field of the line (the operation name).
+		printf("\t&%s_desc,\n", $1);
+
+		# Skip the argument lines up to the closing "};"; on end of input, exit (the END rule still prints the terminator).
+		for (;;) {
+			if (getline <= 0)
+				exit
+			if ($0 ~ "^};")
+				break;
+		}
+	}' < $SRC >> $CFILE
+
diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src
new file mode 100644
index 00000000000..caee21dce0b
--- /dev/null
+++ b/sys/kern/vnode_if.src
@@ -0,0 +1,296 @@
+#
+# Copyright (c) 1992, 1993
+# The Regents of the University of California. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# 3. All advertising materials mentioning features or use of this software
+# must display the following acknowledgement:
+# This product includes software developed by the University of
+# California, Berkeley and its contributors.
+# 4. Neither the name of the University nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# @(#)vnode_if.src 8.3 (Berkeley) 2/3/94 +# +vop_lookup { + IN struct vnode *dvp; + INOUT struct vnode **vpp; + IN struct componentname *cnp; +}; + +vop_create { + IN WILLRELE struct vnode *dvp; + OUT struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; +}; + +vop_mknod { + IN WILLRELE struct vnode *dvp; + OUT WILLRELE struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; +}; + +vop_open { + IN struct vnode *vp; + IN int mode; + IN struct ucred *cred; + IN struct proc *p; +}; + +vop_close { + IN struct vnode *vp; + IN int fflag; + IN struct ucred *cred; + IN struct proc *p; +}; + +vop_access { + IN struct vnode *vp; + IN int mode; + IN struct ucred *cred; + IN struct proc *p; +}; + +vop_getattr { + IN struct vnode *vp; + IN struct vattr *vap; + IN struct ucred *cred; + IN struct proc *p; +}; + +vop_setattr { + IN struct vnode *vp; + IN struct vattr *vap; + IN struct ucred *cred; + IN struct proc *p; +}; + +vop_read { + IN struct vnode *vp; + INOUT struct uio *uio; + IN int ioflag; + IN struct ucred *cred; +}; + +vop_write { + IN struct vnode *vp; + INOUT struct uio *uio; + IN int ioflag; + IN struct ucred *cred; +}; + +vop_ioctl { + IN struct vnode *vp; + 
IN int command; + IN caddr_t data; + IN int fflag; + IN struct ucred *cred; + IN struct proc *p; +}; + +# Needs work? (fflags) +vop_select { + IN struct vnode *vp; + IN int which; + IN int fflags; + IN struct ucred *cred; + IN struct proc *p; +}; + +vop_mmap { + IN struct vnode *vp; + IN int fflags; + IN struct ucred *cred; + IN struct proc *p; +}; + +vop_fsync { + IN struct vnode *vp; + IN struct ucred *cred; + IN int waitfor; + IN struct proc *p; +}; + +# Needs work: Is newoff right? What's it mean? +vop_seek { + IN struct vnode *vp; + IN off_t oldoff; + IN off_t newoff; + IN struct ucred *cred; +}; + +vop_remove { + IN WILLRELE struct vnode *dvp; + IN WILLRELE struct vnode *vp; + IN struct componentname *cnp; +}; + +vop_link { + IN WILLRELE struct vnode *vp; + IN struct vnode *tdvp; + IN struct componentname *cnp; +}; + +vop_rename { + IN WILLRELE struct vnode *fdvp; + IN WILLRELE struct vnode *fvp; + IN struct componentname *fcnp; + IN WILLRELE struct vnode *tdvp; + IN WILLRELE struct vnode *tvp; + IN struct componentname *tcnp; +}; + +vop_mkdir { + IN WILLRELE struct vnode *dvp; + OUT struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; +}; + +vop_rmdir { + IN WILLRELE struct vnode *dvp; + IN WILLRELE struct vnode *vp; + IN struct componentname *cnp; +}; + +vop_symlink { + IN WILLRELE struct vnode *dvp; + OUT WILLRELE struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; + IN char *target; +}; + +vop_readdir { + IN struct vnode *vp; + INOUT struct uio *uio; + IN struct ucred *cred; +}; + +vop_readlink { + IN struct vnode *vp; + INOUT struct uio *uio; + IN struct ucred *cred; +}; + +vop_abortop { + IN struct vnode *dvp; + IN struct componentname *cnp; +}; + +vop_inactive { + IN struct vnode *vp; +}; + +vop_reclaim { + IN struct vnode *vp; +}; + +vop_lock { + IN struct vnode *vp; +}; + +vop_unlock { + IN struct vnode *vp; +}; + +vop_bmap { + IN struct vnode *vp; + IN daddr_t bn; + OUT struct vnode **vpp; + IN daddr_t
*bnp; + OUT int *runp; +}; + +#vop_strategy { +# IN struct buf *bp; +#}; + +vop_print { + IN struct vnode *vp; +}; + +vop_islocked { + IN struct vnode *vp; +}; + +vop_pathconf { + IN struct vnode *vp; + IN int name; + OUT int *retval; +}; + +vop_advlock { + IN struct vnode *vp; + IN caddr_t id; + IN int op; + IN struct flock *fl; + IN int flags; +}; + +vop_blkatoff { + IN struct vnode *vp; + IN off_t offset; + OUT char **res; + OUT struct buf **bpp; +}; + +vop_valloc { + IN struct vnode *pvp; + IN int mode; + IN struct ucred *cred; + OUT struct vnode **vpp; +}; + +vop_reallocblks { + IN struct vnode *vp; + IN struct cluster_save *buflist; +}; + +vop_vfree { + IN struct vnode *pvp; + IN ino_t ino; + IN int mode; +}; + +vop_truncate { + IN struct vnode *vp; + IN off_t length; + IN int flags; + IN struct ucred *cred; + IN struct proc *p; +}; + +vop_update { + IN struct vnode *vp; + IN struct timeval *access; + IN struct timeval *modify; + IN int waitfor; +}; + +# Needs work: no vp? +#vop_bwrite { +# IN struct buf *bp; +#}; diff --git a/sys/libkern/Makefile b/sys/libkern/Makefile new file mode 100644 index 00000000000..991a4350144 --- /dev/null +++ b/sys/libkern/Makefile @@ -0,0 +1,20 @@ +# @(#)Makefile 7.9 (Berkeley) 6/1/93 + +LIB= kern +CFLAGS+= -I${.CURDIR} -I${.CURDIR}/.. 
+SRCS= adddi3.c anddi3.c ashldi3.c ashrdi3.c bcmp.c cmpdi2.c divdi3.c \ + ffs.c iordi3.c locc.c lshldi3.c lshrdi3.c mcount.c moddi3.c \ + muldi3.c negdi2.c notdi2.c qdivrem.c random.c rindex.c scanc.c \ + skpc.c strcat.c strcmp.c strcpy.c strlen.c strncpy.c subdi3.c \ + ucmpdi2.c udivdi3.c umoddi3.c xordi3.c + +.if exists(${.CURDIR}/${MACHINE}/Makefile.inc) +.PATH: ${.CURDIR}/${MACHINE} +.include "${.CURDIR}/${MACHINE}/Makefile.inc" +.endif + +# mcount cannot be compiled with profiling +mcount.po: mcount.o + cp mcount.o mcount.po + +.include <bsd.lib.mk> diff --git a/sys/libkern/adddi3.c b/sys/libkern/adddi3.c new file mode 100644 index 00000000000..d10da47e0cf --- /dev/null +++ b/sys/libkern/adddi3.c @@ -0,0 +1,60 @@ +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This software was developed by the Computer Systems Engineering group + * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and + * contributed to Berkeley. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)adddi3.c 8.1 (Berkeley) 6/4/93";
+#endif /* LIBC_SCCS and not lint */
+
+#include "quad.h"
+
+/*
+ * Quad addition. The low words are summed first; that u_long sum wraps
+ * (carries out one bit) exactly when it ends up smaller than either operand,
+ * so comparing it with one operand yields the carry into the high word.
+ */
+quad_t
+__adddi3(a, b)
+	quad_t a, b;
+{
+	union uu x, y;
+
+	x.q = a;
+	y.q = b;
+	x.ul[L] += y.ul[L];		/* low word; may wrap */
+	x.ul[H] += y.ul[H] + (x.ul[L] < y.ul[L]);	/* plus the carry */
+	return (x.q);
+}
diff --git a/sys/libkern/anddi3.c b/sys/libkern/anddi3.c
new file mode 100644
index 00000000000..5ae45ac1a86
--- /dev/null
+++ b/sys/libkern/anddi3.c
@@ -0,0 +1,58 @@
+/*-
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This software was developed by the Computer Systems Engineering group
+ * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
+ * contributed to Berkeley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)anddi3.c 8.1 (Berkeley) 6/4/93"; +#endif /* LIBC_SCCS and not lint */ + +#include "quad.h" + +/* + * Return a & b, in quad. 
+ */
+quad_t
+__anddi3(a, b)
+	quad_t a, b;
+{
+	union uu lhs, rhs;	/* word-wise views of the two operands */
+
+	lhs.q = a;
+	rhs.q = b;
+	lhs.ul[0] = lhs.ul[0] & rhs.ul[0];
+	lhs.ul[1] = lhs.ul[1] & rhs.ul[1];
+	return (lhs.q);
+}
diff --git a/sys/libkern/ashldi3.c b/sys/libkern/ashldi3.c
new file mode 100644
index 00000000000..72501adfaed
--- /dev/null
+++ b/sys/libkern/ashldi3.c
@@ -0,0 +1,66 @@
+/*-
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This software was developed by the Computer Systems Engineering group
+ * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
+ * contributed to Berkeley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)ashldi3.c 8.1 (Berkeley) 6/4/93";
+#endif /* LIBC_SCCS and not lint */
+
+#include "quad.h"
+
+/*
+ * Arithmetic left shift of a (signed) quad value.
+ * Bit for bit, that is exactly a logical left shift.
+ */
+quad_t
+__ashldi3(a, shift)
+	quad_t a;
+	qshift_t shift;
+{
+	union uu v;
+
+	v.q = a;
+	if (shift >= LONG_BITS) {	/* low word moves wholly into (or past) the high word */
+		v.ul[H] = shift < QUAD_BITS ?
+		    v.ul[L] << (shift - LONG_BITS) : 0;
+		v.ul[L] = 0;
+	} else if (shift > 0) {		/* bits cross from the low into the high word */
+		v.ul[H] = (v.ul[L] >> (LONG_BITS - shift)) |
+		    (v.ul[H] << shift);
+		v.ul[L] = v.ul[L] << shift;
+	}
+	return (v.q);
+}
diff --git a/sys/libkern/ashrdi3.c b/sys/libkern/ashrdi3.c
new file mode 100644
index 00000000000..9ffa5ed06b5
--- /dev/null
+++ b/sys/libkern/ashrdi3.c
@@ -0,0 +1,75 @@
+/*-
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This software was developed by the Computer Systems Engineering group
+ * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
+ * contributed to Berkeley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)ashrdi3.c 8.1 (Berkeley) 6/4/93";
+#endif /* LIBC_SCCS and not lint */
+
+#include "quad.h"
+
+/*
+ * Shift a (signed) quad value right (arithmetic shift right); a shift of zero returns a unchanged.
+ */
+quad_t
+__ashrdi3(a, shift)
+	quad_t a;
+	qshift_t shift;
+{
+	union uu aa;
+
+	aa.q = a;
+	if (shift >= LONG_BITS) {	/* the whole low word is shifted out */
+		long s;
+
+		/*
+		 * Smear bits rightward using the machine's right-shift
+		 * method, whether that is sign extension or zero fill,
+		 * to get the `sign word' s. Note that shifting by
+		 * LONG_BITS is undefined, so we shift (LONG_BITS-1),
+		 * then 1 more, to get our answer.
+		 */
+		s = (aa.sl[H] >> (LONG_BITS - 1)) >> 1;
+		aa.ul[L] = shift >= QUAD_BITS ? s :
+		    aa.sl[H] >> (shift - LONG_BITS);
+		aa.ul[H] = s;	/* the high word keeps only the smeared sign word */
+	} else if (shift > 0) {	/* bits cross from the high into the low word */
+		aa.ul[L] = (aa.ul[L] >> shift) |
+		    (aa.ul[H] << (LONG_BITS - shift));
+		aa.sl[H] >>= shift;
+	}
+	return (aa.q);
+}
diff --git a/sys/libkern/bcmp.c b/sys/libkern/bcmp.c
new file mode 100644
index 00000000000..5a3ae616800
--- /dev/null
+++ b/sys/libkern/bcmp.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 1987, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)bcmp.c 8.1 (Berkeley) 6/4/93";
+#endif /* LIBC_SCCS and not lint */
+
+#include <string.h>
+
+/*
+ * bcmp -- vax cmpc3 instruction; returns 0 when the first length bytes of b1 and b2 match, 1 otherwise
+ */
+int
+bcmp(b1, b2, length)
+	const void *b1, *b2;
+	register size_t length;
+{
+	register const char *p1, *p2;
+
+	if (length == 0)
+		return(0);
+	p1 = b1;
+	p2 = b2;
+	do
+		if (*p1++ != *p2++)
+			break;
+	while (--length);
+	return(length != 0);	/* a size_t remainder could truncate to 0 in an int */
+}
diff --git a/sys/libkern/cmpdi2.c b/sys/libkern/cmpdi2.c
new file mode 100644
index 00000000000..f6e4bdd6a4d
--- /dev/null
+++ b/sys/libkern/cmpdi2.c
@@ -0,0 +1,59 @@
+/*-
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This software was developed by the Computer Systems Engineering group
+ * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
+ * contributed to Berkeley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3.
All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)cmpdi2.c 8.1 (Berkeley) 6/4/93";
+#endif /* LIBC_SCCS and not lint */
+
+#include "quad.h"
+
+/*
+ * Three-way signed compare: yields 0 when a < b, 1 when a == b, 2 when a > b.
+ * Only the high word carries the sign, so the high words are compared
+ * through sl[] (signed) and the low words through ul[] (unsigned).
+ */
+int
+__cmpdi2(a, b)
+	quad_t a, b;
+{
+	union uu x, y;
+
+	x.q = a;
+	y.q = b;
+	return (x.sl[H] != y.sl[H] ? (x.sl[H] < y.sl[H] ? 0 : 2) :
+	    x.ul[L] < y.ul[L] ? 0 : x.ul[L] > y.ul[L] ? 2 : 1);
+}
diff --git a/sys/libkern/divdi3.c b/sys/libkern/divdi3.c
new file mode 100644
index 00000000000..da7b2fccd01
--- /dev/null
+++ b/sys/libkern/divdi3.c
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California.
All rights reserved. + * + * This software was developed by the Computer Systems Engineering group + * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and + * contributed to Berkeley. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)divdi3.c 8.1 (Berkeley) 6/4/93";
+#endif /* LIBC_SCCS and not lint */
+
+#include "quad.h"
+
+/*
+ * Signed quad division, done on magnitudes with the sign reapplied afterwards.
+ * ??? if -1/2 should produce -1 on this machine, this code is wrong
+ */
+quad_t
+__divdi3(a, b)
+	quad_t a, b;
+{
+	u_quad_t num, den, quo;
+	int negate;
+
+	/* The quotient is negative iff exactly one operand is negative. */
+	num = a;
+	den = b;
+	negate = (a < 0) != (b < 0);
+	if (a < 0)
+		num = -num;
+	if (b < 0)
+		den = -den;
+	quo = __qdivrem(num, den, (u_quad_t *)0);
+	return (negate ? -quo : quo);
+}
diff --git a/sys/libkern/ffs.c b/sys/libkern/ffs.c
new file mode 100644
index 00000000000..099ff8e4c91
--- /dev/null
+++ b/sys/libkern/ffs.c
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)ffs.c 8.1 (Berkeley) 6/4/93";
+#endif /* LIBC_SCCS and not lint */
+
+#include <string.h>
+
+/*
+ * ffs -- vax ffs instruction; returns the 1-based index of the lowest set bit of mask, or 0 when mask is 0
+ */
+int
+ffs(mask)
+	register int mask;
+{
+	register int bit;
+
+	if (mask == 0)
+		return(0);
+	for (bit = 1; !(mask & 1); bit++)
+		mask = (unsigned int)mask >> 1;	/* shift unsigned: >> of a negative int is implementation-defined */
+	return(bit);
+}
diff --git a/sys/libkern/iordi3.c b/sys/libkern/iordi3.c
new file mode 100644
index 00000000000..e225005d414
--- /dev/null
+++ b/sys/libkern/iordi3.c
@@ -0,0 +1,58 @@
+/*-
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This software was developed by the Computer Systems Engineering group
+ * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
+ * contributed to Berkeley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)iordi3.c 8.1 (Berkeley) 6/4/93"; +#endif /* LIBC_SCCS and not lint */ + +#include "quad.h" + +/* + * Return a | b, in quad. 
+ */
+quad_t
+__iordi3(a, b)
+	quad_t a, b;
+{
+	union uu lhs, rhs;	/* word-wise views of the two operands */
+
+	lhs.q = a;
+	rhs.q = b;
+	lhs.ul[0] = lhs.ul[0] | rhs.ul[0];
+	lhs.ul[1] = lhs.ul[1] | rhs.ul[1];
+	return (lhs.q);
+}
diff --git a/sys/libkern/libkern.h b/sys/libkern/libkern.h
new file mode 100644
index 00000000000..0e465e03dfd
--- /dev/null
+++ b/sys/libkern/libkern.h
@@ -0,0 +1,98 @@
+/*-
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)libkern.h 8.1 (Berkeley) 6/10/93
+ */
+
+#include
+
+static inline int		/* larger of two ints */
+imax(a, b)
+	int a, b;
+{
+	return (a < b ? b : a);
+}
+static inline int		/* smaller of two ints */
+imin(a, b)
+	int a, b;
+{
+	return (b < a ? b : a);
+}
+static inline long		/* larger of two longs */
+lmax(a, b)
+	long a, b;
+{
+	return (a < b ? b : a);
+}
+static inline long		/* smaller of two longs */
+lmin(a, b)
+	long a, b;
+{
+	return (b < a ? b : a);
+}
+static inline u_int		/* larger of two u_ints (compared unsigned) */
+max(a, b)
+	u_int a, b;
+{
+	return (a < b ? b : a);
+}
+static inline u_int		/* smaller of two u_ints (compared unsigned) */
+min(a, b)
+	u_int a, b;
+{
+	return (b < a ? b : a);
+}
+static inline u_long		/* larger of two u_longs */
+ulmax(a, b)
+	u_long a, b;
+{
+	return (a < b ? b : a);
+}
+static inline u_long		/* smaller of two u_longs */
+ulmin(a, b)
+	u_long a, b;
+{
+	return (b < a ? b : a);
+}
+
+/* Prototypes for non-quad routines. */
+int	 bcmp __P((const void *, const void *, size_t));
+int	 ffs __P((int));
+int	 locc __P((int, char *, u_int));
+u_long	 random __P((void));
+char	*rindex __P((const char *, int));
+int	 scanc __P((u_int, u_char *, u_char *, int));
+int	 skpc __P((int, int, char *));
+char	*strcat __P((char *, const char *));
+char	*strcpy __P((char *, const char *));
+size_t	 strlen __P((const char *));
+char	*strncpy __P((char *, const char *, size_t));
diff --git a/sys/libkern/locc.c b/sys/libkern/locc.c
new file mode 100644
index 00000000000..3767222c5f0
--- /dev/null
+++ b/sys/libkern/locc.c
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 1992, 1993
+ *	The Regents of the University of California. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * @(#)locc.c 8.1 (Berkeley) 6/10/93
+ */
+
+#include
+
+int
+locc(mask0, cp0, size)
+	int mask0;
+	char *cp0;
+	u_int size;
+{
+	register u_char *p = (u_char *)cp0, *limit, want = mask0;
+
+	for (limit = p + size; p < limit; ++p)	/* scan for the first match */
+		if (*p == want)
+			break;
+	return (limit - p);	/* bytes from the match to the end; 0 if none */
+}
diff --git a/sys/libkern/lshldi3.c b/sys/libkern/lshldi3.c
new file mode 100644
index 00000000000..0af6051c1a6
--- /dev/null
+++ b/sys/libkern/lshldi3.c
@@ -0,0 +1,66 @@
+/*-
+ * Copyright (c) 1992, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * This software was developed by the Computer Systems Engineering group
+ * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
+ * contributed to Berkeley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)lshldi3.c 8.1 (Berkeley) 6/4/93";
+#endif /* LIBC_SCCS and not lint */
+
+#include "quad.h"
+
+/*
+ * Logical left shift of a quad, done on its two longword halves.
+ * (An arithmetic left shift moves exactly the same bits.)
+ */
+quad_t
+__lshldi3(a, shift)
+	quad_t a;
+	qshift_t shift;
+{
+	union uu v;
+
+	v.q = a;
+	if (shift > 0 && shift < LONG_BITS) {	/* bits cross from L into H */
+		v.ul[H] = (v.ul[H] << shift) |
+		    (v.ul[L] >> (LONG_BITS - shift));
+		v.ul[L] <<= shift;
+	} else if (shift >= LONG_BITS) {	/* low word is shifted out whole */
+		v.ul[H] = shift < QUAD_BITS ?
+		    v.ul[L] << (shift - LONG_BITS) : 0;
+		v.ul[L] = 0;
+	}
+	return (v.q);
+}
diff --git a/sys/libkern/lshrdi3.c b/sys/libkern/lshrdi3.c
new file mode 100644
index 00000000000..add2eda988c
--- /dev/null
+++ b/sys/libkern/lshrdi3.c
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 1992, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * This software was developed by the Computer Systems Engineering group
+ * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
+ * contributed to Berkeley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)lshrdi3.c 8.1 (Berkeley) 6/4/93";
+#endif /* LIBC_SCCS and not lint */
+
+#include "quad.h"
+
+/*
+ * Logical right shift of a quad, done on its two longword halves.
+ */
+quad_t
+__lshrdi3(a, shift)
+	quad_t a;
+	qshift_t shift;
+{
+	union uu v;
+
+	v.q = a;
+	if (shift > 0 && shift < LONG_BITS) {	/* bits cross from H into L */
+		v.ul[L] = (v.ul[L] >> shift) |
+		    (v.ul[H] << (LONG_BITS - shift));
+		v.ul[H] >>= shift;
+	} else if (shift >= LONG_BITS) {	/* high word is shifted out whole */
+		v.ul[L] = shift < QUAD_BITS ?
+		    v.ul[H] >> (shift - LONG_BITS) : 0;
+		v.ul[H] = 0;
+	}
+	return (v.q);
+}
diff --git a/sys/libkern/mcount.c b/sys/libkern/mcount.c
new file mode 100644
index 00000000000..523217d1d2d
--- /dev/null
+++ b/sys/libkern/mcount.c
@@ -0,0 +1,178 @@
+/*-
+ * Copyright (c) 1983, 1992, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if !defined(lint) && !defined(KERNEL) && defined(LIBC_SCCS)
+static char sccsid[] = "@(#)mcount.c 8.1 (Berkeley) 6/4/93";
+#endif
+
+#include
+#include
+
+/*
+ * mcount is called on entry to each function compiled with the profiling
+ * switch set. _mcount(), which is declared in a machine-dependent way
+ * with _MCOUNT_DECL, does the actual work and is either inlined into a
+ * C routine or called by an assembly stub. In any case, this magic is
+ * taken care of by the MCOUNT definition in .
+ *
+ * _mcount updates data structures that represent traversals of the
+ * program's call graph edges. frompc and selfpc are the return
+ * address and function address that represents the given call graph edge.
+ *
+ * Note: the original BSD code used the same variable (frompcindex) for
+ * both frompcindex and frompc. Any reasonable, modern compiler will
+ * perform this optimization.
+ */
+_MCOUNT_DECL(frompc, selfpc) /* _mcount; may be static, inline, etc */
+	register u_long frompc, selfpc;
+{
+	register u_short *frompcindex;	/* froms[] slot for this call site */
+	register struct tostruct *top, *prevtop;
+	register struct gmonparam *p;
+	register long toindex;		/* index into p->tos[] */
+#ifdef KERNEL
+	register int s;	/* not used directly here; presumably for MCOUNT_ENTER/EXIT */
+#endif
+
+	p = &_gmonparam;
+	/*
+	 * check that we are profiling
+	 * and that we aren't recursively invoked.
+	 */
+	if (p->state != GMON_PROF_ON)
+		return;
+#ifdef KERNEL
+	MCOUNT_ENTER;
+#else
+	p->state = GMON_PROF_BUSY;	/* a nested call now fails the test above */
+#endif
+	/*
+	 * check that frompcindex is a reasonable pc value.
+	 * for example: signal catchers get called from the stack,
+	 * not from text space. too bad.
+	 */
+	frompc -= p->lowpc;		/* frompc is now an offset from lowpc */
+	if (frompc > p->textsize)
+		goto done;
+
+	frompcindex = &p->froms[frompc / (p->hashfraction * sizeof(*p->froms))];
+	toindex = *frompcindex;		/* head of this call site's arc chain */
+	if (toindex == 0) {
+		/*
+		 * first time traversing this arc
+		 */
+		toindex = ++p->tos[0].link;	/* tos[0].link: last slot handed out */
+		if (toindex >= p->tolimit)
+			/* halt further profiling */
+			goto overflow;
+
+		*frompcindex = toindex;
+		top = &p->tos[toindex];
+		top->selfpc = selfpc;
+		top->count = 1;
+		top->link = 0;		/* a zero link ends the chain */
+		goto done;
+	}
+	top = &p->tos[toindex];
+	if (top->selfpc == selfpc) {
+		/*
+		 * arc at front of chain; usual case.
+		 */
+		top->count++;
+		goto done;
+	}
+	/*
+	 * have to go looking down chain for it.
+	 * top points to what we are looking at,
+	 * prevtop points to previous top.
+	 * we know it is not at the head of the chain.
+	 */
+	for (; /* goto done */; ) {
+		if (top->link == 0) {
+			/*
+			 * top is end of the chain and none of the chain
+			 * had top->selfpc == selfpc.
+			 * so we allocate a new tostruct
+			 * and link it to the head of the chain.
+			 */
+			toindex = ++p->tos[0].link;
+			if (toindex >= p->tolimit)
+				goto overflow;
+
+			top = &p->tos[toindex];
+			top->selfpc = selfpc;
+			top->count = 1;
+			top->link = *frompcindex;	/* old head follows the new arc */
+			*frompcindex = toindex;
+			goto done;
+		}
+		/*
+		 * otherwise, check the next arc on the chain.
+		 */
+		prevtop = top;
+		top = &p->tos[top->link];
+		if (top->selfpc == selfpc) {
+			/*
+			 * there it is.
+			 * increment its count
+			 * move it to the head of the chain.
+			 */
+			top->count++;
+			toindex = prevtop->link;	/* index of the arc just found */
+			prevtop->link = top->link;	/* unlink it from mid-chain */
+			top->link = *frompcindex;
+			*frompcindex = toindex;
+			goto done;
+		}
+
+	}
+done:
+#ifdef KERNEL
+	MCOUNT_EXIT;
+#else
+	p->state = GMON_PROF_ON;	/* leave the busy state set on entry */
+#endif
+	return;
+overflow:
+	p->state = GMON_PROF_ERROR;	/* later calls fail the state test and return */
+#ifdef KERNEL
+	MCOUNT_EXIT;
+#endif
+	return;
+}
+
+/*
+ * Actual definition of mcount function. Defined in ,
+ * which is included by .
+ */
+MCOUNT
diff --git a/sys/libkern/moddi3.c b/sys/libkern/moddi3.c
new file mode 100644
index 00000000000..f31c6e84f2b
--- /dev/null
+++ b/sys/libkern/moddi3.c
@@ -0,0 +1,67 @@
+/*-
+ * Copyright (c) 1992, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * This software was developed by the Computer Systems Engineering group
+ * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
+ * contributed to Berkeley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)moddi3.c 8.1 (Berkeley) 6/4/93";
+#endif /* LIBC_SCCS and not lint */
+
+#include "quad.h"
+
+/*
+ * Return remainder after dividing two signed quads. The result takes
+ * the sign of the dividend `a' only (7 % -2 == 1, -7 % 2 == -1).
+ *
+ * XXX If -1/2 should produce -1 on this machine, this code is wrong.
+ */
+quad_t
+__moddi3(a, b)
+	quad_t a, b;
+{
+	u_quad_t ua, ub, ur;
+	int neg;		/* nonzero when the result must be negated */
+
+	if (a < 0)
+		ua = -(u_quad_t)a, neg = 1;	/* unsigned negate: no overflow */
+	else
+		ua = a, neg = 0;
+	if (b < 0)
+		ub = -(u_quad_t)b;	/* divisor's sign must not flip the result */
+	else
+		ub = b;
+	(void)__qdivrem(ua, ub, &ur);	/* only the remainder is wanted */
+	return (neg ? -ur : ur);
+}
diff --git a/sys/libkern/muldi3.c b/sys/libkern/muldi3.c
new file mode 100644
index 00000000000..a8d7cfc7eab
--- /dev/null
+++ b/sys/libkern/muldi3.c
@@ -0,0 +1,246 @@
+/*-
+ * Copyright (c) 1992, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * This software was developed by the Computer Systems Engineering group
+ * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
+ * contributed to Berkeley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)muldi3.c 8.1 (Berkeley) 6/4/93"; +#endif /* LIBC_SCCS and not lint */ + +#include "quad.h" + +/* + * Multiply two quads. + * + * Our algorithm is based on the following. Split incoming quad values + * u and v (where u,v >= 0) into + * + * u = 2^n u1 + u0 (n = number of bits in `u_long', usu.
32)
+ *
+ * and
+ *
+ *	v = 2^n v1 + v0
+ *
+ * Then
+ *
+ *	uv = 2^2n u1 v1 + 2^n u1 v0 + 2^n v1 u0 + u0 v0
+ *	   = 2^2n u1 v1 + 2^n (u1 v0 + v1 u0) + u0 v0
+ *
+ * Now add 2^n u1 v1 to the first term and subtract it from the middle,
+ * and add 2^n u0 v0 to the last term and subtract it from the middle.
+ * This gives:
+ *
+ *	uv = (2^2n + 2^n) (u1 v1) +
+ *	     (2^n) (u1 v0 - u1 v1 + u0 v1 - u0 v0) +
+ *	     (2^n + 1) (u0 v0)
+ *
+ * Factoring the middle a bit gives us:
+ *
+ *	uv = (2^2n + 2^n) (u1 v1) + [u1v1 = high]
+ *	     (2^n) (u1 - u0) (v0 - v1) + [(u1-u0)... = mid]
+ *	     (2^n + 1) (u0 v0) [u0v0 = low]
+ *
+ * The terms (u1 v1), (u1 - u0) (v0 - v1), and (u0 v0) can all be done
+ * in just half the precision of the original. (Note that either or both
+ * of (u1 - u0) or (v0 - v1) may be negative.)
+ *
+ * This algorithm is from Knuth vol. 2 (2nd ed), section 4.3.3, p. 278.
+ *
+ * Since C does not give us a `long * long = quad' operator, we split
+ * our input quads into two longs, then split the two longs into two
+ * shorts. We can then calculate `short * short = long' in native
+ * arithmetic.
+ *
+ * Our product should, strictly speaking, be a `long quad', with 128
+ * bits, but we are going to discard the upper 64. In other words,
+ * we are not interested in uv, but rather in (uv mod 2^2n). This
+ * makes some of the terms above vanish, and we get:
+ *
+ *	(2^n)(high) + (2^n)(mid) + (2^n + 1)(low)
+ *
+ * or
+ *
+ *	(2^n)(high + mid + low) + low
+ *
+ * Furthermore, `high' and `mid' can be computed mod 2^n, as any factor
+ * of 2^n in either one will also vanish. Only `low' need be computed
+ * mod 2^2n, and only because of the final term above.
+ */
+static quad_t __lmulq(u_long, u_long);
+
+quad_t
+__muldi3(a, b)
+	quad_t a, b;
+{
+	union uu u, v, low, prod;
+	register u_long high, mid, udiff, vdiff;
+	register int negall, negmid;	/* sign of the result / of `mid' */
+#define	u1	u.ul[H]
+#define	u0	u.ul[L]
+#define	v1	v.ul[H]
+#define	v0	v.ul[L]
+
+	/*
+	 * Get u and v such that u, v >= 0, after which u1, u0, v1, and v0
+	 * are directly accessible through the longword fields. Negate in
+	 * unsigned arithmetic so the most negative quad cannot overflow.
+	 */
+	if (a >= 0)
+		u.q = a, negall = 0;
+	else
+		u.uq = -(u_quad_t)a, negall = 1;
+	if (b >= 0)
+		v.q = b;
+	else
+		v.uq = -(u_quad_t)b, negall ^= 1;
+
+	if (u1 == 0 && v1 == 0) {
+		/*
+		 * An (I hope) important optimization occurs when u1 and v1
+		 * are both 0. This should be common since most numbers
+		 * are small. Here the product is just u0*v0.
+		 */
+		prod.q = __lmulq(u0, v0);
+	} else {
+		/*
+		 * Compute the three intermediate products, remembering
+		 * whether the middle term is negative. We can discard
+		 * any upper bits in high and mid, so we can use native
+		 * u_long * u_long => u_long arithmetic.
+		 */
+		low.q = __lmulq(u0, v0);
+
+		if (u1 >= u0)
+			negmid = 0, udiff = u1 - u0;
+		else
+			negmid = 1, udiff = u0 - u1;
+		if (v0 >= v1)
+			vdiff = v0 - v1;
+		else
+			vdiff = v1 - v0, negmid ^= 1;
+		mid = udiff * vdiff;
+
+		high = u1 * v1;
+
+		/*
+		 * Assemble the final product.
+		 */
+		prod.ul[H] = high + (negmid ? -mid : mid) + low.ul[L] +
+		    low.ul[H];
+		prod.ul[L] = low.ul[L];
+	}
+	return (negall ? (quad_t)-prod.uq : prod.q);	/* unsigned negate */
+#undef u1
+#undef u0
+#undef v1
+#undef v0
+}
+
+/*
+ * Multiply two 2N-bit longs to produce a 4N-bit quad, where N is half
+ * the number of bits in a long (whatever that is---the code below
+ * does not care as long as quad.h does its part of the bargain---but
+ * typically N==16).
+ *
+ * We use the same algorithm from Knuth, but this time the modulo refinement
+ * does not apply. On the other hand, since N is half the size of a long,
+ * we can get away with native multiplication---none of our input terms
+ * exceeds (ULONG_MAX >> 1).
+ *
+ * Note that, for u_long l, the quad-precision result
+ *
+ *	l << N
+ *
+ * splits into high and low longs as HHALF(l) and LHUP(l) respectively.
+ */
+static quad_t
+__lmulq(u_long u, u_long v)
+{
+	u_long u1, u0, v1, v0, udiff, vdiff, high, mid, low;
+	u_long prodh, prodl, was;	/* was: prodl before an update */
+	union uu prod;
+	int neg;			/* nonzero when `mid' is negative */
+
+	u1 = HHALF(u);
+	u0 = LHALF(u);
+	v1 = HHALF(v);
+	v0 = LHALF(v);
+
+	low = u0 * v0;
+
+	/* This is the same small-number optimization as before. */
+	if (u1 == 0 && v1 == 0)
+		return (low);
+
+	if (u1 >= u0)
+		udiff = u1 - u0, neg = 0;
+	else
+		udiff = u0 - u1, neg = 1;
+	if (v0 >= v1)
+		vdiff = v0 - v1;
+	else
+		vdiff = v1 - v0, neg ^= 1;
+	mid = udiff * vdiff;		/* |(u1 - u0) (v0 - v1)| */
+
+	high = u1 * v1;
+
+	/* prod = (high << 2N) + (high << N); */
+	prodh = high + HHALF(high);
+	prodl = LHUP(high);
+
+	/* if (neg) prod -= mid << N; else prod += mid << N; */
+	if (neg) {
+		was = prodl;
+		prodl -= LHUP(mid);
+		prodh -= HHALF(mid) + (prodl > was);	/* borrow if prodl wrapped */
+	} else {
+		was = prodl;
+		prodl += LHUP(mid);
+		prodh += HHALF(mid) + (prodl < was);	/* carry if prodl wrapped */
+	}
+
+	/* prod += low << N */
+	was = prodl;
+	prodl += LHUP(low);
+	prodh += HHALF(low) + (prodl < was);	/* carry if prodl wrapped */
+	/* ... + low; */
+	if ((prodl += low) < low)		/* sum wrapped: carry into prodh */
+		prodh++;
+
+	/* return 4N-bit product */
+	prod.ul[H] = prodh;
+	prod.ul[L] = prodl;
+	return (prod.q);
+}
diff --git a/sys/libkern/negdi2.c b/sys/libkern/negdi2.c
new file mode 100644
index 00000000000..bb8670d8e2c
--- /dev/null
+++ b/sys/libkern/negdi2.c
@@ -0,0 +1,57 @@
+/*-
+ * Copyright (c) 1992, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * This software was developed by the Computer Systems Engineering group
+ * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
+ * contributed to Berkeley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)negdi2.c 8.1 (Berkeley) 6/4/93"; +#endif /* LIBC_SCCS and not lint */ + +#include "quad.h" + +/* + * Return -a (or, equivalently, 0 - a), in quad. See subdi3.c. 
+ */
+quad_t
+__negdi2(a)
+	quad_t a;
+{
+	union uu in, out;
+
+	in.q = a;
+	out.ul[L] = 0 - in.ul[L];
+	out.ul[H] = 0 - in.ul[H] - (out.ul[L] != 0);	/* borrow from low word */
+	return (out.q);
+}
diff --git a/sys/libkern/notdi2.c b/sys/libkern/notdi2.c
new file mode 100644
index 00000000000..d6247339a80
--- /dev/null
+++ b/sys/libkern/notdi2.c
@@ -0,0 +1,58 @@
+/*-
+ * Copyright (c) 1992, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * This software was developed by the Computer Systems Engineering group
+ * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
+ * contributed to Berkeley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)notdi2.c 8.1 (Berkeley) 6/4/93";
+#endif /* LIBC_SCCS and not lint */
+
+#include "quad.h"
+
+/*
+ * Return the bitwise complement ~a of a quad. gcc names this routine
+ * `one's complement' rather than `not'.
+ */
+quad_t
+__one_cmpldi2(a)
+	quad_t a;
+{
+	union uu bits;
+
+	bits.q = a;
+	bits.ul[1] = ~bits.ul[1];	/* complement each longword separately */
+	bits.ul[0] = ~bits.ul[0];
+	return (bits.q);
+}
diff --git a/sys/libkern/qdivrem.c b/sys/libkern/qdivrem.c
new file mode 100644
index 00000000000..34b94ceaab2
--- /dev/null
+++ b/sys/libkern/qdivrem.c
@@ -0,0 +1,279 @@
+/*-
+ * Copyright (c) 1992, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * This software was developed by the Computer Systems Engineering group
+ * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
+ * contributed to Berkeley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)qdivrem.c 8.1 (Berkeley) 6/4/93"; +#endif /* LIBC_SCCS and not lint */ + +/* + * Multiprecision divide. This algorithm is from Knuth vol. 2 (2nd ed), + * section 4.3.1, pp. 257--259. + */ + +#include "quad.h" + +#define B (1 << HALF_BITS) /* digit base */ + +/* Combine two `digits' to make a single two-digit number. */ +#define COMBINE(a, b) (((u_long)(a) << HALF_BITS) | (b)) + +/* select a type for digits in base B: use unsigned short if they fit */ +#if ULONG_MAX == 0xffffffff && USHRT_MAX >= 0xffff +typedef unsigned short digit; +#else +typedef u_long digit; +#endif + +/* + * Shift p[0]..p[len] left `sh' bits, ignoring any bits that + * `fall out' the left (there never will be any such anyway). 
+ * We may assume len >= 0. NOTE THAT THIS WRITES len+1 DIGITS. + */ +static void +shl(register digit *p, register int len, register int sh) +{ + register int i; + + /* Each digit keeps its own low bits shifted up and takes the top sh bits of the following digit. */ + for (i = 0; i < len; i++) + p[i] = LHALF(p[i] << sh) | (p[i + 1] >> (HALF_BITS - sh)); + p[i] = LHALF(p[i] << sh); /* last digit: nothing to pull in from the right */ +} + +/* + * __qdivrem(u, v, rem) returns u/v and, optionally, sets *rem to u%v. + * + * We do this in base 2-sup-HALF_BITS, so that all intermediate products + * fit within u_long. As a consequence, the maximum length dividend and + * divisor are 4 `digits' in this base (they are shorter if they have + * leading zeros). + */ +u_quad_t +__qdivrem(uq, vq, arq) + u_quad_t uq, vq, *arq; +{ + union uu tmp; + digit *u, *v, *q; + register digit v1, v2; + u_long qhat, rhat, t; + int m, n, d, j, i; + digit uspace[5], vspace[5], qspace[5]; + + /* + * Take care of special cases: divide by zero, and u < v. + */ + if (vq == 0) { + /* divide by zero: the volatile operand forces a real run-time 1/0; what happens next is platform-specific (presumably a trap -- TODO confirm). */ + static volatile const unsigned int zero = 0; + + tmp.ul[H] = tmp.ul[L] = 1 / zero; + if (arq) + *arq = uq; + return (tmp.q); + } + if (uq < vq) { + if (arq) + *arq = uq; + return (0); + } + u = &uspace[0]; + v = &vspace[0]; + q = &qspace[0]; + + /* + * Break dividend and divisor into digits in base B, then + * count leading zeros to determine m and n. When done, we + * will have: + * u = (u[1]u[2]...u[m+n]) sub B + * v = (v[1]v[2]...v[n]) sub B + * v[1] != 0 + * 1 < n <= 4 (if n = 1, we use a different division algorithm) + * m >= 0 (otherwise u < v, which we already checked) + * m + n = 4 + * and thus + * m = 4 - n <= 2 + */ + tmp.uq = uq; + u[0] = 0; + u[1] = HHALF(tmp.ul[H]); + u[2] = LHALF(tmp.ul[H]); + u[3] = HHALF(tmp.ul[L]); + u[4] = LHALF(tmp.ul[L]); + tmp.uq = vq; + v[1] = HHALF(tmp.ul[H]); + v[2] = LHALF(tmp.ul[H]); + v[3] = HHALF(tmp.ul[L]); + v[4] = LHALF(tmp.ul[L]); + for (n = 4; v[1] == 0; v++) { + if (--n == 1) { + u_long rbj; /* r*B+u[j] (not root boy jim) */ + digit q1, q2, q3, q4; + + /* + * Change of plan, per exercise 16.
+ * r = 0; + * for j = 1..4: + * q[j] = floor((r*B + u[j]) / v), + * r = (r*B + u[j]) % v; + * We unroll this completely here. + */ + t = v[2]; /* nonzero, by definition */ + q1 = u[1] / t; + rbj = COMBINE(u[1] % t, u[2]); + q2 = rbj / t; + rbj = COMBINE(rbj % t, u[3]); + q3 = rbj / t; + rbj = COMBINE(rbj % t, u[4]); + q4 = rbj / t; + if (arq) + *arq = rbj % t; + tmp.ul[H] = COMBINE(q1, q2); + tmp.ul[L] = COMBINE(q3, q4); + return (tmp.q); + } + } + + /* + * By adjusting q once we determine m, we can guarantee that + * there is a complete four-digit quotient at &qspace[1] when + * we finally stop. + */ + for (m = 4 - n; u[1] == 0; u++) + m--; + for (i = 4 - m; --i >= 0;) + q[i] = 0; + q += 4 - m; + + /* + * Here we run Program D, translated from MIX to C and acquiring + * a few minor changes. + * + * D1: choose multiplier 1 << d to ensure v[1] >= B/2. + */ + d = 0; + for (t = v[1]; t < B / 2; t <<= 1) + d++; + if (d > 0) { + shl(&u[0], m + n, d); /* u <<= d */ + shl(&v[1], n - 1, d); /* v <<= d */ + } + /* + * D2: j = 0. + */ + j = 0; + v1 = v[1]; /* for D3 -- note that v[1..n] are constant */ + v2 = v[2]; /* for D3 */ + do { + register digit uj0, uj1, uj2; + + /* + * D3: Calculate qhat (\^q, in TeX notation). + * Let qhat = min((u[j]*B + u[j+1])/v[1], B-1), and + * let rhat = (u[j]*B + u[j+1]) mod v[1]. + * While rhat < B and v[2]*qhat > rhat*B+u[j+2], + * decrement qhat and increase rhat correspondingly. + * Note that if rhat >= B, v[2]*qhat < rhat*B. + */ + uj0 = u[j + 0]; /* for D3 only -- note that u[j+...] change */ + uj1 = u[j + 1]; /* for D3 only */ + uj2 = u[j + 2]; /* for D3 only */ + if (uj0 == v1) { + qhat = B; + rhat = uj1; + goto qhat_too_big; + } else { + u_long n = COMBINE(uj0, uj1); /* NOTE(review): shadows the outer digit count n inside this block only */ + qhat = n / v1; + rhat = n % v1; + } + while (v2 * qhat > COMBINE(rhat, uj2)) { + qhat_too_big: + qhat--; + if ((rhat += v1) >= B) + break; + } + /* + * D4: Multiply and subtract. + * The variable `t' holds any borrows across the loop.
+ * We split this up so that we do not require v[0] = 0, + * and to eliminate a final special case. + */ + for (t = 0, i = n; i > 0; i--) { + t = u[i + j] - v[i] * qhat - t; + u[i + j] = LHALF(t); + t = (B - HHALF(t)) & (B - 1); + } + t = u[j] - t; + u[j] = LHALF(t); + /* + * D5: test remainder. + * There is a borrow if and only if HHALF(t) is nonzero; + * in that (rare) case, qhat was too large (by exactly 1). + * Fix it by adding v[1..n] to u[j..j+n]. + */ + if (HHALF(t)) { + qhat--; + for (t = 0, i = n; i > 0; i--) { /* D6: add back. */ + t += u[i + j] + v[i]; + u[i + j] = LHALF(t); + t = HHALF(t); + } + u[j] = LHALF(u[j] + t); + } + q[j] = qhat; + } while (++j <= m); /* D7: loop on j. */ + + /* + * If caller wants the remainder, we have to calculate it as + * u[m..m+n] >> d (this is at most n digits and thus fits in + * u[m+1..m+n], but we may need more source digits). + */ + if (arq) { + if (d) { + for (i = m + n; i > m; --i) + u[i] = (u[i] >> d) | + LHALF(u[i - 1] << (HALF_BITS - d)); + u[i] = 0; + } + /* Read through uspace, not u: u was advanced past leading zero digits above. */ + tmp.ul[H] = COMBINE(uspace[1], uspace[2]); + tmp.ul[L] = COMBINE(uspace[3], uspace[4]); + *arq = tmp.q; + } + + /* Reassemble the four quotient digits qspace[1..4] into a quad. */ + tmp.ul[H] = COMBINE(qspace[1], qspace[2]); + tmp.ul[L] = COMBINE(qspace[3], qspace[4]); + return (tmp.q); +} diff --git a/sys/libkern/quad.h b/sys/libkern/quad.h new file mode 100644 index 00000000000..bc6a2f83632 --- /dev/null +++ b/sys/libkern/quad.h @@ -0,0 +1,110 @@ +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This software was developed by the Computer Systems Engineering group + * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and + * contributed to Berkeley. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)quad.h 8.1 (Berkeley) 6/4/93 + */ + +/* + * Quad arithmetic. + * + * This library makes the following assumptions: + * + * - The type long long (aka quad_t) exists. + * + * - A quad variable is exactly twice as long as `long'. + * + * - The machine's arithmetic is two's complement. + * + * This library can provide 128-bit arithmetic on a machine with 128-bit + * quads and 64-bit longs, for instance, or 96-bit arithmetic on machines + * with 48-bit longs. 
+ */ + +#include <sys/types.h> +#include <limits.h> + +/* + * Depending on the desired operation, we view a `long long' (aka quad_t) in + * one or more of the following formats. The uq member is declared + * u_quad_t so that values read through it are unsigned, as its comment + * promises. + */ +union uu { + quad_t q; /* as a (signed) quad */ + u_quad_t uq; /* as an unsigned quad */ + long sl[2]; /* as two signed longs */ + u_long ul[2]; /* as two unsigned longs */ +}; + +/* + * Define high and low longwords. + */ +#define H _QUAD_HIGHWORD +#define L _QUAD_LOWWORD + +/* + * Total number of bits in a quad_t and in the pieces that make it up. + * These are used for shifting, and also below for halfword extraction + * and assembly. + */ +#define QUAD_BITS (sizeof(quad_t) * CHAR_BIT) +#define LONG_BITS (sizeof(long) * CHAR_BIT) +#define HALF_BITS (sizeof(long) * CHAR_BIT / 2) + +/* + * Extract high and low shortwords from longword, and move low shortword of + * longword to upper half of long, i.e., produce the upper longword of + * ((quad_t)(x) << (number_of_bits_in_long/2)). (`x' must actually be u_long.) + * + * These are used in the multiply code, to split a longword into upper + * and lower halves, and to reassemble a product as a quad_t, shifted left + * (sizeof(long)*CHAR_BIT/2). + * + * LHALF builds its mask from (u_long)1 so that the shift stays defined + * when HALF_BITS is as wide as int (e.g. with 64-bit longs). + */ +#define HHALF(x) ((x) >> HALF_BITS) +#define LHALF(x) ((x) & (((u_long)1 << HALF_BITS) - 1)) +#define LHUP(x) ((x) << HALF_BITS) + +extern u_quad_t __qdivrem __P((u_quad_t u, u_quad_t v, u_quad_t *rem)); + +/* + * XXX + * Compensate for gcc 1 vs gcc 2. Gcc 1 defines ?sh?di3's second argument + * as u_quad_t, while gcc 2 correctly uses int. Unfortunately, we still use + * both compilers. + */ +#if __GNUC__ >= 2 +typedef unsigned int qshift_t; +#else +typedef u_quad_t qshift_t; +#endif diff --git a/sys/libkern/random.c b/sys/libkern/random.c new file mode 100644 index 00000000000..5153124e3fd --- /dev/null +++ b/sys/libkern/random.c @@ -0,0 +1,63 @@ +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)random.c 8.1 (Berkeley) 6/10/93 + */ + +#include + +/* + * Pseudo-random number generator for randomizing the profiling clock, + * and whatever else we might use it for. The result is uniform on + * [0, 2^31 - 1]. 
+ */ +u_long +random() +{ + static u_long randseed = 1; /* generator state, carried from call to call */ + register long x, hi, lo, t; + + /* + * Compute x[n + 1] = (7^5 * x[n]) mod (2^31 - 1). + * From "Random number generators: good ones are hard to find", + * Park and Miller, Communications of the ACM, vol. 31, no. 10, + * October 1988, p. 1195. + * + * x is split by 127773 = (2^31 - 1) / 16807, and 2836 is + * (2^31 - 1) % 16807, so neither product below can exceed 31 bits. + */ + x = randseed; + hi = x / 127773; + lo = x % 127773; + t = 16807 * lo - 2836 * hi; + if (t <= 0) + t += 0x7fffffff; /* bring a non-positive result back into range by adding the modulus */ + randseed = t; + return (t); +} diff --git a/sys/libkern/rindex.c b/sys/libkern/rindex.c new file mode 100644 index 00000000000..69dced4c46d --- /dev/null +++ b/sys/libkern/rindex.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)rindex.c 8.1 (Berkeley) 6/4/93"; +#endif /* LIBC_SCCS and not lint */ + +#include +#include + +/* + * Return a pointer to the last occurrence of ch (converted to char) in the + * NUL-terminated string p, or NULL if it does not occur. The terminating + * NUL is part of the search, so ch == 0 yields a pointer to the terminator. + * Built as strrchr when STRRCHR is defined, as rindex otherwise. + */ +char * +#ifdef STRRCHR +strrchr(p, ch) +#else +rindex(p, ch) +#endif + register const char *p; + register int ch; +{ + register char *save; + + for (save = NULL;; ++p) { + /* compare as char: with signed char, a byte >= 0x80 would never equal the raw int */ + if (*p == (char)ch) + save = (char *)p; + if (!*p) + return(save); + } + /* NOTREACHED */ +} diff --git a/sys/libkern/scanc.c b/sys/libkern/scanc.c new file mode 100644 index 00000000000..2d8b6a06dd6 --- /dev/null +++ b/sys/libkern/scanc.c @@ -0,0 +1,50 @@ +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4.
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)scanc.c 8.1 (Berkeley) 6/10/93 + */ + +#include + +/* + * Scan the size bytes at cp for the first byte c whose table entry has a + * bit in common with mask0, i.e. (table[c] & mask0) != 0. Only the low + * bits of mask0 that fit in a u_char are used. Returns the number of bytes + * from that byte to the end of the buffer (the matching byte included), or + * 0 if no byte matches. + */ +int +scanc(size, cp, table, mask0) + u_int size; + register u_char *cp, table[]; + int mask0; +{ + register u_char *end; + register u_char mask; + + mask = mask0; + for (end = &cp[size]; cp < end && (table[*cp] & mask) == 0; ++cp); + return (end - cp); +} diff --git a/sys/libkern/skpc.c b/sys/libkern/skpc.c new file mode 100644 index 00000000000..11b269ee7e7 --- /dev/null +++ b/sys/libkern/skpc.c @@ -0,0 +1,50 @@ +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)skpc.c 8.1 (Berkeley) 6/10/93 + */ + +#include + +/* + * Skip over the leading bytes of the size-byte buffer cp0 that are equal + * to mask0 (truncated to u_char). Returns the number of bytes left from + * the first differing byte to the end of the buffer, or 0 if every byte + * matches. + */ +int +skpc(mask0, size, cp0) + int mask0; + int size; + char *cp0; +{ + register u_char *cp, *end, mask; + + mask = mask0; + cp = (u_char *)cp0; + for (end = &cp[size]; cp < end && *cp == mask; ++cp); + return (end - cp); +} diff --git a/sys/libkern/strcat.c b/sys/libkern/strcat.c new file mode 100644 index 00000000000..343696719b7 --- /dev/null +++ b/sys/libkern/strcat.c @@ -0,0 +1,50 @@ +/* + * Copyright (c) 1988, 1993 + * The Regents of the University of California. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)strcat.c 8.1 (Berkeley) 6/4/93"; +#endif /* LIBC_SCCS and not lint */ + +#include + +/* + * Append the NUL-terminated string append to the end of s, overwriting + * the NUL that ended s and copying append's own NUL. Returns s. + * Nothing here checks that s has room for the result. + */ +char * +strcat(s, append) + register char *s; + register const char *append; +{ + char *save = s; + + for (; *s; ++s); /* advance s to its terminating NUL */ + while (*s++ = *append++); /* copy through and including append's NUL */ + return(save); +} diff --git a/sys/libkern/strcmp.c b/sys/libkern/strcmp.c new file mode 100644 index 00000000000..79cfaa831b2 --- /dev/null +++ b/sys/libkern/strcmp.c @@ -0,0 +1,55 @@ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Chris Torek. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)strcmp.c 8.1 (Berkeley) 6/4/93"; +#endif /* LIBC_SCCS and not lint */ + +#include +#include + +/* + * Compare strings. Returns 0 if s1 and s2 are identical up to and + * including their terminating NUL; otherwise returns the difference + * between the first pair of differing bytes, each taken as unsigned char + * (negative if s1 sorts first, positive if s2 does). + */ +int +strcmp(s1, s2) + register const char *s1, *s2; +{ + while (*s1 == *s2++) + if (*s1++ == 0) + return (0); + /* s2 was advanced past the mismatch by the loop test; step it back before reading */ + return (*(unsigned char *)s1 - *(unsigned char *)--s2); +} diff --git a/sys/libkern/strcpy.c b/sys/libkern/strcpy.c new file mode 100644 index 00000000000..d1791dd00c3 --- /dev/null +++ b/sys/libkern/strcpy.c @@ -0,0 +1,50 @@ +/* + * Copyright (c) 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4.
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)strcpy.c 8.1 (Berkeley) 6/4/93"; +#endif /* LIBC_SCCS and not lint */ + +#include +#include + +/* + * Copy the NUL-terminated string from, terminator included, into to. + * Returns to. Nothing here checks that to is large enough. + */ +char * +strcpy(to, from) + register char *to; + register const char *from; +{ + char *save = to; + + for (; *to = *from; ++from, ++to); /* the assignment is the loop test: stops after the NUL is copied */ + return(save); +} diff --git a/sys/libkern/strlen.c b/sys/libkern/strlen.c new file mode 100644 index 00000000000..323fbe48452 --- /dev/null +++ b/sys/libkern/strlen.c @@ -0,0 +1,50 @@ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)strlen.c 8.1 (Berkeley) 6/4/93"; +#endif /* LIBC_SCCS and not lint */ + +#include +#include + +/* + * Return the number of bytes in str before its terminating NUL. + */ +size_t +strlen(str) + const char *str; +{ + register const char *s; + + for (s = str; *s; ++s); /* walk s forward to the NUL */ + return(s - str); +} + diff --git a/sys/libkern/strncpy.c b/sys/libkern/strncpy.c new file mode 100644 index 00000000000..9e72740b8b9 --- /dev/null +++ b/sys/libkern/strncpy.c @@ -0,0 +1,68 @@ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved.
+ * + * This code is derived from software contributed to Berkeley by + * Chris Torek. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)strncpy.c 8.1 (Berkeley) 6/4/93"; +#endif /* LIBC_SCCS and not lint */ + +#include +#include + +/* + * Copy src to dst, truncating or null-padding to always copy n bytes. + * Return dst. + * + * If src has n or more bytes before its NUL, exactly n bytes are copied + * and dst is left WITHOUT a terminating NUL. With n == 0 nothing is + * written. + */ +char * +strncpy(dst, src, n) + char *dst; + const char *src; + register size_t n; +{ + if (n != 0) { + register char *d = dst; + register const char *s = src; + + do { + if ((*d++ = *s++) == 0) { + /* NUL pad the remaining n-1 bytes */ + while (--n != 0) + *d++ = 0; + break; + } + } while (--n != 0); + } + return (dst); +} diff --git a/sys/libkern/subdi3.c b/sys/libkern/subdi3.c new file mode 100644 index 00000000000..e9763452e4f --- /dev/null +++ b/sys/libkern/subdi3.c @@ -0,0 +1,59 @@ +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This software was developed by the Computer Systems Engineering group + * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and + * contributed to Berkeley. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)subdi3.c 8.1 (Berkeley) 6/4/93"; +#endif /* LIBC_SCCS and not lint */ + +#include "quad.h" + +/* + * Subtract two quad values. This is trivial since a one-bit carry + * from a single u_long difference x-y occurs if and only if (x-y) > x. + */ +quad_t +__subdi3(a, b) + quad_t a, b; +{ + union uu aa, bb, diff; + + aa.q = a; + bb.q = b; + diff.ul[L] = aa.ul[L] - bb.ul[L]; + diff.ul[H] = aa.ul[H] - bb.ul[H] - (diff.ul[L] > aa.ul[L]); + return (diff.q); +} diff --git a/sys/libkern/ucmpdi2.c b/sys/libkern/ucmpdi2.c new file mode 100644 index 00000000000..e5dfc435d9c --- /dev/null +++ b/sys/libkern/ucmpdi2.c @@ -0,0 +1,58 @@ +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This software was developed by the Computer Systems Engineering group + * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and + * contributed to Berkeley. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)ucmpdi2.c 8.1 (Berkeley) 6/4/93"; +#endif /* LIBC_SCCS and not lint */ + +#include "quad.h" + +/* + * Return 0, 1, or 2 as a <, =, > b respectively. + * Neither a nor b are considered signed. + */ +int +__ucmpdi2(a, b) + u_quad_t a, b; +{ + union uu aa, bb; + + aa.uq = a; + bb.uq = b; + return (aa.ul[H] < bb.ul[H] ? 0 : aa.ul[H] > bb.ul[H] ? 
2 : + aa.ul[L] < bb.ul[L] ? 0 : aa.ul[L] > bb.ul[L] ? 2 : 1); +} diff --git a/sys/libkern/udivdi3.c b/sys/libkern/udivdi3.c new file mode 100644 index 00000000000..8ddd5598911 --- /dev/null +++ b/sys/libkern/udivdi3.c @@ -0,0 +1,53 @@ +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This software was developed by the Computer Systems Engineering group + * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and + * contributed to Berkeley. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)udivdi3.c 8.1 (Berkeley) 6/4/93"; +#endif /* LIBC_SCCS and not lint */ + +#include "quad.h" + +/* + * Divide two unsigned quads. + */ +u_quad_t +__udivdi3(a, b) + u_quad_t a, b; +{ + + return (__qdivrem(a, b, (u_quad_t *)0)); +} diff --git a/sys/libkern/umoddi3.c b/sys/libkern/umoddi3.c new file mode 100644 index 00000000000..2a85f7699a6 --- /dev/null +++ b/sys/libkern/umoddi3.c @@ -0,0 +1,55 @@ +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This software was developed by the Computer Systems Engineering group + * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and + * contributed to Berkeley. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)umoddi3.c 8.1 (Berkeley) 6/4/93"; +#endif /* LIBC_SCCS and not lint */ + +#include "quad.h" + +/* + * Return remainder after dividing two unsigned quads. + */ +u_quad_t +__umoddi3(a, b) + u_quad_t a, b; +{ + u_quad_t r; + + (void)__qdivrem(a, b, &r); + return (r); +} diff --git a/sys/libkern/xordi3.c b/sys/libkern/xordi3.c new file mode 100644 index 00000000000..e3a85889360 --- /dev/null +++ b/sys/libkern/xordi3.c @@ -0,0 +1,58 @@ +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * This software was developed by the Computer Systems Engineering group + * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and + * contributed to Berkeley. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char sccsid[] = "@(#)xordi3.c 8.1 (Berkeley) 6/4/93"; +#endif /* LIBC_SCCS and not lint */ + +#include "quad.h" + +/* + * Return a ^ b, in quad. + */ +quad_t +__xordi3(a, b) + quad_t a, b; +{ + union uu aa, bb; + + aa.q = a; + bb.q = b; + aa.ul[0] ^= bb.ul[0]; + aa.ul[1] ^= bb.ul[1]; + return (aa.q); +} diff --git a/sys/miscfs/deadfs/dead_vnops.c b/sys/miscfs/deadfs/dead_vnops.c new file mode 100644 index 00000000000..9d04652b7fc --- /dev/null +++ b/sys/miscfs/deadfs/dead_vnops.c @@ -0,0 +1,354 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)dead_vnops.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Prototypes for dead operations on vnodes. + */ +int dead_badop(), + dead_ebadf(); +int dead_lookup __P((struct vop_lookup_args *)); +#define dead_create ((int (*) __P((struct vop_create_args *)))dead_badop) +#define dead_mknod ((int (*) __P((struct vop_mknod_args *)))dead_badop) +int dead_open __P((struct vop_open_args *)); +#define dead_close ((int (*) __P((struct vop_close_args *)))nullop) +#define dead_access ((int (*) __P((struct vop_access_args *)))dead_ebadf) +#define dead_getattr ((int (*) __P((struct vop_getattr_args *)))dead_ebadf) +#define dead_setattr ((int (*) __P((struct vop_setattr_args *)))dead_ebadf) +int dead_read __P((struct vop_read_args *)); +int dead_write __P((struct vop_write_args *)); +int dead_ioctl __P((struct vop_ioctl_args *)); +int dead_select __P((struct vop_select_args *)); +#define dead_mmap ((int (*) __P((struct vop_mmap_args *)))dead_badop) +#define dead_fsync ((int (*) __P((struct vop_fsync_args *)))nullop) +#define dead_seek ((int (*) __P((struct vop_seek_args *)))nullop) +#define dead_remove ((int (*) __P((struct vop_remove_args *)))dead_badop) +#define dead_link ((int (*) __P((struct vop_link_args *)))dead_badop) +#define dead_rename ((int (*) __P((struct vop_rename_args *)))dead_badop) +#define dead_mkdir ((int (*) __P((struct vop_mkdir_args *)))dead_badop) +#define 
dead_rmdir ((int (*) __P((struct vop_rmdir_args *)))dead_badop) +#define dead_symlink ((int (*) __P((struct vop_symlink_args *)))dead_badop) +#define dead_readdir ((int (*) __P((struct vop_readdir_args *)))dead_ebadf) +#define dead_readlink ((int (*) __P((struct vop_readlink_args *)))dead_ebadf) +#define dead_abortop ((int (*) __P((struct vop_abortop_args *)))dead_badop) +#define dead_inactive ((int (*) __P((struct vop_inactive_args *)))nullop) +#define dead_reclaim ((int (*) __P((struct vop_reclaim_args *)))nullop) +int dead_lock __P((struct vop_lock_args *)); +#define dead_unlock ((int (*) __P((struct vop_unlock_args *)))nullop) +int dead_bmap __P((struct vop_bmap_args *)); +int dead_strategy __P((struct vop_strategy_args *)); +int dead_print __P((struct vop_print_args *)); +#define dead_islocked ((int (*) __P((struct vop_islocked_args *)))nullop) +#define dead_pathconf ((int (*) __P((struct vop_pathconf_args *)))dead_ebadf) +#define dead_advlock ((int (*) __P((struct vop_advlock_args *)))dead_ebadf) +#define dead_blkatoff ((int (*) __P((struct vop_blkatoff_args *)))dead_badop) +#define dead_valloc ((int (*) __P((struct vop_valloc_args *)))dead_badop) +#define dead_vfree ((int (*) __P((struct vop_vfree_args *)))dead_badop) +#define dead_truncate ((int (*) __P((struct vop_truncate_args *)))nullop) +#define dead_update ((int (*) __P((struct vop_update_args *)))nullop) +#define dead_bwrite ((int (*) __P((struct vop_bwrite_args *)))nullop) + +int (**dead_vnodeop_p)(); +struct vnodeopv_entry_desc dead_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, dead_lookup }, /* lookup */ + { &vop_create_desc, dead_create }, /* create */ + { &vop_mknod_desc, dead_mknod }, /* mknod */ + { &vop_open_desc, dead_open }, /* open */ + { &vop_close_desc, dead_close }, /* close */ + { &vop_access_desc, dead_access }, /* access */ + { &vop_getattr_desc, dead_getattr }, /* getattr */ + { &vop_setattr_desc, dead_setattr }, /* setattr */ + { 
&vop_read_desc, dead_read }, /* read */ + { &vop_write_desc, dead_write }, /* write */ + { &vop_ioctl_desc, dead_ioctl }, /* ioctl */ + { &vop_select_desc, dead_select }, /* select */ + { &vop_mmap_desc, dead_mmap }, /* mmap */ + { &vop_fsync_desc, dead_fsync }, /* fsync */ + { &vop_seek_desc, dead_seek }, /* seek */ + { &vop_remove_desc, dead_remove }, /* remove */ + { &vop_link_desc, dead_link }, /* link */ + { &vop_rename_desc, dead_rename }, /* rename */ + { &vop_mkdir_desc, dead_mkdir }, /* mkdir */ + { &vop_rmdir_desc, dead_rmdir }, /* rmdir */ + { &vop_symlink_desc, dead_symlink }, /* symlink */ + { &vop_readdir_desc, dead_readdir }, /* readdir */ + { &vop_readlink_desc, dead_readlink }, /* readlink */ + { &vop_abortop_desc, dead_abortop }, /* abortop */ + { &vop_inactive_desc, dead_inactive }, /* inactive */ + { &vop_reclaim_desc, dead_reclaim }, /* reclaim */ + { &vop_lock_desc, dead_lock }, /* lock */ + { &vop_unlock_desc, dead_unlock }, /* unlock */ + { &vop_bmap_desc, dead_bmap }, /* bmap */ + { &vop_strategy_desc, dead_strategy }, /* strategy */ + { &vop_print_desc, dead_print }, /* print */ + { &vop_islocked_desc, dead_islocked }, /* islocked */ + { &vop_pathconf_desc, dead_pathconf }, /* pathconf */ + { &vop_advlock_desc, dead_advlock }, /* advlock */ + { &vop_blkatoff_desc, dead_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, dead_valloc }, /* valloc */ + { &vop_vfree_desc, dead_vfree }, /* vfree */ + { &vop_truncate_desc, dead_truncate }, /* truncate */ + { &vop_update_desc, dead_update }, /* update */ + { &vop_bwrite_desc, dead_bwrite }, /* bwrite */ + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc dead_vnodeop_opv_desc = + { &dead_vnodeop_p, dead_vnodeop_entries }; + +/* + * Trivial lookup routine that always fails. 
+ */ +/* ARGSUSED */ +int +dead_lookup(ap) + struct vop_lookup_args /* { + struct vnode * a_dvp; + struct vnode ** a_vpp; + struct componentname * a_cnp; + } */ *ap; +{ + + *ap->a_vpp = NULL; + return (ENOTDIR); +} + +/* + * Open always fails as if device did not exist. + */ +/* ARGSUSED */ +dead_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + return (ENXIO); +} + +/* + * Vnode op for read + */ +/* ARGSUSED */ +dead_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + + if (chkvnlock(ap->a_vp)) + panic("dead_read: lock"); + /* + * Return EOF for character devices, EIO for others + */ + if (ap->a_vp->v_type != VCHR) + return (EIO); + return (0); +} + +/* + * Vnode op for write + */ +/* ARGSUSED */ +dead_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + + if (chkvnlock(ap->a_vp)) + panic("dead_write: lock"); + return (EIO); +} + +/* + * Device ioctl operation. + */ +/* ARGSUSED */ +dead_ioctl(ap) + struct vop_ioctl_args /* { + struct vnode *a_vp; + int a_command; + caddr_t a_data; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + if (!chkvnlock(ap->a_vp)) + return (EBADF); + return (VCALL(ap->a_vp, VOFFSET(vop_ioctl), ap)); +} + +/* ARGSUSED */ +dead_select(ap) + struct vop_select_args /* { + struct vnode *a_vp; + int a_which; + int a_fflags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + /* + * Let the user find out that the descriptor is gone. 
+ */ + return (1); +} + +/* + * Just call the device strategy routine + */ +dead_strategy(ap) + struct vop_strategy_args /* { + struct buf *a_bp; + } */ *ap; +{ + + if (ap->a_bp->b_vp == NULL || !chkvnlock(ap->a_bp->b_vp)) { + ap->a_bp->b_flags |= B_ERROR; + biodone(ap->a_bp); + return (EIO); + } + return (VOP_STRATEGY(ap->a_bp)); +} + +/* + * Wait until the vnode has finished changing state. + */ +dead_lock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + if (!chkvnlock(ap->a_vp)) + return (0); + return (VCALL(ap->a_vp, VOFFSET(vop_lock), ap)); +} + +/* + * Retry the bmap if we had to wait for a state change, else fail with EIO. + */ +dead_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + } */ *ap; +{ + + if (!chkvnlock(ap->a_vp)) + return (EIO); + return (VOP_BMAP(ap->a_vp, ap->a_bn, ap->a_vpp, ap->a_bnp, ap->a_runp)); +} + +/* + * Print out the contents of a dead vnode; always returns 0. + */ +/* ARGSUSED */ +dead_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + printf("tag VT_NON, dead vnode\n"); + return (0); /* implicit-int op must not fall off the end */ +} + +/* + * Empty vnode failed operation + */ +dead_ebadf() +{ + + return (EBADF); +} + +/* + * Empty vnode bad operation + */ +dead_badop() +{ + + panic("dead_badop called"); + /* NOTREACHED */ +} + +/* + * Empty vnode null operation + */ +dead_nullop() +{ + + return (0); +} + +/* + * We have to wait during times when the vnode is + * in a state of change. + */ +chkvnlock(vp) + register struct vnode *vp; +{ + int locked = 0; + + while (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + sleep((caddr_t)vp, PINOD); + locked = 1; + } + return (locked); +} diff --git a/sys/miscfs/fdesc/fdesc.h b/sys/miscfs/fdesc/fdesc.h new file mode 100644 index 00000000000..4c682e7bd37 --- /dev/null +++ b/sys/miscfs/fdesc/fdesc.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved.
+ * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)fdesc.h 8.5 (Berkeley) 1/21/94 + * + * $Id: fdesc.h,v 1.8 1993/04/06 15:28:33 jsp Exp $ + */ + +#ifdef KERNEL +struct fdescmount { + struct vnode *f_root; /* Root node */ +}; + +#define FD_ROOT 2 +#define FD_DEVFD 3 +#define FD_STDIN 4 +#define FD_STDOUT 5 +#define FD_STDERR 6 +#define FD_CTTY 7 +#define FD_DESC 8 +#define FD_MAX 12 + +typedef enum { + Froot, + Fdevfd, + Fdesc, + Flink, + Fctty +} fdntype; + +struct fdescnode { + struct fdescnode *fd_forw; /* Hash chain */ + struct fdescnode *fd_back; + struct vnode *fd_vnode; /* Back ptr to vnode */ + fdntype fd_type; /* Type of this node */ + unsigned fd_fd; /* Fd to be dup'ed */ + char *fd_link; /* Link to fd/n */ + int fd_ix; /* filesystem index */ +}; + +#define VFSTOFDESC(mp) ((struct fdescmount *)((mp)->mnt_data)) +#define VTOFDESC(vp) ((struct fdescnode *)(vp)->v_data) + +extern dev_t devctty; +extern int fdesc_init __P((void)); +extern int fdesc_root __P((struct mount *, struct vnode **)); +extern int fdesc_allocvp __P((fdntype, int, struct mount *, struct vnode **)); +extern int (**fdesc_vnodeop_p)(); +extern struct vfsops fdesc_vfsops; +#endif /* KERNEL */ diff --git a/sys/miscfs/fdesc/fdesc_vfsops.c b/sys/miscfs/fdesc/fdesc_vfsops.c new file mode 100644 index 00000000000..80c543da655 --- /dev/null +++ b/sys/miscfs/fdesc/fdesc_vfsops.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)fdesc_vfsops.c 8.4 (Berkeley) 1/21/94 + * + * $Id: fdesc_vfsops.c,v 1.9 1993/04/06 15:28:33 jsp Exp $ + */ + +/* + * /dev/fd Filesystem + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Mount the per-process file descriptors (/dev/fd) + */ +int +fdesc_mount(mp, path, data, ndp, p) + struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + int error = 0; + u_int size; + struct fdescmount *fmp; + struct vnode *rvp; + + /* + * Updating an existing mount is not supported + */ + if (mp->mnt_flag & MNT_UPDATE) + return (EOPNOTSUPP); + + error = fdesc_allocvp(Froot, FD_ROOT, mp, &rvp); + if (error) + return (error); + + MALLOC(fmp, struct fdescmount *, sizeof(struct fdescmount), + M_UFSMNT, M_WAITOK); /* XXX */ + rvp->v_type = VDIR; + rvp->v_flag |= VROOT; + fmp->f_root = rvp; + /* XXX -- don't mark as local to work around fts() problems */ + /*mp->mnt_flag |= MNT_LOCAL;*/ + mp->mnt_data = (qaddr_t) fmp; + getnewfsid(mp, MOUNT_FDESC); + + (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); + bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); + bzero(mp->mnt_stat.f_mntfromname, MNAMELEN); + bcopy("fdesc", mp->mnt_stat.f_mntfromname, sizeof("fdesc")); + return (0); +} + +int +fdesc_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + return (0); +} + +int +fdesc_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + int error; + int flags = 0; + extern int doforce; + struct vnode *rootvp = VFSTOFDESC(mp)->f_root; + + if (mntflags & MNT_FORCE) { + /* fdesc can never be rootfs so don't check for it */ + if (!doforce) + return (EINVAL); + flags |= FORCECLOSE; + } + + /* + * Clear out buffer cache. I don't think we + * ever get anything cached at this level at the + * moment, but who knows...
+ */ + if (rootvp->v_usecount > 1) + return (EBUSY); + if (error = vflush(mp, rootvp, flags)) + return (error); + + /* + * Release reference on underlying root vnode + */ + vrele(rootvp); + /* + * And blow it away for future re-use + */ + vgone(rootvp); + /* + * Finally, throw away the fdescmount structure + */ + free(mp->mnt_data, M_UFSMNT); /* XXX */ + mp->mnt_data = 0; + + return (0); +} + +int +fdesc_root(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + struct vnode *vp; + + /* + * Return locked reference to root. + */ + vp = VFSTOFDESC(mp)->f_root; + VREF(vp); + VOP_LOCK(vp); + *vpp = vp; + return (0); +} + +int +fdesc_quotactl(mp, cmd, uid, arg, p) + struct mount *mp; + int cmd; + uid_t uid; + caddr_t arg; + struct proc *p; +{ + + return (EOPNOTSUPP); +} + +int +fdesc_statfs(mp, sbp, p) + struct mount *mp; + struct statfs *sbp; + struct proc *p; +{ + struct filedesc *fdp; + int lim; + int i; + int last; + int freefd; + + /* + * Compute number of free file descriptors. + * [ Strange results will ensue if the open file + * limit is ever reduced below the current number + * of open files... ] + */ + lim = p->p_rlimit[RLIMIT_NOFILE].rlim_cur; + fdp = p->p_fd; + last = min(fdp->fd_nfiles, lim); + freefd = 0; + for (i = fdp->fd_freefile; i < last; i++) + if (fdp->fd_ofiles[i] == NULL) + freefd++; + + /* + * Adjust for the fact that the fdesc array may not + * have been fully allocated yet. + */ + if (fdp->fd_nfiles < lim) + freefd += (lim - fdp->fd_nfiles); + + sbp->f_type = MOUNT_FDESC; + sbp->f_flags = 0; + sbp->f_bsize = DEV_BSIZE; + sbp->f_iosize = DEV_BSIZE; + sbp->f_blocks = 2; /* 1K to keep df happy */ + sbp->f_bfree = 0; + sbp->f_bavail = 0; + sbp->f_files = lim + 1; /* Allow for "." 
*/ + sbp->f_ffree = freefd; /* See comments above */ + if (sbp != &mp->mnt_stat) { + bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); + bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); + bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); + } + return (0); +} + +int +fdesc_sync(mp, waitfor) + struct mount *mp; + int waitfor; +{ + + return (0); +} + +/* + * Fdesc flat namespace lookup. + * Currently unsupported. + */ +int +fdesc_vget(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + + return (EOPNOTSUPP); +} + +int +fdesc_fhtovp(mp, fhp, setgen, vpp) + struct mount *mp; + struct fid *fhp; + int setgen; + struct vnode **vpp; +{ + return (EOPNOTSUPP); +} + +int +fdesc_vptofh(vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + + return (EOPNOTSUPP); +} + +struct vfsops fdesc_vfsops = { + fdesc_mount, + fdesc_start, + fdesc_unmount, + fdesc_root, + fdesc_quotactl, + fdesc_statfs, + fdesc_sync, + fdesc_vget, + fdesc_fhtovp, + fdesc_vptofh, + fdesc_init, +}; diff --git a/sys/miscfs/fdesc/fdesc_vnops.c b/sys/miscfs/fdesc/fdesc_vnops.c new file mode 100644 index 00000000000..00d8675aea2 --- /dev/null +++ b/sys/miscfs/fdesc/fdesc_vnops.c @@ -0,0 +1,974 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)fdesc_vnops.c 8.9 (Berkeley) 1/21/94 + * + * $Id: fdesc_vnops.c,v 1.12 1993/04/06 16:17:17 jsp Exp $ + */ + +/* + * /dev/fd Filesystem + */ + +#include +#include +#include +#include +#include +#include /* boottime */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define cttyvp(p) ((p)->p_flag & P_CONTROLT ? 
(p)->p_session->s_ttyvp : NULL) + +#define FDL_WANT 0x01 +#define FDL_LOCKED 0x02 +static int fdcache_lock; + +dev_t devctty; + +#if (FD_STDIN != FD_STDOUT-1) || (FD_STDOUT != FD_STDERR-1) +FD_STDIN, FD_STDOUT, FD_STDERR must be a sequence n, n+1, n+2 +#endif + +#define NFDCACHE 3 +#define FD_NHASH(ix) ((ix) & NFDCACHE) + +/* + * Cache head + */ +struct fdcache { + struct fdescnode *fc_forw; + struct fdescnode *fc_back; +}; + +static struct fdcache fdcache[NFDCACHE]; + +/* + * Initialise cache headers + */ +fdesc_init() +{ + struct fdcache *fc; + + devctty = makedev(nchrdev, 0); + + for (fc = fdcache; fc < fdcache + NFDCACHE; fc++) + fc->fc_forw = fc->fc_back = (struct fdescnode *) fc; +} + +/* + * Compute hash list for given target vnode + */ +static struct fdcache * +fdesc_hash(ix) + int ix; +{ + + return (&fdcache[FD_NHASH(ix)]); +} + +int +fdesc_allocvp(ftype, ix, mp, vpp) + fdntype ftype; + int ix; + struct mount *mp; + struct vnode **vpp; +{ + struct fdcache *fc; + struct fdescnode *fd; + int error = 0; + +loop: + fc = fdesc_hash(ix); + for (fd = fc->fc_forw; fd != (struct fdescnode *) fc; fd = fd->fd_forw) { + if (fd->fd_ix == ix && fd->fd_vnode->v_mount == mp) { + if (vget(fd->fd_vnode, 0)) + goto loop; + *vpp = fd->fd_vnode; + return (error); + } + } + + /* + * otherwise lock the array while we call getnewvnode + * since that can block. 
+ */ + if (fdcache_lock & FDL_LOCKED) { + fdcache_lock |= FDL_WANT; + sleep((caddr_t) &fdcache_lock, PINOD); + goto loop; + } + fdcache_lock |= FDL_LOCKED; + + error = getnewvnode(VT_FDESC, mp, fdesc_vnodeop_p, vpp); + if (error) + goto out; + MALLOC(fd, void *, sizeof(struct fdescnode), M_TEMP, M_WAITOK); + (*vpp)->v_data = fd; + fd->fd_vnode = *vpp; + fd->fd_type = ftype; + fd->fd_fd = -1; + fd->fd_link = 0; + fd->fd_ix = ix; + fc = fdesc_hash(ix); + insque(fd, fc); + +out:; + fdcache_lock &= ~FDL_LOCKED; + + if (fdcache_lock & FDL_WANT) { + fdcache_lock &= ~FDL_WANT; + wakeup((caddr_t) &fdcache_lock); + } + + return (error); +} + +/* + * vp is the current namei directory + * ndp is the name to locate in that directory... + */ +int +fdesc_lookup(ap) + struct vop_lookup_args /* { + struct vnode * a_dvp; + struct vnode ** a_vpp; + struct componentname * a_cnp; + } */ *ap; +{ + struct vnode **vpp = ap->a_vpp; + struct vnode *dvp = ap->a_dvp; + char *pname; + struct proc *p; + int nfiles; + unsigned fd; + int error; + struct vnode *fvp; + char *ln; + + pname = ap->a_cnp->cn_nameptr; + if (ap->a_cnp->cn_namelen == 1 && *pname == '.') { + *vpp = dvp; + VREF(dvp); + VOP_LOCK(dvp); + return (0); + } + + p = ap->a_cnp->cn_proc; + nfiles = p->p_fd->fd_nfiles; + + switch (VTOFDESC(dvp)->fd_type) { + default: + case Flink: + case Fdesc: + case Fctty: + error = ENOTDIR; + goto bad; + + case Froot: + if (ap->a_cnp->cn_namelen == 2 && bcmp(pname, "fd", 2) == 0) { + error = fdesc_allocvp(Fdevfd, FD_DEVFD, dvp->v_mount, &fvp); + if (error) + goto bad; + *vpp = fvp; + fvp->v_type = VDIR; + VOP_LOCK(fvp); + return (0); + } + + if (ap->a_cnp->cn_namelen == 3 && bcmp(pname, "tty", 3) == 0) { + struct vnode *ttyvp = cttyvp(p); + if (ttyvp == NULL) { + error = ENXIO; + goto bad; + } + error = fdesc_allocvp(Fctty, FD_CTTY, dvp->v_mount, &fvp); + if (error) + goto bad; + *vpp = fvp; + fvp->v_type = VFIFO; + VOP_LOCK(fvp); + return (0); + } + + ln = 0; + switch (ap->a_cnp->cn_namelen) { + 
case 5: + if (bcmp(pname, "stdin", 5) == 0) { + ln = "fd/0"; + fd = FD_STDIN; + } + break; + case 6: + if (bcmp(pname, "stdout", 6) == 0) { + ln = "fd/1"; + fd = FD_STDOUT; + } else + if (bcmp(pname, "stderr", 6) == 0) { + ln = "fd/2"; + fd = FD_STDERR; + } + break; + } + + if (ln) { + error = fdesc_allocvp(Flink, fd, dvp->v_mount, &fvp); + if (error) + goto bad; + VTOFDESC(fvp)->fd_link = ln; + *vpp = fvp; + fvp->v_type = VLNK; + VOP_LOCK(fvp); + return (0); + } else { + error = ENOENT; + goto bad; + } + + /* NOTREACHED: both branches above leave the switch */ + + case Fdevfd: + if (ap->a_cnp->cn_namelen == 2 && bcmp(pname, "..", 2) == 0) { + error = fdesc_root(dvp->v_mount, vpp); + return (error); + } + + /* + * Parse the name as a decimal descriptor number, stopping + * early once the value can no longer be a valid index. + */ + fd = 0; + while (*pname >= '0' && *pname <= '9') { + fd = 10 * fd + *pname++ - '0'; + if (fd >= nfiles) + break; + } + + /* Anything left over means the name was not purely numeric */ + if (*pname != '\0') { + error = ENOENT; + goto bad; + } + + if (fd >= nfiles || p->p_fd->fd_ofiles[fd] == NULL) { + error = EBADF; + goto bad; + } + + error = fdesc_allocvp(Fdesc, FD_DESC+fd, dvp->v_mount, &fvp); + if (error) + goto bad; + VTOFDESC(fvp)->fd_fd = fd; + *vpp = fvp; + return (0); + } + +bad:; + *vpp = NULL; + return (error); +} + +int +fdesc_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + int error = 0; + + switch (VTOFDESC(vp)->fd_type) { + case Fdesc: + /* + * XXX Kludge: set p->p_dupfd to contain the value of the + * the file descriptor being sought for duplication. The error + * return ensures that the vnode for this device will be + * released by vn_open. Open will detect this special error and + * take the actions in dupfdopen. Other callers of vn_open or + * VOP_OPEN will simply report the error. 
+ */ + ap->a_p->p_dupfd = VTOFDESC(vp)->fd_fd; /* XXX */ + error = ENODEV; + break; + + case Fctty: + error = cttyopen(devctty, ap->a_mode, 0, ap->a_p); + break; + } + + return (error); +} + +static int +fdesc_attr(fd, vap, cred, p) + int fd; + struct vattr *vap; + struct ucred *cred; + struct proc *p; +{ + struct filedesc *fdp = p->p_fd; + struct file *fp; + struct stat stb; + int error; + + if (fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) + return (EBADF); + + switch (fp->f_type) { + case DTYPE_VNODE: + error = VOP_GETATTR((struct vnode *) fp->f_data, vap, cred, p); + if (error == 0 && vap->va_type == VDIR) { + /* + * don't allow directories to show up because + * that causes loops in the namespace. + */ + vap->va_type = VFIFO; + } + break; + + case DTYPE_SOCKET: + error = soo_stat((struct socket *)fp->f_data, &stb); + if (error == 0) { + vattr_null(vap); + vap->va_type = VSOCK; + vap->va_mode = stb.st_mode; + vap->va_nlink = stb.st_nlink; + vap->va_uid = stb.st_uid; + vap->va_gid = stb.st_gid; + vap->va_fsid = stb.st_dev; + vap->va_fileid = stb.st_ino; + vap->va_size = stb.st_size; + vap->va_blocksize = stb.st_blksize; + vap->va_atime = stb.st_atimespec; + vap->va_mtime = stb.st_mtimespec; + vap->va_ctime = stb.st_ctimespec; + vap->va_gen = stb.st_gen; + vap->va_flags = stb.st_flags; + vap->va_rdev = stb.st_rdev; + vap->va_bytes = stb.st_blocks * stb.st_blksize; + } + break; + + default: + panic("fdesc attr"); + break; + } + + return (error); +} + +int +fdesc_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct vattr *vap = ap->a_vap; + unsigned fd; + int error = 0; + + switch (VTOFDESC(vp)->fd_type) { + case Froot: + case Fdevfd: + case Flink: + case Fctty: + bzero((caddr_t) vap, sizeof(*vap)); + vattr_null(vap); + vap->va_fileid = VTOFDESC(vp)->fd_ix; + + switch (VTOFDESC(vp)->fd_type) { + case Flink: + 
vap->va_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH; + vap->va_type = VLNK; + vap->va_nlink = 1; + vap->va_size = strlen(VTOFDESC(vp)->fd_link); + break; + + case Fctty: + vap->va_mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH; + vap->va_type = VFIFO; + vap->va_nlink = 1; + vap->va_size = 0; + break; + + default: + vap->va_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH; + vap->va_type = VDIR; + vap->va_nlink = 2; + vap->va_size = DEV_BSIZE; + break; + } + vap->va_uid = 0; + vap->va_gid = 0; + vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; + vap->va_blocksize = DEV_BSIZE; + vap->va_atime.ts_sec = boottime.tv_sec; + vap->va_atime.ts_nsec = 0; + vap->va_mtime = vap->va_atime; + vap->va_ctime = vap->va_mtime; + vap->va_gen = 0; + vap->va_flags = 0; + vap->va_rdev = 0; + vap->va_bytes = 0; + break; + + case Fdesc: + fd = VTOFDESC(vp)->fd_fd; + error = fdesc_attr(fd, vap, ap->a_cred, ap->a_p); + break; + + default: + panic("fdesc_getattr"); + break; + } + + if (error == 0) + vp->v_type = vap->va_type; + + return (error); +} + +int +fdesc_setattr(ap) + struct vop_setattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct filedesc *fdp = ap->a_p->p_fd; + struct file *fp; + unsigned fd; + int error; + + /* + * Can't mess with the root vnode + */ + switch (VTOFDESC(ap->a_vp)->fd_type) { + case Fdesc: + break; + + case Fctty: + return (0); + + default: + return (EACCES); + } + + fd = VTOFDESC(ap->a_vp)->fd_fd; + if (fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) { + return (EBADF); + } + + /* + * Can setattr the underlying vnode, but not sockets! 
+ */ + switch (fp->f_type) { + case DTYPE_VNODE: + error = VOP_SETATTR((struct vnode *) fp->f_data, ap->a_vap, ap->a_cred, ap->a_p); + break; + + case DTYPE_SOCKET: + error = 0; + break; + + default: + panic("fdesc setattr"); + break; + } + + return (error); +} + +#define UIO_MX 16 + +static struct dirtmp { + u_long d_fileno; + u_short d_reclen; + u_short d_namlen; + char d_name[8]; +} rootent[] = { + { FD_DEVFD, UIO_MX, 2, "fd" }, + { FD_STDIN, UIO_MX, 5, "stdin" }, + { FD_STDOUT, UIO_MX, 6, "stdout" }, + { FD_STDERR, UIO_MX, 6, "stderr" }, + { FD_CTTY, UIO_MX, 3, "tty" }, + { 0 } +}; + +int +fdesc_readdir(ap) + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + struct uio *uio = ap->a_uio; + struct filedesc *fdp; + int i; + int error; + + switch (VTOFDESC(ap->a_vp)->fd_type) { + case Fctty: + return (0); + + case Fdesc: + return (ENOTDIR); + + default: + break; + } + + fdp = uio->uio_procp->p_fd; + + if (VTOFDESC(ap->a_vp)->fd_type == Froot) { + struct dirent d; + struct dirent *dp = &d; + struct dirtmp *dt; + + i = uio->uio_offset / UIO_MX; + error = 0; + + while (uio->uio_resid > 0) { + dt = &rootent[i]; + if (dt->d_fileno == 0) { + /**eofflagp = 1;*/ + break; + } + i++; + + switch (dt->d_fileno) { + case FD_CTTY: + if (cttyvp(uio->uio_procp) == NULL) + continue; + break; + + case FD_STDIN: + case FD_STDOUT: + case FD_STDERR: + if ((dt->d_fileno-FD_STDIN) >= fdp->fd_nfiles) + continue; + if (fdp->fd_ofiles[dt->d_fileno-FD_STDIN] == NULL) + continue; + break; + } + bzero((caddr_t) dp, UIO_MX); + dp->d_fileno = dt->d_fileno; + dp->d_namlen = dt->d_namlen; + dp->d_type = DT_UNKNOWN; + dp->d_reclen = dt->d_reclen; + bcopy(dt->d_name, dp->d_name, dp->d_namlen+1); + error = uiomove((caddr_t) dp, UIO_MX, uio); + if (error) + break; + } + uio->uio_offset = i * UIO_MX; + return (error); + } + + i = uio->uio_offset / UIO_MX; + error = 0; + while (uio->uio_resid > 0) { + if (i >= fdp->fd_nfiles) + break; + + if 
(fdp->fd_ofiles[i] != NULL) { + struct dirent d; + struct dirent *dp = &d; + + bzero((caddr_t) dp, UIO_MX); + + dp->d_namlen = sprintf(dp->d_name, "%d", i); + dp->d_reclen = UIO_MX; + dp->d_type = DT_UNKNOWN; + dp->d_fileno = i + FD_STDIN; + /* + * And ship to userland + */ + error = uiomove((caddr_t) dp, UIO_MX, uio); + if (error) + break; + } + i++; + } + + uio->uio_offset = i * UIO_MX; + return (error); +} + +int +fdesc_readlink(ap) + struct vop_readlink_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + int error; + + if (vp->v_type != VLNK) + return (EPERM); + + if (VTOFDESC(vp)->fd_type == Flink) { + char *ln = VTOFDESC(vp)->fd_link; + error = uiomove(ln, strlen(ln), ap->a_uio); + } else { + error = EOPNOTSUPP; + } + + return (error); +} + +int +fdesc_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + int error = EOPNOTSUPP; + + switch (VTOFDESC(ap->a_vp)->fd_type) { + case Fctty: + error = cttyread(devctty, ap->a_uio, ap->a_ioflag); + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + +int +fdesc_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + int error = EOPNOTSUPP; + + switch (VTOFDESC(ap->a_vp)->fd_type) { + case Fctty: + error = cttywrite(devctty, ap->a_uio, ap->a_ioflag); + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + +int +fdesc_ioctl(ap) + struct vop_ioctl_args /* { + struct vnode *a_vp; + int a_command; + caddr_t a_data; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + int error = EOPNOTSUPP; + + switch (VTOFDESC(ap->a_vp)->fd_type) { + case Fctty: + error = cttyioctl(devctty, ap->a_command, ap->a_data, + ap->a_fflag, ap->a_p); + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + +int 
+fdesc_select(ap) + struct vop_select_args /* { + struct vnode *a_vp; + int a_which; + int a_fflags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + int error = EOPNOTSUPP; + + switch (VTOFDESC(ap->a_vp)->fd_type) { + case Fctty: + error = cttyselect(devctty, ap->a_fflags, ap->a_p); + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + +int +fdesc_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + /* + * Clear out the v_type field to avoid + * nasty things happening in vgone(). + */ + vp->v_type = VNON; + return (0); +} + +int +fdesc_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + remque(VTOFDESC(vp)); + FREE(vp->v_data, M_TEMP); + vp->v_data = 0; + + return (0); +} + +/* + * Return POSIX pathconf information applicable to special devices. + */ +fdesc_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = LINK_MAX; + return (0); + case _PC_MAX_CANON: + *ap->a_retval = MAX_CANON; + return (0); + case _PC_MAX_INPUT: + *ap->a_retval = MAX_INPUT; + return (0); + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + return (0); + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + return (0); + case _PC_VDISABLE: + *ap->a_retval = _POSIX_VDISABLE; + return (0); + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Print out the contents of a /dev/fd vnode. 
+ */ +/* ARGSUSED */ +int +fdesc_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + printf("tag VT_NON, fdesc vnode\n"); + return (0); +} + +/*void*/ +int +fdesc_vfree(ap) + struct vop_vfree_args /* { + struct vnode *a_pvp; + ino_t a_ino; + int a_mode; + } */ *ap; +{ + + return (0); +} + +/* + * /dev/fd vnode unsupported operation + */ +int +fdesc_enotsupp() +{ + + return (EOPNOTSUPP); +} + +/* + * /dev/fd "should never get here" operation + */ +int +fdesc_badop() +{ + + panic("fdesc: bad op"); + /* NOTREACHED */ +} + +/* + * /dev/fd vnode null operation + */ +int +fdesc_nullop() +{ + + return (0); +} + +#define fdesc_create ((int (*) __P((struct vop_create_args *)))fdesc_enotsupp) +#define fdesc_mknod ((int (*) __P((struct vop_mknod_args *)))fdesc_enotsupp) +#define fdesc_close ((int (*) __P((struct vop_close_args *)))nullop) +#define fdesc_access ((int (*) __P((struct vop_access_args *)))nullop) +#define fdesc_mmap ((int (*) __P((struct vop_mmap_args *)))fdesc_enotsupp) +#define fdesc_fsync ((int (*) __P((struct vop_fsync_args *)))nullop) +#define fdesc_seek ((int (*) __P((struct vop_seek_args *)))nullop) +#define fdesc_remove ((int (*) __P((struct vop_remove_args *)))fdesc_enotsupp) +#define fdesc_link ((int (*) __P((struct vop_link_args *)))fdesc_enotsupp) +#define fdesc_rename ((int (*) __P((struct vop_rename_args *)))fdesc_enotsupp) +#define fdesc_mkdir ((int (*) __P((struct vop_mkdir_args *)))fdesc_enotsupp) +#define fdesc_rmdir ((int (*) __P((struct vop_rmdir_args *)))fdesc_enotsupp) +#define fdesc_symlink ((int (*) __P((struct vop_symlink_args *)))fdesc_enotsupp) +#define fdesc_abortop ((int (*) __P((struct vop_abortop_args *)))nullop) +#define fdesc_lock ((int (*) __P((struct vop_lock_args *)))nullop) +#define fdesc_unlock ((int (*) __P((struct vop_unlock_args *)))nullop) +#define fdesc_bmap ((int (*) __P((struct vop_bmap_args *)))fdesc_badop) +#define fdesc_strategy ((int (*) __P((struct vop_strategy_args *)))fdesc_badop) 
+#define fdesc_islocked ((int (*) __P((struct vop_islocked_args *)))nullop) +#define fdesc_advlock ((int (*) __P((struct vop_advlock_args *)))fdesc_enotsupp) +#define fdesc_blkatoff \ + ((int (*) __P((struct vop_blkatoff_args *)))fdesc_enotsupp) +#define fdesc_vget ((int (*) __P((struct vop_vget_args *)))fdesc_enotsupp) +#define fdesc_valloc ((int(*) __P(( \ + struct vnode *pvp, \ + int mode, \ + struct ucred *cred, \ + struct vnode **vpp))) fdesc_enotsupp) +#define fdesc_truncate \ + ((int (*) __P((struct vop_truncate_args *)))fdesc_enotsupp) +#define fdesc_update ((int (*) __P((struct vop_update_args *)))fdesc_enotsupp) +#define fdesc_bwrite ((int (*) __P((struct vop_bwrite_args *)))fdesc_enotsupp) + +int (**fdesc_vnodeop_p)(); +struct vnodeopv_entry_desc fdesc_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, fdesc_lookup }, /* lookup */ + { &vop_create_desc, fdesc_create }, /* create */ + { &vop_mknod_desc, fdesc_mknod }, /* mknod */ + { &vop_open_desc, fdesc_open }, /* open */ + { &vop_close_desc, fdesc_close }, /* close */ + { &vop_access_desc, fdesc_access }, /* access */ + { &vop_getattr_desc, fdesc_getattr }, /* getattr */ + { &vop_setattr_desc, fdesc_setattr }, /* setattr */ + { &vop_read_desc, fdesc_read }, /* read */ + { &vop_write_desc, fdesc_write }, /* write */ + { &vop_ioctl_desc, fdesc_ioctl }, /* ioctl */ + { &vop_select_desc, fdesc_select }, /* select */ + { &vop_mmap_desc, fdesc_mmap }, /* mmap */ + { &vop_fsync_desc, fdesc_fsync }, /* fsync */ + { &vop_seek_desc, fdesc_seek }, /* seek */ + { &vop_remove_desc, fdesc_remove }, /* remove */ + { &vop_link_desc, fdesc_link }, /* link */ + { &vop_rename_desc, fdesc_rename }, /* rename */ + { &vop_mkdir_desc, fdesc_mkdir }, /* mkdir */ + { &vop_rmdir_desc, fdesc_rmdir }, /* rmdir */ + { &vop_symlink_desc, fdesc_symlink }, /* symlink */ + { &vop_readdir_desc, fdesc_readdir }, /* readdir */ + { &vop_readlink_desc, fdesc_readlink }, /* readlink */ + { 
&vop_abortop_desc, fdesc_abortop }, /* abortop */ + { &vop_inactive_desc, fdesc_inactive }, /* inactive */ + { &vop_reclaim_desc, fdesc_reclaim }, /* reclaim */ + { &vop_lock_desc, fdesc_lock }, /* lock */ + { &vop_unlock_desc, fdesc_unlock }, /* unlock */ + { &vop_bmap_desc, fdesc_bmap }, /* bmap */ + { &vop_strategy_desc, fdesc_strategy }, /* strategy */ + { &vop_print_desc, fdesc_print }, /* print */ + { &vop_islocked_desc, fdesc_islocked }, /* islocked */ + { &vop_pathconf_desc, fdesc_pathconf }, /* pathconf */ + { &vop_advlock_desc, fdesc_advlock }, /* advlock */ + { &vop_blkatoff_desc, fdesc_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, fdesc_valloc }, /* valloc */ + { &vop_vfree_desc, fdesc_vfree }, /* vfree */ + { &vop_truncate_desc, fdesc_truncate }, /* truncate */ + { &vop_update_desc, fdesc_update }, /* update */ + { &vop_bwrite_desc, fdesc_bwrite }, /* bwrite */ + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc fdesc_vnodeop_opv_desc = + { &fdesc_vnodeop_p, fdesc_vnodeop_entries }; diff --git a/sys/miscfs/fifofs/fifo.h b/sys/miscfs/fifofs/fifo.h new file mode 100644 index 00000000000..e89186d8b89 --- /dev/null +++ b/sys/miscfs/fifofs/fifo.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)fifo.h 8.2 (Berkeley) 2/2/94 + */ + +#ifdef FIFO +/* + * Prototypes for fifo operations on vnodes. 
+ */ +int fifo_badop(), + fifo_ebadf(); + +int fifo_lookup __P((struct vop_lookup_args *)); +#define fifo_create ((int (*) __P((struct vop_create_args *)))fifo_badop) +#define fifo_mknod ((int (*) __P((struct vop_mknod_args *)))fifo_badop) +int fifo_open __P((struct vop_open_args *)); +int fifo_close __P((struct vop_close_args *)); +#define fifo_access ((int (*) __P((struct vop_access_args *)))fifo_ebadf) +#define fifo_getattr ((int (*) __P((struct vop_getattr_args *)))fifo_ebadf) +#define fifo_setattr ((int (*) __P((struct vop_setattr_args *)))fifo_ebadf) +int fifo_read __P((struct vop_read_args *)); +int fifo_write __P((struct vop_write_args *)); +int fifo_ioctl __P((struct vop_ioctl_args *)); +int fifo_select __P((struct vop_select_args *)); +#define fifo_mmap ((int (*) __P((struct vop_mmap_args *)))fifo_badop) +#define fifo_fsync ((int (*) __P((struct vop_fsync_args *)))nullop) +#define fifo_seek ((int (*) __P((struct vop_seek_args *)))fifo_badop) +#define fifo_remove ((int (*) __P((struct vop_remove_args *)))fifo_badop) +#define fifo_link ((int (*) __P((struct vop_link_args *)))fifo_badop) +#define fifo_rename ((int (*) __P((struct vop_rename_args *)))fifo_badop) +#define fifo_mkdir ((int (*) __P((struct vop_mkdir_args *)))fifo_badop) +#define fifo_rmdir ((int (*) __P((struct vop_rmdir_args *)))fifo_badop) +#define fifo_symlink ((int (*) __P((struct vop_symlink_args *)))fifo_badop) +#define fifo_readdir ((int (*) __P((struct vop_readdir_args *)))fifo_badop) +#define fifo_readlink ((int (*) __P((struct vop_readlink_args *)))fifo_badop) +#define fifo_abortop ((int (*) __P((struct vop_abortop_args *)))fifo_badop) +#define fifo_inactive ((int (*) __P((struct vop_inactive_args *)))nullop) +#define fifo_reclaim ((int (*) __P((struct vop_reclaim_args *)))nullop) +int fifo_lock __P((struct vop_lock_args *)); +int fifo_unlock __P((struct vop_unlock_args *)); +int fifo_bmap __P((struct vop_bmap_args *)); +#define fifo_strategy ((int (*) __P((struct vop_strategy_args 
*)))fifo_badop) +int fifo_print __P((struct vop_print_args *)); +#define fifo_islocked ((int (*) __P((struct vop_islocked_args *)))nullop) +int fifo_pathconf __P((struct vop_pathconf_args *)); +int fifo_advlock __P((struct vop_advlock_args *)); +#define fifo_blkatoff ((int (*) __P((struct vop_blkatoff_args *)))fifo_badop) +#define fifo_valloc ((int (*) __P((struct vop_valloc_args *)))fifo_badop) +#define fifo_reallocblks \ + ((int (*) __P((struct vop_reallocblks_args *)))fifo_badop) +#define fifo_vfree ((int (*) __P((struct vop_vfree_args *)))fifo_badop) +#define fifo_truncate ((int (*) __P((struct vop_truncate_args *)))nullop) +#define fifo_update ((int (*) __P((struct vop_update_args *)))nullop) +#define fifo_bwrite ((int (*) __P((struct vop_bwrite_args *)))nullop) +#endif /* FIFO */ diff --git a/sys/miscfs/fifofs/fifo_vnops.c b/sys/miscfs/fifofs/fifo_vnops.c new file mode 100644 index 00000000000..bad33a430b6 --- /dev/null +++ b/sys/miscfs/fifofs/fifo_vnops.c @@ -0,0 +1,494 @@ +/* + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)fifo_vnops.c 8.2 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This structure is associated with the FIFO vnode and stores + * the state associated with the FIFO. 
+ */ +struct fifoinfo { + struct socket *fi_readsock; + struct socket *fi_writesock; + long fi_readers; + long fi_writers; +}; + +int (**fifo_vnodeop_p)(); +struct vnodeopv_entry_desc fifo_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, fifo_lookup }, /* lookup */ + { &vop_create_desc, fifo_create }, /* create */ + { &vop_mknod_desc, fifo_mknod }, /* mknod */ + { &vop_open_desc, fifo_open }, /* open */ + { &vop_close_desc, fifo_close }, /* close */ + { &vop_access_desc, fifo_access }, /* access */ + { &vop_getattr_desc, fifo_getattr }, /* getattr */ + { &vop_setattr_desc, fifo_setattr }, /* setattr */ + { &vop_read_desc, fifo_read }, /* read */ + { &vop_write_desc, fifo_write }, /* write */ + { &vop_ioctl_desc, fifo_ioctl }, /* ioctl */ + { &vop_select_desc, fifo_select }, /* select */ + { &vop_mmap_desc, fifo_mmap }, /* mmap */ + { &vop_fsync_desc, fifo_fsync }, /* fsync */ + { &vop_seek_desc, fifo_seek }, /* seek */ + { &vop_remove_desc, fifo_remove }, /* remove */ + { &vop_link_desc, fifo_link }, /* link */ + { &vop_rename_desc, fifo_rename }, /* rename */ + { &vop_mkdir_desc, fifo_mkdir }, /* mkdir */ + { &vop_rmdir_desc, fifo_rmdir }, /* rmdir */ + { &vop_symlink_desc, fifo_symlink }, /* symlink */ + { &vop_readdir_desc, fifo_readdir }, /* readdir */ + { &vop_readlink_desc, fifo_readlink }, /* readlink */ + { &vop_abortop_desc, fifo_abortop }, /* abortop */ + { &vop_inactive_desc, fifo_inactive }, /* inactive */ + { &vop_reclaim_desc, fifo_reclaim }, /* reclaim */ + { &vop_lock_desc, fifo_lock }, /* lock */ + { &vop_unlock_desc, fifo_unlock }, /* unlock */ + { &vop_bmap_desc, fifo_bmap }, /* bmap */ + { &vop_strategy_desc, fifo_strategy }, /* strategy */ + { &vop_print_desc, fifo_print }, /* print */ + { &vop_islocked_desc, fifo_islocked }, /* islocked */ + { &vop_pathconf_desc, fifo_pathconf }, /* pathconf */ + { &vop_advlock_desc, fifo_advlock }, /* advlock */ + { &vop_blkatoff_desc, fifo_blkatoff }, /* blkatoff */ + { 
&vop_valloc_desc, fifo_valloc }, /* valloc */ + { &vop_vfree_desc, fifo_vfree }, /* vfree */ + { &vop_truncate_desc, fifo_truncate }, /* truncate */ + { &vop_update_desc, fifo_update }, /* update */ + { &vop_bwrite_desc, fifo_bwrite }, /* bwrite */ + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc fifo_vnodeop_opv_desc = + { &fifo_vnodeop_p, fifo_vnodeop_entries }; + +/* + * Trivial lookup routine that always fails. + */ +/* ARGSUSED */ +fifo_lookup(ap) + struct vop_lookup_args /* { + struct vnode * a_dvp; + struct vnode ** a_vpp; + struct componentname * a_cnp; + } */ *ap; +{ + + *ap->a_vpp = NULL; + return (ENOTDIR); +} + +/* + * Open called to set up a new instance of a fifo or + * to find an active instance of a fifo. + */ +/* ARGSUSED */ +fifo_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct fifoinfo *fip; + struct socket *rso, *wso; + int error; + static char openstr[] = "fifo"; + + if ((ap->a_mode & (FREAD|FWRITE)) == (FREAD|FWRITE)) + return (EINVAL); + if ((fip = vp->v_fifoinfo) == NULL) { + MALLOC(fip, struct fifoinfo *, sizeof(*fip), M_VNODE, M_WAITOK); + vp->v_fifoinfo = fip; + if (error = socreate(AF_UNIX, &rso, SOCK_STREAM, 0)) { + free(fip, M_VNODE); + vp->v_fifoinfo = NULL; + return (error); + } + fip->fi_readsock = rso; + if (error = socreate(AF_UNIX, &wso, SOCK_STREAM, 0)) { + (void)soclose(rso); + free(fip, M_VNODE); + vp->v_fifoinfo = NULL; + return (error); + } + fip->fi_writesock = wso; + if (error = unp_connect2(wso, rso)) { + (void)soclose(wso); + (void)soclose(rso); + free(fip, M_VNODE); + vp->v_fifoinfo = NULL; + return (error); + } + fip->fi_readers = fip->fi_writers = 0; + wso->so_state |= SS_CANTRCVMORE; + rso->so_state |= SS_CANTSENDMORE; + } + error = 0; + if (ap->a_mode & FREAD) { + fip->fi_readers++; + if (fip->fi_readers == 1) { + fip->fi_writesock->so_state &= 
~SS_CANTSENDMORE; + if (fip->fi_writers > 0) + wakeup((caddr_t)&fip->fi_writers); + } + if (ap->a_mode & O_NONBLOCK) + return (0); + while (fip->fi_writers == 0) { + VOP_UNLOCK(vp); + error = tsleep((caddr_t)&fip->fi_readers, + PCATCH | PSOCK, openstr, 0); + VOP_LOCK(vp); + if (error) + break; + } + } else { + fip->fi_writers++; + if (fip->fi_readers == 0 && (ap->a_mode & O_NONBLOCK)) { + error = ENXIO; + } else { + if (fip->fi_writers == 1) { + fip->fi_readsock->so_state &= ~SS_CANTRCVMORE; + if (fip->fi_readers > 0) + wakeup((caddr_t)&fip->fi_readers); + } + while (fip->fi_readers == 0) { + VOP_UNLOCK(vp); + error = tsleep((caddr_t)&fip->fi_writers, + PCATCH | PSOCK, openstr, 0); + VOP_LOCK(vp); + if (error) + break; + } + } + } + if (error) + VOP_CLOSE(vp, ap->a_mode, ap->a_cred, ap->a_p); + return (error); +} + +/* + * Vnode op for read + */ +/* ARGSUSED */ +fifo_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + register struct uio *uio = ap->a_uio; + register struct socket *rso = ap->a_vp->v_fifoinfo->fi_readsock; + int error, startresid; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_READ) + panic("fifo_read mode"); +#endif + if (uio->uio_resid == 0) + return (0); + if (ap->a_ioflag & IO_NDELAY) + rso->so_state |= SS_NBIO; + startresid = uio->uio_resid; + VOP_UNLOCK(ap->a_vp); + error = soreceive(rso, (struct mbuf **)0, uio, (int *)0, + (struct mbuf **)0, (struct mbuf **)0); + VOP_LOCK(ap->a_vp); + /* + * Clear EOF indication after first such return. 
+ */ + if (uio->uio_resid == startresid) + rso->so_state &= ~SS_CANTRCVMORE; + if (ap->a_ioflag & IO_NDELAY) + rso->so_state &= ~SS_NBIO; + return (error); +} + +/* + * Vnode op for write + */ +/* ARGSUSED */ +fifo_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + struct socket *wso = ap->a_vp->v_fifoinfo->fi_writesock; + int error; + +#ifdef DIAGNOSTIC + if (ap->a_uio->uio_rw != UIO_WRITE) + panic("fifo_write mode"); +#endif + if (ap->a_ioflag & IO_NDELAY) + wso->so_state |= SS_NBIO; + VOP_UNLOCK(ap->a_vp); + error = sosend(wso, (struct mbuf *)0, ap->a_uio, 0, (struct mbuf *)0, 0); + VOP_LOCK(ap->a_vp); + if (ap->a_ioflag & IO_NDELAY) + wso->so_state &= ~SS_NBIO; + return (error); +} + +/* + * Device ioctl operation. + */ +/* ARGSUSED */ +fifo_ioctl(ap) + struct vop_ioctl_args /* { + struct vnode *a_vp; + int a_command; + caddr_t a_data; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct file filetmp; + + if (ap->a_command == FIONBIO) + return (0); + if (ap->a_fflag & FREAD) + filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_readsock; + else + filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_writesock; + return (soo_ioctl(&filetmp, ap->a_command, ap->a_data, ap->a_p)); +} + +/* ARGSUSED */ +fifo_select(ap) + struct vop_select_args /* { + struct vnode *a_vp; + int a_which; + int a_fflags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct file filetmp; + + if (ap->a_fflags & FREAD) + filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_readsock; + else + filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_writesock; + return (soo_select(&filetmp, ap->a_which, ap->a_p)); +} + +/* + * This is a noop, simply returning what one has been given. 
+ */ +fifo_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + } */ *ap; +{ + + if (ap->a_vpp != NULL) + *ap->a_vpp = ap->a_vp; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn; + return (0); +} + +/* + * At the moment we do not do any locking. + */ +/* ARGSUSED */ +fifo_lock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + return (0); +} + +/* ARGSUSED */ +fifo_unlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + return (0); +} + +/* + * Device close routine + */ +/* ARGSUSED */ +fifo_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct fifoinfo *fip = vp->v_fifoinfo; + int error1, error2; + + if (ap->a_fflag & FWRITE) { + fip->fi_writers--; + if (fip->fi_writers == 0) + socantrcvmore(fip->fi_readsock); + } else { + fip->fi_readers--; + if (fip->fi_readers == 0) + socantsendmore(fip->fi_writesock); + } + if (vp->v_usecount > 1) + return (0); + error1 = soclose(fip->fi_readsock); + error2 = soclose(fip->fi_writesock); + FREE(fip, M_VNODE); + vp->v_fifoinfo = NULL; + if (error1) + return (error1); + return (error2); +} + +/* + * Print out the contents of a fifo vnode. + * Always returns 0 so that the caller receives a defined value. + */ +fifo_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + printf("tag VT_NON"); + fifo_printinfo(ap->a_vp); + printf("\n"); + return (0); +} + +/* + * Print out internal contents of a fifo vnode. + * fi_readers and fi_writers are longs, so they are printed with %ld. + */ +fifo_printinfo(vp) + struct vnode *vp; +{ + register struct fifoinfo *fip = vp->v_fifoinfo; + + printf(", fifo with %ld readers and %ld writers", + fip->fi_readers, fip->fi_writers); +} + +/* + * Return POSIX pathconf information applicable to fifo's. 
+ */ +fifo_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = LINK_MAX; + return (0); + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + return (0); + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + return (0); + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Fifo failed operation + */ +fifo_ebadf() +{ + + return (EBADF); +} + +/* + * Fifo advisory byte-level locks. + */ +/* ARGSUSED */ +fifo_advlock(ap) + struct vop_advlock_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap; +{ + + return (EOPNOTSUPP); +} + +/* + * Fifo bad operation + */ +fifo_badop() +{ + + panic("fifo_badop called"); + /* NOTREACHED */ +} diff --git a/sys/miscfs/kernfs/kernfs.h b/sys/miscfs/kernfs/kernfs.h new file mode 100644 index 00000000000..75ddecc6db1 --- /dev/null +++ b/sys/miscfs/kernfs/kernfs.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kernfs.h 8.4 (Berkeley) 1/21/94 + */ + +#define _PATH_KERNFS "/kern" /* Default mountpoint */ + +#ifdef KERNEL +struct kernfs_mount { + struct vnode *kf_root; /* Root node */ +}; + +struct kernfs_node { + struct kern_target *kf_kt; +}; + +#define VFSTOKERNFS(mp) ((struct kernfs_mount *)((mp)->mnt_data)) +#define VTOKERN(vp) ((struct kernfs_node *)(vp)->v_data) + +extern int (**kernfs_vnodeop_p)(); +extern struct vfsops kernfs_vfsops; +extern struct vnode *rrootvp; +#endif /* KERNEL */ diff --git a/sys/miscfs/kernfs/kernfs_vfsops.c b/sys/miscfs/kernfs/kernfs_vfsops.c new file mode 100644 index 00000000000..b68d76eaddf --- /dev/null +++ b/sys/miscfs/kernfs/kernfs_vfsops.c @@ -0,0 +1,329 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kernfs_vfsops.c 8.4 (Berkeley) 1/21/94 + */ + +/* + * Kernel params Filesystem + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct vnode *rrootvp; + +/* + * Create a vnode for a character device. + */ +int +cdevvp(dev, vpp) + dev_t dev; + struct vnode **vpp; +{ + register struct vnode *vp; + struct vnode *nvp; + int error; + + if (dev == NODEV) + return (0); + error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp); + if (error) { + *vpp = 0; + return (error); + } + vp = nvp; + vp->v_type = VCHR; + if (nvp = checkalias(vp, dev, (struct mount *)0)) { + vput(vp); + vp = nvp; + } + *vpp = vp; + return (0); +} + +kernfs_init() +{ + int cmaj; + int bmaj = major(rootdev); + int error = ENXIO; + +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_init\n"); /* printed during system boot */ +#endif + + for (cmaj = 0; cmaj < nchrdev; cmaj++) { + if (cdevsw[cmaj].d_open == bdevsw[bmaj].d_open) { + dev_t cdev = makedev(cmaj, minor(rootdev)); + error = cdevvp(cdev, &rrootvp); + if (error == 0) + break; + } + } + + if (error) { + printf("kernfs: no raw boot device\n"); + rrootvp = 0; + } +} + +/* + * Mount the Kernel params filesystem + */ +kernfs_mount(mp, path, data, ndp, p) + struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + int error = 0; + u_int size; + struct kernfs_mount *fmp; + struct vnode *rvp; + +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_mount(mp = %x)\n", mp); +#endif + + /* + * Update is a no-op + */ + if (mp->mnt_flag & MNT_UPDATE) + return (EOPNOTSUPP); + + error = getnewvnode(VT_KERNFS, mp, kernfs_vnodeop_p, &rvp); /* XXX */ + if (error) + return (error); + + MALLOC(fmp, struct kernfs_mount *, sizeof(struct kernfs_mount), + M_UFSMNT, M_WAITOK); /* XXX */ + rvp->v_type = VDIR; + rvp->v_flag |= VROOT; +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_mount: root vp = %x\n", rvp); +#endif + fmp->kf_root = rvp; + mp->mnt_flag |= 
MNT_LOCAL; + mp->mnt_data = (qaddr_t) fmp; + getnewfsid(mp, MOUNT_KERNFS); + + (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); + bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); + bzero(mp->mnt_stat.f_mntfromname, MNAMELEN); + bcopy("kernfs", mp->mnt_stat.f_mntfromname, sizeof("kernfs")); +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_mount: at %s\n", mp->mnt_stat.f_mntonname); +#endif + return (0); +} + +kernfs_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + return (0); +} + +kernfs_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + int error; + int flags = 0; + extern int doforce; + struct vnode *rootvp = VFSTOKERNFS(mp)->kf_root; + +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_unmount(mp = %x)\n", mp); +#endif + + if (mntflags & MNT_FORCE) { + /* kernfs can never be rootfs so don't check for it */ + if (!doforce) + return (EINVAL); + flags |= FORCECLOSE; + } + + /* + * Clear out buffer cache. I don't think we + * ever get anything cached at this level at the + * moment, but who knows... + */ + if (rootvp->v_usecount > 1) + return (EBUSY); +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_unmount: calling vflush\n"); +#endif + if (error = vflush(mp, rootvp, flags)) + return (error); + +#ifdef KERNFS_DIAGNOSTIC + vprint("kernfs root", rootvp); +#endif + /* + * Release reference on underlying root vnode + */ + vrele(rootvp); + /* + * And blow it away for future re-use + */ + vgone(rootvp); + /* + * Finally, throw away the kernfs_mount structure + */ + free(mp->mnt_data, M_UFSMNT); /* XXX */ + mp->mnt_data = 0; + return 0; +} + +kernfs_root(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + struct vnode *vp; + +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_root(mp = %x)\n", mp); +#endif + + /* + * Return locked reference to root. 
+ */ + vp = VFSTOKERNFS(mp)->kf_root; + VREF(vp); + VOP_LOCK(vp); + *vpp = vp; + return (0); +} + +kernfs_quotactl(mp, cmd, uid, arg, p) + struct mount *mp; + int cmd; + uid_t uid; + caddr_t arg; + struct proc *p; +{ + return (EOPNOTSUPP); +} + +kernfs_statfs(mp, sbp, p) + struct mount *mp; + struct statfs *sbp; + struct proc *p; +{ +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_statfs(mp = %x)\n", mp); +#endif + + sbp->f_type = MOUNT_KERNFS; + sbp->f_flags = 0; + sbp->f_bsize = DEV_BSIZE; + sbp->f_iosize = DEV_BSIZE; + sbp->f_blocks = 2; /* 1K to keep df happy */ + sbp->f_bfree = 0; + sbp->f_bavail = 0; + sbp->f_files = 0; + sbp->f_ffree = 0; + if (sbp != &mp->mnt_stat) { + bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); + bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); + bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); + } + return (0); +} + +kernfs_sync(mp, waitfor) + struct mount *mp; + int waitfor; +{ + return (0); +} + +/* + * Kernfs flat namespace lookup. + * Currently unsupported. + */ +kernfs_vget(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + + return (EOPNOTSUPP); +} + + +kernfs_fhtovp(mp, fhp, setgen, vpp) + struct mount *mp; + struct fid *fhp; + int setgen; + struct vnode **vpp; +{ + return (EOPNOTSUPP); +} + +kernfs_vptofh(vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + return (EOPNOTSUPP); +} + +struct vfsops kernfs_vfsops = { + kernfs_mount, + kernfs_start, + kernfs_unmount, + kernfs_root, + kernfs_quotactl, + kernfs_statfs, + kernfs_sync, + kernfs_vget, + kernfs_fhtovp, + kernfs_vptofh, + kernfs_init, +}; diff --git a/sys/miscfs/kernfs/kernfs_vnops.c b/sys/miscfs/kernfs/kernfs_vnops.c new file mode 100644 index 00000000000..10b7d7c0a64 --- /dev/null +++ b/sys/miscfs/kernfs/kernfs_vnops.c @@ -0,0 +1,759 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kernfs_vnops.c 8.6 (Berkeley) 2/10/94 + */ + +/* + * Kernel parameter filesystem (/kern) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define KSTRING 256 /* Largest I/O available via this filesystem */ +#define UIO_MX 32 + +#define READ_MODE (S_IRUSR|S_IRGRP|S_IROTH) +#define WRITE_MODE (S_IWUSR|S_IRUSR|S_IRGRP|S_IROTH) +#define DIR_MODE (S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH) + +struct kern_target { + char *kt_name; + void *kt_data; +#define KTT_NULL 1 +#define KTT_TIME 5 +#define KTT_INT 17 +#define KTT_STRING 31 +#define KTT_HOSTNAME 47 +#define KTT_AVENRUN 53 + int kt_tag; + int kt_rw; + int kt_vtype; +} kern_targets[] = { +/* NOTE: The name must be less than UIO_MX-16 chars in length */ + /* name data tag ro/rw */ + { ".", 0, KTT_NULL, VREAD, VDIR }, + { "..", 0, KTT_NULL, VREAD, VDIR }, + { "boottime", &boottime.tv_sec, KTT_INT, VREAD, VREG }, + { "copyright", copyright, KTT_STRING, VREAD, VREG }, + { "hostname", 0, KTT_HOSTNAME, VREAD|VWRITE, VREG }, + { "hz", &hz, KTT_INT, VREAD, VREG }, + { "loadavg", 0, KTT_AVENRUN, VREAD, VREG }, + { "pagesize", &cnt.v_page_size, KTT_INT, VREAD, VREG }, + { "physmem", &physmem, KTT_INT, VREAD, VREG }, +#if 0 + { "root", 0, KTT_NULL, VREAD, VDIR }, +#endif + { "rootdev", 0, KTT_NULL, VREAD, VBLK }, + { "rrootdev", 0, KTT_NULL, VREAD, VCHR }, + { "time", 0, KTT_TIME, VREAD, VREG }, + { "version", version, KTT_STRING, VREAD, VREG }, +}; + +static int nkern_targets = sizeof(kern_targets) / sizeof(kern_targets[0]); + +static int +kernfs_xread(kt, buf, len, lenp) + struct kern_target *kt; + char *buf; + int len; + int *lenp; +{ + switch (kt->kt_tag) { + case KTT_TIME: { + struct timeval tv; + microtime(&tv); + sprintf(buf, "%d %d\n", tv.tv_sec, tv.tv_usec); + break; + } + + case KTT_INT: { + int *ip = kt->kt_data; + sprintf(buf, "%d\n", *ip); + break; + } + + case KTT_STRING: { + 
char *cp = kt->kt_data; + int xlen = strlen(cp) + 1; + + if (xlen >= len) + return (EINVAL); + + bcopy(cp, buf, xlen); + break; + } + + case KTT_HOSTNAME: { + char *cp = hostname; + int xlen = hostnamelen; + + if (xlen >= (len-2)) + return (EINVAL); + + bcopy(cp, buf, xlen); + buf[xlen] = '\n'; + buf[xlen+1] = '\0'; + break; + } + + case KTT_AVENRUN: + sprintf(buf, "%ld %ld %ld %ld\n", + averunnable.ldavg[0], + averunnable.ldavg[1], + averunnable.ldavg[2], + averunnable.fscale); + break; + + default: + return (EINVAL); + } + + *lenp = strlen(buf); + return (0); +} + +static int +kernfs_xwrite(kt, buf, len) + struct kern_target *kt; + char *buf; + int len; +{ + switch (kt->kt_tag) { + case KTT_HOSTNAME: { + if (buf[len-1] == '\n') + --len; + bcopy(buf, hostname, len); + hostname[len] = '\0'; + hostnamelen = len; + return (0); + } + + default: + return (EIO); + } +} + + +/* + * vp is the current namei directory + * ndp is the name to locate in that directory... + */ +kernfs_lookup(ap) + struct vop_lookup_args /* { + struct vnode * a_dvp; + struct vnode ** a_vpp; + struct componentname * a_cnp; + } */ *ap; +{ + struct vnode **vpp = ap->a_vpp; + struct vnode *dvp = ap->a_dvp; + struct componentname *cnp = ap->a_cnp; + struct vnode *fvp; + int error, i; + char *pname; + +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_lookup(%x)\n", ap); + printf("kernfs_lookup(dp = %x, vpp = %x, cnp = %x)\n", dvp, vpp, ap->a_cnp); +#endif + pname = cnp->cn_nameptr; +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_lookup(%s)\n", pname); +#endif + if (cnp->cn_namelen == 1 && *pname == '.') { + *vpp = dvp; + VREF(dvp); + /*VOP_LOCK(dvp);*/ + return (0); + } + +#if 0 + if (cnp->cn_namelen == 4 && bcmp(pname, "root", 4) == 0) { + *vpp = rootdir; + VREF(rootdir); + VOP_LOCK(rootdir); + return (0); + } +#endif + + /* + * /kern/rootdev is the root device + */ + if (cnp->cn_namelen == 7 && bcmp(pname, "rootdev", 7) == 0) { + *vpp = rootvp; + VREF(rootvp); + VOP_LOCK(rootvp); + return (0); + } + + /* + * 
/kern/rrootdev is the raw root device + */ + if (cnp->cn_namelen == 8 && bcmp(pname, "rrootdev", 8) == 0) { + if (rrootvp) { + *vpp = rrootvp; + VREF(rrootvp); + VOP_LOCK(rrootvp); + return (0); + } + error = ENXIO; + goto bad; + } + + error = ENOENT; + + for (i = 0; i < nkern_targets; i++) { + struct kern_target *kt = &kern_targets[i]; + if (cnp->cn_namelen == strlen(kt->kt_name) && + bcmp(kt->kt_name, pname, cnp->cn_namelen) == 0) { + error = 0; + break; + } + } + +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_lookup: i = %d, error = %d\n", i, error); +#endif + + if (error) + goto bad; + +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_lookup: allocate new vnode\n"); +#endif + error = getnewvnode(VT_KERNFS, dvp->v_mount, kernfs_vnodeop_p, &fvp); + if (error) + goto bad; + MALLOC(fvp->v_data, void *, sizeof(struct kernfs_node), M_TEMP, M_WAITOK); + VTOKERN(fvp)->kf_kt = &kern_targets[i]; + fvp->v_type = VTOKERN(fvp)->kf_kt->kt_vtype; + *vpp = fvp; +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_lookup: newvp = %x\n", fvp); +#endif + return (0); + +bad:; + *vpp = NULL; +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_lookup: error = %d\n", error); +#endif + return (error); +} + +kernfs_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + /* + * Can always open the root (modulo perms) + */ + if (vp->v_flag & VROOT) + return (0); + +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_open, mode = %x, file = %s\n", + ap->a_mode, VTOKERN(vp)->kf_kt->kt_name); +#endif + + if ((ap->a_mode & FWRITE) && !(VTOKERN(vp)->kf_kt->kt_rw & VWRITE)) + return (EOPNOTSUPP); + + return (0); +} + +static int +kernfs_access(ap) + struct vop_access_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct ucred *cred = ap->a_cred; + mode_t mode = ap->a_mode; + + if (mode & VEXEC) { + if (vp->v_flag & VROOT) + return 
(0); + return (EACCES); + } + + if (cred->cr_uid == 0) { + if ((vp->v_flag & VROOT) == 0) { + struct kern_target *kt = VTOKERN(vp)->kf_kt; + + if ((mode & VWRITE) && !(kt->kt_rw & VWRITE)) + return (EROFS); + } + return (0); + } + + if (mode & VWRITE) + return (EACCES); + + return (0); +} + + +kernfs_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct vattr *vap = ap->a_vap; + int error = 0; + char strbuf[KSTRING]; + + bzero((caddr_t) vap, sizeof(*vap)); + vattr_null(vap); + vap->va_uid = 0; + vap->va_gid = 0; + vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; + /* vap->va_qsize = 0; */ + vap->va_blocksize = DEV_BSIZE; + microtime(&vap->va_atime); + vap->va_mtime = vap->va_atime; + vap->va_ctime = vap->va_ctime; + vap->va_gen = 0; + vap->va_flags = 0; + vap->va_rdev = 0; + /* vap->va_qbytes = 0; */ + vap->va_bytes = 0; + + if (vp->v_flag & VROOT) { +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_getattr: stat rootdir\n"); +#endif + vap->va_type = VDIR; + vap->va_mode = DIR_MODE; + vap->va_nlink = 2; + vap->va_fileid = 2; + vap->va_size = DEV_BSIZE; + } else { + struct kern_target *kt = VTOKERN(vp)->kf_kt; + int nbytes; +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_getattr: stat target %s\n", kt->kt_name); +#endif + vap->va_type = kt->kt_vtype; + vap->va_mode = (kt->kt_rw & VWRITE ? WRITE_MODE : READ_MODE); + vap->va_nlink = 1; + vap->va_fileid = 3 + (kt - kern_targets) / sizeof(*kt); + error = kernfs_xread(kt, strbuf, sizeof(strbuf), &nbytes); + vap->va_size = nbytes; + } + + vp->v_type = vap->va_type; +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_getattr: return error %d\n", error); +#endif + return (error); +} + +kernfs_setattr(ap) + struct vop_setattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + /* + * Silently ignore attribute changes. 
+ * This allows for open with truncate to have no + * effect until some data is written. I want to + * do it this way because all writes are atomic. + */ + return (0); +} + +static int +kernfs_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct uio *uio = ap->a_uio; + struct kern_target *kt; + char strbuf[KSTRING]; + int off = uio->uio_offset; + int error, len; + char *cp; + + if (vp->v_flag & VROOT) + return (EOPNOTSUPP); + + kt = VTOKERN(vp)->kf_kt; + +#ifdef KERNFS_DIAGNOSTIC + printf("kern_read %s\n", kt->kt_name); +#endif + + len = 0; + error = kernfs_xread(kt, strbuf, sizeof(strbuf), &len); + if (error) + return (error); + cp = strbuf + off; + len -= off; + return (uiomove(cp, len, uio)); +} + +static int +kernfs_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct uio *uio = ap->a_uio; + struct kern_target *kt; + int error, xlen; + char strbuf[KSTRING]; + + if (vp->v_flag & VROOT) + return (0); + + kt = VTOKERN(vp)->kf_kt; + + if (uio->uio_offset != 0) + return (EINVAL); + + xlen = min(uio->uio_resid, KSTRING-1); + error = uiomove(strbuf, xlen, uio); + if (error) + return (error); + + if (uio->uio_resid != 0) + return (EIO); + + strbuf[xlen] = '\0'; + xlen = strlen(strbuf); + return (kernfs_xwrite(kt, strbuf, xlen)); +} + + +kernfs_readdir(ap) + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + struct uio *uio = ap->a_uio; + int i; + int error; + + i = uio->uio_offset / UIO_MX; + error = 0; + while (uio->uio_resid > 0 && i < nkern_targets) { + struct dirent d; + struct dirent *dp = &d; + struct kern_target *kt = &kern_targets[i]; +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_readdir: i = %d\n", i); +#endif + + bzero((caddr_t) dp, UIO_MX); + + dp->d_namlen = 
strlen(kt->kt_name); + bcopy(kt->kt_name, dp->d_name, dp->d_namlen+1); + +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_readdir: name = %s, len = %d\n", + dp->d_name, dp->d_namlen); +#endif + /* + * Fill in the remaining fields + */ + dp->d_reclen = UIO_MX; + dp->d_fileno = i + 3; + dp->d_type = DT_UNKNOWN; /* XXX */ + /* + * And ship to userland + */ + error = uiomove((caddr_t) dp, UIO_MX, uio); + if (error) + break; + i++; + } + + uio->uio_offset = i * UIO_MX; + + return (error); +} + +kernfs_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + /* + * Clear out the v_type field to avoid + * nasty things happening in vgone(). + */ + vp->v_type = VNON; +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_inactive(%x)\n", vp); +#endif + return (0); +} + +kernfs_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; +#ifdef KERNFS_DIAGNOSTIC + printf("kernfs_reclaim(%x)\n", vp); +#endif + if (vp->v_data) { + FREE(vp->v_data, M_TEMP); + vp->v_data = 0; + } + return (0); +} + +/* + * Return POSIX pathconf information applicable to special devices. + */ +kernfs_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = LINK_MAX; + return (0); + case _PC_MAX_CANON: + *ap->a_retval = MAX_CANON; + return (0); + case _PC_MAX_INPUT: + *ap->a_retval = MAX_INPUT; + return (0); + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + return (0); + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + return (0); + case _PC_VDISABLE: + *ap->a_retval = _POSIX_VDISABLE; + return (0); + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Print out the contents of a /dev/fd vnode. 
+ */ +/* ARGSUSED */ +kernfs_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + printf("tag VT_KERNFS, kernfs vnode\n"); + return (0); +} + +/*void*/ +kernfs_vfree(ap) + struct vop_vfree_args /* { + struct vnode *a_pvp; + ino_t a_ino; + int a_mode; + } */ *ap; +{ + + return (0); +} + +/* + * kernfs vnode unsupported operation: always fails with EOPNOTSUPP + */ +kernfs_enotsupp() +{ + + return (EOPNOTSUPP); +} + +/* + * kernfs "should never get here" operation: panics if it is ever called + */ +kernfs_badop() +{ + + panic("kernfs: bad op"); + /* NOTREACHED */ +} + +/* + * kernfs vnode null operation: does nothing and reports success + */ +kernfs_nullop() +{ + + return (0); +} + +#define kernfs_create ((int (*) __P((struct vop_create_args *)))kernfs_enotsupp) +#define kernfs_mknod ((int (*) __P((struct vop_mknod_args *)))kernfs_enotsupp) +#define kernfs_close ((int (*) __P((struct vop_close_args *)))nullop) +#define kernfs_ioctl ((int (*) __P((struct vop_ioctl_args *)))kernfs_enotsupp) +#define kernfs_select ((int (*) __P((struct vop_select_args *)))kernfs_enotsupp) +#define kernfs_mmap ((int (*) __P((struct vop_mmap_args *)))kernfs_enotsupp) +#define kernfs_fsync ((int (*) __P((struct vop_fsync_args *)))nullop) +#define kernfs_seek ((int (*) __P((struct vop_seek_args *)))nullop) +#define kernfs_remove ((int (*) __P((struct vop_remove_args *)))kernfs_enotsupp) +#define kernfs_link ((int (*) __P((struct vop_link_args *)))kernfs_enotsupp) +#define kernfs_rename ((int (*) __P((struct vop_rename_args *)))kernfs_enotsupp) +#define kernfs_mkdir ((int (*) __P((struct vop_mkdir_args *)))kernfs_enotsupp) +#define kernfs_rmdir ((int (*) __P((struct vop_rmdir_args *)))kernfs_enotsupp) +#define kernfs_symlink ((int (*) __P((struct vop_symlink_args *)))kernfs_enotsupp) +#define kernfs_readlink \ + ((int (*) __P((struct vop_readlink_args *)))kernfs_enotsupp) +#define kernfs_abortop ((int (*) __P((struct vop_abortop_args *)))nullop) +#define kernfs_lock ((int (*) __P((struct vop_lock_args *)))nullop) +#define kernfs_unlock ((int (*) 
__P((struct vop_unlock_args *)))nullop) +#define kernfs_bmap ((int (*) __P((struct vop_bmap_args *)))kernfs_badop) +#define kernfs_strategy ((int (*) __P((struct vop_strategy_args *)))kernfs_badop) +#define kernfs_islocked ((int (*) __P((struct vop_islocked_args *)))nullop) +#define kernfs_advlock ((int (*) __P((struct vop_advlock_args *)))kernfs_enotsupp) +#define kernfs_blkatoff \ + ((int (*) __P((struct vop_blkatoff_args *)))kernfs_enotsupp) +#define kernfs_valloc ((int(*) __P(( \ + struct vnode *pvp, \ + int mode, \ + struct ucred *cred, \ + struct vnode **vpp))) kernfs_enotsupp) +#define kernfs_truncate \ + ((int (*) __P((struct vop_truncate_args *)))kernfs_enotsupp) +#define kernfs_update ((int (*) __P((struct vop_update_args *)))kernfs_enotsupp) +#define kernfs_bwrite ((int (*) __P((struct vop_bwrite_args *)))kernfs_enotsupp) + +int (**kernfs_vnodeop_p)(); +struct vnodeopv_entry_desc kernfs_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, kernfs_lookup }, /* lookup */ + { &vop_create_desc, kernfs_create }, /* create */ + { &vop_mknod_desc, kernfs_mknod }, /* mknod */ + { &vop_open_desc, kernfs_open }, /* open */ + { &vop_close_desc, kernfs_close }, /* close */ + { &vop_access_desc, kernfs_access }, /* access */ + { &vop_getattr_desc, kernfs_getattr }, /* getattr */ + { &vop_setattr_desc, kernfs_setattr }, /* setattr */ + { &vop_read_desc, kernfs_read }, /* read */ + { &vop_write_desc, kernfs_write }, /* write */ + { &vop_ioctl_desc, kernfs_ioctl }, /* ioctl */ + { &vop_select_desc, kernfs_select }, /* select */ + { &vop_mmap_desc, kernfs_mmap }, /* mmap */ + { &vop_fsync_desc, kernfs_fsync }, /* fsync */ + { &vop_seek_desc, kernfs_seek }, /* seek */ + { &vop_remove_desc, kernfs_remove }, /* remove */ + { &vop_link_desc, kernfs_link }, /* link */ + { &vop_rename_desc, kernfs_rename }, /* rename */ + { &vop_mkdir_desc, kernfs_mkdir }, /* mkdir */ + { &vop_rmdir_desc, kernfs_rmdir }, /* rmdir */ + { &vop_symlink_desc, 
kernfs_symlink }, /* symlink */ + { &vop_readdir_desc, kernfs_readdir }, /* readdir */ + { &vop_readlink_desc, kernfs_readlink },/* readlink */ + { &vop_abortop_desc, kernfs_abortop }, /* abortop */ + { &vop_inactive_desc, kernfs_inactive },/* inactive */ + { &vop_reclaim_desc, kernfs_reclaim }, /* reclaim */ + { &vop_lock_desc, kernfs_lock }, /* lock */ + { &vop_unlock_desc, kernfs_unlock }, /* unlock */ + { &vop_bmap_desc, kernfs_bmap }, /* bmap */ + { &vop_strategy_desc, kernfs_strategy },/* strategy */ + { &vop_print_desc, kernfs_print }, /* print */ + { &vop_islocked_desc, kernfs_islocked },/* islocked */ + { &vop_pathconf_desc, kernfs_pathconf },/* pathconf */ + { &vop_advlock_desc, kernfs_advlock }, /* advlock */ + { &vop_blkatoff_desc, kernfs_blkatoff },/* blkatoff */ + { &vop_valloc_desc, kernfs_valloc }, /* valloc */ + { &vop_vfree_desc, kernfs_vfree }, /* vfree */ + { &vop_truncate_desc, kernfs_truncate },/* truncate */ + { &vop_update_desc, kernfs_update }, /* update */ + { &vop_bwrite_desc, kernfs_bwrite }, /* bwrite */ + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc kernfs_vnodeop_opv_desc = + { &kernfs_vnodeop_p, kernfs_vnodeop_entries }; diff --git a/sys/miscfs/nullfs/null.h b/sys/miscfs/nullfs/null.h new file mode 100644 index 00000000000..14286ffeee0 --- /dev/null +++ b/sys/miscfs/nullfs/null.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ *	@(#)null.h	8.2 (Berkeley) 1/21/94
+ *
+ * $Id: lofs.h,v 1.8 1992/05/30 10:05:43 jsp Exp jsp $
+ */
+
+struct null_args {	/* mount arguments copied in from userland by nullfs_mount */
+	char		*target;	/* Target of loopback */
+};
+
+struct null_mount {	/* per-mount state, reached through mp->mnt_data */
+	struct mount	*nullm_vfs;	/* Mount of the lower filesystem */
+	struct vnode	*nullm_rootvp;	/* Reference to root null_node */
+};
+
+#ifdef KERNEL
+/*
+ * A cache of vnode references: one null_node per alias vnode, linked on a hash chain
+ */
+struct null_node {
+	struct null_node	*null_forw;	/* Hash chain */
+	struct null_node	*null_back;	/* Hash chain, backward link */
+	struct vnode	*null_lowervp;	/* VREFed once */
+	struct vnode	*null_vnode;	/* Back pointer */
+};
+
+extern int null_node_create __P((struct mount *mp, struct vnode *target, struct vnode **vpp));
+
+#define	MOUNTTONULLMOUNT(mp) ((struct null_mount *)((mp)->mnt_data))	/* mount -> null_mount */
+#define	VTONULL(vp) ((struct null_node *)(vp)->v_data)	/* alias vnode -> null_node */
+#define	NULLTOV(xp) ((xp)->null_vnode)	/* null_node -> alias vnode */
+#ifdef NULLFS_DIAGNOSTIC
+extern struct vnode *null_checkvp __P((struct vnode *vp, char *fil, int lno));
+#define	NULLVPTOLOWERVP(vp) null_checkvp((vp), __FILE__, __LINE__)	/* checked variant: panics on a bad lower vnode */
+#else
+#define	NULLVPTOLOWERVP(vp) (VTONULL(vp)->null_lowervp)	/* alias vnode -> lower vnode */
+#endif
+
+extern int (**null_vnodeop_p)();
+extern struct vfsops null_vfsops;
+#endif /* KERNEL */
diff --git a/sys/miscfs/nullfs/null_subr.c b/sys/miscfs/nullfs/null_subr.c
new file mode 100644
index 00000000000..a31723fe4c2
--- /dev/null
+++ b/sys/miscfs/nullfs/null_subr.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 1992, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software donated to Berkeley by
+ * Jan-Simon Pendry.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)null_subr.c 8.4 (Berkeley) 1/21/94 + * + * $Id: lofs_subr.c,v 1.11 1992/05/30 10:05:43 jsp Exp jsp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LOG2_SIZEVNODE 7 /* log2(sizeof struct vnode) */ +#define NNULLNODECACHE 16 +#define NULL_NHASH(vp) ((((u_long)vp)>>LOG2_SIZEVNODE) & (NNULLNODECACHE-1)) + +/* + * Null layer cache: + * Each cache entry holds a reference to the lower vnode + * along with a pointer to the alias vnode. When an + * entry is added the lower vnode is VREF'd. 
When the
+ * alias is removed the lower vnode is vrele'd.
+ */
+
+/*
+ * Cache head: overlays the two link fields at the start of struct null_node
+ */
+struct null_node_cache {
+	struct null_node	*ac_forw;
+	struct null_node	*ac_back;
+};
+
+static struct null_node_cache null_node_cache[NNULLNODECACHE];
+
+/*
+ * Initialise cache headers: each bucket becomes an empty circular list pointing at itself
+ */
+nullfs_init()
+{
+	struct null_node_cache *ac;
+#ifdef NULLFS_DIAGNOSTIC
+	printf("nullfs_init\n");		/* printed during system boot */
+#endif
+
+	for (ac = null_node_cache; ac < null_node_cache + NNULLNODECACHE; ac++)
+		ac->ac_forw = ac->ac_back = (struct null_node *) ac;
+}
+
+/*
+ * Compute hash list for given lower vnode (bucket chosen by NULL_NHASH of its address)
+ */
+static struct null_node_cache *
+null_node_hash(lowervp)
+struct vnode *lowervp;
+{
+
+	return (&null_node_cache[NULL_NHASH(lowervp)]);
+}
+
+/*
+ * Return a VREF'ed alias for lower vnode on mount mp if one already exists, else 0.
+ */
+static struct vnode *
+null_node_find(mp, lowervp)
+	struct mount *mp;
+	struct vnode *lowervp;
+{
+	struct null_node_cache *hd;
+	struct null_node *a;
+	struct vnode *vp;
+
+	/*
+	 * Find hash base, and then search the (two-way) linked
+	 * list looking for a null_node structure which is referencing
+	 * the lower vnode.  If found, then increment the null_node
+	 * reference count (but NOT the lower vnode's VREF counter).
+	 */
+	hd = null_node_hash(lowervp);
+loop:
+	for (a = hd->ac_forw; a != (struct null_node *) hd; a = a->null_forw) {
+		if (a->null_lowervp == lowervp && NULLTOV(a)->v_mount == mp) {
+			vp = NULLTOV(a);
+			/*
+			 * We need vget for the VXLOCK
+			 * stuff, but we don't want to lock
+			 * the lower node.
+			 */
+			if (vget(vp, 0)) {
+				printf ("null_node_find: vget failed.\n");
+				goto loop;	/* rescan the chain from its head */
+			};
+			return (vp);
+		}
+	}
+
+	return NULL;
+}
+
+
+/*
+ * Make a new null_node node.
+ * Vp is the alias vnode, lowervp is the lower vnode.
+ * Maintain a reference to (lowervp).
+ */ +static int +null_node_alloc(mp, lowervp, vpp) + struct mount *mp; + struct vnode *lowervp; + struct vnode **vpp; +{ + struct null_node_cache *hd; + struct null_node *xp; + struct vnode *othervp, *vp; + int error; + + if (error = getnewvnode(VT_NULL, mp, null_vnodeop_p, vpp)) + return (error); + vp = *vpp; + + MALLOC(xp, struct null_node *, sizeof(struct null_node), M_TEMP, M_WAITOK); + vp->v_type = lowervp->v_type; + xp->null_vnode = vp; + vp->v_data = xp; + xp->null_lowervp = lowervp; + /* + * Before we insert our new node onto the hash chains, + * check to see if someone else has beaten us to it. + * (We could have slept in MALLOC.) + */ + if (othervp = null_node_find(lowervp)) { + FREE(xp, M_TEMP); + vp->v_type = VBAD; /* node is discarded */ + vp->v_usecount = 0; /* XXX */ + *vpp = othervp; + return 0; + }; + VREF(lowervp); /* Extra VREF will be vrele'd in null_node_create */ + hd = null_node_hash(lowervp); + insque(xp, hd); + return 0; +} + + +/* + * Try to find an existing null_node vnode refering + * to it, otherwise make a new null_node vnode which + * contains a reference to the lower vnode. + */ +int +null_node_create(mp, lowervp, newvpp) + struct mount *mp; + struct vnode *lowervp; + struct vnode **newvpp; +{ + struct vnode *aliasvp; + + if (aliasvp = null_node_find(mp, lowervp)) { + /* + * null_node_find has taken another reference + * to the alias vnode. + */ +#ifdef NULLFS_DIAGNOSTIC + vprint("null_node_create: exists", NULLTOV(ap)); +#endif + /* VREF(aliasvp); --- done in null_node_find */ + } else { + int error; + + /* + * Get new vnode. + */ +#ifdef NULLFS_DIAGNOSTIC + printf("null_node_create: create new alias vnode\n"); +#endif + + /* + * Make new vnode reference the null_node. + */ + if (error = null_node_alloc(mp, lowervp, &aliasvp)) + return error; + + /* + * aliasvp is already VREF'd by getnewvnode() + */ + } + + vrele(lowervp); + +#ifdef DIAGNOSTIC + if (lowervp->v_usecount < 1) { + /* Should never happen... 
*/ + vprint ("null_node_create: alias "); + vprint ("null_node_create: lower "); + printf ("null_node_create: lower has 0 usecount.\n"); + panic ("null_node_create: lower has 0 usecount."); + }; +#endif + +#ifdef NULLFS_DIAGNOSTIC + vprint("null_node_create: alias", aliasvp); + vprint("null_node_create: lower", lowervp); +#endif + + *newvpp = aliasvp; + return (0); +} +#ifdef NULLFS_DIAGNOSTIC +struct vnode * +null_checkvp(vp, fil, lno) + struct vnode *vp; + char *fil; + int lno; +{ + struct null_node *a = VTONULL(vp); +#ifdef notyet + /* + * Can't do this check because vop_reclaim runs + * with a funny vop vector. + */ + if (vp->v_op != null_vnodeop_p) { + printf ("null_checkvp: on non-null-node\n"); + while (null_checkvp_barrier) /*WAIT*/ ; + panic("null_checkvp"); + }; +#endif + if (a->null_lowervp == NULL) { + /* Should never happen */ + int i; u_long *p; + printf("vp = %x, ZERO ptr\n", vp); + for (p = (u_long *) a, i = 0; i < 8; i++) + printf(" %x", p[i]); + printf("\n"); + /* wait for debugger */ + while (null_checkvp_barrier) /*WAIT*/ ; + panic("null_checkvp"); + } + if (a->null_lowervp->v_usecount < 1) { + int i; u_long *p; + printf("vp = %x, unref'ed lowervp\n", vp); + for (p = (u_long *) a, i = 0; i < 8; i++) + printf(" %x", p[i]); + printf("\n"); + /* wait for debugger */ + while (null_checkvp_barrier) /*WAIT*/ ; + panic ("null with unref'ed lowervp"); + }; +#ifdef notyet + printf("null %x/%d -> %x/%d [%s, %d]\n", + NULLTOV(a), NULLTOV(a)->v_usecount, + a->null_lowervp, a->null_lowervp->v_usecount, + fil, lno); +#endif + return a->null_lowervp; +} +#endif diff --git a/sys/miscfs/nullfs/null_vfsops.c b/sys/miscfs/nullfs/null_vfsops.c new file mode 100644 index 00000000000..b0d2df75cda --- /dev/null +++ b/sys/miscfs/nullfs/null_vfsops.c @@ -0,0 +1,366 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)null_vfsops.c 8.2 (Berkeley) 1/21/94 + * + * @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92 + * $Id: lofs_vfsops.c,v 1.9 1992/05/30 10:26:24 jsp Exp jsp $ + */ + +/* + * Null Layer + * (See null_vnops.c for a description of what this does.) 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Mount null layer + */ +int +nullfs_mount(mp, path, data, ndp, p) + struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + int error = 0; + struct null_args args; + struct vnode *lowerrootvp, *vp; + struct vnode *nullm_rootvp; + struct null_mount *xmp; + u_int size; + +#ifdef NULLFS_DIAGNOSTIC + printf("nullfs_mount(mp = %x)\n", mp); +#endif + + /* + * Update is a no-op + */ + if (mp->mnt_flag & MNT_UPDATE) { + return (EOPNOTSUPP); + /* return VFS_MOUNT(MOUNTTONULLMOUNT(mp)->nullm_vfs, path, data, ndp, p);*/ + } + + /* + * Get argument + */ + if (error = copyin(data, (caddr_t)&args, sizeof(struct null_args))) + return (error); + + /* + * Find lower node + */ + NDINIT(ndp, LOOKUP, FOLLOW|WANTPARENT|LOCKLEAF, + UIO_USERSPACE, args.target, p); + if (error = namei(ndp)) + return (error); + + /* + * Sanity check on lower vnode + */ + lowerrootvp = ndp->ni_vp; + + vrele(ndp->ni_dvp); + ndp->ni_dvp = NULL; + + xmp = (struct null_mount *) malloc(sizeof(struct null_mount), + M_UFSMNT, M_WAITOK); /* XXX */ + + /* + * Save reference to underlying FS + */ + xmp->nullm_vfs = lowerrootvp->v_mount; + + /* + * Save reference. Each mount also holds + * a reference on the root vnode. + */ + error = null_node_create(mp, lowerrootvp, &vp); + /* + * Unlock the node (either the lower or the alias) + */ + VOP_UNLOCK(vp); + /* + * Make sure the node alias worked + */ + if (error) { + vrele(lowerrootvp); + free(xmp, M_UFSMNT); /* XXX */ + return (error); + } + + /* + * Keep a held reference to the root vnode. + * It is vrele'd in nullfs_unmount. 
+ */ + nullm_rootvp = vp; + nullm_rootvp->v_flag |= VROOT; + xmp->nullm_rootvp = nullm_rootvp; + if (NULLVPTOLOWERVP(nullm_rootvp)->v_mount->mnt_flag & MNT_LOCAL) + mp->mnt_flag |= MNT_LOCAL; + mp->mnt_data = (qaddr_t) xmp; + getnewfsid(mp, MOUNT_LOFS); + + (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); + bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); + (void) copyinstr(args.target, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); +#ifdef NULLFS_DIAGNOSTIC + printf("nullfs_mount: lower %s, alias at %s\n", + mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname); +#endif + return (0); +} + +/* + * VFS start. Nothing needed here - the start routine + * on the underlying filesystem will have been called + * when that filesystem was mounted. + */ +int +nullfs_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + return (0); + /* return VFS_START(MOUNTTONULLMOUNT(mp)->nullm_vfs, flags, p); */ +} + +/* + * Free reference to null layer + */ +int +nullfs_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + struct vnode *nullm_rootvp = MOUNTTONULLMOUNT(mp)->nullm_rootvp; + int error; + int flags = 0; + extern int doforce; + +#ifdef NULLFS_DIAGNOSTIC + printf("nullfs_unmount(mp = %x)\n", mp); +#endif + + if (mntflags & MNT_FORCE) { + /* lofs can never be rootfs so don't check for it */ + if (!doforce) + return (EINVAL); + flags |= FORCECLOSE; + } + + /* + * Clear out buffer cache. I don't think we + * ever get anything cached at this level at the + * moment, but who knows... 
+ */ +#if 0 + mntflushbuf(mp, 0); + if (mntinvalbuf(mp, 1)) + return (EBUSY); +#endif + if (nullm_rootvp->v_usecount > 1) + return (EBUSY); + if (error = vflush(mp, nullm_rootvp, flags)) + return (error); + +#ifdef NULLFS_DIAGNOSTIC + vprint("alias root of lower", nullm_rootvp); +#endif + /* + * Release reference on underlying root vnode + */ + vrele(nullm_rootvp); + /* + * And blow it away for future re-use + */ + vgone(nullm_rootvp); + /* + * Finally, throw away the null_mount structure + */ + free(mp->mnt_data, M_UFSMNT); /* XXX */ + mp->mnt_data = 0; + return 0; +} + +int +nullfs_root(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + struct vnode *vp; + +#ifdef NULLFS_DIAGNOSTIC + printf("nullfs_root(mp = %x, vp = %x->%x)\n", mp, + MOUNTTONULLMOUNT(mp)->nullm_rootvp, + NULLVPTOLOWERVP(MOUNTTONULLMOUNT(mp)->nullm_rootvp) + ); +#endif + + /* + * Return locked reference to root. + */ + vp = MOUNTTONULLMOUNT(mp)->nullm_rootvp; + VREF(vp); + VOP_LOCK(vp); + *vpp = vp; + return 0; +} + +int +nullfs_quotactl(mp, cmd, uid, arg, p) + struct mount *mp; + int cmd; + uid_t uid; + caddr_t arg; + struct proc *p; +{ + return VFS_QUOTACTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, uid, arg, p); +} + +int +nullfs_statfs(mp, sbp, p) + struct mount *mp; + struct statfs *sbp; + struct proc *p; +{ + int error; + struct statfs mstat; + +#ifdef NULLFS_DIAGNOSTIC + printf("nullfs_statfs(mp = %x, vp = %x->%x)\n", mp, + MOUNTTONULLMOUNT(mp)->nullm_rootvp, + NULLVPTOLOWERVP(MOUNTTONULLMOUNT(mp)->nullm_rootvp) + ); +#endif + + bzero(&mstat, sizeof(mstat)); + + error = VFS_STATFS(MOUNTTONULLMOUNT(mp)->nullm_vfs, &mstat, p); + if (error) + return (error); + + /* now copy across the "interesting" information and fake the rest */ + sbp->f_type = mstat.f_type; + sbp->f_flags = mstat.f_flags; + sbp->f_bsize = mstat.f_bsize; + sbp->f_iosize = mstat.f_iosize; + sbp->f_blocks = mstat.f_blocks; + sbp->f_bfree = mstat.f_bfree; + sbp->f_bavail = mstat.f_bavail; + sbp->f_files = mstat.f_files; + 
sbp->f_ffree = mstat.f_ffree; + if (sbp != &mp->mnt_stat) { + bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); + bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); + bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); + } + return (0); +} + +int +nullfs_sync(mp, waitfor, cred, p) + struct mount *mp; + int waitfor; + struct ucred *cred; + struct proc *p; +{ + /* + * XXX - Assumes no data cached at null layer. + */ + return (0); +} + +int +nullfs_vget(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + + return VFS_VGET(MOUNTTONULLMOUNT(mp)->nullm_vfs, ino, vpp); +} + +int +nullfs_fhtovp(mp, fidp, nam, vpp, exflagsp, credanonp) + struct mount *mp; + struct fid *fidp; + struct mbuf *nam; + struct vnode **vpp; + int *exflagsp; + struct ucred**credanonp; +{ + + return VFS_FHTOVP(MOUNTTONULLMOUNT(mp)->nullm_vfs, fidp, nam, vpp, exflagsp,credanonp); +} + +int +nullfs_vptofh(vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + return VFS_VPTOFH(NULLVPTOLOWERVP(vp), fhp); +} + +int nullfs_init __P((void)); + +struct vfsops null_vfsops = { + nullfs_mount, + nullfs_start, + nullfs_unmount, + nullfs_root, + nullfs_quotactl, + nullfs_statfs, + nullfs_sync, + nullfs_vget, + nullfs_fhtovp, + nullfs_vptofh, + nullfs_init, +}; diff --git a/sys/miscfs/nullfs/null_vnops.c b/sys/miscfs/nullfs/null_vnops.c new file mode 100644 index 00000000000..115ff6f4643 --- /dev/null +++ b/sys/miscfs/nullfs/null_vnops.c @@ -0,0 +1,462 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * John Heidemann of the UCLA Ficus project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)null_vnops.c 8.1 (Berkeley) 6/10/93 + * + * Ancestors: + * @(#)lofs_vnops.c 1.2 (Berkeley) 6/18/92 + * $Id: lofs_vnops.c,v 1.11 1992/05/30 10:05:43 jsp Exp jsp $ + * ...and... + * @(#)null_vnodeops.c 1.20 92/07/07 UCLA Ficus project + */ + +/* + * Null Layer + * + * (See mount_null(8) for more information.) + * + * The null layer duplicates a portion of the file system + * name space under a new name. 
In this respect, it is + * similar to the loopback file system. It differs from + * the loopback fs in two respects: it is implemented using + * stackable layers techniques, and its "null-node"s stack above + * all lower-layer vnodes, not just over directory vnodes. + * + * The null layer has two purposes. First, it serves as a demonstration + * of layering by providing a layer which does nothing. (It actually + * does everything the loopback file system does, which is slightly + * more than nothing.) Second, the null layer can serve as a prototype + * layer. Since it provides all necessary layer framework, + * new file system layers can be created very easily by starting + * with a null layer. + * + * The remainder of this man page examines the null layer as a basis + * for constructing new layers. + * + * + * INSTANTIATING NEW NULL LAYERS + * + * New null layers are created with mount_null(8). + * Mount_null(8) takes two arguments, the pathname + * of the lower vfs (target-pn) and the pathname where the null + * layer will appear in the namespace (alias-pn). After + * the null layer is put into place, the contents + * of target-pn subtree will be aliased under alias-pn. + * + * + * OPERATION OF A NULL LAYER + * + * The null layer is the minimum file system layer, + * simply bypassing all possible operations to the lower layer + * for processing there. The majority of its activity centers + * on the bypass routine, through which nearly all vnode operations + * pass. + * + * The bypass routine accepts arbitrary vnode operations for + * handling by the lower layer. It begins by examining vnode + * operation arguments and replacing any null-nodes by their + * lower-layer equivalents. It then invokes the operation + * on the lower layer. Finally, it replaces the null-nodes + * in the arguments and, if a vnode is returned by the operation, + * stacks a null-node on top of the returned vnode. 
+ * + * Although bypass handles most operations, + * vop_getattr, _inactive, _reclaim, and _print are not bypassed. + * Vop_getattr must change the fsid being returned. + * Vop_inactive and vop_reclaim are not bypassed so that + * they can handle freeing null-layer specific data. + * Vop_print is not bypassed to avoid excessive debugging + * information. + * + * + * INSTANTIATING VNODE STACKS + * + * Mounting associates the null layer with a lower layer, + * in effect stacking two VFSes. Vnode stacks are instead + * created on demand as files are accessed. + * + * The initial mount creates a single vnode stack for the + * root of the new null layer. All other vnode stacks + * are created as a result of vnode operations on + * this or other null vnode stacks. + * + * New vnode stacks come into existence as a result of + * an operation which returns a vnode. + * The bypass routine stacks a null-node above the new + * vnode before returning it to the caller. + * + * For example, imagine mounting a null layer with + * "mount_null /usr/include /dev/layer/null". + * Changing directory to /dev/layer/null will assign + * the root null-node (which was created when the null layer was mounted). + * Now consider opening "sys". A vop_lookup would be + * done on the root null-node. This operation would bypass through + * to the lower layer which would return a vnode representing + * the UFS "sys". Null_bypass then builds a null-node + * aliasing the UFS "sys" and returns this to the caller. + * Later operations on the null-node "sys" will repeat this + * process when constructing other vnode stacks. + * + * + * CREATING OTHER FILE SYSTEM LAYERS + * + * One of the easiest ways to construct new file system layers is to make + * a copy of the null layer, rename all files and variables, and + * then begin modifying the copy. Sed can be used to easily rename + * all variables. + * + * The umap layer is an example of a layer descended from the + * null layer. 
+ * + * + * INVOKING OPERATIONS ON LOWER LAYERS + * + * There are two techniques to invoke operations on a lower layer + * when the operation cannot be completely bypassed. Each method + * is appropriate in different situations. In both cases, + * it is the responsibility of the aliasing layer to make + * the operation arguments "correct" for the lower layer + * by mapping any vnode arguments to the lower layer. + * + * The first approach is to call the aliasing layer's bypass routine. + * This method is most suitable when you wish to invoke the operation + * currently being handled on the lower layer. It has the advantage + * that the bypass routine already must do argument mapping. + * An example of this is null_getattr in the null layer. + * + * A second approach is to directly invoke vnode operations on + * the lower layer with the VOP_OPERATIONNAME interface. + * The advantage of this method is that it is easy to invoke + * arbitrary operations on the lower layer. The disadvantage + * is that vnode arguments must be manually mapped. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +int null_bug_bypass = 0; /* for debugging: enables bypass printf'ing */ + +/* + * This is the 10-Apr-92 bypass routine. + * This version has been optimized for speed, throwing away some + * safety checks. It should still always work, but it's not as + * robust to programmer errors. + * Define SAFETY to include some error checking code. + * + * In general, we map all vnodes going down and unmap them on the way back. + * As an exception to this, vnodes can be marked "unmapped" by setting + * the Nth bit in operation's vdesc_flags. + * + * Also, some BSD vnode operations have the side effect of vrele'ing + * their arguments. With stacking, the reference counts are held + * by the upper node, not the lower one, so we must handle these + * side-effects here. 
This is not of concern in Sun-derived systems
+ * since there are no such side-effects.
+ *
+ * This makes the following assumptions:
+ * - only one returned vpp
+ * - no INOUT vpp's (Sun's vop_open has one of these)
+ * - the vnode operation vector of the first vnode should be used
+ *   to determine what implementation of the op should be invoked
+ * - all mapped vnodes are of our vnode-type (NEEDSWORK:
+ *   problems on rmdir'ing mount points and renaming?)
+ */
+int
+null_bypass(ap)
+	struct vop_generic_args /* {
+		struct vnodeop_desc *a_desc;
+		... operation-specific arguments follow, located via the descriptor's offsets ...
+	} */ *ap;
+{
+	extern int (**null_vnodeop_p)();  /* not extern, really "forward" */
+	register struct vnode **this_vp_p;
+	int error;
+	struct vnode *old_vps[VDESC_MAX_VPS];	/* original (null-layer) vnodes, NULL where not mapped */
+	struct vnode **vps_p[VDESC_MAX_VPS];	/* addresses of the vnode slots inside *ap */
+	struct vnode ***vppp;
+	struct vnodeop_desc *descp = ap->a_desc;
+	int reles, i;
+
+	if (null_bug_bypass)
+		printf ("null_bypass: %s\n", descp->vdesc_name);
+
+#ifdef SAFETY
+	/*
+	 * We require at least one vp.
+	 */
+	if (descp->vdesc_vp_offsets == NULL ||
+	    descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET)
+		panic ("null_bypass: no vp's in map.\n");
+#endif
+
+	/*
+	 * Map the vnodes going in.
+	 * Later, we'll invoke the operation based on
+	 * the first mapped vnode's operation vector.
+	 */
+	reles = descp->vdesc_flags;	/* bit i set: the op vrele's its i-th vnode argument */
+	for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
+		if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
+			break;   /* bail out at end of list */
+		vps_p[i] = this_vp_p = 
+			VOPARG_OFFSETTO(struct vnode**,descp->vdesc_vp_offsets[i],ap);
+		/*
+		 * We're not guaranteed that any but the first vnode
+		 * are of our type.  Check for and don't map any
+		 * that aren't.  (We must always map first vp or vclean fails.)
+		 */
+		if (i && (*this_vp_p)->v_op != null_vnodeop_p) {
+			old_vps[i] = NULL;
+		} else {
+			old_vps[i] = *this_vp_p;
+			*(vps_p[i]) = NULLVPTOLOWERVP(*this_vp_p);
+			/*
+			 * XXX - Several operations have the side effect
+			 * of vrele'ing their vp's.  We must account for
+			 * that.  (This should go away in the future.)
+			 * Note the slot now holds the lower vnode, so that is what gets VREF'd.
+			 */
+			if (reles & 1)
+				VREF(*this_vp_p);
+		}
+			
+	}
+
+	/*
+	 * Call the operation on the lower layer
+	 * with the modified argument structure.
+	 */
+	error = VCALL(*(vps_p[0]), descp->vdesc_offset, ap);
+
+	/*
+	 * Maintain the illusion of call-by-value
+	 * by restoring vnodes in the argument structure
+	 * to their original value.
+	 */
+	reles = descp->vdesc_flags;
+	for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
+		if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
+			break;   /* bail out at end of list */
+		if (old_vps[i]) {
+			*(vps_p[i]) = old_vps[i];
+			if (reles & 1)
+				vrele(*(vps_p[i]));	/* slot is restored, so this drops the upper vnode */
+		}
+	}
+
+	/*
+	 * Map the possible out-going vpp
+	 * (Assumes that the lower layer always returns
+	 * a VREF'ed vpp unless it gets an error.)
+	 */
+	if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET &&
+	    !(descp->vdesc_flags & VDESC_NOMAP_VPP) &&
+	    !error) {
+		/*
+		 * XXX - even though some ops have vpp returned vp's,
+		 * several ops actually vrele this before returning.
+		 * We must avoid these ops.
+		 * (This should go away when these ops are regularized.)
+		 */
+		if (descp->vdesc_flags & VDESC_VPP_WILLRELE)
+			goto out;
+		vppp = VOPARG_OFFSETTO(struct vnode***,
+				 descp->vdesc_vpp_offset,ap);
+		error = null_node_create(old_vps[0]->v_mount, **vppp, *vppp);
+	}
+
+ out:
+	return (error);
+}
+
+
+/*
+ * We handle getattr only to change the fsid to that of the null mount.
+ */
+int
+null_getattr(ap)
+	struct vop_getattr_args /* {
+		struct vnode *a_vp;
+		struct vattr *a_vap;
+		struct ucred *a_cred;
+		struct proc *a_p;
+	} */ *ap;
+{
+	int error;
+	if (error = null_bypass(ap))
+		return (error);
+	/* Requires that arguments be restored. */
+	ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0];
+	return (0);
+}
+
+
+int
+null_inactive(ap)
+	struct vop_inactive_args /* {
+		struct vnode *a_vp;
+	} */ *ap;
+{
+	/*
+	 * Do nothing (and _don't_ bypass).
+	 * Wait to vrele lowervp until reclaim,
+	 * so that until then our null_node is in the
+	 * cache and reusable.
+	 *
+	 * NEEDSWORK: Someday, consider inactive'ing
+	 * the lowervp and then trying to reactivate it
+	 * with capabilities (v_id)
+	 * like they do in the name lookup cache code.
+	 * That's too much work for now.
+	 */
+	return (0);
+}
+
+int
+null_reclaim(ap)
+	struct vop_reclaim_args /* {
+		struct vnode *a_vp;
+	} */ *ap;
+{
+	struct vnode *vp = ap->a_vp;
+	struct null_node *xp = VTONULL(vp);
+	struct vnode *lowervp = xp->null_lowervp;
+
+	/*
+	 * Note: in vop_reclaim, vp->v_op == dead_vnodeop_p,
+	 * so we can't call VOPs on ourself.
+	 */
+	/* After this assignment, this node will not be re-used. */
+	xp->null_lowervp = NULL;
+	remque(xp);			/* unlink from the hash chain */
+	FREE(vp->v_data, M_TEMP);
+	vp->v_data = NULL;
+	vrele (lowervp);		/* drop the reference held on the lower vnode */
+	return (0);
+}
+
+
+int
+null_print(ap)
+	struct vop_print_args /* {
+		struct vnode *a_vp;
+	} */ *ap;
+{
+	register struct vnode *vp = ap->a_vp;
+	printf ("\ttag VT_NULLFS, vp=%x, lowervp=%x\n", vp, NULLVPTOLOWERVP(vp));
+	return (0);
+}
+
+
+/*
+ * XXX - vop_strategy must be hand coded because it has no
+ * vnode in its arguments.
+ * This goes away with a merged VM/buffer cache.
+ */
+int
+null_strategy(ap)
+	struct vop_strategy_args /* {
+		struct buf *a_bp;
+	} */ *ap;
+{
+	struct buf *bp = ap->a_bp;
+	int error;
+	struct vnode *savedvp;
+
+	savedvp = bp->b_vp;
+	bp->b_vp = NULLVPTOLOWERVP(bp->b_vp);	/* point the buffer at the lower vnode for the call */
+
+	error = VOP_STRATEGY(bp);
+
+	bp->b_vp = savedvp;			/* restore the null-layer vnode */
+
+	return (error);
+}
+
+
+/*
+ * XXX - like vop_strategy, vop_bwrite must be hand coded because it has no
+ * vnode in its arguments.
+ * This goes away with a merged VM/buffer cache.
+ */ +int +null_bwrite(ap) + struct vop_bwrite_args /* { + struct buf *a_bp; + } */ *ap; +{ + struct buf *bp = ap->a_bp; + int error; + struct vnode *savedvp; + + savedvp = bp->b_vp; + bp->b_vp = NULLVPTOLOWERVP(bp->b_vp); + + error = VOP_BWRITE(bp); + + bp->b_vp = savedvp; + + return (error); +} + +/* + * Global vfs data structures + */ +int (**null_vnodeop_p)(); +struct vnodeopv_entry_desc null_vnodeop_entries[] = { + { &vop_default_desc, null_bypass }, + + { &vop_getattr_desc, null_getattr }, + { &vop_inactive_desc, null_inactive }, + { &vop_reclaim_desc, null_reclaim }, + { &vop_print_desc, null_print }, + + { &vop_strategy_desc, null_strategy }, + { &vop_bwrite_desc, null_bwrite }, + + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc null_vnodeop_opv_desc = + { &null_vnodeop_p, null_vnodeop_entries }; diff --git a/sys/miscfs/portal/portal.h b/sys/miscfs/portal/portal.h new file mode 100644 index 00000000000..38d7ee0cdd2 --- /dev/null +++ b/sys/miscfs/portal/portal.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)portal.h 8.4 (Berkeley) 1/21/94 + * + * $Id: portal.h,v 1.3 1992/05/30 10:05:24 jsp Exp jsp $ + */ + +struct portal_args { + char *pa_config; /* Config file */ + int pa_socket; /* Socket to server */ +}; + +struct portal_cred { + int pcr_flag; /* File open mode */ + uid_t pcr_uid; /* From ucred */ + short pcr_ngroups; /* From ucred */ + gid_t pcr_groups[NGROUPS]; /* From ucred */ +}; + +#ifdef KERNEL +struct portalmount { + struct vnode *pm_root; /* Root node */ + struct file *pm_server; /* Held reference to server socket */ +}; + +struct portalnode { + int pt_size; /* Length of Arg */ + char *pt_arg; /* Arg to send to server */ + int pt_fileid; /* cookie */ +}; + +#define VFSTOPORTAL(mp) ((struct portalmount *)((mp)->mnt_data)) +#define VTOPORTAL(vp) ((struct portalnode *)(vp)->v_data) + +#define PORTAL_ROOTFILEID 2 + +extern int (**portal_vnodeop_p)(); +extern struct vfsops portal_vfsops; +#endif /* KERNEL */ diff --git a/sys/miscfs/portal/portal_vfsops.c b/sys/miscfs/portal/portal_vfsops.c 
new file mode 100644 index 00000000000..39e8563009b --- /dev/null +++ b/sys/miscfs/portal/portal_vfsops.c @@ -0,0 +1,313 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)portal_vfsops.c 8.6 (Berkeley) 1/21/94 + * + * $Id: portal_vfsops.c,v 1.5 1992/05/30 10:25:27 jsp Exp jsp $ + */ + +/* + * Portal Filesystem + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int +portal_init() +{ + + return (0); +} + +/* + * Mount the per-process file descriptors (/dev/fd) + */ +int +portal_mount(mp, path, data, ndp, p) + struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + struct file *fp; + struct portal_args args; + struct portalmount *fmp; + struct socket *so; + struct vnode *rvp; + u_int size; + int error; + + /* + * Update is a no-op + */ + if (mp->mnt_flag & MNT_UPDATE) + return (EOPNOTSUPP); + + if (error = copyin(data, (caddr_t) &args, sizeof(struct portal_args))) + return (error); + + if (error = getsock(p->p_fd, args.pa_socket, &fp)) + return (error); + so = (struct socket *) fp->f_data; + if (so->so_proto->pr_domain->dom_family != AF_UNIX) + return (ESOCKTNOSUPPORT); + + error = getnewvnode(VT_PORTAL, mp, portal_vnodeop_p, &rvp); /* XXX */ + if (error) + return (error); + MALLOC(rvp->v_data, void *, sizeof(struct portalnode), + M_TEMP, M_WAITOK); + + fmp = (struct portalmount *) malloc(sizeof(struct portalmount), + M_UFSMNT, M_WAITOK); /* XXX */ + rvp->v_type = VDIR; + rvp->v_flag |= VROOT; + VTOPORTAL(rvp)->pt_arg = 0; + 
VTOPORTAL(rvp)->pt_size = 0; + VTOPORTAL(rvp)->pt_fileid = PORTAL_ROOTFILEID; + fmp->pm_root = rvp; + fmp->pm_server = fp; fp->f_count++; + + mp->mnt_flag |= MNT_LOCAL; + mp->mnt_data = (qaddr_t) fmp; + getnewfsid(mp, MOUNT_PORTAL); + + (void)copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); + bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); + (void)copyinstr(args.pa_config, + mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + +#ifdef notdef + bzero(mp->mnt_stat.f_mntfromname, MNAMELEN); + bcopy("portal", mp->mnt_stat.f_mntfromname, sizeof("portal")); +#endif + + return (0); +} + +int +portal_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + + return (0); +} + +int +portal_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + extern int doforce; + struct vnode *rootvp = VFSTOPORTAL(mp)->pm_root; + int error, flags = 0; + + + if (mntflags & MNT_FORCE) { + /* portal can never be rootfs so don't check for it */ + if (!doforce) + return (EINVAL); + flags |= FORCECLOSE; + } + + /* + * Clear out buffer cache. I don't think we + * ever get anything cached at this level at the + * moment, but who knows... + */ +#ifdef notyet + mntflushbuf(mp, 0); + if (mntinvalbuf(mp, 1)) + return (EBUSY); +#endif + if (rootvp->v_usecount > 1) + return (EBUSY); + if (error = vflush(mp, rootvp, flags)) + return (error); + + /* + * Release reference on underlying root vnode + */ + vrele(rootvp); + /* + * And blow it away for future re-use + */ + vgone(rootvp); + /* + * Shutdown the socket. This will cause the select in the + * daemon to wake up, and then the accept will get ECONNABORTED + * which it interprets as a request to go and bury itself. + */ + soshutdown((struct socket *) VFSTOPORTAL(mp)->pm_server->f_data, 2); + /* + * Discard reference to underlying file. Must call closef because + * this may be the last reference. 
+ */ + closef(VFSTOPORTAL(mp)->pm_server, (struct proc *) 0); + /* + * Finally, throw away the portalmount structure + */ + free(mp->mnt_data, M_UFSMNT); /* XXX */ + mp->mnt_data = 0; + return (0); +} + +int +portal_root(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + struct vnode *vp; + + + /* + * Return locked reference to root. + */ + vp = VFSTOPORTAL(mp)->pm_root; + VREF(vp); + VOP_LOCK(vp); + *vpp = vp; + return (0); +} + +int +portal_quotactl(mp, cmd, uid, arg, p) + struct mount *mp; + int cmd; + uid_t uid; + caddr_t arg; + struct proc *p; +{ + + return (EOPNOTSUPP); +} + +int +portal_statfs(mp, sbp, p) + struct mount *mp; + struct statfs *sbp; + struct proc *p; +{ + + sbp->f_type = MOUNT_PORTAL; + sbp->f_flags = 0; + sbp->f_bsize = DEV_BSIZE; + sbp->f_iosize = DEV_BSIZE; + sbp->f_blocks = 2; /* 1K to keep df happy */ + sbp->f_bfree = 0; + sbp->f_bavail = 0; + sbp->f_files = 1; /* Allow for "." */ + sbp->f_ffree = 0; /* See comments above */ + if (sbp != &mp->mnt_stat) { + bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); + bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); + bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); + } + return (0); +} + +int +portal_sync(mp, waitfor) + struct mount *mp; + int waitfor; +{ + + return (0); +} + +int +portal_vget(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + + return (EOPNOTSUPP); +} + +int +portal_fhtovp(mp, fhp, vpp) + struct mount *mp; + struct fid *fhp; + struct vnode **vpp; +{ + + return (EOPNOTSUPP); +} + +int +portal_vptofh(vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + + return (EOPNOTSUPP); +} + +struct vfsops portal_vfsops = { + portal_mount, + portal_start, + portal_unmount, + portal_root, + portal_quotactl, + portal_statfs, + portal_sync, + portal_vget, + portal_fhtovp, + portal_vptofh, + portal_init, +}; diff --git a/sys/miscfs/portal/portal_vnops.c b/sys/miscfs/portal/portal_vnops.c new file mode 100644 index 
00000000000..5e170261e71 --- /dev/null +++ b/sys/miscfs/portal/portal_vnops.c @@ -0,0 +1,707 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)portal_vnops.c 8.8 (Berkeley) 1/21/94 + * + * $Id: portal_vnops.c,v 1.4 1992/05/30 10:05:24 jsp Exp jsp $ + */ + +/* + * Portal Filesystem + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int portal_fileid = PORTAL_ROOTFILEID+1; + +static void +portal_closefd(p, fd) + struct proc *p; + int fd; +{ + int error; + struct { + int fd; + } ua; + int rc; + + ua.fd = fd; + error = close(p, &ua, &rc); + /* + * We should never get an error, and there isn't anything + * we could do if we got one, so just print a message. + */ + if (error) + printf("portal_closefd: error = %d\n", error); +} + +/* + * vp is the current namei directory + * cnp is the name to locate in that directory... 
+ */ +int +portal_lookup(ap) + struct vop_lookup_args /* { + struct vnode * a_dvp; + struct vnode ** a_vpp; + struct componentname * a_cnp; + } */ *ap; +{ + char *pname = ap->a_cnp->cn_nameptr; + struct portalnode *pt; + int error; + struct vnode *fvp = 0; + char *path; + int size; + + if (ap->a_cnp->cn_namelen == 1 && *pname == '.') { + *ap->a_vpp = ap->a_dvp; + VREF(ap->a_dvp); + /*VOP_LOCK(ap->a_dvp);*/ + return (0); + } + + + error = getnewvnode(VT_PORTAL, ap->a_dvp->v_mount, portal_vnodeop_p, &fvp); + if (error) + goto bad; + fvp->v_type = VREG; + MALLOC(fvp->v_data, void *, sizeof(struct portalnode), + M_TEMP, M_WAITOK); + + pt = VTOPORTAL(fvp); + /* + * Save all of the remaining pathname and + * advance the namei next pointer to the end + * of the string. + */ + for (size = 0, path = pname; *path; path++) + size++; + ap->a_cnp->cn_consume = size - ap->a_cnp->cn_namelen; + + pt->pt_arg = malloc(size+1, M_TEMP, M_WAITOK); + pt->pt_size = size+1; + bcopy(pname, pt->pt_arg, pt->pt_size); + pt->pt_fileid = portal_fileid++; + + *ap->a_vpp = fvp; + /*VOP_LOCK(fvp);*/ + return (0); + +bad:; + if (fvp) { + vrele(fvp); + } + *ap->a_vpp = NULL; + return (error); +} + +static int +portal_connect(so, so2) + struct socket *so; + struct socket *so2; +{ + /* from unp_connect, bypassing the namei stuff... 
*/ + struct socket *so3; + struct unpcb *unp2; + struct unpcb *unp3; + + if (so2 == 0) + return (ECONNREFUSED); + + if (so->so_type != so2->so_type) + return (EPROTOTYPE); + + if ((so2->so_options & SO_ACCEPTCONN) == 0) + return (ECONNREFUSED); + + if ((so3 = sonewconn(so2, 0)) == 0) + return (ECONNREFUSED); + + unp2 = sotounpcb(so2); + unp3 = sotounpcb(so3); + if (unp2->unp_addr) + unp3->unp_addr = m_copy(unp2->unp_addr, 0, (int)M_COPYALL); + + so2 = so3; + + + return (unp_connect2(so, so2)); +} + +int +portal_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct socket *so = 0; + struct portalnode *pt; + struct proc *p = ap->a_p; + struct vnode *vp = ap->a_vp; + int s; + struct uio auio; + struct iovec aiov[2]; + int res; + struct mbuf *cm = 0; + struct cmsghdr *cmsg; + int newfds; + int *ip; + int fd; + int error; + int len; + struct portalmount *fmp; + struct file *fp; + struct portal_cred pcred; + + /* + * Nothing to do when opening the root node. + */ + if (vp->v_flag & VROOT) + return (0); + + /* + * Can't be opened unless the caller is set up + * to deal with the side effects. Check for this + * by testing whether the p_dupfd has been set. + */ + if (p->p_dupfd >= 0) + return (ENODEV); + + pt = VTOPORTAL(vp); + fmp = VFSTOPORTAL(vp->v_mount); + + /* + * Create a new socket. + */ + error = socreate(AF_UNIX, &so, SOCK_STREAM, 0); + if (error) + goto bad; + + /* + * Reserve some buffer space + */ + res = pt->pt_size + sizeof(pcred) + 512; /* XXX */ + error = soreserve(so, res, res); + if (error) + goto bad; + + /* + * Kick off connection + */ + error = portal_connect(so, (struct socket *)fmp->pm_server->f_data); + if (error) + goto bad; + + /* + * Wait for connection to complete + */ + /* + * XXX: Since the mount point is holding a reference on the + * underlying server socket, it is not easy to find out whether + * the server process is still running. 
To handle this problem + * we loop waiting for the new socket to be connected (something + * which will only happen if the server is still running) or for + * the reference count on the server socket to drop to 1, which + * will happen if the server dies. Sleep for 5 second intervals + * and keep polling the reference count. XXX. + */ + s = splnet(); + while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { + if (fmp->pm_server->f_count == 1) { + error = ECONNREFUSED; + splx(s); + goto bad; + } + (void) tsleep((caddr_t) &so->so_timeo, PSOCK, "portalcon", 5 * hz); + } + splx(s); + + if (so->so_error) { + error = so->so_error; + goto bad; + } + + /* + * Set miscellaneous flags + */ + so->so_rcv.sb_timeo = 0; + so->so_snd.sb_timeo = 0; + so->so_rcv.sb_flags |= SB_NOINTR; + so->so_snd.sb_flags |= SB_NOINTR; + + + pcred.pcr_flag = ap->a_mode; + pcred.pcr_uid = ap->a_cred->cr_uid; + pcred.pcr_ngroups = ap->a_cred->cr_ngroups; + bcopy(ap->a_cred->cr_groups, pcred.pcr_groups, NGROUPS * sizeof(gid_t)); + aiov[0].iov_base = (caddr_t) &pcred; + aiov[0].iov_len = sizeof(pcred); + aiov[1].iov_base = pt->pt_arg; + aiov[1].iov_len = pt->pt_size; + auio.uio_iov = aiov; + auio.uio_iovcnt = 2; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_procp = p; + auio.uio_offset = 0; + auio.uio_resid = aiov[0].iov_len + aiov[1].iov_len; + + error = sosend(so, (struct mbuf *) 0, &auio, + (struct mbuf *) 0, (struct mbuf *) 0, 0); + if (error) + goto bad; + + len = auio.uio_resid = sizeof(int); + do { + struct mbuf *m = 0; + int flags = MSG_WAITALL; + error = soreceive(so, (struct mbuf **) 0, &auio, + &m, &cm, &flags); + if (error) + goto bad; + + /* + * Grab an error code from the mbuf. + */ + if (m) { + m = m_pullup(m, sizeof(int)); /* Needed? 
*/ + if (m) { + error = *(mtod(m, int *)); + m_freem(m); + } else { + error = EINVAL; + } + } else { + if (cm == 0) { + error = ECONNRESET; /* XXX */ +#ifdef notdef + break; +#endif + } + } + } while (cm == 0 && auio.uio_resid == len && !error); + + if (cm == 0) + goto bad; + + if (auio.uio_resid) { + error = 0; +#ifdef notdef + error = EMSGSIZE; + goto bad; +#endif + } + + /* + * XXX: Break apart the control message, and retrieve the + * received file descriptor. Note that more than one descriptor + * may have been received, or that the rights chain may have more + * than a single mbuf in it. What to do? + */ + cmsg = mtod(cm, struct cmsghdr *); + newfds = (cmsg->cmsg_len - sizeof(*cmsg)) / sizeof (int); + if (newfds == 0) { + error = ECONNREFUSED; + goto bad; + } + /* + * At this point the rights message consists of a control message + * header, followed by a data region containing a vector of + * integer file descriptors. The fds were allocated by the action + * of receiving the control message. + */ + ip = (int *) (cmsg + 1); + fd = *ip++; + if (newfds > 1) { + /* + * Close extra fds. + */ + int i; + printf("portal_open: %d extra fds\n", newfds - 1); + for (i = 1; i < newfds; i++) { + portal_closefd(p, *ip); + ip++; + } + } + + /* + * Check that the mode the file is being opened for is a subset + * of the mode of the existing descriptor. + */ + fp = p->p_fd->fd_ofiles[fd]; + if (((ap->a_mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { + portal_closefd(p, fd); + error = EACCES; + goto bad; + } + + /* + * Save the dup fd in the proc structure then return the + * special error code (ENXIO) which causes magic things to + * happen in vn_open. The whole concept is, well, hmmm. + */ + p->p_dupfd = fd; + error = ENXIO; + +bad:; + /* + * And discard the control message. 
+	 */
+	if (cm) {
+		m_freem(cm);
+	}
+
+	if (so) {
+		soshutdown(so, 2);
+		soclose(so);
+	}
+	return (error);
+}
+
+/*
+ * Fill in the attributes of a portal vnode.
+ *
+ * The vattr is cleared and then populated with fixed values: owner and
+ * group 0, the fsid of the mount, and DEV_BSIZE for both the size and
+ * the block size.  The access, modification and change times are all
+ * set to the single timestamp obtained from microtime().
+ *
+ * The root vnode (VROOT) is reported as a directory, mode rwx for all,
+ * with two links and fileid PORTAL_ROOTFILEID.  Every other vnode is
+ * reported as a regular file, mode rw for all, with one link and the
+ * fileid stored in its portalnode.
+ *
+ * Always returns 0.
+ */
+int
+portal_getattr(ap)
+	struct vop_getattr_args /* {
+		struct vnode *a_vp;
+		struct vattr *a_vap;
+		struct ucred *a_cred;
+		struct proc *a_p;
+	} */ *ap;
+{
+	struct vnode *vp = ap->a_vp;
+	struct vattr *vap = ap->a_vap;
+
+	bzero(vap, sizeof(*vap));
+	vattr_null(vap);
+	vap->va_uid = 0;
+	vap->va_gid = 0;
+	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
+	vap->va_size = DEV_BSIZE;
+	vap->va_blocksize = DEV_BSIZE;
+	microtime(&vap->va_atime);
+	vap->va_mtime = vap->va_atime;
+	/*
+	 * ctime used to be assigned to itself here, so it never
+	 * received the timestamp; copy it from atime like mtime.
+	 */
+	vap->va_ctime = vap->va_atime;
+	vap->va_gen = 0;
+	vap->va_flags = 0;
+	vap->va_rdev = 0;
+	/* vap->va_qbytes = 0; */
+	vap->va_bytes = 0;
+	/* vap->va_qsize = 0; */
+	if (vp->v_flag & VROOT) {
+		vap->va_type = VDIR;
+		vap->va_mode = S_IRUSR|S_IWUSR|S_IXUSR|
+				S_IRGRP|S_IWGRP|S_IXGRP|
+				S_IROTH|S_IWOTH|S_IXOTH;
+		vap->va_nlink = 2;
+		/* Same value (2) that the mount code stores for the root. */
+		vap->va_fileid = PORTAL_ROOTFILEID;
+	} else {
+		vap->va_type = VREG;
+		vap->va_mode = S_IRUSR|S_IWUSR|
+				S_IRGRP|S_IWGRP|
+				S_IROTH|S_IWOTH;
+		vap->va_nlink = 1;
+		vap->va_fileid = VTOPORTAL(vp)->pt_fileid;
+	}
+	return (0);
+}
+
+/*
+ * Attribute changes are accepted but not recorded anywhere, except on
+ * the root vnode, where the request is refused with EACCES.
+ */
+int
+portal_setattr(ap)
+	struct vop_setattr_args /* {
+		struct vnode *a_vp;
+		struct vattr *a_vap;
+		struct ucred *a_cred;
+		struct proc *a_p;
+	} */ *ap;
+{
+
+	/*
+	 * Can't mess with the root vnode
+	 */
+	if (ap->a_vp->v_flag & VROOT)
+		return (EACCES);
+
+	return (0);
+}
+
+/*
+ * Fake readdir, just return empty directory.
+ * It is hard to deal with '.' and '..' so don't bother.
+ */
+int
+portal_readdir(ap)
+	struct vop_readdir_args /* {
+		struct vnode *a_vp;
+		struct uio *a_uio;
+		struct ucred *a_cred;
+	} */ *ap;
+{
+
+	/* Nothing is transferred into a_uio; the caller sees no entries. */
+	return (0);
+}
+
+/*
+ * Nothing to do when a portal vnode becomes inactive; the per-node
+ * data is released in portal_reclaim.
+ */
+int
+portal_inactive(ap)
+	struct vop_inactive_args /* {
+		struct vnode *a_vp;
+	} */ *ap;
+{
+
+	return (0);
+}
+
+/*
+ * Release the private data of a portal vnode: first the saved
+ * argument string, if there is one, then the portalnode itself.
+ * v_data is cleared afterwards.  Always returns 0.
+ */
+int
+portal_reclaim(ap)
+	struct vop_reclaim_args /* {
+		struct vnode *a_vp;
+	} */ *ap;
+{
+	struct portalnode *pt = VTOPORTAL(ap->a_vp);
+
+	if (pt->pt_arg) {
+		free((caddr_t) pt->pt_arg, M_TEMP);
+		pt->pt_arg = 0;
+	}
+	FREE(ap->a_vp->v_data, M_TEMP);
+	ap->a_vp->v_data = 0;
+
+	return (0);
+}
+
+/*
+ * Return POSIX pathconf information applicable to special devices.
+ *
+ * Stores the limit selected by a_name in *a_retval and returns 0, or
+ * returns EINVAL (leaving *a_retval untouched) for any other name.
+ * The return type is spelled out to match the other vnode operations
+ * instead of relying on implicit int.
+ */
+int
+portal_pathconf(ap)
+	struct vop_pathconf_args /* {
+		struct vnode *a_vp;
+		int a_name;
+		int *a_retval;
+	} */ *ap;
+{
+
+	switch (ap->a_name) {
+	case _PC_LINK_MAX:
+		*ap->a_retval = LINK_MAX;
+		return (0);
+	case _PC_MAX_CANON:
+		*ap->a_retval = MAX_CANON;
+		return (0);
+	case _PC_MAX_INPUT:
+		*ap->a_retval = MAX_INPUT;
+		return (0);
+	case _PC_PIPE_BUF:
+		*ap->a_retval = PIPE_BUF;
+		return (0);
+	case _PC_CHOWN_RESTRICTED:
+		*ap->a_retval = 1;
+		return (0);
+	case _PC_VDISABLE:
+		*ap->a_retval = _POSIX_VDISABLE;
+		return (0);
+	default:
+		return (EINVAL);
+	}
+	/* NOTREACHED */
+}
+
+/*
+ * Print out the contents of a Portal vnode.
+ */ +/* ARGSUSED */ +int +portal_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + printf("tag VT_PORTAL, portal vnode\n"); + return (0); +} + +/*void*/ +int +portal_vfree(ap) + struct vop_vfree_args /* { + struct vnode *a_pvp; + ino_t a_ino; + int a_mode; + } */ *ap; +{ + + return (0); +} + + +/* + * Portal vnode unsupported operation + */ +int +portal_enotsupp() +{ + + return (EOPNOTSUPP); +} + +/* + * Portal "should never get here" operation + */ +int +portal_badop() +{ + + panic("portal: bad op"); + /* NOTREACHED */ +} + +/* + * Portal vnode null operation + */ +int +portal_nullop() +{ + + return (0); +} + +#define portal_create ((int (*) __P((struct vop_create_args *)))portal_enotsupp) +#define portal_mknod ((int (*) __P((struct vop_mknod_args *)))portal_enotsupp) +#define portal_close ((int (*) __P((struct vop_close_args *)))nullop) +#define portal_access ((int (*) __P((struct vop_access_args *)))nullop) +#define portal_read ((int (*) __P((struct vop_read_args *)))portal_enotsupp) +#define portal_write ((int (*) __P((struct vop_write_args *)))portal_enotsupp) +#define portal_ioctl ((int (*) __P((struct vop_ioctl_args *)))portal_enotsupp) +#define portal_select ((int (*) __P((struct vop_select_args *)))portal_enotsupp) +#define portal_mmap ((int (*) __P((struct vop_mmap_args *)))portal_enotsupp) +#define portal_fsync ((int (*) __P((struct vop_fsync_args *)))nullop) +#define portal_seek ((int (*) __P((struct vop_seek_args *)))nullop) +#define portal_remove ((int (*) __P((struct vop_remove_args *)))portal_enotsupp) +#define portal_link ((int (*) __P((struct vop_link_args *)))portal_enotsupp) +#define portal_rename ((int (*) __P((struct vop_rename_args *)))portal_enotsupp) +#define portal_mkdir ((int (*) __P((struct vop_mkdir_args *)))portal_enotsupp) +#define portal_rmdir ((int (*) __P((struct vop_rmdir_args *)))portal_enotsupp) +#define portal_symlink \ + ((int (*) __P((struct vop_symlink_args *)))portal_enotsupp) +#define 
portal_readlink \ + ((int (*) __P((struct vop_readlink_args *)))portal_enotsupp) +#define portal_abortop ((int (*) __P((struct vop_abortop_args *)))nullop) +#define portal_lock ((int (*) __P((struct vop_lock_args *)))nullop) +#define portal_unlock ((int (*) __P((struct vop_unlock_args *)))nullop) +#define portal_bmap ((int (*) __P((struct vop_bmap_args *)))portal_badop) +#define portal_strategy \ + ((int (*) __P((struct vop_strategy_args *)))portal_badop) +#define portal_islocked ((int (*) __P((struct vop_islocked_args *)))nullop) +#define portal_advlock \ + ((int (*) __P((struct vop_advlock_args *)))portal_enotsupp) +#define portal_blkatoff \ + ((int (*) __P((struct vop_blkatoff_args *)))portal_enotsupp) +#define portal_valloc ((int(*) __P(( \ + struct vnode *pvp, \ + int mode, \ + struct ucred *cred, \ + struct vnode **vpp))) portal_enotsupp) +#define portal_truncate \ + ((int (*) __P((struct vop_truncate_args *)))portal_enotsupp) +#define portal_update ((int (*) __P((struct vop_update_args *)))portal_enotsupp) +#define portal_bwrite ((int (*) __P((struct vop_bwrite_args *)))portal_enotsupp) + +int (**portal_vnodeop_p)(); +struct vnodeopv_entry_desc portal_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, portal_lookup }, /* lookup */ + { &vop_create_desc, portal_create }, /* create */ + { &vop_mknod_desc, portal_mknod }, /* mknod */ + { &vop_open_desc, portal_open }, /* open */ + { &vop_close_desc, portal_close }, /* close */ + { &vop_access_desc, portal_access }, /* access */ + { &vop_getattr_desc, portal_getattr }, /* getattr */ + { &vop_setattr_desc, portal_setattr }, /* setattr */ + { &vop_read_desc, portal_read }, /* read */ + { &vop_write_desc, portal_write }, /* write */ + { &vop_ioctl_desc, portal_ioctl }, /* ioctl */ + { &vop_select_desc, portal_select }, /* select */ + { &vop_mmap_desc, portal_mmap }, /* mmap */ + { &vop_fsync_desc, portal_fsync }, /* fsync */ + { &vop_seek_desc, portal_seek }, /* seek */ + { 
&vop_remove_desc, portal_remove }, /* remove */ + { &vop_link_desc, portal_link }, /* link */ + { &vop_rename_desc, portal_rename }, /* rename */ + { &vop_mkdir_desc, portal_mkdir }, /* mkdir */ + { &vop_rmdir_desc, portal_rmdir }, /* rmdir */ + { &vop_symlink_desc, portal_symlink }, /* symlink */ + { &vop_readdir_desc, portal_readdir }, /* readdir */ + { &vop_readlink_desc, portal_readlink }, /* readlink */ + { &vop_abortop_desc, portal_abortop }, /* abortop */ + { &vop_inactive_desc, portal_inactive }, /* inactive */ + { &vop_reclaim_desc, portal_reclaim }, /* reclaim */ + { &vop_lock_desc, portal_lock }, /* lock */ + { &vop_unlock_desc, portal_unlock }, /* unlock */ + { &vop_bmap_desc, portal_bmap }, /* bmap */ + { &vop_strategy_desc, portal_strategy }, /* strategy */ + { &vop_print_desc, portal_print }, /* print */ + { &vop_islocked_desc, portal_islocked }, /* islocked */ + { &vop_pathconf_desc, portal_pathconf }, /* pathconf */ + { &vop_advlock_desc, portal_advlock }, /* advlock */ + { &vop_blkatoff_desc, portal_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, portal_valloc }, /* valloc */ + { &vop_vfree_desc, portal_vfree }, /* vfree */ + { &vop_truncate_desc, portal_truncate }, /* truncate */ + { &vop_update_desc, portal_update }, /* update */ + { &vop_bwrite_desc, portal_bwrite }, /* bwrite */ + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc portal_vnodeop_opv_desc = + { &portal_vnodeop_p, portal_vnodeop_entries }; diff --git a/sys/miscfs/procfs/README b/sys/miscfs/procfs/README new file mode 100644 index 00000000000..38811b3f6e3 --- /dev/null +++ b/sys/miscfs/procfs/README @@ -0,0 +1,113 @@ +saute procfs lyonnais + +procfs supports two levels of directory. the filesystem root +directory contains a representation of the system process table. +this consists of an entry for each active and zombie process, and +an additional entry "curproc" which always represents the process +making the lookup request. 
+ +each of the sub-directories contains several files. these files +are used to control and interrogate processes. the files implemented +are: + + file - xxx. the exec'ed file. + + status - r/o. returns process status. + + ctl - w/o. sends a control message to the process. + for example: + echo hup > /proc/curproc/ctl + will send a SIGHUP to the shell. + whereas + echo attach > /proc/1293/ctl + would set up process 1293 for debugging. + see below for more details. + + mem - r/w. virtual memory image of the process. + parts of the address space are readable + only if they exist in the target process. + a more reasonable alternative might be + to return zero pages instead of an error. + comments? + + note - w/o. writing a string here sends the + equivalent note to the process. + [ not implemented. ] + + notepg - w/o. the same as note, but sends to all + members of the process group. + [ not implemented. ] + + regs - r/w. process register set. this can be read + or written any time even if the process + is not stopped. since the bsd kernel + is single-processor, this implementation + will get the "right" register values. + a multi-proc kernel would need to do some + synchronisation.
+ +this then looks like: + +% ls -li /proc +total 0 + 9 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 0 + 17 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 1 + 89 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 10 + 25 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 2 +2065 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 257 +2481 dr-xr-xr-x 2 jsp staff 0 Sep 21 15:06 309 + 265 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 32 +3129 dr-xr-xr-x 2 jsp staff 0 Sep 21 15:06 390 +3209 dr-xr-xr-x 2 jsp staff 0 Sep 21 15:06 400 +3217 dr-xr-xr-x 2 jsp staff 0 Sep 21 15:06 401 +3273 dr-xr-xr-x 2 jsp staff 0 Sep 21 15:06 408 + 393 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 48 + 409 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 50 + 465 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 57 + 481 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 59 + 537 dr-xr-xr-x 2 root kmem 0 Sep 21 15:06 66 + 545 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 67 + 657 dr-xr-xr-x 2 jsp staff 0 Sep 21 15:06 81 + 665 dr-xr-xr-x 2 jsp staff 0 Sep 21 15:06 82 + 673 dr-xr-xr-x 2 jsp staff 0 Sep 21 15:06 83 + 681 dr-xr-xr-x 2 root wheel 0 Sep 21 15:06 84 +3273 dr-xr-xr-x 2 jsp staff 0 Sep 21 15:06 curproc +% ls -li /proc/curproc +total 408 +3341 --w------- 1 jsp staff 0 Sep 21 15:06 ctl +1554 -r-xr-xr-x 1 bin bin 90112 Mar 29 04:52 file +3339 -rw------- 1 jsp staff 118784 Sep 21 15:06 mem +3343 --w------- 1 jsp staff 0 Sep 21 15:06 note +3344 --w------- 1 jsp staff 0 Sep 21 15:06 notepg +3340 -rw------- 1 jsp staff 0 Sep 21 15:06 regs +3342 -r--r--r-- 1 jsp staff 0 Sep 21 15:06 status +% df /proc/curproc /proc/curproc/file +Filesystem 512-blocks Used Avail Capacity Mounted on +proc 2 2 0 100% /proc +/dev/wd0a 16186 13548 1018 93% / +% cat /proc/curproc/status +cat 446 439 400 81 12,0 ctty 748620684 270000 0 0 0 20000 nochan 11 20 20 20 0 21 117 + + + +the basic sequence of commands written to "ctl" would be + + attach - this stops the target process and + arranges for the sending process + to become the debug control process + wait - wait for the target process to come to + a steady 
state ready for debugging. + step - single step, with no signal delivery. + run - continue running, with no signal delivery, + until next trap or breakpoint. + <signame> - deliver signal <signame> and continue running. + detach - continue execution of the target process + and remove it from control by the debug process + +in a normal debugging environment, where the target is fork/exec'd by +the debugger, the debugger should fork and the child should stop itself +(with a self-inflicted SIGSTOP). the parent should do a "wait" then an +"attach". as before, the child will hit a breakpoint on the first +instruction in any newly exec'd image. + +$Id: README,v 3.1 1993/12/15 09:40:17 jsp Exp $ diff --git a/sys/miscfs/procfs/procfs.h b/sys/miscfs/procfs/procfs.h new file mode 100644 index 00000000000..f7b8fa3ef0e --- /dev/null +++ b/sys/miscfs/procfs/procfs.h @@ -0,0 +1,186 @@ +/* + * Copyright (c) 1993 Jan-Simon Pendry + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4.
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)procfs.h 8.6 (Berkeley) 2/3/94 + * + * From: + * $Id: procfs.h,v 3.2 1993/12/15 09:40:17 jsp Exp $ + */ + +/* + * The different types of node in a procfs filesystem + */ +typedef enum { + Proot, /* the filesystem root */ + Pproc, /* a process-specific sub-directory */ + Pfile, /* the executable file */ + Pmem, /* the process's memory image */ + Pregs, /* the process's register set */ + Pfpregs, /* the process's FP register set */ + Pctl, /* process control */ + Pstatus, /* process status */ + Pnote, /* process notifier */ + Pnotepg /* process group notifier */ +} pfstype; + +/* + * control data for the proc file system. 
+ */ +struct pfsnode { + struct pfsnode *pfs_next; /* next on list */ + struct vnode *pfs_vnode; /* vnode associated with this pfsnode */ + pfstype pfs_type; /* type of procfs node */ + pid_t pfs_pid; /* associated process */ + u_short pfs_mode; /* mode bits for stat() */ + u_long pfs_flags; /* open flags */ + u_long pfs_fileno; /* unique file id */ +}; + +#define PROCFS_NOTELEN 64 /* max length of a note (/proc/$pid/note) */ +#define PROCFS_CTLLEN 8 /* max length of a ctl msg (/proc/$pid/ctl) */ + +/* + * Kernel stuff follows + */ +#ifdef KERNEL +#define CNEQ(cnp, s, len) \ + ((cnp)->cn_namelen == (len) && \ + (bcmp((s), (cnp)->cn_nameptr, (len)) == 0)) + +/* + * Format of a directory entry in /proc, ... + * This must map onto struct dirent (see ) + */ +#define PROCFS_NAMELEN 8 +struct pfsdent { + u_long d_fileno; + u_short d_reclen; + u_char d_type; + u_char d_namlen; + char d_name[PROCFS_NAMELEN]; +}; +#define UIO_MX sizeof(struct pfsdent) +#define PROCFS_FILENO(pid, type) \ + (((type) == Proot) ? \ + 2 : \ + ((((pid)+1) << 3) + ((int) (type)))) + +/* + * Convert between pfsnode and vnode + */ +#define VTOPFS(vp) ((struct pfsnode *)(vp)->v_data) +#define PFSTOV(pfs) ((pfs)->pfs_vnode) + +typedef struct vfs_namemap vfs_namemap_t; +struct vfs_namemap { + const char *nm_name; + int nm_val; +}; + +extern int vfs_getuserstr __P((struct uio *, char *, int *)); +extern vfs_namemap_t *vfs_findname __P((vfs_namemap_t *, char *, int)); + +/* forward declarations; this header only uses pointers to these types */ +struct reg; +struct fpreg; + +#define PFIND(pid) ((pid) ?
pfind(pid) : &proc0) +extern int procfs_freevp __P((struct vnode *)); +extern int procfs_allocvp __P((struct mount *, struct vnode **, long, pfstype)); +extern struct vnode *procfs_findtextvp __P((struct proc *)); +extern int procfs_sstep __P((struct proc *)); +extern void procfs_fix_sstep __P((struct proc *)); +extern int procfs_read_regs __P((struct proc *, struct reg *)); +extern int procfs_write_regs __P((struct proc *, struct reg *)); +extern int procfs_read_fpregs __P((struct proc *, struct fpreg *)); +extern int procfs_write_fpregs __P((struct proc *, struct fpreg *)); +extern int procfs_donote __P((struct proc *, struct proc *, struct pfsnode *pfsp, struct uio *uio)); +extern int procfs_doregs __P((struct proc *, struct proc *, struct pfsnode *pfsp, struct uio *uio)); +extern int procfs_dofpregs __P((struct proc *, struct proc *, struct pfsnode *pfsp, struct uio *uio)); +extern int procfs_domem __P((struct proc *, struct proc *, struct pfsnode *pfsp, struct uio *uio)); +extern int procfs_doctl __P((struct proc *, struct proc *, struct pfsnode *pfsp, struct uio *uio)); +extern int procfs_dostatus __P((struct proc *, struct proc *, struct pfsnode *pfsp, struct uio *uio)); + +#define PROCFS_LOCKED 0x01 +#define PROCFS_WANT 0x02 + +extern int (**procfs_vnodeop_p)(); +extern struct vfsops procfs_vfsops; + +/* + * Prototypes for procfs vnode ops + */ +int procfs_badop(); /* varargs */ +int procfs_rw __P((struct vop_read_args *)); +int procfs_lookup __P((struct vop_lookup_args *)); +#define procfs_create ((int (*) __P((struct vop_create_args *))) procfs_badop) +#define procfs_mknod ((int (*) __P((struct vop_mknod_args *))) procfs_badop) +int procfs_open __P((struct vop_open_args *)); +int procfs_close __P((struct vop_close_args *)); +int procfs_access __P((struct vop_access_args *)); +int procfs_getattr __P((struct vop_getattr_args *)); +int procfs_setattr __P((struct vop_setattr_args *)); +#define procfs_read procfs_rw +#define procfs_write procfs_rw +int 
procfs_ioctl __P((struct vop_ioctl_args *)); +#define procfs_select ((int (*) __P((struct vop_select_args *))) procfs_badop) +#define procfs_mmap ((int (*) __P((struct vop_mmap_args *))) procfs_badop) +#define procfs_fsync ((int (*) __P((struct vop_fsync_args *))) procfs_badop) +#define procfs_seek ((int (*) __P((struct vop_seek_args *))) procfs_badop) +#define procfs_remove ((int (*) __P((struct vop_remove_args *))) procfs_badop) +#define procfs_link ((int (*) __P((struct vop_link_args *))) procfs_badop) +#define procfs_rename ((int (*) __P((struct vop_rename_args *))) procfs_badop) +#define procfs_mkdir ((int (*) __P((struct vop_mkdir_args *))) procfs_badop) +#define procfs_rmdir ((int (*) __P((struct vop_rmdir_args *))) procfs_badop) +#define procfs_symlink ((int (*) __P((struct vop_symlink_args *))) procfs_badop) +int procfs_readdir __P((struct vop_readdir_args *)); +#define procfs_readlink ((int (*) __P((struct vop_readlink_args *))) procfs_badop) +int procfs_abortop __P((struct vop_abortop_args *)); +int procfs_inactive __P((struct vop_inactive_args *)); +int procfs_reclaim __P((struct vop_reclaim_args *)); +#define procfs_lock ((int (*) __P((struct vop_lock_args *))) nullop) +#define procfs_unlock ((int (*) __P((struct vop_unlock_args *))) nullop) +int procfs_bmap __P((struct vop_bmap_args *)); +#define procfs_strategy ((int (*) __P((struct vop_strategy_args *))) procfs_badop) +int procfs_print __P((struct vop_print_args *)); +#define procfs_islocked ((int (*) __P((struct vop_islocked_args *))) nullop) +#define procfs_advlock ((int (*) __P((struct vop_advlock_args *))) procfs_badop) +#define procfs_blkatoff ((int (*) __P((struct vop_blkatoff_args *))) procfs_badop) +#define procfs_valloc ((int (*) __P((struct vop_valloc_args *))) procfs_badop) +#define procfs_vfree ((int (*) __P((struct vop_vfree_args *))) nullop) +#define procfs_truncate ((int (*) __P((struct vop_truncate_args *))) procfs_badop) +#define procfs_update ((int (*) __P((struct vop_update_args 
*))) nullop) +#endif /* KERNEL */ diff --git a/sys/miscfs/procfs/procfs_ctl.c b/sys/miscfs/procfs/procfs_ctl.c new file mode 100644 index 00000000000..a42a03ce91c --- /dev/null +++ b/sys/miscfs/procfs/procfs_ctl.c @@ -0,0 +1,302 @@ +/* + * Copyright (c) 1993 Jan-Simon Pendry + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)procfs_ctl.c 8.3 (Berkeley) 1/21/94 + * + * From: + * $Id: procfs_ctl.c,v 3.2 1993/12/15 09:40:17 jsp Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * True iff process (p) is in trace wait state + * relative to process (curp) + */ +#define TRACE_WAIT_P(curp, p) \ + ((p)->p_stat == SSTOP && \ + (p)->p_pptr == (curp) && \ + ((p)->p_flag & P_TRACED)) + +#ifdef notdef +#define FIX_SSTEP(p) { \ + procfs_fix_sstep(p); \ + } \ +} +#else +#define FIX_SSTEP(p) +#endif + +#define PROCFS_CTL_ATTACH 1 +#define PROCFS_CTL_DETACH 2 +#define PROCFS_CTL_STEP 3 +#define PROCFS_CTL_RUN 4 +#define PROCFS_CTL_WAIT 5 + +static vfs_namemap_t ctlnames[] = { + /* special /proc commands */ + { "attach", PROCFS_CTL_ATTACH }, + { "detach", PROCFS_CTL_DETACH }, + { "step", PROCFS_CTL_STEP }, + { "run", PROCFS_CTL_RUN }, + { "wait", PROCFS_CTL_WAIT }, + { 0 }, +}; + +static vfs_namemap_t signames[] = { + /* regular signal names */ + { "hup", SIGHUP }, { "int", SIGINT }, + { "quit", SIGQUIT }, { "ill", SIGILL }, + { "trap", SIGTRAP }, { "abrt", SIGABRT }, + { "iot", SIGIOT }, { "emt", SIGEMT }, + { "fpe", SIGFPE }, { "kill", SIGKILL }, + { "bus", SIGBUS }, { "segv", SIGSEGV }, + { "sys", SIGSYS }, { "pipe", SIGPIPE }, + { "alrm", SIGALRM }, { "term", SIGTERM }, + { "urg", SIGURG }, { "stop", SIGSTOP }, + { "tstp", SIGTSTP }, { "cont", SIGCONT }, + { "chld", SIGCHLD }, { 
"ttin", SIGTTIN }, + { "ttou", SIGTTOU }, { "io", SIGIO }, + { "xcpu", SIGXCPU }, { "xfsz", SIGXFSZ }, + { "vtalrm", SIGVTALRM }, { "prof", SIGPROF }, + { "winch", SIGWINCH }, { "info", SIGINFO }, + { "usr1", SIGUSR1 }, { "usr2", SIGUSR2 }, + { 0 }, +}; + +static int +procfs_control(curp, p, op) + struct proc *curp; + struct proc *p; + int op; +{ + int error; + + /* + * Attach - attaches the target process for debugging + * by the calling process. + */ + if (op == PROCFS_CTL_ATTACH) { + /* check whether already being traced */ + if (p->p_flag & P_TRACED) + return (EBUSY); + + /* can't trace yourself! */ + if (p->p_pid == curp->p_pid) + return (EINVAL); + + /* + * Go ahead and set the trace flag. + * Save the old parent (it's reset in + * _DETACH, and also in kern_exit.c:wait4() + * Reparent the process so that the tracing + * proc gets to see all the action. + * Stop the target. + */ + p->p_flag |= P_TRACED; + p->p_xstat = 0; /* XXX ? */ + if (p->p_pptr != curp) { + p->p_oppid = p->p_pptr->p_pid; + proc_reparent(p, curp); + } + psignal(p, SIGSTOP); + return (0); + } + + /* + * Target process must be stopped, owned by (curp) and + * be set up for tracing (P_TRACED flag set). + * Allow DETACH to take place at any time for sanity. + * Allow WAIT any time, of course. + */ + switch (op) { + case PROCFS_CTL_DETACH: + case PROCFS_CTL_WAIT: + break; + + default: + if (!TRACE_WAIT_P(curp, p)) + return (EBUSY); + } + + /* + * do single-step fixup if needed + */ + FIX_SSTEP(p); + + /* + * Don't deliver any signal by default. + * To continue with a signal, just send + * the signal name to the ctl file + */ + p->p_xstat = 0; + + switch (op) { + /* + * Detach. Cleans up the target process, reparent it if possible + * and set it running once more. 
+ */ + case PROCFS_CTL_DETACH: + /* if not being traced, then this is a painless no-op */ + if ((p->p_flag & P_TRACED) == 0) + return (0); + + /* not being traced any more */ + p->p_flag &= ~P_TRACED; + + /* give process back to original parent */ + if (p->p_oppid != p->p_pptr->p_pid) { + struct proc *pp; + + pp = pfind(p->p_oppid); + if (pp) + proc_reparent(p, pp); + } + + p->p_oppid = 0; + p->p_flag &= ~P_WAITED; /* XXX ? */ + wakeup((caddr_t) curp); /* XXX for CTL_WAIT below ? */ + + break; + + /* + * Step. Let the target process execute a single instruction. + */ + case PROCFS_CTL_STEP: + procfs_sstep(p); + break; + + /* + * Run. Let the target process continue running until a breakpoint + * or some other trap. + */ + case PROCFS_CTL_RUN: + break; + + /* + * Wait for the target process to stop. + * If the target is not being traced then just wait + * to enter + */ + case PROCFS_CTL_WAIT: + error = 0; + if (p->p_flag & P_TRACED) { + while (error == 0 && + (p->p_stat != SSTOP) && + (p->p_flag & P_TRACED) && + (p->p_pptr == curp)) { + error = tsleep((caddr_t) p, + PWAIT|PCATCH, "procfsx", 0); + } + if (error == 0 && !TRACE_WAIT_P(curp, p)) + error = EBUSY; + } else { + while (error == 0 && p->p_stat != SSTOP) { + error = tsleep((caddr_t) p, + PWAIT|PCATCH, "procfs", 0); + } + } + return (error); + + default: + panic("procfs_control"); + } + + if (p->p_stat == SSTOP) + setrunnable(p); + return (0); +} + +int +procfs_doctl(curp, p, pfs, uio) + struct proc *curp; + struct pfsnode *pfs; + struct uio *uio; + struct proc *p; +{ + int xlen; + int error; + char msg[PROCFS_CTLLEN+1]; + vfs_namemap_t *nm; + + if (uio->uio_rw != UIO_WRITE) + return (EOPNOTSUPP); + + xlen = PROCFS_CTLLEN; + error = vfs_getuserstr(uio, msg, &xlen); + if (error) + return (error); + + /* + * Map signal names into signal generation + * or debug control. Unknown commands and/or signals + * return EOPNOTSUPP. 
+ * + * Sending a signal while the process is being debugged + * also has the side effect of letting the target continue + * to run. There is no way to single-step a signal delivery. + */ + error = EOPNOTSUPP; + + nm = vfs_findname(ctlnames, msg, xlen); + if (nm) { + error = procfs_control(curp, p, nm->nm_val); + } else { + nm = vfs_findname(signames, msg, xlen); + if (nm) { + if (TRACE_WAIT_P(curp, p)) { + p->p_xstat = nm->nm_val; + FIX_SSTEP(p); + setrunnable(p); + } else { + psignal(p, nm->nm_val); + } + error = 0; + } + } + + return (error); +} diff --git a/sys/miscfs/procfs/procfs_fpregs.c b/sys/miscfs/procfs/procfs_fpregs.c new file mode 100644 index 00000000000..6d850a6a881 --- /dev/null +++ b/sys/miscfs/procfs/procfs_fpregs.c @@ -0,0 +1,87 @@ +/* + * Copyright (c) 1993 Jan-Simon Pendry + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)procfs_fpregs.c 8.1 (Berkeley) 1/27/94 + * + * From: + * $Id: procfs_regs.c,v 3.2 1993/12/15 09:40:17 jsp Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int +procfs_dofpregs(curp, p, pfs, uio) + struct proc *curp; + struct proc *p; + struct pfsnode *pfs; + struct uio *uio; +{ + int error; + struct fpreg r; + char *kv; + int kl; + + kl = sizeof(r); + kv = (char *) &r; + + kv += uio->uio_offset; + kl -= uio->uio_offset; + if (kl > uio->uio_resid) + kl = uio->uio_resid; + + if (kl < 0) + error = EINVAL; + else + error = procfs_read_fpregs(p, &r); + if (error == 0) + error = uiomove(kv, kl, uio); + if (error == 0 && uio->uio_rw == UIO_WRITE) { + if (p->p_stat != SSTOP) + error = EBUSY; + else + error = procfs_write_fpregs(p, &r); + } + + uio->uio_offset = 0; + return (error); +} diff --git a/sys/miscfs/procfs/procfs_mem.c b/sys/miscfs/procfs/procfs_mem.c new file mode 100644 index 00000000000..039983da09c --- /dev/null +++ b/sys/miscfs/procfs/procfs_mem.c @@ -0,0 +1,302 @@ +/* + * Copyright (c) 1993 Jan-Simon Pendry + * Copyright (c) 1993 Sean Eric Fagan + * Copyright (c) 1993 + * The Regents of the University of California. 
All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry and Sean Eric Fagan. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)procfs_mem.c 8.4 (Berkeley) 1/21/94 + * + * From: + * $Id: procfs_mem.c,v 3.2 1993/12/15 09:40:17 jsp Exp $ + */ + +/* + * This is a lightly hacked and merged version + * of sef's pread/pwrite functions + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int +procfs_rwmem(p, uio) + struct proc *p; + struct uio *uio; +{ + int error; + int writing; + + writing = uio->uio_rw == UIO_WRITE; + + /* + * Only map in one page at a time. We don't have to, but it + * makes things easier. This way is trivial - right? + */ + do { + vm_map_t map, tmap; + vm_object_t object; + vm_offset_t kva; + vm_offset_t uva; + int page_offset; /* offset into page */ + vm_offset_t pageno; /* page number */ + vm_map_entry_t out_entry; + vm_prot_t out_prot; + vm_page_t m; + boolean_t wired, single_use; + vm_offset_t off; + u_int len; + int fix_prot; + + uva = (vm_offset_t) uio->uio_offset; + if (uva > VM_MAXUSER_ADDRESS) { + error = 0; + break; + } + + /* + * Get the page number of this segment. + */ + pageno = trunc_page(uva); + page_offset = uva - pageno; + + /* + * How many bytes to copy + */ + len = min(PAGE_SIZE - page_offset, uio->uio_resid); + + /* + * The map we want... + */ + map = &p->p_vmspace->vm_map; + + /* + * Check the permissions for the area we're interested + * in. + */ + fix_prot = 0; + if (writing) + fix_prot = !vm_map_check_protection(map, pageno, + pageno + PAGE_SIZE, VM_PROT_WRITE); + + if (fix_prot) { + /* + * If the page is not writable, we make it so. + * XXX It is possible that a page may *not* be + * read/executable, if a process changes that! + * We will assume, for now, that a page is either + * VM_PROT_ALL, or VM_PROT_READ|VM_PROT_EXECUTE. + */ + error = vm_map_protect(map, pageno, + pageno + PAGE_SIZE, VM_PROT_ALL, 0); + if (error) + break; + } + + /* + * Now we need to get the page. out_entry, out_prot, wired, + * and single_use aren't used. 
One would think the vm code + * would be a *bit* nicer... We use tmap because + * vm_map_lookup() can change the map argument. + */ + tmap = map; + error = vm_map_lookup(&tmap, pageno, + writing ? VM_PROT_WRITE : VM_PROT_READ, + &out_entry, &object, &off, &out_prot, + &wired, &single_use); + /* + * We're done with tmap now. + */ + if (!error) + vm_map_lookup_done(tmap, out_entry); + + /* + * Fault the page in... + */ + if (!error && writing && object->shadow) { + m = vm_page_lookup(object, off); + if (m == 0 || (m->flags & PG_COPYONWRITE)) + error = vm_fault(map, pageno, + VM_PROT_WRITE, FALSE); + } + + /* Find space in kernel_map for the page we're interested in */ + if (!error) + error = vm_map_find(kernel_map, object, off, &kva, + PAGE_SIZE, 1); + + if (!error) { + /* + * Neither vm_map_lookup() nor vm_map_find() appear + * to add a reference count to the object, so we do + * that here and now. + */ + vm_object_reference(object); + + /* + * Mark the page we just found as pageable. + */ + error = vm_map_pageable(kernel_map, kva, + kva + PAGE_SIZE, 0); + + /* + * Now do the i/o move. + */ + if (!error) + error = uiomove(kva + page_offset, len, uio); + + vm_map_remove(kernel_map, kva, kva + PAGE_SIZE); + } + if (fix_prot) + vm_map_protect(map, pageno, pageno + PAGE_SIZE, + VM_PROT_READ|VM_PROT_EXECUTE, 0); + } while (error == 0 && uio->uio_resid > 0); + + return (error); +} + +/* + * Copy data in and out of the target process. + * We do this by mapping the process's page into + * the kernel and then doing a uiomove direct + * from the kernel address space. + */ +int +procfs_domem(curp, p, pfs, uio) + struct proc *curp; + struct proc *p; + struct pfsnode *pfs; + struct uio *uio; +{ + int error; + + if (uio->uio_resid == 0) + return (0); + + error = procfs_rwmem(p, uio); + + return (error); +} + +/* + * Given process (p), find the vnode from which + * it's text segment is being executed. 
+ * + * It would be nice to grab this information from + * the VM system, however, there is no sure-fire + * way of doing that. Instead, fork(), exec() and + * wait() all maintain the p_textvp field in the + * process proc structure which contains a held + * reference to the exec'ed vnode. + */ +struct vnode * +procfs_findtextvp(p) + struct proc *p; +{ + return (p->p_textvp); +} + + +#ifdef probably_never +/* + * Given process (p), find the vnode from which + * it's text segment is being mapped. + * + * (This is here, rather than in procfs_subr in order + * to keep all the VM related code in one place.) + */ +struct vnode * +procfs_findtextvp(p) + struct proc *p; +{ + int error; + vm_object_t object; + vm_offset_t pageno; /* page number */ + + /* find a vnode pager for the user address space */ + + for (pageno = VM_MIN_ADDRESS; + pageno < VM_MAXUSER_ADDRESS; + pageno += PAGE_SIZE) { + vm_map_t map; + vm_map_entry_t out_entry; + vm_prot_t out_prot; + boolean_t wired, single_use; + vm_offset_t off; + + map = &p->p_vmspace->vm_map; + error = vm_map_lookup(&map, pageno, + VM_PROT_READ, + &out_entry, &object, &off, &out_prot, + &wired, &single_use); + + if (!error) { + vm_pager_t pager; + + printf("procfs: found vm object\n"); + vm_map_lookup_done(map, out_entry); + printf("procfs: vm object = %x\n", object); + + /* + * At this point, assuming no errors, object + * is the VM object mapping UVA (pageno). + * Ensure it has a vnode pager, then grab + * the vnode from that pager's handle. 
+ */ + + pager = object->pager; + printf("procfs: pager = %x\n", pager); + if (pager) + printf("procfs: found pager, type = %d\n", pager->pg_type); + if (pager && pager->pg_type == PG_VNODE) { + struct vnode *vp; + + vp = (struct vnode *) pager->pg_handle; + printf("procfs: vp = 0x%x\n", vp); + return (vp); + } + } + } + + printf("procfs: text object not found\n"); + return (0); +} +#endif /* probably_never */ diff --git a/sys/miscfs/procfs/procfs_note.c b/sys/miscfs/procfs/procfs_note.c new file mode 100644 index 00000000000..bf2f160baa0 --- /dev/null +++ b/sys/miscfs/procfs/procfs_note.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 1993 Jan-Simon Pendry + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)procfs_note.c 8.2 (Berkeley) 1/21/94 + * + * From: + * $Id: procfs_note.c,v 3.2 1993/12/15 09:40:17 jsp Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int +procfs_donote(curp, p, pfs, uio) + struct proc *curp; + struct proc *p; + struct pfsnode *pfs; + struct uio *uio; +{ + int xlen; + int error; + char note[PROCFS_NOTELEN+1]; + + if (uio->uio_rw != UIO_WRITE) + return (EINVAL); + + xlen = PROCFS_NOTELEN; + error = vfs_getuserstr(uio, note, &xlen); + if (error) + return (error); + + /* send to process's notify function */ + return (EOPNOTSUPP); +} diff --git a/sys/miscfs/procfs/procfs_regs.c b/sys/miscfs/procfs/procfs_regs.c new file mode 100644 index 00000000000..fa95fef8f10 --- /dev/null +++ b/sys/miscfs/procfs/procfs_regs.c @@ -0,0 +1,87 @@ +/* + * Copyright (c) 1993 Jan-Simon Pendry + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)procfs_regs.c 8.3 (Berkeley) 1/27/94 + * + * From: + * $Id: procfs_regs.c,v 3.2 1993/12/15 09:40:17 jsp Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int +procfs_doregs(curp, p, pfs, uio) + struct proc *curp; + struct proc *p; + struct pfsnode *pfs; + struct uio *uio; +{ + int error; + struct reg r; + char *kv; + int kl; + + kl = sizeof(r); + kv = (char *) &r; + + kv += uio->uio_offset; + kl -= uio->uio_offset; + if (kl > uio->uio_resid) + kl = uio->uio_resid; + + if (kl < 0) + error = EINVAL; + else + error = procfs_read_regs(p, &r); + if (error == 0) + error = uiomove(kv, kl, uio); + if (error == 0 && uio->uio_rw == UIO_WRITE) { + if (p->p_stat != SSTOP) + error = EBUSY; + else + error = procfs_write_regs(p, &r); + } + + uio->uio_offset = 0; + return (error); +} diff --git a/sys/miscfs/procfs/procfs_status.c b/sys/miscfs/procfs/procfs_status.c new file mode 100644 index 00000000000..d88aaabdfb0 --- /dev/null +++ b/sys/miscfs/procfs/procfs_status.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 1993 Jan-Simon Pendry + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)procfs_status.c 8.3 (Berkeley) 2/17/94 + * + * From: + * $Id: procfs_status.c,v 3.1 1993/12/15 09:40:17 jsp Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int +procfs_dostatus(curp, p, pfs, uio) + struct proc *curp; + struct proc *p; + struct pfsnode *pfs; + struct uio *uio; +{ + struct session *sess; + struct tty *tp; + struct ucred *cr; + char *ps; + char *sep; + int pid, ppid, pgid, sid; + int i; + int xlen; + int error; + char psbuf[256]; /* XXX - conservative */ + + if (uio->uio_rw != UIO_READ) + return (EOPNOTSUPP); + + pid = p->p_pid; + ppid = p->p_pptr ? p->p_pptr->p_pid : 0, + pgid = p->p_pgrp->pg_id; + sess = p->p_pgrp->pg_session; + sid = sess->s_leader ? 
sess->s_leader->p_pid : 0; + +/* comm pid ppid pgid sid maj,min ctty,sldr start ut st wmsg uid groups ... */ + + ps = psbuf; + bcopy(p->p_comm, ps, MAXCOMLEN); + ps[MAXCOMLEN] = '\0'; + ps += strlen(ps); + ps += sprintf(ps, " %d %d %d %d ", pid, ppid, pgid, sid); + + if ((p->p_flag&P_CONTROLT) && (tp = sess->s_ttyp)) + ps += sprintf(ps, "%d,%d ", major(tp->t_dev), minor(tp->t_dev)); + else + ps += sprintf(ps, "%d,%d ", -1, -1); + + sep = ""; + if (sess->s_ttyvp) { + ps += sprintf(ps, "%sctty", sep); + sep = ","; + } + if (SESS_LEADER(p)) { + ps += sprintf(ps, "%ssldr", sep); + sep = ","; + } + if (*sep != ',') + ps += sprintf(ps, "noflags"); + + if (p->p_flag & P_INMEM) + ps += sprintf(ps, " %d,%d", + p->p_stats->p_start.tv_sec, + p->p_stats->p_start.tv_usec); + else + ps += sprintf(ps, " -1,-1"); + + { + struct timeval ut, st; + + calcru(p, &ut, &st, (void *) 0); + ps += sprintf(ps, " %d,%d %d,%d", + ut.tv_sec, + ut.tv_usec, + st.tv_sec, + st.tv_usec); + } + + ps += sprintf(ps, " %s", + (p->p_wchan && p->p_wmesg) ? p->p_wmesg : "nochan"); + + cr = p->p_ucred; + + ps += sprintf(ps, " %d", cr->cr_uid, cr->cr_gid); + for (i = 0; i < cr->cr_ngroups; i++) + ps += sprintf(ps, ",%d", cr->cr_groups[i]); + ps += sprintf(ps, "\n"); + + xlen = ps - psbuf; + xlen -= uio->uio_offset; + ps = psbuf + uio->uio_offset; + xlen = min(xlen, uio->uio_resid); + if (xlen <= 0) + error = 0; + else + error = uiomove(ps, xlen, uio); + + return (error); +} diff --git a/sys/miscfs/procfs/procfs_subr.c b/sys/miscfs/procfs/procfs_subr.c new file mode 100644 index 00000000000..b371af19af0 --- /dev/null +++ b/sys/miscfs/procfs/procfs_subr.c @@ -0,0 +1,314 @@ +/* + * Copyright (c) 1993 Jan-Simon Pendry + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)procfs_subr.c 8.4 (Berkeley) 1/27/94 + * + * From: + * $Id: procfs_subr.c,v 3.2 1993/12/15 09:40:17 jsp Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct pfsnode *pfshead; +static int pfsvplock; + +/* + * allocate a pfsnode/vnode pair. the vnode is + * referenced, but not locked. + * + * the pid, pfs_type, and mount point uniquely + * identify a pfsnode. the mount point is needed + * because someone might mount this filesystem + * twice. + * + * all pfsnodes are maintained on a singly-linked + * list. new nodes are only allocated when they cannot + * be found on this list. entries on the list are + * removed when the vfs reclaim entry is called. + * + * a single lock is kept for the entire list. this is + * needed because the getnewvnode() function can block + * waiting for a vnode to become free, in which case there + * may be more than one process trying to get the same + * vnode. this lock is only taken if we are going to + * call getnewvnode, since the kernel itself is single-threaded. + * + * if an entry is found on the list, then call vget() to + * take a reference. this is done because there may be + * zero references to it and so it needs to removed from + * the vnode free list. + */ +int +procfs_allocvp(mp, vpp, pid, pfs_type) + struct mount *mp; + struct vnode **vpp; + long pid; + pfstype pfs_type; +{ + int error; + struct pfsnode *pfs; + struct pfsnode **pp; + +loop: + for (pfs = pfshead; pfs != 0; pfs = pfs->pfs_next) { + if (pfs->pfs_pid == pid && + pfs->pfs_type == pfs_type && + PFSTOV(pfs)->v_mount == mp) { + if (vget(pfs->pfs_vnode, 0)) + goto loop; + *vpp = pfs->pfs_vnode; + return (0); + } + } + + /* + * otherwise lock the vp list while we call getnewvnode + * since that can block. 
+ */ + if (pfsvplock & PROCFS_LOCKED) { + pfsvplock |= PROCFS_WANT; + sleep((caddr_t) &pfsvplock, PINOD); + goto loop; + } + pfsvplock |= PROCFS_LOCKED; + + error = getnewvnode(VT_PROCFS, mp, procfs_vnodeop_p, vpp); + if (error) + goto out; + + MALLOC((*vpp)->v_data, void *, sizeof(struct pfsnode), + M_TEMP, M_WAITOK); + + pfs = VTOPFS(*vpp); + pfs->pfs_next = 0; + pfs->pfs_pid = (pid_t) pid; + pfs->pfs_type = pfs_type; + pfs->pfs_vnode = *vpp; + pfs->pfs_flags = 0; + pfs->pfs_fileno = PROCFS_FILENO(pid, pfs_type); + + switch (pfs_type) { + case Proot: /* /proc = dr-xr-xr-x */ + pfs->pfs_mode = (VREAD|VEXEC) | + (VREAD|VEXEC) >> 3 | + (VREAD|VEXEC) >> 6; + break; + + case Pproc: + pfs->pfs_mode = (VREAD|VEXEC) | + (VREAD|VEXEC) >> 3 | + (VREAD|VEXEC) >> 6; + break; + + case Pfile: + pfs->pfs_mode = (VREAD|VWRITE); + break; + + case Pmem: + pfs->pfs_mode = (VREAD|VWRITE); + break; + + case Pregs: + pfs->pfs_mode = (VREAD|VWRITE); + break; + + case Pfpregs: + pfs->pfs_mode = (VREAD|VWRITE); + break; + + case Pctl: + pfs->pfs_mode = (VWRITE); + break; + + case Pstatus: + pfs->pfs_mode = (VREAD) | + (VREAD >> 3) | + (VREAD >> 6); + break; + + case Pnote: + pfs->pfs_mode = (VWRITE); + break; + + case Pnotepg: + pfs->pfs_mode = (VWRITE); + break; + + default: + panic("procfs_allocvp"); + } + + /* add to procfs vnode list */ + for (pp = &pfshead; *pp; pp = &(*pp)->pfs_next) + continue; + *pp = pfs; + +out: + pfsvplock &= ~PROCFS_LOCKED; + + if (pfsvplock & PROCFS_WANT) { + pfsvplock &= ~PROCFS_WANT; + wakeup((caddr_t) &pfsvplock); + } + + return (error); +} + +int +procfs_freevp(vp) + struct vnode *vp; +{ + struct pfsnode **pfspp; + struct pfsnode *pfs = VTOPFS(vp); + + for (pfspp = &pfshead; *pfspp != 0; pfspp = &(*pfspp)->pfs_next) { + if (*pfspp == pfs) { + *pfspp = pfs->pfs_next; + break; + } + } + + FREE(vp->v_data, M_TEMP); + vp->v_data = 0; + return (0); +} + +int +procfs_rw(ap) + struct vop_read_args *ap; +{ + struct vnode *vp = ap->a_vp; + struct uio *uio = 
ap->a_uio; + struct proc *curp = uio->uio_procp; + struct pfsnode *pfs = VTOPFS(vp); + struct proc *p; + + p = PFIND(pfs->pfs_pid); + if (p == 0) + return (EINVAL); + + switch (pfs->pfs_type) { + case Pnote: + case Pnotepg: + return (procfs_donote(curp, p, pfs, uio)); + + case Pregs: + return (procfs_doregs(curp, p, pfs, uio)); + + case Pfpregs: + return (procfs_dofpregs(curp, p, pfs, uio)); + + case Pctl: + return (procfs_doctl(curp, p, pfs, uio)); + + case Pstatus: + return (procfs_dostatus(curp, p, pfs, uio)); + + case Pmem: + return (procfs_domem(curp, p, pfs, uio)); + + default: + return (EOPNOTSUPP); + } +} + +/* + * Get a string from userland into (buf). Strip a trailing + * nl character (to allow easy access from the shell). + * The buffer should be *buflenp + 1 chars long. vfs_getuserstr + * will automatically add a nul char at the end. + * + * Returns 0 on success or the following errors + * + * EINVAL: file offset is non-zero. + * EMSGSIZE: message is longer than kernel buffer + * EFAULT: user i/o buffer is not addressable + */ +int +vfs_getuserstr(uio, buf, buflenp) + struct uio *uio; + char *buf; + int *buflenp; +{ + int xlen; + int error; + + if (uio->uio_offset != 0) + return (EINVAL); + + xlen = *buflenp; + + /* must be able to read the whole string in one go */ + if (xlen < uio->uio_resid) + return (EMSGSIZE); + xlen = uio->uio_resid; + + error = uiomove(buf, xlen, uio); + if (error) + return (error); + + /* allow multiple writes without seeks */ + uio->uio_offset = 0; + + /* cleanup string and remove trailing newline */ + buf[xlen] = '\0'; + xlen = strlen(buf); + if (xlen > 0 && buf[xlen-1] == '\n') + buf[--xlen] = '\0'; + *buflenp = xlen; + + return (0); +} + +vfs_namemap_t * +vfs_findname(nm, buf, buflen) + vfs_namemap_t *nm; + char *buf; + int buflen; +{ + for (; nm->nm_name; nm++) + if (bcmp(buf, (char *) nm->nm_name, buflen+1) == 0) + return (nm); + + return (0); +} diff --git a/sys/miscfs/procfs/procfs_vfsops.c 
b/sys/miscfs/procfs/procfs_vfsops.c new file mode 100644 index 00000000000..3938ca12357 --- /dev/null +++ b/sys/miscfs/procfs/procfs_vfsops.c @@ -0,0 +1,243 @@ +/* + * Copyright (c) 1993 Jan-Simon Pendry + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)procfs_vfsops.c 8.4 (Berkeley) 1/21/94 + * + * From: + * $Id: procfs_vfsops.c,v 3.1 1993/12/15 09:40:17 jsp Exp $ + */ + +/* + * procfs VFS interface + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for PAGE_SIZE */ + +/* + * VFS Operations. + * + * mount system call + */ +/* ARGSUSED */ +procfs_mount(mp, path, data, ndp, p) + struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + u_int size; + + if (UIO_MX & (UIO_MX-1)) { + log(LOG_ERR, "procfs: invalid directory entry size"); + return (EINVAL); + } + + if (mp->mnt_flag & MNT_UPDATE) + return (EOPNOTSUPP); + + mp->mnt_flag |= MNT_LOCAL; + mp->mnt_data = 0; + getnewfsid(mp, MOUNT_PROCFS); + + (void) copyinstr(path, (caddr_t)mp->mnt_stat.f_mntonname, MNAMELEN, &size); + bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); + + size = sizeof("procfs") - 1; + bcopy("procfs", mp->mnt_stat.f_mntfromname, size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + + return (0); +} + +/* + * unmount system call + */ +procfs_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + int error; + extern int doforce; + int flags = 0; + + if (mntflags & MNT_FORCE) { + /* procfs can never be rootfs so don't check for it */ + if (!doforce) + return (EINVAL); + flags |= FORCECLOSE; + } + + if (error = vflush(mp, 0, flags)) + return (error); + + 
return (0); +} + +procfs_root(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + struct pfsnode *pfs; + struct vnode *vp; + int error; + + error = procfs_allocvp(mp, &vp, (pid_t) 0, Proot); + if (error) + return (error); + + vp->v_type = VDIR; + vp->v_flag = VROOT; + pfs = VTOPFS(vp); + + *vpp = vp; + return (0); +} + +/* + */ +/* ARGSUSED */ +procfs_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + + return (0); +} + +/* + * Get file system statistics. + */ +procfs_statfs(mp, sbp, p) + struct mount *mp; + struct statfs *sbp; + struct proc *p; +{ + sbp->f_type = MOUNT_PROCFS; + sbp->f_bsize = PAGE_SIZE; + sbp->f_iosize = PAGE_SIZE; + sbp->f_blocks = 1; /* avoid divide by zero in some df's */ + sbp->f_bfree = 0; + sbp->f_bavail = 0; + sbp->f_files = maxproc; /* approx */ + sbp->f_ffree = maxproc - nprocs; /* approx */ + + if (sbp != &mp->mnt_stat) { + bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); + bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); + bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); + } + + return (0); +} + + +procfs_quotactl(mp, cmds, uid, arg, p) + struct mount *mp; + int cmds; + uid_t uid; + caddr_t arg; + struct proc *p; +{ + + return (EOPNOTSUPP); +} + +procfs_sync(mp, waitfor) + struct mount *mp; + int waitfor; +{ + + return (0); +} + +procfs_vget(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + + return (EOPNOTSUPP); +} + +procfs_fhtovp(mp, fhp, vpp) + struct mount *mp; + struct fid *fhp; + struct vnode **vpp; +{ + + return (EINVAL); +} + +procfs_vptofh(vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + + return EINVAL; +} + +procfs_init() +{ + + return (0); +} + +struct vfsops procfs_vfsops = { + procfs_mount, + procfs_start, + procfs_unmount, + procfs_root, + procfs_quotactl, + procfs_statfs, + procfs_sync, + procfs_vget, + procfs_fhtovp, + procfs_vptofh, + procfs_init, +}; diff --git a/sys/miscfs/procfs/procfs_vnops.c 
b/sys/miscfs/procfs/procfs_vnops.c new file mode 100644 index 00000000000..4e1ee002bb9 --- /dev/null +++ b/sys/miscfs/procfs/procfs_vnops.c @@ -0,0 +1,814 @@ +/* + * Copyright (c) 1993 Jan-Simon Pendry + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
/*
 * Vnode Operations.
 *
 */

/*
 * This is a list of the valid names in the
 * process-specific sub-directories.  It is
 * used in procfs_lookup and procfs_readdir
 */
static struct pfsnames {
	u_short	d_namlen;		/* length of d_name, excluding the nul */
	char	d_name[PROCFS_NAMELEN];	/* name of the entry */
	pfstype	d_pfstype;		/* node type this name maps to */
} procent[] = {
/* N(s) supplies two initializers: the length of literal (s), then (s) itself */
#define N(s) sizeof(s)-1, s
	/* namlen, nam, type */
	{ N("file"),	Pfile },
	{ N("mem"),	Pmem },
	{ N("regs"),	Pregs },
	{ N("fpregs"),	Pfpregs },
	{ N("ctl"),	Pctl },
	{ N("status"),	Pstatus },
	{ N("note"),	Pnote },
	{ N("notepg"),	Pnotepg },
#undef N
};
/* number of entries in procent[] */
#define Nprocent (sizeof(procent)/sizeof(procent[0]))

/*
 * presumably converts a name of the given length to a pid;
 * procfs_lookup treats a result of NO_PID as "no such entry".
 * NOTE(review): definition is not in view - confirm.
 */
static pid_t atopid __P((const char *, u_int));
+ */ +procfs_open(ap) + struct vop_open_args *ap; +{ + struct pfsnode *pfs = VTOPFS(ap->a_vp); + + switch (pfs->pfs_type) { + case Pmem: + if (PFIND(pfs->pfs_pid) == 0) + return (ENOENT); /* was ESRCH, jsp */ + + if ((pfs->pfs_flags & FWRITE) && (ap->a_mode & O_EXCL) || + (pfs->pfs_flags & O_EXCL) && (ap->a_mode & FWRITE)) + return (EBUSY); + + + if (ap->a_mode & FWRITE) + pfs->pfs_flags = ap->a_mode & (FWRITE|O_EXCL); + + return (0); + + default: + break; + } + + return (0); +} + +/* + * close the pfsnode (vp) after doing i/o. + * (vp) is not locked on entry or exit. + * + * nothing to do for procfs other than undo + * any exclusive open flag (see _open above). + */ +procfs_close(ap) + struct vop_close_args *ap; +{ + struct pfsnode *pfs = VTOPFS(ap->a_vp); + + switch (pfs->pfs_type) { + case Pmem: + if ((ap->a_fflag & FWRITE) && (pfs->pfs_flags & O_EXCL)) + pfs->pfs_flags &= ~(FWRITE|O_EXCL); + break; + } + + return (0); +} + +/* + * do an ioctl operation on pfsnode (vp). + * (vp) is not locked on entry or exit. + */ +procfs_ioctl(ap) + struct vop_ioctl_args *ap; +{ + + return (ENOTTY); +} + +/* + * do block mapping for pfsnode (vp). + * since we don't use the buffer cache + * for procfs this function should never + * be called. in any case, it's not clear + * what part of the kernel ever makes use + * of this function. for sanity, this is the + * usual no-op bmap, although returning + * (EIO) would be a reasonable alternative. + */ +procfs_bmap(ap) + struct vop_bmap_args *ap; +{ + + if (ap->a_vpp != NULL) + *ap->a_vpp = ap->a_vp; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn; + return (0); +} + +/* + * _inactive is called when the pfsnode + * is vrele'd and the reference count goes + * to zero. (vp) will be on the vnode free + * list, so to get it back vget() must be + * used. + * + * for procfs, check if the process is still + * alive and if it isn't then just throw away + * the vnode by calling vgone(). 
this may + * be overkill and a waste of time since the + * chances are that the process will still be + * there and PFIND is not free. + * + * (vp) is not locked on entry or exit. + */ +procfs_inactive(ap) + struct vop_inactive_args *ap; +{ + struct pfsnode *pfs = VTOPFS(ap->a_vp); + + if (PFIND(pfs->pfs_pid) == 0) + vgone(ap->a_vp); + + return (0); +} + +/* + * _reclaim is called when getnewvnode() + * wants to make use of an entry on the vnode + * free list. at this time the filesystem needs + * to free any private data and remove the node + * from any private lists. + */ +procfs_reclaim(ap) + struct vop_reclaim_args *ap; +{ + int error; + + error = procfs_freevp(ap->a_vp); + return (error); +} + +/* + * Return POSIX pathconf information applicable to special devices. + */ +procfs_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = LINK_MAX; + return (0); + case _PC_MAX_CANON: + *ap->a_retval = MAX_CANON; + return (0); + case _PC_MAX_INPUT: + *ap->a_retval = MAX_INPUT; + return (0); + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + return (0); + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + return (0); + case _PC_VDISABLE: + *ap->a_retval = _POSIX_VDISABLE; + return (0); + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * _print is used for debugging. + * just print a readable description + * of (vp). + */ +procfs_print(ap) + struct vop_print_args *ap; +{ + struct pfsnode *pfs = VTOPFS(ap->a_vp); + + printf("tag VT_PROCFS, pid %d, mode %x, flags %x\n", + pfs->pfs_pid, + pfs->pfs_mode, pfs->pfs_flags); +} + +/* + * _abortop is called when operations such as + * rename and create fail. this entry is responsible + * for undoing any side-effects caused by the lookup. + * this will always include freeing the pathname buffer. 
+ */ +procfs_abortop(ap) + struct vop_abortop_args *ap; +{ + + if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) + FREE(ap->a_cnp->cn_pnbuf, M_NAMEI); + return (0); +} + +/* + * generic entry point for unsupported operations + */ +procfs_badop() +{ + + return (EIO); +} + +/* + * Invent attributes for pfsnode (vp) and store + * them in (vap). + * Directories lengths are returned as zero since + * any real length would require the genuine size + * to be computed, and nothing cares anyway. + * + * this is relatively minimal for procfs. + */ +procfs_getattr(ap) + struct vop_getattr_args *ap; +{ + struct pfsnode *pfs = VTOPFS(ap->a_vp); + struct vattr *vap = ap->a_vap; + struct proc *procp; + int error; + + /* first check the process still exists */ + switch (pfs->pfs_type) { + case Proot: + procp = 0; + break; + + default: + procp = PFIND(pfs->pfs_pid); + if (procp == 0) + return (ENOENT); + } + + error = 0; + + /* start by zeroing out the attributes */ + VATTR_NULL(vap); + + /* next do all the common fields */ + vap->va_type = ap->a_vp->v_type; + vap->va_mode = pfs->pfs_mode; + vap->va_fileid = pfs->pfs_fileno; + vap->va_flags = 0; + vap->va_blocksize = PAGE_SIZE; + vap->va_bytes = vap->va_size = 0; + + /* + * If the process has exercised some setuid or setgid + * privilege, then rip away read/write permission so + * that only root can gain access. + */ + switch (pfs->pfs_type) { + case Pregs: + case Pfpregs: + case Pmem: + if (procp->p_flag & P_SUGID) + vap->va_mode &= ~((VREAD|VWRITE)| + ((VREAD|VWRITE)>>3)| + ((VREAD|VWRITE)>>6)); + break; + } + + /* + * Make all times be current TOD. + * It would be possible to get the process start + * time from the p_stat structure, but there's + * no "file creation" time stamp anyway, and the + * p_stat structure is not addressible if u. gets + * swapped out for that process. 
+ */ + microtime(&vap->va_ctime); + vap->va_atime = vap->va_mtime = vap->va_ctime; + + /* + * now do the object specific fields + * + * The size could be set from struct reg, but it's hardly + * worth the trouble, and it puts some (potentially) machine + * dependent data into this machine-independent code. If it + * becomes important then this function should break out into + * a per-file stat function in the corresponding .c file. + */ + + switch (pfs->pfs_type) { + case Proot: + vap->va_nlink = 2; + vap->va_uid = 0; + vap->va_gid = 0; + break; + + case Pproc: + vap->va_nlink = 2; + vap->va_uid = procp->p_ucred->cr_uid; + vap->va_gid = procp->p_ucred->cr_gid; + break; + + case Pfile: + error = EOPNOTSUPP; + break; + + case Pmem: + vap->va_nlink = 1; + vap->va_bytes = vap->va_size = + ctob(procp->p_vmspace->vm_tsize + + procp->p_vmspace->vm_dsize + + procp->p_vmspace->vm_ssize); + vap->va_uid = procp->p_ucred->cr_uid; + vap->va_gid = procp->p_ucred->cr_gid; + break; + + case Pregs: + case Pfpregs: + case Pctl: + case Pstatus: + case Pnote: + case Pnotepg: + vap->va_nlink = 1; + vap->va_uid = procp->p_ucred->cr_uid; + vap->va_gid = procp->p_ucred->cr_gid; + break; + + default: + panic("procfs_getattr"); + } + + return (error); +} + +procfs_setattr(ap) + struct vop_setattr_args *ap; +{ + /* + * just fake out attribute setting + * it's not good to generate an error + * return, otherwise things like creat() + * will fail when they try to set the + * file length to 0. worse, this means + * that echo $note > /proc/$pid/note will fail. + */ + + return (0); +} + +/* + * implement access checking. + * + * something very similar to this code is duplicated + * throughout the 4bsd kernel and should be moved + * into kern/vfs_subr.c sometime. + * + * actually, the check for super-user is slightly + * broken since it will allow read access to write-only + * objects. 
this doesn't cause any particular trouble + * but does mean that the i/o entry points need to check + * that the operation really does make sense. + */ +procfs_access(ap) + struct vop_access_args *ap; +{ + struct vattr *vap; + struct vattr vattr; + int error; + + /* + * If you're the super-user, + * you always get access. + */ + if (ap->a_cred->cr_uid == (uid_t) 0) + return (0); + vap = &vattr; + if (error = VOP_GETATTR(ap->a_vp, vap, ap->a_cred, ap->a_p)) + return (error); + + /* + * Access check is based on only one of owner, group, public. + * If not owner, then check group. If not a member of the + * group, then check public access. + */ + if (ap->a_cred->cr_uid != vap->va_uid) { + gid_t *gp; + int i; + + (ap->a_mode) >>= 3; + gp = ap->a_cred->cr_groups; + for (i = 0; i < ap->a_cred->cr_ngroups; i++, gp++) + if (vap->va_gid == *gp) + goto found; + ap->a_mode >>= 3; +found: + ; + } + + if ((vap->va_mode & ap->a_mode) == ap->a_mode) + return (0); + + return (EACCES); +} + +/* + * lookup. this is incredibly complicated in the + * general case, however for most pseudo-filesystems + * very little needs to be done. + * + * unless you want to get a migraine, just make sure your + * filesystem doesn't do any locking of its own. otherwise + * read and inwardly digest ufs_lookup(). 
+ */ +procfs_lookup(ap) + struct vop_lookup_args *ap; +{ + struct componentname *cnp = ap->a_cnp; + struct vnode **vpp = ap->a_vpp; + struct vnode *dvp = ap->a_dvp; + char *pname = cnp->cn_nameptr; + int error = 0; + pid_t pid; + struct vnode *nvp; + struct pfsnode *pfs; + struct proc *procp; + pfstype pfs_type; + int i; + + if (cnp->cn_namelen == 1 && *pname == '.') { + *vpp = dvp; + VREF(dvp); + /*VOP_LOCK(dvp);*/ + return (0); + } + + *vpp = NULL; + + pfs = VTOPFS(dvp); + switch (pfs->pfs_type) { + case Proot: + if (cnp->cn_flags & ISDOTDOT) + return (EIO); + + if (CNEQ(cnp, "curproc", 7)) + pid = cnp->cn_proc->p_pid; + else + pid = atopid(pname, cnp->cn_namelen); + if (pid == NO_PID) + return (ENOENT); + + procp = PFIND(pid); + if (procp == 0) + return (ENOENT); + + error = procfs_allocvp(dvp->v_mount, &nvp, pid, Pproc); + if (error) + return (error); + + nvp->v_type = VDIR; + pfs = VTOPFS(nvp); + + *vpp = nvp; + return (0); + + case Pproc: + if (cnp->cn_flags & ISDOTDOT) { + error = procfs_root(dvp->v_mount, vpp); + return (error); + } + + procp = PFIND(pfs->pfs_pid); + if (procp == 0) + return (ENOENT); + + for (i = 0; i < Nprocent; i++) { + struct pfsnames *dp = &procent[i]; + + if (cnp->cn_namelen == dp->d_namlen && + bcmp(pname, dp->d_name, dp->d_namlen) == 0) { + pfs_type = dp->d_pfstype; + goto found; + } + } + return (ENOENT); + + found: + if (pfs_type == Pfile) { + nvp = procfs_findtextvp(procp); + if (nvp) { + VREF(nvp); + VOP_LOCK(nvp); + } else { + error = ENXIO; + } + } else { + error = procfs_allocvp(dvp->v_mount, &nvp, + pfs->pfs_pid, pfs_type); + if (error) + return (error); + + nvp->v_type = VREG; + pfs = VTOPFS(nvp); + } + *vpp = nvp; + return (error); + + default: + return (ENOTDIR); + } +} + +/* + * readdir returns directory entries from pfsnode (vp). + * + * the strategy here with procfs is to generate a single + * directory entry at a time (struct pfsdent) and then + * copy that out to userland using uiomove. 
a more efficent + * though more complex implementation, would try to minimize + * the number of calls to uiomove(). for procfs, this is + * hardly worth the added code complexity. + * + * this should just be done through read() + */ +procfs_readdir(ap) + struct vop_readdir_args *ap; +{ + struct uio *uio = ap->a_uio; + struct pfsdent d; + struct pfsdent *dp = &d; + struct pfsnode *pfs; + int error; + int count; + int i; + + pfs = VTOPFS(ap->a_vp); + + if (uio->uio_resid < UIO_MX) + return (EINVAL); + if (uio->uio_offset & (UIO_MX-1)) + return (EINVAL); + if (uio->uio_offset < 0) + return (EINVAL); + + error = 0; + count = 0; + i = uio->uio_offset / UIO_MX; + + switch (pfs->pfs_type) { + /* + * this is for the process-specific sub-directories. + * all that is needed to is copy out all the entries + * from the procent[] table (top of this file). + */ + case Pproc: { + while (uio->uio_resid >= UIO_MX) { + struct pfsnames *dt; + + if (i >= Nprocent) + break; + + dt = &procent[i]; + + dp->d_reclen = UIO_MX; + dp->d_fileno = PROCFS_FILENO(pfs->pfs_pid, dt->d_pfstype); + dp->d_type = DT_REG; + dp->d_namlen = dt->d_namlen; + bcopy(dt->d_name, dp->d_name, sizeof(dt->d_name)-1); + error = uiomove((caddr_t) dp, UIO_MX, uio); + if (error) + break; + count += UIO_MX; + i++; + } + + break; + + } + + /* + * this is for the root of the procfs filesystem + * what is needed is a special entry for "curproc" + * followed by an entry for each process on allproc +#ifdef PROCFS_ZOMBIE + * and zombproc. 
+#endif + */ + + case Proot: { + int pcnt; +#ifdef PROCFS_ZOMBIE + int doingzomb = 0; +#endif + volatile struct proc *p; + + p = allproc; + +#define PROCFS_XFILES 1 /* number of other entries, like "curproc" */ + pcnt = PROCFS_XFILES; + + while (p && uio->uio_resid >= UIO_MX) { + bzero((char *) dp, UIO_MX); + dp->d_type = DT_DIR; + dp->d_reclen = UIO_MX; + + switch (i) { + case 0: + /* ship out entry for "curproc" */ + dp->d_fileno = PROCFS_FILENO(PID_MAX+1, Pproc); + dp->d_namlen = sprintf(dp->d_name, "curproc"); + break; + + default: + if (pcnt >= i) { + dp->d_fileno = PROCFS_FILENO(p->p_pid, Pproc); + dp->d_namlen = sprintf(dp->d_name, "%ld", (long) p->p_pid); + } + + p = p->p_next; + +#ifdef PROCFS_ZOMBIE + if (p == 0 && doingzomb == 0) { + doingzomb = 1; + p = zombproc; + } +#endif + + if (pcnt++ < i) + continue; + + break; + } + error = uiomove((caddr_t) dp, UIO_MX, uio); + if (error) + break; + count += UIO_MX; + i++; + } + + break; + + } + + default: + error = ENOTDIR; + break; + } + + uio->uio_offset = i * UIO_MX; + + return (error); +} + +/* + * convert decimal ascii to pid_t + */ +static pid_t +atopid(b, len) + const char *b; + u_int len; +{ + pid_t p = 0; + + while (len--) { + char c = *b++; + if (c < '0' || c > '9') + return (NO_PID); + p = 10 * p + (c - '0'); + if (p > PID_MAX) + return (NO_PID); + } + + return (p); +} + +/* + * procfs vnode operations. 
+ */ +int (**procfs_vnodeop_p)(); +struct vnodeopv_entry_desc procfs_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, procfs_lookup }, /* lookup */ + { &vop_create_desc, procfs_create }, /* create */ + { &vop_mknod_desc, procfs_mknod }, /* mknod */ + { &vop_open_desc, procfs_open }, /* open */ + { &vop_close_desc, procfs_close }, /* close */ + { &vop_access_desc, procfs_access }, /* access */ + { &vop_getattr_desc, procfs_getattr }, /* getattr */ + { &vop_setattr_desc, procfs_setattr }, /* setattr */ + { &vop_read_desc, procfs_read }, /* read */ + { &vop_write_desc, procfs_write }, /* write */ + { &vop_ioctl_desc, procfs_ioctl }, /* ioctl */ + { &vop_select_desc, procfs_select }, /* select */ + { &vop_mmap_desc, procfs_mmap }, /* mmap */ + { &vop_fsync_desc, procfs_fsync }, /* fsync */ + { &vop_seek_desc, procfs_seek }, /* seek */ + { &vop_remove_desc, procfs_remove }, /* remove */ + { &vop_link_desc, procfs_link }, /* link */ + { &vop_rename_desc, procfs_rename }, /* rename */ + { &vop_mkdir_desc, procfs_mkdir }, /* mkdir */ + { &vop_rmdir_desc, procfs_rmdir }, /* rmdir */ + { &vop_symlink_desc, procfs_symlink }, /* symlink */ + { &vop_readdir_desc, procfs_readdir }, /* readdir */ + { &vop_readlink_desc, procfs_readlink }, /* readlink */ + { &vop_abortop_desc, procfs_abortop }, /* abortop */ + { &vop_inactive_desc, procfs_inactive }, /* inactive */ + { &vop_reclaim_desc, procfs_reclaim }, /* reclaim */ + { &vop_lock_desc, procfs_lock }, /* lock */ + { &vop_unlock_desc, procfs_unlock }, /* unlock */ + { &vop_bmap_desc, procfs_bmap }, /* bmap */ + { &vop_strategy_desc, procfs_strategy }, /* strategy */ + { &vop_print_desc, procfs_print }, /* print */ + { &vop_islocked_desc, procfs_islocked }, /* islocked */ + { &vop_pathconf_desc, procfs_pathconf }, /* pathconf */ + { &vop_advlock_desc, procfs_advlock }, /* advlock */ + { &vop_blkatoff_desc, procfs_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, procfs_valloc }, /* valloc */ + 
{ &vop_vfree_desc, procfs_vfree }, /* vfree */ + { &vop_truncate_desc, procfs_truncate }, /* truncate */ + { &vop_update_desc, procfs_update }, /* update */ + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc procfs_vnodeop_opv_desc = + { &procfs_vnodeop_p, procfs_vnodeop_entries }; diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c new file mode 100644 index 00000000000..111c517b162 --- /dev/null +++ b/sys/miscfs/specfs/spec_vnops.c @@ -0,0 +1,689 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)spec_vnops.c 8.6 (Berkeley) 4/9/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* symbolic sleep message strings for devices */ +char devopn[] = "devopn"; +char devio[] = "devio"; +char devwait[] = "devwait"; +char devin[] = "devin"; +char devout[] = "devout"; +char devioc[] = "devioc"; +char devcls[] = "devcls"; + +int (**spec_vnodeop_p)(); +struct vnodeopv_entry_desc spec_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, spec_lookup }, /* lookup */ + { &vop_create_desc, spec_create }, /* create */ + { &vop_mknod_desc, spec_mknod }, /* mknod */ + { &vop_open_desc, spec_open }, /* open */ + { &vop_close_desc, spec_close }, /* close */ + { &vop_access_desc, spec_access }, /* access */ + { &vop_getattr_desc, spec_getattr }, /* getattr */ + { &vop_setattr_desc, spec_setattr }, /* setattr */ + { &vop_read_desc, spec_read }, /* read */ + { &vop_write_desc, spec_write }, /* write */ + { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ + { &vop_select_desc, spec_select }, /* select */ + { &vop_mmap_desc, spec_mmap }, /* mmap */ + { &vop_fsync_desc, spec_fsync }, /* fsync */ + { &vop_seek_desc, spec_seek }, /* seek */ + { &vop_remove_desc, spec_remove }, /* remove */ + { &vop_link_desc, spec_link }, /* link */ + { &vop_rename_desc, spec_rename }, /* rename */ + { &vop_mkdir_desc, spec_mkdir }, 
/* mkdir */ + { &vop_rmdir_desc, spec_rmdir }, /* rmdir */ + { &vop_symlink_desc, spec_symlink }, /* symlink */ + { &vop_readdir_desc, spec_readdir }, /* readdir */ + { &vop_readlink_desc, spec_readlink }, /* readlink */ + { &vop_abortop_desc, spec_abortop }, /* abortop */ + { &vop_inactive_desc, spec_inactive }, /* inactive */ + { &vop_reclaim_desc, spec_reclaim }, /* reclaim */ + { &vop_lock_desc, spec_lock }, /* lock */ + { &vop_unlock_desc, spec_unlock }, /* unlock */ + { &vop_bmap_desc, spec_bmap }, /* bmap */ + { &vop_strategy_desc, spec_strategy }, /* strategy */ + { &vop_print_desc, spec_print }, /* print */ + { &vop_islocked_desc, spec_islocked }, /* islocked */ + { &vop_pathconf_desc, spec_pathconf }, /* pathconf */ + { &vop_advlock_desc, spec_advlock }, /* advlock */ + { &vop_blkatoff_desc, spec_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, spec_valloc }, /* valloc */ + { &vop_vfree_desc, spec_vfree }, /* vfree */ + { &vop_truncate_desc, spec_truncate }, /* truncate */ + { &vop_update_desc, spec_update }, /* update */ + { &vop_bwrite_desc, spec_bwrite }, /* bwrite */ + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc spec_vnodeop_opv_desc = + { &spec_vnodeop_p, spec_vnodeop_entries }; + +/* + * Trivial lookup routine that always fails. + */ +int +spec_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + + *ap->a_vpp = NULL; + return (ENOTDIR); +} + +/* + * Open a special file. + */ +/* ARGSUSED */ +spec_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct vnode *bvp, *vp = ap->a_vp; + dev_t bdev, dev = (dev_t)vp->v_rdev; + register int maj = major(dev); + int error; + + /* + * Don't allow open if fs is mounted -nodev. 
+ */ + if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV)) + return (ENXIO); + + switch (vp->v_type) { + + case VCHR: + if ((u_int)maj >= nchrdev) + return (ENXIO); + if (ap->a_cred != FSCRED && (ap->a_mode & FWRITE)) { + /* + * When running in very secure mode, do not allow + * opens for writing of any disk character devices. + */ + if (securelevel >= 2 && isdisk(dev, VCHR)) + return (EPERM); + /* + * When running in secure mode, do not allow opens + * for writing of /dev/mem, /dev/kmem, or character + * devices whose corresponding block devices are + * currently mounted. + */ + if (securelevel >= 1) { + if ((bdev = chrtoblk(dev)) != NODEV && + vfinddev(bdev, VBLK, &bvp) && + bvp->v_usecount > 0 && + (error = vfs_mountedon(bvp))) + return (error); + if (iskmemdev(dev)) + return (EPERM); + } + } + VOP_UNLOCK(vp); + error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, ap->a_p); + VOP_LOCK(vp); + return (error); + + case VBLK: + if ((u_int)maj >= nblkdev) + return (ENXIO); + /* + * When running in very secure mode, do not allow + * opens for writing of any disk block devices. + */ + if (securelevel >= 2 && ap->a_cred != FSCRED && + (ap->a_mode & FWRITE) && isdisk(dev, VBLK)) + return (EPERM); + /* + * Do not allow opens of block devices that are + * currently mounted. 
+ */ + if (error = vfs_mountedon(vp)) + return (error); + return ((*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, ap->a_p)); + } + return (0); +} + +/* + * Vnode op for read + */ +/* ARGSUSED */ +spec_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct uio *uio = ap->a_uio; + struct proc *p = uio->uio_procp; + struct buf *bp; + daddr_t bn, nextbn; + long bsize, bscale; + struct partinfo dpart; + int n, on, majordev, (*ioctl)(); + int error = 0; + dev_t dev; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_READ) + panic("spec_read mode"); + if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) + panic("spec_read proc"); +#endif + if (uio->uio_resid == 0) + return (0); + + switch (vp->v_type) { + + case VCHR: + VOP_UNLOCK(vp); + error = (*cdevsw[major(vp->v_rdev)].d_read) + (vp->v_rdev, uio, ap->a_ioflag); + VOP_LOCK(vp); + return (error); + + case VBLK: + if (uio->uio_offset < 0) + return (EINVAL); + bsize = BLKDEV_IOSIZE; + dev = vp->v_rdev; + if ((majordev = major(dev)) < nblkdev && + (ioctl = bdevsw[majordev].d_ioctl) != NULL && + (*ioctl)(dev, DIOCGPART, (caddr_t)&dpart, FREAD, p) == 0 && + dpart.part->p_fstype == FS_BSDFFS && + dpart.part->p_frag != 0 && dpart.part->p_fsize != 0) + bsize = dpart.part->p_frag * dpart.part->p_fsize; + bscale = bsize / DEV_BSIZE; + do { + bn = (uio->uio_offset / DEV_BSIZE) &~ (bscale - 1); + on = uio->uio_offset % bsize; + n = min((unsigned)(bsize - on), uio->uio_resid); + if (vp->v_lastr + bscale == bn) { + nextbn = bn + bscale; + error = breadn(vp, bn, (int)bsize, &nextbn, + (int *)&bsize, 1, NOCRED, &bp); + } else + error = bread(vp, bn, (int)bsize, NOCRED, &bp); + vp->v_lastr = bn; + n = min(n, bsize - bp->b_resid); + if (error) { + brelse(bp); + return (error); + } + error = uiomove((char *)bp->b_data + on, n, uio); + if (n + on == bsize) + bp->b_flags |= B_AGE; + brelse(bp); + 
} while (error == 0 && uio->uio_resid > 0 && n != 0); + return (error); + + default: + panic("spec_read type"); + } + /* NOTREACHED */ +} + +/* + * Vnode op for write + */ +/* ARGSUSED */ +spec_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct uio *uio = ap->a_uio; + struct proc *p = uio->uio_procp; + struct buf *bp; + daddr_t bn; + int bsize, blkmask; + struct partinfo dpart; + register int n, on; + int error = 0; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_WRITE) + panic("spec_write mode"); + if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) + panic("spec_write proc"); +#endif + + switch (vp->v_type) { + + case VCHR: + VOP_UNLOCK(vp); + error = (*cdevsw[major(vp->v_rdev)].d_write) + (vp->v_rdev, uio, ap->a_ioflag); + VOP_LOCK(vp); + return (error); + + case VBLK: + if (uio->uio_resid == 0) + return (0); + if (uio->uio_offset < 0) + return (EINVAL); + bsize = BLKDEV_IOSIZE; + if ((*bdevsw[major(vp->v_rdev)].d_ioctl)(vp->v_rdev, DIOCGPART, + (caddr_t)&dpart, FREAD, p) == 0) { + if (dpart.part->p_fstype == FS_BSDFFS && + dpart.part->p_frag != 0 && dpart.part->p_fsize != 0) + bsize = dpart.part->p_frag * + dpart.part->p_fsize; + } + blkmask = (bsize / DEV_BSIZE) - 1; + do { + bn = (uio->uio_offset / DEV_BSIZE) &~ blkmask; + on = uio->uio_offset % bsize; + n = min((unsigned)(bsize - on), uio->uio_resid); + if (n == bsize) + bp = getblk(vp, bn, bsize, 0, 0); + else + error = bread(vp, bn, bsize, NOCRED, &bp); + n = min(n, bsize - bp->b_resid); + if (error) { + brelse(bp); + return (error); + } + error = uiomove((char *)bp->b_data + on, n, uio); + if (n + on == bsize) { + bp->b_flags |= B_AGE; + bawrite(bp); + } else + bdwrite(bp); + } while (error == 0 && uio->uio_resid > 0 && n != 0); + return (error); + + default: + panic("spec_write type"); + } + /* NOTREACHED */ +} + +/* + * Device ioctl operation. 
+ */ +/* ARGSUSED */ +spec_ioctl(ap) + struct vop_ioctl_args /* { + struct vnode *a_vp; + int a_command; + caddr_t a_data; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + dev_t dev = ap->a_vp->v_rdev; + + switch (ap->a_vp->v_type) { + + case VCHR: + return ((*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, + ap->a_fflag, ap->a_p)); + + case VBLK: + if (ap->a_command == 0 && (int)ap->a_data == B_TAPE) + if (bdevsw[major(dev)].d_flags & B_TAPE) + return (0); + else + return (1); + return ((*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, + ap->a_fflag, ap->a_p)); + + default: + panic("spec_ioctl"); + /* NOTREACHED */ + } +} + +/* ARGSUSED */ +spec_select(ap) + struct vop_select_args /* { + struct vnode *a_vp; + int a_which; + int a_fflags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register dev_t dev; + + switch (ap->a_vp->v_type) { + + default: + return (1); /* XXX */ + + case VCHR: + dev = ap->a_vp->v_rdev; + return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_p); + } +} +/* + * Synch buffers associated with a block device + */ +/* ARGSUSED */ +int +spec_fsync(ap) + struct vop_fsync_args /* { + struct vnode *a_vp; + struct ucred *a_cred; + int a_waitfor; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct buf *bp; + struct buf *nbp; + int s; + + if (vp->v_type == VCHR) + return (0); + /* + * Flush all dirty buffers associated with a block device. 
+ */ +loop: + s = splbio(); + for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { + nbp = bp->b_vnbufs.le_next; + if ((bp->b_flags & B_BUSY)) + continue; + if ((bp->b_flags & B_DELWRI) == 0) + panic("spec_fsync: not dirty"); + bremfree(bp); + bp->b_flags |= B_BUSY; + splx(s); + bawrite(bp); + goto loop; + } + if (ap->a_waitfor == MNT_WAIT) { + while (vp->v_numoutput) { + vp->v_flag |= VBWAIT; + sleep((caddr_t)&vp->v_numoutput, PRIBIO + 1); + } +#ifdef DIAGNOSTIC + if (vp->v_dirtyblkhd.lh_first) { + vprint("spec_fsync: dirty", vp); + goto loop; + } +#endif + } + splx(s); + return (0); +} + +/* + * Just call the device strategy routine + */ +spec_strategy(ap) + struct vop_strategy_args /* { + struct buf *a_bp; + } */ *ap; +{ + + (*bdevsw[major(ap->a_bp->b_dev)].d_strategy)(ap->a_bp); + return (0); +} + +/* + * This is a noop, simply returning what one has been given. + */ +spec_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + } */ *ap; +{ + + if (ap->a_vpp != NULL) + *ap->a_vpp = ap->a_vp; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn; + return (0); +} + +/* + * At the moment we do not do any locking. + */ +/* ARGSUSED */ +spec_lock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + return (0); +} + +/* ARGSUSED */ +spec_unlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + return (0); +} + +/* + * Device close routine + */ +/* ARGSUSED */ +spec_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + dev_t dev = vp->v_rdev; + int (*devclose) __P((dev_t, int, int, struct proc *)); + int mode, error; + + switch (vp->v_type) { + + case VCHR: + /* + * Hack: a tty device that is a controlling terminal + * has a reference from the session structure. 
+ * We cannot easily tell that a character device is + * a controlling terminal, unless it is the closing + * process' controlling terminal. In that case, + * if the reference count is 2 (this last descriptor + * plus the session), release the reference from the session. + */ + if (vcount(vp) == 2 && ap->a_p && + vp == ap->a_p->p_session->s_ttyvp) { + vrele(vp); + ap->a_p->p_session->s_ttyvp = NULL; + } + /* + * If the vnode is locked, then we are in the midst + * of forcably closing the device, otherwise we only + * close on last reference. + */ + if (vcount(vp) > 1 && (vp->v_flag & VXLOCK) == 0) + return (0); + devclose = cdevsw[major(dev)].d_close; + mode = S_IFCHR; + break; + + case VBLK: + /* + * On last close of a block device (that isn't mounted) + * we must invalidate any in core blocks, so that + * we can, for instance, change floppy disks. + */ + if (error = vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 0, 0)) + return (error); + /* + * We do not want to really close the device if it + * is still in use unless we are trying to close it + * forcibly. Since every use (buffer, vnode, swap, cmap) + * holds a reference to the vnode, and because we mark + * any other vnodes that alias this device, when the + * sum of the reference counts on all the aliased + * vnodes descends to one, we are on last close. + */ + if (vcount(vp) > 1 && (vp->v_flag & VXLOCK) == 0) + return (0); + devclose = bdevsw[major(dev)].d_close; + mode = S_IFBLK; + break; + + default: + panic("spec_close: not special"); + } + + return ((*devclose)(dev, ap->a_fflag, mode, ap->a_p)); +} + +/* + * Print out the contents of a special device vnode. + */ +spec_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + printf("tag VT_NON, dev %d, %d\n", major(ap->a_vp->v_rdev), + minor(ap->a_vp->v_rdev)); +} + +/* + * Return POSIX pathconf information applicable to special devices. 
+ */ +spec_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = LINK_MAX; + return (0); + case _PC_MAX_CANON: + *ap->a_retval = MAX_CANON; + return (0); + case _PC_MAX_INPUT: + *ap->a_retval = MAX_INPUT; + return (0); + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + return (0); + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + return (0); + case _PC_VDISABLE: + *ap->a_retval = _POSIX_VDISABLE; + return (0); + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Special device advisory byte-level locks. + */ +/* ARGSUSED */ +spec_advlock(ap) + struct vop_advlock_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap; +{ + + return (EOPNOTSUPP); +} + +/* + * Special device failed operation + */ +spec_ebadf() +{ + + return (EBADF); +} + +/* + * Special device bad operation + */ +spec_badop() +{ + + panic("spec_badop called"); + /* NOTREACHED */ +} diff --git a/sys/miscfs/specfs/specdev.h b/sys/miscfs/specfs/specdev.h new file mode 100644 index 00000000000..a13b66e5113 --- /dev/null +++ b/sys/miscfs/specfs/specdev.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)specdev.h 8.2 (Berkeley) 2/2/94 + */ + +/* + * This structure defines the information maintained about + * special devices. It is allocated in checkalias and freed + * in vgone. 
+ */ +struct specinfo { + struct vnode **si_hashchain; + struct vnode *si_specnext; + long si_flags; + dev_t si_rdev; +}; +/* + * Exported shorthand + */ +#define v_rdev v_specinfo->si_rdev +#define v_hashchain v_specinfo->si_hashchain +#define v_specnext v_specinfo->si_specnext +#define v_specflags v_specinfo->si_flags + +/* + * Flags for specinfo + */ +#define SI_MOUNTEDON 0x0001 /* block special device is mounted on */ + +/* + * Special device management + */ +#define SPECHSZ 64 +#if ((SPECHSZ&(SPECHSZ-1)) == 0) +#define SPECHASH(rdev) (((rdev>>5)+(rdev))&(SPECHSZ-1)) +#else +#define SPECHASH(rdev) (((unsigned)((rdev>>5)+(rdev)))%SPECHSZ) +#endif + +struct vnode *speclisth[SPECHSZ]; + +/* + * Prototypes for special file operations on vnodes. + */ +extern int (**spec_vnodeop_p)(); +struct nameidata; +struct componentname; +struct ucred; +struct flock; +struct buf; +struct uio; + +int spec_badop(), + spec_ebadf(); + +int spec_lookup __P((struct vop_lookup_args *)); +#define spec_create ((int (*) __P((struct vop_create_args *)))spec_badop) +#define spec_mknod ((int (*) __P((struct vop_mknod_args *)))spec_badop) +int spec_open __P((struct vop_open_args *)); +int spec_close __P((struct vop_close_args *)); +#define spec_access ((int (*) __P((struct vop_access_args *)))spec_ebadf) +#define spec_getattr ((int (*) __P((struct vop_getattr_args *)))spec_ebadf) +#define spec_setattr ((int (*) __P((struct vop_setattr_args *)))spec_ebadf) +int spec_read __P((struct vop_read_args *)); +int spec_write __P((struct vop_write_args *)); +int spec_ioctl __P((struct vop_ioctl_args *)); +int spec_select __P((struct vop_select_args *)); +#define spec_mmap ((int (*) __P((struct vop_mmap_args *)))spec_badop) +int spec_fsync __P((struct vop_fsync_args *)); +#define spec_seek ((int (*) __P((struct vop_seek_args *)))spec_badop) +#define spec_remove ((int (*) __P((struct vop_remove_args *)))spec_badop) +#define spec_link ((int (*) __P((struct vop_link_args *)))spec_badop) +#define 
spec_rename ((int (*) __P((struct vop_rename_args *)))spec_badop) +#define spec_mkdir ((int (*) __P((struct vop_mkdir_args *)))spec_badop) +#define spec_rmdir ((int (*) __P((struct vop_rmdir_args *)))spec_badop) +#define spec_symlink ((int (*) __P((struct vop_symlink_args *)))spec_badop) +#define spec_readdir ((int (*) __P((struct vop_readdir_args *)))spec_badop) +#define spec_readlink ((int (*) __P((struct vop_readlink_args *)))spec_badop) +#define spec_abortop ((int (*) __P((struct vop_abortop_args *)))spec_badop) +#define spec_inactive ((int (*) __P((struct vop_inactive_args *)))nullop) +#define spec_reclaim ((int (*) __P((struct vop_reclaim_args *)))nullop) +int spec_lock __P((struct vop_lock_args *)); +int spec_unlock __P((struct vop_unlock_args *)); +int spec_bmap __P((struct vop_bmap_args *)); +int spec_strategy __P((struct vop_strategy_args *)); +int spec_print __P((struct vop_print_args *)); +#define spec_islocked ((int (*) __P((struct vop_islocked_args *)))nullop) +int spec_pathconf __P((struct vop_pathconf_args *)); +int spec_advlock __P((struct vop_advlock_args *)); +#define spec_blkatoff ((int (*) __P((struct vop_blkatoff_args *)))spec_badop) +#define spec_valloc ((int (*) __P((struct vop_valloc_args *)))spec_badop) +#define spec_reallocblks \ + ((int (*) __P((struct vop_reallocblks_args *)))spec_badop) +#define spec_vfree ((int (*) __P((struct vop_vfree_args *)))spec_badop) +#define spec_truncate ((int (*) __P((struct vop_truncate_args *)))nullop) +#define spec_update ((int (*) __P((struct vop_update_args *)))nullop) +#define spec_bwrite ((int (*) __P((struct vop_bwrite_args *)))nullop) diff --git a/sys/miscfs/umapfs/umap.h b/sys/miscfs/umapfs/umap.h new file mode 100644 index 00000000000..9f4d1e7ace5 --- /dev/null +++ b/sys/miscfs/umapfs/umap.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * This code is derived from software donated to Berkeley by + * the UCLA Ficus project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)umap.h 8.3 (Berkeley) 1/21/94 + * + * @(#)null_vnops.c 1.5 (Berkeley) 7/10/92 + */ + +#define MAPFILEENTRIES 64 +#define GMAPFILEENTRIES 16 +#define NOBODY 32767 +#define NULLGROUP 65534 + +struct umap_args { + char *target; /* Target of loopback */ + int nentries; /* # of entries in user map array */ + int gnentries; /* # of entries in group map array */ + u_long (*mapdata)[2]; /* pointer to array of user mappings */ + u_long (*gmapdata)[2]; /* pointer to array of group mappings */ +}; + +struct umap_mount { + struct mount *umapm_vfs; + struct vnode *umapm_rootvp; /* Reference to root umap_node */ + int info_nentries; /* number of uid mappings */ + int info_gnentries; /* number of gid mappings */ + u_long info_mapdata[MAPFILEENTRIES][2]; /* mapping data for + user mapping in ficus */ + u_long info_gmapdata[GMAPFILEENTRIES][2]; /*mapping data for + group mapping in ficus */ +}; + +#ifdef KERNEL +/* + * A cache of vnode references + */ +struct umap_node { + struct umap_node *umap_forw; /* Hash chain */ + struct umap_node *umap_back; + struct vnode *umap_lowervp; /* Aliased vnode - VREFed once */ + struct vnode *umap_vnode; /* Back pointer to vnode/umap_node */ +}; + +extern int umap_node_create __P((struct mount *mp, struct vnode *target, struct vnode **vpp)); +extern u_long umap_reverse_findid __P((u_long id, u_long map[][2], int nentries)); +extern void umap_mapids __P((struct mount *v_mount, struct ucred *credp)); + +#define MOUNTTOUMAPMOUNT(mp) ((struct umap_mount *)((mp)->mnt_data)) +#define VTOUMAP(vp) ((struct umap_node *)(vp)->v_data) +#define UMAPTOV(xp) ((xp)->umap_vnode) +#ifdef UMAPFS_DIAGNOSTIC +extern struct vnode *umap_checkvp __P((struct vnode *vp, char *fil, int lno)); +#define UMAPVPTOLOWERVP(vp) umap_checkvp((vp), __FILE__, __LINE__) +#else +#define UMAPVPTOLOWERVP(vp) (VTOUMAP(vp)->umap_lowervp) +#endif + +extern int (**umap_vnodeop_p)(); +extern struct vfsops umap_vfsops; +#endif /* KERNEL */ diff --git 
a/sys/miscfs/umapfs/umap_subr.c b/sys/miscfs/umapfs/umap_subr.c new file mode 100644 index 00000000000..6f1f077a621 --- /dev/null +++ b/sys/miscfs/umapfs/umap_subr.c @@ -0,0 +1,397 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)umap_subr.c 8.6 (Berkeley) 1/26/94 + * + * $Id: lofs_subr.c, v 1.11 1992/05/30 10:05:43 jsp Exp jsp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LOG2_SIZEVNODE 7 /* log2(sizeof struct vnode) */ +#define NUMAPNODECACHE 16 +#define UMAP_NHASH(vp) ((((u_long) vp)>>LOG2_SIZEVNODE) & (NUMAPNODECACHE-1)) + +/* + * Null layer cache: + * Each cache entry holds a reference to the target vnode + * along with a pointer to the alias vnode. When an + * entry is added the target vnode is VREF'd. When the + * alias is removed the target vnode is vrele'd. + */ + +/* + * Cache head + */ +struct umap_node_cache { + struct umap_node *ac_forw; + struct umap_node *ac_back; +}; + +static struct umap_node_cache umap_node_cache[NUMAPNODECACHE]; + +/* + * Initialise cache headers + */ +umapfs_init() +{ + struct umap_node_cache *ac; +#ifdef UMAPFS_DIAGNOSTIC + printf("umapfs_init\n"); /* printed during system boot */ +#endif + + for (ac = umap_node_cache; ac < umap_node_cache + NUMAPNODECACHE; ac++) + ac->ac_forw = ac->ac_back = (struct umap_node *) ac; +} + +/* + * Compute hash list for given target vnode + */ +static struct umap_node_cache * +umap_node_hash(targetvp) + struct vnode *targetvp; +{ + + return (&umap_node_cache[UMAP_NHASH(targetvp)]); +} + +/* + * umap_findid is called by various routines in umap_vnodeops.c to + * find a user or group id in a map. 
+ */ +static u_long +umap_findid(id, map, nentries) + u_long id; + u_long map[][2]; + int nentries; +{ + int i; + + /* Find uid entry in map */ + i = 0; + while ((iac_forw; a != (struct umap_node *) hd; a = a->umap_forw) { + if (a->umap_lowervp == targetvp && + a->umap_vnode->v_mount == mp) { + vp = UMAPTOV(a); + /* + * We need vget for the VXLOCK + * stuff, but we don't want to lock + * the lower node. + */ + if (vget(vp, 0)) { +#ifdef UMAPFS_DIAGNOSTIC + printf ("umap_node_find: vget failed.\n"); +#endif + goto loop; + } + return (vp); + } + } + +#ifdef UMAPFS_DIAGNOSTIC + printf("umap_node_find(%x, %x): NOT found\n", mp, targetvp); +#endif + + return (0); +} + +/* + * Make a new umap_node node. + * Vp is the alias vnode, lofsvp is the target vnode. + * Maintain a reference to (targetvp). + */ +static int +umap_node_alloc(mp, lowervp, vpp) + struct mount *mp; + struct vnode *lowervp; + struct vnode **vpp; +{ + struct umap_node_cache *hd; + struct umap_node *xp; + struct vnode *othervp, *vp; + int error; + + if (error = getnewvnode(VT_UMAP, mp, umap_vnodeop_p, vpp)) + return (error); + vp = *vpp; + + MALLOC(xp, struct umap_node *, sizeof(struct umap_node), + M_TEMP, M_WAITOK); + vp->v_type = lowervp->v_type; + xp->umap_vnode = vp; + vp->v_data = xp; + xp->umap_lowervp = lowervp; + /* + * Before we insert our new node onto the hash chains, + * check to see if someone else has beaten us to it. + * (We could have slept in MALLOC.) + */ + if (othervp = umap_node_find(lowervp)) { + FREE(xp, M_TEMP); + vp->v_type = VBAD; /* node is discarded */ + vp->v_usecount = 0; /* XXX */ + *vpp = othervp; + return (0); + } + VREF(lowervp); /* Extra VREF will be vrele'd in umap_node_create */ + hd = umap_node_hash(lowervp); + insque(xp, hd); + return (0); +} + + +/* + * Try to find an existing umap_node vnode refering + * to it, otherwise make a new umap_node vnode which + * contains a reference to the target vnode. 
+ */ +int +umap_node_create(mp, targetvp, newvpp) + struct mount *mp; + struct vnode *targetvp; + struct vnode **newvpp; +{ + struct vnode *aliasvp; + + if (aliasvp = umap_node_find(mp, targetvp)) { + /* + * Take another reference to the alias vnode + */ +#ifdef UMAPFS_DIAGNOSTIC + vprint("umap_node_create: exists", ap->umap_vnode); +#endif + /* VREF(aliasvp); */ + } else { + int error; + + /* + * Get new vnode. + */ +#ifdef UMAPFS_DIAGNOSTIC + printf("umap_node_create: create new alias vnode\n"); +#endif + /* + * Make new vnode reference the umap_node. + */ + if (error = umap_node_alloc(mp, targetvp, &aliasvp)) + return (error); + + /* + * aliasvp is already VREF'd by getnewvnode() + */ + } + + vrele(targetvp); + +#ifdef UMAPFS_DIAGNOSTIC + vprint("umap_node_create: alias", aliasvp); + vprint("umap_node_create: target", targetvp); +#endif + + *newvpp = aliasvp; + return (0); +} + +#ifdef UMAPFS_DIAGNOSTIC +int umap_checkvp_barrier = 1; +struct vnode * +umap_checkvp(vp, fil, lno) + struct vnode *vp; + char *fil; + int lno; +{ + struct umap_node *a = VTOUMAP(vp); +#if 0 + /* + * Can't do this check because vop_reclaim runs + * with funny vop vector. 
+ */ + if (vp->v_op != umap_vnodeop_p) { + printf ("umap_checkvp: on non-umap-node\n"); + while (umap_checkvp_barrier) /*WAIT*/ ; + panic("umap_checkvp"); + } +#endif + if (a->umap_lowervp == NULL) { + /* Should never happen */ + int i; u_long *p; + printf("vp = %x, ZERO ptr\n", vp); + for (p = (u_long *) a, i = 0; i < 8; i++) + printf(" %x", p[i]); + printf("\n"); + /* wait for debugger */ + while (umap_checkvp_barrier) /*WAIT*/ ; + panic("umap_checkvp"); + } + if (a->umap_lowervp->v_usecount < 1) { + int i; u_long *p; + printf("vp = %x, unref'ed lowervp\n", vp); + for (p = (u_long *) a, i = 0; i < 8; i++) + printf(" %x", p[i]); + printf("\n"); + /* wait for debugger */ + while (umap_checkvp_barrier) /*WAIT*/ ; + panic ("umap with unref'ed lowervp"); + } +#if 0 + printf("umap %x/%d -> %x/%d [%s, %d]\n", + a->umap_vnode, a->umap_vnode->v_usecount, + a->umap_lowervp, a->umap_lowervp->v_usecount, + fil, lno); +#endif + return (a->umap_lowervp); +} +#endif + +/* umap_mapids maps all of the ids in a credential, both user and group. */ + +void +umap_mapids(v_mount, credp) + struct mount *v_mount; + struct ucred *credp; +{ + int i, unentries, gnentries; + u_long *groupmap, *usermap; + uid_t uid; + gid_t gid; + + unentries = MOUNTTOUMAPMOUNT(v_mount)->info_nentries; + usermap = &(MOUNTTOUMAPMOUNT(v_mount)->info_mapdata[0][0]); + gnentries = MOUNTTOUMAPMOUNT(v_mount)->info_gnentries; + groupmap = &(MOUNTTOUMAPMOUNT(v_mount)->info_gmapdata[0][0]); + + /* Find uid entry in map */ + + uid = (uid_t) umap_findid(credp->cr_uid, usermap, unentries); + + if (uid != -1) + credp->cr_uid = uid; + else + credp->cr_uid = (uid_t) NOBODY; + +#ifdef notdef + /* cr_gid is the same as cr_groups[0] in 4BSD */ + + /* Find gid entry in map */ + + gid = (gid_t) umap_findid(credp->cr_gid, groupmap, gnentries); + + if (gid != -1) + credp->cr_gid = gid; + else + credp->cr_gid = NULLGROUP; +#endif + + /* Now we must map each of the set of groups in the cr_groups + structure. 
*/ + + i = 0; + while (credp->cr_groups[i] != 0) { + gid = (gid_t) umap_findid(credp->cr_groups[i], + groupmap, gnentries); + + if (gid != -1) + credp->cr_groups[i++] = gid; + else + credp->cr_groups[i++] = NULLGROUP; + } +} diff --git a/sys/miscfs/umapfs/umap_vfsops.c b/sys/miscfs/umapfs/umap_vfsops.c new file mode 100644 index 00000000000..2480a85e440 --- /dev/null +++ b/sys/miscfs/umapfs/umap_vfsops.c @@ -0,0 +1,407 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * the UCLA Ficus project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)umap_vfsops.c 8.3 (Berkeley) 1/21/94 + * + * @(#)null_vfsops.c 1.5 (Berkeley) 7/10/92 + */ + +/* + * Umap Layer + * (See mount_umap(8) for a description of this layer.) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Mount umap layer + */ +int +umapfs_mount(mp, path, data, ndp, p) + struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + struct umap_args args; + struct vnode *lowerrootvp, *vp; + struct vnode *umapm_rootvp; + struct umap_mount *amp; + u_int size; + int error; + +#ifdef UMAPFS_DIAGNOSTIC + printf("umapfs_mount(mp = %x)\n", mp); +#endif + + /* + * Update is a no-op + */ + if (mp->mnt_flag & MNT_UPDATE) { + return (EOPNOTSUPP); + /* return (VFS_MOUNT(MOUNTTOUMAPMOUNT(mp)->umapm_vfs, path, data, ndp, p));*/ + } + + /* + * Get argument + */ + if (error = copyin(data, (caddr_t)&args, sizeof(struct umap_args))) + return (error); + + /* + * Find lower node + */ + NDINIT(ndp, LOOKUP, FOLLOW|WANTPARENT|LOCKLEAF, + UIO_USERSPACE, args.target, p); + if (error = namei(ndp)) + return (error); + + /* + * Sanity check on lower vnode + */ + lowerrootvp = ndp->ni_vp; +#ifdef UMAPFS_DIAGNOSTIC + printf("vp = %x, check for VDIR...\n", lowerrootvp); +#endif + vrele(ndp->ni_dvp); + ndp->ni_dvp = 0; + + if (lowerrootvp->v_type != VDIR) { + vput(lowerrootvp); + return (EINVAL); + } + +#ifdef UMAPFS_DIAGNOSTIC + printf("mp = %x\n", mp); 
+#endif + + amp = (struct umap_mount *) malloc(sizeof(struct umap_mount), + M_UFSMNT, M_WAITOK); /* XXX */ + + /* + * Save reference to underlying FS + */ + amp->umapm_vfs = lowerrootvp->v_mount; + + /* + * Now copy in the number of entries and maps for umap mapping. + */ + amp->info_nentries = args.nentries; + amp->info_gnentries = args.gnentries; + error = copyin(args.mapdata, (caddr_t)amp->info_mapdata, + 2*sizeof(u_long)*args.nentries); + if (error) + return (error); + +#ifdef UMAP_DIAGNOSTIC + printf("umap_mount:nentries %d\n",args.nentries); + for (i = 0; i < args.nentries; i++) + printf(" %d maps to %d\n", amp->info_mapdata[i][0], + amp->info_mapdata[i][1]); +#endif + + error = copyin(args.gmapdata, (caddr_t)amp->info_gmapdata, + 2*sizeof(u_long)*args.nentries); + if (error) + return (error); + +#ifdef UMAP_DIAGNOSTIC + printf("umap_mount:gnentries %d\n",args.gnentries); + for (i = 0; i < args.gnentries; i++) + printf(" group %d maps to %d\n", + amp->info_gmapdata[i][0], + amp->info_gmapdata[i][1]); +#endif + + + /* + * Save reference. Each mount also holds + * a reference on the root vnode. + */ + error = umap_node_create(mp, lowerrootvp, &vp); + /* + * Unlock the node (either the lower or the alias) + */ + VOP_UNLOCK(vp); + /* + * Make sure the node alias worked + */ + if (error) { + vrele(lowerrootvp); + free(amp, M_UFSMNT); /* XXX */ + return (error); + } + + /* + * Keep a held reference to the root vnode. + * It is vrele'd in umapfs_unmount. 
+ */ + umapm_rootvp = vp; + umapm_rootvp->v_flag |= VROOT; + amp->umapm_rootvp = umapm_rootvp; + if (UMAPVPTOLOWERVP(umapm_rootvp)->v_mount->mnt_flag & MNT_LOCAL) + mp->mnt_flag |= MNT_LOCAL; + mp->mnt_data = (qaddr_t) amp; + getnewfsid(mp, MOUNT_LOFS); + + (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); + bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); + (void) copyinstr(args.target, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); +#ifdef UMAPFS_DIAGNOSTIC + printf("umapfs_mount: lower %s, alias at %s\n", + mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname); +#endif + return (0); +} + +/* + * VFS start. Nothing needed here - the start routine + * on the underlying filesystem will have been called + * when that filesystem was mounted. + */ +int +umapfs_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + return (0); + /* return (VFS_START(MOUNTTOUMAPMOUNT(mp)->umapm_vfs, flags, p)); */ +} + +/* + * Free reference to umap layer + */ +int +umapfs_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + struct vnode *umapm_rootvp = MOUNTTOUMAPMOUNT(mp)->umapm_rootvp; + int error; + int flags = 0; + extern int doforce; + +#ifdef UMAPFS_DIAGNOSTIC + printf("umapfs_unmount(mp = %x)\n", mp); +#endif + + if (mntflags & MNT_FORCE) { + /* lofs can never be rootfs so don't check for it */ + if (!doforce) + return (EINVAL); + flags |= FORCECLOSE; + } + + /* + * Clear out buffer cache. I don't think we + * ever get anything cached at this level at the + * moment, but who knows... 
+ */ +#ifdef notyet + mntflushbuf(mp, 0); + if (mntinvalbuf(mp, 1)) + return (EBUSY); +#endif + if (umapm_rootvp->v_usecount > 1) + return (EBUSY); + if (error = vflush(mp, umapm_rootvp, flags)) + return (error); + +#ifdef UMAPFS_DIAGNOSTIC + vprint("alias root of lower", umapm_rootvp); +#endif + /* + * Release reference on underlying root vnode + */ + vrele(umapm_rootvp); + /* + * And blow it away for future re-use + */ + vgone(umapm_rootvp); + /* + * Finally, throw away the umap_mount structure + */ + free(mp->mnt_data, M_UFSMNT); /* XXX */ + mp->mnt_data = 0; + return (0); +} + +int +umapfs_root(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + struct vnode *vp; + +#ifdef UMAPFS_DIAGNOSTIC + printf("umapfs_root(mp = %x, vp = %x->%x)\n", mp, + MOUNTTOUMAPMOUNT(mp)->umapm_rootvp, + UMAPVPTOLOWERVP(MOUNTTOUMAPMOUNT(mp)->umapm_rootvp) + ); +#endif + + /* + * Return locked reference to root. + */ + vp = MOUNTTOUMAPMOUNT(mp)->umapm_rootvp; + VREF(vp); + VOP_LOCK(vp); + *vpp = vp; + return (0); +} + +int +umapfs_quotactl(mp, cmd, uid, arg, p) + struct mount *mp; + int cmd; + uid_t uid; + caddr_t arg; + struct proc *p; +{ + return (VFS_QUOTACTL(MOUNTTOUMAPMOUNT(mp)->umapm_vfs, cmd, uid, arg, p)); +} + +int +umapfs_statfs(mp, sbp, p) + struct mount *mp; + struct statfs *sbp; + struct proc *p; +{ + int error; + struct statfs mstat; + +#ifdef UMAPFS_DIAGNOSTIC + printf("umapfs_statfs(mp = %x, vp = %x->%x)\n", mp, + MOUNTTOUMAPMOUNT(mp)->umapm_rootvp, + UMAPVPTOLOWERVP(MOUNTTOUMAPMOUNT(mp)->umapm_rootvp) + ); +#endif + + bzero(&mstat, sizeof(mstat)); + + error = VFS_STATFS(MOUNTTOUMAPMOUNT(mp)->umapm_vfs, &mstat, p); + if (error) + return (error); + + /* now copy across the "interesting" information and fake the rest */ + sbp->f_type = mstat.f_type; + sbp->f_flags = mstat.f_flags; + sbp->f_bsize = mstat.f_bsize; + sbp->f_iosize = mstat.f_iosize; + sbp->f_blocks = mstat.f_blocks; + sbp->f_bfree = mstat.f_bfree; + sbp->f_bavail = mstat.f_bavail; + sbp->f_files = 
mstat.f_files; + sbp->f_ffree = mstat.f_ffree; + if (sbp != &mp->mnt_stat) { + bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); + bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); + bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); + } + return (0); +} + +int +umapfs_sync(mp, waitfor, cred, p) + struct mount *mp; + int waitfor; + struct ucred *cred; + struct proc *p; +{ + /* + * XXX - Assumes no data cached at umap layer. + */ + return (0); +} + +int +umapfs_vget(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + + return (VFS_VGET(MOUNTTOUMAPMOUNT(mp)->umapm_vfs, ino, vpp)); +} + +int +umapfs_fhtovp(mp, fidp, nam, vpp, exflagsp, credanonp) + struct mount *mp; + struct fid *fidp; + struct mbuf *nam; + struct vnode **vpp; + int *exflagsp; + struct ucred**credanonp; +{ + + return (VFS_FHTOVP(MOUNTTOUMAPMOUNT(mp)->umapm_vfs, fidp, nam, vpp, exflagsp,credanonp)); +} + +int +umapfs_vptofh(vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + return (VFS_VPTOFH(UMAPVPTOLOWERVP(vp), fhp)); +} + +int umapfs_init __P((void)); + +struct vfsops umap_vfsops = { + umapfs_mount, + umapfs_start, + umapfs_unmount, + umapfs_root, + umapfs_quotactl, + umapfs_statfs, + umapfs_sync, + umapfs_vget, + umapfs_fhtovp, + umapfs_vptofh, + umapfs_init, +}; diff --git a/sys/miscfs/umapfs/umap_vnops.c b/sys/miscfs/umapfs/umap_vnops.c new file mode 100644 index 00000000000..287804e1561 --- /dev/null +++ b/sys/miscfs/umapfs/umap_vnops.c @@ -0,0 +1,488 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software donated to Berkeley by + * the UCLA Ficus project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)umap_vnops.c 8.3 (Berkeley) 1/5/94 + */ + +/* + * Umap Layer + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +int umap_bug_bypass = 0; /* for debugging: enables bypass printf'ing */ + +/* + * This is the 10-Apr-92 bypass routine. + * See null_vnops.c:null_bypass for more details. 
+ */ +int +umap_bypass(ap) + struct vop_generic_args /* { + struct vnodeop_desc *a_desc; + + } */ *ap; +{ + extern int (**umap_vnodeop_p)(); /* not extern, really "forward" */ + struct ucred **credpp = 0, *credp = 0; + struct ucred *savecredp, *savecompcredp = 0; + struct ucred *compcredp = 0; + struct vnode **this_vp_p; + int error; + struct vnode *old_vps[VDESC_MAX_VPS]; + struct vnode *vp1 = 0; + struct vnode **vps_p[VDESC_MAX_VPS]; + struct vnode ***vppp; + struct vnodeop_desc *descp = ap->a_desc; + int reles, i; + struct componentname **compnamepp = 0; + + if (umap_bug_bypass) + printf ("umap_bypass: %s\n", descp->vdesc_name); + +#ifdef SAFETY + /* + * We require at least one vp. + */ + if (descp->vdesc_vp_offsets == NULL || + descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET) + panic ("umap_bypass: no vp's in map.\n"); +#endif + + /* + * Map the vnodes going in. + * Later, we'll invoke the operation based on + * the first mapped vnode's operation vector. + */ + reles = descp->vdesc_flags; + for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { + if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) + break; /* bail out at end of list */ + vps_p[i] = this_vp_p = + VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[i], ap); + + if (i == 0) { + vp1 = *vps_p[0]; + } + + /* + * We're not guaranteed that any but the first vnode + * are of our type. Check for and don't map any + * that aren't. (Must map first vp or vclean fails.) + */ + + if (i && (*this_vp_p)->v_op != umap_vnodeop_p) { + old_vps[i] = NULL; + } else { + old_vps[i] = *this_vp_p; + *(vps_p[i]) = UMAPVPTOLOWERVP(*this_vp_p); + if (reles & 1) + VREF(*this_vp_p); + } + + } + + /* + * Fix the credentials. (That's the purpose of this layer.) 
+ */ + + if (descp->vdesc_cred_offset != VDESC_NO_OFFSET) { + + credpp = VOPARG_OFFSETTO(struct ucred**, + descp->vdesc_cred_offset, ap); + + /* Save old values */ + + savecredp = (*credpp); + (*credpp) = crdup(savecredp); + credp = *credpp; + + if (umap_bug_bypass && credp->cr_uid != 0) + printf("umap_bypass: user was %d, group %d\n", + credp->cr_uid, credp->cr_gid); + + /* Map all ids in the credential structure. */ + + umap_mapids(vp1->v_mount, credp); + + if (umap_bug_bypass && credp->cr_uid != 0) + printf("umap_bypass: user now %d, group %d\n", + credp->cr_uid, credp->cr_gid); + } + + /* BSD often keeps a credential in the componentname structure + * for speed. If there is one, it better get mapped, too. + */ + + if (descp->vdesc_componentname_offset != VDESC_NO_OFFSET) { + + compnamepp = VOPARG_OFFSETTO(struct componentname**, + descp->vdesc_componentname_offset, ap); + + compcredp = (*compnamepp)->cn_cred; + savecompcredp = compcredp; + compcredp = (*compnamepp)->cn_cred = crdup(savecompcredp); + + if (umap_bug_bypass && compcredp->cr_uid != 0) + printf("umap_bypass: component credit user was %d, group %d\n", + compcredp->cr_uid, compcredp->cr_gid); + + /* Map all ids in the credential structure. */ + + umap_mapids(vp1->v_mount, compcredp); + + if (umap_bug_bypass && compcredp->cr_uid != 0) + printf("umap_bypass: component credit user now %d, group %d\n", + compcredp->cr_uid, compcredp->cr_gid); + } + + /* + * Call the operation on the lower layer + * with the modified argument structure. + */ + error = VCALL(*(vps_p[0]), descp->vdesc_offset, ap); + + /* + * Maintain the illusion of call-by-value + * by restoring vnodes in the argument structure + * to their original value. 
+ */ + reles = descp->vdesc_flags; + for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { + if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) + break; /* bail out at end of list */ + if (old_vps[i]) { + *(vps_p[i]) = old_vps[i]; + if (reles & 1) + vrele(*(vps_p[i])); + }; + }; + + /* + * Map the possible out-going vpp + * (Assumes that the lower layer always returns + * a VREF'ed vpp unless it gets an error.) + */ + if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET && + !(descp->vdesc_flags & VDESC_NOMAP_VPP) && + !error) { + if (descp->vdesc_flags & VDESC_VPP_WILLRELE) + goto out; + vppp = VOPARG_OFFSETTO(struct vnode***, + descp->vdesc_vpp_offset, ap); + error = umap_node_create(old_vps[0]->v_mount, **vppp, *vppp); + }; + + out: + /* + * Free duplicate cred structure and restore old one. + */ + if (descp->vdesc_cred_offset != VDESC_NO_OFFSET) { + if (umap_bug_bypass && credp && credp->cr_uid != 0) + printf("umap_bypass: returning-user was %d\n", + credp->cr_uid); + + crfree(credp); + (*credpp) = savecredp; + if (umap_bug_bypass && credpp && (*credpp)->cr_uid != 0) + printf("umap_bypass: returning-user now %d\n\n", + (*credpp)->cr_uid); + } + + if (descp->vdesc_componentname_offset != VDESC_NO_OFFSET) { + if (umap_bug_bypass && compcredp && compcredp->cr_uid != 0) + printf("umap_bypass: returning-component-user was %d\n", + compcredp->cr_uid); + + crfree(compcredp); + (*compnamepp)->cn_cred = savecompcredp; + if (umap_bug_bypass && credpp && (*credpp)->cr_uid != 0) + printf("umap_bypass: returning-component-user now %d\n", + compcredp->cr_uid); + } + + return (error); +} + + +/* + * We handle getattr to change the fsid. 
+ */ +int +umap_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + short uid, gid; + int error, tmpid, nentries, gnentries; + u_long (*mapdata)[2], (*gmapdata)[2]; + struct vnode **vp1p; + struct vnodeop_desc *descp = ap->a_desc; + + if (error = umap_bypass(ap)) + return (error); + /* Requires that arguments be restored. */ + ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; + + /* + * Umap needs to map the uid and gid returned by a stat + * into the proper values for this site. This involves + * finding the returned uid in the mapping information, + * translating it into the uid on the other end, + * and filling in the proper field in the vattr + * structure pointed to by ap->a_vap. The group + * is easier, since currently all groups will be + * translate to the NULLGROUP. + */ + + /* Find entry in map */ + + uid = ap->a_vap->va_uid; + gid = ap->a_vap->va_gid; + if (umap_bug_bypass) + printf("umap_getattr: mapped uid = %d, mapped gid = %d\n", uid, + gid); + + vp1p = VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[0], ap); + nentries = MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_nentries; + mapdata = (MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_mapdata); + gnentries = MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_gnentries; + gmapdata = (MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_gmapdata); + + /* Reverse map the uid for the vnode. Since it's a reverse + map, we can't use umap_mapids() to do it. */ + + tmpid = umap_reverse_findid(uid, mapdata, nentries); + + if (tmpid != -1) { + + ap->a_vap->va_uid = (uid_t) tmpid; + if (umap_bug_bypass) + printf("umap_getattr: original uid = %d\n", uid); + } else + ap->a_vap->va_uid = (uid_t) NOBODY; + + /* Reverse map the gid for the vnode. 
*/
+
+	tmpid = umap_reverse_findid(gid, gmapdata, gnentries);
+
+	if (tmpid != -1) {
+
+		ap->a_vap->va_gid = (gid_t) tmpid;
+		if (umap_bug_bypass)
+			printf("umap_getattr: original gid = %d\n", gid);
+	} else
+		ap->a_vap->va_gid = (gid_t) NULLGROUP;
+
+	return (0);
+}
+
+int
+umap_inactive(ap)
+	struct vop_inactive_args /* {
+		struct vnode *a_vp;
+	} */ *ap;
+{
+	/*
+	 * Do nothing (and _don't_ bypass).
+	 * Wait to vrele lowervp until reclaim,
+	 * so that until then our umap_node is in the
+	 * cache and reusable.
+	 *
+	 */
+	return (0);
+}
+
+/*
+ * Reclaim a umap vnode: clear the node's lower-vnode pointer, unlink
+ * the umap_node with remque(), free the vnode's private data and only
+ * then drop the reference on the lower vnode.  Always returns 0.
+ */
+int
+umap_reclaim(ap)
+	struct vop_reclaim_args /* {
+		struct vnode *a_vp;
+	} */ *ap;
+{
+	struct vnode *vp = ap->a_vp;
+	struct umap_node *xp = VTOUMAP(vp);
+	struct vnode *lowervp = xp->umap_lowervp;
+
+	/* After this assignment, this node will not be re-used. */
+	xp->umap_lowervp = NULL;
+	remque(xp);
+	FREE(vp->v_data, M_TEMP);
+	vp->v_data = NULL;
+	vrele(lowervp);
+	return (0);
+}
+
+/*
+ * Hand-coded strategy: point the buffer at the lower vnode for the
+ * duration of the VOP_STRATEGY call, then restore the saved vnode
+ * pointer.  Returns whatever VOP_STRATEGY returned.
+ */
+int
+umap_strategy(ap)
+	struct vop_strategy_args /* {
+		struct buf *a_bp;
+	} */ *ap;
+{
+	struct buf *bp = ap->a_bp;
+	int error;
+	struct vnode *savedvp;
+
+	savedvp = bp->b_vp;
+	bp->b_vp = UMAPVPTOLOWERVP(bp->b_vp);
+
+	error = VOP_STRATEGY(ap->a_bp);
+
+	bp->b_vp = savedvp;
+
+	return (error);
+}
+
+/*
+ * Hand-coded bwrite: same vnode substitution as umap_strategy(), but
+ * around VOP_BWRITE.  Returns whatever VOP_BWRITE returned.
+ */
+int
+umap_bwrite(ap)
+	struct vop_bwrite_args /* {
+		struct buf *a_bp;
+	} */ *ap;
+{
+	struct buf *bp = ap->a_bp;
+	int error;
+	struct vnode *savedvp;
+
+	savedvp = bp->b_vp;
+	bp->b_vp = UMAPVPTOLOWERVP(bp->b_vp);
+
+	error = VOP_BWRITE(ap->a_bp);
+
+	bp->b_vp = savedvp;
+
+	return (error);
+}
+
+
+/*
+ * Print the umap vnode and its lower vnode pointer.  Always returns 0.
+ */
+int
+umap_print(ap)
+	struct vop_print_args /* {
+		struct vnode *a_vp;
+	} */ *ap;
+{
+	struct vnode *vp = ap->a_vp;
+	printf("\ttag VT_UMAPFS, vp=%x, lowervp=%x\n", vp, UMAPVPTOLOWERVP(vp));
+	return (0);
+}
+
+int
+umap_rename(ap)
+	struct vop_rename_args /* {
+		struct vnode *a_fdvp;
+		struct vnode *a_fvp;
+		struct componentname *a_fcnp;
+		struct vnode *a_tdvp;
+		struct vnode *a_tvp;
+		struct componentname *a_tcnp;
+	} */ *ap;
+{
+	int error;
+	struct componentname *compnamep;
+	struct ucred *compcredp, *savecompcredp;
+	struct vnode *vp;
+
+	/*
+	 * Rename is irregular, having two componentname structures.
+	 * We need to map the cred in the second structure,
+	 * and then bypass takes care of the rest.
+	 *
+	 * The second (a_tcnp) credential is replaced by a crdup()'d
+	 * copy which is mapped in place; the caller's original
+	 * credential pointer is put back before returning.
+	 */
+
+	vp = ap->a_fdvp;
+	compnamep = ap->a_tcnp;
+	compcredp = compnamep->cn_cred;
+
+	savecompcredp = compcredp;
+	compcredp = compnamep->cn_cred = crdup(savecompcredp);
+
+	if (umap_bug_bypass && compcredp->cr_uid != 0)
+		printf("umap_rename: rename component credit user was %d, group %d\n",
+		    compcredp->cr_uid, compcredp->cr_gid);
+
+	/* Map all ids in the credential structure. */
+
+	umap_mapids(vp->v_mount, compcredp);
+
+	if (umap_bug_bypass && compcredp->cr_uid != 0)
+		printf("umap_rename: rename component credit user now %d, group %d\n",
+		    compcredp->cr_uid, compcredp->cr_gid);
+
+	error = umap_bypass(ap);
+
+	/* Restore the additional mapped componentname cred structure. */
+
+	crfree(compcredp);
+	compnamep->cn_cred = savecompcredp;
+
+	return error;
+}
+
+/*
+ * Global vfs data structures
+ */
+/*
+ * XXX - strategy, bwrite are hand coded currently.  They should
+ * go away with a merged buffer/block cache.
+ *
+ * Every operation not listed explicitly below is routed to
+ * umap_bypass through the vop_default_desc entry.
+ */
+int (**umap_vnodeop_p)();
+struct vnodeopv_entry_desc umap_vnodeop_entries[] = {
+	{ &vop_default_desc, umap_bypass },
+
+	{ &vop_getattr_desc, umap_getattr },
+	{ &vop_inactive_desc, umap_inactive },
+	{ &vop_reclaim_desc, umap_reclaim },
+	{ &vop_print_desc, umap_print },
+	{ &vop_rename_desc, umap_rename },
+
+	{ &vop_strategy_desc, umap_strategy },
+	{ &vop_bwrite_desc, umap_bwrite },
+
+	{ (struct vnodeop_desc*) NULL, (int(*)()) NULL }
+};
+struct vnodeopv_desc umap_vnodeop_opv_desc =
+	{ &umap_vnodeop_p, umap_vnodeop_entries };
diff --git a/sys/miscfs/union/README b/sys/miscfs/union/README
new file mode 100644
index 00000000000..14a476987c9
--- /dev/null
+++ b/sys/miscfs/union/README
@@ -0,0 +1,7 @@
+If you plan on using union mounts, then you should consider replacing
+"libc/gen/opendir.c" in the C library with the file "libc.opendir.c"
+in this directory. The replacement version of opendir() automatically
+removes duplicate names when a union stack is encountered. You will
+then need to rebuild the C library and all commands.
+
+@(#)README 8.1 (Berkeley) 2/15/94
diff --git a/sys/miscfs/union/libc.opendir.c b/sys/miscfs/union/libc.opendir.c
new file mode 100644
index 00000000000..99ed58b86fd
--- /dev/null
+++ b/sys/miscfs/union/libc.opendir.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 1983, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. 
All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char orig_sccsid[] = "@(#)opendir.c 8.2 (Berkeley) 2/12/94";
+static char sccsid[] = "@(#)libc.opendir.c 8.1 (Berkeley) 2/15/94";
+#endif /* LIBC_SCCS and not lint */
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+/*
+ * open a directory.
+ *
+ * Returns a malloc'ed DIR for (name), or NULL if the open, the
+ * fcntl, the fstatfs or any allocation fails; on every failure path
+ * the descriptor is closed and all memory allocated here is freed.
+ *
+ * When MOUNT_UNION is defined and the directory lives on a union
+ * mount, the whole directory is read into one buffer up front and
+ * duplicate names are hidden by zeroing their d_fileno.
+ */
+DIR *
+opendir(name)
+	const char *name;
+{
+	DIR *dirp;
+	int fd;
+	int incr;
+	struct statfs sfb;
+
+	if ((fd = open(name, 0)) == -1)
+		return (NULL);
+	if (fcntl(fd, F_SETFD, FD_CLOEXEC) == -1 ||
+	    (dirp = (DIR *)malloc(sizeof(DIR))) == NULL) {
+		close(fd);
+		return (NULL);
+	}
+
+	/*
+	 * If CLBYTES is an exact multiple of DIRBLKSIZ, use a CLBYTES
+	 * buffer that is cluster boundary aligned.
+	 * Hopefully this can be a big win someday by allowing page
+	 * trades to user space to be done by getdirentries()
+	 */
+	if ((CLBYTES % DIRBLKSIZ) == 0)
+		incr = CLBYTES;
+	else
+		incr = DIRBLKSIZ;
+
+#ifdef MOUNT_UNION
+	/*
+	 * Determine whether this directory is the top of a union stack.
+	 */
+	if (fstatfs(fd, &sfb) < 0) {
+		free(dirp);
+		close(fd);
+		return (NULL);
+	}
+
+	if (sfb.f_type == MOUNT_UNION) {
+		int len = 0;
+		int space = 0;
+		char *buf = 0;
+		char *ddptr = 0;
+		int n;
+		struct dirent **dpv;
+
+		/*
+		 * The strategy here is to read all the directory
+		 * entries into a buffer, sort the buffer, and
+		 * remove duplicate entries by setting the inode
+		 * number to zero.
+		 */
+
+		/*
+		 * Fixup dd_loc to be non-zero to fake out readdir
+		 */
+		dirp->dd_loc = sizeof(void *);
+
+		do {
+			/*
+			 * Always make at least DIRBLKSIZ bytes
+			 * available to getdirentries
+			 */
+			if (space < DIRBLKSIZ) {
+				char *nbuf;
+
+				space += incr;
+				len += incr;
+				/*
+				 * Grow through a temporary: on failure
+				 * realloc leaves the old block allocated,
+				 * so it must be freed here or it leaks.
+				 */
+				nbuf = realloc(buf, len);
+				if (nbuf == NULL) {
+					free(buf);
+					free(dirp);
+					close(fd);
+					return (NULL);
+				}
+				buf = nbuf;
+				/*
+				 * NOTE(review): the data starts dd_loc
+				 * bytes into buf but (space) is not
+				 * reduced by dd_loc -- confirm that
+				 * getdirentries cannot fill the last
+				 * dd_loc bytes past the allocation.
+				 */
+				ddptr = buf + (len - space) + dirp->dd_loc;
+			}
+
+			n = getdirentries(fd, ddptr, space, &dirp->dd_seek);
+			if (n > 0) {
+				ddptr += n;
+				space -= n;
+			}
+		} while (n > 0);
+
+		/*
+		 * There is now a buffer full of (possibly) duplicate
+		 * names.
+		 */
+		dirp->dd_buf = buf;
+
+		/*
+		 * Go round this loop twice...
+		 *
+		 * Scan through the buffer, counting entries.
+		 * On the second pass, save pointers to each one.
+		 * Then sort the pointers and remove duplicate names.
+		 */
+		for (dpv = 0;;) {
+			n = 0;
+			ddptr = buf + dirp->dd_loc;
+			while (ddptr < buf + len) {
+				struct dirent *dp;
+
+				dp = (struct dirent *) ddptr;
+				if ((int)dp & 03)
+					break;
+				if ((dp->d_reclen <= 0) ||
+				    (dp->d_reclen > (buf + len + 1 - ddptr)))
+					break;
+				ddptr += dp->d_reclen;
+				if (dp->d_fileno) {
+					if (dpv)
+						dpv[n] = dp;
+					n++;
+				}
+			}
+
+			if (dpv) {
+				struct dirent *xp;
+
+				/*
+				 * If and when whiteouts happen,
+				 * this sort would need to be stable.
+				 */
+				heapsort(dpv, n, sizeof(*dpv), alphasort);
+
+				dpv[n] = NULL;
+				xp = NULL;
+
+				/*
+				 * Scan through the buffer in sort order,
+				 * zapping the inode number of any
+				 * duplicate names.
+				 */
+				for (n = 0; dpv[n]; n++) {
+					struct dirent *dp = dpv[n];
+
+					if ((xp == NULL) ||
+					    strcmp(dp->d_name, xp->d_name))
+						xp = dp;
+					else
+						dp->d_fileno = 0;
+				}
+
+				free(dpv);
+				break;
+			} else {
+				/*
+				 * First pass done: (n) entries counted.
+				 * If the pointer array cannot be had the
+				 * directory is returned with duplicates
+				 * left in place rather than failing.
+				 */
+				dpv = malloc((n+1) * sizeof(struct dirent *));
+				if (dpv == NULL)
+					break;
+			}
+		}
+
+		dirp->dd_len = len;
+		dirp->dd_size = ddptr - dirp->dd_buf;
+	} else
+#endif /* MOUNT_UNION */
+	{
+		dirp->dd_len = incr;
+		dirp->dd_buf = malloc(dirp->dd_len);
+		if (dirp->dd_buf == NULL) {
+			free(dirp);
+			close (fd);
+			return (NULL);
+		}
+		dirp->dd_seek = 0;
+		dirp->dd_loc = 0;
+	}
+
+	dirp->dd_fd = fd;
+
+	/*
+	 * Set up seek point for rewinddir.
+	 */
+	dirp->dd_rewind = telldir(dirp);
+
+	return (dirp);
+}
diff --git a/sys/miscfs/union/union.h b/sys/miscfs/union/union.h
new file mode 100644
index 00000000000..463218ac3ed
--- /dev/null
+++ b/sys/miscfs/union/union.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 1994 The Regents of the University of California.
+ * Copyright (c) 1994 Jan-Simon Pendry.
+ * All rights reserved.
+ *
+ * This code is derived from software donated to Berkeley by
+ * Jan-Simon Pendry.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. 
All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)union.h 8.2 (Berkeley) 2/17/94
+ */
+
+struct union_args {
+	char *target; /* Target of loopback */
+	int mntflags; /* Options on the mount */
+};
+
+#define UNMNT_ABOVE 0x0001 /* Target appears above mount point */
+#define UNMNT_BELOW 0x0002 /* Target appears below mount point */
+#define UNMNT_REPLACE 0x0003 /* Target replaces mount point */
+#define UNMNT_OPMASK 0x0003
+
+struct union_mount {
+	struct vnode *um_uppervp;
+	struct vnode *um_lowervp;
+	struct ucred *um_cred; /* Credentials of user calling mount */
+	int um_cmode; /* cmask from mount process */
+	int um_op; /* Operation mode */
+};
+
+#ifdef KERNEL
+
+/*
+ * UN_DIRMODE is the mode bits used to create a shadow directory.
+ * UN_FILEMODE is the base mode union_vn_create() masks with the
+ * process umask when it creates a shadow file.
+ */
+#define VRWXMODE (VREAD|VWRITE|VEXEC)
+#define VRWMODE (VREAD|VWRITE)
+#define UN_DIRMODE ((VRWXMODE)|(VRWXMODE>>3)|(VRWXMODE>>6))
+#define UN_FILEMODE ((VRWMODE)|(VRWMODE>>3)|(VRWMODE>>6))
+
+/*
+ * A cache of vnode references
+ */
+struct union_node {
+	LIST_ENTRY(union_node) un_cache; /* Hash chain */
+	struct vnode *un_vnode; /* Back pointer */
+	struct vnode *un_uppervp; /* overlaying object */
+	struct vnode *un_lowervp; /* underlying object */
+	struct vnode *un_dirvp; /* Parent dir of uppervp */
+	char *un_path; /* saved component name */
+	int un_hash; /* saved un_path hash value */
+	int un_openl; /* # of opens on lowervp */
+	int un_flags; /* UN_* bits below */
+#ifdef DIAGNOSTIC
+	pid_t un_pid; /* pid that set UN_LOCKED, or -1 */
+#endif
+};
+
+#define UN_WANT 0x01
+#define UN_LOCKED 0x02
+#define UN_ULOCK 0x04 /* Upper node is locked */
+#define UN_KLOCK 0x08 /* Keep upper node locked on vput */
+
+/*
+ * NOTE(review): union_subr.c defines union_vn_close(), not
+ * union_cn_close(); the prototype name below looks like a typo --
+ * confirm against the callers before relying on it.
+ */
+extern int union_allocvp __P((struct vnode **, struct mount *,
+	struct vnode *, struct vnode *,
+	struct componentname *, struct vnode *,
+	struct vnode *));
+extern int union_copyfile __P((struct proc *, struct ucred *,
+	struct vnode *, struct vnode *));
+extern int union_mkshadow __P((struct union_mount *, struct vnode *,
+	struct componentname *, struct vnode **));
+extern int union_vn_create __P((struct vnode **, struct union_node *,
+	struct proc *));
+extern int union_cn_close __P((struct vnode *, int, struct ucred *,
+	struct proc *));
+extern void union_removed_upper __P((struct union_node *un));
+extern struct vnode *union_lowervp __P((struct vnode *));
+extern void union_newlower __P((struct union_node *, struct vnode *));
+extern void union_newupper __P((struct union_node *, struct vnode *));
+
+#define MOUNTTOUNIONMOUNT(mp) ((struct union_mount *)((mp)->mnt_data))
+#define VTOUNION(vp) ((struct union_node *)(vp)->v_data)
+#define UNIONTOV(un) ((un)->un_vnode)
+#define LOWERVP(vp) (VTOUNION(vp)->un_lowervp)
+#define UPPERVP(vp) (VTOUNION(vp)->un_uppervp)
+#define OTHERVP(vp) (UPPERVP(vp) ? UPPERVP(vp) : LOWERVP(vp))
+
+extern int (**union_vnodeop_p)();
+extern struct vfsops union_vfsops;
+#endif /* KERNEL */
diff --git a/sys/miscfs/union/union_subr.c b/sys/miscfs/union/union_subr.c
new file mode 100644
index 00000000000..77947d1dfbe
--- /dev/null
+++ b/sys/miscfs/union/union_subr.c
@@ -0,0 +1,744 @@
+/*
+ * Copyright (c) 1994 Jan-Simon Pendry
+ * Copyright (c) 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Jan-Simon Pendry.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)union_subr.c 8.4 (Berkeley) 2/17/94
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef DIAGNOSTIC
+#include
+#endif
+
+/* must be power of two, otherwise change UNION_HASH() */
+#define NHASH 32
+
+/* unsigned int ... */
+#define UNION_HASH(u, l) \
+	(((((unsigned long) (u)) + ((unsigned long) (l))) >> 8) & (NHASH-1))
+
+static LIST_HEAD(unhead, union_node) unhead[NHASH];
+static int unvplock[NHASH];
+
+/*
+ * Initialise the union_node hash chains and their per-chain lock
+ * words.  Always returns 0.
+ */
+int
+union_init()
+{
+	int i;
+
+	for (i = 0; i < NHASH; i++)
+		LIST_INIT(&unhead[i]);
+	bzero((caddr_t) unvplock, sizeof(unvplock));
+
+	return (0);
+}
+
+/*
+ * Try to lock hash chain (ix).  Returns 0 with the chain locked, or
+ * 1 after sleeping because somebody else held it; in the latter case
+ * the lock is NOT held and the caller must retry.
+ */
+static int
+union_list_lock(ix)
+	int ix;
+{
+
+	if (unvplock[ix] & UN_LOCKED) {
+		unvplock[ix] |= UN_WANT;
+		sleep((caddr_t) &unvplock[ix], PINOD);
+		return (1);
+	}
+
+	unvplock[ix] |= UN_LOCKED;
+
+	return (0);
+}
+
+/*
+ * Unlock hash chain (ix) and wake up anybody sleeping on it.
+ */
+static void
+union_list_unlock(ix)
+	int ix;
+{
+
+	unvplock[ix] &= ~UN_LOCKED;
+
+	if (unvplock[ix] & UN_WANT) {
+		unvplock[ix] &= ~UN_WANT;
+		wakeup((caddr_t) &unvplock[ix]);
+	}
+}
+
+/*
+ * Replace the upper and/or lower vnode of (un) and move the node to
+ * the hash chain that matches the new pair.  A vnode that is
+ * replaced is vrele'd; replacing the lower vnode also discards the
+ * saved path and parent directory.
+ */
+void
+union_updatevp(un, uppervp, lowervp)
+	struct union_node *un;
+	struct vnode *uppervp;
+	struct vnode *lowervp;
+{
+	int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp);
+	int nhash = UNION_HASH(uppervp, lowervp);
+
+	if (ohash != nhash) {
+		/*
+		 * Ensure locking is ordered from lower to higher
+		 * to avoid deadlocks.  The ordering uses its own
+		 * variables: (ohash) and (nhash) must keep meaning
+		 * "old chain" and "new chain", otherwise the node
+		 * is re-inserted on the chain it has just left
+		 * whenever nhash < ohash.
+		 */
+		int lo = (ohash < nhash) ? ohash : nhash;
+		int hi = (ohash < nhash) ? nhash : ohash;
+
+		while (union_list_lock(lo))
+			continue;
+
+		while (union_list_lock(hi))
+			continue;
+
+		LIST_REMOVE(un, un_cache);
+		union_list_unlock(ohash);
+	} else {
+		while (union_list_lock(nhash))
+			continue;
+	}
+
+	if (un->un_lowervp != lowervp) {
+		if (un->un_lowervp) {
+			vrele(un->un_lowervp);
+			if (un->un_path) {
+				free(un->un_path, M_TEMP);
+				un->un_path = 0;
+			}
+			if (un->un_dirvp) {
+				vrele(un->un_dirvp);
+				un->un_dirvp = NULLVP;
+			}
+		}
+		un->un_lowervp = lowervp;
+	}
+
+	if (un->un_uppervp != uppervp) {
+		if (un->un_uppervp)
+			vrele(un->un_uppervp);
+
+		un->un_uppervp = uppervp;
+	}
+
+	if (ohash != nhash)
+		LIST_INSERT_HEAD(&unhead[nhash], un, un_cache);
+
+	union_list_unlock(nhash);
+}
+
+/*
+ * Install (lowervp) as the lower vnode of (un), keeping the upper.
+ */
+void
+union_newlower(un, lowervp)
+	struct union_node *un;
+	struct vnode *lowervp;
+{
+
+	union_updatevp(un, un->un_uppervp, lowervp);
+}
+
+/*
+ * Install (uppervp) as the upper vnode of (un), keeping the lower.
+ */
+void
+union_newupper(un, uppervp)
+	struct union_node *un;
+	struct vnode *uppervp;
+{
+
+	union_updatevp(un, uppervp, un->un_lowervp);
+}
+
+/*
+ * allocate a union_node/vnode pair. the vnode is
+ * referenced and locked. the new vnode is returned
+ * via (vpp). (mp) is the mountpoint of the union filesystem,
+ * (dvp) is the parent directory where the upper layer object
+ * should exist (but doesn't) and (cnp) is the componentname
+ * information which is partially copied to allow the upper
+ * layer object to be created at a later time. (uppervp)
+ * and (lowervp) reference the upper and lower layer objects
+ * being mapped. either, but not both, can be nil.
+ * if supplied, (uppervp) is locked.
+ * the reference is either maintained in the new union_node
+ * object which is allocated, or they are vrele'd.
+ *
+ * all union_nodes are maintained on a singly-linked
+ * list. new nodes are only allocated when they cannot
+ * be found on this list. entries on the list are
+ * removed when the vfs reclaim entry is called. 
+ *
+ * a single lock is kept for the entire list. this is
+ * needed because the getnewvnode() function can block
+ * waiting for a vnode to become free, in which case there
+ * may be more than one process trying to get the same
+ * vnode. this lock is only taken if we are going to
+ * call getnewvnode, since the kernel itself is single-threaded.
+ *
+ * if an entry is found on the list, then call vget() to
+ * take a reference. this is done because there may be
+ * zero references to it and so it needs to be removed from
+ * the vnode free list.
+ *
+ * NOTE(review): the text above speaks of one list and one lock,
+ * but the code below hashes nodes over NHASH chains, each with
+ * its own lock word (unhead[], unvplock[]).
+ */
+int
+union_allocvp(vpp, mp, undvp, dvp, cnp, uppervp, lowervp)
+	struct vnode **vpp;
+	struct mount *mp;
+	struct vnode *undvp;
+	struct vnode *dvp; /* may be null */
+	struct componentname *cnp; /* may be null */
+	struct vnode *uppervp; /* may be null */
+	struct vnode *lowervp; /* may be null */
+{
+	int error;
+	struct union_node *un;
+	struct union_node **pp;
+	struct vnode *xlowervp = NULLVP;
+	int hash;
+	int try;
+
+	if (uppervp == NULLVP && lowervp == NULLVP)
+		panic("union: unidentifiable allocation");
+
+	if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) {
+		xlowervp = lowervp;
+		lowervp = NULLVP;
+	}
+
+loop:
+	for (try = 0; try < 3; try++) {
+		switch (try) {
+		case 0:
+			if (lowervp == NULLVP)
+				continue;
+			hash = UNION_HASH(uppervp, lowervp);
+			break;
+
+		case 1:
+			if (uppervp == NULLVP)
+				continue;
+			hash = UNION_HASH(uppervp, NULLVP);
+			break;
+
+		case 2:
+			if (lowervp == NULLVP)
+				continue;
+			hash = UNION_HASH(NULLVP, lowervp);
+			break;
+		}
+
+		while (union_list_lock(hash))
+			continue;
+
+		for (un = unhead[hash].lh_first; un != 0;
+		    un = un->un_cache.le_next) {
+			if ((un->un_lowervp == lowervp ||
+			    un->un_lowervp == NULLVP) &&
+			    (un->un_uppervp == uppervp ||
+			    un->un_uppervp == NULLVP) &&
+			    (UNIONTOV(un)->v_mount == mp)) {
+				if (vget(UNIONTOV(un), 0)) {
+					union_list_unlock(hash);
+					goto loop;
+				}
+				break;
+			}
+		}
+
+		union_list_unlock(hash);
+
+		if (un)
+			break;
+	}
+
+	if (un) {
+		/*
+		 * Obtain a lock on the union_node.
+		 * uppervp is locked, though un->un_uppervp
+		 * may not be. this doesn't break the locking
+		 * hierarchy since in the case that un->un_uppervp
+		 * is not yet locked it will be vrele'd and replaced
+		 * with uppervp.
+		 */
+
+		if ((dvp != NULLVP) && (uppervp == dvp)) {
+			/*
+			 * Access ``.'', so (un) will already
+			 * be locked. Since this process has
+			 * the lock on (uppervp) no other
+			 * process can hold the lock on (un).
+			 */
+#ifdef DIAGNOSTIC
+			if ((un->un_flags & UN_LOCKED) == 0)
+				panic("union: . not locked");
+			else if (curproc && un->un_pid != curproc->p_pid &&
+			    un->un_pid > -1 && curproc->p_pid > -1)
+				panic("union: allocvp not lock owner");
+#endif
+		} else {
+			if (un->un_flags & UN_LOCKED) {
+				vrele(UNIONTOV(un));
+				un->un_flags |= UN_WANT;
+				sleep((caddr_t) &un->un_flags, PINOD);
+				goto loop;
+			}
+			un->un_flags |= UN_LOCKED;
+
+#ifdef DIAGNOSTIC
+			if (curproc)
+				un->un_pid = curproc->p_pid;
+			else
+				un->un_pid = -1;
+#endif
+		}
+
+		/*
+		 * At this point, the union_node is locked,
+		 * un->un_uppervp may not be locked, and uppervp
+		 * is locked or nil.
+		 */
+
+		/*
+		 * Save information about the upper layer.
+		 */
+		if (uppervp != un->un_uppervp) {
+			union_newupper(un, uppervp);
+		} else if (uppervp) {
+			vrele(uppervp);
+		}
+
+		if (un->un_uppervp) {
+			un->un_flags |= UN_ULOCK;
+			un->un_flags &= ~UN_KLOCK;
+		}
+
+		/*
+		 * Save information about the lower layer.
+		 * This needs to keep track of pathname
+		 * and directory information which union_vn_create
+		 * might need.
+		 *
+		 * NOTE(review): (dvp) is documented as "may be null"
+		 * but is VREF'd below whenever (cnp) is set -- confirm
+		 * callers always pass both or neither.
+		 */
+		if (lowervp != un->un_lowervp) {
+			union_newlower(un, lowervp);
+			if (cnp && (lowervp != NULLVP) &&
+			    (lowervp->v_type == VREG)) {
+				un->un_hash = cnp->cn_hash;
+				un->un_path = malloc(cnp->cn_namelen+1,
+				    M_TEMP, M_WAITOK);
+				bcopy(cnp->cn_nameptr, un->un_path,
+				    cnp->cn_namelen);
+				un->un_path[cnp->cn_namelen] = '\0';
+				VREF(dvp);
+				un->un_dirvp = dvp;
+			}
+		} else if (lowervp) {
+			vrele(lowervp);
+		}
+		/*
+		 * NOTE(review): (xlowervp) is only vrele'd on the
+		 * allocation path further down, not on this cache-hit
+		 * return -- confirm whether that reference leaks.
+		 */
+		*vpp = UNIONTOV(un);
+		return (0);
+	}
+
+	/*
+	 * otherwise lock the vp list while we call getnewvnode
+	 * since that can block.
+	 */
+	hash = UNION_HASH(uppervp, lowervp);
+
+	if (union_list_lock(hash))
+		goto loop;
+
+	error = getnewvnode(VT_UNION, mp, union_vnodeop_p, vpp);
+	if (error) {
+		if (uppervp) {
+			if (dvp == uppervp)
+				vrele(uppervp);
+			else
+				vput(uppervp);
+		}
+		if (lowervp)
+			vrele(lowervp);
+
+		goto out;
+	}
+
+	MALLOC((*vpp)->v_data, void *, sizeof(struct union_node),
+	    M_TEMP, M_WAITOK);
+
+	if (uppervp)
+		(*vpp)->v_type = uppervp->v_type;
+	else
+		(*vpp)->v_type = lowervp->v_type;
+	un = VTOUNION(*vpp);
+	un->un_vnode = *vpp;
+	un->un_uppervp = uppervp;
+	un->un_lowervp = lowervp;
+	un->un_openl = 0;
+	un->un_flags = UN_LOCKED;
+	if (un->un_uppervp)
+		un->un_flags |= UN_ULOCK;
+#ifdef DIAGNOSTIC
+	if (curproc)
+		un->un_pid = curproc->p_pid;
+	else
+		un->un_pid = -1;
+#endif
+	if (cnp && (lowervp != NULLVP) && (lowervp->v_type == VREG)) {
+		un->un_hash = cnp->cn_hash;
+		un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK);
+		bcopy(cnp->cn_nameptr, un->un_path, cnp->cn_namelen);
+		un->un_path[cnp->cn_namelen] = '\0';
+		VREF(dvp);
+		un->un_dirvp = dvp;
+	} else {
+		un->un_hash = 0;
+		un->un_path = 0;
+		un->un_dirvp = 0;
+	}
+
+	LIST_INSERT_HEAD(&unhead[hash], un, un_cache);
+
+	if (xlowervp)
+		vrele(xlowervp);
+
+out:
+	union_list_unlock(hash);
+
+	return (error);
+}
+
+int
+union_freevp(vp)
+	struct vnode *vp;
+{
+	struct union_node *un = VTOUNION(vp);
+
+	LIST_REMOVE(un, un_cache);
+
+	if (un->un_uppervp)
+		vrele(un->un_uppervp);
+	if 
(un->un_lowervp)
+		vrele(un->un_lowervp);
+	if (un->un_dirvp)
+		vrele(un->un_dirvp);
+	if (un->un_path)
+		free(un->un_path, M_TEMP);
+
+	FREE(vp->v_data, M_TEMP);
+	vp->v_data = 0;
+
+	return (0);
+}
+
+/*
+ * copyfile. copy the vnode (fvp) to the vnode (tvp)
+ * using a sequence of reads and writes. both (fvp)
+ * and (tvp) are locked on entry and exit.
+ *
+ * returns 0 once a read comes back empty, otherwise the
+ * first error reported by VOP_READ or VOP_WRITE.
+ */
+int
+union_copyfile(p, cred, fvp, tvp)
+	struct proc *p;
+	struct ucred *cred;
+	struct vnode *fvp;
+	struct vnode *tvp;
+{
+	char *buf;
+	struct uio uio;
+	struct iovec iov;
+	int error = 0;
+
+	/*
+	 * strategy:
+	 * allocate a buffer of size MAXBSIZE.
+	 * loop doing reads and writes, keeping track
+	 * of the current uio offset.
+	 * give up at the first sign of trouble.
+	 */
+
+	uio.uio_procp = p;
+	uio.uio_segflg = UIO_SYSSPACE;
+	uio.uio_offset = 0;
+
+	VOP_UNLOCK(fvp); /* XXX */
+	LEASE_CHECK(fvp, p, cred, LEASE_READ);
+	VOP_LOCK(fvp); /* XXX */
+	VOP_UNLOCK(tvp); /* XXX */
+	LEASE_CHECK(tvp, p, cred, LEASE_WRITE);
+	VOP_LOCK(tvp); /* XXX */
+
+	buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);
+
+	/* ugly loop follows... */
+	do {
+		/* remember where this chunk started so the write lands there */
+		off_t offset = uio.uio_offset;
+
+		uio.uio_iov = &iov;
+		uio.uio_iovcnt = 1;
+		iov.iov_base = buf;
+		iov.iov_len = MAXBSIZE;
+		uio.uio_resid = iov.iov_len;
+		uio.uio_rw = UIO_READ;
+		error = VOP_READ(fvp, &uio, 0, cred);
+
+		if (error == 0) {
+			uio.uio_iov = &iov;
+			uio.uio_iovcnt = 1;
+			iov.iov_base = buf;
+			/* bytes actually read = requested - residual */
+			iov.iov_len = MAXBSIZE - uio.uio_resid;
+			uio.uio_offset = offset;
+			uio.uio_rw = UIO_WRITE;
+			uio.uio_resid = iov.iov_len;
+
+			/* nothing read: end of file, copy complete */
+			if (uio.uio_resid == 0)
+				break;
+
+			do {
+				error = VOP_WRITE(tvp, &uio, 0, cred);
+			} while ((uio.uio_resid > 0) && (error == 0));
+		}
+
+	} while (error == 0);
+
+	free(buf, M_TEMP);
+	return (error);
+}
+
+/*
+ * Create a shadow directory in the upper layer.
+ * The new vnode is returned locked.
+ *
+ * (um) points to the union mount structure for access to
+ * the mounting process's credentials.
+ * (dvp) is the directory in which to create the shadow directory.
+ * it is unlocked on entry and exit.
+ * (cnp) is the componentname to be created.
+ * (vpp) is the returned newly created shadow directory, which
+ * is returned locked.
+ */
+int
+union_mkshadow(um, dvp, cnp, vpp)
+	struct union_mount *um;
+	struct vnode *dvp;
+	struct componentname *cnp;
+	struct vnode **vpp;
+{
+	int error;
+	struct vattr va;
+	struct proc *p = cnp->cn_proc;
+	struct componentname cn;
+
+	/*
+	 * policy: when creating the shadow directory in the
+	 * upper layer, create it owned by the user who did
+	 * the mount, group from parent directory, and mode
+	 * 777 modified by umask (ie mostly identical to the
+	 * mkdir syscall). (jsp, kb)
+	 */
+
+	/*
+	 * A new componentname structure must be faked up because
+	 * there is no way to know where the upper level cnp came
+	 * from or what it is being used for. This must duplicate
+	 * some of the work done by NDINIT, some of the work done
+	 * by namei, some of the work done by lookup and some of
+	 * the work done by VOP_LOOKUP when given a CREATE flag.
+	 * Conclusion: Horrible.
+	 *
+	 * The pathname buffer will be FREEed by VOP_MKDIR. 
+	 */
+	cn.cn_pnbuf = malloc(cnp->cn_namelen+1, M_NAMEI, M_WAITOK);
+	bcopy(cnp->cn_nameptr, cn.cn_pnbuf, cnp->cn_namelen);
+	cn.cn_pnbuf[cnp->cn_namelen] = '\0';
+
+	cn.cn_nameiop = CREATE;
+	cn.cn_flags = (LOCKPARENT|HASBUF|SAVENAME|SAVESTART|ISLASTCN);
+	cn.cn_proc = cnp->cn_proc;
+	if (um->um_op == UNMNT_ABOVE)
+		cn.cn_cred = cnp->cn_cred;
+	else
+		cn.cn_cred = um->um_cred;
+	cn.cn_nameptr = cn.cn_pnbuf;
+	cn.cn_namelen = cnp->cn_namelen;
+	cn.cn_hash = cnp->cn_hash;
+	cn.cn_consume = cnp->cn_consume;
+
+	VREF(dvp);
+	if (error = relookup(dvp, vpp, &cn))
+		return (error);
+	vrele(dvp);
+
+	if (*vpp) {
+		VOP_ABORTOP(dvp, &cn);
+		VOP_UNLOCK(dvp);
+		vrele(*vpp);
+		*vpp = NULLVP;
+		return (EEXIST);
+	}
+
+	VATTR_NULL(&va);
+	va.va_type = VDIR;
+	va.va_mode = um->um_cmode;
+
+	/* LEASE_CHECK: dvp is locked */
+	LEASE_CHECK(dvp, p, p->p_ucred, LEASE_WRITE);
+
+	error = VOP_MKDIR(dvp, vpp, &cn, &va);
+	return (error);
+}
+
+/*
+ * union_vn_create: creates and opens a new shadow file
+ * on the upper union layer. this function is similar
+ * in spirit to calling vn_open but it avoids calling namei().
+ * the problem with calling namei is that a) it locks too many
+ * things, and b) it doesn't start at the "right" directory,
+ * whereas relookup is told where to start.
+ *
+ * on success the new vnode, opened for writing, is stored in
+ * (*vpp) and 0 is returned. otherwise (*vpp) is NULLVP and the
+ * return value is EEXIST if the name already exists in the
+ * upper layer, or the error from relookup, VOP_CREATE or
+ * VOP_OPEN.
+ */
+int
+union_vn_create(vpp, un, p)
+	struct vnode **vpp;
+	struct union_node *un;
+	struct proc *p;
+{
+	struct vnode *vp;
+	struct ucred *cred = p->p_ucred;
+	struct vattr vat;
+	struct vattr *vap = &vat;
+	int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL);
+	int error;
+	int cmode = UN_FILEMODE & ~p->p_fd->fd_cmask;
+	struct componentname cn;
+
+	*vpp = NULLVP;
+
+	/*
+	 * Build a new componentname structure (for the same
+	 * reasons outlined in union_mkshadow).
+	 * The difference here is that the file is owned by
+	 * the current user, rather than by the person who
+	 * did the mount, since the current user needs to be
+	 * able to write the file (that's why it is being
+	 * copied in the first place).
+	 *
+	 * The buffer must hold the name plus its terminating
+	 * NUL, because the bcopy below copies cn_namelen+1 bytes.
+	 */
+	cn.cn_namelen = strlen(un->un_path);
+	cn.cn_pnbuf = (caddr_t) malloc(cn.cn_namelen+1, M_NAMEI, M_WAITOK);
+	bcopy(un->un_path, cn.cn_pnbuf, cn.cn_namelen+1);
+	cn.cn_nameiop = CREATE;
+	cn.cn_flags = (LOCKPARENT|HASBUF|SAVENAME|SAVESTART|ISLASTCN);
+	cn.cn_proc = p;
+	cn.cn_cred = p->p_ucred;
+	cn.cn_nameptr = cn.cn_pnbuf;
+	cn.cn_hash = un->un_hash;
+	cn.cn_consume = 0;
+
+	VREF(un->un_dirvp);
+	if (error = relookup(un->un_dirvp, &vp, &cn))
+		return (error);
+	vrele(un->un_dirvp);
+
+	if (vp) {
+		VOP_ABORTOP(un->un_dirvp, &cn);
+		if (un->un_dirvp == vp)
+			vrele(un->un_dirvp);
+		else
+			vput(un->un_dirvp);
+		vrele(vp);
+		return (EEXIST);
+	}
+
+	/*
+	 * Good - there was no race to create the file
+	 * so go ahead and create it. The permissions
+	 * on the file will be 0666 modified by the
+	 * current user's umask. Access to the file, while
+	 * it is unioned, will require access to the top *and*
+	 * bottom files. Access when not unioned will simply
+	 * require access to the top-level file.
+	 * TODO: confirm choice of access permissions.
+	 */
+	VATTR_NULL(vap);
+	vap->va_type = VREG;
+	vap->va_mode = cmode;
+	LEASE_CHECK(un->un_dirvp, p, cred, LEASE_WRITE);
+	if (error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap))
+		return (error);
+
+	if (error = VOP_OPEN(vp, fmode, cred, p)) {
+		vput(vp);
+		return (error);
+	}
+
+	vp->v_writecount++;
+	*vpp = vp;
+	return (0);
+}
+
+/*
+ * Close a vnode opened by union_vn_create: drop the write count
+ * taken there if (fmode) has FWRITE set, then pass the close down
+ * with the caller's credentials and process, mirroring the
+ * VOP_OPEN(vp, fmode, cred, p) call above.
+ */
+int
+union_vn_close(vp, fmode, cred, p)
+	struct vnode *vp;
+	int fmode;
+	struct ucred *cred;
+	struct proc *p;
+{
+	if (fmode & FWRITE)
+		--vp->v_writecount;
+	return (VOP_CLOSE(vp, fmode, cred, p));
+}
+
+/*
+ * The upper layer object of (un) has gone away: unlock it if this
+ * node holds its lock (UN_ULOCK), then detach it from the node.
+ */
+void
+union_removed_upper(un)
+	struct union_node *un;
+{
+	if (un->un_flags & UN_ULOCK) {
+		un->un_flags &= ~UN_ULOCK;
+		VOP_UNLOCK(un->un_uppervp);
+	}
+
+	union_newupper(un, NULLVP);
+}
+
+/*
+ * Return the lower vnode of (vp). When a lower vnode exists and has
+ * the same type as (vp) a reference is taken with vget() first, and
+ * NULLVP is returned if that vget() fails. In every other case
+ * un_lowervp (possibly nil) is returned without a new reference.
+ */
+struct vnode *
+union_lowervp(vp)
+	struct vnode *vp;
+{
+	struct union_node *un = VTOUNION(vp);
+
+	if (un->un_lowervp && (vp->v_type == un->un_lowervp->v_type)) {
+		if (vget(un->un_lowervp, 0))
+			return (NULLVP);
+	}
+
+	return (un->un_lowervp);
+}
diff --git a/sys/miscfs/union/union_vfsops.c b/sys/miscfs/union/union_vfsops.c
new file mode 100644
index 00000000000..9fa27460e3d
--- /dev/null
+++ b/sys/miscfs/union/union_vfsops.c
@@ -0,0 +1,550 @@
+/*
+ * Copyright (c) 1994 The Regents of the University of California.
+ * Copyright (c) 1994 Jan-Simon Pendry.
+ * All rights reserved.
+ *
+ * This code is derived from software donated to Berkeley by
+ * Jan-Simon Pendry.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)union_vfsops.c 8.7 (Berkeley) 3/5/94 + */ + +/* + * Union Layer + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Mount union filesystem + */ +int +union_mount(mp, path, data, ndp, p) + struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + int error = 0; + struct union_args args; + struct vnode *lowerrootvp = NULLVP; + struct vnode *upperrootvp = NULLVP; + struct union_mount *um; + struct ucred *cred = 0; + struct ucred *scred; + struct vattr va; + char *cp; + int len; + u_int size; + +#ifdef UNION_DIAGNOSTIC + printf("union_mount(mp = %x)\n", mp); +#endif + + /* + * Update is a no-op + */ + if (mp->mnt_flag & MNT_UPDATE) { + /* + * Need to provide. + * 1. a way to convert between rdonly and rdwr mounts. + * 2. support for nfs exports. + */ + error = EOPNOTSUPP; + goto bad; + } + + /* + * Take a copy of the process's credentials. This isn't + * quite right since the euid will always be zero and we + * want to get the "real" users credentials. So fix up + * the uid field after taking the copy. + */ + cred = crdup(p->p_ucred); + cred->cr_uid = p->p_cred->p_ruid; + + /* + * Ensure the *real* user has write permission on the + * mounted-on directory. This allows the mount_union + * command to be made setuid root so allowing anyone + * to do union mounts onto any directory on which they + * have write permission and which they also own. + */ + error = VOP_GETATTR(mp->mnt_vnodecovered, &va, cred, p); + if (error) + goto bad; + if ((va.va_uid != cred->cr_uid) && + (cred->cr_uid != 0)) { + error = EACCES; + goto bad; + } + error = VOP_ACCESS(mp->mnt_vnodecovered, VWRITE, cred, p); + if (error) + goto bad; + + /* + * Get argument + */ + if (error = copyin(data, (caddr_t)&args, sizeof(struct union_args))) + goto bad; + + lowerrootvp = mp->mnt_vnodecovered; + VREF(lowerrootvp); + + /* + * Find upper node. 
Use the real process credentials, + * not the effective ones since this will have come + * through a setuid process (mount_union). All this + * messing around with permissions is entirely bogus + * and should be removed by allowing any user straight + * past the mount system call. + */ + scred = p->p_ucred; + p->p_ucred = cred; + NDINIT(ndp, LOOKUP, FOLLOW|WANTPARENT, + UIO_USERSPACE, args.target, p); + p->p_ucred = scred; + + if (error = namei(ndp)) + goto bad; + + upperrootvp = ndp->ni_vp; + vrele(ndp->ni_dvp); + ndp->ni_dvp = NULL; + + if (upperrootvp->v_type != VDIR) { + error = EINVAL; + goto bad; + } + + um = (struct union_mount *) malloc(sizeof(struct union_mount), + M_UFSMNT, M_WAITOK); /* XXX */ + + /* + * Keep a held reference to the target vnodes. + * They are vrele'd in union_unmount. + * + * Depending on the _BELOW flag, the filesystems are + * viewed in a different order. In effect, this is the + * same as providing a mount under option to the mount syscall. + */ + + um->um_op = args.mntflags & UNMNT_OPMASK; + switch (um->um_op) { + case UNMNT_ABOVE: + um->um_lowervp = lowerrootvp; + um->um_uppervp = upperrootvp; + break; + + case UNMNT_BELOW: + um->um_lowervp = upperrootvp; + um->um_uppervp = lowerrootvp; + break; + + case UNMNT_REPLACE: + vrele(lowerrootvp); + lowerrootvp = NULLVP; + um->um_uppervp = upperrootvp; + um->um_lowervp = lowerrootvp; + break; + + default: + error = EINVAL; + goto bad; + } + + um->um_cred = cred; + um->um_cmode = UN_DIRMODE &~ p->p_fd->fd_cmask; + + /* + * Depending on what you think the MNT_LOCAL flag might mean, + * you may want the && to be || on the conditional below. + * At the moment it has been defined that the filesystem is + * only local if it is all local, ie the MNT_LOCAL flag implies + * that the entire namespace is local. If you think the MNT_LOCAL + * flag implies that some of the files might be stored locally + * then you will want to change the conditional. 
+ */ + if (um->um_op == UNMNT_ABOVE) { + if (((um->um_lowervp == NULLVP) || + (um->um_lowervp->v_mount->mnt_flag & MNT_LOCAL)) && + (um->um_uppervp->v_mount->mnt_flag & MNT_LOCAL)) + mp->mnt_flag |= MNT_LOCAL; + } + + /* + * Copy in the upper layer's RDONLY flag. This is for the benefit + * of lookup() which explicitly checks the flag, rather than asking + * the filesystem for it's own opinion. This means, that an update + * mount of the underlying filesystem to go from rdonly to rdwr + * will leave the unioned view as read-only. + */ + mp->mnt_flag |= (um->um_uppervp->v_mount->mnt_flag & MNT_RDONLY); + + /* + * This is a user mount. Privilege check for unmount + * will be done in union_unmount. + */ + mp->mnt_flag |= MNT_USER; + + mp->mnt_data = (qaddr_t) um; + getnewfsid(mp, MOUNT_UNION); + + (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN - 1, &size); + bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); + + switch (um->um_op) { + case UNMNT_ABOVE: + cp = ""; + break; + case UNMNT_BELOW: + cp = ""; + break; + case UNMNT_REPLACE: + cp = ""; + break; + } + len = strlen(cp); + bcopy(cp, mp->mnt_stat.f_mntfromname, len); + + cp = mp->mnt_stat.f_mntfromname + len; + len = MNAMELEN - len; + + (void) copyinstr(args.target, cp, len - 1, &size); + bzero(cp + size, len - size); + +#ifdef UNION_DIAGNOSTIC + printf("union_mount: from %s, on %s\n", + mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname); +#endif + return (0); + +bad: + if (cred) + crfree(cred); + if (upperrootvp) + vrele(upperrootvp); + if (lowerrootvp) + vrele(lowerrootvp); + return (error); +} + +/* + * VFS start. Nothing needed here - the start routine + * on the underlying filesystem(s) will have been called + * when that filesystem was mounted. 
+ */ +int +union_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + + return (0); +} + +/* + * Free reference to union layer + */ +int +union_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + struct union_mount *um = MOUNTTOUNIONMOUNT(mp); + struct vnode *um_rootvp; + int error; + int flags = 0; + extern int doforce; + +#ifdef UNION_DIAGNOSTIC + printf("union_unmount(mp = %x)\n", mp); +#endif + + /* only the mounter, or superuser can unmount */ + if ((p->p_cred->p_ruid != um->um_cred->cr_uid) && + (error = suser(p->p_ucred, &p->p_acflag))) + return (error); + + if (mntflags & MNT_FORCE) { + /* union can never be rootfs so don't check for it */ + if (!doforce) + return (EINVAL); + flags |= FORCECLOSE; + } + + if (error = union_root(mp, &um_rootvp)) + return (error); + if (um_rootvp->v_usecount > 1) { + vput(um_rootvp); + return (EBUSY); + } + if (error = vflush(mp, um_rootvp, flags)) { + vput(um_rootvp); + return (error); + } + +#ifdef UNION_DIAGNOSTIC + vprint("alias root of lower", um_rootvp); +#endif + /* + * Discard references to upper and lower target vnodes. + */ + if (um->um_lowervp) + vrele(um->um_lowervp); + vrele(um->um_uppervp); + crfree(um->um_cred); + /* + * Release reference on underlying root vnode + */ + vput(um_rootvp); + /* + * And blow it away for future re-use + */ + vgone(um_rootvp); + /* + * Finally, throw away the union_mount structure + */ + free(mp->mnt_data, M_UFSMNT); /* XXX */ + mp->mnt_data = 0; + return (0); +} + +int +union_root(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + struct union_mount *um = MOUNTTOUNIONMOUNT(mp); + int error; + int loselock; + +#ifdef UNION_DIAGNOSTIC + printf("union_root(mp = %x, lvp = %x, uvp = %x)\n", mp, + um->um_lowervp, + um->um_uppervp); +#endif + + /* + * Return locked reference to root. 
+ */ + VREF(um->um_uppervp); + if ((um->um_op == UNMNT_BELOW) && + VOP_ISLOCKED(um->um_uppervp)) { + loselock = 1; + } else { + VOP_LOCK(um->um_uppervp); + loselock = 0; + } + if (um->um_lowervp) + VREF(um->um_lowervp); + error = union_allocvp(vpp, mp, + (struct vnode *) 0, + (struct vnode *) 0, + (struct componentname *) 0, + um->um_uppervp, + um->um_lowervp); + + if (error) { + if (!loselock) + VOP_UNLOCK(um->um_uppervp); + vrele(um->um_uppervp); + if (um->um_lowervp) + vrele(um->um_lowervp); + } else { + (*vpp)->v_flag |= VROOT; + if (loselock) + VTOUNION(*vpp)->un_flags &= ~UN_ULOCK; + } + + return (error); +} + +int +union_quotactl(mp, cmd, uid, arg, p) + struct mount *mp; + int cmd; + uid_t uid; + caddr_t arg; + struct proc *p; +{ + + return (EOPNOTSUPP); +} + +int +union_statfs(mp, sbp, p) + struct mount *mp; + struct statfs *sbp; + struct proc *p; +{ + int error; + struct union_mount *um = MOUNTTOUNIONMOUNT(mp); + struct statfs mstat; + int lbsize; + +#ifdef UNION_DIAGNOSTIC + printf("union_statfs(mp = %x, lvp = %x, uvp = %x)\n", mp, + um->um_lowervp, + um->um_uppervp); +#endif + + bzero(&mstat, sizeof(mstat)); + + if (um->um_lowervp) { + error = VFS_STATFS(um->um_lowervp->v_mount, &mstat, p); + if (error) + return (error); + } + + /* now copy across the "interesting" information and fake the rest */ +#if 0 + sbp->f_type = mstat.f_type; + sbp->f_flags = mstat.f_flags; + sbp->f_bsize = mstat.f_bsize; + sbp->f_iosize = mstat.f_iosize; +#endif + lbsize = mstat.f_bsize; + sbp->f_blocks = mstat.f_blocks; + sbp->f_bfree = mstat.f_bfree; + sbp->f_bavail = mstat.f_bavail; + sbp->f_files = mstat.f_files; + sbp->f_ffree = mstat.f_ffree; + + error = VFS_STATFS(um->um_uppervp->v_mount, &mstat, p); + if (error) + return (error); + + sbp->f_type = MOUNT_UNION; + sbp->f_flags = mstat.f_flags; + sbp->f_bsize = mstat.f_bsize; + sbp->f_iosize = mstat.f_iosize; + + /* + * if the lower and upper blocksizes differ, then frig the + * block counts so that the sizes reported by df 
make some + * kind of sense. none of this makes sense though. + */ + + if (mstat.f_bsize != lbsize) { + sbp->f_blocks = sbp->f_blocks * lbsize / mstat.f_bsize; + sbp->f_bfree = sbp->f_bfree * lbsize / mstat.f_bsize; + sbp->f_bavail = sbp->f_bavail * lbsize / mstat.f_bsize; + } + sbp->f_blocks += mstat.f_blocks; + sbp->f_bfree += mstat.f_bfree; + sbp->f_bavail += mstat.f_bavail; + sbp->f_files += mstat.f_files; + sbp->f_ffree += mstat.f_ffree; + + if (sbp != &mp->mnt_stat) { + bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); + bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); + bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); + } + return (0); +} + +int +union_sync(mp, waitfor, cred, p) + struct mount *mp; + int waitfor; + struct ucred *cred; + struct proc *p; +{ + + /* + * XXX - Assumes no data cached at union layer. + */ + return (0); +} + +int +union_vget(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + + return (EOPNOTSUPP); +} + +int +union_fhtovp(mp, fidp, nam, vpp, exflagsp, credanonp) + struct mount *mp; + struct fid *fidp; + struct mbuf *nam; + struct vnode **vpp; + int *exflagsp; + struct ucred **credanonp; +{ + + return (EOPNOTSUPP); +} + +int +union_vptofh(vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + + return (EOPNOTSUPP); +} + +int union_init __P((void)); + +struct vfsops union_vfsops = { + union_mount, + union_start, + union_unmount, + union_root, + union_quotactl, + union_statfs, + union_sync, + union_vget, + union_fhtovp, + union_vptofh, + union_init, +}; diff --git a/sys/miscfs/union/union_vnops.c b/sys/miscfs/union/union_vnops.c new file mode 100644 index 00000000000..96327b0922d --- /dev/null +++ b/sys/miscfs/union/union_vnops.c @@ -0,0 +1,1495 @@ +/* + * Copyright (c) 1992, 1993, 1994 The Regents of the University of California. + * Copyright (c) 1992, 1993, 1994 Jan-Simon Pendry. + * All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * Jan-Simon Pendry. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)union_vnops.c 8.6 (Berkeley) 2/17/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define FIXUP(un) { \ + if (((un)->un_flags & UN_ULOCK) == 0) { \ + union_fixup(un); \ + } \ +} + +static void +union_fixup(un) + struct union_node *un; +{ + + VOP_LOCK(un->un_uppervp); + un->un_flags |= UN_ULOCK; +} + +static int +union_lookup1(udvp, dvp, vpp, cnp) + struct vnode *udvp; + struct vnode *dvp; + struct vnode **vpp; + struct componentname *cnp; +{ + int error; + struct vnode *tdvp; + struct mount *mp; + + /* + * If stepping up the directory tree, check for going + * back across the mount point, in which case do what + * lookup would do by stepping back down the mount + * hierarchy. + */ + if (cnp->cn_flags & ISDOTDOT) { + for (;;) { + /* + * Don't do the NOCROSSMOUNT check + * at this level. By definition, + * union fs deals with namespaces, not + * filesystems. + */ + if ((dvp->v_flag & VROOT) == 0) + break; + + tdvp = dvp; + dvp = dvp->v_mount->mnt_vnodecovered; + vput(tdvp); + VREF(dvp); + VOP_LOCK(dvp); + } + } + + error = VOP_LOOKUP(dvp, &tdvp, cnp); + if (error) + return (error); + + /* + * The parent directory will have been unlocked, unless lookup + * found the last component. In which case, re-lock the node + * here to allow it to be unlocked again (phew) in union_lookup. + */ + if (dvp != tdvp && !(cnp->cn_flags & ISLASTCN)) + VOP_LOCK(dvp); + + dvp = tdvp; + + /* + * Lastly check if the current node is a mount point in + * which case walk up the mount hierarchy making sure not to + * bump into the root of the mount tree (ie. dvp != udvp). 
+ */ + while (dvp != udvp && (dvp->v_type == VDIR) && + (mp = dvp->v_mountedhere)) { + + if (mp->mnt_flag & MNT_MLOCK) { + mp->mnt_flag |= MNT_MWAIT; + sleep((caddr_t) mp, PVFS); + continue; + } + + if (error = VFS_ROOT(mp, &tdvp)) { + vput(dvp); + return (error); + } + + vput(dvp); + dvp = tdvp; + } + + *vpp = dvp; + return (0); +} + +int +union_lookup(ap) + struct vop_lookup_args /* { + struct vnodeop_desc *a_desc; + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + int error; + int uerror, lerror; + struct vnode *uppervp, *lowervp; + struct vnode *upperdvp, *lowerdvp; + struct vnode *dvp = ap->a_dvp; + struct union_node *dun = VTOUNION(dvp); + struct componentname *cnp = ap->a_cnp; + int lockparent = cnp->cn_flags & LOCKPARENT; + int rdonly = cnp->cn_flags & RDONLY; + struct union_mount *um = MOUNTTOUNIONMOUNT(dvp->v_mount); + struct ucred *saved_cred; + + cnp->cn_flags |= LOCKPARENT; + + upperdvp = dun->un_uppervp; + lowerdvp = dun->un_lowervp; + uppervp = NULLVP; + lowervp = NULLVP; + + /* + * do the lookup in the upper level. + * if that level comsumes additional pathnames, + * then assume that something special is going + * on and just return that vnode. + */ + if (upperdvp) { + FIXUP(dun); + uerror = union_lookup1(um->um_uppervp, upperdvp, + &uppervp, cnp); + /*if (uppervp == upperdvp) + dun->un_flags |= UN_KLOCK;*/ + + if (cnp->cn_consume != 0) { + *ap->a_vpp = uppervp; + if (!lockparent) + cnp->cn_flags &= ~LOCKPARENT; + return (uerror); + } + } else { + uerror = ENOENT; + } + + /* + * in a similar way to the upper layer, do the lookup + * in the lower layer. this time, if there is some + * component magic going on, then vput whatever we got + * back from the upper layer and return the lower vnode + * instead. + */ + if (lowerdvp) { + int nameiop; + + VOP_LOCK(lowerdvp); + + /* + * Only do a LOOKUP on the bottom node, since + * we won't be making changes to it anyway. 
+ */ + nameiop = cnp->cn_nameiop; + cnp->cn_nameiop = LOOKUP; + if (um->um_op == UNMNT_BELOW) { + saved_cred = cnp->cn_cred; + cnp->cn_cred = um->um_cred; + } + lerror = union_lookup1(um->um_lowervp, lowerdvp, + &lowervp, cnp); + if (um->um_op == UNMNT_BELOW) + cnp->cn_cred = saved_cred; + cnp->cn_nameiop = nameiop; + + if (lowervp != lowerdvp) + VOP_UNLOCK(lowerdvp); + + if (cnp->cn_consume != 0) { + if (uppervp) { + if (uppervp == upperdvp) + vrele(uppervp); + else + vput(uppervp); + uppervp = NULLVP; + } + *ap->a_vpp = lowervp; + if (!lockparent) + cnp->cn_flags &= ~LOCKPARENT; + return (lerror); + } + } else { + lerror = ENOENT; + } + + if (!lockparent) + cnp->cn_flags &= ~LOCKPARENT; + + /* + * at this point, we have uerror and lerror indicating + * possible errors with the lookups in the upper and lower + * layers. additionally, uppervp and lowervp are (locked) + * references to existing vnodes in the upper and lower layers. + * + * there are now three cases to consider. + * 1. if both layers returned an error, then return whatever + * error the upper layer generated. + * + * 2. if the top layer failed and the bottom layer succeeded + * then two subcases occur. + * a. the bottom vnode is not a directory, in which + * case just return a new union vnode referencing + * an empty top layer and the existing bottom layer. + * b. the bottom vnode is a directory, in which case + * create a new directory in the top-level and + * continue as in case 3. + * + * 3. if the top layer succeeded then return a new union + * vnode referencing whatever the new top layer and + * whatever the bottom layer returned. + */ + + *ap->a_vpp = NULLVP; + + /* case 1. */ + if ((uerror != 0) && (lerror != 0)) { + return (uerror); + } + + /* case 2. */ + if (uerror != 0 /* && (lerror == 0) */ ) { + if (lowervp->v_type == VDIR) { /* case 2b. 
*/ + dun->un_flags &= ~UN_ULOCK; + VOP_UNLOCK(upperdvp); + uerror = union_mkshadow(um, upperdvp, cnp, &uppervp); + VOP_LOCK(upperdvp); + dun->un_flags |= UN_ULOCK; + + if (uerror) { + if (lowervp) { + vput(lowervp); + lowervp = NULLVP; + } + return (uerror); + } + } + } + + if (lowervp) + VOP_UNLOCK(lowervp); + + error = union_allocvp(ap->a_vpp, dvp->v_mount, dvp, upperdvp, cnp, + uppervp, lowervp); + + if (error) { + if (uppervp) + vput(uppervp); + if (lowervp) + vrele(lowervp); + } else { + if (*ap->a_vpp != dvp) + if (!lockparent || !(cnp->cn_flags & ISLASTCN)) + VOP_UNLOCK(dvp); + } + + return (error); +} + +int +union_create(ap) + struct vop_create_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + struct union_node *un = VTOUNION(ap->a_dvp); + struct vnode *dvp = un->un_uppervp; + + if (dvp) { + int error; + struct vnode *vp; + + FIXUP(un); + + VREF(dvp); + un->un_flags |= UN_KLOCK; + vput(ap->a_dvp); + error = VOP_CREATE(dvp, &vp, ap->a_cnp, ap->a_vap); + if (error) + return (error); + + error = union_allocvp( + ap->a_vpp, + ap->a_dvp->v_mount, + ap->a_dvp, + NULLVP, + ap->a_cnp, + vp, + NULLVP); + if (error) + vput(vp); + return (error); + } + + vput(ap->a_dvp); + return (EROFS); +} + +int +union_mknod(ap) + struct vop_mknod_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + struct union_node *un = VTOUNION(ap->a_dvp); + struct vnode *dvp = un->un_uppervp; + + if (dvp) { + int error; + struct vnode *vp; + + FIXUP(un); + + VREF(dvp); + un->un_flags |= UN_KLOCK; + vput(ap->a_dvp); + error = VOP_MKNOD(dvp, &vp, ap->a_cnp, ap->a_vap); + if (error) + return (error); + + if (vp) { + error = union_allocvp( + ap->a_vpp, + ap->a_dvp->v_mount, + ap->a_dvp, + NULLVP, + ap->a_cnp, + vp, + NULLVP); + if (error) + vput(vp); + } + return (error); + } + + vput(ap->a_dvp); + return (EROFS); +} + +int +union_open(ap) + 
struct vop_open_args /* { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct union_node *un = VTOUNION(ap->a_vp); + struct vnode *tvp; + int mode = ap->a_mode; + struct ucred *cred = ap->a_cred; + struct proc *p = ap->a_p; + int error; + + /* + * If there is an existing upper vp then simply open that. + */ + tvp = un->un_uppervp; + if (tvp == NULLVP) { + /* + * If the lower vnode is being opened for writing, then + * copy the file contents to the upper vnode and open that, + * otherwise can simply open the lower vnode. + */ + tvp = un->un_lowervp; + if ((ap->a_mode & FWRITE) && (tvp->v_type == VREG)) { + struct vnode *vp; + int i; + + /* + * Open the named file in the upper layer. Note that + * the file may have come into existence *since* the + * lookup was done, since the upper layer may really + * be a loopback mount of some other filesystem... + * so open the file with exclusive create and barf if + * it already exists. + * XXX - perhaps should re-lookup the node (once more + * with feeling) and simply open that. Who knows. + */ + error = union_vn_create(&vp, un, p); + if (error) + return (error); + + /* at this point, uppervp is locked */ + union_newupper(un, vp); + un->un_flags |= UN_ULOCK; + + /* + * Now, if the file is being opened with truncation, + * then the (new) upper vnode is ready to fly, + * otherwise the data from the lower vnode must be + * copied to the upper layer first. This only works + * for regular files (check is made above). 
+ */ + if ((mode & O_TRUNC) == 0) { + /* + * XXX - should not ignore errors + * from VOP_CLOSE + */ + VOP_LOCK(tvp); + error = VOP_OPEN(tvp, FREAD, cred, p); + if (error == 0) { + error = union_copyfile(p, cred, + tvp, un->un_uppervp); + VOP_UNLOCK(tvp); + (void) VOP_CLOSE(tvp, FREAD); + } else { + VOP_UNLOCK(tvp); + } + +#ifdef UNION_DIAGNOSTIC + if (!error) + uprintf("union: copied up %s\n", + un->un_path); +#endif + } + + un->un_flags &= ~UN_ULOCK; + VOP_UNLOCK(un->un_uppervp); + union_vn_close(un->un_uppervp, FWRITE, cred, p); + VOP_LOCK(un->un_uppervp); + un->un_flags |= UN_ULOCK; + + /* + * Subsequent IOs will go to the top layer, so + * call close on the lower vnode and open on the + * upper vnode to ensure that the filesystem keeps + * its references counts right. This doesn't do + * the right thing with (cred) and (FREAD) though. + * Ignoring error returns is not righ, either. + */ + for (i = 0; i < un->un_openl; i++) { + (void) VOP_CLOSE(tvp, FREAD); + (void) VOP_OPEN(un->un_uppervp, FREAD, cred, p); + } + un->un_openl = 0; + + if (error == 0) + error = VOP_OPEN(un->un_uppervp, mode, cred, p); + return (error); + } + + /* + * Just open the lower vnode + */ + un->un_openl++; + VOP_LOCK(tvp); + error = VOP_OPEN(tvp, mode, cred, p); + VOP_UNLOCK(tvp); + + return (error); + } + + FIXUP(un); + + error = VOP_OPEN(tvp, mode, cred, p); + + return (error); +} + +int +union_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct union_node *un = VTOUNION(ap->a_vp); + struct vnode *vp; + + if (un->un_uppervp) { + vp = un->un_uppervp; + } else { +#ifdef UNION_DIAGNOSTIC + if (un->un_openl <= 0) + panic("union: un_openl cnt"); +#endif + --un->un_openl; + vp = un->un_lowervp; + } + + return (VOP_CLOSE(vp, ap->a_fflag, ap->a_cred, ap->a_p)); +} + +/* + * Check access permission on the union vnode. 
+ * The access check being enforced is to check + * against both the underlying vnode, and any + * copied vnode. This ensures that no additional + * file permissions are given away simply because + * the user caused an implicit file copy. + */ +int +union_access(ap) + struct vop_access_args /* { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct union_node *un = VTOUNION(ap->a_vp); + int error = EACCES; + struct vnode *vp; + + if (vp = un->un_uppervp) { + FIXUP(un); + return (VOP_ACCESS(vp, ap->a_mode, ap->a_cred, ap->a_p)); + } + + if (vp = un->un_lowervp) { + VOP_LOCK(vp); + error = VOP_ACCESS(vp, ap->a_mode, ap->a_cred, ap->a_p); + if (error == 0) { + struct union_mount *um = MOUNTTOUNIONMOUNT(vp->v_mount); + + if (um->um_op == UNMNT_BELOW) + error = VOP_ACCESS(vp, ap->a_mode, + um->um_cred, ap->a_p); + } + VOP_UNLOCK(vp); + if (error) + return (error); + } + + return (error); +} + +/* + * We handle getattr only to change the fsid. + */ +int +union_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + int error; + struct union_node *un = VTOUNION(ap->a_vp); + struct vnode *vp = un->un_uppervp; + struct vattr *vap; + struct vattr va; + + + /* + * Some programs walk the filesystem hierarchy by counting + * links to directories to avoid stat'ing all the time. + * This means the link count on directories needs to be "correct". + * The only way to do that is to call getattr on both layers + * and fix up the link count. The link count will not necessarily + * be accurate but will be large enough to defeat the tree walkers. 
+ */ + + vap = ap->a_vap; + + vp = un->un_uppervp; + if (vp != NULLVP) { + FIXUP(un); + error = VOP_GETATTR(vp, vap, ap->a_cred, ap->a_p); + if (error) + return (error); + } + + if (vp == NULLVP) { + vp = un->un_lowervp; + } else if (vp->v_type == VDIR) { + vp = un->un_lowervp; + vap = &va; + } else { + vp = NULLVP; + } + + if (vp != NULLVP) { + VOP_LOCK(vp); + error = VOP_GETATTR(vp, vap, ap->a_cred, ap->a_p); + VOP_UNLOCK(vp); + if (error) + return (error); + } + + if ((vap != ap->a_vap) && (vap->va_type == VDIR)) + ap->a_vap->va_nlink += vap->va_nlink; + + vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; + return (0); +} + +int +union_setattr(ap) + struct vop_setattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + struct union_node *un = VTOUNION(ap->a_vp); + int error; + + /* + * Handle case of truncating lower object to zero size, + * by creating a zero length upper object. This is to + * handle the case of open with O_TRUNC and O_CREAT. + */ + if ((un->un_uppervp == NULLVP) && + /* assert(un->un_lowervp != NULLVP) */ + (un->un_lowervp->v_type == VREG) && + (ap->a_vap->va_size == 0)) { + struct vnode *vp; + + error = union_vn_create(&vp, un, ap->a_p); + if (error) + return (error); + + /* at this point, uppervp is locked */ + union_newupper(un, vp); + + VOP_UNLOCK(vp); + union_vn_close(un->un_uppervp, FWRITE, ap->a_cred, ap->a_p); + VOP_LOCK(vp); + un->un_flags |= UN_ULOCK; + } + + /* + * Try to set attributes in upper layer, + * otherwise return read-only filesystem error. 
+ */ + if (un->un_uppervp != NULLVP) { + FIXUP(un); + error = VOP_SETATTR(un->un_uppervp, ap->a_vap, + ap->a_cred, ap->a_p); + } else { + error = EROFS; + } + + return (error); +} + +int +union_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + int error; + struct vnode *vp = OTHERVP(ap->a_vp); + int dolock = (vp == LOWERVP(ap->a_vp)); + + if (dolock) + VOP_LOCK(vp); + else + FIXUP(VTOUNION(ap->a_vp)); + error = VOP_READ(vp, ap->a_uio, ap->a_ioflag, ap->a_cred); + if (dolock) + VOP_UNLOCK(vp); + + return (error); +} + +int +union_write(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + int error; + struct vnode *vp = OTHERVP(ap->a_vp); + int dolock = (vp == LOWERVP(ap->a_vp)); + + if (dolock) + VOP_LOCK(vp); + else + FIXUP(VTOUNION(ap->a_vp)); + error = VOP_WRITE(vp, ap->a_uio, ap->a_ioflag, ap->a_cred); + if (dolock) + VOP_UNLOCK(vp); + + return (error); +} + +int +union_ioctl(ap) + struct vop_ioctl_args /* { + struct vnode *a_vp; + int a_command; + caddr_t a_data; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + return (VOP_IOCTL(OTHERVP(ap->a_vp), ap->a_command, ap->a_data, + ap->a_fflag, ap->a_cred, ap->a_p)); +} + +int +union_select(ap) + struct vop_select_args /* { + struct vnode *a_vp; + int a_which; + int a_fflags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + return (VOP_SELECT(OTHERVP(ap->a_vp), ap->a_which, ap->a_fflags, + ap->a_cred, ap->a_p)); +} + +int +union_mmap(ap) + struct vop_mmap_args /* { + struct vnode *a_vp; + int a_fflags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + return (VOP_MMAP(OTHERVP(ap->a_vp), ap->a_fflags, + ap->a_cred, ap->a_p)); +} + +int +union_fsync(ap) + struct vop_fsync_args /* { + struct vnode *a_vp; + struct ucred *a_cred; + int a_waitfor; + struct proc *a_p; + } */ *ap; +{ + int error = 0; + 
struct vnode *targetvp = OTHERVP(ap->a_vp); + + if (targetvp) { + int dolock = (targetvp == LOWERVP(ap->a_vp)); + + if (dolock) + VOP_LOCK(targetvp); + else + FIXUP(VTOUNION(ap->a_vp)); + error = VOP_FSYNC(targetvp, ap->a_cred, + ap->a_waitfor, ap->a_p); + if (dolock) + VOP_UNLOCK(targetvp); + } + + return (error); +} + +int +union_seek(ap) + struct vop_seek_args /* { + struct vnode *a_vp; + off_t a_oldoff; + off_t a_newoff; + struct ucred *a_cred; + } */ *ap; +{ + + return (VOP_SEEK(OTHERVP(ap->a_vp), ap->a_oldoff, ap->a_newoff, ap->a_cred)); +} + +int +union_remove(ap) + struct vop_remove_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + int error; + struct union_node *dun = VTOUNION(ap->a_dvp); + struct union_node *un = VTOUNION(ap->a_vp); + + if (dun->un_uppervp && un->un_uppervp) { + struct vnode *dvp = dun->un_uppervp; + struct vnode *vp = un->un_uppervp; + + FIXUP(dun); + VREF(dvp); + dun->un_flags |= UN_KLOCK; + vput(ap->a_dvp); + FIXUP(un); + VREF(vp); + un->un_flags |= UN_KLOCK; + vput(ap->a_vp); + + error = VOP_REMOVE(dvp, vp, ap->a_cnp); + if (!error) + union_removed_upper(un); + + /* + * XXX: should create a whiteout here + */ + } else { + /* + * XXX: should create a whiteout here + */ + vput(ap->a_dvp); + vput(ap->a_vp); + error = EROFS; + } + + return (error); +} + +int +union_link(ap) + struct vop_link_args /* { + struct vnode *a_vp; + struct vnode *a_tdvp; + struct componentname *a_cnp; + } */ *ap; +{ + int error; + struct union_node *dun = VTOUNION(ap->a_vp); + struct union_node *un = VTOUNION(ap->a_tdvp); + + if (dun->un_uppervp && un->un_uppervp) { + struct vnode *dvp = dun->un_uppervp; + struct vnode *vp = un->un_uppervp; + + FIXUP(dun); + VREF(dvp); + dun->un_flags |= UN_KLOCK; + vput(ap->a_vp); + FIXUP(un); + VREF(vp); + vrele(ap->a_tdvp); + + error = VOP_LINK(dvp, vp, ap->a_cnp); + } else { + /* + * XXX: need to copy to upper layer + * and do the link there. 
+ */ + vput(ap->a_vp); + vrele(ap->a_tdvp); + error = EROFS; + } + + return (error); +} + +int +union_rename(ap) + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap; +{ + int error; + + struct vnode *fdvp = ap->a_fdvp; + struct vnode *fvp = ap->a_fvp; + struct vnode *tdvp = ap->a_tdvp; + struct vnode *tvp = ap->a_tvp; + + if (fdvp->v_op == union_vnodeop_p) { /* always true */ + struct union_node *un = VTOUNION(fdvp); + if (un->un_uppervp == NULLVP) { + error = EROFS; + goto bad; + } + + FIXUP(un); + fdvp = un->un_uppervp; + VREF(fdvp); + vrele(ap->a_fdvp); + } + + if (fvp->v_op == union_vnodeop_p) { /* always true */ + struct union_node *un = VTOUNION(fvp); + if (un->un_uppervp == NULLVP) { + error = EROFS; + goto bad; + } + + FIXUP(un); + fvp = un->un_uppervp; + VREF(fvp); + vrele(ap->a_fvp); + } + + if (tdvp->v_op == union_vnodeop_p) { + struct union_node *un = VTOUNION(tdvp); + if (un->un_uppervp == NULLVP) { + error = EROFS; + goto bad; + } + + tdvp = un->un_uppervp; + VREF(tdvp); + un->un_flags |= UN_KLOCK; + vput(ap->a_tdvp); + } + + if (tvp && tvp->v_op == union_vnodeop_p) { + struct union_node *un = VTOUNION(tvp); + if (un->un_uppervp == NULLVP) { + error = EROFS; + goto bad; + } + + tvp = un->un_uppervp; + VREF(tvp); + un->un_flags |= UN_KLOCK; + vput(ap->a_tvp); + } + + return (VOP_RENAME(fdvp, fvp, ap->a_fcnp, tdvp, tvp, ap->a_tcnp)); + +bad: + vrele(fdvp); + vrele(fvp); + vput(tdvp); + if (tvp) + vput(tvp); + + return (error); +} + +int +union_mkdir(ap) + struct vop_mkdir_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + struct union_node *un = VTOUNION(ap->a_dvp); + struct vnode *dvp = un->un_uppervp; + + if (dvp) { + int error; + struct vnode *vp; + + FIXUP(un); + VREF(dvp); + un->un_flags |= UN_KLOCK; + vput(ap->a_dvp); + error = 
VOP_MKDIR(dvp, &vp, ap->a_cnp, ap->a_vap); + if (error) + return (error); + + error = union_allocvp( + ap->a_vpp, + ap->a_dvp->v_mount, + ap->a_dvp, + NULLVP, + ap->a_cnp, + vp, + NULLVP); + if (error) + vput(vp); + return (error); + } + + vput(ap->a_dvp); + return (EROFS); +} + +int +union_rmdir(ap) + struct vop_rmdir_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + int error; + struct union_node *dun = VTOUNION(ap->a_dvp); + struct union_node *un = VTOUNION(ap->a_vp); + + if (dun->un_uppervp && un->un_uppervp) { + struct vnode *dvp = dun->un_uppervp; + struct vnode *vp = un->un_uppervp; + + FIXUP(dun); + VREF(dvp); + dun->un_flags |= UN_KLOCK; + vput(ap->a_dvp); + FIXUP(un); + VREF(vp); + un->un_flags |= UN_KLOCK; + vput(ap->a_vp); + + error = VOP_RMDIR(dvp, vp, ap->a_cnp); + if (!error) + union_removed_upper(un); + + /* + * XXX: should create a whiteout here + */ + } else { + /* + * XXX: should create a whiteout here + */ + vput(ap->a_dvp); + vput(ap->a_vp); + error = EROFS; + } + + return (error); +} + +int +union_symlink(ap) + struct vop_symlink_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + char *a_target; + } */ *ap; +{ + struct union_node *un = VTOUNION(ap->a_dvp); + struct vnode *dvp = un->un_uppervp; + + if (dvp) { + int error; + struct vnode *vp; + struct mount *mp = ap->a_dvp->v_mount; + + FIXUP(un); + VREF(dvp); + un->un_flags |= UN_KLOCK; + vput(ap->a_dvp); + error = VOP_SYMLINK(dvp, &vp, ap->a_cnp, + ap->a_vap, ap->a_target); + *ap->a_vpp = NULLVP; + return (error); + } + + vput(ap->a_dvp); + return (EROFS); +} + +/* + * union_readdir works in concert with getdirentries and + * readdir(3) to provide a list of entries in the unioned + * directories. getdirentries is responsible for walking + * down the union stack. readdir(3) is responsible for + * eliminating duplicate names from the returned data stream. 
+ */ +int +union_readdir(ap) + struct vop_readdir_args /* { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + int error = 0; + struct union_node *un = VTOUNION(ap->a_vp); + + if (un->un_uppervp) { + FIXUP(un); + error = VOP_READDIR(un->un_uppervp, ap->a_uio, ap->a_cred); + } + + return (error); +} + +int +union_readlink(ap) + struct vop_readlink_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + int error; + struct vnode *vp = OTHERVP(ap->a_vp); + int dolock = (vp == LOWERVP(ap->a_vp)); + + if (dolock) + VOP_LOCK(vp); + else + FIXUP(VTOUNION(ap->a_vp)); + error = VOP_READLINK(vp, ap->a_uio, ap->a_cred); + if (dolock) + VOP_UNLOCK(vp); + + return (error); +} + +int +union_abortop(ap) + struct vop_abortop_args /* { + struct vnode *a_dvp; + struct componentname *a_cnp; + } */ *ap; +{ + int error; + struct vnode *vp = OTHERVP(ap->a_dvp); + struct union_node *un = VTOUNION(ap->a_dvp); + int islocked = un->un_flags & UN_LOCKED; + int dolock = (vp == LOWERVP(ap->a_dvp)); + + if (islocked) { + if (dolock) + VOP_LOCK(vp); + else + FIXUP(VTOUNION(ap->a_dvp)); + } + error = VOP_ABORTOP(vp, ap->a_cnp); + if (islocked && dolock) + VOP_UNLOCK(vp); + + return (error); +} + +int +union_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + /* + * Do nothing (and _don't_ bypass). + * Wait to vrele lowervp until reclaim, + * so that until then our union_node is in the + * cache and reusable. + * + * NEEDSWORK: Someday, consider inactive'ing + * the lowervp and then trying to reactivate it + * with capabilities (v_id) + * like they do in the name lookup cache code. + * That's too much work for now. 
+ */ + +#ifdef UNION_DIAGNOSTIC + struct union_node *un = VTOUNION(ap->a_vp); + + if (un->un_flags & UN_LOCKED) + panic("union: inactivating locked node"); +#endif + + return (0); +} + +int +union_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + union_freevp(ap->a_vp); + + return (0); +} + +int +union_lock(ap) + struct vop_lock_args *ap; +{ + struct vnode *vp = ap->a_vp; + struct union_node *un; + +start: + while (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + sleep((caddr_t)vp, PINOD); + } + + un = VTOUNION(vp); + + if (un->un_uppervp) { + if ((un->un_flags & UN_ULOCK) == 0) { + un->un_flags |= UN_ULOCK; + VOP_LOCK(un->un_uppervp); + } +#ifdef DIAGNOSTIC + if (un->un_flags & UN_KLOCK) + panic("union: dangling upper lock"); +#endif + } + + if (un->un_flags & UN_LOCKED) { +#ifdef DIAGNOSTIC + if (curproc && un->un_pid == curproc->p_pid && + un->un_pid > -1 && curproc->p_pid > -1) + panic("union: locking against myself"); +#endif + un->un_flags |= UN_WANT; + sleep((caddr_t) &un->un_flags, PINOD); + goto start; + } + +#ifdef DIAGNOSTIC + if (curproc) + un->un_pid = curproc->p_pid; + else + un->un_pid = -1; +#endif + + un->un_flags |= UN_LOCKED; + return (0); +} + +int +union_unlock(ap) + struct vop_lock_args *ap; +{ + struct union_node *un = VTOUNION(ap->a_vp); + +#ifdef DIAGNOSTIC + if ((un->un_flags & UN_LOCKED) == 0) + panic("union: unlock unlocked node"); + if (curproc && un->un_pid != curproc->p_pid && + curproc->p_pid > -1 && un->un_pid > -1) + panic("union: unlocking other process's union node"); +#endif + + un->un_flags &= ~UN_LOCKED; + + if ((un->un_flags & (UN_ULOCK|UN_KLOCK)) == UN_ULOCK) + VOP_UNLOCK(un->un_uppervp); + + un->un_flags &= ~(UN_ULOCK|UN_KLOCK); + + if (un->un_flags & UN_WANT) { + un->un_flags &= ~UN_WANT; + wakeup((caddr_t) &un->un_flags); + } + +#ifdef DIAGNOSTIC + un->un_pid = 0; +#endif + + return (0); +} + +int +union_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct 
vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + } */ *ap; +{ + int error; + struct vnode *vp = OTHERVP(ap->a_vp); + int dolock = (vp == LOWERVP(ap->a_vp)); + + if (dolock) + VOP_LOCK(vp); + else + FIXUP(VTOUNION(ap->a_vp)); + error = VOP_BMAP(vp, ap->a_bn, ap->a_vpp, ap->a_bnp, ap->a_runp); + if (dolock) + VOP_UNLOCK(vp); + + return (error); +} + +int +union_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + printf("\ttag VT_UNION, vp=%x, uppervp=%x, lowervp=%x\n", + vp, UPPERVP(vp), LOWERVP(vp)); + return (0); +} + +int +union_islocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + return ((VTOUNION(ap->a_vp)->un_flags & UN_LOCKED) ? 1 : 0); +} + +int +union_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + int error; + struct vnode *vp = OTHERVP(ap->a_vp); + int dolock = (vp == LOWERVP(ap->a_vp)); + + if (dolock) + VOP_LOCK(vp); + else + FIXUP(VTOUNION(ap->a_vp)); + error = VOP_PATHCONF(vp, ap->a_name, ap->a_retval); + if (dolock) + VOP_UNLOCK(vp); + + return (error); +} + +int +union_advlock(ap) + struct vop_advlock_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap; +{ + + return (VOP_ADVLOCK(OTHERVP(ap->a_vp), ap->a_id, ap->a_op, + ap->a_fl, ap->a_flags)); +} + + +/* + * XXX - vop_strategy must be hand coded because it has no + * vnode in its arguments. + * This goes away with a merged VM/buffer cache. 
+ */ +int +union_strategy(ap) + struct vop_strategy_args /* { + struct buf *a_bp; + } */ *ap; +{ + struct buf *bp = ap->a_bp; + int error; + struct vnode *savedvp; + + savedvp = bp->b_vp; + bp->b_vp = OTHERVP(bp->b_vp); + +#ifdef DIAGNOSTIC + if (bp->b_vp == NULLVP) + panic("union_strategy: nil vp"); + if (((bp->b_flags & B_READ) == 0) && + (bp->b_vp == LOWERVP(savedvp))) + panic("union_strategy: writing to lowervp"); +#endif + + error = VOP_STRATEGY(bp); + bp->b_vp = savedvp; + + return (error); +} + +/* + * Global vfs data structures + */ +int (**union_vnodeop_p)(); +struct vnodeopv_entry_desc union_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, union_lookup }, /* lookup */ + { &vop_create_desc, union_create }, /* create */ + { &vop_mknod_desc, union_mknod }, /* mknod */ + { &vop_open_desc, union_open }, /* open */ + { &vop_close_desc, union_close }, /* close */ + { &vop_access_desc, union_access }, /* access */ + { &vop_getattr_desc, union_getattr }, /* getattr */ + { &vop_setattr_desc, union_setattr }, /* setattr */ + { &vop_read_desc, union_read }, /* read */ + { &vop_write_desc, union_write }, /* write */ + { &vop_ioctl_desc, union_ioctl }, /* ioctl */ + { &vop_select_desc, union_select }, /* select */ + { &vop_mmap_desc, union_mmap }, /* mmap */ + { &vop_fsync_desc, union_fsync }, /* fsync */ + { &vop_seek_desc, union_seek }, /* seek */ + { &vop_remove_desc, union_remove }, /* remove */ + { &vop_link_desc, union_link }, /* link */ + { &vop_rename_desc, union_rename }, /* rename */ + { &vop_mkdir_desc, union_mkdir }, /* mkdir */ + { &vop_rmdir_desc, union_rmdir }, /* rmdir */ + { &vop_symlink_desc, union_symlink }, /* symlink */ + { &vop_readdir_desc, union_readdir }, /* readdir */ + { &vop_readlink_desc, union_readlink }, /* readlink */ + { &vop_abortop_desc, union_abortop }, /* abortop */ + { &vop_inactive_desc, union_inactive }, /* inactive */ + { &vop_reclaim_desc, union_reclaim }, /* reclaim */ + { &vop_lock_desc, 
union_lock }, /* lock */ + { &vop_unlock_desc, union_unlock }, /* unlock */ + { &vop_bmap_desc, union_bmap }, /* bmap */ + { &vop_strategy_desc, union_strategy }, /* strategy */ + { &vop_print_desc, union_print }, /* print */ + { &vop_islocked_desc, union_islocked }, /* islocked */ + { &vop_pathconf_desc, union_pathconf }, /* pathconf */ + { &vop_advlock_desc, union_advlock }, /* advlock */ +#ifdef notdef + { &vop_blkatoff_desc, union_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, union_valloc }, /* valloc */ + { &vop_vfree_desc, union_vfree }, /* vfree */ + { &vop_truncate_desc, union_truncate }, /* truncate */ + { &vop_update_desc, union_update }, /* update */ + { &vop_bwrite_desc, union_bwrite }, /* bwrite */ +#endif + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc union_vnodeop_opv_desc = + { &union_vnodeop_p, union_vnodeop_entries }; diff --git a/sys/net/bpf.c b/sys/net/bpf.c new file mode 100644 index 00000000000..e40b769b980 --- /dev/null +++ b/sys/net/bpf.c @@ -0,0 +1,1316 @@ +/* + * Copyright (c) 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from the Stanford/CMU enet packet filter, + * (net/enet.c) distributed as part of 4.3BSD, and code contributed + * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence + * Berkeley Laboratory. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)bpf.c 8.2 (Berkeley) 3/28/94 + * + * static char rcsid[] = + * "$Header: bpf.c,v 1.33 91/10/27 21:21:58 mccanne Exp $"; + */ + +#include "bpfilter.h" + +#if NBPFILTER > 0 + +#ifndef __GNUC__ +#define inline +#else +#define inline __inline +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#if defined(sparc) && BSD < 199103 +#include +#endif +#include +#include + +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include + +/* + * Older BSDs don't have kernel malloc. 
+ */ +#if BSD < 199103 +extern bcopy(); +static caddr_t bpf_alloc(); +#include +#define BPF_BUFSIZE (MCLBYTES-8) +#define UIOMOVE(cp, len, code, uio) uiomove(cp, len, code, uio) +#else +#define BPF_BUFSIZE 4096 +#define UIOMOVE(cp, len, code, uio) uiomove(cp, len, uio) +#endif + +#define PRINET 26 /* interruptible */ + +/* + * The default read buffer size is patchable. + */ +int bpf_bufsize = BPF_BUFSIZE; + +/* + * bpf_iflist is the list of interfaces; each corresponds to an ifnet + * bpf_dtab holds the descriptors, indexed by minor device # + */ +struct bpf_if *bpf_iflist; +struct bpf_d bpf_dtab[NBPFILTER]; + +#if BSD >= 199207 +/* + * bpfilterattach() is called at boot time in new systems. We do + * nothing here since old systems will not call this. + */ +/* ARGSUSED */ +void +bpfilterattach(n) + int n; +{ +} +#endif + +static int bpf_allocbufs __P((struct bpf_d *)); +static int bpf_allocbufs __P((struct bpf_d *)); +static void bpf_freed __P((struct bpf_d *)); +static void bpf_freed __P((struct bpf_d *)); +static void bpf_ifname __P((struct ifnet *, struct ifreq *)); +static void bpf_ifname __P((struct ifnet *, struct ifreq *)); +static void bpf_mcopy __P((const void *, void *, u_int)); +static int bpf_movein __P((struct uio *, int, + struct mbuf **, struct sockaddr *, int *)); +static int bpf_setif __P((struct bpf_d *, struct ifreq *)); +static int bpf_setif __P((struct bpf_d *, struct ifreq *)); +static inline void + bpf_wakeup __P((struct bpf_d *)); +static void catchpacket __P((struct bpf_d *, u_char *, u_int, + u_int, void (*)(const void *, void *, u_int))); +static void reset_d __P((struct bpf_d *)); + +static int +bpf_movein(uio, linktype, mp, sockp, datlen) + register struct uio *uio; + int linktype, *datlen; + register struct mbuf **mp; + register struct sockaddr *sockp; +{ + struct mbuf *m; + int error; + int len; + int hlen; + + /* + * Build a sockaddr based on the data link layer type. 
+ * We do this at this level because the ethernet header + * is copied directly into the data field of the sockaddr. + * In the case of SLIP, there is no header and the packet + * is forwarded as is. + * Also, we are careful to leave room at the front of the mbuf + * for the link level header. + */ + switch (linktype) { + + case DLT_SLIP: + sockp->sa_family = AF_INET; + hlen = 0; + break; + + case DLT_EN10MB: + sockp->sa_family = AF_UNSPEC; + /* XXX Would MAXLINKHDR be better? */ + hlen = sizeof(struct ether_header); + break; + + case DLT_FDDI: + sockp->sa_family = AF_UNSPEC; + /* XXX 4(FORMAC)+6(dst)+6(src)+3(LLC)+5(SNAP) */ + hlen = 24; + break; + + case DLT_NULL: + sockp->sa_family = AF_UNSPEC; + hlen = 0; + break; + + default: + return (EIO); + } + + len = uio->uio_resid; + *datlen = len - hlen; + if ((unsigned)len > MCLBYTES) + return (EIO); + + MGET(m, M_WAIT, MT_DATA); + if (m == 0) + return (ENOBUFS); + if (len > MLEN) { +#if BSD >= 199103 + MCLGET(m, M_WAIT); + if ((m->m_flags & M_EXT) == 0) { +#else + MCLGET(m); + if (m->m_len != MCLBYTES) { +#endif + error = ENOBUFS; + goto bad; + } + } + m->m_len = len; + *mp = m; + /* + * Make room for link header. + */ + if (hlen != 0) { + m->m_len -= hlen; +#if BSD >= 199103 + m->m_data += hlen; /* XXX */ +#else + m->m_off += hlen; +#endif + error = UIOMOVE((caddr_t)sockp->sa_data, hlen, UIO_WRITE, uio); + if (error) + goto bad; + } + error = UIOMOVE(mtod(m, caddr_t), len - hlen, UIO_WRITE, uio); + if (!error) + return (0); + bad: + m_freem(m); + return (error); +} + +/* + * Attach file to the bpf interface, i.e. make d listen on bp. + * Must be called at splimp. + */ +static void +bpf_attachd(d, bp) + struct bpf_d *d; + struct bpf_if *bp; +{ + /* + * Point d at bp, and add d to the interface's list of listeners. + * Finally, point the driver's bpf cookie at the interface so + * it will divert packets to bpf. 
+ */ + d->bd_bif = bp; + d->bd_next = bp->bif_dlist; + bp->bif_dlist = d; + + *bp->bif_driverp = bp; +} + +/* + * Detach a file from its interface. + */ +static void +bpf_detachd(d) + struct bpf_d *d; +{ + struct bpf_d **p; + struct bpf_if *bp; + + bp = d->bd_bif; + /* + * Check if this descriptor had requested promiscuous mode. + * If so, turn it off. + */ + if (d->bd_promisc) { + d->bd_promisc = 0; + if (ifpromisc(bp->bif_ifp, 0)) + /* + * Something is really wrong if we were able to put + * the driver into promiscuous mode, but can't + * take it out. + */ + panic("bpf: ifpromisc failed"); + } + /* Remove d from the interface's descriptor list. */ + p = &bp->bif_dlist; + while (*p != d) { + p = &(*p)->bd_next; + if (*p == 0) + panic("bpf_detachd: descriptor not in list"); + } + *p = (*p)->bd_next; + if (bp->bif_dlist == 0) + /* + * Let the driver know that there are no more listeners. + */ + *d->bd_bif->bif_driverp = 0; + d->bd_bif = 0; +} + + +/* + * Mark a descriptor free by making it point to itself. + * This is probably cheaper than marking with a constant since + * the address should be in a register anyway. + */ +#define D_ISFREE(d) ((d) == (d)->bd_next) +#define D_MARKFREE(d) ((d)->bd_next = (d)) +#define D_MARKUSED(d) ((d)->bd_next = 0) + +/* + * Open ethernet device. Returns ENXIO for illegal minor device number, + * EBUSY if file is open by another process. + */ +/* ARGSUSED */ +int +bpfopen(dev, flag) + dev_t dev; + int flag; +{ + register struct bpf_d *d; + + if (minor(dev) >= NBPFILTER) + return (ENXIO); + /* + * Each minor can be opened by only one process. If the requested + * minor is in use, return EBUSY. + */ + d = &bpf_dtab[minor(dev)]; + if (!D_ISFREE(d)) + return (EBUSY); + + /* Mark "free" and do most initialization. */ + bzero((char *)d, sizeof(*d)); + d->bd_bufsize = bpf_bufsize; + + return (0); +} + +/* + * Close the descriptor by detaching it from its interface, + * deallocating its buffers, and marking it free. 
+ */ +/* ARGSUSED */ +int +bpfclose(dev, flag) + dev_t dev; + int flag; +{ + register struct bpf_d *d = &bpf_dtab[minor(dev)]; + register int s; + + s = splimp(); + if (d->bd_bif) + bpf_detachd(d); + splx(s); + bpf_freed(d); + + return (0); +} + +/* + * Support for SunOS, which does not have tsleep. + */ +#if BSD < 199103 +static +bpf_timeout(arg) + caddr_t arg; +{ + struct bpf_d *d = (struct bpf_d *)arg; + d->bd_timedout = 1; + wakeup(arg); +} + +#define BPF_SLEEP(chan, pri, s, t) bpf_sleep((struct bpf_d *)chan) + +int +bpf_sleep(d) + register struct bpf_d *d; +{ + register int rto = d->bd_rtout; + register int st; + + if (rto != 0) { + d->bd_timedout = 0; + timeout(bpf_timeout, (caddr_t)d, rto); + } + st = sleep((caddr_t)d, PRINET|PCATCH); + if (rto != 0) { + if (d->bd_timedout == 0) + untimeout(bpf_timeout, (caddr_t)d); + else if (st == 0) + return EWOULDBLOCK; + } + return (st != 0) ? EINTR : 0; +} +#else +#define BPF_SLEEP tsleep +#endif + +/* + * Rotate the packet buffers in descriptor d. Move the store buffer + * into the hold slot, and the free buffer into the store slot. + * Zero the length of the new store buffer. + */ +#define ROTATE_BUFFERS(d) \ + (d)->bd_hbuf = (d)->bd_sbuf; \ + (d)->bd_hlen = (d)->bd_slen; \ + (d)->bd_sbuf = (d)->bd_fbuf; \ + (d)->bd_slen = 0; \ + (d)->bd_fbuf = 0; +/* + * bpfread - read next chunk of packets from buffers + */ +int +bpfread(dev, uio) + dev_t dev; + register struct uio *uio; +{ + register struct bpf_d *d = &bpf_dtab[minor(dev)]; + int error; + int s; + + /* + * Restrict application to use a buffer the same size as + * as kernel buffers. + */ + if (uio->uio_resid != d->bd_bufsize) + return (EINVAL); + + s = splimp(); + /* + * If the hold buffer is empty, then do a timed sleep, which + * ends when the timeout expires or when enough packets + * have arrived to fill the store buffer. 
+ */ + while (d->bd_hbuf == 0) { + if (d->bd_immediate && d->bd_slen != 0) { + /* + * A packet(s) either arrived since the previous + * read or arrived while we were asleep. + * Rotate the buffers and return what's here. + */ + ROTATE_BUFFERS(d); + break; + } + error = BPF_SLEEP((caddr_t)d, PRINET|PCATCH, "bpf", + d->bd_rtout); + if (error == EINTR || error == ERESTART) { + splx(s); + return (error); + } + if (error == EWOULDBLOCK) { + /* + * On a timeout, return what's in the buffer, + * which may be nothing. If there is something + * in the store buffer, we can rotate the buffers. + */ + if (d->bd_hbuf) + /* + * We filled up the buffer in between + * getting the timeout and arriving + * here, so we don't need to rotate. + */ + break; + + if (d->bd_slen == 0) { + splx(s); + return (0); + } + ROTATE_BUFFERS(d); + break; + } + } + /* + * At this point, we know we have something in the hold slot. + */ + splx(s); + + /* + * Move data from hold buffer into user space. + * We know the entire buffer is transferred since + * we checked above that the read buffer is bpf_bufsize bytes. + */ + error = UIOMOVE(d->bd_hbuf, d->bd_hlen, UIO_READ, uio); + + s = splimp(); + d->bd_fbuf = d->bd_hbuf; + d->bd_hbuf = 0; + d->bd_hlen = 0; + splx(s); + + return (error); +} + + +/* + * If there are processes sleeping on this descriptor, wake them up. 
+ */ +static inline void +bpf_wakeup(d) + register struct bpf_d *d; +{ + wakeup((caddr_t)d); +#if BSD >= 199103 + selwakeup(&d->bd_sel); + /* XXX */ + d->bd_sel.si_pid = 0; +#else + if (d->bd_selproc) { + selwakeup(d->bd_selproc, (int)d->bd_selcoll); + d->bd_selcoll = 0; + d->bd_selproc = 0; + } +#endif +} + +int +bpfwrite(dev, uio) + dev_t dev; + struct uio *uio; +{ + register struct bpf_d *d = &bpf_dtab[minor(dev)]; + struct ifnet *ifp; + struct mbuf *m; + int error, s; + static struct sockaddr dst; + int datlen; + + if (d->bd_bif == 0) + return (ENXIO); + + ifp = d->bd_bif->bif_ifp; + + if (uio->uio_resid == 0) + return (0); + + error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, &m, &dst, &datlen); + if (error) + return (error); + + if (datlen > ifp->if_mtu) + return (EMSGSIZE); + + s = splnet(); +#if BSD >= 199103 + error = (*ifp->if_output)(ifp, m, &dst, (struct rtentry *)0); +#else + error = (*ifp->if_output)(ifp, m, &dst); +#endif + splx(s); + /* + * The driver frees the mbuf. + */ + return (error); +} + +/* + * Reset a descriptor by flushing its packet buffer and clearing the + * receive and drop counts. Should be called at splimp. + */ +static void +reset_d(d) + struct bpf_d *d; +{ + if (d->bd_hbuf) { + /* Free the hold buffer. */ + d->bd_fbuf = d->bd_hbuf; + d->bd_hbuf = 0; + } + d->bd_slen = 0; + d->bd_hlen = 0; + d->bd_rcount = 0; + d->bd_dcount = 0; +} + +/* + * FIONREAD Check for read packet available. + * SIOCGIFADDR Get interface address - convenient hook to driver. + * BIOCGBLEN Get buffer len [for read()]. + * BIOCSETF Set ethernet read filter. + * BIOCFLUSH Flush read packet buffer. + * BIOCPROMISC Put interface into promiscuous mode. + * BIOCGDLT Get link layer type. + * BIOCGETIF Get interface name. + * BIOCSETIF Set interface. + * BIOCSRTIMEOUT Set read timeout. + * BIOCGRTIMEOUT Get read timeout. + * BIOCGSTATS Get packet stats. + * BIOCIMMEDIATE Set immediate mode. + * BIOCVERSION Get filter language version. 
+ */ +/* ARGSUSED */ +int +bpfioctl(dev, cmd, addr, flag) + dev_t dev; + int cmd; + caddr_t addr; + int flag; +{ + register struct bpf_d *d = &bpf_dtab[minor(dev)]; + int s, error = 0; + + switch (cmd) { + + default: + error = EINVAL; + break; + + /* + * Check for read packet available. + */ + case FIONREAD: + { + int n; + + s = splimp(); + n = d->bd_slen; + if (d->bd_hbuf) + n += d->bd_hlen; + splx(s); + + *(int *)addr = n; + break; + } + + case SIOCGIFADDR: + { + struct ifnet *ifp; + + if (d->bd_bif == 0) + error = EINVAL; + else { + ifp = d->bd_bif->bif_ifp; + error = (*ifp->if_ioctl)(ifp, cmd, addr); + } + break; + } + + /* + * Get buffer len [for read()]. + */ + case BIOCGBLEN: + *(u_int *)addr = d->bd_bufsize; + break; + + /* + * Set buffer length. + */ + case BIOCSBLEN: +#if BSD < 199103 + error = EINVAL; +#else + if (d->bd_bif != 0) + error = EINVAL; + else { + register u_int size = *(u_int *)addr; + + if (size > BPF_MAXBUFSIZE) + *(u_int *)addr = size = BPF_MAXBUFSIZE; + else if (size < BPF_MINBUFSIZE) + *(u_int *)addr = size = BPF_MINBUFSIZE; + d->bd_bufsize = size; + } +#endif + break; + + /* + * Set link layer read filter. + */ + case BIOCSETF: + error = bpf_setf(d, (struct bpf_program *)addr); + break; + + /* + * Flush read packet buffer. + */ + case BIOCFLUSH: + s = splimp(); + reset_d(d); + splx(s); + break; + + /* + * Put interface into promiscuous mode. + */ + case BIOCPROMISC: + if (d->bd_bif == 0) { + /* + * No interface attached yet. + */ + error = EINVAL; + break; + } + s = splimp(); + if (d->bd_promisc == 0) { + error = ifpromisc(d->bd_bif->bif_ifp, 1); + if (error == 0) + d->bd_promisc = 1; + } + splx(s); + break; + + /* + * Get device parameters. + */ + case BIOCGDLT: + if (d->bd_bif == 0) + error = EINVAL; + else + *(u_int *)addr = d->bd_bif->bif_dlt; + break; + + /* + * Set interface name. 
+ */ + case BIOCGETIF: + if (d->bd_bif == 0) + error = EINVAL; + else + bpf_ifname(d->bd_bif->bif_ifp, (struct ifreq *)addr); + break; + + /* + * Set interface. + */ + case BIOCSETIF: + error = bpf_setif(d, (struct ifreq *)addr); + break; + + /* + * Set read timeout. + */ + case BIOCSRTIMEOUT: + { + struct timeval *tv = (struct timeval *)addr; + u_long msec; + + /* Compute number of milliseconds. */ + msec = tv->tv_sec * 1000 + tv->tv_usec / 1000; + /* Scale milliseconds to ticks. Assume hard + clock has millisecond or greater resolution + (i.e. tick >= 1000). For 10ms hardclock, + tick/1000 = 10, so rtout<-msec/10. */ + d->bd_rtout = msec / (tick / 1000); + break; + } + + /* + * Get read timeout. + */ + case BIOCGRTIMEOUT: + { + struct timeval *tv = (struct timeval *)addr; + u_long msec = d->bd_rtout; + + msec *= tick / 1000; + tv->tv_sec = msec / 1000; + tv->tv_usec = msec % 1000; + break; + } + + /* + * Get packet stats. + */ + case BIOCGSTATS: + { + struct bpf_stat *bs = (struct bpf_stat *)addr; + + bs->bs_recv = d->bd_rcount; + bs->bs_drop = d->bd_dcount; + break; + } + + /* + * Set immediate mode. + */ + case BIOCIMMEDIATE: + d->bd_immediate = *(u_int *)addr; + break; + + case BIOCVERSION: + { + struct bpf_version *bv = (struct bpf_version *)addr; + + bv->bv_major = BPF_MAJOR_VERSION; + bv->bv_minor = BPF_MINOR_VERSION; + break; + } + } + return (error); +} + +/* + * Set d's packet filter program to fp. If this file already has a filter, + * free it and replace it. Returns EINVAL for bogus requests. 
+ */
+int
+bpf_setf(d, fp)
+	struct bpf_d *d;
+	struct bpf_program *fp;
+{
+	struct bpf_insn *fcode, *old;
+	u_int flen, size;
+	int s;
+
+	old = d->bd_filter;
+	if (fp->bf_insns == 0) {
+		/* A null program with zero length removes the filter. */
+		if (fp->bf_len != 0)
+			return (EINVAL);
+		s = splimp();
+		d->bd_filter = 0;
+		reset_d(d);
+		splx(s);
+		if (old != 0)
+			free((caddr_t)old, M_DEVBUF);
+		return (0);
+	}
+	flen = fp->bf_len;
+	/*
+	 * An empty program can never be valid (it has no return
+	 * instruction); reject it before allocating anything.
+	 */
+	if (flen == 0 || flen > BPF_MAXINSNS)
+		return (EINVAL);
+
+	size = flen * sizeof(*fp->bf_insns);
+	fcode = (struct bpf_insn *)malloc(size, M_DEVBUF, M_WAITOK);
+	if (copyin((caddr_t)fp->bf_insns, (caddr_t)fcode, size) == 0 &&
+	    bpf_validate(fcode, (int)flen)) {
+		/* Swap the filter in with network interrupts blocked. */
+		s = splimp();
+		d->bd_filter = fcode;
+		reset_d(d);
+		splx(s);
+		if (old != 0)
+			free((caddr_t)old, M_DEVBUF);
+
+		return (0);
+	}
+	/* Copy-in or validation failed; the old filter stays installed. */
+	free((caddr_t)fcode, M_DEVBUF);
+	return (EINVAL);
+}
+
+/*
+ * Detach a file from its current interface (if attached at all) and attach
+ * to the interface indicated by the name stored in ifr.
+ * Return an errno or 0.
+ */
+static int
+bpf_setif(d, ifr)
+	struct bpf_d *d;
+	struct ifreq *ifr;
+{
+	struct bpf_if *bp;
+	char *cp;
+	int unit, s, error;
+
+	/*
+	 * Separate string into name part and unit number.  Put a null
+	 * byte at the end of the name part, and compute the number.
+	 * If the unit number is unspecified, the default is 0.
+	 * Anything other than digits after the first digit is rejected
+	 * instead of being folded into the unit number.
+	 * XXX This should be common code.
+	 */
+	unit = 0;
+	cp = ifr->ifr_name;
+	cp[sizeof(ifr->ifr_name) - 1] = '\0';
+	while (*cp++) {
+		if (*cp >= '0' && *cp <= '9') {
+			unit = *cp - '0';
+			*cp++ = '\0';
+			while (*cp) {
+				if (*cp < '0' || *cp > '9')
+					return (ENXIO);
+				unit = 10 * unit + *cp++ - '0';
+			}
+			break;
+		}
+	}
+	/*
+	 * Look through attached interfaces for the named one.
+	 */
+	for (bp = bpf_iflist; bp != 0; bp = bp->bif_next) {
+		struct ifnet *ifp = bp->bif_ifp;
+
+		if (ifp == 0 || unit != ifp->if_unit
+		    || strcmp(ifp->if_name, ifr->ifr_name) != 0)
+			continue;
+		/*
+		 * We found the requested interface.
+		 * If it's not up, return an error.
+		 * Allocate the packet buffers if we need to.
+		 * If we're already attached to requested interface,
+		 * just flush the buffer.
+		 */
+		if ((ifp->if_flags & IFF_UP) == 0)
+			return (ENETDOWN);
+
+		if (d->bd_sbuf == 0) {
+			error = bpf_allocbufs(d);
+			if (error != 0)
+				return (error);
+		}
+		s = splimp();
+		if (bp != d->bd_bif) {
+			if (d->bd_bif)
+				/*
+				 * Detach if attached to something else.
+				 */
+				bpf_detachd(d);
+
+			bpf_attachd(d, bp);
+		}
+		reset_d(d);
+		splx(s);
+		return (0);
+	}
+	/* Not found. */
+	return (ENXIO);
+}
+
+/*
+ * Convert an interface name plus unit number of an ifp to a single
+ * name which is returned in the ifr.  The result is always
+ * null-terminated and is truncated if it does not fit in ifr_name.
+ */
+static void
+bpf_ifname(ifp, ifr)
+	struct ifnet *ifp;
+	struct ifreq *ifr;
+{
+	char *s = ifp->if_name;
+	char *d = ifr->ifr_name;
+	/* Last usable slot; it is reserved for the terminator. */
+	char *end = d + sizeof(ifr->ifr_name) - 1;
+	char digits[12];
+	u_int unit = ifp->if_unit;
+	int n = 0;
+
+	/*
+	 * Copy the driver name, stopping on (not after) its terminator
+	 * so that the unit digits follow the name directly.
+	 */
+	while (*s != '\0' && d < end)
+		*d++ = *s++;
+	/* Generate the unit's decimal digits, least significant first. */
+	do {
+		digits[n++] = '0' + unit % 10;
+		unit /= 10;
+	} while (unit != 0);
+	while (n > 0 && d < end)
+		*d++ = digits[--n];
+	*d = '\0';
+}
+
+/*
+ * The new select interface passes down the proc pointer; the old select
+ * stubs had to grab it out of the user struct.  This glue allows either case.
+ */
+#if BSD >= 199103
+#define bpf_select bpfselect
+#else
+int
+bpfselect(dev, rw)
+	register dev_t dev;
+	int rw;
+{
+	return (bpf_select(dev, rw, u.u_procp));
+}
+#endif
+
+/*
+ * Support for select() system call
+ *
+ * Return true iff the specific operation will not block indefinitely.
+ * Otherwise, return false but make a note that a selwakeup() must be done.
+ */
+int
+bpf_select(dev, rw, p)
+	register dev_t dev;
+	int rw;
+	struct proc *p;
+{
+	register struct bpf_d *d;
+	register int s;
+
+	/* Only selecting for read is supported. */
+	if (rw != FREAD)
+		return (0);
+	/*
+	 * An imitation of the FIONREAD ioctl code.
+	 */
+	d = &bpf_dtab[minor(dev)];
+
+	s = splimp();
+	if (d->bd_hlen != 0 || (d->bd_immediate && d->bd_slen != 0)) {
+		/*
+		 * There is data waiting.
+		 */
+		splx(s);
+		return (1);
+	}
+#if BSD >= 199103
+	selrecord(p, &d->bd_sel);
+#else
+	/*
+	 * No data ready.  If there's already a select() waiting on this
+	 * minor device then this is a collision.  This shouldn't happen
+	 * because minors really should not be shared, but if a process
+	 * forks while one of these is open, it is possible that both
+	 * processes could select on the same descriptor.
+	 */
+	if (d->bd_selproc && d->bd_selproc->p_wchan == (caddr_t)&selwait)
+		d->bd_selcoll = 1;
+	else
+		d->bd_selproc = p;
+#endif
+	splx(s);
+	return (0);
+}
+
+/*
+ * Incoming linkage from device drivers.  Process the packet pkt, of length
+ * pktlen, which is stored in a contiguous buffer.  The packet is parsed
+ * by each process' filter, and if accepted, stashed into the corresponding
+ * buffer.
+ */
+void
+bpf_tap(arg, pkt, pktlen)
+	caddr_t arg;
+	register u_char *pkt;
+	register u_int pktlen;
+{
+	struct bpf_if *bp;
+	register struct bpf_d *d;
+	register u_int slen;
+	/*
+	 * Note that the ipl does not have to be raised at this point.
+	 * The only problem that could arise here is that if two different
+	 * interfaces shared any data.  This is not the case.
+	 */
+	bp = (struct bpf_if *)arg;
+	for (d = bp->bif_dlist; d != 0; d = d->bd_next) {
+		++d->bd_rcount;
+		slen = bpf_filter(d->bd_filter, pkt, pktlen, pktlen);
+		if (slen != 0)
+			catchpacket(d, pkt, pktlen, slen, bcopy);
+	}
+}
+
+/*
+ * Copy data from an mbuf chain into a buffer.  This code is derived
+ * from m_copydata in sys/uipc_mbuf.c.
+ */
+static void
+bpf_mcopy(src_arg, dst_arg, len)
+	const void *src_arg;
+	void *dst_arg;
+	register u_int len;
+{
+	register const struct mbuf *m;
+	register u_int count;
+	u_char *dst;
+
+	m = src_arg;
+	dst = dst_arg;
+	while (len > 0) {
+		/* The chain ran out before len bytes were copied. */
+		if (m == 0)
+			panic("bpf_mcopy");
+		count = min(m->m_len, len);
+		bcopy(mtod(m, caddr_t), (caddr_t)dst, count);
+		m = m->m_next;
+		dst += count;
+		len -= count;
+	}
+}
+
+/*
+ * Incoming linkage from device drivers, when packet is in an mbuf chain.
+ */
+void
+bpf_mtap(arg, m)
+	caddr_t arg;
+	struct mbuf *m;
+{
+	struct bpf_if *bp = (struct bpf_if *)arg;
+	struct bpf_d *d;
+	u_int pktlen, slen;
+	struct mbuf *m0;
+
+	/* The packet length is the sum of the lengths of all mbufs. */
+	pktlen = 0;
+	for (m0 = m; m0 != 0; m0 = m0->m_next)
+		pktlen += m0->m_len;
+
+	for (d = bp->bif_dlist; d != 0; d = d->bd_next) {
+		++d->bd_rcount;
+		/* A buflen of 0 tells bpf_filter that p is an mbuf chain. */
+		slen = bpf_filter(d->bd_filter, (u_char *)m, pktlen, 0);
+		if (slen != 0)
+			catchpacket(d, (u_char *)m, pktlen, slen, bpf_mcopy);
+	}
+}
+
+/*
+ * Move the packet data from interface memory (pkt) into the
+ * store buffer.  Nothing is returned; listeners are woken from here
+ * (via bpf_wakeup) when the store buffer is rotated or when immediate
+ * mode is set.  "cpfn" is the routine called to do the actual data
+ * transfer.  bcopy is passed in to copy contiguous chunks, while
+ * bpf_mcopy is passed in to copy mbuf chains.  In the latter case,
+ * pkt is really an mbuf.
+ */
+static void
+catchpacket(d, pkt, pktlen, snaplen, cpfn)
+	register struct bpf_d *d;
+	register u_char *pkt;
+	register u_int pktlen, snaplen;
+	register void (*cpfn)(const void *, void *, u_int);
+{
+	register struct bpf_hdr *hp;
+	register int totlen, curlen;
+	register int hdrlen = d->bd_bif->bif_hdrlen;
+	/*
+	 * Figure out how many bytes to move.  If the packet is
+	 * greater or equal to the snapshot length, transfer that
+	 * much.  Otherwise, transfer the whole packet (unless
+	 * we hit the buffer size limit).
+	 */
+	totlen = hdrlen + min(snaplen, pktlen);
+	if (totlen > d->bd_bufsize)
+		totlen = d->bd_bufsize;
+
+	/*
+	 * Round up the end of the previous packet to the next longword.
+	 */
+	curlen = BPF_WORDALIGN(d->bd_slen);
+	if (curlen + totlen > d->bd_bufsize) {
+		/*
+		 * This packet will overflow the storage buffer.
+		 * Rotate the buffers if we can, then wakeup any
+		 * pending reads.
+		 */
+		if (d->bd_fbuf == 0) {
+			/*
+			 * We haven't completed the previous read yet,
+			 * so drop the packet.
+			 */
+			++d->bd_dcount;
+			return;
+		}
+		ROTATE_BUFFERS(d);
+		bpf_wakeup(d);
+		curlen = 0;
+	}
+	else if (d->bd_immediate)
+		/*
+		 * Immediate mode is set.  A packet arrived so any
+		 * reads should be woken up.
+		 */
+		bpf_wakeup(d);
+
+	/*
+	 * Append the bpf header.
+	 */
+	hp = (struct bpf_hdr *)(d->bd_sbuf + curlen);
+#if BSD >= 199103
+	microtime(&hp->bh_tstamp);
+#elif defined(sun)
+	uniqtime(&hp->bh_tstamp);
+#else
+	hp->bh_tstamp = time;
+#endif
+	hp->bh_datalen = pktlen;
+	hp->bh_hdrlen = hdrlen;
+	/*
+	 * Copy the packet data into the store buffer and update its length.
+	 * bh_caplen is set, as a side effect of the call, to the number of
+	 * packet bytes actually stored (totlen less the bpf header).
+	 */
+	(*cpfn)(pkt, (u_char *)hp + hdrlen, (hp->bh_caplen = totlen - hdrlen));
+	d->bd_slen = curlen + totlen;
+}
+
+/*
+ * Initialize all nonzero fields of a descriptor.
+ * Allocates the free and store buffers (bd_bufsize bytes each) and
+ * zeroes the store and hold lengths.  Returns 0, or ENOBUFS if either
+ * allocation fails.
+ */
+static int
+bpf_allocbufs(d)
+	register struct bpf_d *d;
+{
+	d->bd_fbuf = (caddr_t)malloc(d->bd_bufsize, M_DEVBUF, M_WAITOK);
+	if (d->bd_fbuf == 0)
+		return (ENOBUFS);
+
+	d->bd_sbuf = (caddr_t)malloc(d->bd_bufsize, M_DEVBUF, M_WAITOK);
+	if (d->bd_sbuf == 0) {
+		/* NOTE(review): bd_fbuf is left pointing at freed memory. */
+		free(d->bd_fbuf, M_DEVBUF);
+		return (ENOBUFS);
+	}
+	d->bd_slen = 0;
+	d->bd_hlen = 0;
+	return (0);
+}
+
+/*
+ * Free buffers currently in use by a descriptor.
+ * Called on close.
+ */
+static void
+bpf_freed(d)
+	register struct bpf_d *d;
+{
+	/*
+	 * We don't need to lock out interrupts since this descriptor has
+	 * been detached from its interface and it yet hasn't been marked
+	 * free.
+	 */
+	/* The hold and free buffers are only looked at if sbuf exists. */
+	if (d->bd_sbuf != 0) {
+		free(d->bd_sbuf, M_DEVBUF);
+		if (d->bd_hbuf != 0)
+			free(d->bd_hbuf, M_DEVBUF);
+		if (d->bd_fbuf != 0)
+			free(d->bd_fbuf, M_DEVBUF);
+	}
+	if (d->bd_filter)
+		free((caddr_t)d->bd_filter, M_DEVBUF);
+
+	D_MARKFREE(d);
+}
+
+/*
+ * Attach an interface to bpf.  driverp is a pointer to a (struct bpf_if *)
+ * in the driver's softc; dlt is the link layer type; hdrlen is the fixed
+ * size of the link header (variable length headers not yet supported).
+ */
+void
+bpfattach(driverp, ifp, dlt, hdrlen)
+	caddr_t *driverp;
+	struct ifnet *ifp;
+	u_int dlt, hdrlen;
+{
+	struct bpf_if *bp;
+	int i;
+#if BSD < 199103
+	/* No kernel malloc here: hand out slots from a static table. */
+	static struct bpf_if bpf_ifs[NBPFILTER];
+	static int bpfifno;
+
+	bp = (bpfifno < NBPFILTER) ? &bpf_ifs[bpfifno++] : 0;
+#else
+	bp = (struct bpf_if *)malloc(sizeof(*bp), M_DEVBUF, M_DONTWAIT);
+#endif
+	if (bp == 0)
+		panic("bpfattach");
+
+	bp->bif_dlist = 0;
+	bp->bif_driverp = (struct bpf_if **)driverp;
+	bp->bif_ifp = ifp;
+	bp->bif_dlt = dlt;
+
+	/* Push the new interface onto the head of the global list. */
+	bp->bif_next = bpf_iflist;
+	bpf_iflist = bp;
+
+	/* Clear the driver's pointer: no descriptor is listening yet. */
+	*bp->bif_driverp = 0;
+
+	/*
+	 * Compute the length of the bpf header.  This is not necessarily
+	 * equal to SIZEOF_BPF_HDR because we want to insert spacing such
+	 * that the network layer header begins on a longword boundary (for
+	 * performance reasons and to alleviate alignment restrictions).
+	 */
+	bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
+
+	/*
+	 * Mark all the descriptors free if this hasn't been done.
+	 */
+	if (!D_ISFREE(&bpf_dtab[0]))
+		for (i = 0; i < NBPFILTER; ++i)
+			D_MARKFREE(&bpf_dtab[i]);
+
+	printf("bpf: %s%d attached\n", ifp->if_name, ifp->if_unit);
+}
+
+#if BSD >= 199103
+/* XXX This routine belongs in net/if.c. */
+/*
+ * Set/clear promiscuous mode on interface ifp based on the truth value
+ * of pswitch.  The calls are reference counted so that only the first
+ * "on" request actually has an effect, as does the final "off" request.
+ * Results are undefined if the "off" and "on" requests are not matched.
+ * Returns ENETDOWN if the interface is not up, 0 when only the count
+ * changed, and otherwise whatever the driver's SIOCSIFFLAGS ioctl returns.
+ */
+int
+ifpromisc(ifp, pswitch)
+	struct ifnet *ifp;
+	int pswitch;
+{
+	struct ifreq ifr;
+	/*
+	 * If the device is not configured up, we cannot put it in
+	 * promiscuous mode.
+	 */
+	if ((ifp->if_flags & IFF_UP) == 0)
+		return (ENETDOWN);
+
+	if (pswitch) {
+		/* Only the first "on" request touches the hardware. */
+		if (ifp->if_pcount++ != 0)
+			return (0);
+		ifp->if_flags |= IFF_PROMISC;
+	} else {
+		/* Only the last "off" request touches the hardware. */
+		if (--ifp->if_pcount > 0)
+			return (0);
+		ifp->if_flags &= ~IFF_PROMISC;
+	}
+	ifr.ifr_flags = ifp->if_flags;
+	return ((*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
+}
+#endif
+
+#if BSD < 199103
+/*
+ * Allocate some memory for bpf.  This is temporary SunOS support, and
+ * is admittedly a hack.
+ * If resources unavailable, return 0.
+ * The owning mbuf's address is stored in the first word of its data
+ * area and the pointer handed back is 8 bytes past that word.
+ */
+static caddr_t
+bpf_alloc(size, canwait)
+	register int size;
+	register int canwait;
+{
+	register struct mbuf *m;
+
+	/* 8 bytes of every buffer are taken by the back pointer. */
+	if ((unsigned)size > (MCLBYTES-8))
+		return 0;
+
+	MGET(m, canwait, MT_DATA);
+	if (m == 0)
+		return 0;
+	if ((unsigned)size > (MLEN-8)) {
+		/* Too big for a plain mbuf; a cluster is needed. */
+		MCLGET(m);
+		if (m->m_len != MCLBYTES) {
+			m_freem(m);
+			return 0;
+		}
+	}
+	*mtod(m, struct mbuf **) = m;
+	return mtod(m, caddr_t) + 8;
+}
+#endif
+#endif
diff --git a/sys/net/bpf.h b/sys/net/bpf.h
new file mode 100644
index 00000000000..2e093ac5ce1
--- /dev/null
+++ b/sys/net/bpf.h
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 1990, 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from the Stanford/CMU enet packet filter,
+ * (net/enet.c) distributed as part of 4.3BSD, and code contributed
+ * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
+ * Berkeley Laboratory.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)bpf.h 8.1 (Berkeley) 6/10/93 + * + * @(#) $Header: bpf.h,v 1.24 91/10/27 21:22:32 mccanne Exp $ (LBL) + */ + +/* + * Alignment macros. BPF_WORDALIGN rounds up to the next + * even multiple of BPF_ALIGNMENT. + */ +#define BPF_ALIGNMENT sizeof(long) +#define BPF_WORDALIGN(x) (((x)+(BPF_ALIGNMENT-1))&~(BPF_ALIGNMENT-1)) + +#define BPF_MAXINSNS 512 +#define BPF_MAXBUFSIZE 0x8000 +#define BPF_MINBUFSIZE 32 + +/* + * Structure for BIOCSETF. + */ +struct bpf_program { + u_int bf_len; + struct bpf_insn *bf_insns; +}; + +/* + * Struct returned by BIOCGSTATS. + */ +struct bpf_stat { + u_int bs_recv; /* number of packets received */ + u_int bs_drop; /* number of packets dropped */ +}; + +/* + * Struct return by BIOCVERSION. 
This represents the version number of
+ * the filter language described by the instruction encodings below.
+ * bpf understands a program iff kernel_major == filter_major &&
+ * kernel_minor >= filter_minor, that is, if the value returned by the
+ * running kernel has the same major number and a minor number equal
+ * to or greater than that of the filter being downloaded.  Otherwise, the
+ * results are undefined, meaning an error may be returned or packets
+ * may be accepted haphazardly.
+ * It has nothing to do with the source code version.
+ */
+struct bpf_version {
+	u_short bv_major;
+	u_short bv_minor;
+};
+/* Current version number. */
+#define BPF_MAJOR_VERSION 1
+#define BPF_MINOR_VERSION 1
+
+/*
+ * BPF ioctls
+ *
+ * The first set is for compatibility with Sun's pcc style
+ * header files.  If you're using gcc, we assume that you
+ * have run fixincludes so the latter set should work.
+ */
+#if (defined(sun) || defined(ibm032)) && !defined(__GNUC__)
+#define	BIOCGBLEN	_IOR(B,102, u_int)
+#define	BIOCSBLEN	_IOWR(B,102, u_int)
+#define	BIOCSETF	_IOW(B,103, struct bpf_program)
+#define	BIOCFLUSH	_IO(B,104)
+#define BIOCPROMISC	_IO(B,105)
+#define	BIOCGDLT	_IOR(B,106, u_int)
+#define BIOCGETIF	_IOR(B,107, struct ifreq)
+#define BIOCSETIF	_IOW(B,108, struct ifreq)
+#define BIOCSRTIMEOUT	_IOW(B,109, struct timeval)
+#define BIOCGRTIMEOUT	_IOR(B,110, struct timeval)
+#define BIOCGSTATS	_IOR(B,111, struct bpf_stat)
+#define BIOCIMMEDIATE	_IOW(B,112, u_int)
+#define BIOCVERSION	_IOR(B,113, struct bpf_version)
+#else
+#define	BIOCGBLEN	_IOR('B',102, u_int)
+#define	BIOCSBLEN	_IOWR('B',102, u_int)
+#define	BIOCSETF	_IOW('B',103, struct bpf_program)
+#define	BIOCFLUSH	_IO('B',104)
+#define BIOCPROMISC	_IO('B',105)
+#define	BIOCGDLT	_IOR('B',106, u_int)
+#define BIOCGETIF	_IOR('B',107, struct ifreq)
+#define BIOCSETIF	_IOW('B',108, struct ifreq)
+#define BIOCSRTIMEOUT	_IOW('B',109, struct timeval)
+#define BIOCGRTIMEOUT	_IOR('B',110, struct timeval)
+#define BIOCGSTATS
_IOR('B',111, struct bpf_stat) +#define BIOCIMMEDIATE _IOW('B',112, u_int) +#define BIOCVERSION _IOR('B',113, struct bpf_version) +#endif + +/* + * Structure prepended to each packet. + */ +struct bpf_hdr { + struct timeval bh_tstamp; /* time stamp */ + u_long bh_caplen; /* length of captured portion */ + u_long bh_datalen; /* original length of packet */ + u_short bh_hdrlen; /* length of bpf header (this struct + plus alignment padding) */ +}; +/* + * Because the structure above is not a multiple of 4 bytes, some compilers + * will insist on inserting padding; hence, sizeof(struct bpf_hdr) won't work. + * Only the kernel needs to know about it; applications use bh_hdrlen. + */ +#ifdef KERNEL +#define SIZEOF_BPF_HDR 18 +#endif + +/* + * Data-link level type codes. + * Currently, only DLT_EN10MB and DLT_SLIP are supported. + */ +#define DLT_NULL 0 /* no link-layer encapsulation */ +#define DLT_EN10MB 1 /* Ethernet (10Mb) */ +#define DLT_EN3MB 2 /* Experimental Ethernet (3Mb) */ +#define DLT_AX25 3 /* Amateur Radio AX.25 */ +#define DLT_PRONET 4 /* Proteon ProNET Token Ring */ +#define DLT_CHAOS 5 /* Chaos */ +#define DLT_IEEE802 6 /* IEEE 802 Networks */ +#define DLT_ARCNET 7 /* ARCNET */ +#define DLT_SLIP 8 /* Serial Line IP */ +#define DLT_PPP 9 /* Point-to-point Protocol */ +#define DLT_FDDI 10 /* FDDI */ + +/* + * The instruction encondings. 
+ */ +/* instruction classes */ +#define BPF_CLASS(code) ((code) & 0x07) +#define BPF_LD 0x00 +#define BPF_LDX 0x01 +#define BPF_ST 0x02 +#define BPF_STX 0x03 +#define BPF_ALU 0x04 +#define BPF_JMP 0x05 +#define BPF_RET 0x06 +#define BPF_MISC 0x07 + +/* ld/ldx fields */ +#define BPF_SIZE(code) ((code) & 0x18) +#define BPF_W 0x00 +#define BPF_H 0x08 +#define BPF_B 0x10 +#define BPF_MODE(code) ((code) & 0xe0) +#define BPF_IMM 0x00 +#define BPF_ABS 0x20 +#define BPF_IND 0x40 +#define BPF_MEM 0x60 +#define BPF_LEN 0x80 +#define BPF_MSH 0xa0 + +/* alu/jmp fields */ +#define BPF_OP(code) ((code) & 0xf0) +#define BPF_ADD 0x00 +#define BPF_SUB 0x10 +#define BPF_MUL 0x20 +#define BPF_DIV 0x30 +#define BPF_OR 0x40 +#define BPF_AND 0x50 +#define BPF_LSH 0x60 +#define BPF_RSH 0x70 +#define BPF_NEG 0x80 +#define BPF_JA 0x00 +#define BPF_JEQ 0x10 +#define BPF_JGT 0x20 +#define BPF_JGE 0x30 +#define BPF_JSET 0x40 +#define BPF_SRC(code) ((code) & 0x08) +#define BPF_K 0x00 +#define BPF_X 0x08 + +/* ret - BPF_K and BPF_X also apply */ +#define BPF_RVAL(code) ((code) & 0x18) +#define BPF_A 0x10 + +/* misc */ +#define BPF_MISCOP(code) ((code) & 0xf8) +#define BPF_TAX 0x00 +#define BPF_TXA 0x80 + +/* + * The instruction data structure. + */ +struct bpf_insn { + u_short code; + u_char jt; + u_char jf; + long k; +}; + +/* + * Macros for insn array initializers. 
+ */ +#define BPF_STMT(code, k) { (u_short)(code), 0, 0, k } +#define BPF_JUMP(code, k, jt, jf) { (u_short)(code), jt, jf, k } + +#ifdef KERNEL +int bpf_validate __P((struct bpf_insn *, int)); +int bpfopen __P((dev_t, int)); +int bpfclose __P((dev_t, int)); +int bpfread __P((dev_t, struct uio *)); +int bpfwrite __P((dev_t, struct uio *)); +int bpfioctl __P((dev_t, int, caddr_t, int)); +int bpf_select __P((dev_t, int, struct proc *)); +void bpf_tap __P((caddr_t, u_char *, u_int)); +void bpf_mtap __P((caddr_t, struct mbuf *)); +void bpfattach __P((caddr_t *, struct ifnet *, u_int, u_int)); +void bpfilterattach __P((int)); +u_int bpf_filter __P((struct bpf_insn *, u_char *, u_int, u_int)); +#endif + +/* + * Number of scratch memory words (for BPF_LD|BPF_MEM and BPF_ST). + */ +#define BPF_MEMWORDS 16 + diff --git a/sys/net/bpf_compat.h b/sys/net/bpf_compat.h new file mode 100644 index 00000000000..132a6df6452 --- /dev/null +++ b/sys/net/bpf_compat.h @@ -0,0 +1,50 @@ +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)bpf_compat.h 8.1 (Berkeley) 6/10/93 + */ + +/* from: $Header: bpf_compat.h,v 1.1 92/05/22 15:33:20 mccanne Exp $ (LBL) */ + +/* + * Some hacks for compatibility across SunOS and 4.4BSD. We emulate malloc + * and free with mbuf clusters. We store a pointer to the mbuf in the first + * word of the mbuf and return 8 bytes passed the start of data (for double + * word alignment). We cannot just use offsets because clusters are not at + * a fixed offset from the associated mbuf. Sorry for this kludge. + */ +#define malloc(size, type, canwait) bpf_alloc(size, canwait) +#define free(cp, type) m_free(*(struct mbuf **)(cp - 8)) +#define M_WAITOK M_WAIT + +/* This mapping works for our purposes. */ +#define ERESTART EINTR diff --git a/sys/net/bpf_filter.c b/sys/net/bpf_filter.c new file mode 100644 index 00000000000..6a30a665754 --- /dev/null +++ b/sys/net/bpf_filter.c @@ -0,0 +1,548 @@ +/* + * Copyright (c) 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * This code is derived from the Stanford/CMU enet packet filter, + * (net/enet.c) distributed as part of 4.3BSD, and code contributed + * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence + * Berkeley Laboratory. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ *	@(#)bpf_filter.c	8.1 (Berkeley) 6/10/93
+ *
+ * static char rcsid[] =
+ * "$Header: bpf_filter.c,v 1.16 91/10/27 21:22:35 mccanne Exp $";
+ */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/time.h>
+
+#ifdef sun
+#include <netinet/in.h>
+#endif
+
+#if defined(sparc) || defined(mips) || defined(ibm032)
+#define BPF_ALIGN
+#endif
+
+/*
+ * Fetch a big-endian 16 or 32 bit quantity from p.  Machines that
+ * define BPF_ALIGN assemble the value a byte at a time.
+ */
+#ifndef BPF_ALIGN
+#define EXTRACT_SHORT(p)	((u_short)ntohs(*(u_short *)p))
+#define EXTRACT_LONG(p)		(ntohl(*(u_long *)p))
+#else
+#define EXTRACT_SHORT(p)\
+	((u_short)\
+		((u_short)*((u_char *)p+0)<<8|\
+		 (u_short)*((u_char *)p+1)<<0))
+#define EXTRACT_LONG(p)\
+		((u_long)*((u_char *)p+0)<<24|\
+		 (u_long)*((u_char *)p+1)<<16|\
+		 (u_long)*((u_char *)p+2)<<8|\
+		 (u_long)*((u_char *)p+3)<<0)
+#endif
+
+#ifdef KERNEL
+#include <sys/mbuf.h>
+/*
+ * Advance m to the mbuf holding byte offset k of the chain and reduce
+ * k to an offset within that mbuf.  Makes the enclosing function
+ * return 0 if k is negative or lies beyond the end of the chain.
+ */
+#define MINDEX(m, k) \
+{ \
+	register int len = m->m_len; \
+ \
+	if (k < 0) \
+		return 0; \
+	while (k >= len) { \
+		k -= len; \
+		m = m->m_next; \
+		if (m == 0) \
+			return 0; \
+		len = m->m_len; \
+	} \
+}
+
+/*
+ * Fetch the big-endian 32 bit word at byte offset k of mbuf chain m.
+ * The word may straddle two adjacent mbufs.  *err is set to 0 on
+ * success; on failure (negative offset, offset beyond the chain, or
+ * not enough data in the following mbuf) *err is set to 1 and 0 is
+ * returned.
+ */
+static int
+m_xword(m, k, err)
+	register struct mbuf *m;
+	register int k, *err;
+{
+	register int len;
+	register u_char *cp, *np;
+	register struct mbuf *m0;
+
+	if (k < 0)
+		goto bad;
+	len = m->m_len;
+	while (k >= len) {
+		k -= len;
+		m = m->m_next;
+		if (m == 0)
+			goto bad;
+		len = m->m_len;
+	}
+	cp = mtod(m, u_char *) + k;
+	if (len - k >= 4) {
+		*err = 0;
+		return EXTRACT_LONG(cp);
+	}
+	m0 = m->m_next;
+	if (m0 == 0 || m0->m_len + len - k < 4)
+		goto bad;
+	*err = 0;
+	np = mtod(m0, u_char *);
+	/*
+	 * cp already points at the first byte of the word, so the bytes
+	 * left in this mbuf are cp[0..]; the rest come from np.
+	 */
+	switch (len - k) {
+
+	case 1:
+		return (cp[0] << 24) | (np[0] << 16) | (np[1] << 8) | np[2];
+
+	case 2:
+		return (cp[0] << 24) | (cp[1] << 16) | (np[0] << 8) |
+			np[1];
+
+	default:
+		return (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) |
+			np[0];
+	}
+    bad:
+	*err = 1;
+	return 0;
+}
+
+/*
+ * Fetch the big-endian 16 bit halfword at byte offset k of mbuf chain m.
+ * *err is set to 0 on success, or to 1 (with 0 returned) if the offset
+ * is negative or the data is not there.
+ */
+static int
+m_xhalf(m, k, err)
+	register struct mbuf *m;
+	register int k, *err;
+{
+	register int len;
+	register u_char *cp;
+	register struct mbuf *m0;
+
+	if (k < 0)
+		goto bad;
+	len = m->m_len;
+	while (k >= len) {
+		k -= len;
+		m = m->m_next;
+		if (m == 0)
+			goto bad;
+		len = m->m_len;
+	}
+	cp = mtod(m, u_char *) + k;
+	if (len - k >= 2) {
+		*err = 0;
+		return EXTRACT_SHORT(cp);
+	}
+	m0 = m->m_next;
+	if (m0 == 0)
+		goto bad;
+	*err = 0;
+	/* cp points at the high byte; the low byte starts the next mbuf. */
+	return (cp[0] << 8) | mtod(m0, u_char *)[0];
+ bad:
+	*err = 1;
+	return 0;
+}
+#endif
+
+#include <net/bpf.h>
+/*
+ * Execute the filter program starting at pc on the packet p
+ * wirelen is the length of the original packet
+ * buflen is the amount of data present
+ *
+ * In the kernel a buflen of 0 means that p is really an mbuf chain.
+ * The value returned is the number of bytes of the packet to keep;
+ * 0 rejects the packet.  Loads at negative or out of range offsets
+ * reject the packet.
+ */
+u_int
+bpf_filter(pc, p, wirelen, buflen)
+	register struct bpf_insn *pc;
+	register u_char *p;
+	u_int wirelen;
+	register u_int buflen;
+{
+	register u_long A, X;
+	register int k;
+	long mem[BPF_MEMWORDS];
+
+	if (pc == 0)
+		/*
+		 * No filter means accept all.
+		 */
+		return (u_int)-1;
+	/*
+	 * Start from a known state so that a program which uses the
+	 * accumulator or index register before loading it cannot see
+	 * whatever happened to be left in them.
+	 */
+	A = 0;
+	X = 0;
+	--pc;
+	while (1) {
+		++pc;
+		switch (pc->code) {
+
+		default:
+#ifdef KERNEL
+			return 0;
+#else
+			abort();
+#endif
+		case BPF_RET|BPF_K:
+			return (u_int)pc->k;
+
+		case BPF_RET|BPF_A:
+			return (u_int)A;
+
+		case BPF_LD|BPF_W|BPF_ABS:
+			k = pc->k;
+			if (k < 0 || k + sizeof(long) > buflen) {
+#ifdef KERNEL
+				int merr;
+
+				if (buflen != 0)
+					return 0;
+				A = m_xword((struct mbuf *)p, k, &merr);
+				if (merr != 0)
+					return 0;
+				continue;
+#else
+				return 0;
+#endif
+			}
+#ifdef BPF_ALIGN
+			if (((int)(p + k) & 3) != 0)
+				A = EXTRACT_LONG(&p[k]);
+			else
+#endif
+				A = ntohl(*(long *)(p + k));
+			continue;
+
+		case BPF_LD|BPF_H|BPF_ABS:
+			k = pc->k;
+			if (k < 0 || k + sizeof(short) > buflen) {
+#ifdef KERNEL
+				int merr;
+
+				if (buflen != 0)
+					return 0;
+				A = m_xhalf((struct mbuf *)p, k, &merr);
+				if (merr != 0)
+					return 0;
+				continue;
+#else
+				return 0;
+#endif
+			}
+			A = EXTRACT_SHORT(&p[k]);
+			continue;
+
+		case BPF_LD|BPF_B|BPF_ABS:
+			k = pc->k;
+			if (k >= buflen) {
+#ifdef KERNEL
+				register struct mbuf *m;
+
+				if (buflen != 0)
+					return 0;
+				m = (struct mbuf *)p;
+				MINDEX(m, k);
+				A = mtod(m, u_char *)[k];
+				continue;
+#else
+				return 0;
+#endif
+			}
+			A = p[k];
+			continue;
+
+		case BPF_LD|BPF_W|BPF_LEN:
+			A = wirelen;
+			continue;
+
+		case BPF_LDX|BPF_W|BPF_LEN:
+			X = wirelen;
+			continue;
+
+		case BPF_LD|BPF_W|BPF_IND:
+			k = X + pc->k;
+			if (k < 0 || k + sizeof(long) > buflen) {
+#ifdef KERNEL
+				int merr;
+
+				if (buflen != 0)
+					return 0;
+				A = m_xword((struct mbuf *)p, k, &merr);
+				if (merr != 0)
+					return 0;
+				continue;
+#else
+				return 0;
+#endif
+			}
+#ifdef BPF_ALIGN
+			if (((int)(p + k) & 3) != 0)
+				A = EXTRACT_LONG(&p[k]);
+			else
+#endif
+				A = ntohl(*(long *)(p + k));
+			continue;
+
+		case BPF_LD|BPF_H|BPF_IND:
+			k = X + pc->k;
+			if (k < 0 || k + sizeof(short) > buflen) {
+#ifdef KERNEL
+				int merr;
+
+				if (buflen != 0)
+					return 0;
+				A = m_xhalf((struct mbuf *)p, k, &merr);
+				if (merr != 0)
+					return 0;
+				continue;
+#else
+				return 0;
+#endif
+			}
+			A = EXTRACT_SHORT(&p[k]);
+			continue;
+
+		case BPF_LD|BPF_B|BPF_IND:
+			k = X + pc->k;
+			if (k >= buflen) {
+#ifdef KERNEL
+				register struct mbuf *m;
+
+				if (buflen != 0)
+					return 0;
+				m = (struct mbuf *)p;
+				MINDEX(m, k);
+				/* Unsigned, as in the contiguous case. */
+				A = mtod(m, u_char *)[k];
+				continue;
+#else
+				return 0;
+#endif
+			}
+			A = p[k];
+			continue;
+
+		case BPF_LDX|BPF_MSH|BPF_B:
+			k = pc->k;
+			if (k >= buflen) {
+#ifdef KERNEL
+				register struct mbuf *m;
+
+				if (buflen != 0)
+					return 0;
+				m = (struct mbuf *)p;
+				MINDEX(m, k);
+				X = (mtod(m, char *)[k] & 0xf) << 2;
+				continue;
+#else
+				return 0;
+#endif
+			}
+			X = (p[k] & 0xf) << 2;
+			continue;
+
+		case BPF_LD|BPF_IMM:
+			A = pc->k;
+			continue;
+
+		case BPF_LDX|BPF_IMM:
+			X = pc->k;
+			continue;
+
+		/*
+		 * The scratch memory index is not checked here; the
+		 * program is expected to have been vetted beforehand.
+		 */
+		case BPF_LD|BPF_MEM:
+			A = mem[pc->k];
+			continue;
+
+		case BPF_LDX|BPF_MEM:
+			X = mem[pc->k];
+			continue;
+
+		case BPF_ST:
+			mem[pc->k] = A;
+			continue;
+
+		case BPF_STX:
+			mem[pc->k] = X;
+			continue;
+
+		case BPF_JMP|BPF_JA:
+			pc += pc->k;
+			continue;
+
+		case BPF_JMP|BPF_JGT|BPF_K:
+			pc += (A > pc->k) ? pc->jt : pc->jf;
+			continue;
+
+		case BPF_JMP|BPF_JGE|BPF_K:
+			pc += (A >= pc->k) ? pc->jt : pc->jf;
+			continue;
+
+		case BPF_JMP|BPF_JEQ|BPF_K:
+			pc += (A == pc->k) ? pc->jt : pc->jf;
+			continue;
+
+		case BPF_JMP|BPF_JSET|BPF_K:
+			pc += (A & pc->k) ? pc->jt : pc->jf;
+			continue;
+
+		case BPF_JMP|BPF_JGT|BPF_X:
+			pc += (A > X) ? pc->jt : pc->jf;
+			continue;
+
+		case BPF_JMP|BPF_JGE|BPF_X:
+			pc += (A >= X) ? pc->jt : pc->jf;
+			continue;
+
+		case BPF_JMP|BPF_JEQ|BPF_X:
+			pc += (A == X) ? pc->jt : pc->jf;
+			continue;
+
+		case BPF_JMP|BPF_JSET|BPF_X:
+			pc += (A & X) ? pc->jt : pc->jf;
+			continue;
+
+		case BPF_ALU|BPF_ADD|BPF_X:
+			A += X;
+			continue;
+
+		case BPF_ALU|BPF_SUB|BPF_X:
+			A -= X;
+			continue;
+
+		case BPF_ALU|BPF_MUL|BPF_X:
+			A *= X;
+			continue;
+
+		case BPF_ALU|BPF_DIV|BPF_X:
+			if (X == 0)
+				return 0;
+			A /= X;
+			continue;
+
+		case BPF_ALU|BPF_AND|BPF_X:
+			A &= X;
+			continue;
+
+		case BPF_ALU|BPF_OR|BPF_X:
+			A |= X;
+			continue;
+
+		case BPF_ALU|BPF_LSH|BPF_X:
+			A <<= X;
+			continue;
+
+		case BPF_ALU|BPF_RSH|BPF_X:
+			A >>= X;
+			continue;
+
+		case BPF_ALU|BPF_ADD|BPF_K:
+			A += pc->k;
+			continue;
+
+		case BPF_ALU|BPF_SUB|BPF_K:
+			A -= pc->k;
+			continue;
+
+		case BPF_ALU|BPF_MUL|BPF_K:
+			A *= pc->k;
+			continue;
+
+		case BPF_ALU|BPF_DIV|BPF_K:
+			A /= pc->k;
+			continue;
+
+		case BPF_ALU|BPF_AND|BPF_K:
+			A &= pc->k;
+			continue;
+
+		case BPF_ALU|BPF_OR|BPF_K:
+			A |= pc->k;
+			continue;
+
+		case BPF_ALU|BPF_LSH|BPF_K:
+			A <<= pc->k;
+			continue;
+
+		case BPF_ALU|BPF_RSH|BPF_K:
+			A >>= pc->k;
+			continue;
+
+		case BPF_ALU|BPF_NEG:
+			A = -A;
+			continue;
+
+		case BPF_MISC|BPF_TAX:
+			X = A;
+			continue;
+
+		case BPF_MISC|BPF_TXA:
+			A = X;
+			continue;
+		}
+	}
+}
+
+#ifdef KERNEL
+/*
+ * Return true if the 'fcode' is a valid filter program.
+ * The constraints are that each jump be forward and to a valid
+ * code.  The code must terminate with either an accept or reject.
+ * 'valid' is an array for use by the routine (it must be at least
+ * 'len' bytes long).
+ *
+ * The kernel needs to be able to verify an application's filter code.
+ * Otherwise, a bogus program could easily crash the system.
+ */
+int
+bpf_validate(f, len)
+	struct bpf_insn *f;
+	int len;
+{
+	register int i;
+	register struct bpf_insn *p;
+
+	/*
+	 * An empty program has no final instruction to inspect and
+	 * cannot end in a return.
+	 */
+	if (len < 1)
+		return 0;
+
+	for (i = 0; i < len; ++i) {
+		/*
+		 * Check that jumps are forward, and within
+		 * the code block.
+		 */
+		p = &f[i];
+		if (BPF_CLASS(p->code) == BPF_JMP) {
+			register int from = i + 1;
+
+			if (BPF_OP(p->code) == BPF_JA) {
+				/*
+				 * k is signed: a negative offset would be
+				 * a backward jump.  Comparing against the
+				 * room left also avoids overflowing the sum.
+				 */
+				if (p->k < 0 || p->k >= len - from)
+					return 0;
+			}
+			else if (from + p->jt >= len || from + p->jf >= len)
+				return 0;
+		}
+		/*
+		 * Check that memory operations use valid addresses.
+		 * This covers stores from either register (ST, STX) and
+		 * loads into either register (LD, LDX) from scratch memory.
+		 */
+		if ((BPF_CLASS(p->code) == BPF_ST ||
+		     BPF_CLASS(p->code) == BPF_STX ||
+		     ((BPF_CLASS(p->code) == BPF_LD ||
+		       BPF_CLASS(p->code) == BPF_LDX) &&
+		      BPF_MODE(p->code) == BPF_MEM)) &&
+		    (p->k >= BPF_MEMWORDS || p->k < 0))
+			return 0;
+		/*
+		 * Check for constant division by 0.
+		 */
+		if (p->code == (BPF_ALU|BPF_DIV|BPF_K) && p->k == 0)
+			return 0;
+	}
+	/* The last instruction must be a return. */
+	return BPF_CLASS(f[len - 1].code) == BPF_RET;
+}
+#endif
diff --git a/sys/net/bpfdesc.h b/sys/net/bpfdesc.h
new file mode 100644
index 00000000000..a13320e86a6
--- /dev/null
+++ b/sys/net/bpfdesc.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 1990, 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from the Stanford/CMU enet packet filter,
+ * (net/enet.c) distributed as part of 4.3BSD, and code contributed
+ * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
+ * Berkeley Laboratory.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)bpfdesc.h 8.1 (Berkeley) 6/10/93 + * + * @(#) $Header: bpfdesc.h,v 1.9 91/10/27 21:22:38 mccanne Exp $ (LBL) + */ + +/* + * Descriptor associated with each open bpf file. + */ +struct bpf_d { + struct bpf_d *bd_next; /* Linked list of descriptors */ + /* + * Buffer slots: two mbuf clusters buffer the incoming packets. + * The model has three slots. Sbuf is always occupied. + * sbuf (store) - Receive interrupt puts packets here. + * hbuf (hold) - When sbuf is full, put cluster here and + * wakeup read (replace sbuf with fbuf). + * fbuf (free) - When read is done, put cluster here. + * On receiving, if sbuf is full and fbuf is 0, packet is dropped. 
+ */ + caddr_t bd_sbuf; /* store slot */ + caddr_t bd_hbuf; /* hold slot */ + caddr_t bd_fbuf; /* free slot */ + int bd_slen; /* current length of store buffer */ + int bd_hlen; /* current length of hold buffer */ + + int bd_bufsize; /* absolute length of buffers */ + + struct bpf_if * bd_bif; /* interface descriptor */ + u_long bd_rtout; /* Read timeout in 'ticks' */ + struct bpf_insn *bd_filter; /* filter code */ + u_long bd_rcount; /* number of packets received */ + u_long bd_dcount; /* number of packets dropped */ + + u_char bd_promisc; /* true if listening promiscuously */ + u_char bd_state; /* idle, waiting, or timed out */ + u_char bd_immediate; /* true to return on packet arrival */ +#if BSD < 199103 + u_char bd_selcoll; /* true if selects collide */ + int bd_timedout; + struct proc * bd_selproc; /* process that last selected us */ +#else + u_char bd_pad; /* explicit alignment */ + struct selinfo bd_sel; /* bsd select info */ +#endif +}; + +/* + * Descriptor associated with each attached hardware interface. + */ +struct bpf_if { + struct bpf_if *bif_next; /* list of all interfaces */ + struct bpf_d *bif_dlist; /* descriptor list */ + struct bpf_if **bif_driverp; /* pointer into softc */ + u_int bif_dlt; /* link layer type */ + u_int bif_hdrlen; /* length of header (with padding) */ + struct ifnet *bif_ifp; /* corresponding interface */ +}; + +#ifdef KERNEL +int bpf_setf __P((struct bpf_d *, struct bpf_program *)); +#endif diff --git a/sys/net/if.c b/sys/net/if.c new file mode 100644 index 00000000000..36963885cc7 --- /dev/null +++ b/sys/net/if.c @@ -0,0 +1,670 @@ +/* + * Copyright (c) 1980, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if.c 8.3 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +int ifqmaxlen = IFQ_MAXLEN; +void if_slowtimo __P((void *arg)); + +/* + * Network interface utility routines. + * + * Routines with ifa_ifwith* names take sockaddr *'s as + * parameters. 
+ */ +void +ifinit() +{ + register struct ifnet *ifp; + + /* Give every interface that has no send-queue limit the default one. */ + for (ifp = ifnet; ifp; ifp = ifp->if_next) + if (ifp->if_snd.ifq_maxlen == 0) + ifp->if_snd.ifq_maxlen = ifqmaxlen; + /* First call of the watchdog routine; it re-arms itself via timeout(). */ + if_slowtimo(0); +} + +#ifdef vax +/* + * Call each interface on a Unibus reset. + */ +void +ifubareset(uban) + int uban; +{ + register struct ifnet *ifp; + + for (ifp = ifnet; ifp; ifp = ifp->if_next) + if (ifp->if_reset) + (*ifp->if_reset)(ifp->if_unit, uban); +} +#endif + +int if_index = 0; +struct ifaddr **ifnet_addrs; +static char *sprint_d __P((u_int, char *, int)); + +/* + * Attach an interface to the + * list of "active" interfaces. + * + * The interface is appended to the ifnet chain, given the next + * if_index, and a link-level (AF_LINK) address/netmask pair is + * allocated for it and recorded in ifnet_addrs[if_index - 1]. + */ +void +if_attach(ifp) + struct ifnet *ifp; +{ + unsigned socksize, ifasize; + int namelen, unitlen, masklen, ether_output(); + char workbuf[12], *unitname; + register struct ifnet **p = &ifnet; + register struct sockaddr_dl *sdl; + register struct ifaddr *ifa; + static int if_indexlim = 8; + extern void link_rtrequest(); + + /* Walk to the end of the chain and link the new interface there. */ + while (*p) + p = &((*p)->if_next); + *p = ifp; + ifp->if_index = ++if_index; + /* + * Allocate the index table on first use, or double it when the + * new index no longer fits. Only the old half (n/2 bytes) is + * copied; the new half is not cleared here. + */ + if (ifnet_addrs == 0 || if_index >= if_indexlim) { + unsigned n = (if_indexlim <<= 1) * sizeof(ifa); + struct ifaddr **q = (struct ifaddr **) + malloc(n, M_IFADDR, M_WAITOK); + if (ifnet_addrs) { + bcopy((caddr_t)ifnet_addrs, (caddr_t)q, n/2); + free((caddr_t)ifnet_addrs, M_IFADDR); + } + ifnet_addrs = q; + } + /* + * create a Link Level name for this device + */ + unitname = sprint_d((u_int)ifp->if_unit, workbuf, sizeof(workbuf)); + namelen = strlen(ifp->if_name); + unitlen = strlen(unitname); +#define _offsetof(t, m) ((int)((caddr_t)&((t *)0)->m)) + /* The netmask covers the sockaddr_dl header plus the name bytes only. */ + masklen = _offsetof(struct sockaddr_dl, sdl_data[0]) + + unitlen + namelen; + socksize = masklen + ifp->if_addrlen; +#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof(long) - 1))) + socksize = ROUNDUP(socksize); + if (socksize < sizeof(*sdl)) + socksize = sizeof(*sdl); + /* One allocation holds the ifaddr followed by two sockaddr_dl: address, then netmask. */ + ifasize = sizeof(*ifa) + 2 * socksize; + if (ifa = (struct ifaddr *)malloc(ifasize, M_IFADDR, M_WAITOK)) { + bzero((caddr_t)ifa, ifasize); + 
sdl = (struct sockaddr_dl *)(ifa + 1); + sdl->sdl_len = socksize; + sdl->sdl_family = AF_LINK; + bcopy(ifp->if_name, sdl->sdl_data, namelen); + bcopy(unitname, namelen + (caddr_t)sdl->sdl_data, unitlen); + /* From here on namelen is the length of name and unit together. */ + sdl->sdl_nlen = (namelen += unitlen); + sdl->sdl_index = ifp->if_index; + sdl->sdl_type = ifp->if_type; + ifnet_addrs[if_index - 1] = ifa; + ifa->ifa_ifp = ifp; + /* Push the new address on the front of the interface's address list. */ + ifa->ifa_next = ifp->if_addrlist; + ifa->ifa_rtrequest = link_rtrequest; + ifp->if_addrlist = ifa; + ifa->ifa_addr = (struct sockaddr *)sdl; + /* The second sockaddr_dl, socksize bytes further on, is the netmask. */ + sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); + ifa->ifa_netmask = (struct sockaddr *)sdl; + sdl->sdl_len = masklen; + /* Mask is all ones over the name bytes. */ + while (namelen != 0) + sdl->sdl_data[--namelen] = 0xff; + } + /* XXX -- Temporary fix before changing 10 ethernet drivers */ + if (ifp->if_output == ether_output) + ether_ifattach(ifp); +} +/* + * Locate an interface based on a complete address. + * Returns the first ifaddr whose address, or (on a broadcast + * interface) whose broadcast address, matches; 0 if none does. + */ +/*ARGSUSED*/ +struct ifaddr * +ifa_ifwithaddr(addr) + register struct sockaddr *addr; +{ + register struct ifnet *ifp; + register struct ifaddr *ifa; + +/* Byte-wise comparison over the sa_len of the first argument. */ +#define equal(a1, a2) \ + (bcmp((caddr_t)(a1), (caddr_t)(a2), ((struct sockaddr *)(a1))->sa_len) == 0) + for (ifp = ifnet; ifp; ifp = ifp->if_next) + for (ifa = ifp->if_addrlist; ifa; ifa = ifa->ifa_next) { + if (ifa->ifa_addr->sa_family != addr->sa_family) + continue; + if (equal(addr, ifa->ifa_addr)) + return (ifa); + if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && + equal(ifa->ifa_broadaddr, addr)) + return (ifa); + } + return ((struct ifaddr *)0); +} +/* + * Locate the point to point interface with a given destination address. 
+ * Returns the matching ifaddr, or 0 if no point to point interface + * has an address of addr's family whose destination equals addr. + */ +/*ARGSUSED*/ +struct ifaddr * +ifa_ifwithdstaddr(addr) + register struct sockaddr *addr; +{ + register struct ifnet *ifp; + register struct ifaddr *ifa; + + for (ifp = ifnet; ifp; ifp = ifp->if_next) + if (ifp->if_flags & IFF_POINTOPOINT) + for (ifa = ifp->if_addrlist; ifa; ifa = ifa->ifa_next) { + /* + * Skip addresses with no destination set (e.g. the + * link-level address built by if_attach) so that + * equal() is never handed a null pointer. + */ + if (ifa->ifa_addr->sa_family != addr->sa_family || + ifa->ifa_dstaddr == 0) + continue; + if (equal(addr, ifa->ifa_dstaddr)) + return (ifa); + } + return ((struct ifaddr *)0); +} + +/* + * Find an interface on a specific network. If many, choice + * is most specific found. + */ +struct ifaddr * +ifa_ifwithnet(addr) + struct sockaddr *addr; +{ + register struct ifnet *ifp; + register struct ifaddr *ifa; + struct ifaddr *ifa_maybe = (struct ifaddr *) 0; + u_int af = addr->sa_family; + char *addr_data = addr->sa_data, *cplim; + + /* Link-level addresses are looked up directly by interface index. */ + if (af == AF_LINK) { + register struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr; + if (sdl->sdl_index && sdl->sdl_index <= if_index) + return (ifnet_addrs[sdl->sdl_index - 1]); + } + for (ifp = ifnet; ifp; ifp = ifp->if_next) + for (ifa = ifp->if_addrlist; ifa; ifa = ifa->ifa_next) { + register char *cp, *cp2, *cp3; + + /* 'next' labels the continue so the mask loop below can jump to it. */ + if (ifa->ifa_addr->sa_family != af || ifa->ifa_netmask == 0) + next: continue; + cp = addr_data; + cp2 = ifa->ifa_addr->sa_data; + cp3 = ifa->ifa_netmask->sa_data; + cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; + /* Any bit that differs under the mask means a different network. */ + while (cp3 < cplim) + if ((*cp++ ^ *cp2++) & *cp3++) + goto next; + if (ifa_maybe == 0 || + rn_refines((caddr_t)ifa->ifa_netmask, + (caddr_t)ifa_maybe->ifa_netmask)) + ifa_maybe = ifa; + } + return (ifa_maybe); +} + +/* + * Find an interface using a specific address family + */ +struct ifaddr * +ifa_ifwithaf(af) + register int af; +{ + register struct ifnet *ifp; + register struct ifaddr *ifa; + + for (ifp = ifnet; ifp; ifp = ifp->if_next) + for (ifa = ifp->if_addrlist; ifa; ifa = ifa->ifa_next) + if (ifa->ifa_addr->sa_family == af) + return (ifa); + return ((struct ifaddr *)0); +} + +/* + * Find an interface address 
specific to an interface best matching + * a given address. + * + * Returns an exact or network match if there is one, otherwise the + * last address of the right family seen on the interface, or 0. + */ +struct ifaddr * +ifaof_ifpforaddr(addr, ifp) + struct sockaddr *addr; + register struct ifnet *ifp; +{ + register struct ifaddr *ifa; + register char *cp, *cp2, *cp3; + register char *cplim; + struct ifaddr *ifa_maybe = 0; + u_int af = addr->sa_family; + + if (af >= AF_MAX) + return (0); + for (ifa = ifp->if_addrlist; ifa; ifa = ifa->ifa_next) { + if (ifa->ifa_addr->sa_family != af) + continue; + /* Remember it as the fallback in case nothing matches better. */ + ifa_maybe = ifa; + /* Without a netmask only a full address or destination match counts. */ + if (ifa->ifa_netmask == 0) { + if (equal(addr, ifa->ifa_addr) || + (ifa->ifa_dstaddr && equal(addr, ifa->ifa_dstaddr))) + return (ifa); + continue; + } + cp = addr->sa_data; + cp2 = ifa->ifa_addr->sa_data; + cp3 = ifa->ifa_netmask->sa_data; + cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; + /* Stop at the first byte that differs under the mask. */ + for (; cp3 < cplim; cp3++) + if ((*cp++ ^ *cp2++) & *cp3) + break; + /* Reaching the end of the mask means addr is on this network. */ + if (cp3 == cplim) + return (ifa); + } + return (ifa_maybe); +} + +#include + +/* + * Default action when installing a route with a Link Level gateway. + * Lookup an appropriate real ifa to point to. + * This should be moved to /sys/net/link.c eventually. + */ +void +link_rtrequest(cmd, rt, sa) + int cmd; + register struct rtentry *rt; + struct sockaddr *sa; +{ + register struct ifaddr *ifa; + struct sockaddr *dst; + struct ifnet *ifp; + + /* Only route additions with an ifa, an interface and a key are handled. */ + if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == 0) || + ((ifp = ifa->ifa_ifp) == 0) || ((dst = rt_key(rt)) == 0)) + return; + if (ifa = ifaof_ifpforaddr(dst, ifp)) { + /* Swap the route's ifa: drop the old reference, take a new one. */ + IFAFREE(rt->rt_ifa); + rt->rt_ifa = ifa; + ifa->ifa_refcnt++; + /* Chain to the new ifa's own handler, but never back into ourselves. */ + if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest) + ifa->ifa_rtrequest(cmd, rt, sa); + } +} + +/* + * Mark an interface down and notify protocols of + * the transition. + * NOTE: must be called at splnet or equivalent. 
+ */ +void +if_down(ifp) + register struct ifnet *ifp; +{ + register struct ifaddr *ifa; + + ifp->if_flags &= ~IFF_UP; + /* Tell the protocols about every address on the interface. */ + for (ifa = ifp->if_addrlist; ifa; ifa = ifa->ifa_next) + pfctlinput(PRC_IFDOWN, ifa->ifa_addr); + /* Discard anything still waiting on the output queue. */ + if_qflush(&ifp->if_snd); + rt_ifmsg(ifp); +} + +/* + * Mark an interface up and notify protocols of + * the transition. + * NOTE: must be called at splnet or equivalent. + */ +void +if_up(ifp) + register struct ifnet *ifp; +{ + register struct ifaddr *ifa; + + ifp->if_flags |= IFF_UP; +#ifdef notyet + /* this has no effect on IP, and will kill all iso connections XXX */ + for (ifa = ifp->if_addrlist; ifa; ifa = ifa->ifa_next) + pfctlinput(PRC_IFUP, ifa->ifa_addr); +#endif + rt_ifmsg(ifp); +} + +/* + * Flush an interface queue. + * Every queued packet is freed and the queue is reset to empty. + */ +void +if_qflush(ifq) + register struct ifqueue *ifq; +{ + register struct mbuf *m, *n; + + n = ifq->ifq_head; + /* Fetch the successor before freeing the current packet. */ + while (m = n) { + n = m->m_act; + m_freem(m); + } + ifq->ifq_head = 0; + ifq->ifq_tail = 0; + ifq->ifq_len = 0; +} + +/* + * Handle interface watchdog timer routines. Called + * from softclock, we decrement timers (if set) and + * call the appropriate interface routine on expiration. + */ +void +if_slowtimo(arg) + void *arg; +{ + register struct ifnet *ifp; + int s = splimp(); + + for (ifp = ifnet; ifp; ifp = ifp->if_next) { + /* Skip timers that are off (0) or have not yet counted down to 0. */ + if (ifp->if_timer == 0 || --ifp->if_timer) + continue; + if (ifp->if_watchdog) + (*ifp->if_watchdog)(ifp->if_unit); + } + splx(s); + /* Re-arm ourselves for the next tick of the slow timer. */ + timeout(if_slowtimo, (void *)0, hz / IFNET_SLOWHZ); +} + +/* + * Map interface name to + * interface structure pointer. 
+ */ +struct ifnet * +ifunit(name) + register char *name; +{ + register char *cp; + register struct ifnet *ifp; + int unit; + unsigned len; + char *ep, c; + + for (cp = name; cp < name + IFNAMSIZ && *cp; cp++) + if (*cp >= '0' && *cp <= '9') + break; + if (*cp == '\0' || cp == name + IFNAMSIZ) + return ((struct ifnet *)0); + /* + * Save first char of unit, and pointer to it, + * so we can put a null there to avoid matching + * initial substrings of interface names. + */ + len = cp - name + 1; + c = *cp; + ep = cp; + for (unit = 0; *cp >= '0' && *cp <= '9'; ) + unit = unit * 10 + *cp++ - '0'; + *ep = 0; + for (ifp = ifnet; ifp; ifp = ifp->if_next) { + if (bcmp(ifp->if_name, name, len)) + continue; + if (unit == ifp->if_unit) + break; + } + *ep = c; + return (ifp); +} + +/* + * Interface ioctls. + */ +int +ifioctl(so, cmd, data, p) + struct socket *so; + int cmd; + caddr_t data; + struct proc *p; +{ + register struct ifnet *ifp; + register struct ifreq *ifr; + int error; + + switch (cmd) { + + case SIOCGIFCONF: + case OSIOCGIFCONF: + return (ifconf(cmd, data)); + } + ifr = (struct ifreq *)data; + ifp = ifunit(ifr->ifr_name); + if (ifp == 0) + return (ENXIO); + switch (cmd) { + + case SIOCGIFFLAGS: + ifr->ifr_flags = ifp->if_flags; + break; + + case SIOCGIFMETRIC: + ifr->ifr_metric = ifp->if_metric; + break; + + case SIOCSIFFLAGS: + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + if (ifp->if_flags & IFF_UP && (ifr->ifr_flags & IFF_UP) == 0) { + int s = splimp(); + if_down(ifp); + splx(s); + } + if (ifr->ifr_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { + int s = splimp(); + if_up(ifp); + splx(s); + } + ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | + (ifr->ifr_flags &~ IFF_CANTCHANGE); + if (ifp->if_ioctl) + (void) (*ifp->if_ioctl)(ifp, cmd, data); + break; + + case SIOCSIFMETRIC: + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + ifp->if_metric = ifr->ifr_metric; + break; + + case SIOCADDMULTI: + case SIOCDELMULTI: + if 
(error = suser(p->p_ucred, &p->p_acflag)) + return (error); + if (ifp->if_ioctl == NULL) + return (EOPNOTSUPP); + return ((*ifp->if_ioctl)(ifp, cmd, data)); + + default: + if (so->so_proto == 0) + return (EOPNOTSUPP); +#ifndef COMPAT_43 + return ((*so->so_proto->pr_usrreq)(so, PRU_CONTROL, + cmd, data, ifp)); +#else + { + int ocmd = cmd; + + switch (cmd) { + + case SIOCSIFDSTADDR: + case SIOCSIFADDR: + case SIOCSIFBRDADDR: + case SIOCSIFNETMASK: +#if BYTE_ORDER != BIG_ENDIAN + if (ifr->ifr_addr.sa_family == 0 && + ifr->ifr_addr.sa_len < 16) { + ifr->ifr_addr.sa_family = ifr->ifr_addr.sa_len; + ifr->ifr_addr.sa_len = 16; + } +#else + if (ifr->ifr_addr.sa_len == 0) + ifr->ifr_addr.sa_len = 16; +#endif + break; + + case OSIOCGIFADDR: + cmd = SIOCGIFADDR; + break; + + case OSIOCGIFDSTADDR: + cmd = SIOCGIFDSTADDR; + break; + + case OSIOCGIFBRDADDR: + cmd = SIOCGIFBRDADDR; + break; + + case OSIOCGIFNETMASK: + cmd = SIOCGIFNETMASK; + } + error = ((*so->so_proto->pr_usrreq)(so, PRU_CONTROL, + cmd, data, ifp)); + switch (ocmd) { + + case OSIOCGIFADDR: + case OSIOCGIFDSTADDR: + case OSIOCGIFBRDADDR: + case OSIOCGIFNETMASK: + *(u_short *)&ifr->ifr_addr = ifr->ifr_addr.sa_family; + } + return (error); + + } +#endif + } + return (0); +} + +/* + * Return interface configuration + * of system. List may be used + * in later ioctl's (above) to get + * other information. 
+ */ +/*ARGSUSED*/ +int +ifconf(cmd, data) + int cmd; + caddr_t data; +{ + register struct ifconf *ifc = (struct ifconf *)data; + register struct ifnet *ifp = ifnet; + register struct ifaddr *ifa; + register char *cp, *ep; + struct ifreq ifr, *ifrp; + int space = ifc->ifc_len, error = 0; + + ifrp = ifc->ifc_req; + ep = ifr.ifr_name + sizeof (ifr.ifr_name) - 2; + for (; space > sizeof (ifr) && ifp; ifp = ifp->if_next) { + strncpy(ifr.ifr_name, ifp->if_name, sizeof (ifr.ifr_name) - 2); + for (cp = ifr.ifr_name; cp < ep && *cp; cp++) + continue; + *cp++ = '0' + ifp->if_unit; *cp = '\0'; + if ((ifa = ifp->if_addrlist) == 0) { + bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr)); + error = copyout((caddr_t)&ifr, (caddr_t)ifrp, + sizeof (ifr)); + if (error) + break; + space -= sizeof (ifr), ifrp++; + } else + for ( ; space > sizeof (ifr) && ifa; ifa = ifa->ifa_next) { + register struct sockaddr *sa = ifa->ifa_addr; +#ifdef COMPAT_43 + if (cmd == OSIOCGIFCONF) { + struct osockaddr *osa = + (struct osockaddr *)&ifr.ifr_addr; + ifr.ifr_addr = *sa; + osa->sa_family = sa->sa_family; + error = copyout((caddr_t)&ifr, (caddr_t)ifrp, + sizeof (ifr)); + ifrp++; + } else +#endif + if (sa->sa_len <= sizeof(*sa)) { + ifr.ifr_addr = *sa; + error = copyout((caddr_t)&ifr, (caddr_t)ifrp, + sizeof (ifr)); + ifrp++; + } else { + space -= sa->sa_len - sizeof(*sa); + if (space < sizeof (ifr)) + break; + error = copyout((caddr_t)&ifr, (caddr_t)ifrp, + sizeof (ifr.ifr_name)); + if (error == 0) + error = copyout((caddr_t)sa, + (caddr_t)&ifrp->ifr_addr, sa->sa_len); + ifrp = (struct ifreq *) + (sa->sa_len + (caddr_t)&ifrp->ifr_addr); + } + if (error) + break; + space -= sizeof (ifr); + } + } + ifc->ifc_len -= space; + return (error); +} + +static char * +sprint_d(n, buf, buflen) + u_int n; + char *buf; + int buflen; +{ + register char *cp = buf + buflen - 1; + + *cp = 0; + do { + cp--; + *cp = "0123456789"[n % 10]; + n /= 10; + } while (n != 0); + return (cp); +} diff --git a/sys/net/if.h 
b/sys/net/if.h new file mode 100644 index 00000000000..c27c4f9cf63 --- /dev/null +++ b/sys/net/if.h @@ -0,0 +1,363 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)if.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Structures defining a network interface, providing a packet + * transport mechanism (ala level 0 of the PUP protocols). + * + * Each interface accepts output datagrams of a specified maximum + * length, and provides higher level routines with input datagrams + * received from its medium. + * + * Output occurs when the routine if_output is called, with three parameters: + * (*ifp->if_output)(ifp, m, dst, rt) + * Here m is the mbuf chain to be sent and dst is the destination address. + * The output routine encapsulates the supplied datagram if necessary, + * and then transmits it on its medium. + * + * On input, each interface unwraps the data received by it, and either + * places it on the input queue of a internetwork datagram routine + * and posts the associated software interrupt, or passes the datagram to a raw + * packet input routine. + * + * Routines exist for locating interfaces by their addresses + * or for locating a interface on a certain network, as well as more general + * routing and gateway routines maintaining information used to locate + * interfaces. These routines live in the files if.c and route.c + */ +#ifndef _TIME_ /* XXX fast fix for SNMP, going away soon */ +#include +#endif + +#ifdef __STDC__ +/* + * Forward structure declarations for function prototypes [sic]. + */ +struct mbuf; +struct proc; +struct rtentry; +struct socket; +struct ether_header; +#endif +/* + * Structure describing information about an interface + * which may be of interest to management entities. + */ +/* + * Structure defining a queue for a network interface. + * + * (Would like to call this struct ``if'', but C isn't PL/1.) + */ + +struct ifnet { + char *if_name; /* name, e.g. 
``en'' or ``lo'' */ + struct ifnet *if_next; /* all struct ifnets are chained */ + struct ifaddr *if_addrlist; /* linked list of addresses per if */ + int if_pcount; /* number of promiscuous listeners */ + caddr_t if_bpf; /* packet filter structure */ + u_short if_index; /* numeric abbreviation for this if */ + short if_unit; /* sub-unit for lower level driver */ + short if_timer; /* time 'til if_watchdog called */ + short if_flags; /* up/down, broadcast, etc. */ + struct if_data { +/* generic interface information */ + u_char ifi_type; /* ethernet, tokenring, etc */ + u_char ifi_addrlen; /* media address length */ + u_char ifi_hdrlen; /* media header length */ + u_long ifi_mtu; /* maximum transmission unit */ + u_long ifi_metric; /* routing metric (external only) */ + u_long ifi_baudrate; /* linespeed */ +/* volatile statistics */ + u_long ifi_ipackets; /* packets received on interface */ + u_long ifi_ierrors; /* input errors on interface */ + u_long ifi_opackets; /* packets sent on interface */ + u_long ifi_oerrors; /* output errors on interface */ + u_long ifi_collisions; /* collisions on csma interfaces */ + u_long ifi_ibytes; /* total number of octets received */ + u_long ifi_obytes; /* total number of octets sent */ + u_long ifi_imcasts; /* packets received via multicast */ + u_long ifi_omcasts; /* packets sent via multicast */ + u_long ifi_iqdrops; /* dropped on input, this interface */ + u_long ifi_noproto; /* destined for unsupported protocol */ + struct timeval ifi_lastchange;/* last updated */ + } if_data; +/* procedure handles */ + int (*if_init) /* init routine */ + __P((int)); + int (*if_output) /* output routine (enqueue) */ + __P((struct ifnet *, struct mbuf *, struct sockaddr *, + struct rtentry *)); + int (*if_start) /* initiate output routine */ + __P((struct ifnet *)); + int (*if_done) /* output complete routine */ + __P((struct ifnet *)); /* (XXX not used; fake prototype) */ + int (*if_ioctl) /* ioctl routine */ + __P((struct ifnet *, int, 
caddr_t)); + int (*if_reset) + __P((int)); /* new autoconfig will permit removal */ + int (*if_watchdog) /* timer routine */ + __P((int)); + struct ifqueue { + struct mbuf *ifq_head; + struct mbuf *ifq_tail; + int ifq_len; + int ifq_maxlen; + int ifq_drops; + } if_snd; /* output queue */ +}; +#define if_mtu if_data.ifi_mtu +#define if_type if_data.ifi_type +#define if_addrlen if_data.ifi_addrlen +#define if_hdrlen if_data.ifi_hdrlen +#define if_metric if_data.ifi_metric +#define if_baudrate if_data.ifi_baudrate +#define if_ipackets if_data.ifi_ipackets +#define if_ierrors if_data.ifi_ierrors +#define if_opackets if_data.ifi_opackets +#define if_oerrors if_data.ifi_oerrors +#define if_collisions if_data.ifi_collisions +#define if_ibytes if_data.ifi_ibytes +#define if_obytes if_data.ifi_obytes +#define if_imcasts if_data.ifi_imcasts +#define if_omcasts if_data.ifi_omcasts +#define if_iqdrops if_data.ifi_iqdrops +#define if_noproto if_data.ifi_noproto +#define if_lastchange if_data.ifi_lastchange + +#define IFF_UP 0x1 /* interface is up */ +#define IFF_BROADCAST 0x2 /* broadcast address valid */ +#define IFF_DEBUG 0x4 /* turn on debugging */ +#define IFF_LOOPBACK 0x8 /* is a loopback net */ +#define IFF_POINTOPOINT 0x10 /* interface is point-to-point link */ +#define IFF_NOTRAILERS 0x20 /* avoid use of trailers */ +#define IFF_RUNNING 0x40 /* resources allocated */ +#define IFF_NOARP 0x80 /* no address resolution protocol */ +#define IFF_PROMISC 0x100 /* receive all packets */ +#define IFF_ALLMULTI 0x200 /* receive all multicast packets */ +#define IFF_OACTIVE 0x400 /* transmission in progress */ +#define IFF_SIMPLEX 0x800 /* can't hear own transmissions */ +#define IFF_LINK0 0x1000 /* per link layer defined bit */ +#define IFF_LINK1 0x2000 /* per link layer defined bit */ +#define IFF_LINK2 0x4000 /* per link layer defined bit */ +#define IFF_MULTICAST 0x8000 /* supports multicast */ + +/* flags set internally only: */ +#define IFF_CANTCHANGE \ + 
(IFF_BROADCAST|IFF_POINTOPOINT|IFF_RUNNING|IFF_OACTIVE|\ + IFF_SIMPLEX|IFF_MULTICAST|IFF_ALLMULTI) + +/* + * Output queues (ifp->if_snd) and internetwork datagram level (pup level 1) + * input routines have queues of messages stored on ifqueue structures + * (defined above). Entries are added to and deleted from these structures + * by these macros, which should be called with ipl raised to splimp(). + */ +#define IF_QFULL(ifq) ((ifq)->ifq_len >= (ifq)->ifq_maxlen) +#define IF_DROP(ifq) ((ifq)->ifq_drops++) +#define IF_ENQUEUE(ifq, m) { \ + (m)->m_nextpkt = 0; \ + if ((ifq)->ifq_tail == 0) \ + (ifq)->ifq_head = m; \ + else \ + (ifq)->ifq_tail->m_nextpkt = m; \ + (ifq)->ifq_tail = m; \ + (ifq)->ifq_len++; \ +} +#define IF_PREPEND(ifq, m) { \ + (m)->m_nextpkt = (ifq)->ifq_head; \ + if ((ifq)->ifq_tail == 0) \ + (ifq)->ifq_tail = (m); \ + (ifq)->ifq_head = (m); \ + (ifq)->ifq_len++; \ +} +#define IF_DEQUEUE(ifq, m) { \ + (m) = (ifq)->ifq_head; \ + if (m) { \ + if (((ifq)->ifq_head = (m)->m_nextpkt) == 0) \ + (ifq)->ifq_tail = 0; \ + (m)->m_nextpkt = 0; \ + (ifq)->ifq_len--; \ + } \ +} + +#define IFQ_MAXLEN 50 +#define IFNET_SLOWHZ 1 /* granularity is 1 second */ + +/* + * The ifaddr structure contains information about one address + * of an interface. They are maintained by the different address families, + * are allocated and attached when an address is set, and are linked + * together so all addresses for an interface can be located. 
+ */ +struct ifaddr { + struct sockaddr *ifa_addr; /* address of interface */ + struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */ +#define ifa_broadaddr ifa_dstaddr /* broadcast address interface */ + struct sockaddr *ifa_netmask; /* used to determine subnet */ + struct ifnet *ifa_ifp; /* back-pointer to interface */ + struct ifaddr *ifa_next; /* next address for interface */ + void (*ifa_rtrequest)(); /* check or clean routes (+ or -)'d */ + u_short ifa_flags; /* mostly rt_flags for cloning */ + short ifa_refcnt; /* count of extra references; see IFAFREE */ + int ifa_metric; /* cost of going out this interface */ +#ifdef notdef + struct rtentry *ifa_rt; /* XXXX for ROUTETOIF ????? */ +#endif +}; +#define IFA_ROUTE RTF_UP /* route installed */ + +/* + * Message format for use in obtaining information about interfaces + * from getkerninfo and the routing socket + */ +struct if_msghdr { + u_short ifm_msglen; /* to skip over non-understood messages */ + u_char ifm_version; /* future binary compatibility */ + u_char ifm_type; /* message type */ + int ifm_addrs; /* like rtm_addrs */ + int ifm_flags; /* value of if_flags */ + u_short ifm_index; /* index for associated ifp */ + struct if_data ifm_data;/* statistics and other data about if */ +}; + +/* + * Message format for use in obtaining information about interface addresses + * from getkerninfo and the routing socket + */ +struct ifa_msghdr { + u_short ifam_msglen; /* to skip over non-understood messages */ + u_char ifam_version; /* future binary compatibility */ + u_char ifam_type; /* message type */ + int ifam_addrs; /* like rtm_addrs */ + int ifam_flags; /* value of ifa_flags */ + u_short ifam_index; /* index for associated ifp */ + int ifam_metric; /* value of ifa_metric */ +}; + +/* + * Interface request structure used for socket + * ioctl's. All interface ioctl's must have parameter + * definitions which begin with ifr_name. The + * remainder may be interface specific. 
+ */ +struct ifreq { +#define IFNAMSIZ 16 + char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */ + union { + struct sockaddr ifru_addr; + struct sockaddr ifru_dstaddr; + struct sockaddr ifru_broadaddr; + short ifru_flags; + int ifru_metric; + caddr_t ifru_data; + } ifr_ifru; +#define ifr_addr ifr_ifru.ifru_addr /* address */ +#define ifr_dstaddr ifr_ifru.ifru_dstaddr /* other end of p-to-p link */ +#define ifr_broadaddr ifr_ifru.ifru_broadaddr /* broadcast address */ +#define ifr_flags ifr_ifru.ifru_flags /* flags */ +#define ifr_metric ifr_ifru.ifru_metric /* metric */ +#define ifr_data ifr_ifru.ifru_data /* for use by interface */ +}; + +struct ifaliasreq { + char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ + struct sockaddr ifra_addr; + struct sockaddr ifra_broadaddr; + struct sockaddr ifra_mask; +}; + +/* + * Structure used in SIOCGIFCONF request. + * Used to retrieve interface configuration + * for machine (useful for programs which + * must know all networks accessible). + */ +struct ifconf { + int ifc_len; /* size of associated buffer */ + union { + caddr_t ifcu_buf; + struct ifreq *ifcu_req; + } ifc_ifcu; +#define ifc_buf ifc_ifcu.ifcu_buf /* buffer address */ +#define ifc_req ifc_ifcu.ifcu_req /* array of structures returned */ +}; + +#include + +#ifdef KERNEL +#define IFAFREE(ifa) \ + if ((ifa)->ifa_refcnt <= 0) \ + ifafree(ifa); \ + else \ + (ifa)->ifa_refcnt--; + +struct ifnet *ifnet; + +void ether_ifattach __P((struct ifnet *)); +void ether_input __P((struct ifnet *, struct ether_header *, struct mbuf *)); +int ether_output __P((struct ifnet *, + struct mbuf *, struct sockaddr *, struct rtentry *)); +char *ether_sprintf __P((u_char *)); + +void if_attach __P((struct ifnet *)); +void if_down __P((struct ifnet *)); +void if_qflush __P((struct ifqueue *)); +void if_slowtimo __P((void *)); +void if_up __P((struct ifnet *)); +#ifdef vax +void ifubareset __P((int)); +#endif +int ifconf __P((int, caddr_t)); +void ifinit __P((void)); +int ifioctl __P((struct 
socket *, int, caddr_t, struct proc *)); +int ifpromisc __P((struct ifnet *, int)); +struct ifnet *ifunit __P((char *)); + +struct ifaddr *ifa_ifwithaddr __P((struct sockaddr *)); +struct ifaddr *ifa_ifwithaf __P((int)); +struct ifaddr *ifa_ifwithdstaddr __P((struct sockaddr *)); +struct ifaddr *ifa_ifwithnet __P((struct sockaddr *)); +struct ifaddr *ifa_ifwithroute __P((int, struct sockaddr *, + struct sockaddr *)); +struct ifaddr *ifaof_ifpforaddr __P((struct sockaddr *, struct ifnet *)); +void ifafree __P((struct ifaddr *)); +void link_rtrequest __P((int, struct rtentry *, struct sockaddr *)); + +int loioctl __P((struct ifnet *, int, caddr_t)); +void loopattach __P((int)); +int looutput __P((struct ifnet *, + struct mbuf *, struct sockaddr *, struct rtentry *)); +void lortrequest __P((int, struct rtentry *, struct sockaddr *)); +#endif diff --git a/sys/net/if_arp.h b/sys/net/if_arp.h new file mode 100644 index 00000000000..84581cbb98d --- /dev/null +++ b/sys/net/if_arp.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)if_arp.h	8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * Address Resolution Protocol.
+ *
+ * See RFC 826 for protocol description.  ARP packets are variable
+ * in size; the arphdr structure defines the fixed-length portion.
+ * Protocol type values are the same as those for 10 Mb/s Ethernet.
+ * The fixed-length portion is followed by the variable-sized fields
+ * ar_sha, ar_spa, ar_tha and ar_tpa in that order, according to the
+ * lengths specified.  Field names used correspond to RFC 826.
+ */
+struct	arphdr {
+	u_short	ar_hrd;		/* format of hardware address */
+#define ARPHRD_ETHER 	1	/* ethernet hardware format */
+#define ARPHRD_FRELAY 	15	/* frame relay hardware format */
+	u_short	ar_pro;		/* format of protocol address */
+	u_char	ar_hln;		/* length of hardware address */
+	u_char	ar_pln;		/* length of protocol address */
+	u_short	ar_op;		/* operation code, one of the ARPOP_* values: */
+#define	ARPOP_REQUEST	1	/* request to resolve address */
+#define	ARPOP_REPLY	2	/* response to previous request */
+#define	ARPOP_REVREQUEST 3	/* request protocol address given hardware */
+#define	ARPOP_REVREPLY	4	/* response giving protocol address */
+#define ARPOP_INVREQUEST 8 	/* request to identify peer */
+#define ARPOP_INVREPLY	9	/* response identifying peer */
+/*
+ * The remaining fields are variable in size, according to ar_hln and
+ * ar_pln above; they are shown for documentation only and never compiled.
+ */
+#ifdef COMMENT_ONLY
+	u_char	ar_sha[];	/* sender hardware address */
+	u_char	ar_spa[];	/* sender protocol address */
+	u_char	ar_tha[];	/* target hardware address */
+	u_char	ar_tpa[];	/* target protocol address */
+#endif
+};
+
+/*
+ * ARP ioctl request: a protocol address, a hardware address and ATF_* flags
+ */
+struct arpreq {
+	struct	sockaddr arp_pa;		/* protocol address */
+	struct	sockaddr arp_ha;		/* hardware address */
+	int	arp_flags;			/* flags (ATF_* bits below) */
+};
+/*  arp_flags and at_flags field values (bit mask) */
+#define	ATF_INUSE	0x01	/* entry in use */
+#define ATF_COM		0x02	/* completed entry (enaddr valid) */
+#define	ATF_PERM	0x04	/* permanent entry */
+#define	ATF_PUBL	0x08	/* publish entry (respond for other host) */
+#define	ATF_USETRAILERS	0x10	/* has requested trailers */
diff --git a/sys/net/if_dl.h b/sys/net/if_dl.h
new file mode 100644
index 00000000000..3e53449085a
--- /dev/null
+++ b/sys/net/if_dl.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 1990, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)if_dl.h	8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * A Link-Level Sockaddr may specify the interface in one of two
+ * ways: either by means of a system-provided index number (computed
+ * anew and possibly differently on every reboot), or by a human-readable
+ * string such as "il0" (for managerial convenience).
+ *
+ * Census taking actions, such as something akin to SIOCGIFCONF, would return
+ * both the index and the human name.
+ *
+ * High volume transactions (such as giving a link-level ``from'' address
+ * in a recvfrom or recvmsg call) may be likely only to provide the indexed
+ * form, (which requires fewer copy operations and less space).
+ *
+ * The form and interpretation  of the link-level address is purely a matter
+ * of convention between the device driver and its consumers; however, it is
+ * expected that all drivers for an interface of a given if_type will agree.
+ */
+
+/*
+ * Structure of a Link-Level sockaddr:
+ */
+struct sockaddr_dl {
+	u_char	sdl_len;	/* Total length of sockaddr */
+	u_char	sdl_family;	/* AF_LINK (the value the kernel code tests for) */
+	u_short	sdl_index;	/* if != 0, system given index for interface */
+	u_char	sdl_type;	/* interface type */
+	u_char	sdl_nlen;	/* interface name length, no trailing 0 reqd. */
+	u_char	sdl_alen;	/* link level address length */
+	u_char	sdl_slen;	/* link layer selector length */
+	char	sdl_data[12];	/* minimum work area, can be larger;
+				   contains both if name and ll address */
+};
+
+#define LLADDR(s) ((caddr_t)((s)->sdl_data + (s)->sdl_nlen))	/* start of the link-level address: skips the sdl_nlen name bytes */
+
+#ifndef KERNEL
+
+#include
+
+__BEGIN_DECLS
+void	link_addr __P((const char *, struct sockaddr_dl *));
+char	*link_ntoa __P((const struct sockaddr_dl *));
+__END_DECLS
+
+#endif /* !KERNEL */
diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c
new file mode 100644
index 00000000000..d4d6680fdb0
--- /dev/null
+++ b/sys/net/if_ethersubr.c
@@ -0,0 +1,675 @@
+/*
+ * Copyright (c) 1982, 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ *	@(#)if_ethersubr.c	8.1 (Berkeley) 6/10/93
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef INET
+#include
+#include
+#endif
+#include
+
+#ifdef NS
+#include
+#include
+#endif
+
+#ifdef ISO
+#include
+#include
+#include
+#include
+#endif
+
+#ifdef LLC
+#include
+#include
+#endif
+
+#if defined(LLC) && defined(CCITT)
+extern struct ifqueue pkintrq;
+#endif
+
+u_char	etherbroadcastaddr[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+extern	struct ifnet loif;
+#define senderr(e) { error = (e); goto bad;}	/* record errno and jump to the mbuf-freeing exit */
+
+/*
+ * Ethernet output routine: pick the destination address for dst's family,
+ * prepend an Ethernet header and queue the packet on ifp->if_snd.
+ * NOTE(review): no trailer encapsulation is performed below; `off' is set
+ * for AF_INET but never read.  Failures via senderr() free the mbuf chain.
+ * Assumes that ifp is actually pointer to arpcom structure.
+ */
+int
+ether_output(ifp, m0, dst, rt0)
+	register struct ifnet *ifp;
+	struct mbuf *m0;
+	struct sockaddr *dst;
+	struct rtentry *rt0;
+{
+	short type;
+	int s, error = 0;
+ 	u_char edst[6];
+	register struct mbuf *m = m0;
+	register struct rtentry *rt;
+	struct mbuf *mcopy = (struct mbuf *)0;
+	register struct ether_header *eh;
+	int off, len = m->m_pkthdr.len;
+	struct arpcom *ac = (struct arpcom *)ifp;
+
+	if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING))
+		senderr(ENETDOWN);
+	ifp->if_lastchange = time;
+	if (rt = rt0) {
+		if ((rt->rt_flags & RTF_UP) == 0) {
+			if (rt0 = rt = rtalloc1(dst, 1))
+				rt->rt_refcnt--;
+			else
+				senderr(EHOSTUNREACH);
+		}
+		if (rt->rt_flags & RTF_GATEWAY) {
+			if (rt->rt_gwroute == 0)
+				goto lookup;
+			if (((rt = rt->rt_gwroute)->rt_flags & RTF_UP) == 0) {
+				rtfree(rt); rt = rt0;
+			lookup: rt->rt_gwroute = rtalloc1(rt->rt_gateway, 1);
+				if ((rt = rt->rt_gwroute) == 0)
+					senderr(EHOSTUNREACH);
+			}
+		}
+		if (rt->rt_flags & RTF_REJECT)
+			if (rt->rt_rmx.rmx_expire == 0 ||
+			    time.tv_sec < rt->rt_rmx.rmx_expire)
+				senderr(rt == rt0 ? EHOSTDOWN : EHOSTUNREACH);
+	}
+	switch (dst->sa_family) {
+
+#ifdef INET
+	case AF_INET:
+		if (!arpresolve(ac, rt, m, dst, edst))
+			return (0);	/* if not yet resolved */
+		/* If broadcasting on a simplex interface, loopback a copy */
+		if ((m->m_flags & M_BCAST) && (ifp->if_flags & IFF_SIMPLEX))
+			mcopy = m_copy(m, 0, (int)M_COPYALL);
+		off = m->m_pkthdr.len - m->m_len;
+		type = ETHERTYPE_IP;
+		break;
+#endif
+#ifdef NS
+	case AF_NS:
+ 		type = ETHERTYPE_NS;
+ 		bcopy((caddr_t)&(((struct sockaddr_ns *)dst)->sns_addr.x_host),
+		    (caddr_t)edst, sizeof (edst));
+		if (!bcmp((caddr_t)edst, (caddr_t)&ns_thishost, sizeof(edst)))
+			return (looutput(ifp, m, dst, rt));
+		/* If broadcasting on a simplex interface, loopback a copy */
+		if ((m->m_flags & M_BCAST) && (ifp->if_flags & IFF_SIMPLEX))
+			mcopy = m_copy(m, 0, (int)M_COPYALL);
+		break;
+#endif
+#ifdef	ISO
+	case AF_ISO: {
+		int	snpalen;
+		struct	llc *l;
+		register struct sockaddr_dl *sdl;
+
+		if (rt && (sdl = (struct sockaddr_dl *)rt->rt_gateway) &&
+		    sdl->sdl_family == AF_LINK && sdl->sdl_alen > 0) {
+			bcopy(LLADDR(sdl), (caddr_t)edst, sizeof(edst));
+		} else if (error =
+			    iso_snparesolve(ifp, (struct sockaddr_iso *)dst,
+					    (char *)edst, &snpalen))
+			goto bad; /* Not Resolved */
+		/* If broadcasting on a simplex interface, loopback a copy */
+		if (*edst & 1)	/* low bit of first address byte set: group address */
+			m->m_flags |= (M_BCAST|M_MCAST);
+		if ((m->m_flags & M_BCAST) && (ifp->if_flags & IFF_SIMPLEX) &&
+		    (mcopy = m_copy(m, 0, (int)M_COPYALL))) {
+			M_PREPEND(mcopy, sizeof (*eh), M_DONTWAIT);
+			if (mcopy) {
+				eh = mtod(mcopy, struct ether_header *);
+				bcopy((caddr_t)edst,
+				      (caddr_t)eh->ether_dhost, sizeof (edst));
+				bcopy((caddr_t)ac->ac_enaddr,
+				      (caddr_t)eh->ether_shost, sizeof (edst));
+			}
+		}
+		M_PREPEND(m, 3, M_DONTWAIT);	/* room for the 3-byte LLC UI header filled in below */
+		if (m == NULL)
+			return (0);
+		type = m->m_pkthdr.len;	/* the type field carries the frame length here */
+		l = mtod(m, struct llc *);
+		l->llc_dsap = l->llc_ssap = LLC_ISO_LSAP;
+		l->llc_control = LLC_UI;
+		len += 3;
+		IFDEBUG(D_ETHER)
+			int i;
+			printf("unoutput: sending pkt to: ");
+			for (i=0; i<6; i++)
+				printf("%x ", edst[i] & 0xff);
+			printf("\n");
+		ENDDEBUG
+		} break;
+#endif /* ISO */
+#ifdef	LLC
+/*	case AF_NSAP: */
+	case AF_CCITT: {
+		register struct sockaddr_dl *sdl =
+			(struct sockaddr_dl *) rt -> rt_gateway;	/* NOTE(review): rt is not checked for NULL here */
+
+		if (sdl && sdl->sdl_family == AF_LINK
+		    && sdl->sdl_alen > 0) {
+			bcopy(LLADDR(sdl), (char *)edst,
+				sizeof(edst));
+		} else goto bad; /* Not a link interface ? Funny ... */
+		if ((ifp->if_flags & IFF_SIMPLEX) && (*edst & 1) &&
+		    (mcopy = m_copy(m, 0, (int)M_COPYALL))) {
+			M_PREPEND(mcopy, sizeof (*eh), M_DONTWAIT);
+			if (mcopy) {
+				eh = mtod(mcopy, struct ether_header *);
+				bcopy((caddr_t)edst,
+				      (caddr_t)eh->ether_dhost, sizeof (edst));
+				bcopy((caddr_t)ac->ac_enaddr,
+				      (caddr_t)eh->ether_shost, sizeof (edst));
+			}
+		}
+		type = m->m_pkthdr.len;
+#ifdef LLC_DEBUG
+		{
+			int i;
+			register struct llc *l = mtod(m, struct llc *);
+
+			printf("ether_output: sending LLC2 pkt to: ");
+			for (i=0; i<6; i++)
+				printf("%x ", edst[i] & 0xff);
+			printf(" len 0x%x dsap 0x%x ssap 0x%x control 0x%x\n",
+			       type & 0xff, l->llc_dsap & 0xff, l->llc_ssap &0xff,
+			       l->llc_control & 0xff);
+
+		}
+#endif /* LLC_DEBUG */
+		} break;
+#endif /* LLC */
+
+	case AF_UNSPEC:	/* caller supplies a ready-made ether_header in dst->sa_data */
+		eh = (struct ether_header *)dst->sa_data;
+ 		bcopy((caddr_t)eh->ether_dhost, (caddr_t)edst, sizeof (edst));
+		type = eh->ether_type;
+		break;
+
+	default:
+		printf("%s%d: can't handle af%d\n", ifp->if_name, ifp->if_unit,
+			dst->sa_family);
+		senderr(EAFNOSUPPORT);
+	}
+
+
+	if (mcopy)
+		(void) looutput(ifp, mcopy, dst, rt);
+	/*
+	 * Add local net header.  If no space in first mbuf,
+	 * allocate another.
+	 */
+	M_PREPEND(m, sizeof (struct ether_header), M_DONTWAIT);
+	if (m == 0)
+		senderr(ENOBUFS);
+	eh = mtod(m, struct ether_header *);
+	type = htons((u_short)type);
+	bcopy((caddr_t)&type,(caddr_t)&eh->ether_type,
+		sizeof(eh->ether_type));
+ 	bcopy((caddr_t)edst, (caddr_t)eh->ether_dhost, sizeof (edst));
+ 	bcopy((caddr_t)ac->ac_enaddr, (caddr_t)eh->ether_shost,
+	    sizeof(eh->ether_shost));
+	s = splimp();
+	/*
+	 * Queue message on interface, and start output if interface
+	 * not yet active.
+	 */
+	if (IF_QFULL(&ifp->if_snd)) {
+		IF_DROP(&ifp->if_snd);
+		splx(s);
+		senderr(ENOBUFS);
+	}
+	IF_ENQUEUE(&ifp->if_snd, m);
+	if ((ifp->if_flags & IFF_OACTIVE) == 0)
+		(*ifp->if_start)(ifp);
+	splx(s);
+	ifp->if_obytes += len + sizeof (struct ether_header);
+	if (m->m_flags & M_MCAST)	/* NOTE(review): m is read after it was enqueued and if_start may have run -- confirm it is still valid */
+		ifp->if_omcasts++;
+	return (error);
+
+bad:
+	if (m)
+		m_freem(m);
+	return (error);
+}
+
+/*
+ * Process a received Ethernet packet;
+ * the packet is in the mbuf chain m without
+ * the ether header, which is provided separately.
+ */
+void
+ether_input(ifp, eh, m)
+	struct ifnet *ifp;
+	register struct ether_header *eh;
+	struct mbuf *m;
+{
+	register struct ifqueue *inq;
+	register struct llc *l;
+	struct arpcom *ac = (struct arpcom *)ifp;
+	int s;
+
+	if ((ifp->if_flags & IFF_UP) == 0) {
+		m_freem(m);
+		return;
+	}
+	ifp->if_lastchange = time;
+	ifp->if_ibytes += m->m_pkthdr.len + sizeof (*eh);
+	if (bcmp((caddr_t)etherbroadcastaddr, (caddr_t)eh->ether_dhost,
+	    sizeof(etherbroadcastaddr)) == 0)
+		m->m_flags |= M_BCAST;
+	else if (eh->ether_dhost[0] & 1)
+		m->m_flags |= M_MCAST;
+	if (m->m_flags & (M_BCAST|M_MCAST))
+		ifp->if_imcasts++;
+
+	switch (eh->ether_type) {	/* NOTE(review): compared directly with ETHERTYPE_*, so presumably already in host byte order -- confirm against callers */
+#ifdef INET
+	case ETHERTYPE_IP:
+		schednetisr(NETISR_IP);
+		inq = &ipintrq;
+		break;
+
+	case ETHERTYPE_ARP:
+		schednetisr(NETISR_ARP);
+		inq = &arpintrq;
+		break;
+#endif
+#ifdef NS
+	case ETHERTYPE_NS:
+		schednetisr(NETISR_NS);
+		inq = &nsintrq;
+		break;
+
+#endif
+	default:
+#if defined (ISO) || defined (LLC)
+		if (eh->ether_type > ETHERMTU)	/* too large to be a frame length: drop */
+			goto dropanyway;
+		l = mtod(m, struct llc *);
+		switch (l->llc_dsap) {
+#ifdef	ISO
+		case LLC_ISO_LSAP:
+			switch (l->llc_control) {
+			case LLC_UI:
+				/* LLC_UI_P forbidden in class 1 service */
+				if ((l->llc_dsap == LLC_ISO_LSAP) &&
+				    (l->llc_ssap == LLC_ISO_LSAP)) {
+					/* LSAP for ISO */
+					if (m->m_pkthdr.len > eh->ether_type)
+						m_adj(m, eh->ether_type - m->m_pkthdr.len);
+					m->m_data += 3;		/* XXX skip the 3-byte LLC header */
+					m->m_len -= 3;		/* XXX */
+					m->m_pkthdr.len -= 3;	/* XXX */
+					M_PREPEND(m, sizeof *eh, M_DONTWAIT);
+					if (m == 0)
+						return;
+					*mtod(m, struct ether_header *) = *eh;
+					IFDEBUG(D_ETHER)
+						printf("clnp packet");
+					ENDDEBUG
+					schednetisr(NETISR_ISO);
+					inq = &clnlintrq;
+					break;
+				}
+				goto dropanyway;
+
+			case LLC_XID:
+			case LLC_XID_P:
+				if(m->m_len < 6)
+					goto dropanyway;
+				l->llc_window = 0;
+				l->llc_fid = 9;
+				l->llc_class = 1;
+				l->llc_dsap = l->llc_ssap = 0;
+				/* Fall through to the TEST case, which sends the frame back */
+			case LLC_TEST:
+			case LLC_TEST_P:
+			{
+				struct sockaddr sa;
+				register struct ether_header *eh2;
+				int i;
+				u_char c = l->llc_dsap;
+
+				l->llc_dsap = l->llc_ssap;
+				l->llc_ssap = c;
+				if (m->m_flags & (M_BCAST | M_MCAST))
+					bcopy((caddr_t)ac->ac_enaddr,
+					      (caddr_t)eh->ether_dhost, 6);
+				sa.sa_family = AF_UNSPEC;
+				sa.sa_len = sizeof(sa);
+				eh2 = (struct ether_header *)sa.sa_data;
+				for (i = 0; i < 6; i++) {	/* swap source and destination, mirroring them into sa */
+					eh2->ether_shost[i] = c = eh->ether_dhost[i];
+					eh2->ether_dhost[i] =
+						eh->ether_dhost[i] = eh->ether_shost[i];
+					eh->ether_shost[i] = c;
+				}
+				ifp->if_output(ifp, m, &sa, NULL);
+				return;
+			}
+			default:
+				m_freem(m);
+				return;
+			}
+			break;
+#endif /* ISO */
+#ifdef LLC
+		case LLC_X25_LSAP:
+		{
+			if (m->m_pkthdr.len > eh->ether_type)
+				m_adj(m, eh->ether_type - m->m_pkthdr.len);
+			M_PREPEND(m, sizeof(struct sdl_hdr) , M_DONTWAIT);
+			if (m == 0)
+				return;
+			if ( !sdl_sethdrif(ifp, eh->ether_shost, LLC_X25_LSAP,
+					    eh->ether_dhost, LLC_X25_LSAP, 6,
+					    mtod(m, struct sdl_hdr *)))
+				panic("ETHER cons addr failure");
+			mtod(m, struct sdl_hdr *)->sdlhdr_len = eh->ether_type;
+#ifdef LLC_DEBUG
+				printf("llc packet\n");
+#endif /* LLC_DEBUG */
+			schednetisr(NETISR_CCITT);
+			inq = &llcintrq;
+			break;
+		}
+#endif /* LLC */
+		dropanyway:
+		default:
+			m_freem(m);
+			return;
+		}
+#else /* ISO || LLC */
+	    m_freem(m);
+	    return;
+#endif /* ISO || LLC */
+	}
+
+	s = splimp();
+	if (IF_QFULL(inq)) {
+		IF_DROP(inq);
+		m_freem(m);
+	} else
+		IF_ENQUEUE(inq, m);
+	splx(s);
+}
+
+/*
+ * Convert Ethernet address to printable (loggable) representation.
+ */
+static char digits[] = "0123456789abcdef";
+char *
+ether_sprintf(ap)
+	register u_char *ap;
+{
+	register i;
+	static char etherbuf[18];	/* 6 x "hh:" with the last ':' replaced by NUL; static, so overwritten by each call */
+	register char *cp = etherbuf;
+
+	for (i = 0; i < 6; i++) {
+		*cp++ = digits[*ap >> 4];
+		*cp++ = digits[*ap++ & 0xf];
+		*cp++ = ':';
+	}
+	*--cp = 0;
+	return (etherbuf);
+}
+
+/*
+ * Perform common duties while attaching to interface list: set the Ethernet constants in ifp and copy ac_enaddr into its AF_LINK address
+ */
+void
+ether_ifattach(ifp)
+	register struct ifnet *ifp;
+{
+	register struct ifaddr *ifa;
+	register struct sockaddr_dl *sdl;
+
+	ifp->if_type = IFT_ETHER;
+	ifp->if_addrlen = 6;
+	ifp->if_hdrlen = 14;
+	ifp->if_mtu = ETHERMTU;
+	for (ifa = ifp->if_addrlist; ifa; ifa = ifa->ifa_next)
+		if ((sdl = (struct sockaddr_dl *)ifa->ifa_addr) &&
+		    sdl->sdl_family == AF_LINK) {
+			sdl->sdl_type = IFT_ETHER;
+			sdl->sdl_alen = ifp->if_addrlen;
+			bcopy((caddr_t)((struct arpcom *)ifp)->ac_enaddr,
+			      LLADDR(sdl), ifp->if_addrlen);
+			break;	/* only the first AF_LINK address is filled in */
+		}
+}
+
+u_char	ether_ipmulticast_min[6] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 };
+u_char	ether_ipmulticast_max[6] = { 0x01, 0x00, 0x5e, 0x7f, 0xff, 0xff };
+/*
+ * Add an Ethernet multicast address or range of addresses to the list for a
+ * given interface.  Returns 0 if already listed (reference count bumped), ENETRESET if a record was added, else an errno.
+ */
+int
+ether_addmulti(ifr, ac)
+	struct ifreq *ifr;
+	register struct arpcom *ac;
+{
+	register struct ether_multi *enm;
+	struct sockaddr_in *sin;
+	u_char addrlo[6];
+	u_char addrhi[6];
+	int s = splimp();	/* every return path below calls splx(s) */
+
+	switch (ifr->ifr_addr.sa_family) {
+
+	case AF_UNSPEC:
+		bcopy(ifr->ifr_addr.sa_data, addrlo, 6);
+		bcopy(addrlo, addrhi, 6);
+		break;
+
+#ifdef INET
+	case AF_INET:
+		sin = (struct sockaddr_in *)&(ifr->ifr_addr);
+		if (sin->sin_addr.s_addr == INADDR_ANY) {
+			/*
+			 * An IP address of INADDR_ANY means listen to all
+			 * of the Ethernet multicast addresses used for IP.
+			 * (This is for the sake of IP multicast routers.)
+			 */
+			bcopy(ether_ipmulticast_min, addrlo, 6);
+			bcopy(ether_ipmulticast_max, addrhi, 6);
+		}
+		else {
+			ETHER_MAP_IP_MULTICAST(&sin->sin_addr, addrlo);
+			bcopy(addrlo, addrhi, 6);
+		}
+		break;
+#endif
+
+	default:
+		splx(s);
+		return (EAFNOSUPPORT);
+	}
+
+	/*
+	 * Verify that we have valid Ethernet multicast addresses.
+	 */
+	if ((addrlo[0] & 0x01) != 1 || (addrhi[0] & 0x01) != 1) {
+		splx(s);
+		return (EINVAL);
+	}
+	/*
+	 * See if the address range is already in the list.
+	 */
+	ETHER_LOOKUP_MULTI(addrlo, addrhi, ac, enm);
+	if (enm != NULL) {
+		/*
+		 * Found it; just increment the reference count.
+		 */
+		++enm->enm_refcount;
+		splx(s);
+		return (0);
+	}
+	/*
+	 * New address or range; malloc a new multicast record
+	 * and link it into the interface's multicast list.
+	 */
+	enm = (struct ether_multi *)malloc(sizeof(*enm), M_IFMADDR, M_NOWAIT);
+	if (enm == NULL) {
+		splx(s);
+		return (ENOBUFS);
+	}
+	bcopy(addrlo, enm->enm_addrlo, 6);
+	bcopy(addrhi, enm->enm_addrhi, 6);
+	enm->enm_ac = ac;
+	enm->enm_refcount = 1;
+	enm->enm_next = ac->ac_multiaddrs;
+	ac->ac_multiaddrs = enm;
+	ac->ac_multicnt++;
+	splx(s);
+	/*
+	 * Return ENETRESET to inform the driver that the list has changed
+	 * and its reception filter should be adjusted accordingly.
+	 */
+	return (ENETRESET);
+}
+
+/*
+ * Delete a multicast address record: drop one reference; free the record and return ENETRESET at zero, 0 if still referenced, ENXIO if absent.
+ */
+int
+ether_delmulti(ifr, ac)
+	struct ifreq *ifr;
+	register struct arpcom *ac;
+{
+	register struct ether_multi *enm;
+	register struct ether_multi **p;
+	struct sockaddr_in *sin;
+	u_char addrlo[6];
+	u_char addrhi[6];
+	int s = splimp();	/* every return path below calls splx(s) */
+
+	switch (ifr->ifr_addr.sa_family) {
+
+	case AF_UNSPEC:
+		bcopy(ifr->ifr_addr.sa_data, addrlo, 6);
+		bcopy(addrlo, addrhi, 6);
+		break;
+
+#ifdef INET
+	case AF_INET:
+		sin = (struct sockaddr_in *)&(ifr->ifr_addr);
+		if (sin->sin_addr.s_addr == INADDR_ANY) {
+			/*
+			 * An IP address of INADDR_ANY means stop listening
+			 * to the range of Ethernet multicast addresses used
+			 * for IP.
+			 */
+			bcopy(ether_ipmulticast_min, addrlo, 6);
+			bcopy(ether_ipmulticast_max, addrhi, 6);
+		}
+		else {
+			ETHER_MAP_IP_MULTICAST(&sin->sin_addr, addrlo);
+			bcopy(addrlo, addrhi, 6);
+		}
+		break;
+#endif
+
+	default:
+		splx(s);
+		return (EAFNOSUPPORT);
+	}
+
+	/*
+	 * Look up the address in our list.
+	 */
+	ETHER_LOOKUP_MULTI(addrlo, addrhi, ac, enm);
+	if (enm == NULL) {
+		splx(s);
+		return (ENXIO);
+	}
+	if (--enm->enm_refcount != 0) {
+		/*
+		 * Still some claims to this record.
+		 */
+		splx(s);
+		return (0);
+	}
+	/*
+	 * No remaining claims to this record; unlink and free it.
+	 */
+	for (p = &enm->enm_ac->ac_multiaddrs;
+	     *p != enm;
+	     p = &(*p)->enm_next)
+		continue;
+	*p = (*p)->enm_next;
+	free(enm, M_IFMADDR);
+	ac->ac_multicnt--;
+	splx(s);
+	/*
+	 * Return ENETRESET to inform the driver that the list has changed
+	 * and its reception filter should be adjusted accordingly.
+	 */
+	return (ENETRESET);
+}
diff --git a/sys/net/if_llc.h b/sys/net/if_llc.h
new file mode 100644
index 00000000000..90dcb07991d
--- /dev/null
+++ b/sys/net/if_llc.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 1988, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4.
Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *      @(#)if_llc.h	8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * IEEE 802.2 Link Level Control headers, for use in conjunction with
+ * 802.{3,4,5} media access control methods.
+ *
+ * Headers here do not use bit fields due to shortcomings in many
+ * compilers.
+ */ + +struct llc { + u_char llc_dsap; + u_char llc_ssap; + union { + struct { + u_char control; + u_char format_id; + u_char class; + u_char window_x2; + } type_u; + struct { + u_char num_snd_x2; + u_char num_rcv_x2; + } type_i; + struct { + u_char control; + u_char num_rcv_x2; + } type_s; + struct { + u_char control; + struct frmrinfo { + u_char rej_pdu_0; + u_char rej_pdu_1; + u_char frmr_control; + u_char frmr_control_ext; + u_char frmr_cause; + } frmrinfo; + } type_frmr; + struct { + u_char control; + u_char org_code[3]; + u_short ether_type; + } type_snap; + struct { + u_char control; + u_char control_ext; + } type_raw; + } llc_un; +}; +#define llc_control llc_un.type_u.control +#define llc_control_ext llc_un.type_raw.control_ext +#define llc_fid llc_un.type_u.format_id +#define llc_class llc_un.type_u.class +#define llc_window llc_un.type_u.window_x2 +#define llc_frmrinfo llc_un.type_frmr.frmrinfo +#define llc_frmr_pdu0 llc_un.type_frmr.frmrinfo.rej_pdu0 +#define llc_frmr_pdu1 llc_un.type_frmr.frmrinfo.rej_pdu1 +#define llc_frmr_control llc_un.type_frmr.frmrinfo.frmr_control +#define llc_frmr_control_ext llc_un.type_frmr.frmrinfo.frmr_control_ext +#define llc_frmr_cause llc_un.type_frmr.frmrinfo.frmr_control_ext + +/* + * Don't use sizeof(struct llc_un) for LLC header sizes + */ +#define LLC_ISFRAMELEN 4 +#define LLC_UFRAMELEN 3 +#define LLC_FRMRLEN 7 + +/* + * Unnumbered LLC format commands + */ +#define LLC_UI 0x3 +#define LLC_UI_P 0x13 +#define LLC_DISC 0x43 +#define LLC_DISC_P 0x53 +#define LLC_UA 0x63 +#define LLC_UA_P 0x73 +#define LLC_TEST 0xe3 +#define LLC_TEST_P 0xf3 +#define LLC_FRMR 0x87 +#define LLC_FRMR_P 0x97 +#define LLC_DM 0x0f +#define LLC_DM_P 0x1f +#define LLC_XID 0xaf +#define LLC_XID_P 0xbf +#define LLC_SABME 0x6f +#define LLC_SABME_P 0x7f + +/* + * Supervisory LLC commands + */ +#define LLC_RR 0x01 +#define LLC_RNR 0x05 +#define LLC_REJ 0x09 + +/* + * Info format - dummy only + */ +#define LLC_INFO 0x00 + +/* + * ISO PDTR 10178 
contains among others + */ +#define LLC_X25_LSAP 0x7e +#define LLC_SNAP_LSAP 0xaa +#define LLC_ISO_LSAP 0xfe + + + + + + diff --git a/sys/net/if_loop.c b/sys/net/if_loop.c new file mode 100644 index 00000000000..f09295e34be --- /dev/null +++ b/sys/net/if_loop.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_loop.c 8.1 (Berkeley) 6/10/93 + */ + +/* + * Loopback interface driver for protocol testing and timing. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef INET +#include +#include +#include +#include +#endif + +#ifdef NS +#include +#include +#endif + +#ifdef ISO +#include +#include +#endif + +#include "bpfilter.h" + +#define LOMTU (1024+512) + +struct ifnet loif; + +/* ARGSUSED */ +void +loopattach(n) + int n; +{ + register struct ifnet *ifp = &loif; + +#ifdef lint + n = n; /* Highlander: there can only be one... */ +#endif + ifp->if_name = "lo"; + ifp->if_mtu = LOMTU; + ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST; + ifp->if_ioctl = loioctl; + ifp->if_output = looutput; + ifp->if_type = IFT_LOOP; + ifp->if_hdrlen = 0; + ifp->if_addrlen = 0; + if_attach(ifp); +#if NBPFILTER > 0 + bpfattach(&ifp->if_bpf, ifp, DLT_NULL, sizeof(u_int)); +#endif +} + +int +looutput(ifp, m, dst, rt) + struct ifnet *ifp; + register struct mbuf *m; + struct sockaddr *dst; + register struct rtentry *rt; +{ + int s, isr; + register struct ifqueue *ifq = 0; + + if ((m->m_flags & M_PKTHDR) == 0) + panic("looutput no HDR"); + ifp->if_lastchange = time; +#if NBPFILTER > 0 + if (loif.if_bpf) { + /* + * We need to prepend the address family as + * a four byte field. Cons up a dummy header + * to pacify bpf. 
This is safe because bpf + * will only read from the mbuf (i.e., it won't + * try to free it or keep a pointer to it). + */ + struct mbuf m0; + u_int af = dst->sa_family; + + m0.m_next = m; + m0.m_len = 4; + m0.m_data = (char *)&af; + + bpf_mtap(loif.if_bpf, &m0); + } +#endif + m->m_pkthdr.rcvif = ifp; + + if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + m_freem(m); + return (rt->rt_flags & RTF_BLACKHOLE ? 0 : + rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); + } + ifp->if_opackets++; + ifp->if_obytes += m->m_pkthdr.len; + switch (dst->sa_family) { + +#ifdef INET + case AF_INET: + ifq = &ipintrq; + isr = NETISR_IP; + break; +#endif +#ifdef NS + case AF_NS: + ifq = &nsintrq; + isr = NETISR_NS; + break; +#endif +#ifdef ISO + case AF_ISO: + ifq = &clnlintrq; + isr = NETISR_ISO; + break; +#endif + default: + printf("lo%d: can't handle af%d\n", ifp->if_unit, + dst->sa_family); + m_freem(m); + return (EAFNOSUPPORT); + } + s = splimp(); + if (IF_QFULL(ifq)) { + IF_DROP(ifq); + m_freem(m); + splx(s); + return (ENOBUFS); + } + IF_ENQUEUE(ifq, m); + schednetisr(isr); + ifp->if_ipackets++; + ifp->if_ibytes += m->m_pkthdr.len; + splx(s); + return (0); +} + +/* ARGSUSED */ +void +lortrequest(cmd, rt, sa) + int cmd; + struct rtentry *rt; + struct sockaddr *sa; +{ + + if (rt) + rt->rt_rmx.rmx_mtu = LOMTU; +} + +/* + * Process an ioctl request. + */ +/* ARGSUSED */ +int +loioctl(ifp, cmd, data) + register struct ifnet *ifp; + int cmd; + caddr_t data; +{ + register struct ifaddr *ifa; + register struct ifreq *ifr; + register int error = 0; + + switch (cmd) { + + case SIOCSIFADDR: + ifp->if_flags |= IFF_UP; + ifa = (struct ifaddr *)data; + if (ifa != 0 && ifa->ifa_addr->sa_family == AF_ISO) + ifa->ifa_rtrequest = lortrequest; + /* + * Everything else is done at a higher level.
+ */ + break; + + case SIOCADDMULTI: + case SIOCDELMULTI: + ifr = (struct ifreq *)data; + if (ifr == 0) { + error = EAFNOSUPPORT; /* XXX */ + break; + } + switch (ifr->ifr_addr.sa_family) { + +#ifdef INET + case AF_INET: + break; +#endif + + default: + error = EAFNOSUPPORT; + break; + } + break; + + default: + error = EINVAL; + } + return (error); +} diff --git a/sys/net/if_sl.c b/sys/net/if_sl.c new file mode 100644 index 00000000000..56ce96f4b9d --- /dev/null +++ b/sys/net/if_sl.c @@ -0,0 +1,839 @@ +/* + * Copyright (c) 1987, 1989, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_sl.c 8.6 (Berkeley) 2/1/94 + */ + +/* + * Serial Line interface + * + * Rick Adams + * Center for Seismic Studies + * 1300 N 17th Street, Suite 1450 + * Arlington, Virginia 22209 + * (703)276-7900 + * rick@seismo.ARPA + * seismo!rick + * + * Pounded on heavily by Chris Torek (chris@mimsy.umd.edu, umcp-cs!chris). + * N.B.: this belongs in netinet, not net, the way it stands now. + * Should have a link-layer type designation, but wouldn't be + * backwards-compatible. + * + * Converted to 4.3BSD Beta by Chris Torek. + * Other changes made at Berkeley, based in part on code by Kirk Smith. + * W. Jolitz added slip abort. + * + * Hacked almost beyond recognition by Van Jacobson (van@helios.ee.lbl.gov). + * Added priority queuing for "interactive" traffic; hooks for TCP + * header compression; ICMP filtering (at 2400 baud, some cretin + * pinging you can use up all your bandwidth). Made low clist behavior + * more robust and slightly less likely to hang serial line. + * Sped up a bunch of things. + * + * Note that splimp() is used throughout to block both (tty) input + * interrupts and network activity; thus, splimp must be >= spltty. + */ + +#include "sl.h" +#if NSL > 0 + +#include "bpfilter.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#if INET +#include +#include +#include +#include +#else +Huh? 
Slip without inet? +#endif + +#include +#include +#include + +#if NBPFILTER > 0 +#include +#include +#endif + +/* + * SLMAX is a hard limit on input packet size. To simplify the code + * and improve performance, we require that packets fit in an mbuf + * cluster, and if we get a compressed packet, there's enough extra + * room to expand the header into a max length tcp/ip header (128 + * bytes). So, SLMAX can be at most + * MCLBYTES - 128 + * + * SLMTU is a hard limit on output packet size. To insure good + * interactive response, SLMTU wants to be the smallest size that + * amortizes the header cost. (Remember that even with + * type-of-service queuing, we have to wait for any in-progress + * packet to finish. I.e., we wait, on the average, 1/2 * mtu / + * cps, where cps is the line speed in characters per second. + * E.g., 533ms wait for a 1024 byte MTU on a 9600 baud line. The + * average compressed header size is 6-8 bytes so any MTU > 90 + * bytes will give us 90% of the line bandwidth. A 100ms wait is + * tolerable (500ms is not), so want an MTU around 296. (Since TCP + * will send 256 byte segments (to allow for 40 byte headers), the + * typical packet size on the wire will be around 260 bytes). In + * 4.3tahoe+ systems, we can set an MTU in a route so we do that & + * leave the interface MTU relatively high (so we don't IP fragment + * when acting as a gateway to someone using a stupid MTU). + * + * Similar considerations apply to SLIP_HIWAT: It's the amount of + * data that will be queued 'downstream' of us (i.e., in clists + * waiting to be picked up by the tty output interrupt). If we + * queue a lot of data downstream, it's immune to our t.o.s. queuing. + * E.g., if SLIP_HIWAT is 1024, the interactive traffic in mixed + * telnet/ftp will see a 1 sec wait, independent of the mtu (the + * wait is dependent on the ftp window size but that's typically + * 1k - 4k). 
So, we want SLIP_HIWAT just big enough to amortize + * the cost (in idle time on the wire) of the tty driver running + * off the end of its clists & having to call back slstart for a + * new packet. For a tty interface with any buffering at all, this + * cost will be zero. Even with a totally brain dead interface (like + * the one on a typical workstation), the cost will be <= 1 character + * time. So, setting SLIP_HIWAT to ~100 guarantees that we'll lose + * at most 1% while maintaining good interactive response. + */ +#if NBPFILTER > 0 +#define BUFOFFSET (128+sizeof(struct ifnet **)+SLIP_HDRLEN) +#else +#define BUFOFFSET (128+sizeof(struct ifnet **)) +#endif +#define SLMAX (MCLBYTES - BUFOFFSET) +#define SLBUFSIZE (SLMAX + BUFOFFSET) +#define SLMTU 296 +#define SLIP_HIWAT roundup(50,CBSIZE) +#define CLISTRESERVE 1024 /* Can't let clists get too low */ + +/* + * SLIP ABORT ESCAPE MECHANISM: + * (inspired by HAYES modem escape arrangement) + * 1sec escape 1sec escape 1sec escape { 1sec escape 1sec escape } + * within window time signals a "soft" exit from slip mode by remote end + * if the IFF_DEBUG flag is on. + */ +#define ABT_ESC '\033' /* can't be t_intr - distant host must know it*/ +#define ABT_IDLE 1 /* in seconds - idle before an escape */ +#define ABT_COUNT 3 /* count of escapes for abort */ +#define ABT_WINDOW (ABT_COUNT*2+2) /* in seconds - time to count */ + +struct sl_softc sl_softc[NSL]; + +#define FRAME_END 0xc0 /* Frame End */ +#define FRAME_ESCAPE 0xdb /* Frame Esc */ +#define TRANS_FRAME_END 0xdc /* transposed frame end */ +#define TRANS_FRAME_ESCAPE 0xdd /* transposed frame esc */ + +extern struct timeval time; + +static int slinit __P((struct sl_softc *)); +static struct mbuf *sl_btom __P((struct sl_softc *, int)); + +/* + * Called from boot code to establish sl interfaces. 
+ */ +void +slattach() +{ + register struct sl_softc *sc; + register int i = 0; + + for (sc = sl_softc; i < NSL; sc++) { + sc->sc_if.if_name = "sl"; + sc->sc_if.if_next = NULL; + sc->sc_if.if_unit = i++; + sc->sc_if.if_mtu = SLMTU; + sc->sc_if.if_flags = + IFF_POINTOPOINT | SC_AUTOCOMP | IFF_MULTICAST; + sc->sc_if.if_type = IFT_SLIP; + sc->sc_if.if_ioctl = slioctl; + sc->sc_if.if_output = sloutput; + sc->sc_if.if_snd.ifq_maxlen = 50; + sc->sc_fastq.ifq_maxlen = 32; + if_attach(&sc->sc_if); +#if NBPFILTER > 0 + bpfattach(&sc->sc_bpf, &sc->sc_if, DLT_SLIP, SLIP_HDRLEN); +#endif + } +} + +static int +slinit(sc) + register struct sl_softc *sc; +{ + register caddr_t p; + + if (sc->sc_ep == (u_char *) 0) { + MCLALLOC(p, M_WAIT); + if (p) + sc->sc_ep = (u_char *)p + SLBUFSIZE; + else { + printf("sl%d: can't allocate buffer\n", sc - sl_softc); + sc->sc_if.if_flags &= ~IFF_UP; + return (0); + } + } + sc->sc_buf = sc->sc_ep - SLMAX; + sc->sc_mp = sc->sc_buf; + sl_compress_init(&sc->sc_comp); + return (1); +} + +/* + * Line specific open routine. + * Attach the given tty to the first available sl unit. + */ +/* ARGSUSED */ +int +slopen(dev, tp) + dev_t dev; + register struct tty *tp; +{ + struct proc *p = curproc; /* XXX */ + register struct sl_softc *sc; + register int nsl; + int error; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + + if (tp->t_line == SLIPDISC) + return (0); + + for (nsl = NSL, sc = sl_softc; --nsl >= 0; sc++) + if (sc->sc_ttyp == NULL) { + if (slinit(sc) == 0) + return (ENOBUFS); + tp->t_sc = (caddr_t)sc; + sc->sc_ttyp = tp; + sc->sc_if.if_baudrate = tp->t_ospeed; + ttyflush(tp, FREAD | FWRITE); + return (0); + } + return (ENXIO); +} + +/* + * Line specific close routine. + * Detach the tty from the sl unit. 
+ */ +void +slclose(tp) + struct tty *tp; +{ + register struct sl_softc *sc; + int s; + + ttywflush(tp); + s = splimp(); /* actually, max(spltty, splnet) */ + tp->t_line = 0; + sc = (struct sl_softc *)tp->t_sc; + if (sc != NULL) { + if_down(&sc->sc_if); + sc->sc_ttyp = NULL; + tp->t_sc = NULL; + MCLFREE((caddr_t)(sc->sc_ep - SLBUFSIZE)); + sc->sc_ep = 0; + sc->sc_mp = 0; + sc->sc_buf = 0; + } + splx(s); +} + +/* + * Line specific (tty) ioctl routine. + * Provide a way to get the sl unit number. + */ +/* ARGSUSED */ +int +sltioctl(tp, cmd, data, flag) + struct tty *tp; + int cmd; + caddr_t data; + int flag; +{ + struct sl_softc *sc = (struct sl_softc *)tp->t_sc; + + switch (cmd) { + case SLIOCGUNIT: + *(int *)data = sc->sc_if.if_unit; + break; + + default: + return (-1); + } + return (0); +} + +/* + * Queue a packet. Start transmission if not active. + * Compression happens in slstart; if we do it here, IP TOS + * will cause us to not compress "background" packets, because + * ordering gets trashed. It can be done for all packets in slstart. + */ +int +sloutput(ifp, m, dst, rtp) + struct ifnet *ifp; + register struct mbuf *m; + struct sockaddr *dst; + struct rtentry *rtp; +{ + register struct sl_softc *sc = &sl_softc[ifp->if_unit]; + register struct ip *ip; + register struct ifqueue *ifq; + int s; + + /* + * `Cannot happen' (see slioctl). Someday we will extend + * the line protocol to support other address families. 
+ */ + if (dst->sa_family != AF_INET) { + printf("sl%d: af%d not supported\n", sc->sc_if.if_unit, + dst->sa_family); + m_freem(m); + sc->sc_if.if_noproto++; + return (EAFNOSUPPORT); + } + + if (sc->sc_ttyp == NULL) { + m_freem(m); + return (ENETDOWN); /* sort of */ + } + if ((sc->sc_ttyp->t_state & TS_CARR_ON) == 0 && + (sc->sc_ttyp->t_cflag & CLOCAL) == 0) { + m_freem(m); + return (EHOSTUNREACH); + } + ifq = &sc->sc_if.if_snd; + ip = mtod(m, struct ip *); + if (sc->sc_if.if_flags & SC_NOICMP && ip->ip_p == IPPROTO_ICMP) { + m_freem(m); + return (ENETRESET); /* XXX ? */ + } + if (ip->ip_tos & IPTOS_LOWDELAY) + ifq = &sc->sc_fastq; + s = splimp(); + if (IF_QFULL(ifq)) { + IF_DROP(ifq); + m_freem(m); + splx(s); + sc->sc_if.if_oerrors++; + return (ENOBUFS); + } + IF_ENQUEUE(ifq, m); + sc->sc_if.if_lastchange = time; + if (sc->sc_ttyp->t_outq.c_cc == 0) + slstart(sc->sc_ttyp); + splx(s); + return (0); +} + +/* + * Start output on interface. Get another datagram + * to send from the interface queue and map it to + * the interface before starting output. + */ +void +slstart(tp) + register struct tty *tp; +{ + register struct sl_softc *sc = (struct sl_softc *)tp->t_sc; + register struct mbuf *m; + register u_char *cp; + register struct ip *ip; + int s; + struct mbuf *m2; +#if NBPFILTER > 0 + u_char bpfbuf[SLMTU + SLIP_HDRLEN]; + register int len; +#endif + extern int cfreecount; + + for (;;) { + /* + * If there is more in the output queue, just send it now. + * We are being called in lieu of ttstart and must do what + * it would. + */ + if (tp->t_outq.c_cc != 0) { + (*tp->t_oproc)(tp); + if (tp->t_outq.c_cc > SLIP_HIWAT) + return; + } + /* + * This happens briefly when the line shuts down. + */ + if (sc == NULL) + return; + + /* + * Get a packet and send it to the interface. 
+ */ + s = splimp(); + IF_DEQUEUE(&sc->sc_fastq, m); + if (m) + sc->sc_if.if_omcasts++; /* XXX */ + else + IF_DEQUEUE(&sc->sc_if.if_snd, m); + splx(s); + if (m == NULL) + return; + + /* + * We do the header compression here rather than in sloutput + * because the packets will be out of order if we are using TOS + * queueing, and the connection id compression will get + * munged when this happens. + */ +#if NBPFILTER > 0 + if (sc->sc_bpf) { + /* + * We need to save the TCP/IP header before it's + * compressed. To avoid complicated code, we just + * copy the entire packet into a stack buffer (since + * this is a serial line, packets should be short + * and/or the copy should be negligible cost compared + * to the packet transmission time). + */ + register struct mbuf *m1 = m; + register u_char *cp = bpfbuf + SLIP_HDRLEN; + + len = 0; + do { + register int mlen = m1->m_len; + + bcopy(mtod(m1, caddr_t), cp, mlen); + cp += mlen; + len += mlen; + } while (m1 = m1->m_next); + } +#endif + if ((ip = mtod(m, struct ip *))->ip_p == IPPROTO_TCP) { + if (sc->sc_if.if_flags & SC_COMPRESS) + *mtod(m, u_char *) |= sl_compress_tcp(m, ip, + &sc->sc_comp, 1); + } +#if NBPFILTER > 0 + if (sc->sc_bpf) { + /* + * Put the SLIP pseudo-"link header" in place. The + * compressed header is now at the beginning of the + * mbuf. + */ + bpfbuf[SLX_DIR] = SLIPDIR_OUT; + bcopy(mtod(m, caddr_t), &bpfbuf[SLX_CHDR], CHDR_LEN); + bpf_tap(sc->sc_bpf, bpfbuf, len + SLIP_HDRLEN); + } +#endif + sc->sc_if.if_lastchange = time; + + /* + * If system is getting low on clists, just flush our + * output queue (if the stuff was important, it'll get + * retransmitted). + */ + if (cfreecount < CLISTRESERVE + SLMTU) { + m_freem(m); + sc->sc_if.if_collisions++; + continue; + } + /* + * The extra FRAME_END will start up a new packet, and thus + * will flush any accumulated garbage. We do this whenever + * the line may have been idle for some time. 
+ */ + if (tp->t_outq.c_cc == 0) { + ++sc->sc_if.if_obytes; + (void) putc(FRAME_END, &tp->t_outq); + } + + while (m) { + register u_char *ep; + + cp = mtod(m, u_char *); ep = cp + m->m_len; + while (cp < ep) { + /* + * Find out how many bytes in the string we can + * handle without doing something special. + */ + register u_char *bp = cp; + + while (cp < ep) { + switch (*cp++) { + case FRAME_ESCAPE: + case FRAME_END: + --cp; + goto out; + } + } + out: + if (cp > bp) { + /* + * Put n characters at once + * into the tty output queue. + */ + if (b_to_q((char *)bp, cp - bp, + &tp->t_outq)) + break; + sc->sc_if.if_obytes += cp - bp; + } + /* + * If there are characters left in the mbuf, + * the first one must be special.. + * Put it out in a different form. + */ + if (cp < ep) { + if (putc(FRAME_ESCAPE, &tp->t_outq)) + break; + if (putc(*cp++ == FRAME_ESCAPE ? + TRANS_FRAME_ESCAPE : TRANS_FRAME_END, + &tp->t_outq)) { + (void) unputc(&tp->t_outq); + break; + } + sc->sc_if.if_obytes += 2; + } + } + MFREE(m, m2); + m = m2; + } + + if (putc(FRAME_END, &tp->t_outq)) { + /* + * Not enough room. Remove a char to make room + * and end the packet normally. + * If you get many collisions (more than one or two + * a day) you probably do not have enough clists + * and you should increase "nclist" in param.c. + */ + (void) unputc(&tp->t_outq); + (void) putc(FRAME_END, &tp->t_outq); + sc->sc_if.if_collisions++; + } else { + ++sc->sc_if.if_obytes; + sc->sc_if.if_opackets++; + } + } +} + +/* + * Copy data buffer to mbuf chain; add ifnet pointer. + */ +static struct mbuf * +sl_btom(sc, len) + register struct sl_softc *sc; + register int len; +{ + register struct mbuf *m; + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) + return (NULL); + + /* + * If we have more than MHLEN bytes, it's cheaper to + * queue the cluster we just filled & allocate a new one + * for the input buffer. Otherwise, fill the mbuf we + * allocated above. 
Note that code in the input routine + * guarantees that packet will fit in a cluster. + */ + if (len >= MHLEN) { + MCLGET(m, M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + /* + * we couldn't get a cluster - if memory's this + * low, it's time to start dropping packets. + */ + (void) m_free(m); + return (NULL); + } + sc->sc_ep = mtod(m, u_char *) + SLBUFSIZE; + m->m_data = (caddr_t)sc->sc_buf; + m->m_ext.ext_buf = (caddr_t)((int)sc->sc_buf &~ MCLOFSET); + } else + bcopy((caddr_t)sc->sc_buf, mtod(m, caddr_t), len); + + m->m_len = len; + m->m_pkthdr.len = len; + m->m_pkthdr.rcvif = &sc->sc_if; + return (m); +} + +/* + * tty interface receiver interrupt. + */ +void +slinput(c, tp) + register int c; + register struct tty *tp; +{ + register struct sl_softc *sc; + register struct mbuf *m; + register int len; + int s; +#if NBPFILTER > 0 + u_char chdr[CHDR_LEN]; +#endif + + tk_nin++; + sc = (struct sl_softc *)tp->t_sc; + if (sc == NULL) + return; + if (c & TTY_ERRORMASK || ((tp->t_state & TS_CARR_ON) == 0 && + (tp->t_cflag & CLOCAL) == 0)) { + sc->sc_flags |= SC_ERROR; + return; + } + c &= TTY_CHARMASK; + + ++sc->sc_if.if_ibytes; + + if (sc->sc_if.if_flags & IFF_DEBUG) { + if (c == ABT_ESC) { + /* + * If we have a previous abort, see whether + * this one is within the time limit. + */ + if (sc->sc_abortcount && + time.tv_sec >= sc->sc_starttime + ABT_WINDOW) + sc->sc_abortcount = 0; + /* + * If we see an abort after "idle" time, count it; + * record when the first abort escape arrived. 
+ */ + if (time.tv_sec >= sc->sc_lasttime + ABT_IDLE) { + if (++sc->sc_abortcount == 1) + sc->sc_starttime = time.tv_sec; + if (sc->sc_abortcount >= ABT_COUNT) { + slclose(tp); + return; + } + } + } else + sc->sc_abortcount = 0; + sc->sc_lasttime = time.tv_sec; + } + + switch (c) { + + case TRANS_FRAME_ESCAPE: + if (sc->sc_escape) + c = FRAME_ESCAPE; + break; + + case TRANS_FRAME_END: + if (sc->sc_escape) + c = FRAME_END; + break; + + case FRAME_ESCAPE: + sc->sc_escape = 1; + return; + + case FRAME_END: + if(sc->sc_flags & SC_ERROR) { + sc->sc_flags &= ~SC_ERROR; + goto newpack; + } + len = sc->sc_mp - sc->sc_buf; + if (len < 3) + /* less than min length packet - ignore */ + goto newpack; + +#if NBPFILTER > 0 + if (sc->sc_bpf) { + /* + * Save the compressed header, so we + * can tack it on later. Note that we + * will end up copying garbage in some + * cases but this is okay. We remember + * where the buffer started so we can + * compute the new header length. + */ + bcopy(sc->sc_buf, chdr, CHDR_LEN); + } +#endif + + if ((c = (*sc->sc_buf & 0xf0)) != (IPVERSION << 4)) { + if (c & 0x80) + c = TYPE_COMPRESSED_TCP; + else if (c == TYPE_UNCOMPRESSED_TCP) + *sc->sc_buf &= 0x4f; /* XXX */ + /* + * We've got something that's not an IP packet. + * If compression is enabled, try to decompress it. + * Otherwise, if `auto-enable' compression is on and + * it's a reasonable packet, decompress it and then + * enable compression. Otherwise, drop it. + */ + if (sc->sc_if.if_flags & SC_COMPRESS) { + len = sl_uncompress_tcp(&sc->sc_buf, len, + (u_int)c, &sc->sc_comp); + if (len <= 0) + goto error; + } else if ((sc->sc_if.if_flags & SC_AUTOCOMP) && + c == TYPE_UNCOMPRESSED_TCP && len >= 40) { + len = sl_uncompress_tcp(&sc->sc_buf, len, + (u_int)c, &sc->sc_comp); + if (len <= 0) + goto error; + sc->sc_if.if_flags |= SC_COMPRESS; + } else + goto error; + } +#if NBPFILTER > 0 + if (sc->sc_bpf) { + /* + * Put the SLIP pseudo-"link header" in place. 
+ * We couldn't do this any earlier since + * decompression probably moved the buffer + * pointer. Then, invoke BPF. + */ + register u_char *hp = sc->sc_buf - SLIP_HDRLEN; + + hp[SLX_DIR] = SLIPDIR_IN; + bcopy(chdr, &hp[SLX_CHDR], CHDR_LEN); + bpf_tap(sc->sc_bpf, hp, len + SLIP_HDRLEN); + } +#endif + m = sl_btom(sc, len); + if (m == NULL) + goto error; + + sc->sc_if.if_ipackets++; + sc->sc_if.if_lastchange = time; + s = splimp(); + if (IF_QFULL(&ipintrq)) { + IF_DROP(&ipintrq); + sc->sc_if.if_ierrors++; + sc->sc_if.if_iqdrops++; + m_freem(m); + } else { + IF_ENQUEUE(&ipintrq, m); + schednetisr(NETISR_IP); + } + splx(s); + goto newpack; + } + if (sc->sc_mp < sc->sc_ep) { + *sc->sc_mp++ = c; + sc->sc_escape = 0; + return; + } + + /* can't put lower; would miss an extra frame */ + sc->sc_flags |= SC_ERROR; + +error: + sc->sc_if.if_ierrors++; +newpack: + sc->sc_mp = sc->sc_buf = sc->sc_ep - SLMAX; + sc->sc_escape = 0; +} + +/* + * Process an ioctl request. + */ +int +slioctl(ifp, cmd, data) + register struct ifnet *ifp; + int cmd; + caddr_t data; +{ + register struct ifaddr *ifa = (struct ifaddr *)data; + register struct ifreq *ifr; + register int s = splimp(), error = 0; + + switch (cmd) { + + case SIOCSIFADDR: + if (ifa->ifa_addr->sa_family == AF_INET) + ifp->if_flags |= IFF_UP; + else + error = EAFNOSUPPORT; + break; + + case SIOCSIFDSTADDR: + if (ifa->ifa_addr->sa_family != AF_INET) + error = EAFNOSUPPORT; + break; + + case SIOCADDMULTI: + case SIOCDELMULTI: + ifr = (struct ifreq *)data; + if (ifr == 0) { + error = EAFNOSUPPORT; /* XXX */ + break; + } + switch (ifr->ifr_addr.sa_family) { + +#ifdef INET + case AF_INET: + break; +#endif + + default: + error = EAFNOSUPPORT; + break; + } + break; + + default: + error = EINVAL; + } + splx(s); + return (error); +} +#endif diff --git a/sys/net/if_slvar.h b/sys/net/if_slvar.h new file mode 100644 index 00000000000..e7b27647284 --- /dev/null +++ b/sys/net/if_slvar.h @@ -0,0 +1,80 @@ +/*- + * Copyright (c) 1991, 1993 + * The 
Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)if_slvar.h 8.3 (Berkeley) 2/1/94 + * + * $Header: if_slvar.h,v 1.3 89/05/31 02:25:18 van Exp $ + */ + +/* + * Definitions for SLIP interface data structures + * + * (This exists so programs like slstats can get at the definition + * of sl_softc.) + */ +struct sl_softc { + struct ifnet sc_if; /* network-visible interface */ + struct ifqueue sc_fastq; /* interactive output queue */ + struct tty *sc_ttyp; /* pointer to tty structure */ + u_char *sc_mp; /* pointer to next available buf char */ + u_char *sc_ep; /* pointer just past last available buf char */ + u_char *sc_buf; /* input buffer */ + u_int sc_flags; /* see below */ + u_int sc_escape; /* =1 if last char input was FRAME_ESCAPE */ + long sc_lasttime; /* last time a char arrived */ + long sc_abortcount; /* number of abort escape chars */ + long sc_starttime; /* time of first abort in window */ +#ifdef INET /* XXX */ + struct slcompress sc_comp; /* tcp compression data */ +#endif + caddr_t sc_bpf; /* BPF data */ +}; + +/* internal flags */ +#define SC_ERROR 0x0001 /* had an input error */ + +/* visible flags */ +#define SC_COMPRESS IFF_LINK0 /* compress TCP traffic */ +#define SC_NOICMP IFF_LINK1 /* suppress ICMP traffic */ +#define SC_AUTOCOMP IFF_LINK2 /* auto-enable TCP compression */ + +#ifdef KERNEL +void slattach __P((void)); +void slclose __P((struct tty *)); +void slinput __P((int, struct tty *)); +int slioctl __P((struct ifnet *, int, caddr_t)); +int slopen __P((dev_t, struct tty *)); +int sloutput __P((struct ifnet *, + struct mbuf *, struct sockaddr *, struct rtentry *)); +void slstart __P((struct tty *)); +int sltioctl __P((struct tty *, int, caddr_t, int)); +#endif /* KERNEL */ diff --git a/sys/net/if_types.h b/sys/net/if_types.h new file mode 100644 index 00000000000..030f234fbac --- /dev/null +++ b/sys/net/if_types.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 1989, 1993, 1994 + * The Regents of the University of California. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_types.h 8.2 (Berkeley) 4/20/94 + */ + +/* + * Interface types for benefit of parsing media address headers. + * This list is derived from the SNMP list of ifTypes, currently + * documented in RFC1573. 
+ */ + +#define IFT_OTHER 0x1 /* none of the following */ +#define IFT_1822 0x2 /* old-style arpanet imp */ +#define IFT_HDH1822 0x3 /* HDH arpanet imp */ +#define IFT_X25DDN 0x4 /* x25 to imp */ +#define IFT_X25 0x5 /* PDN X25 interface (RFC877) */ +#define IFT_ETHER 0x6 /* Ethernet CSMACD */ +#define IFT_ISO88023 0x7 /* CSMA CD */ +#define IFT_ISO88024 0x8 /* Token Bus */ +#define IFT_ISO88025 0x9 /* Token Ring */ +#define IFT_ISO88026 0xa /* MAN */ +#define IFT_STARLAN 0xb +#define IFT_P10 0xc /* Proteon 10MBit ring */ +#define IFT_P80 0xd /* Proteon 80MBit ring */ +#define IFT_HY 0xe /* Hyperchannel */ +#define IFT_FDDI 0xf +#define IFT_LAPB 0x10 +#define IFT_SDLC 0x11 +#define IFT_T1 0x12 +#define IFT_CEPT 0x13 /* E1 - european T1 */ +#define IFT_ISDNBASIC 0x14 +#define IFT_ISDNPRIMARY 0x15 +#define IFT_PTPSERIAL 0x16 /* Proprietary PTP serial */ +#define IFT_PPP 0x17 /* RFC 1331 */ +#define IFT_LOOP 0x18 /* loopback */ +#define IFT_EON 0x19 /* ISO over IP */ +#define IFT_XETHER 0x1a /* obsolete 3MB experimental ethernet */ +#define IFT_NSIP 0x1b /* XNS over IP */ +#define IFT_SLIP 0x1c /* IP over generic TTY */ +#define IFT_ULTRA 0x1d /* Ultra Technologies */ +#define IFT_DS3 0x1e /* Generic T3 */ +#define IFT_SIP 0x1f /* SMDS */ +#define IFT_FRELAY 0x20 /* Frame Relay DTE only */ +#define IFT_RS232 0x21 +#define IFT_PARA 0x22 /* parallel-port */ +#define IFT_ARCNET 0x23 +#define IFT_ARCNETPLUS 0x24 +#define IFT_ATM 0x25 /* ATM cells */ +#define IFT_MIOX25 0x26 +#define IFT_SONET 0x27 /* SONET or SDH */ +#define IFT_X25PLE 0x28 +#define IFT_ISO88022LLC 0x29 +#define IFT_LOCALTALK 0x2a +#define IFT_SMDSDXI 0x2b +#define IFT_FRELAYDCE 0x2c /* Frame Relay DCE */ +#define IFT_V35 0x2d +#define IFT_HSSI 0x2e +#define IFT_HIPPI 0x2f +#define IFT_MODEM 0x30 /* Generic Modem */ +#define IFT_AAL5 0x31 /* AAL5 over ATM */ +#define IFT_SONETPATH 0x32 +#define IFT_SONETVT 0x33 +#define IFT_SMDSICIP 0x34 /* SMDS InterCarrier Interface */ +#define IFT_PROPVIRTUAL 0x35 /*
Proprietary Virtual/internal */ +#define IFT_PROPMUX 0x36 /* Proprietary Multiplexing */ diff --git a/sys/net/netisr.h b/sys/net/netisr.h new file mode 100644 index 00000000000..e2e465379d3 --- /dev/null +++ b/sys/net/netisr.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 1980, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)netisr.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * The networking code runs off software interrupts. + * + * You can switch into the network by doing splnet() and return by splx(). + * The software interrupt level for the network is higher than the software + * level for the clock (so you can enter the network in routines called + * at timeout time). + */ +#if defined(vax) || defined(tahoe) +#define setsoftnet() mtpr(SIRR, 12) +#endif + +/* + * Each ``pup-level-1'' input queue has a bit in a ``netisr'' status + * word which is used to de-multiplex a single software + * interrupt used for scheduling the network code to calls + * on the lowest level routine of each protocol.
+ */ +#define NETISR_RAW 0 /* same as AF_UNSPEC */ +#define NETISR_IP 2 /* same as AF_INET */ +#define NETISR_IMP 3 /* same as AF_IMPLINK */ +#define NETISR_NS 6 /* same as AF_NS */ +#define NETISR_ISO 7 /* same as AF_ISO */ +#define NETISR_CCITT 10 /* same as AF_CCITT */ +#define NETISR_ARP 18 /* same as AF_LINK */ +/* schednetisr(anisr): set bit anisr in netisr, then call setsoftnet(). Expands to a bare brace block, not a single expression. */ +#define schednetisr(anisr) { netisr |= 1<<(anisr); setsoftnet(); } + +#ifdef i386 +/* XXX Temporary -- soon to vanish - wfj */ +#define NETISR_SCLK 11 /* softclock */ +#define NETISR_AST 12 /* ast -- resched */ +/* i386 override: increment softem when netisr was 0 before this request, then set the bit; setsoftnet() is not called in this variant. */ +#undef schednetisr +#define schednetisr(anisr) {\ + if(netisr == 0) { \ + softem++; \ + } \ + netisr |= 1<<(anisr); \ +} +#ifndef LOCORE +#ifdef KERNEL +int softem; /* counter bumped by the i386 schednetisr above */ +#endif +#endif +#endif /* i386 */ + +#ifndef LOCORE +#ifdef KERNEL +int netisr; /* scheduling bits for network, one bit per NETISR_* value */ +#endif +#endif diff --git a/sys/net/radix.c b/sys/net/radix.c new file mode 100644 index 00000000000..f182eb77abf --- /dev/null +++ b/sys/net/radix.c @@ -0,0 +1,757 @@ +/* + * Copyright (c) 1988, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4.
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)radix.c 8.2 (Berkeley) 1/4/94 + */ + +/* + * Routines to build and maintain radix trees for routing lookups. + */ +#ifndef RNF_NORMAL +#include +#include +#include +#define M_DONTWAIT M_NOWAIT +#ifdef KERNEL +#include +#endif +#endif + +#include +/* File-scope state. max_keylen: largest key length in bytes (raised from the domains' dom_maxrtkey in rn_init under KERNEL). rn_mkfreelist: free list used by MKGet and MKFree. mask_rnhead: head of the tree of netmasks, created by rn_init. gotOddMasks: set by rn_addmask when the first non-0xff byte of a mask is nonzero. maskedKey, rn_zeros, rn_ones: three max_keylen-byte regions of one buffer allocated by rn_init (scratch key, all-zero key, all-one key). */ +int max_keylen; +struct radix_mask *rn_mkfreelist; +struct radix_node_head *mask_rnhead; +static int gotOddMasks; +static char *maskedKey; +static char *rn_zeros, *rn_ones; + +#define rn_masktop (mask_rnhead->rnh_treetop) +#undef Bcmp +#define Bcmp(a, b, l) (l == 0 ? 0 : bcmp((caddr_t)(a), (caddr_t)(b), (u_long)l)) /* local override: a zero-length compare yields 0 (equal) without calling bcmp */ +/* + * The data structure for the keys is a radix tree with one way + * branching removed. The index rn_b at an internal node n represents a bit + * position to be tested. The tree is arranged so that all descendants + * of a node n have keys whose bits all agree up to position rn_b - 1. + * (We say the index of n is rn_b.)
+ * + * There is at least one descendant which has a one bit at position rn_b, + * and at least one with a zero there. + * + * A route is determined by a pair of key and mask. We require that the + * bit-wise logical and of the key and mask be the key. + * We define the index of a route associated with the mask to be + * the first bit number in the mask where 0 occurs (with bit number 0 + * representing the highest order bit). + * + * We say a mask is normal if every bit is 0, past the index of the mask. + * If a node n has a descendant (k, m) with index(m) == index(n) == rn_b, + * and m is a normal mask, then the route applies to every descendant of n. + * If the index(m) < rn_b, this implies the trailing last few bits of k + * before bit b are all 0, (and hence the same is true of every descendant + * of n), so the route applies to all descendants of the node as well. + * + * The present version of the code makes no use of normal routes, + * but similar logic shows that a non-normal mask m such that + * index(m) <= index(n) could potentially apply to many children of n. + * Thus, for each non-host route, we attach its mask to a list at an internal + * node as high in the tree as we can go.
+ */ +/* rn_search: starting at head, follow rn_r when the tested bit of key v_arg is set and rn_l otherwise, until a leaf (rn_b < 0) is reached; returns that leaf. No key comparison is done here. */ +struct radix_node * +rn_search(v_arg, head) + void *v_arg; + struct radix_node *head; +{ + register struct radix_node *x; + register caddr_t v; + + for (x = head, v = v_arg; x->rn_b >= 0;) { + if (x->rn_bmask & v[x->rn_off]) + x = x->rn_r; + else + x = x->rn_l; + } + return (x); +}; +/* rn_search_m: like rn_search, but goes right only when the tested bit is set in both the mask m_arg and the key v_arg. */ +struct radix_node * +rn_search_m(v_arg, head, m_arg) + struct radix_node *head; + void *v_arg, *m_arg; +{ + register struct radix_node *x; + register caddr_t v = v_arg, m = m_arg; + + for (x = head; x->rn_b >= 0;) { + if ((x->rn_bmask & m[x->rn_off]) && + (x->rn_bmask & v[x->rn_off])) + x = x->rn_r; + else + x = x->rn_l; + } + return x; +}; +/* rn_refines: the first byte of each mask is its length. Returns 1 when every bit set in mask n_arg is also set in mask m_arg and the two masks are not identical; returns 0 otherwise. */ +int +rn_refines(m_arg, n_arg) + void *m_arg, *n_arg; +{ + register caddr_t m = m_arg, n = n_arg; + register caddr_t lim, lim2 = lim = n + *(u_char *)n; + int longer = (*(u_char *)n++) - (int)(*(u_char *)m++); + int masks_are_equal = 1; + + if (longer > 0) + lim -= longer; + while (n < lim) { + if (*n & ~(*m)) + return 0; + if (*n++ != *m++) + masks_are_equal = 0; + + } + while (n < lim2) + if (*n++) + return 0; + if (masks_are_equal && (longer < 0)) + for (lim2 = m - longer; m < lim2; ) + if (*m++) + return 1; + return (!masks_are_equal); +} + +/* rn_match: look up key v_arg in head's tree. Tries, in order: an exact match on the leaf reached by bit tests; a masked match against each leaf on that leaf's rn_dupedkey chain; then, walking up through rn_p to the top, a masked search for every mask on each ancestor's rn_mklist. Returns the matching leaf, or 0 if none matches. */ +struct radix_node * +rn_match(v_arg, head) + void *v_arg; + struct radix_node_head *head; +{ + caddr_t v = v_arg; + register struct radix_node *t = head->rnh_treetop, *x; + register caddr_t cp = v, cp2, cp3; + caddr_t cplim, mstart; + struct radix_node *saved_t, *top = t; + int off = t->rn_off, vlen = *(u_char *)cp, matched_off; + + /* + * Open code rn_search(v, top) to avoid overhead of extra + * subroutine call. + */ + for (; t->rn_b >= 0; ) { + if (t->rn_bmask & cp[t->rn_off]) + t = t->rn_r; + else + t = t->rn_l; + } + /* + * See if we match exactly as a host destination + */ + cp += off; cp2 = t->rn_key + off; cplim = v + vlen; + for (; cp < cplim; cp++, cp2++) + if (*cp != *cp2) + goto on1; + /* + * This extra grot is in case we are explicitly asked + * to look up the default. Ugh!
+ */ + if ((t->rn_flags & RNF_ROOT) && t->rn_dupedkey) + t = t->rn_dupedkey; + return t; +on1: + matched_off = cp - v; + saved_t = t; + do { + if (t->rn_mask) { + /* + * Even if we don't match exactly as a host, + * we may match if the leaf we wound up at is + * a route to a net. + */ + cp3 = matched_off + t->rn_mask; + cp2 = matched_off + t->rn_key; + for (; cp < cplim; cp++) + if ((*cp2++ ^ *cp) & *cp3++) + break; + if (cp == cplim) + return t; + cp = matched_off + v; + } + } while (t = t->rn_dupedkey); + t = saved_t; + /* start searching up the tree */ + do { + register struct radix_mask *m; + t = t->rn_p; + if (m = t->rn_mklist) { + /* + * After doing measurements here, it may + * turn out to be faster to open code + * rn_search_m here instead of always + * copying and masking. + */ + off = min(t->rn_off, matched_off); + mstart = maskedKey + off; + do { + cp2 = mstart; + cp3 = m->rm_mask + off; + for (cp = v + off; cp < cplim;) + *cp2++ = *cp++ & *cp3++; + x = rn_search(maskedKey, t); + while (x && x->rn_mask != m->rm_mask) + x = x->rn_dupedkey; + if (x && + (Bcmp(mstart, x->rn_key + off, + vlen - off) == 0)) + return x; + } while (m = m->rm_mklist); + } + } while (t != top); + return 0; +}; + +#ifdef RN_DEBUG +int rn_nodenum; +struct radix_node *rn_clist; +int rn_saveinfo; +int rn_debug = 1; +#endif +/* rn_newpair: initialize nodes[0] as a leaf for key v and nodes[1] as an internal node that tests bit b and has that leaf as its left child; both are flagged RNF_ACTIVE. Returns the internal node; its right child and parent are left for the caller to set. */ +struct radix_node * +rn_newpair(v, b, nodes) + void *v; + int b; + struct radix_node nodes[2]; +{ + register struct radix_node *tt = nodes, *t = tt + 1; + t->rn_b = b; t->rn_bmask = 0x80 >> (b & 7); + t->rn_l = tt; t->rn_off = b >> 3; + tt->rn_b = -1; tt->rn_key = (caddr_t)v; tt->rn_p = t; + tt->rn_flags = t->rn_flags = RNF_ACTIVE; +#ifdef RN_DEBUG + tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++; + tt->rn_twin = t; tt->rn_ybro = rn_clist; rn_clist = tt; +#endif + return t; +} +/* rn_insert: insert key v_arg into head's tree using the caller-supplied nodes[2]. If a leaf with an identical key already exists, *dupentry is set to 1 and that leaf is returned with the tree unchanged; otherwise *dupentry is set to 0, a new pair is spliced in at the first differing bit, and the new leaf is returned. */ +struct radix_node * +rn_insert(v_arg, head, dupentry, nodes) + void *v_arg; + struct radix_node_head *head; + int *dupentry; + struct radix_node nodes[2]; +{ + caddr_t v =
v_arg; + struct radix_node *top = head->rnh_treetop; + int head_off = top->rn_off, vlen = (int)*((u_char *)v); + register struct radix_node *t = rn_search(v_arg, top); + register caddr_t cp = v + head_off; + register int b; + struct radix_node *tt; + /* + * find first bit at which v and t->rn_key differ + */ + { + register caddr_t cp2 = t->rn_key + head_off; + register int cmp_res; + caddr_t cplim = v + vlen; + + while (cp < cplim) + if (*cp2++ != *cp++) + goto on1; + *dupentry = 1; + return t; +on1: + *dupentry = 0; + cmp_res = (cp[-1] ^ cp2[-1]) & 0xff; + for (b = (cp - v) << 3; cmp_res; b--) + cmp_res >>= 1; + } + { + register struct radix_node *p, *x = top; + cp = v; + do { + p = x; + if (cp[x->rn_off] & x->rn_bmask) + x = x->rn_r; + else x = x->rn_l; + } while (b > (unsigned) x->rn_b); /* x->rn_b < b && x->rn_b >= 0 */ +#ifdef RN_DEBUG + if (rn_debug) + printf("Going In:\n"), traverse(p); +#endif + t = rn_newpair(v_arg, b, nodes); tt = t->rn_l; + if ((cp[p->rn_off] & p->rn_bmask) == 0) + p->rn_l = t; + else + p->rn_r = t; + x->rn_p = t; t->rn_p = p; /* frees x, p as temp vars below */ + if ((cp[t->rn_off] & t->rn_bmask) == 0) { + t->rn_r = x; + } else { + t->rn_r = tt; t->rn_l = x; + } +#ifdef RN_DEBUG + if (rn_debug) + printf("Coming out:\n"), traverse(p); +#endif + } + return (tt); +} +/* rn_addmask: enter netmask n_arg into the mask tree (mask_rnhead) and return its leaf, whose rn_b is set to -1 - index(mask). When search is nonzero and an identical mask is already present, that existing leaf is returned instead. skip is the byte offset at which the scan for the first non-0xff byte starts. Returns 0 if the allocation fails. */ +struct radix_node * +rn_addmask(n_arg, search, skip) + int search, skip; + void *n_arg; +{ + caddr_t netmask = (caddr_t)n_arg; + register struct radix_node *x; + register caddr_t cp, cplim; + register int b, mlen, j; + int maskduplicated; + + mlen = *(u_char *)netmask; + if (search) { + x = rn_search(netmask, rn_masktop); + mlen = *(u_char *)netmask; + if (Bcmp(netmask, x->rn_key, mlen) == 0) + return (x); + } + R_Malloc(x, struct radix_node *, max_keylen + 2 * sizeof (*x)); + if (x == 0) + return (0); + Bzero(x, max_keylen + 2 * sizeof (*x)); + cp = (caddr_t)(x + 2); + Bcopy(netmask, cp, mlen); + netmask = cp; + x = rn_insert(netmask, mask_rnhead, &maskduplicated, x); /* NOTE(review): maskduplicated is never examined; if rn_insert returned an existing leaf, the buffer allocated above is not freed -- confirm callers cannot reach that case */ + /* +
* Calculate index of mask. + */ + cplim = netmask + mlen; + for (cp = netmask + skip; cp < cplim; cp++) + if (*(u_char *)cp != 0xff) + break; + b = (cp - netmask) << 3; + if (cp != cplim) { + if (*cp != 0) { + gotOddMasks = 1; + for (j = 0x80; j; b++, j >>= 1) + if ((j & *cp) == 0) + break; + } + } + x->rn_b = -1 - b; + return (x); +} +/* rn_addroute: add the route (key v_arg, optional mask n_arg) to head's tree using the caller-supplied treenodes[2]. Returns the new leaf, or 0 if the mask could not be entered (allocation failure) or a leaf with the same key and the same mask already exists. If only the mask-list annotation cannot be allocated, a message is printed and the leaf is still returned. */ +struct radix_node * +rn_addroute(v_arg, n_arg, head, treenodes) + void *v_arg, *n_arg; + struct radix_node_head *head; + struct radix_node treenodes[2]; +{ + caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg; + register struct radix_node *t, *x, *tt; + struct radix_node *saved_tt, *top = head->rnh_treetop; + short b = 0, b_leaf; + int mlen, keyduplicated; + caddr_t cplim; + struct radix_mask *m, **mp; + + /* + * In dealing with non-contiguous masks, there may be + * many different routes which have the same mask. + * We will find it useful to have a unique pointer to + * the mask to speed avoiding duplicate references at + * nodes and possibly save time in calculating indices. + */ + if (netmask) { + x = rn_search(netmask, rn_masktop); + mlen = *(u_char *)netmask; + if (Bcmp(netmask, x->rn_key, mlen) != 0) { + x = rn_addmask(netmask, 0, top->rn_off); + if (x == 0) + return (0); + } + netmask = x->rn_key; + b = -1 - x->rn_b; + } + /* + * Deal with duplicated keys: attach node to previous instance + */ + saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes); + if (keyduplicated) { + do { + if (tt->rn_mask == netmask) + return (0); + t = tt; + if (netmask == 0 || + (tt->rn_mask && rn_refines(netmask, tt->rn_mask))) + break; + } while (tt = tt->rn_dupedkey); + /* + * If the mask is not duplicated, we wouldn't + * find it among possible duplicate key entries + * anyway, so the above test doesn't hurt. + * + * We sort the masks for a duplicated key the same way as + * in a masklist -- most specific to least specific. + * This may require the unfortunate nuisance of relocating + * the head of the list.
+ */ + if (tt && t == saved_tt) { + struct radix_node *xx = x; + /* link in at head of list */ + (tt = treenodes)->rn_dupedkey = t; + tt->rn_flags = t->rn_flags; + tt->rn_p = x = t->rn_p; + if (x->rn_l == t) x->rn_l = tt; else x->rn_r = tt; + saved_tt = tt; x = xx; + } else { + (tt = treenodes)->rn_dupedkey = t->rn_dupedkey; + t->rn_dupedkey = tt; + } +#ifdef RN_DEBUG + t=tt+1; tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++; + tt->rn_twin = t; tt->rn_ybro = rn_clist; rn_clist = tt; +#endif + t = saved_tt; + tt->rn_key = (caddr_t) v; + tt->rn_b = -1; + tt->rn_flags = t->rn_flags & ~RNF_ROOT; + } + /* + * Put mask in tree. + */ + if (netmask) { + tt->rn_mask = netmask; + tt->rn_b = x->rn_b; + } + t = saved_tt->rn_p; + b_leaf = -1 - t->rn_b; + if (t->rn_r == saved_tt) x = t->rn_l; else x = t->rn_r; + /* Promote general routes from below */ + if (x->rn_b < 0) { + if (x->rn_mask && (x->rn_b >= b_leaf) && x->rn_mklist == 0) { + MKGet(m); + if (m) { + Bzero(m, sizeof *m); + m->rm_b = x->rn_b; + m->rm_mask = x->rn_mask; + x->rn_mklist = t->rn_mklist = m; + } + } + } else if (x->rn_mklist) { + /* + * Skip over masks whose index is > that of new node + */ + for (mp = &x->rn_mklist; m = *mp; mp = &m->rm_mklist) + if (m->rm_b >= b_leaf) + break; + t->rn_mklist = m; *mp = 0; + } + /* Add new route to highest possible ancestor's list */ + if ((netmask == 0) || (b > t->rn_b )) + return tt; /* can't lift at all */ + b_leaf = tt->rn_b; + do { + x = t; + t = t->rn_p; + } while (b <= t->rn_b && x != top); + /* + * Search through routes associated with node to + * insert new route according to index. + * For nodes of equal index, place more specific + * masks first.
+ */ + cplim = netmask + mlen; /* NOTE(review): cplim is not read again in this function */ + for (mp = &x->rn_mklist; m = *mp; mp = &m->rm_mklist) { + if (m->rm_b < b_leaf) + continue; + if (m->rm_b > b_leaf) + break; + if (m->rm_mask == netmask) { + m->rm_refs++; + tt->rn_mklist = m; + return tt; + } + if (rn_refines(netmask, m->rm_mask)) + break; + } + MKGet(m); + if (m == 0) { + printf("Mask for route not entered\n"); + return (tt); + } + Bzero(m, sizeof *m); + m->rm_b = b_leaf; + m->rm_mask = netmask; + m->rm_mklist = *mp; + *mp = m; + tt->rn_mklist = m; + return tt; +} +/* rn_delete: remove the route (key v_arg, mask netmask_arg) from head's tree, dropping its mask-list annotation when the last reference goes away. Returns the removed leaf with RNF_ACTIVE cleared on it and on the node that follows it (tt[1]), or 0 if the key or key/mask pair is not found or the leaf is flagged RNF_ROOT. */ +struct radix_node * +rn_delete(v_arg, netmask_arg, head) + void *v_arg, *netmask_arg; + struct radix_node_head *head; +{ + register struct radix_node *t, *p, *x, *tt; + struct radix_mask *m, *saved_m, **mp; + struct radix_node *dupedkey, *saved_tt, *top; + caddr_t v, netmask; + int b, head_off, vlen; + + v = v_arg; + netmask = netmask_arg; + x = head->rnh_treetop; + tt = rn_search(v, x); + head_off = x->rn_off; + vlen = *(u_char *)v; + saved_tt = tt; + top = x; + if (tt == 0 || + Bcmp(v + head_off, tt->rn_key + head_off, vlen - head_off)) + return (0); + /* + * Delete our route from mask lists.
+ */ + if (dupedkey = tt->rn_dupedkey) { + if (netmask) + netmask = rn_search(netmask, rn_masktop)->rn_key; + while (tt->rn_mask != netmask) + if ((tt = tt->rn_dupedkey) == 0) + return (0); + } + if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == 0) + goto on1; + if (m->rm_mask != tt->rn_mask) { + printf("rn_delete: inconsistent annotation\n"); + goto on1; + } + if (--m->rm_refs >= 0) + goto on1; + b = -1 - tt->rn_b; + t = saved_tt->rn_p; + if (b > t->rn_b) + goto on1; /* Wasn't lifted at all */ + do { + x = t; + t = t->rn_p; + } while (b <= t->rn_b && x != top); + for (mp = &x->rn_mklist; m = *mp; mp = &m->rm_mklist) + if (m == saved_m) { + *mp = m->rm_mklist; + MKFree(m); + break; + } + if (m == 0) + printf("rn_delete: couldn't find our annotation\n"); +on1: + /* + * Eliminate us from tree + */ + if (tt->rn_flags & RNF_ROOT) + return (0); +#ifdef RN_DEBUG + /* Get us out of the creation list */ + for (t = rn_clist; t && t->rn_ybro != tt; t = t->rn_ybro) {} + if (t) t->rn_ybro = tt->rn_ybro; +#endif + t = tt->rn_p; + if (dupedkey) { + if (tt == saved_tt) { + x = dupedkey; x->rn_p = t; + if (t->rn_l == tt) t->rn_l = x; else t->rn_r = x; + } else { + for (x = p = saved_tt; p && p->rn_dupedkey != tt;) + p = p->rn_dupedkey; + if (p) p->rn_dupedkey = tt->rn_dupedkey; + else printf("rn_delete: couldn't find us\n"); + } + t = tt + 1; + if (t->rn_flags & RNF_ACTIVE) { +#ifndef RN_DEBUG + *++x = *t; p = t->rn_p; +#else + b = t->rn_info; *++x = *t; t->rn_info = b; p = t->rn_p; +#endif + if (p->rn_l == t) p->rn_l = x; else p->rn_r = x; + x->rn_l->rn_p = x; x->rn_r->rn_p = x; + } + goto out; + } + if (t->rn_l == tt) x = t->rn_r; else x = t->rn_l; + p = t->rn_p; + if (p->rn_r == t) p->rn_r = x; else p->rn_l = x; + x->rn_p = p; + /* + * Demote routes attached to us.
+ */ + if (t->rn_mklist) { + if (x->rn_b >= 0) { + for (mp = &x->rn_mklist; m = *mp;) + mp = &m->rm_mklist; + *mp = t->rn_mklist; + } else { + for (m = t->rn_mklist; m;) { + struct radix_mask *mm = m->rm_mklist; + if (m == x->rn_mklist && (--(m->rm_refs) < 0)) { + x->rn_mklist = 0; + MKFree(m); + } else + printf("%s %x at %x\n", + "rn_delete: Orphaned Mask", m, x); + m = mm; + } + } + } + /* + * We may be holding an active internal node in the tree. + */ + x = tt + 1; + if (t != x) { +#ifndef RN_DEBUG + *t = *x; +#else + b = t->rn_info; *t = *x; t->rn_info = b; +#endif + t->rn_l->rn_p = t; t->rn_r->rn_p = t; + p = x->rn_p; + if (p->rn_l == x) p->rn_l = t; else p->rn_r = t; + } +out: + tt->rn_flags &= ~RNF_ACTIVE; + tt[1].rn_flags &= ~RNF_ACTIVE; + return (tt); +} +/* rn_walktree: call (*f)(rn, w) for every leaf of h's tree that is not flagged RNF_ROOT, following each leaf's rn_dupedkey chain, in left-to-right order. Stops at the first nonzero result from f and returns it; returns 0 after the whole tree has been visited. */ +int +rn_walktree(h, f, w) + struct radix_node_head *h; + register int (*f)(); + void *w; +{ + int error; + struct radix_node *base, *next; + register struct radix_node *rn = h->rnh_treetop; + /* + * This gets complicated because we may delete the node + * while applying the function f to it, so we need to calculate + * the successor node in advance.
+ */ + /* First time through node, go left */ + while (rn->rn_b >= 0) + rn = rn->rn_l; + for (;;) { + base = rn; + /* If at right child go back up, otherwise, go right */ + while (rn->rn_p->rn_r == rn && (rn->rn_flags & RNF_ROOT) == 0) + rn = rn->rn_p; + /* Find the next *leaf* since next node might vanish, too */ + for (rn = rn->rn_p->rn_r; rn->rn_b >= 0;) + rn = rn->rn_l; + next = rn; + /* Process leaves */ + while (rn = base) { + base = rn->rn_dupedkey; + if (!(rn->rn_flags & RNF_ROOT) && (error = (*f)(rn, w))) + return (error); + } + rn = next; + if (rn->rn_flags & RNF_ROOT) + return (0); + } + /* NOTREACHED */ +} +/* rn_inithead: if *head is already set, return 1 without doing anything. Otherwise allocate a zeroed radix_node_head, store it in *head, build the empty tree in rnh_nodes (a top node testing bit off whose left leaf has key rn_zeros and right leaf has key rn_ones, all flagged RNF_ROOT) and install rn_addroute, rn_delete, rn_match and rn_walktree as methods. Returns 1 on success, 0 if the allocation fails. */ +int +rn_inithead(head, off) + void **head; + int off; +{ + register struct radix_node_head *rnh; + register struct radix_node *t, *tt, *ttt; + if (*head) + return (1); + R_Malloc(rnh, struct radix_node_head *, sizeof (*rnh)); + if (rnh == 0) + return (0); + Bzero(rnh, sizeof (*rnh)); + *head = rnh; + t = rn_newpair(rn_zeros, off, rnh->rnh_nodes); + ttt = rnh->rnh_nodes + 2; + t->rn_r = ttt; + t->rn_p = t; + tt = t->rn_l; + tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE; + tt->rn_b = -1 - off; + *ttt = *tt; + ttt->rn_key = rn_ones; + rnh->rnh_addaddr = rn_addroute; + rnh->rnh_deladdr = rn_delete; + rnh->rnh_matchaddr = rn_match; + rnh->rnh_walktree = rn_walktree; + rnh->rnh_treetop = t; + return (1); +} +/* rn_init: one-time setup. Under KERNEL, raise max_keylen to the largest dom_maxrtkey on the domains list. If max_keylen is still 0, print a message and return without allocating. Otherwise allocate one 3 * max_keylen buffer holding rn_zeros (zero-filled), rn_ones (filled with -1) and maskedKey, then create the mask tree in mask_rnhead. Panics if either allocation fails. */ +void +rn_init() +{ + char *cp, *cplim; +#ifdef KERNEL + struct domain *dom; + + for (dom = domains; dom; dom = dom->dom_next) + if (dom->dom_maxrtkey > max_keylen) + max_keylen = dom->dom_maxrtkey; +#endif + if (max_keylen == 0) { + printf("rn_init: radix functions require max_keylen be set\n"); + return; + } + R_Malloc(rn_zeros, char *, 3 * max_keylen); + if (rn_zeros == NULL) + panic("rn_init"); + Bzero(rn_zeros, 3 * max_keylen); + rn_ones = cp = rn_zeros + max_keylen; + maskedKey = cplim = rn_ones + max_keylen; + while (cp < cplim) + *cp++ = -1; + if (rn_inithead((void **)&mask_rnhead, 0) == 0) + panic("rn_init 2"); +} diff --git
a/sys/net/radix.h b/sys/net/radix.h new file mode 100644 index 00000000000..a11057f0439 --- /dev/null +++ b/sys/net/radix.h @@ -0,0 +1,153 @@ +/* + * Copyright (c) 1988, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)radix.h 8.1 (Berkeley) 6/10/93 + */ + +#ifndef _RADIX_H_ +#define _RADIX_H_ + +/* + * Radix search tree node layout. A node with rn_b < 0 is a leaf and uses the rn_leaf members of the union; a node with rn_b >= 0 is an internal node and uses the rn_node members. + */ + +struct radix_node { + struct radix_mask *rn_mklist; /* list of masks contained in subtree */ + struct radix_node *rn_p; /* parent */ + short rn_b; /* node: bit offset to test (>= 0); leaf: -1-index(netmask) (< 0) */ + char rn_bmask; /* node: mask for bit test*/ + u_char rn_flags; /* enumerated next */ +#define RNF_NORMAL 1 /* leaf contains normal route */ +#define RNF_ROOT 2 /* leaf is root leaf for tree */ +#define RNF_ACTIVE 4 /* This node is alive (for rtfree) */ + union { + struct { /* leaf only data: */ + caddr_t rn_Key; /* object of search */ + caddr_t rn_Mask; /* netmask, if present */ + struct radix_node *rn_Dupedkey; /* next leaf holding the same key */ + } rn_leaf; + struct { /* node only data: */ + int rn_Off; /* where to start compare: byte offset of the tested bit in the key */ + struct radix_node *rn_L;/* progeny: taken when the tested bit is 0 */ + struct radix_node *rn_R;/* progeny: taken when the tested bit is 1 */ + }rn_node; + } rn_u; +#ifdef RN_DEBUG + int rn_info; + struct radix_node *rn_twin; + struct radix_node *rn_ybro; +#endif +}; +/* Shorthand names for the union members above. */ +#define rn_dupedkey rn_u.rn_leaf.rn_Dupedkey +#define rn_key rn_u.rn_leaf.rn_Key +#define rn_mask rn_u.rn_leaf.rn_Mask +#define rn_off rn_u.rn_node.rn_Off +#define rn_l rn_u.rn_node.rn_L +#define rn_r rn_u.rn_node.rn_R + +/* + * Annotations to tree concerning potential routes applying to subtrees. + */ + +extern struct radix_mask { + short rm_b; /* bit offset; -1-index(netmask) */ + char rm_unused; /* cf. rn_bmask */ + u_char rm_flags; /* cf.
rn_flags */ + struct radix_mask *rm_mklist; /* more masks to try */ + caddr_t rm_mask; /* the mask */ + int rm_refs; /* # of references to this struct beyond the first (0 for a sole user) */ +} *rn_mkfreelist; +/* MKGet(m): take a radix_mask from rn_mkfreelist, or R_Malloc one when the free list is empty; m may be 0 afterwards if that allocation fails. The entry is not zeroed here. */ +#define MKGet(m) {\ + if (rn_mkfreelist) {\ + m = rn_mkfreelist; \ + rn_mkfreelist = (m)->rm_mklist; \ + } else \ + R_Malloc(m, struct radix_mask *, sizeof (*(m))); }\ + +#define MKFree(m) { (m)->rm_mklist = rn_mkfreelist; rn_mkfreelist = (m);} /* push m onto the free list; the memory is kept for reuse */ +/* Per-tree head: the top of the tree, the access methods, and storage for the three nodes of the initially empty tree. rn_inithead fills in rnh_treetop, rnh_addaddr, rnh_deladdr, rnh_matchaddr and rnh_walktree. */ +struct radix_node_head { + struct radix_node *rnh_treetop; + int rnh_addrsize; /* permit, but not require fixed keys */ + int rnh_pktsize; /* permit, but not require fixed keys */ + struct radix_node *(*rnh_addaddr) /* add based on sockaddr */ + __P((void *v, void *mask, + struct radix_node_head *head, struct radix_node nodes[])); + struct radix_node *(*rnh_addpkt) /* add based on packet hdr */ + __P((void *v, void *mask, + struct radix_node_head *head, struct radix_node nodes[])); + struct radix_node *(*rnh_deladdr) /* remove based on sockaddr */ + __P((void *v, void *mask, struct radix_node_head *head)); + struct radix_node *(*rnh_delpkt) /* remove based on packet hdr */ + __P((void *v, void *mask, struct radix_node_head *head)); + struct radix_node *(*rnh_matchaddr) /* locate based on sockaddr */ + __P((void *v, struct radix_node_head *head)); + struct radix_node *(*rnh_matchpkt) /* locate based on packet hdr */ + __P((void *v, struct radix_node_head *head)); + int (*rnh_walktree) /* traverse tree */ + __P((struct radix_node_head *head, int (*f)(), void *w)); + struct radix_node rnh_nodes[3]; /* empty tree for common case */ +}; + +/* Byte-operation and allocation wrappers, one set for user-level builds and one for KERNEL builds. NOTE(review): Bzero and Free expand with their own trailing semicolon, and the user-level set defines no Bcopy. */ +#ifndef KERNEL +#define Bcmp(a, b, n) bcmp(((char *)(a)), ((char *)(b)), (n)) +#define Bzero(p, n) bzero((char *)(p), (int)(n)); +#define R_Malloc(p, t, n) (p = (t) malloc((unsigned int)(n))) +#define Free(p) free((char *)p); +#else +#define Bcmp(a, b, n) bcmp(((caddr_t)(a)), ((caddr_t)(b)), (unsigned)(n)) +#define Bcopy(a, b, n) bcopy(((caddr_t)(a)), ((caddr_t)(b)), (unsigned)(n)) +#define Bzero(p, n)
bzero((caddr_t)(p), (unsigned)(n)); +#define R_Malloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_DONTWAIT)) +#define Free(p) free((caddr_t)p, M_RTABLE); +/* Prototypes for the radix.c routines; they sit in the KERNEL branch of the conditional above, so user-level builds do not see them. */ +void rn_init __P((void)); +int rn_inithead __P((void **, int)); +int rn_refines __P((void *, void *)); +int rn_walktree __P((struct radix_node_head *, int (*)(), void *)); +struct radix_node + *rn_addmask __P((void *, int, int)), + *rn_addroute __P((void *, void *, struct radix_node_head *, + struct radix_node [2])), + *rn_delete __P((void *, void *, struct radix_node_head *)), + *rn_insert __P((void *, struct radix_node_head *, int *, + struct radix_node [2])), + *rn_match __P((void *, struct radix_node_head *)), + *rn_newpair __P((void *, int, struct radix_node[2])), + *rn_search __P((void *, struct radix_node *)), + *rn_search_m __P((void *, struct radix_node *, void *)); + +#endif /*KERNEL*/ +#endif /* _RADIX_H_ */ diff --git a/sys/net/raw_cb.c b/sys/net/raw_cb.c new file mode 100644 index 00000000000..e44192d2e83 --- /dev/null +++ b/sys/net/raw_cb.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 1980, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4.
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)raw_cb.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * Routines to manage the raw protocol control blocks. + * + * TODO: + * hash lookups by protocol family/protocol + address family + * take care of unique address problems per AF? + * redo address binding to allow wildcards + */ + +u_long raw_sendspace = RAWSNDQ; +u_long raw_recvspace = RAWRCVQ; + +/* + * Initialize the raw control block already attached to the socket and + * reserve a nominal amount of buffer space for the socket. Returns ENOBUFS if the socket has no rawcb, the soreserve() error if the reservation fails, and otherwise 0 after recording the socket and protocol in the rawcb and linking it onto the rawcb list. + */ +int +raw_attach(so, proto) + register struct socket *so; + int proto; +{ + register struct rawcb *rp = sotorawcb(so); + int error; + + /* + * It is assumed that raw_attach is called + * after space has been allocated for the + * rawcb.
+ */ + if (rp == 0) + return (ENOBUFS); + if (error = soreserve(so, raw_sendspace, raw_recvspace)) + return (error); + rp->rcb_socket = so; + rp->rcb_proto.sp_family = so->so_proto->pr_domain->dom_family; + rp->rcb_proto.sp_protocol = proto; + insque(rp, &rawcb); + return (0); +} + +/* + * Detach the raw connection block and discard + * socket resources. The socket's so_pcb is cleared, sofree() is called on the socket, and the rawcb is unlinked and freed (M_PCB), so rp must not be used afterwards. + */ +void +raw_detach(rp) + register struct rawcb *rp; +{ + struct socket *so = rp->rcb_socket; + + so->so_pcb = 0; + sofree(so); + remque(rp); +#ifdef notdef + if (rp->rcb_laddr) + m_freem(dtom(rp->rcb_laddr)); + rp->rcb_laddr = 0; +#endif + free((caddr_t)(rp), M_PCB); +} + +/* + * Disconnect and possibly release resources: the rawcb is detached only when the socket has SS_NOFDREF set. + */ +void +raw_disconnect(rp) + struct rawcb *rp; +{ + +#ifdef notdef + if (rp->rcb_faddr) + m_freem(dtom(rp->rcb_faddr)); + rp->rcb_faddr = 0; +#endif + if (rp->rcb_socket->so_state & SS_NOFDREF) + raw_detach(rp); +} + +#ifdef notdef /* raw_bind is compiled only when notdef is defined */ +int +raw_bind(so, nam) + register struct socket *so; + struct mbuf *nam; +{ + struct sockaddr *addr = mtod(nam, struct sockaddr *); + register struct rawcb *rp; + + if (ifnet == 0) + return (EADDRNOTAVAIL); + rp = sotorawcb(so); + nam = m_copym(nam, 0, M_COPYALL, M_WAITOK); + rp->rcb_laddr = mtod(nam, struct sockaddr *); + return (0); +} +#endif diff --git a/sys/net/raw_cb.h b/sys/net/raw_cb.h new file mode 100644 index 00000000000..6003e181edb --- /dev/null +++ b/sys/net/raw_cb.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 1980, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)raw_cb.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Raw protocol interface control block. Used + * to tie a socket to the generic raw interface. 
+ */ +struct rawcb { + struct rawcb *rcb_next; /* doubly linked list */ + struct rawcb *rcb_prev; + struct socket *rcb_socket; /* back pointer to socket */ + struct sockaddr *rcb_faddr; /* destination address */ + struct sockaddr *rcb_laddr; /* socket's address */ + struct sockproto rcb_proto; /* protocol family, protocol */ +}; + +#define sotorawcb(so) ((struct rawcb *)(so)->so_pcb) + +/* + * Nominal space allocated to a raw socket. + */ +#define RAWSNDQ 8192 +#define RAWRCVQ 8192 + +#ifdef KERNEL +struct rawcb rawcb; /* head of list */ + +int raw_attach __P((struct socket *, int)); +void raw_ctlinput __P((int, struct sockaddr *)); +void raw_detach __P((struct rawcb *)); +void raw_disconnect __P((struct rawcb *)); +void raw_init __P((void)); +void raw_input __P((struct mbuf *, + struct sockproto *, struct sockaddr *, struct sockaddr *)); +int raw_usrreq __P((struct socket *, + int, struct mbuf *, struct mbuf *, struct mbuf *)); +#endif diff --git a/sys/net/raw_usrreq.c b/sys/net/raw_usrreq.c new file mode 100644 index 00000000000..560106ef95c --- /dev/null +++ b/sys/net/raw_usrreq.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 1980, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)raw_usrreq.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * Initialize raw connection block q. + */ +void +raw_init() +{ + + rawcb.rcb_next = rawcb.rcb_prev = &rawcb; +} + + +/* + * Raw protocol input routine. Find the socket + * associated with the packet(s) and move them over. If + * nothing exists for this packet, drop it. + */ +/* + * Raw protocol interface. 
+ */ +void +raw_input(m0, proto, src, dst) + struct mbuf *m0; + register struct sockproto *proto; + struct sockaddr *src, *dst; +{ + register struct rawcb *rp; + register struct mbuf *m = m0; + register int sockets = 0; + struct socket *last; + + last = 0; + for (rp = rawcb.rcb_next; rp != &rawcb; rp = rp->rcb_next) { + if (rp->rcb_proto.sp_family != proto->sp_family) + continue; + if (rp->rcb_proto.sp_protocol && + rp->rcb_proto.sp_protocol != proto->sp_protocol) + continue; + /* + * We assume the lower level routines have + * placed the address in a canonical format + * suitable for a structure comparison. + * + * Note that if the lengths are not the same + * the comparison will fail at the first byte. + */ +#define equal(a1, a2) \ + (bcmp((caddr_t)(a1), (caddr_t)(a2), a1->sa_len) == 0) + if (rp->rcb_laddr && !equal(rp->rcb_laddr, dst)) + continue; + if (rp->rcb_faddr && !equal(rp->rcb_faddr, src)) + continue; + if (last) { + struct mbuf *n; + if (n = m_copy(m, 0, (int)M_COPYALL)) { + if (sbappendaddr(&last->so_rcv, src, + n, (struct mbuf *)0) == 0) + /* should notify about lost packet */ + m_freem(n); + else { + sorwakeup(last); + sockets++; + } + } + } + last = rp->rcb_socket; + } + if (last) { + if (sbappendaddr(&last->so_rcv, src, + m, (struct mbuf *)0) == 0) + m_freem(m); + else { + sorwakeup(last); + sockets++; + } + } else + m_freem(m); +} + +/*ARGSUSED*/ +void +raw_ctlinput(cmd, arg) + int cmd; + struct sockaddr *arg; +{ + + if (cmd < 0 || cmd > PRC_NCMDS) + return; + /* INCOMPLETE */ +} + +/*ARGSUSED*/ +int +raw_usrreq(so, req, m, nam, control) + struct socket *so; + int req; + struct mbuf *m, *nam, *control; +{ + register struct rawcb *rp = sotorawcb(so); + register int error = 0; + int len; + + if (req == PRU_CONTROL) + return (EOPNOTSUPP); + if (control && control->m_len) { + error = EOPNOTSUPP; + goto release; + } + if (rp == 0) { + error = EINVAL; + goto release; + } + switch (req) { + + /* + * Allocate a raw control block and fill in the + * 
necessary info to allow packets to be routed to + * the appropriate raw interface routine. + */ + case PRU_ATTACH: + if ((so->so_state & SS_PRIV) == 0) { + error = EACCES; + break; + } + error = raw_attach(so, (int)nam); + break; + + /* + * Destroy state just before socket deallocation. + * Flush data or not depending on the options. + */ + case PRU_DETACH: + if (rp == 0) { + error = ENOTCONN; + break; + } + raw_detach(rp); + break; + +#ifdef notdef + /* + * If a socket isn't bound to a single address, + * the raw input routine will hand it anything + * within that protocol family (assuming there's + * nothing else around it should go to). + */ + case PRU_CONNECT: + if (rp->rcb_faddr) { + error = EISCONN; + break; + } + nam = m_copym(nam, 0, M_COPYALL, M_WAIT); + rp->rcb_faddr = mtod(nam, struct sockaddr *); + soisconnected(so); + break; + + case PRU_BIND: + if (rp->rcb_laddr) { + error = EINVAL; /* XXX */ + break; + } + error = raw_bind(so, nam); + break; +#endif + + case PRU_CONNECT2: + error = EOPNOTSUPP; + goto release; + + case PRU_DISCONNECT: + if (rp->rcb_faddr == 0) { + error = ENOTCONN; + break; + } + raw_disconnect(rp); + soisdisconnected(so); + break; + + /* + * Mark the connection as being incapable of further input. + */ + case PRU_SHUTDOWN: + socantsendmore(so); + break; + + /* + * Ship a packet out. The appropriate raw output + * routine handles any massaging necessary. + */ + case PRU_SEND: + if (nam) { + if (rp->rcb_faddr) { + error = EISCONN; + break; + } + rp->rcb_faddr = mtod(nam, struct sockaddr *); + } else if (rp->rcb_faddr == 0) { + error = ENOTCONN; + break; + } + error = (*so->so_proto->pr_output)(m, so); + m = NULL; + if (nam) + rp->rcb_faddr = 0; + break; + + case PRU_ABORT: + raw_disconnect(rp); + sofree(so); + soisdisconnected(so); + break; + + case PRU_SENSE: + /* + * stat: don't bother with a blocksize. + */ + return (0); + + /* + * Not supported. 
+ */ + case PRU_RCVOOB: + case PRU_RCVD: + return(EOPNOTSUPP); + + case PRU_LISTEN: + case PRU_ACCEPT: + case PRU_SENDOOB: + error = EOPNOTSUPP; + break; + + case PRU_SOCKADDR: + if (rp->rcb_laddr == 0) { + error = EINVAL; + break; + } + len = rp->rcb_laddr->sa_len; + bcopy((caddr_t)rp->rcb_laddr, mtod(nam, caddr_t), (unsigned)len); + nam->m_len = len; + break; + + case PRU_PEERADDR: + if (rp->rcb_faddr == 0) { + error = ENOTCONN; + break; + } + len = rp->rcb_faddr->sa_len; + bcopy((caddr_t)rp->rcb_faddr, mtod(nam, caddr_t), (unsigned)len); + nam->m_len = len; + break; + + default: + panic("raw_usrreq"); + } +release: + if (m != NULL) + m_freem(m); + return (error); +} diff --git a/sys/net/route.c b/sys/net/route.c new file mode 100644 index 00000000000..96902dace19 --- /dev/null +++ b/sys/net/route.c @@ -0,0 +1,538 @@ +/* + * Copyright (c) 1980, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)route.c 8.2 (Berkeley) 11/15/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#ifdef NS +#include +#endif + +#define SA(p) ((struct sockaddr *)(p)) + +int rttrash; /* routes not in table but not freed */ +struct sockaddr wildcard; /* zero valued cookie for wildcard searches */ + +void +rtable_init(table) + void **table; +{ + struct domain *dom; + for (dom = domains; dom; dom = dom->dom_next) + if (dom->dom_rtattach) + dom->dom_rtattach(&table[dom->dom_family], + dom->dom_rtoffset); +} + +void +route_init() +{ + rn_init(); /* initialize all zeroes, all ones, mask table */ + rtable_init((void **)rt_tables); +} + +/* + * Packet routing routines. 
+ */ +void +rtalloc(ro) + register struct route *ro; +{ + if (ro->ro_rt && ro->ro_rt->rt_ifp && (ro->ro_rt->rt_flags & RTF_UP)) + return; /* XXX */ + ro->ro_rt = rtalloc1(&ro->ro_dst, 1); +} + +struct rtentry * +rtalloc1(dst, report) + register struct sockaddr *dst; + int report; +{ + register struct radix_node_head *rnh = rt_tables[dst->sa_family]; + register struct rtentry *rt; + register struct radix_node *rn; + struct rtentry *newrt = 0; + struct rt_addrinfo info; + int s = splnet(), err = 0, msgtype = RTM_MISS; + + if (rnh && (rn = rnh->rnh_matchaddr((caddr_t)dst, rnh)) && + ((rn->rn_flags & RNF_ROOT) == 0)) { + newrt = rt = (struct rtentry *)rn; + if (report && (rt->rt_flags & RTF_CLONING)) { + err = rtrequest(RTM_RESOLVE, dst, SA(0), + SA(0), 0, &newrt); + if (err) { + newrt = rt; + rt->rt_refcnt++; + goto miss; + } + if ((rt = newrt) && (rt->rt_flags & RTF_XRESOLVE)) { + msgtype = RTM_RESOLVE; + goto miss; + } + } else + rt->rt_refcnt++; + } else { + rtstat.rts_unreach++; + miss: if (report) { + bzero((caddr_t)&info, sizeof(info)); + info.rti_info[RTAX_DST] = dst; + rt_missmsg(msgtype, &info, 0, err); + } + } + splx(s); + return (newrt); +} + +void +rtfree(rt) + register struct rtentry *rt; +{ + register struct ifaddr *ifa; + + if (rt == 0) + panic("rtfree"); + rt->rt_refcnt--; + if (rt->rt_refcnt <= 0 && (rt->rt_flags & RTF_UP) == 0) { + if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)) + panic ("rtfree 2"); + rttrash--; + if (rt->rt_refcnt < 0) { + printf("rtfree: %x not freed (neg refs)\n", rt); + return; + } + ifa = rt->rt_ifa; + IFAFREE(ifa); + Free(rt_key(rt)); + Free(rt); + } +} + +void +ifafree(ifa) + register struct ifaddr *ifa; +{ + if (ifa == NULL) + panic("ifafree"); + if (ifa->ifa_refcnt == 0) + free(ifa, M_IFADDR); + else + ifa->ifa_refcnt--; +} + +/* + * Force a routing table entry to the specified + * destination to go through the given gateway. + * Normally called as a result of a routing redirect + * message from the network layer. 
+ * + * N.B.: must be called at splnet + * + */ +int +rtredirect(dst, gateway, netmask, flags, src, rtp) + struct sockaddr *dst, *gateway, *netmask, *src; + int flags; + struct rtentry **rtp; +{ + register struct rtentry *rt; + int error = 0; + short *stat = 0; + struct rt_addrinfo info; + struct ifaddr *ifa; + + /* verify the gateway is directly reachable */ + if ((ifa = ifa_ifwithnet(gateway)) == 0) { + error = ENETUNREACH; + goto out; + } + rt = rtalloc1(dst, 0); + /* + * If the redirect isn't from our current router for this dst, + * it's either old or wrong. If it redirects us to ourselves, + * we have a routing loop, perhaps as a result of an interface + * going down recently. + */ +#define equal(a1, a2) (bcmp((caddr_t)(a1), (caddr_t)(a2), (a1)->sa_len) == 0) + if (!(flags & RTF_DONE) && rt && + (!equal(src, rt->rt_gateway) || rt->rt_ifa != ifa)) + error = EINVAL; + else if (ifa_ifwithaddr(gateway)) + error = EHOSTUNREACH; + if (error) + goto done; + /* + * Create a new entry if we just got back a wildcard entry + * or the lookup failed. This is necessary for hosts + * which use routing redirects generated by smart gateways + * to dynamically build the routing tables. + */ + if ((rt == 0) || (rt_mask(rt) && rt_mask(rt)->sa_len < 2)) + goto create; + /* + * Don't listen to the redirect if it's + * for a route to an interface. + */ + if (rt->rt_flags & RTF_GATEWAY) { + if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) { + /* + * Changing from route to net => route to host. + * Create new route, rather than smashing route to net. + */ + create: + flags |= RTF_GATEWAY | RTF_DYNAMIC; + error = rtrequest((int)RTM_ADD, dst, gateway, + netmask, flags, + (struct rtentry **)0); + stat = &rtstat.rts_dynamic; + } else { + /* + * Smash the current notion of the gateway to + * this destination. Should check about netmask!!! 
+ */ + rt->rt_flags |= RTF_MODIFIED; + flags |= RTF_MODIFIED; + stat = &rtstat.rts_newgateway; + rt_setgate(rt, rt_key(rt), gateway); + } + } else + error = EHOSTUNREACH; +done: + if (rt) { + if (rtp && !error) + *rtp = rt; + else + rtfree(rt); + } +out: + if (error) + rtstat.rts_badredirect++; + else if (stat != NULL) + (*stat)++; + bzero((caddr_t)&info, sizeof(info)); + info.rti_info[RTAX_DST] = dst; + info.rti_info[RTAX_GATEWAY] = gateway; + info.rti_info[RTAX_NETMASK] = netmask; + info.rti_info[RTAX_AUTHOR] = src; + rt_missmsg(RTM_REDIRECT, &info, flags, error); +} + +/* +* Routing table ioctl interface. +*/ +int +rtioctl(req, data, p) + int req; + caddr_t data; + struct proc *p; +{ + return (EOPNOTSUPP); +} + +struct ifaddr * +ifa_ifwithroute(flags, dst, gateway) + int flags; + struct sockaddr *dst, *gateway; +{ + register struct ifaddr *ifa; + if ((flags & RTF_GATEWAY) == 0) { + /* + * If we are adding a route to an interface, + * and the interface is a pt to pt link + * we should search for the destination + * as our clue to the interface. Otherwise + * we can use the local address. + */ + ifa = 0; + if (flags & RTF_HOST) + ifa = ifa_ifwithdstaddr(dst); + if (ifa == 0) + ifa = ifa_ifwithaddr(gateway); + } else { + /* + * If we are adding a route to a remote net + * or host, the gateway may still be on the + * other end of a pt to pt link. + */ + ifa = ifa_ifwithdstaddr(gateway); + } + if (ifa == 0) + ifa = ifa_ifwithnet(gateway); + if (ifa == 0) { + struct rtentry *rt = rtalloc1(dst, 0); + if (rt == 0) + return (0); + rt->rt_refcnt--; + if ((ifa = rt->rt_ifa) == 0) + return (0); + } + if (ifa->ifa_addr->sa_family != dst->sa_family) { + struct ifaddr *oifa = ifa; + ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); + if (ifa == 0) + ifa = oifa; + } + return (ifa); +} + +#define ROUNDUP(a) (a>0 ? 
(1 + (((a) - 1) | (sizeof(long) - 1))) : sizeof(long)) + +int +rtrequest(req, dst, gateway, netmask, flags, ret_nrt) + int req, flags; + struct sockaddr *dst, *gateway, *netmask; + struct rtentry **ret_nrt; +{ + int s = splnet(); int error = 0; + register struct rtentry *rt; + register struct radix_node *rn; + register struct radix_node_head *rnh; + struct ifaddr *ifa; + struct sockaddr *ndst; +#define senderr(x) { error = x ; goto bad; } + + if ((rnh = rt_tables[dst->sa_family]) == 0) + senderr(ESRCH); + if (flags & RTF_HOST) + netmask = 0; + switch (req) { + case RTM_DELETE: + if ((rn = rnh->rnh_deladdr(dst, netmask, rnh)) == 0) + senderr(ESRCH); + if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) + panic ("rtrequest delete"); + rt = (struct rtentry *)rn; + rt->rt_flags &= ~RTF_UP; + if (rt->rt_gwroute) { + rt = rt->rt_gwroute; RTFREE(rt); + (rt = (struct rtentry *)rn)->rt_gwroute = 0; + } + if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) + ifa->ifa_rtrequest(RTM_DELETE, rt, SA(0)); + rttrash++; + if (ret_nrt) + *ret_nrt = rt; + else if (rt->rt_refcnt <= 0) { + rt->rt_refcnt++; + rtfree(rt); + } + break; + + case RTM_RESOLVE: + if (ret_nrt == 0 || (rt = *ret_nrt) == 0) + senderr(EINVAL); + ifa = rt->rt_ifa; + flags = rt->rt_flags & ~RTF_CLONING; + gateway = rt->rt_gateway; + if ((netmask = rt->rt_genmask) == 0) + flags |= RTF_HOST; + goto makeroute; + + case RTM_ADD: + if ((ifa = ifa_ifwithroute(flags, dst, gateway)) == 0) + senderr(ENETUNREACH); + makeroute: + R_Malloc(rt, struct rtentry *, sizeof(*rt)); + if (rt == 0) + senderr(ENOBUFS); + Bzero(rt, sizeof(*rt)); + rt->rt_flags = RTF_UP | flags; + if (rt_setgate(rt, dst, gateway)) { + Free(rt); + senderr(ENOBUFS); + } + ndst = rt_key(rt); + if (netmask) { + rt_maskedcopy(dst, ndst, netmask); + } else + Bcopy(dst, ndst, dst->sa_len); + rn = rnh->rnh_addaddr((caddr_t)ndst, (caddr_t)netmask, + rnh, rt->rt_nodes); + if (rn == 0) { + if (rt->rt_gwroute) + rtfree(rt->rt_gwroute); + Free(rt_key(rt)); + Free(rt); + 
senderr(EEXIST); + } + ifa->ifa_refcnt++; + rt->rt_ifa = ifa; + rt->rt_ifp = ifa->ifa_ifp; + if (req == RTM_RESOLVE) + rt->rt_rmx = (*ret_nrt)->rt_rmx; /* copy metrics */ + if (ifa->ifa_rtrequest) + ifa->ifa_rtrequest(req, rt, SA(ret_nrt ? *ret_nrt : 0)); + if (ret_nrt) { + *ret_nrt = rt; + rt->rt_refcnt++; + } + break; + } +bad: + splx(s); + return (error); +} + +int +rt_setgate(rt0, dst, gate) + struct rtentry *rt0; + struct sockaddr *dst, *gate; +{ + caddr_t new, old; + int dlen = ROUNDUP(dst->sa_len), glen = ROUNDUP(gate->sa_len); + register struct rtentry *rt = rt0; + + if (rt->rt_gateway == 0 || glen > ROUNDUP(rt->rt_gateway->sa_len)) { + old = (caddr_t)rt_key(rt); + R_Malloc(new, caddr_t, dlen + glen); + if (new == 0) + return 1; + rt->rt_nodes->rn_key = new; + } else { + new = rt->rt_nodes->rn_key; + old = 0; + } + Bcopy(gate, (rt->rt_gateway = (struct sockaddr *)(new + dlen)), glen); + if (old) { + Bcopy(dst, new, dlen); + Free(old); + } + if (rt->rt_gwroute) { + rt = rt->rt_gwroute; RTFREE(rt); + rt = rt0; rt->rt_gwroute = 0; + } + if (rt->rt_flags & RTF_GATEWAY) { + rt->rt_gwroute = rtalloc1(gate, 1); + } + return 0; +} + +void +rt_maskedcopy(src, dst, netmask) + struct sockaddr *src, *dst, *netmask; +{ + register u_char *cp1 = (u_char *)src; + register u_char *cp2 = (u_char *)dst; + register u_char *cp3 = (u_char *)netmask; + u_char *cplim = cp2 + *cp3; + u_char *cplim2 = cp2 + *cp1; + + *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */ + cp3 += 2; + if (cplim > cplim2) + cplim = cplim2; + while (cp2 < cplim) + *cp2++ = *cp1++ & *cp3++; + if (cp2 < cplim2) + bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2)); +} + +/* + * Set up a routing table entry, normally + * for an interface. 
+ */ +int +rtinit(ifa, cmd, flags) + register struct ifaddr *ifa; + int cmd, flags; +{ + register struct rtentry *rt; + register struct sockaddr *dst; + register struct sockaddr *deldst; + struct mbuf *m = 0; + struct rtentry *nrt = 0; + int error; + + dst = flags & RTF_HOST ? ifa->ifa_dstaddr : ifa->ifa_addr; + if (cmd == RTM_DELETE) { + if ((flags & RTF_HOST) == 0 && ifa->ifa_netmask) { + m = m_get(M_WAIT, MT_SONAME); + deldst = mtod(m, struct sockaddr *); + rt_maskedcopy(dst, deldst, ifa->ifa_netmask); + dst = deldst; + } + if (rt = rtalloc1(dst, 0)) { + rt->rt_refcnt--; + if (rt->rt_ifa != ifa) { + if (m) + (void) m_free(m); + return (flags & RTF_HOST ? EHOSTUNREACH + : ENETUNREACH); + } + } + } + error = rtrequest(cmd, dst, ifa->ifa_addr, ifa->ifa_netmask, + flags | ifa->ifa_flags, &nrt); + if (m) + (void) m_free(m); + if (cmd == RTM_DELETE && error == 0 && (rt = nrt)) { + rt_newaddrmsg(cmd, ifa, error, nrt); + if (rt->rt_refcnt <= 0) { + rt->rt_refcnt++; + rtfree(rt); + } + } + if (cmd == RTM_ADD && error == 0 && (rt = nrt)) { + rt->rt_refcnt--; + if (rt->rt_ifa != ifa) { + printf("rtinit: wrong ifa (%x) was (%x)\n", ifa, + rt->rt_ifa); + if (rt->rt_ifa->ifa_rtrequest) + rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, SA(0)); + IFAFREE(rt->rt_ifa); + rt->rt_ifa = ifa; + rt->rt_ifp = ifa->ifa_ifp; + ifa->ifa_refcnt++; + if (ifa->ifa_rtrequest) + ifa->ifa_rtrequest(RTM_ADD, rt, SA(0)); + } + rt_newaddrmsg(cmd, ifa, error, nrt); + } + return (error); +} diff --git a/sys/net/route.h b/sys/net/route.h new file mode 100644 index 00000000000..2fbed9ea0a1 --- /dev/null +++ b/sys/net/route.h @@ -0,0 +1,261 @@ +/* + * Copyright (c) 1980, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)route.h 8.3 (Berkeley) 4/19/94 + */ + +/* + * Kernel resident routing tables. + * + * The routing tables are initialized when interface addresses + * are set by making entries for all directly connected interfaces. + */ + +/* + * A route consists of a destination address and a reference + * to a routing entry. These are often held by protocols + * in their control blocks, e.g. inpcb. 
+ */ +struct route { + struct rtentry *ro_rt; + struct sockaddr ro_dst; +}; + +/* + * These numbers are used by reliable protocols for determining + * retransmission behavior and are included in the routing structure. + */ +struct rt_metrics { + u_long rmx_locks; /* Kernel must leave these values alone */ + u_long rmx_mtu; /* MTU for this path */ + u_long rmx_hopcount; /* max hops expected */ + u_long rmx_expire; /* lifetime for route, e.g. redirect */ + u_long rmx_recvpipe; /* inbound delay-bandwidth product */ + u_long rmx_sendpipe; /* outbound delay-bandwidth product */ + u_long rmx_ssthresh; /* outbound gateway buffer limit */ + u_long rmx_rtt; /* estimated round trip time */ + u_long rmx_rttvar; /* estimated rtt variance */ + u_long rmx_pksent; /* packets sent using this route */ +}; + +/* + * rmx_rtt and rmx_rttvar are stored as microseconds; + * RTTTOPRHZ(rtt) converts to a value suitable for use + * by a protocol slowtimo counter. + */ +#define RTM_RTTUNIT 1000000 /* units for rtt, rttvar, as units per sec */ +#define RTTTOPRHZ(r) ((r) / (RTM_RTTUNIT / PR_SLOWHZ)) + +/* + * We distinguish between routes to hosts and routes to networks, + * preferring the former if available. For each route we infer + * the interface to use from the gateway address supplied when + * the route was entered. Routes that forward packets through + * gateways are marked so that the output routines know to address the + * gateway rather than the ultimate destination. 
+ */ +#ifndef RNF_NORMAL +#include +#endif +struct rtentry { + struct radix_node rt_nodes[2]; /* tree glue, and other values */ +#define rt_key(r) ((struct sockaddr *)((r)->rt_nodes->rn_key)) +#define rt_mask(r) ((struct sockaddr *)((r)->rt_nodes->rn_mask)) + struct sockaddr *rt_gateway; /* value */ + short rt_flags; /* up/down?, host/net */ + short rt_refcnt; /* # held references */ + u_long rt_use; /* raw # packets forwarded */ + struct ifnet *rt_ifp; /* the answer: interface to use */ + struct ifaddr *rt_ifa; /* the answer: interface address to use */ + struct sockaddr *rt_genmask; /* for generation of cloned routes */ + caddr_t rt_llinfo; /* pointer to link level info cache */ + struct rt_metrics rt_rmx; /* metrics used by rx'ing protocols */ + struct rtentry *rt_gwroute; /* implied entry for gatewayed routes */ +}; + +/* + * Following structure necessary for 4.3 compatibility; + * We should eventually move it to a compat file. + */ +struct ortentry { + u_long rt_hash; /* to speed lookups */ + struct sockaddr rt_dst; /* key */ + struct sockaddr rt_gateway; /* value */ + short rt_flags; /* up/down?, host/net */ + short rt_refcnt; /* # held references */ + u_long rt_use; /* raw # packets forwarded */ + struct ifnet *rt_ifp; /* the answer: interface to use */ +}; + +#define RTF_UP 0x1 /* route usable */ +#define RTF_GATEWAY 0x2 /* destination is a gateway */ +#define RTF_HOST 0x4 /* host entry (net otherwise) */ +#define RTF_REJECT 0x8 /* host or net unreachable */ +#define RTF_DYNAMIC 0x10 /* created dynamically (by redirect) */ +#define RTF_MODIFIED 0x20 /* modified dynamically (by redirect) */ +#define RTF_DONE 0x40 /* message confirmed */ +#define RTF_MASK 0x80 /* subnet mask present */ +#define RTF_CLONING 0x100 /* generate new routes on use */ +#define RTF_XRESOLVE 0x200 /* external daemon resolves name */ +#define RTF_LLINFO 0x400 /* generated by ARP or ESIS */ +#define RTF_STATIC 0x800 /* manually added */ +#define RTF_BLACKHOLE 0x1000 /* just discard pkts (during 
updates) */ +#define RTF_PROTO2 0x4000 /* protocol specific routing flag */ +#define RTF_PROTO1 0x8000 /* protocol specific routing flag */ + + +/* + * Routing statistics. + */ +struct rtstat { + short rts_badredirect; /* bogus redirect calls */ + short rts_dynamic; /* routes created by redirects */ + short rts_newgateway; /* routes modified by redirects */ + short rts_unreach; /* lookups which failed */ + short rts_wildcard; /* lookups satisfied by a wildcard */ +}; +/* + * Structures for routing messages. + */ +struct rt_msghdr { + u_short rtm_msglen; /* to skip over non-understood messages */ + u_char rtm_version; /* future binary compatibility */ + u_char rtm_type; /* message type */ + u_short rtm_index; /* index for associated ifp */ + int rtm_flags; /* flags, incl. kern & message, e.g. DONE */ + int rtm_addrs; /* bitmask identifying sockaddrs in msg */ + pid_t rtm_pid; /* identify sender */ + int rtm_seq; /* for sender to identify action */ + int rtm_errno; /* why failed */ + int rtm_use; /* from rtentry */ + u_long rtm_inits; /* which metrics we are initializing */ + struct rt_metrics rtm_rmx; /* metrics themselves */ +}; + +#define RTM_VERSION 3 /* Up the ante and ignore older versions */ + +#define RTM_ADD 0x1 /* Add Route */ +#define RTM_DELETE 0x2 /* Delete Route */ +#define RTM_CHANGE 0x3 /* Change Metrics or flags */ +#define RTM_GET 0x4 /* Report Metrics */ +#define RTM_LOSING 0x5 /* Kernel Suspects Partitioning */ +#define RTM_REDIRECT 0x6 /* Told to use different route */ +#define RTM_MISS 0x7 /* Lookup failed on this address */ +#define RTM_LOCK 0x8 /* fix specified metrics */ +#define RTM_OLDADD 0x9 /* caused by SIOCADDRT */ +#define RTM_OLDDEL 0xa /* caused by SIOCDELRT */ +#define RTM_RESOLVE 0xb /* req to resolve dst to LL addr */ +#define RTM_NEWADDR 0xc /* address being added to iface */ +#define RTM_DELADDR 0xd /* address being removed from iface */ +#define RTM_IFINFO 0xe /* iface going up/down etc. 
*/ + +#define RTV_MTU 0x1 /* init or lock _mtu */ +#define RTV_HOPCOUNT 0x2 /* init or lock _hopcount */ +#define RTV_EXPIRE 0x4 /* init or lock _expire */ +#define RTV_RPIPE 0x8 /* init or lock _recvpipe */ +#define RTV_SPIPE 0x10 /* init or lock _sendpipe */ +#define RTV_SSTHRESH 0x20 /* init or lock _ssthresh */ +#define RTV_RTT 0x40 /* init or lock _rtt */ +#define RTV_RTTVAR 0x80 /* init or lock _rttvar */ + +/* + * Bitmask values for rtm_addrs. + */ +#define RTA_DST 0x1 /* destination sockaddr present */ +#define RTA_GATEWAY 0x2 /* gateway sockaddr present */ +#define RTA_NETMASK 0x4 /* netmask sockaddr present */ +#define RTA_GENMASK 0x8 /* cloning mask sockaddr present */ +#define RTA_IFP 0x10 /* interface name sockaddr present */ +#define RTA_IFA 0x20 /* interface addr sockaddr present */ +#define RTA_AUTHOR 0x40 /* sockaddr for author of redirect */ +#define RTA_BRD 0x80 /* for NEWADDR, broadcast or p-p dest addr */ + +/* + * Index offsets for sockaddr array for alternate internal encoding. 
+ */ +#define RTAX_DST 0 /* destination sockaddr present */ +#define RTAX_GATEWAY 1 /* gateway sockaddr present */ +#define RTAX_NETMASK 2 /* netmask sockaddr present */ +#define RTAX_GENMASK 3 /* cloning mask sockaddr present */ +#define RTAX_IFP 4 /* interface name sockaddr present */ +#define RTAX_IFA 5 /* interface addr sockaddr present */ +#define RTAX_AUTHOR 6 /* sockaddr for author of redirect */ +#define RTAX_BRD 7 /* for NEWADDR, broadcast or p-p dest addr */ +#define RTAX_MAX 8 /* size of array to allocate */ + +struct rt_addrinfo { + int rti_addrs; + struct sockaddr *rti_info[RTAX_MAX]; +}; + +struct route_cb { + int ip_count; + int ns_count; + int iso_count; + int any_count; +}; + +#ifdef KERNEL +#define RTFREE(rt) \ + if ((rt)->rt_refcnt <= 1) \ + rtfree(rt); \ + else \ + (rt)->rt_refcnt--; + +struct route_cb route_cb; +struct rtstat rtstat; +struct radix_node_head *rt_tables[AF_MAX+1]; + +void route_init __P((void)); +int route_output __P((struct mbuf *, struct socket *)); +int route_usrreq __P((struct socket *, + int, struct mbuf *, struct mbuf *, struct mbuf *)); +void rt_ifmsg __P((struct ifnet *)); +void rt_maskedcopy __P((struct sockaddr *, + struct sockaddr *, struct sockaddr *)); +void rt_missmsg __P((int, struct rt_addrinfo *, int, int)); +void rt_newaddrmsg __P((int, struct ifaddr *, int, struct rtentry *)); +int rt_setgate __P((struct rtentry *, + struct sockaddr *, struct sockaddr *)); +void rt_setmetrics __P((u_long, struct rt_metrics *, struct rt_metrics *)); +void rtable_init __P((void **)); +void rtalloc __P((struct route *)); +struct rtentry * + rtalloc1 __P((struct sockaddr *, int)); +void rtfree __P((struct rtentry *)); +int rtinit __P((struct ifaddr *, int, int)); +int rtioctl __P((int, caddr_t, struct proc *)); +int rtredirect __P((struct sockaddr *, struct sockaddr *, + struct sockaddr *, int, struct sockaddr *, struct rtentry **)); +int rtrequest __P((int, struct sockaddr *, + struct sockaddr *, struct sockaddr *, int, struct 
rtentry **)); +#endif diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c new file mode 100644 index 00000000000..d128121708d --- /dev/null +++ b/sys/net/rtsock.c @@ -0,0 +1,833 @@ +/* + * Copyright (c) 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)rtsock.c 8.3 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +struct sockaddr route_dst = { 2, PF_ROUTE, }; +struct sockaddr route_src = { 2, PF_ROUTE, }; +struct sockproto route_proto = { PF_ROUTE, }; + +struct walkarg { + int w_op, w_arg, w_given, w_needed, w_tmemsize; + caddr_t w_where, w_tmem; +}; + +static struct mbuf * + rt_msg1 __P((int, struct rt_addrinfo *)); +static int rt_msg2 __P((int, + struct rt_addrinfo *, caddr_t, struct walkarg *)); +static void rt_xaddrs __P((caddr_t, caddr_t, struct rt_addrinfo *)); + +/* Sleazy use of local variables throughout file, warning!!!! 
*/ +#define dst info.rti_info[RTAX_DST] +#define gate info.rti_info[RTAX_GATEWAY] +#define netmask info.rti_info[RTAX_NETMASK] +#define genmask info.rti_info[RTAX_GENMASK] +#define ifpaddr info.rti_info[RTAX_IFP] +#define ifaaddr info.rti_info[RTAX_IFA] +#define brdaddr info.rti_info[RTAX_BRD] + +/*ARGSUSED*/ +int +route_usrreq(so, req, m, nam, control) + register struct socket *so; + int req; + struct mbuf *m, *nam, *control; +{ + register int error = 0; + register struct rawcb *rp = sotorawcb(so); + int s; + + if (req == PRU_ATTACH) { + MALLOC(rp, struct rawcb *, sizeof(*rp), M_PCB, M_WAITOK); + if (so->so_pcb = (caddr_t)rp) + bzero(so->so_pcb, sizeof(*rp)); + + } + if (req == PRU_DETACH && rp) { + int af = rp->rcb_proto.sp_protocol; + if (af == AF_INET) + route_cb.ip_count--; + else if (af == AF_NS) + route_cb.ns_count--; + else if (af == AF_ISO) + route_cb.iso_count--; + route_cb.any_count--; + } + s = splnet(); + error = raw_usrreq(so, req, m, nam, control); + rp = sotorawcb(so); + if (req == PRU_ATTACH && rp) { + int af = rp->rcb_proto.sp_protocol; + if (error) { + free((caddr_t)rp, M_PCB); + splx(s); + return (error); + } + if (af == AF_INET) + route_cb.ip_count++; + else if (af == AF_NS) + route_cb.ns_count++; + else if (af == AF_ISO) + route_cb.iso_count++; + rp->rcb_faddr = &route_src; + route_cb.any_count++; + soisconnected(so); + so->so_options |= SO_USELOOPBACK; + } + splx(s); + return (error); +} + +/*ARGSUSED*/ +int +route_output(m, so) + register struct mbuf *m; + struct socket *so; +{ + register struct rt_msghdr *rtm = 0; + register struct rtentry *rt = 0; + struct rtentry *saved_nrt = 0; + struct rt_addrinfo info; + int len, error = 0; + struct ifnet *ifp = 0; + struct ifaddr *ifa = 0; + +#define senderr(e) { error = e; goto flush;} + if (m == 0 || ((m->m_len < sizeof(long)) && + (m = m_pullup(m, sizeof(long))) == 0)) + return (ENOBUFS); + if ((m->m_flags & M_PKTHDR) == 0) + panic("route_output"); + len = m->m_pkthdr.len; + if (len < sizeof(*rtm) 
|| + len != mtod(m, struct rt_msghdr *)->rtm_msglen) { + dst = 0; + senderr(EINVAL); + } + R_Malloc(rtm, struct rt_msghdr *, len); + if (rtm == 0) { + dst = 0; + senderr(ENOBUFS); + } + m_copydata(m, 0, len, (caddr_t)rtm); + if (rtm->rtm_version != RTM_VERSION) { + dst = 0; + senderr(EPROTONOSUPPORT); + } + rtm->rtm_pid = curproc->p_pid; + info.rti_addrs = rtm->rtm_addrs; + rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info); + if (dst == 0) + senderr(EINVAL); + if (genmask) { + struct radix_node *t; + t = rn_addmask((caddr_t)genmask, 1, 2); + if (t && Bcmp(genmask, t->rn_key, *(u_char *)genmask) == 0) + genmask = (struct sockaddr *)(t->rn_key); + else + senderr(ENOBUFS); + } + switch (rtm->rtm_type) { + + case RTM_ADD: + if (gate == 0) + senderr(EINVAL); + error = rtrequest(RTM_ADD, dst, gate, netmask, + rtm->rtm_flags, &saved_nrt); + if (error == 0 && saved_nrt) { + rt_setmetrics(rtm->rtm_inits, + &rtm->rtm_rmx, &saved_nrt->rt_rmx); + saved_nrt->rt_refcnt--; + saved_nrt->rt_genmask = genmask; + } + break; + + case RTM_DELETE: + error = rtrequest(RTM_DELETE, dst, gate, netmask, + rtm->rtm_flags, (struct rtentry **)0); + break; + + case RTM_GET: + case RTM_CHANGE: + case RTM_LOCK: + rt = rtalloc1(dst, 0); + if (rt == 0) + senderr(ESRCH); + if (rtm->rtm_type != RTM_GET) {/* XXX: too grotty */ + struct radix_node *rn; + extern struct radix_node_head *mask_rnhead; + + if (Bcmp(dst, rt_key(rt), dst->sa_len) != 0) + senderr(ESRCH); + if (netmask && (rn = rn_search(netmask, + mask_rnhead->rnh_treetop))) + netmask = (struct sockaddr *)rn->rn_key; + for (rn = rt->rt_nodes; rn; rn = rn->rn_dupedkey) + if (netmask == (struct sockaddr *)rn->rn_mask) + break; + if (rn == 0) + senderr(ETOOMANYREFS); + rt = (struct rtentry *)rn; + } + switch(rtm->rtm_type) { + + case RTM_GET: + dst = rt_key(rt); + gate = rt->rt_gateway; + netmask = rt_mask(rt); + genmask = rt->rt_genmask; + if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { + if (ifp = rt->rt_ifp) { + ifpaddr = 
ifp->if_addrlist->ifa_addr; + ifaaddr = rt->rt_ifa->ifa_addr; + rtm->rtm_index = ifp->if_index; + } else { + ifpaddr = 0; + ifaaddr = 0; + } + } + len = rt_msg2(RTM_GET, &info, (caddr_t)0, + (struct walkarg *)0); + if (len > rtm->rtm_msglen) { + struct rt_msghdr *new_rtm; + R_Malloc(new_rtm, struct rt_msghdr *, len); + if (new_rtm == 0) + senderr(ENOBUFS); + Bcopy(rtm, new_rtm, rtm->rtm_msglen); + Free(rtm); rtm = new_rtm; + } + (void)rt_msg2(RTM_GET, &info, (caddr_t)rtm, + (struct walkarg *)0); + rtm->rtm_flags = rt->rt_flags; + rtm->rtm_rmx = rt->rt_rmx; + rtm->rtm_addrs = info.rti_addrs; + break; + + case RTM_CHANGE: + if (gate && rt_setgate(rt, rt_key(rt), gate)) + senderr(EDQUOT); + /* new gateway could require new ifaddr, ifp; + flags may also be different; ifp may be specified + by ll sockaddr when protocol address is ambiguous */ + if (ifpaddr && (ifa = ifa_ifwithnet(ifpaddr)) && + (ifp = ifa->ifa_ifp)) + ifa = ifaof_ifpforaddr(ifaaddr ? ifaaddr : gate, + ifp); + else if ((ifaaddr && (ifa = ifa_ifwithaddr(ifaaddr))) || + (ifa = ifa_ifwithroute(rt->rt_flags, + rt_key(rt), gate))) + ifp = ifa->ifa_ifp; + if (ifa) { + register struct ifaddr *oifa = rt->rt_ifa; + if (oifa != ifa) { + if (oifa && oifa->ifa_rtrequest) + oifa->ifa_rtrequest(RTM_DELETE, + rt, gate); + IFAFREE(rt->rt_ifa); + rt->rt_ifa = ifa; + ifa->ifa_refcnt++; + rt->rt_ifp = ifp; + } + } + rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx, + &rt->rt_rmx); + if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest) + rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, gate); + if (genmask) + rt->rt_genmask = genmask; + /* + * Fall into + */ + case RTM_LOCK: + rt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits); + rt->rt_rmx.rmx_locks |= + (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks); + break; + } + break; + + default: + senderr(EOPNOTSUPP); + } + +flush: + if (rtm) { + if (error) + rtm->rtm_errno = error; + else + rtm->rtm_flags |= RTF_DONE; + } + if (rt) + rtfree(rt); + { + register struct rawcb *rp = 0; + /* + * Check to see if we don't 
want our own messages. + */ + if ((so->so_options & SO_USELOOPBACK) == 0) { + if (route_cb.any_count <= 1) { + if (rtm) + Free(rtm); + m_freem(m); + return (error); + } + /* There is another listener, so construct message */ + rp = sotorawcb(so); + } + if (rtm) { + m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm); + Free(rtm); + } + if (rp) + rp->rcb_proto.sp_family = 0; /* Avoid us */ + if (dst) + route_proto.sp_protocol = dst->sa_family; + raw_input(m, &route_proto, &route_src, &route_dst); + if (rp) + rp->rcb_proto.sp_family = PF_ROUTE; + } + return (error); +} + +void +rt_setmetrics(which, in, out) + u_long which; + register struct rt_metrics *in, *out; +{ +#define metric(f, e) if (which & (f)) out->e = in->e; + metric(RTV_RPIPE, rmx_recvpipe); + metric(RTV_SPIPE, rmx_sendpipe); + metric(RTV_SSTHRESH, rmx_ssthresh); + metric(RTV_RTT, rmx_rtt); + metric(RTV_RTTVAR, rmx_rttvar); + metric(RTV_HOPCOUNT, rmx_hopcount); + metric(RTV_MTU, rmx_mtu); + metric(RTV_EXPIRE, rmx_expire); +#undef metric +} + +#define ROUNDUP(a) \ + ((a) > 0 ? (1 + (((a) - 1) | (sizeof(long) - 1))) : sizeof(long)) +#define ADVANCE(x, n) (x += ROUNDUP((n)->sa_len)) + +static void +rt_xaddrs(cp, cplim, rtinfo) + register caddr_t cp, cplim; + register struct rt_addrinfo *rtinfo; +{ + register struct sockaddr *sa; + register int i; + + bzero(rtinfo->rti_info, sizeof(rtinfo->rti_info)); + for (i = 0; (i < RTAX_MAX) && (cp < cplim); i++) { + if ((rtinfo->rti_addrs & (1 << i)) == 0) + continue; + rtinfo->rti_info[i] = sa = (struct sockaddr *)cp; + ADVANCE(cp, sa); + } +} + +/* + * Copy data from a buffer back into the indicated mbuf chain, + * starting "off" bytes from the beginning, extending the mbuf + * chain if necessary. 
+ */ +void +m_copyback(m0, off, len, cp) + struct mbuf *m0; + register int off; + register int len; + caddr_t cp; +{ + register int mlen; + register struct mbuf *m = m0, *n; + int totlen = 0; + + if (m0 == 0) + return; + while (off > (mlen = m->m_len)) { + off -= mlen; + totlen += mlen; + if (m->m_next == 0) { + n = m_getclr(M_DONTWAIT, m->m_type); + if (n == 0) + goto out; + n->m_len = min(MLEN, len + off); + m->m_next = n; + } + m = m->m_next; + } + while (len > 0) { + mlen = min (m->m_len - off, len); + bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen); + cp += mlen; + len -= mlen; + mlen += off; + off = 0; + totlen += mlen; + if (len == 0) + break; + if (m->m_next == 0) { + n = m_get(M_DONTWAIT, m->m_type); + if (n == 0) + break; + n->m_len = min(MLEN, len); + m->m_next = n; + } + m = m->m_next; + } +out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) + m->m_pkthdr.len = totlen; +} + +static struct mbuf * +rt_msg1(type, rtinfo) + int type; + register struct rt_addrinfo *rtinfo; +{ + register struct rt_msghdr *rtm; + register struct mbuf *m; + register int i; + register struct sockaddr *sa; + int len, dlen; + + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == 0) + return (m); + switch (type) { + + case RTM_DELADDR: + case RTM_NEWADDR: + len = sizeof(struct ifa_msghdr); + break; + + case RTM_IFINFO: + len = sizeof(struct if_msghdr); + break; + + default: + len = sizeof(struct rt_msghdr); + } + if (len > MHLEN) + panic("rt_msg1"); + m->m_pkthdr.len = m->m_len = len; + m->m_pkthdr.rcvif = 0; + rtm = mtod(m, struct rt_msghdr *); + bzero((caddr_t)rtm, len); + for (i = 0; i < RTAX_MAX; i++) { + if ((sa = rtinfo->rti_info[i]) == NULL) + continue; + rtinfo->rti_addrs |= (1 << i); + dlen = ROUNDUP(sa->sa_len); + m_copyback(m, len, dlen, (caddr_t)sa); + len += dlen; + } + if (m->m_pkthdr.len != len) { + m_freem(m); + return (NULL); + } + rtm->rtm_msglen = len; + rtm->rtm_version = RTM_VERSION; + rtm->rtm_type = type; + return (m); +} + +static int 
+rt_msg2(type, rtinfo, cp, w) + int type; + register struct rt_addrinfo *rtinfo; + caddr_t cp; + struct walkarg *w; +{ + register int i; + int len, dlen, second_time = 0; + caddr_t cp0; + + rtinfo->rti_addrs = 0; +again: + switch (type) { + + case RTM_DELADDR: + case RTM_NEWADDR: + len = sizeof(struct ifa_msghdr); + break; + + case RTM_IFINFO: + len = sizeof(struct if_msghdr); + break; + + default: + len = sizeof(struct rt_msghdr); + } + if (cp0 = cp) + cp += len; + for (i = 0; i < RTAX_MAX; i++) { + register struct sockaddr *sa; + + if ((sa = rtinfo->rti_info[i]) == 0) + continue; + rtinfo->rti_addrs |= (1 << i); + dlen = ROUNDUP(sa->sa_len); + if (cp) { + bcopy((caddr_t)sa, cp, (unsigned)dlen); + cp += dlen; + } + len += dlen; + } + if (cp == 0 && w != NULL && !second_time) { + register struct walkarg *rw = w; + + rw->w_needed += len; + if (rw->w_needed <= 0 && rw->w_where) { + if (rw->w_tmemsize < len) { + if (rw->w_tmem) + free(rw->w_tmem, M_RTABLE); + if (rw->w_tmem = (caddr_t) + malloc(len, M_RTABLE, M_NOWAIT)) + rw->w_tmemsize = len; + } + if (rw->w_tmem) { + cp = rw->w_tmem; + second_time = 1; + goto again; + } else + rw->w_where = 0; + } + } + if (cp) { + register struct rt_msghdr *rtm = (struct rt_msghdr *)cp0; + + rtm->rtm_version = RTM_VERSION; + rtm->rtm_type = type; + rtm->rtm_msglen = len; + } + return (len); +} + +/* + * This routine is called to generate a message from the routing + * socket indicating that a redirect has occured, a routing lookup + * has failed, or that a protocol has detected timeouts to a particular + * destination. 
+ */ +void +rt_missmsg(type, rtinfo, flags, error) + int type, flags, error; + register struct rt_addrinfo *rtinfo; +{ + register struct rt_msghdr *rtm; + register struct mbuf *m; + struct sockaddr *sa = rtinfo->rti_info[RTAX_DST]; + + if (route_cb.any_count == 0) + return; + m = rt_msg1(type, rtinfo); + if (m == 0) + return; + rtm = mtod(m, struct rt_msghdr *); + rtm->rtm_flags = RTF_DONE | flags; + rtm->rtm_errno = error; + rtm->rtm_addrs = rtinfo->rti_addrs; + route_proto.sp_protocol = sa ? sa->sa_family : 0; + raw_input(m, &route_proto, &route_src, &route_dst); +} + +/* + * This routine is called to generate a message from the routing + * socket indicating that the status of a network interface has changed. + */ +void +rt_ifmsg(ifp) + register struct ifnet *ifp; +{ + register struct if_msghdr *ifm; + struct mbuf *m; + struct rt_addrinfo info; + + if (route_cb.any_count == 0) + return; + bzero((caddr_t)&info, sizeof(info)); + m = rt_msg1(RTM_IFINFO, &info); + if (m == 0) + return; + ifm = mtod(m, struct if_msghdr *); + ifm->ifm_index = ifp->if_index; + ifm->ifm_flags = ifp->if_flags; + ifm->ifm_data = ifp->if_data; + ifm->ifm_addrs = 0; + route_proto.sp_protocol = 0; + raw_input(m, &route_proto, &route_src, &route_dst); +} + +/* + * This is called to generate messages from the routing socket + * indicating a network interface has had addresses associated with it. + * if we ever reverse the logic and replace messages TO the routing + * socket indicate a request to configure interfaces, then it will + * be unnecessary as the routing socket will automatically generate + * copies of it. 
+ */ +void +rt_newaddrmsg(cmd, ifa, error, rt) + int cmd, error; + register struct ifaddr *ifa; + register struct rtentry *rt; +{ + struct rt_addrinfo info; + struct sockaddr *sa; + int pass; + struct mbuf *m; + struct ifnet *ifp = ifa->ifa_ifp; + + if (route_cb.any_count == 0) + return; + for (pass = 1; pass < 3; pass++) { + bzero((caddr_t)&info, sizeof(info)); + if ((cmd == RTM_ADD && pass == 1) || + (cmd == RTM_DELETE && pass == 2)) { + register struct ifa_msghdr *ifam; + int ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR; + + ifaaddr = sa = ifa->ifa_addr; + ifpaddr = ifp->if_addrlist->ifa_addr; + netmask = ifa->ifa_netmask; + brdaddr = ifa->ifa_dstaddr; + if ((m = rt_msg1(ncmd, &info)) == NULL) + continue; + ifam = mtod(m, struct ifa_msghdr *); + ifam->ifam_index = ifp->if_index; + ifam->ifam_metric = ifa->ifa_metric; + ifam->ifam_flags = ifa->ifa_flags; + ifam->ifam_addrs = info.rti_addrs; + } + if ((cmd == RTM_ADD && pass == 2) || + (cmd == RTM_DELETE && pass == 1)) { + register struct rt_msghdr *rtm; + + if (rt == 0) + continue; + netmask = rt_mask(rt); + dst = sa = rt_key(rt); + gate = rt->rt_gateway; + if ((m = rt_msg1(cmd, &info)) == NULL) + continue; + rtm = mtod(m, struct rt_msghdr *); + rtm->rtm_index = ifp->if_index; + rtm->rtm_flags |= rt->rt_flags; + rtm->rtm_errno = error; + rtm->rtm_addrs = info.rti_addrs; + } + route_proto.sp_protocol = sa ? sa->sa_family : 0; + raw_input(m, &route_proto, &route_src, &route_dst); + } +} + +/* + * This is used in dumping the kernel table via sysctl(). 
+ */ +int +sysctl_dumpentry(rn, w) + struct radix_node *rn; + register struct walkarg *w; +{ + register struct rtentry *rt = (struct rtentry *)rn; + int error = 0, size; + struct rt_addrinfo info; + + if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg)) + return 0; + bzero((caddr_t)&info, sizeof(info)); + dst = rt_key(rt); + gate = rt->rt_gateway; + netmask = rt_mask(rt); + genmask = rt->rt_genmask; + size = rt_msg2(RTM_GET, &info, 0, w); + if (w->w_where && w->w_tmem) { + register struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem; + + rtm->rtm_flags = rt->rt_flags; + rtm->rtm_use = rt->rt_use; + rtm->rtm_rmx = rt->rt_rmx; + rtm->rtm_index = rt->rt_ifp->if_index; + rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0; + rtm->rtm_addrs = info.rti_addrs; + if (error = copyout((caddr_t)rtm, w->w_where, size)) + w->w_where = NULL; + else + w->w_where += size; + } + return (error); +} + +int +sysctl_iflist(af, w) + int af; + register struct walkarg *w; +{ + register struct ifnet *ifp; + register struct ifaddr *ifa; + struct rt_addrinfo info; + int len, error = 0; + + bzero((caddr_t)&info, sizeof(info)); + for (ifp = ifnet; ifp; ifp = ifp->if_next) { + if (w->w_arg && w->w_arg != ifp->if_index) + continue; + ifa = ifp->if_addrlist; + ifpaddr = ifa->ifa_addr; + len = rt_msg2(RTM_IFINFO, &info, (caddr_t)0, w); + ifpaddr = 0; + if (w->w_where && w->w_tmem) { + register struct if_msghdr *ifm; + + ifm = (struct if_msghdr *)w->w_tmem; + ifm->ifm_index = ifp->if_index; + ifm->ifm_flags = ifp->if_flags; + ifm->ifm_data = ifp->if_data; + ifm->ifm_addrs = info.rti_addrs; + if (error = copyout((caddr_t)ifm, w->w_where, len)) + return (error); + w->w_where += len; + } + while (ifa = ifa->ifa_next) { + if (af && af != ifa->ifa_addr->sa_family) + continue; + ifaaddr = ifa->ifa_addr; + netmask = ifa->ifa_netmask; + brdaddr = ifa->ifa_dstaddr; + len = rt_msg2(RTM_NEWADDR, &info, 0, w); + if (w->w_where && w->w_tmem) { + register struct ifa_msghdr *ifam; + + ifam = (struct 
ifa_msghdr *)w->w_tmem; + ifam->ifam_index = ifa->ifa_ifp->if_index; + ifam->ifam_flags = ifa->ifa_flags; + ifam->ifam_metric = ifa->ifa_metric; + ifam->ifam_addrs = info.rti_addrs; + if (error = copyout(w->w_tmem, w->w_where, len)) + return (error); + w->w_where += len; + } + } + ifaaddr = netmask = brdaddr = 0; + } + return (0); +} + +int +sysctl_rtable(name, namelen, where, given, new, newlen) + int *name; + int namelen; + caddr_t where; + size_t *given; + caddr_t *new; + size_t newlen; +{ + register struct radix_node_head *rnh; + int i, s, error = EINVAL; + u_char af; + struct walkarg w; + + if (new) + return (EPERM); + if (namelen != 3) + return (EINVAL); + af = name[0]; + Bzero(&w, sizeof(w)); + w.w_where = where; + w.w_given = *given; + w.w_needed = 0 - w.w_given; + w.w_op = name[1]; + w.w_arg = name[2]; + + s = splnet(); + switch (w.w_op) { + + case NET_RT_DUMP: + case NET_RT_FLAGS: + for (i = 1; i <= AF_MAX; i++) + if ((rnh = rt_tables[i]) && (af == 0 || af == i) && + (error = rnh->rnh_walktree(rnh, + sysctl_dumpentry, &w))) + break; + break; + + case NET_RT_IFLIST: + error = sysctl_iflist(af, &w); + } + splx(s); + if (w.w_tmem) + free(w.w_tmem, M_RTABLE); + w.w_needed += w.w_given; + if (where) { + *given = w.w_where - where; + if (*given < w.w_needed) + return (ENOMEM); + } else { + *given = (11 * w.w_needed) / 10; + } + return (error); +} + +/* + * Definitions of protocols supported in the ROUTE domain. 
+ */ + +extern struct domain routedomain; /* or at least forward */ + +struct protosw routesw[] = { +{ SOCK_RAW, &routedomain, 0, PR_ATOMIC|PR_ADDR, + raw_input, route_output, raw_ctlinput, 0, + route_usrreq, + raw_init, 0, 0, 0, + sysctl_rtable, +} +}; + +struct domain routedomain = + { PF_ROUTE, "route", route_init, 0, 0, + routesw, &routesw[sizeof(routesw)/sizeof(routesw[0])] }; diff --git a/sys/net/slcompress.c b/sys/net/slcompress.c new file mode 100644 index 00000000000..70af9358e37 --- /dev/null +++ b/sys/net/slcompress.c @@ -0,0 +1,535 @@ +/*- + * Copyright (c) 1989, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)slcompress.c 8.2 (Berkeley) 4/16/94 + */ + +/* + * Routines to compress and uncompress tcp packets (for transmission + * over low speed serial lines). + * + * Van Jacobson (van@helios.ee.lbl.gov), Dec 31, 1989: + * - Initial distribution. + * + * static char rcsid[] = + * "$Header: slcompress.c,v 1.19 89/12/31 08:52:59 van Exp $"; + */ + +#include +#include + +#include +#include +#include +#include + +#include + +#ifndef SL_NO_STATS +#define INCR(counter) ++comp->counter; +#else +#define INCR(counter) +#endif + +#define BCMP(p1, p2, n) bcmp((char *)(p1), (char *)(p2), (int)(n)) +#define BCOPY(p1, p2, n) bcopy((char *)(p1), (char *)(p2), (int)(n)) +#ifndef KERNEL +#define ovbcopy bcopy +#endif + +/* + * Reset the compression state in *comp: zero the whole structure, give + * each tstate[] entry its own index as cs_id and link the entries into + * a circular list (entry i points at entry i - 1, entry 0 points at + * entry MAX_STATES - 1), make last_cs point at entry 0, set last_recv + * and last_xmit to 255 and set flags to SLF_TOSS. + */ +void +sl_compress_init(comp) + struct slcompress *comp; +{ + register u_int i; + register struct cstate *tstate = comp->tstate; + + bzero((char *)comp, sizeof(*comp)); + for (i = MAX_STATES - 1; i > 0; --i) { + tstate[i].cs_id = i; + tstate[i].cs_next = &tstate[i - 1]; + } + tstate[0].cs_next = &tstate[MAX_STATES - 1]; + tstate[0].cs_id = 0; + comp->last_cs = &tstate[0]; + comp->last_recv = 255; + comp->last_xmit = 255; + comp->flags = SLF_TOSS; +} + + +/* ENCODE encodes a number that is known to be non-zero. ENCODEZ + * checks for zero (since zero has to be encoded in the long, 3 byte + * form).
+ */ +#define ENCODE(n) { \ + if ((u_short)(n) >= 256) { \ + *cp++ = 0; \ + cp[1] = (n); \ + cp[0] = (n) >> 8; \ + cp += 2; \ + } else { \ + *cp++ = (n); \ + } \ +} +#define ENCODEZ(n) { \ + if ((u_short)(n) >= 256 || (u_short)(n) == 0) { \ + *cp++ = 0; \ + cp[1] = (n); \ + cp[0] = (n) >> 8; \ + cp += 2; \ + } else { \ + *cp++ = (n); \ + } \ +} + +#define DECODEL(f) { \ + if (*cp == 0) {\ + (f) = htonl(ntohl(f) + ((cp[1] << 8) | cp[2])); \ + cp += 3; \ + } else { \ + (f) = htonl(ntohl(f) + (u_long)*cp++); \ + } \ +} + +#define DECODES(f) { \ + if (*cp == 0) {\ + (f) = htons(ntohs(f) + ((cp[1] << 8) | cp[2])); \ + cp += 3; \ + } else { \ + (f) = htons(ntohs(f) + (u_long)*cp++); \ + } \ +} + +#define DECODEU(f) { \ + if (*cp == 0) {\ + (f) = htons((cp[1] << 8) | cp[2]); \ + cp += 3; \ + } else { \ + (f) = htons((u_long)*cp++); \ + } \ +} + +u_int +sl_compress_tcp(m, ip, comp, compress_cid) + struct mbuf *m; + register struct ip *ip; + struct slcompress *comp; + int compress_cid; +{ + register struct cstate *cs = comp->last_cs->cs_next; + register u_int hlen = ip->ip_hl; + register struct tcphdr *oth; + register struct tcphdr *th; + register u_int deltaS, deltaA; + register u_int changes = 0; + u_char new_seq[16]; + register u_char *cp = new_seq; + + /* + * Bail if this is an IP fragment or if the TCP packet isn't + * `compressible' (i.e., ACK isn't set or some other control bit is + * set). (We assume that the caller has already made sure the + * packet is IP proto TCP). + */ + if ((ip->ip_off & htons(0x3fff)) || m->m_len < 40) + return (TYPE_IP); + + th = (struct tcphdr *)&((int *)ip)[hlen]; + if ((th->th_flags & (TH_SYN|TH_FIN|TH_RST|TH_ACK)) != TH_ACK) + return (TYPE_IP); + /* + * Packet is compressible -- we're going to send either a + * COMPRESSED_TCP or UNCOMPRESSED_TCP packet. Either way we need + * to locate (or create) the connection state. 
Special case the + * most recently used connection since it's most likely to be used + * again & we don't have to do any reordering if it's used. + */ + INCR(sls_packets) + if (ip->ip_src.s_addr != cs->cs_ip.ip_src.s_addr || + ip->ip_dst.s_addr != cs->cs_ip.ip_dst.s_addr || + *(int *)th != ((int *)&cs->cs_ip)[cs->cs_ip.ip_hl]) { + /* + * Wasn't the first -- search for it. + * + * States are kept in a circularly linked list with + * last_cs pointing to the end of the list. The + * list is kept in lru order by moving a state to the + * head of the list whenever it is referenced. Since + * the list is short and, empirically, the connection + * we want is almost always near the front, we locate + * states via linear search. If we don't find a state + * for the datagram, the oldest state is (re-)used. + */ + register struct cstate *lcs; + register struct cstate *lastcs = comp->last_cs; + + do { + lcs = cs; cs = cs->cs_next; + INCR(sls_searches) + if (ip->ip_src.s_addr == cs->cs_ip.ip_src.s_addr + && ip->ip_dst.s_addr == cs->cs_ip.ip_dst.s_addr + && *(int *)th == ((int *)&cs->cs_ip)[cs->cs_ip.ip_hl]) + goto found; + } while (cs != lastcs); + + /* + * Didn't find it -- re-use oldest cstate. Send an + * uncompressed packet that tells the other side what + * connection number we're using for this conversation. + * Note that since the state list is circular, the oldest + * state points to the newest and we only need to set + * last_cs to update the lru linkage. + */ + INCR(sls_misses) + comp->last_cs = lcs; + hlen += th->th_off; + hlen <<= 2; + goto uncompressed; + + found: + /* + * Found it -- move to the front on the connection list. + */ + if (cs == lastcs) + comp->last_cs = lcs; + else { + lcs->cs_next = cs->cs_next; + cs->cs_next = lastcs->cs_next; + lastcs->cs_next = cs; + } + } + + /* + * Make sure that only what we expect to change changed. The first + * line of the `if' checks the IP protocol version, header length & + * type of service. 
The 2nd line checks the "Don't fragment" bit. + * The 3rd line checks the time-to-live and protocol (the protocol + * check is unnecessary but costless). The 4th line checks the TCP + * header length. The 5th line checks IP options, if any. The 6th + * line checks TCP options, if any. If any of these things are + * different between the previous & current datagram, we send the + * current datagram `uncompressed'. + */ + oth = (struct tcphdr *)&((int *)&cs->cs_ip)[hlen]; + deltaS = hlen; + hlen += th->th_off; + hlen <<= 2; + + if (((u_short *)ip)[0] != ((u_short *)&cs->cs_ip)[0] || + ((u_short *)ip)[3] != ((u_short *)&cs->cs_ip)[3] || + ((u_short *)ip)[4] != ((u_short *)&cs->cs_ip)[4] || + th->th_off != oth->th_off || + (deltaS > 5 && + BCMP(ip + 1, &cs->cs_ip + 1, (deltaS - 5) << 2)) || + (th->th_off > 5 && + BCMP(th + 1, oth + 1, (th->th_off - 5) << 2))) + goto uncompressed; + + /* + * Figure out which of the changing fields changed. The + * receiver expects changes in the order: urgent, window, + * ack, seq (the order minimizes the number of temporaries + * needed in this section of code). + */ + if (th->th_flags & TH_URG) { + deltaS = ntohs(th->th_urp); + ENCODEZ(deltaS); + changes |= NEW_U; + } else if (th->th_urp != oth->th_urp) + /* argh! URG not set but urp changed -- a sensible + * implementation should never do this but RFC793 + * doesn't prohibit the change so we have to deal + * with it. */ + goto uncompressed; + + if (deltaS = (u_short)(ntohs(th->th_win) - ntohs(oth->th_win))) { + ENCODE(deltaS); + changes |= NEW_W; + } + + if (deltaA = ntohl(th->th_ack) - ntohl(oth->th_ack)) { + if (deltaA > 0xffff) + goto uncompressed; + ENCODE(deltaA); + changes |= NEW_A; + } + + if (deltaS = ntohl(th->th_seq) - ntohl(oth->th_seq)) { + if (deltaS > 0xffff) + goto uncompressed; + ENCODE(deltaS); + changes |= NEW_S; + } + + switch(changes) { + + case 0: + /* + * Nothing changed. 
If this packet contains data and the + * last one didn't, this is probably a data packet following + * an ack (normal on an interactive connection) and we send + * it compressed. Otherwise it's probably a retransmit, + * retransmitted ack or window probe. Send it uncompressed + * in case the other side missed the compressed version. + */ + if (ip->ip_len != cs->cs_ip.ip_len && + ntohs(cs->cs_ip.ip_len) == hlen) + break; + + /* (fall through) */ + + case SPECIAL_I: + case SPECIAL_D: + /* + * actual changes match one of our special case encodings -- + * send packet uncompressed. + */ + goto uncompressed; + + case NEW_S|NEW_A: + if (deltaS == deltaA && + deltaS == ntohs(cs->cs_ip.ip_len) - hlen) { + /* special case for echoed terminal traffic */ + changes = SPECIAL_I; + cp = new_seq; + } + break; + + case NEW_S: + if (deltaS == ntohs(cs->cs_ip.ip_len) - hlen) { + /* special case for data xfer */ + changes = SPECIAL_D; + cp = new_seq; + } + break; + } + + deltaS = ntohs(ip->ip_id) - ntohs(cs->cs_ip.ip_id); + if (deltaS != 1) { + ENCODEZ(deltaS); + changes |= NEW_I; + } + if (th->th_flags & TH_PUSH) + changes |= TCP_PUSH_BIT; + /* + * Grab the cksum before we overwrite it below. Then update our + * state with this packet's header. + */ + deltaA = ntohs(th->th_sum); + BCOPY(ip, &cs->cs_ip, hlen); + + /* + * We want to use the original packet as our compressed packet. + * (cp - new_seq) is the number of bytes we need for compressed + * sequence numbers. In addition we need one byte for the change + * mask, one for the connection id and two for the tcp checksum. + * So, (cp - new_seq) + 4 bytes of header are needed. hlen is how + * many bytes of the original packet to toss so subtract the two to + * get the new packet size. 
+ */ + deltaS = cp - new_seq; + cp = (u_char *)ip; + if (compress_cid == 0 || comp->last_xmit != cs->cs_id) { + comp->last_xmit = cs->cs_id; + hlen -= deltaS + 4; + cp += hlen; + *cp++ = changes | NEW_C; + *cp++ = cs->cs_id; + } else { + hlen -= deltaS + 3; + cp += hlen; + *cp++ = changes; + } + m->m_len -= hlen; + m->m_data += hlen; + *cp++ = deltaA >> 8; + *cp++ = deltaA; + BCOPY(new_seq, cp, deltaS); + INCR(sls_compressed) + return (TYPE_COMPRESSED_TCP); + + /* + * Update connection state cs & send uncompressed packet ('uncompressed' + * means a regular ip/tcp packet but with the 'conversation id' we hope + * to use on future compressed packets in the protocol field). + */ +uncompressed: + BCOPY(ip, &cs->cs_ip, hlen); + ip->ip_p = cs->cs_id; + comp->last_xmit = cs->cs_id; + return (TYPE_UNCOMPRESSED_TCP); +} + + +int +sl_uncompress_tcp(bufp, len, type, comp) + u_char **bufp; + int len; + u_int type; + struct slcompress *comp; +{ + register u_char *cp; + register u_int hlen, changes; + register struct tcphdr *th; + register struct cstate *cs; + register struct ip *ip; + + switch (type) { + + case TYPE_UNCOMPRESSED_TCP: + ip = (struct ip *) *bufp; + if (ip->ip_p >= MAX_STATES) + goto bad; + cs = &comp->rstate[comp->last_recv = ip->ip_p]; + comp->flags &=~ SLF_TOSS; + ip->ip_p = IPPROTO_TCP; + hlen = ip->ip_hl; + hlen += ((struct tcphdr *)&((int *)ip)[hlen])->th_off; + hlen <<= 2; + BCOPY(ip, &cs->cs_ip, hlen); + cs->cs_ip.ip_sum = 0; + cs->cs_hlen = hlen; + INCR(sls_uncompressedin) + return (len); + + default: + goto bad; + + case TYPE_COMPRESSED_TCP: + break; + } + /* We've got a compressed packet. */ + INCR(sls_compressedin) + cp = *bufp; + changes = *cp++; + if (changes & NEW_C) { + /* Make sure the state index is in range, then grab the state. + * If we have a good state index, clear the 'discard' flag. */ + if (*cp >= MAX_STATES) + goto bad; + + comp->flags &=~ SLF_TOSS; + comp->last_recv = *cp++; + } else { + /* this packet has an implicit state index. 
If we've + * had a line error since the last time we got an + * explicit state index, we have to toss the packet. */ + if (comp->flags & SLF_TOSS) { + INCR(sls_tossed) + return (0); + } + } + cs = &comp->rstate[comp->last_recv]; + hlen = cs->cs_ip.ip_hl << 2; + th = (struct tcphdr *)&((u_char *)&cs->cs_ip)[hlen]; + th->th_sum = htons((*cp << 8) | cp[1]); + cp += 2; + if (changes & TCP_PUSH_BIT) + th->th_flags |= TH_PUSH; + else + th->th_flags &=~ TH_PUSH; + + switch (changes & SPECIALS_MASK) { + case SPECIAL_I: + { + register u_int i = ntohs(cs->cs_ip.ip_len) - cs->cs_hlen; + th->th_ack = htonl(ntohl(th->th_ack) + i); + th->th_seq = htonl(ntohl(th->th_seq) + i); + } + break; + + case SPECIAL_D: + th->th_seq = htonl(ntohl(th->th_seq) + ntohs(cs->cs_ip.ip_len) + - cs->cs_hlen); + break; + + default: + if (changes & NEW_U) { + th->th_flags |= TH_URG; + DECODEU(th->th_urp) + } else + th->th_flags &=~ TH_URG; + if (changes & NEW_W) + DECODES(th->th_win) + if (changes & NEW_A) + DECODEL(th->th_ack) + if (changes & NEW_S) + DECODEL(th->th_seq) + break; + } + if (changes & NEW_I) { + DECODES(cs->cs_ip.ip_id) + } else + cs->cs_ip.ip_id = htons(ntohs(cs->cs_ip.ip_id) + 1); + + /* + * At this point, cp points to the first byte of data in the + * packet. If we're not aligned on a 4-byte boundary, copy the + * data down so the ip & tcp headers will be aligned. Then back up + * cp by the tcp/ip header length to make room for the reconstructed + * header (we assume the packet we were handed has enough space to + * prepend 128 bytes of header). Adjust the length to account for + * the new header & fill in the IP total length. 
+ */ + len -= (cp - *bufp); + if (len < 0) + /* we must have dropped some characters (crc should detect + * this but the old slip framing won't) */ + goto bad; + + if ((int)cp & 3) { + if (len > 0) + (void) ovbcopy(cp, (caddr_t)((int)cp &~ 3), len); + cp = (u_char *)((int)cp &~ 3); + } + cp -= cs->cs_hlen; + len += cs->cs_hlen; + cs->cs_ip.ip_len = htons(len); + BCOPY(&cs->cs_ip, cp, cs->cs_hlen); + *bufp = cp; + + /* recompute the ip header checksum */ + { + register u_short *bp = (u_short *)cp; + for (changes = 0; hlen > 0; hlen -= 2) + changes += *bp++; + changes = (changes & 0xffff) + (changes >> 16); + changes = (changes & 0xffff) + (changes >> 16); + ((struct ip *)cp)->ip_sum = ~ changes; + } + return (len); +bad: + comp->flags |= SLF_TOSS; + INCR(sls_errorin) + return (0); +} diff --git a/sys/net/slcompress.h b/sys/net/slcompress.h new file mode 100644 index 00000000000..cefe940f198 --- /dev/null +++ b/sys/net/slcompress.h @@ -0,0 +1,157 @@ +/* slcompress.h 8.1 93/06/10 */ +/* + * Definitions for tcp compression routines. + * + * $Header: slcompress.h,v 1.10 89/12/31 08:53:02 van Exp $ + * + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Van Jacobson (van@helios.ee.lbl.gov), Dec 31, 1989: + * - Initial distribution. + */ + +#define MAX_STATES 16 /* must be > 2 and < 256 */ +#define MAX_HDR MLEN /* XXX 4bsd-ism: should really be 128 */ + +/* + * Compressed packet format: + * + * The first octet contains the packet type (top 3 bits), TCP + * 'push' bit, and flags that indicate which of the 4 TCP sequence + * numbers have changed (bottom 5 bits). The next octet is a + * conversation number that associates a saved IP/TCP header with + * the compressed packet. The next two octets are the TCP checksum + * from the original datagram. The next 0 to 15 octets are + * sequence number changes, one change per bit set in the header + * (there may be no changes and there are two special cases where + * the receiver implicitly knows what changed -- see below). + * + * There are 5 numbers which can change (they are always inserted + * in the following order): TCP urgent pointer, window, + * acknowlegement, sequence number and IP ID. 
(The urgent pointer + * is different from the others in that its value is sent, not the + * change in value.) Since typical use of SLIP links is biased + * toward small packets (see comments on MTU/MSS below), changes + * use a variable length coding with one octet for numbers in the + * range 1 - 255 and 3 octets (0, MSB, LSB) for numbers in the + * range 256 - 65535 or 0. (If the change in sequence number or + * ack is more than 65535, an uncompressed packet is sent.) + */ + +/* + * Packet types (must not conflict with IP protocol version) + * + * The top nibble of the first octet is the packet type. There are + * three possible types: IP (not proto TCP or tcp with one of the + * control flags set); uncompressed TCP (a normal IP/TCP packet but + * with the 8-bit protocol field replaced by an 8-bit connection id -- + * this type of packet syncs the sender & receiver); and compressed + * TCP (described above). + * + * LSB of 4-bit field is TCP "PUSH" bit (a worthless anachronism) and + * is logically part of the 4-bit "changes" field that follows. Top + * three bits are actual packet type. For backward compatibility + * and in the interest of conserving bits, numbers are chosen so the + * IP protocol version number (4) which normally appears in this nibble + * means "IP packet". + */ + +/* packet types */ +#define TYPE_IP 0x40 +#define TYPE_UNCOMPRESSED_TCP 0x70 +#define TYPE_COMPRESSED_TCP 0x80 +#define TYPE_ERROR 0x00 + +/* Bits in first octet of compressed packet */ +#define NEW_C 0x40 /* flag bits for what changed in a packet */ +#define NEW_I 0x20 +#define NEW_S 0x08 +#define NEW_A 0x04 +#define NEW_W 0x02 +#define NEW_U 0x01 + +/* reserved, special-case values of above */ +#define SPECIAL_I (NEW_S|NEW_W|NEW_U) /* echoed interactive traffic */ +#define SPECIAL_D (NEW_S|NEW_A|NEW_W|NEW_U) /* unidirectional data */ +#define SPECIALS_MASK (NEW_S|NEW_A|NEW_W|NEW_U) + +#define TCP_PUSH_BIT 0x10 + + +/* + * "state" data for each active tcp conversation on the wire. 
This is + * basically a copy of the entire IP/TCP header from the last packet + * we saw from the conversation together with a small identifier + * the transmit & receive ends of the line use to locate saved header. + */ +struct cstate { + struct cstate *cs_next; /* next most recently used cstate (xmit only) */ + u_short cs_hlen; /* size of hdr (receive only) */ + u_char cs_id; /* connection # associated with this state */ + u_char cs_filler; + union { + char csu_hdr[MAX_HDR]; + struct ip csu_ip; /* ip/tcp hdr from most recent packet */ + } slcs_u; +}; +#define cs_ip slcs_u.csu_ip +#define cs_hdr slcs_u.csu_hdr + +/* + * all the state data for one serial line (we need one of these + * per line). + */ +struct slcompress { + struct cstate *last_cs; /* most recently used tstate */ + u_char last_recv; /* last rcvd conn. id */ + u_char last_xmit; /* last sent conn. id */ + u_short flags; +#ifndef SL_NO_STATS + int sls_packets; /* outbound packets */ + int sls_compressed; /* outbound compressed packets */ + int sls_searches; /* searches for connection state */ + int sls_misses; /* times couldn't find conn. 
state */ + int sls_uncompressedin; /* inbound uncompressed packets */ + int sls_compressedin; /* inbound compressed packets */ + int sls_errorin; /* inbound unknown type packets */ + int sls_tossed; /* inbound packets tossed because of error */ +#endif + struct cstate tstate[MAX_STATES]; /* xmit connection states */ + struct cstate rstate[MAX_STATES]; /* receive connection states */ +}; +/* flag values */ +#define SLF_TOSS 1 /* tossing rcvd frames because of input err */ + +void sl_compress_init __P((struct slcompress *)); +u_int sl_compress_tcp __P((struct mbuf *, + struct ip *, struct slcompress *, int)); +int sl_uncompress_tcp __P((u_char **, int, u_int, struct slcompress *)); diff --git a/sys/net/slip.h b/sys/net/slip.h new file mode 100644 index 00000000000..4caeb464df3 --- /dev/null +++ b/sys/net/slip.h @@ -0,0 +1,51 @@ +/*- + * Copyright (c) 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)slip.h 8.1 (Berkeley) 2/12/94 + */ + +/* Ioctls operating on SLIP ttys. */ +#define SLIOCGUNIT _IOR('t', 88, int) /* get slip unit number */ + +/* + * Definitions of the pseudo-link-level header attached to slip + * packets grabbed by the packet filter (bpf) traffic monitor. + */ +#define SLIP_HDRLEN 16 /* BPF SLIP header length */ + +/* Offsets into BPF SLIP header. */ +#define SLX_DIR 0 /* direction; see below */ +#define SLX_CHDR 1 /* compressed header data */ +#define CHDR_LEN 15 /* length of compressed header data */ + +#define SLIPDIR_IN 0 /* incoming */ +#define SLIPDIR_OUT 1 /* outgoing */ diff --git a/sys/netccitt/README.hdlc b/sys/netccitt/README.hdlc new file mode 100644 index 00000000000..24b5fef96df --- /dev/null +++ b/sys/netccitt/README.hdlc @@ -0,0 +1,50 @@ +/* + * Copyright (c) University of British Columbia, 1984 + * + * @(#)README.hdlc 8.1 (Berkeley) 6/10/93 + * + * X.25 HDLC DATA LINK LEVEL: + * + * + * This module implements the Link Level of the Open Systems Interconnect + * Model. The implementation is based on the ISO High-Level Data Link + * Control (HDLC). These procedures subscribe to the principles of the + * ISO-Class of Procedures for point-to-point. 
These procedures implement + * two-way asynchronous balanced mode (LAPB) as recommended by the CCITT. + * + * The HDLC protocol layer interface consists of the following procedures: + * Hd_init (pr_init) + * Hd_output (pr_output) + * Hd_input (pr_input) + * Hd_timer (pr_slowtimo) + * + * Note: Supervisory commands RR, RNR and REJ are not transmitted by this + * station. + * + * This station never enters a busy (RNR) condition. + * + * The "Generate_rr" variable can be set to FALSE. This means that + * we NEVER send an RR. This works just fine if the network level + * is X.25 packet protocol -- which it is. + * + * Currently, this is only a DTE implementation. + * + * Think about: + * If the remote is busy, no iframes are sent. The remote sends an RR + * to clear this condition. However, this RR may be damaged, causing + * a possible deadlock. A solution is to poll with iframe (P(S)==P(R) + * of RNR) indefinitely. + * + * + * Date: February 1984 + * + * Author: Gerald W. Neufeld + * + * Installation: Department of Computer Science + * University of British Columbia + * Vancouver, BC, CANADA. + * + * History: + * + * + */ diff --git a/sys/netccitt/README.packet b/sys/netccitt/README.packet new file mode 100644 index 00000000000..858d75cccdc --- /dev/null +++ b/sys/netccitt/README.packet @@ -0,0 +1,36 @@ +/* + * @(#)README.packet 8.1 (Berkeley) 6/10/93 + * + * X.25 NETWORK PACKET LEVEL: + * + * This implementation is based on Recommendation X.25 as agreed at the + * March 1976 and the February 1980 meetings of CCITT Study Group VII. + * However, not all aspects are implemented. The following is a list of + * features which are not yet or may never be implemented: + * + * 1. D bit + * 2. PVC + * 3. fast select + * + * + * Note: This implementation is for DTEs only. + * + * Currently, only the 1976 version is implemented. + * + * + * Date: February, 1984 + * + * Author: Gerald W.
Neufeld + * + * Installation: Department of Computer Science + * University of British Columbia + * Vancouver, BC, CANADA + * + * To Do: Find some reasonable heuristic for piggybacking packet + * level acks. + * + * Bugs: Clear might be sent before data is all out. + * + * History: + * + */ diff --git a/sys/netccitt/ccitt_proto.c b/sys/netccitt/ccitt_proto.c new file mode 100644 index 00000000000..d832fd38f92 --- /dev/null +++ b/sys/netccitt/ccitt_proto.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) University of British Columbia, 1984 + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Laboratory for Computation Vision and the Computer Science Department + * of the University of British Columbia. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ccitt_proto.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include + +#include + +#include + +/* + * Definitions of protocols supported in the CCITT domain. + */ + +extern struct domain ccittdomain; +#define DOMAIN &ccittdomain + +#ifdef LLC +int llc_output(); +void llc_ctlinput(), llc_init(), llc_timer(); +#endif +#ifdef HDLC +int hd_output(); +void hd_ctlinput(), hd_init(), hd_timer(); +#endif +int pk_usrreq(), pk_ctloutput(); +void pk_timer(), pk_init(), pk_input(), pk_ctlinput(); + +struct protosw ccittsw[] = { +#ifdef LLC + { 0, DOMAIN, IEEEPROTO_802LLC,0, + 0, llc_output, llc_ctlinput, 0, + 0, + llc_init, 0, llc_timer, 0, + }, +#endif +#ifdef HDLC + { 0, DOMAIN, CCITTPROTO_HDLC,0, + 0, hd_output, hd_ctlinput, 0, + 0, + hd_init, 0, hd_timer, 0, + }, +#endif + { SOCK_STREAM, DOMAIN, CCITTPROTO_X25, PR_CONNREQUIRED|PR_ATOMIC|PR_WANTRCVD, + pk_input, 0, pk_ctlinput, pk_ctloutput, + pk_usrreq, + pk_init, 0, pk_timer, 0, + } +}; + +struct domain ccittdomain = + { AF_CCITT, "ccitt", 0, 0, 0, ccittsw, + &ccittsw[sizeof(ccittsw)/sizeof(ccittsw[0])], 0, + rn_inithead, 32, sizeof (struct sockaddr_x25) }; diff --git a/sys/netccitt/dll.h b/sys/netccitt/dll.h new file mode 
100644 index 00000000000..46ded88eda8 --- /dev/null +++ b/sys/netccitt/dll.h @@ -0,0 +1,83 @@ +/* + * Copyright (C) Dirk Husemann, Computer Science Department IV, + * University of Erlangen-Nuremberg, Germany, 1990, 1991, 1992 + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)dll.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * We define the additional PRC_* codes in here + */ +#ifdef KERNEL +#ifndef PRC_IFUP +#define PRC_IFUP 3 +#endif +#define PRC_CONNECT_INDICATION 8 +#define PRC_CONNECT_REQUEST 9 +#define PRC_DISCONNECT_REQUEST 10 +#define PRC_DISCONNECT_INDICATION 11 +#define PRC_RESET_REQUEST 12 +#endif + +/* + * Data link layer configuration --- basically a copy of the relevant parts + * of x25config, implemented to become a little bit more network + * layer independent. (Probably only used for casting et al.) 
+ */ +struct dllconfig { + u_short dllcfg_unused0:4, + dllcfg_unused1:4, + dllcfg_trace:1, /* link level tracing flag */ + dllcfg_window:7; /* link level window size */ + u_short dllcfg_xchxid:1, /* exchange XID (not yet) */ + dllcfg_unused2:7; /* here be dragons */ +}; + +struct dll_ctlinfo { + union { + struct { + struct dllconfig *dctli_up_cfg; + u_char dctli_up_lsap; + } CTLI_UP; + struct { + caddr_t dctli_down_pcb; + struct rtentry *dctli_down_rt; + struct dllconfig *dctli_down_llconf; + } CTLI_DOWN; + } CTLIun; +}; +#define dlcti_cfg CTLIun.CTLI_UP.dctli_up_cfg +#define dlcti_lsap CTLIun.CTLI_UP.dctli_up_lsap +#define dlcti_pcb CTLIun.CTLI_DOWN.dctli_down_pcb +#define dlcti_rt CTLIun.CTLI_DOWN.dctli_down_rt +#define dlcti_conf CTLIun.CTLI_DOWN.dctli_down_llconf diff --git a/sys/netccitt/hd_debug.c b/sys/netccitt/hd_debug.c new file mode 100644 index 00000000000..b8a45a3f59a --- /dev/null +++ b/sys/netccitt/hd_debug.c @@ -0,0 +1,212 @@ +/* + * Copyright (c) University of British Columbia, 1984 + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Laboratory for Computation Vision and the Computer Science Department + * of the University of British Columbia. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)hd_debug.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#ifdef HDLCDEBUG +#define NTRACE 32 + +struct hdlctrace { + struct hdcb *ht_hdp; + short ht_dir; + struct mbuf *ht_frame; + struct timeval ht_time; +} hdtrace[NTRACE]; + +int lasttracelogged, freezetrace; +#endif + +hd_trace (hdp, direction, frame) +struct hdcb *hdp; +register struct Hdlc_frame *frame; +{ + register char *s; + register int nr, pf, ns, i; + struct Hdlc_iframe *iframe = (struct Hdlc_iframe *) frame; + +#ifdef HDLCDEBUG + hd_savetrace (hdp, direction, frame); +#endif + if (hdp -> hd_xcp -> xc_ltrace) { + if (direction == RX) + printf ("F-In: "); + else if (direction == 2) + printf ("F-Xmt: "); + else + printf ("F-Out: "); + + nr = iframe -> nr; + pf = iframe -> pf; + ns = iframe -> ns; + + switch (hd_decode (hdp, frame)) { + case SABM: + printf ("SABM : PF=%d\n", pf); + break; + + case DISC: + printf ("DISC : PF=%d\n", pf); + break; + + case DM: + printf ("DM : PF=%d\n", pf); + break; + + case FRMR: + { + register struct Frmr_frame *f = (struct Frmr_frame *)frame; + + printf ("FRMR : PF=%d, TEXT=", pf); + for (s = (char *) frame, i = 0; i < 5; ++i, ++s) + printf ("%x ", (int) * s & 0xff); + printf ("\n"); + printf ("control=%x v(s)=%d v(r)=%d w%d x%d y%d z%d\n", + f->frmr_control, f->frmr_ns, f->frmr_nr, + f->frmr_w, f->frmr_x, f->frmr_y, f->frmr_z); + break; + } + + case UA: + printf ("UA : PF=%d\n", pf); + break; + + case RR: + printf ("RR : N(R)=%d, PF=%d\n", nr, pf); + break; + + case RNR: + printf ("RNR : N(R)=%d, PF=%d\n", nr, pf); + break; + + case REJ: + printf ("REJ : N(R)=%d, PF=%d\n", nr, pf); + break; + + case IFRAME: + { + register struct mbuf *m; + register int len = 0; + + for(m = dtom (frame); m; m = m -> m_next) + len += m -> m_len; + len -= HDHEADERLN; + printf ("IFRAME : N(R)=%d, PF=%d, N(S)=%d, DATA(%d)=", + nr, pf, ns, len); + for (s = (char 
*)iframe->i_field, i = 0; i < 3; ++i, ++s) + printf ("%x ", (int) *s & 0xff); + printf ("\n"); + break; + } + + default: + printf ("ILLEGAL: "); + for (s = (char *) frame, i = 0; i < 5; ++i, ++s) + printf ("%x ", (int) *s & 0xff); + printf ("\n"); + } + + } +} + +#ifdef HDLCDEBUG +static +hd_savetrace (hdp, dir, frame) +struct hdcb *hdp; +struct Hdlc_frame *frame; +{ + register struct hdlctrace *htp; + register struct mbuf *m; + + if (freezetrace) + return; + htp = &hdtrace[lasttracelogged]; + lasttracelogged = (lasttracelogged + 1) % NTRACE; + if (m = htp->ht_frame) + m_freem (m); + m = dtom (frame); + htp->ht_frame = m_copy (m, 0, m->m_len); + htp->ht_hdp = hdp; + htp->ht_dir = dir; + htp->ht_time = time; +} + +hd_dumptrace (hdp) +struct hdcb *hdp; +{ + register int i, ltrace; + register struct hdlctrace *htp; + + freezetrace = 1; + hd_status (hdp); + printf ("retransmit queue:"); + for (i = 0; i < 8; i++) + printf (" %x", hdp -> hd_retxq[i]); + printf ("\n"); + ltrace = hdp -> hd_xcp -> xc_ltrace; + hdp -> hd_xcp -> xc_ltrace = 1; + for (i = 0; i < NTRACE; i++) { + htp = &hdtrace[(lasttracelogged + i) % NTRACE]; + if (htp->ht_hdp != hdp || htp->ht_frame == 0) + continue; + printf ("%d/%d ", htp->ht_time.tv_sec & 0xff, + htp->ht_time.tv_usec / 10000); + hd_trace (htp->ht_hdp, htp->ht_dir, + mtod (htp->ht_frame, struct Hdlc_frame *)); + m_freem (htp->ht_frame); + htp->ht_frame = 0; + } + hdp -> hd_xcp -> xc_ltrace = ltrace; + freezetrace = 0; +} +#endif diff --git a/sys/netccitt/hd_input.c b/sys/netccitt/hd_input.c new file mode 100644 index 00000000000..eb939d03199 --- /dev/null +++ b/sys/netccitt/hd_input.c @@ -0,0 +1,669 @@ +/* + * Copyright (c) University of British Columbia, 1984 + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * the Laboratory for Computation Vision and the Computer Science Department + * of the University of British Columbia. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)hd_input.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +static frame_reject(); +static rej_routine(); +static free_iframes(); +/* + * HDLC INPUT INTERFACE + * + * This routine is called when the HDLC physical device has + * completed reading a frame. + */ + +hdintr () +{ + register struct mbuf *m; + register struct hdcb *hdp; + register struct ifnet *ifp; + register int s; + static struct ifnet *lastifp; + static struct hdcb *lasthdp; + + for (;;) { + s = splimp (); + IF_DEQUEUE (&hdintrq, m); + splx (s); + if (m == 0) + break; + if (m->m_len < HDHEADERLN) { + printf ("hdintr: packet too short (len=%d)\n", + m->m_len); + m_freem (m); + continue; + } + if ((m->m_flags & M_PKTHDR) == 0) + panic("hdintr"); + ifp = m->m_pkthdr.rcvif; + + /* + * look up the appropriate hdlc control block + */ + + if (ifp == lastifp) + hdp = lasthdp; + else { + for (hdp = hdcbhead; hdp; hdp = hdp->hd_next) + if (hdp->hd_ifp == ifp) + break; + if (hdp == 0) { + printf ("hdintr: unknown interface %x\n", ifp); + m_freem (m); + continue; + } + lastifp = ifp; + lasthdp = hdp; + } + + /* Process_rxframe returns FALSE if the frame was NOT queued + for the next higher layers. */ + if (process_rxframe (hdp, m) == FALSE) + m_freem (m); + } +} + +process_rxframe (hdp, fbuf) +register struct hdcb *hdp; +register struct mbuf *fbuf; +{ + register int queued = FALSE, frametype, pf; + register struct Hdlc_frame *frame; + + frame = mtod (fbuf, struct Hdlc_frame *); + pf = ((struct Hdlc_iframe *) frame) -> pf; + + hd_trace (hdp, RX, frame); + if (frame -> address != ADDRESS_A && frame -> address != ADDRESS_B) + return (queued); + + switch ((frametype = hd_decode (hdp, frame)) + hdp->hd_state) { + case DM + DISC_SENT: + case UA + DISC_SENT: + /* + * Link now closed. 
Leave timer running + * so hd_timer() can periodically check the + * status of interface driver flag bit IFF_UP. + */ + hdp->hd_state = DISCONNECTED; + break; + + case DM + INIT: + case UA + INIT: + /* + * This is a non-standard state change needed for DCEs + * that do dynamic link selection. We can't go into the + * usual "SEND DM" state because a DM is a SARM in LAP. + */ + hd_writeinternal (hdp, SABM, POLLOFF); + hdp->hd_state = SABM_SENT; + SET_TIMER (hdp); + break; + + case SABM + DM_SENT: + case SABM + WAIT_SABM: + hd_writeinternal (hdp, UA, pf); + case UA + SABM_SENT: + case UA + WAIT_UA: + KILL_TIMER (hdp); + hd_initvars (hdp); + hdp->hd_state = ABM; + hd_message (hdp, "Link level operational"); + /* Notify the packet level - to send RESTART. */ + (void) pk_ctlinput (PRC_LINKUP, hdp->hd_pkp); + break; + + case SABM + SABM_SENT: + /* Got a SABM collision. Acknowledge the remote's SABM + via UA but still wait for UA. */ + hd_writeinternal (hdp, UA, pf); + break; + + case SABM + ABM: + /* Request to reset the link from the remote. */ + KILL_TIMER (hdp); + hd_message (hdp, "Link reset"); +#ifdef HDLCDEBUG + hd_dumptrace (hdp); +#endif + hd_flush (hdp->hd_ifp); + hd_writeinternal (hdp, UA, pf); + hd_initvars (hdp); + (void) pk_ctlinput (PRC_LINKRESET, hdp->hd_pkp); + hdp->hd_resets++; + break; + + case SABM + WAIT_UA: + hd_writeinternal (hdp, UA, pf); + break; + + case DM + ABM: + hd_message (hdp, "DM received: link down"); +#ifdef HDLCDEBUG + hd_dumptrace (hdp); +#endif + (void) pk_ctlinput (PRC_LINKDOWN, hdp->hd_pkp); + hd_flush (hdp->hd_ifp); + case DM + DM_SENT: + case DM + WAIT_SABM: + case DM + WAIT_UA: + hd_writeinternal (hdp, SABM, pf); + hdp->hd_state = SABM_SENT; + SET_TIMER (hdp); + break; + + case DISC + INIT: + case DISC + DM_SENT: + case DISC + SABM_SENT: + /* Note: This is a non-standard state change. 
*/ + hd_writeinternal (hdp, UA, pf); + hd_writeinternal (hdp, SABM, POLLOFF); + hdp->hd_state = SABM_SENT; + SET_TIMER (hdp); + break; + + case DISC + WAIT_UA: + hd_writeinternal (hdp, DM, pf); + SET_TIMER (hdp); + hdp->hd_state = DM_SENT; + break; + + case DISC + ABM: + hd_message (hdp, "DISC received: link down"); + (void) pk_ctlinput (PRC_LINKDOWN, hdp->hd_pkp); + case DISC + WAIT_SABM: + hd_writeinternal (hdp, UA, pf); + hdp->hd_state = DM_SENT; + SET_TIMER (hdp); + break; + + case UA + ABM: + hd_message (hdp, "UA received: link down"); + (void) pk_ctlinput (PRC_LINKDOWN, hdp->hd_pkp); + case UA + WAIT_SABM: + hd_writeinternal (hdp, DM, pf); + hdp->hd_state = DM_SENT; + SET_TIMER (hdp); + break; + + case FRMR + DM_SENT: + hd_writeinternal (hdp, SABM, pf); + hdp->hd_state = SABM_SENT; + SET_TIMER (hdp); + break; + + case FRMR + WAIT_SABM: + hd_writeinternal (hdp, DM, pf); + hdp->hd_state = DM_SENT; + SET_TIMER (hdp); + break; + + case FRMR + ABM: + hd_message (hdp, "FRMR received: link down"); + (void) pk_ctlinput (PRC_LINKDOWN, hdp->hd_pkp); +#ifdef HDLCDEBUG + hd_dumptrace (hdp); +#endif + hd_flush (hdp->hd_ifp); + hd_writeinternal (hdp, SABM, pf); + hdp->hd_state = WAIT_UA; + SET_TIMER (hdp); + break; + + case RR + ABM: + case RNR + ABM: + case REJ + ABM: + process_sframe (hdp, (struct Hdlc_sframe *)frame, frametype); + break; + + case IFRAME + ABM: + queued = process_iframe (hdp, fbuf, (struct Hdlc_iframe *)frame); + break; + + case IFRAME + SABM_SENT: + case RR + SABM_SENT: + case RNR + SABM_SENT: + case REJ + SABM_SENT: + hd_writeinternal (hdp, DM, POLLON); + hdp->hd_state = DM_SENT; + SET_TIMER (hdp); + break; + + case IFRAME + WAIT_SABM: + case RR + WAIT_SABM: + case RNR + WAIT_SABM: + case REJ + WAIT_SABM: + hd_writeinternal (hdp, FRMR, POLLOFF); + SET_TIMER (hdp); + break; + + case ILLEGAL + SABM_SENT: + hdp->hd_unknown++; + hd_writeinternal (hdp, DM, POLLOFF); + hdp->hd_state = DM_SENT; + SET_TIMER (hdp); + break; + + case ILLEGAL + ABM: + hd_message 
(hdp, "Unknown frame received: link down"); + (void) pk_ctlinput (PRC_LINKDOWN, hdp->hd_pkp); + case ILLEGAL + WAIT_SABM: + hdp->hd_unknown++; +#ifdef HDLCDEBUG + hd_dumptrace (hdp); +#endif + hd_writeinternal (hdp, FRMR, POLLOFF); + hdp->hd_state = WAIT_SABM; + SET_TIMER (hdp); + break; + } + + return (queued); +} + +process_iframe (hdp, fbuf, frame) +register struct hdcb *hdp; +struct mbuf *fbuf; +register struct Hdlc_iframe *frame; +{ + register int nr = frame -> nr, + ns = frame -> ns, + pf = frame -> pf; + register int queued = FALSE; + + /* + * Validate the iframe's N(R) value. It's N(R) value must be in + * sync with our V(S) value and our "last received nr". + */ + + if (valid_nr (hdp, nr, FALSE) == FALSE) { + frame_reject (hdp, Z, frame); + return (queued); + } + + + /* + * This section tests the IFRAME for proper sequence. That is, it's + * sequence number N(S) MUST be equal to V(S). + */ + + if (ns != hdp->hd_vr) { + hdp->hd_invalid_ns++; + if (pf || (hdp->hd_condition & REJ_CONDITION) == 0) { + hdp->hd_condition |= REJ_CONDITION; + /* + * Flush the transmit queue. This is ugly but we + * have no choice. A reject response must be + * immediately sent to the DCE. Failure to do so + * may result in another out of sequence iframe + * arriving (and thus sending another reject) + * before the first reject is transmitted. This + * will cause the DCE to receive two or more + * rejects back to back, which must never happen. + */ + hd_flush (hdp->hd_ifp); + hd_writeinternal (hdp, REJ, pf); + } + return (queued); + } + hdp->hd_condition &= ~REJ_CONDITION; + + /* + * This section finally tests the IFRAME's sequence number against + * the window size (K) and the sequence number of the last frame + * we have acknowledged. If the IFRAME is completely correct then + * it is queued for the packet level. 
+ */ + + if (ns != (hdp -> hd_lasttxnr + hdp -> hd_xcp -> xc_lwsize) % MODULUS) { + hdp -> hd_vr = (hdp -> hd_vr + 1) % MODULUS; + if (pf == 1) { + /* Must generate a RR or RNR with final bit on. */ + hd_writeinternal (hdp, RR, POLLON); + } else + /* + * Hopefully we can piggyback the RR, if not we will generate + * a RR when T3 timer expires. + */ + if (hdp -> hd_rrtimer == 0) + hdp->hd_rrtimer = hd_t3; + + /* Forward iframe to packet level of X.25. */ + fbuf -> m_data += HDHEADERLN; + fbuf -> m_len -= HDHEADERLN; + fbuf -> m_pkthdr.len -= HDHEADERLN; + fbuf -> m_pkthdr.rcvif = (struct ifnet *)hdp -> hd_pkp; +#ifdef BSD4_3 + fbuf->m_act = 0; /* probably not necessary */ +#else + { + register struct mbuf *m; + + for (m = fbuf; m -> m_next; m = m -> m_next) + m -> m_act = (struct mbuf *) 0; + m -> m_act = (struct mbuf *) 1; + } +#endif + pk_input (fbuf); + queued = TRUE; + hd_start (hdp); + } else { + /* + * Here if the remote station has transmitted more iframes than + * the number which have been acknowledged plus K. + */ + hdp->hd_invalid_ns++; + frame_reject (hdp, W, frame); + } + return (queued); +} + +/* + * This routine is used to determine if a value (the middle parameter) + * is between two other values. The low value is the first parameter + * the high value is the last parameter. The routine checks the middle + * value to see if it is within the range of the first and last values. + * The reason we need this routine is the values are modulo some base + * hence a simple test for greater or less than is not sufficient. + * + * Both ends are inclusive. When rear == front the range holds exactly + * one value, so only value == rear is accepted; when front < rear the + * range wraps around the modulus. Returns TRUE if value is in range, + * FALSE otherwise. + */ + +bool +range_check (rear, value, front) +int rear, + value, + front; +{ + register bool result = FALSE; + + /* + * front == rear must take the non-wrapping branch: the wrapping + * test (rear <= value) || (value <= front) is true for every value + * when the two ends coincide. + */ + if (front >= rear) + result = (rear <= value) && (value <= front); + else + result = (rear <= value) || (value <= front); + + return (result); +} + +/* + * This routine handles all the frame reject conditions which can + * arise as a result of secondary processing.
The frame reject + * condition Y (frame length error) are handled elsewhere. + */ + +static +frame_reject (hdp, rejectcode, frame) +struct hdcb *hdp; +struct Hdlc_iframe *frame; +{ + register struct Frmr_frame *frmr = &hd_frmr; + + frmr -> frmr_control = ((struct Hdlc_frame *) frame) -> control; + + frmr -> frmr_ns = frame -> ns; + frmr -> frmr_f1_0 = 0; + frmr -> frmr_nr = frame -> nr; + frmr -> frmr_f2_0 = 0; + + frmr -> frmr_0000 = 0; + frmr -> frmr_w = frmr -> frmr_x = frmr -> frmr_y = + frmr -> frmr_z = 0; + switch (rejectcode) { + case Z: + frmr -> frmr_z = 1;/* invalid N(R). */ + break; + + case Y: + frmr -> frmr_y = 1;/* iframe length error. */ + break; + + case X: + frmr -> frmr_x = 1;/* invalid information field. */ + frmr -> frmr_w = 1; + break; + + case W: + frmr -> frmr_w = 1;/* invalid N(S). */ + } + + hd_writeinternal (hdp, FRMR, POLLOFF); + + hdp->hd_state = WAIT_SABM; + SET_TIMER (hdp); +} + +/* + * This procedure is invoked when ever we receive a supervisor + * frame such as RR, RNR and REJ. All processing for these + * frames is done here. + */ + +process_sframe (hdp, frame, frametype) +register struct hdcb *hdp; +register struct Hdlc_sframe *frame; +int frametype; +{ + register int nr = frame -> nr, pf = frame -> pf, pollbit = 0; + + if (valid_nr (hdp, nr, pf) == TRUE) { + switch (frametype) { + case RR: + hdp->hd_condition &= ~REMOTE_RNR_CONDITION; + break; + + case RNR: + hdp->hd_condition |= REMOTE_RNR_CONDITION; + hdp->hd_retxcnt = 0; + break; + + case REJ: + hdp->hd_condition &= ~REMOTE_RNR_CONDITION; + rej_routine (hdp, nr); + } + + if (pf == 1) { + hdp->hd_retxcnt = 0; + hdp->hd_condition &= ~TIMER_RECOVERY_CONDITION; + + if (frametype == RR && hdp->hd_lastrxnr == hdp->hd_vs + && hdp->hd_timer == 0 && hdp->hd_txq.head == 0) + hd_writeinternal(hdp, RR, pf); + else + /* If any iframes have been queued because of the + timer condition, transmit then now. 
*/ + if (hdp->hd_condition & REMOTE_RNR_CONDITION) { + /* Remote is busy or timer condition, so only + send one. */ + if (hdp->hd_vs != hdp->hd_retxqi) + hd_send_iframe (hdp, hdp->hd_retxq[hdp->hd_vs], pollbit); + } + else /* Flush the retransmit list first. */ + while (hdp->hd_vs != hdp->hd_retxqi) + hd_send_iframe (hdp, hdp->hd_retxq[hdp->hd_vs], POLLOFF); + } + + hd_start (hdp); + } else + frame_reject (hdp, Z, (struct Hdlc_iframe *)frame); /* Invalid N(R). */ +} + +/* + * This routine tests the validity of the N(R) which we have received. + * If it is ok, then all the iframes which it acknowledges (if any) + * will be freed. + */ + +bool +valid_nr (hdp, nr, finalbit) +register struct hdcb *hdp; +register int finalbit; +{ + /* Make sure it really does acknowledge something. */ + if (hdp->hd_lastrxnr == nr) + return (TRUE); + + /* + * This section validates the frame's N(R) value. It's N(R) value + * must be in syncronization with our V(S) value and our "last + * received nr" variable. If it is correct then we are able to send + * more IFRAME's, else frame reject condition is entered. + */ + + if (range_check (hdp->hd_lastrxnr, nr, hdp->hd_vs) == FALSE) { + if ((hdp->hd_condition & TIMER_RECOVERY_CONDITION) && + range_check (hdp->hd_vs, nr, hdp->hd_xx) == TRUE) + hdp->hd_vs = nr; + + else { + hdp->hd_invalid_nr++; + return (FALSE); + } + } + + /* + * If we get to here, we do have a valid frame but it might be out + * of sequence. However, we should still accept the receive state + * number N(R) since it has already passed our previous test and it + * does acknowledge frames which we are sending. + */ + + KILL_TIMER (hdp); + free_iframes (hdp, &nr, finalbit);/* Free all acknowledged iframes */ + if (nr != hdp->hd_vs) + SET_TIMER (hdp); + + return (TRUE); +} + +/* + * This routine determines how many iframes need to be retransmitted. + * It then resets the Send State Variable V(S) to accomplish this. 
+ */ + +static +rej_routine (hdp, rejnr) +register struct hdcb *hdp; +register int rejnr; +{ + register int anchor; + + /* + * Flush the output queue. Any iframes queued for + * transmission will be out of sequence. + */ + + hd_flush (hdp->hd_ifp); + + /* + * Determine how many frames should be re-transmitted. In the case + * of a normal REJ this should be 1 to K. In the case of a timer + * recovery REJ (ie. a REJ with the Final Bit on) this could be 0. + */ + + anchor = hdp->hd_vs; + if (hdp->hd_condition & TIMER_RECOVERY_CONDITION) + anchor = hdp->hd_xx; + + anchor = (anchor - rejnr + 8) % MODULUS; + + if (anchor > 0) { + + /* There is at least one iframe to retransmit. */ + KILL_TIMER (hdp); + hdp->hd_vs = rejnr; + + while (hdp->hd_vs != hdp->hd_retxqi) + hd_send_iframe (hdp, hdp->hd_retxq[hdp->hd_vs], POLLOFF); + + } + hd_start (hdp); +} + +/* + * This routine frees iframes from the retransmit queue. It is called + * when a previously written iframe is acknowledged. + */ + +static +free_iframes (hdp, nr, finalbit) +register struct hdcb *hdp; +int *nr; +register int finalbit; + +{ + register int i, k; + + /* + * We need to do the following because of a funny quirk in the + * protocol. This case occures when in Timer recovery condition + * we get a N(R) which acknowledges all the outstanding iframes + * but with the Final Bit off. In this case we need to save the last + * iframe for possible retransmission even though it has already been + * acknowledged! + */ + + if ((hdp->hd_condition & TIMER_RECOVERY_CONDITION) && *nr == hdp->hd_xx && finalbit == 0) { + *nr = (*nr - 1 + 8) % MODULUS; +/* printf ("QUIRK\n"); */ + } + + k = (*nr - hdp->hd_lastrxnr + 8) % MODULUS; + + /* Loop here freeing all acknowledged iframes. 
*/ + for (i = 0; i < k; ++i) { + m_freem (hdp->hd_retxq[hdp->hd_lastrxnr]); + hdp->hd_retxq[hdp->hd_lastrxnr] = 0; + hdp->hd_lastrxnr = (hdp->hd_lastrxnr + 1) % MODULUS; + } + +} diff --git a/sys/netccitt/hd_output.c b/sys/netccitt/hd_output.c new file mode 100644 index 00000000000..05992e1deb1 --- /dev/null +++ b/sys/netccitt/hd_output.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) University of British Columbia, 1984 + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Laboratory for Computation Vision and the Computer Science Department + * of the University of British Columbia. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)hd_output.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +/* + * HDLC OUTPUT INTERFACE + * + * This routine is called when the X.25 packet layer output routine + * has an information frame (iframe) to write. It is also called + * by the input and control routines of the HDLC layer. + * + * hdp is the HDLC control block of the link; m0 is the packet to send + * and must be a non-null mbuf chain carrying a packet header (the + * routine panics otherwise). If the link is not in the ABM state the + * chain is freed and nothing is queued. Otherwise room for the HDLC + * header is prepended, the packet length is recomputed, and the chain + * is appended to the transmit queue and hd_start is called. + */ + +hd_output (hdp, m0) +register struct hdcb *hdp; +struct mbuf *m0; +{ + register struct mbuf *m = m0; + int len; + + if (m == NULL) + panic ("hd_output"); + if ((m->m_flags & M_PKTHDR) == 0) + panic ("hd_output 2"); + + if (hdp->hd_state != ABM) { + m_freem (m); + return; + } + + /* + * Make room for the hdlc header either by prepending + * another mbuf, or by adjusting the offset and length + * of the first mbuf in the mbuf chain. + */ + + M_PREPEND(m, HDHEADERLN, M_DONTWAIT); + if (m == NULL) + return; + /* + * M_PREPEND may have put a new mbuf at the head of the chain, + * so remember the current head; the old m0 may now be the second + * mbuf and no longer carry the packet header. + */ + m0 = m; + for (len = 0; m; m = m->m_next) + len += m->m_len; + m = m0; + m->m_pkthdr.len = len; + + hd_append (&hdp->hd_txq, m); + hd_start (hdp); +} + +hd_start (hdp) +register struct hdcb *hdp; +{ + register struct mbuf *m; + + /* + * The iframe is only transmitted if all these conditions are FALSE. + * The iframe remains queued (hdp->hd_txq) however and will be + * transmitted as soon as these conditions are cleared.
+ */ + + while (!(hdp->hd_condition & (TIMER_RECOVERY_CONDITION | REMOTE_RNR_CONDITION | REJ_CONDITION))) { + if (hdp->hd_vs == (hdp->hd_lastrxnr + hdp->hd_xcp->xc_lwsize) % MODULUS) { + + /* We have now exceeded the maximum number of + outstanding iframes. Therefore, we must wait + until at least one is acknowledged if this + condition is not turned off before we are + requested to write another iframe. */ + hdp->hd_window_condition++; + break; + } + + /* hd_remove top iframe from transmit queue. */ + if ((m = hd_remove (&hdp->hd_txq)) == NULL) + break; + + hd_send_iframe (hdp, m, POLLOFF); + } +} + +/* + * This procedure is passed a buffer descriptor for an iframe. It builds + * the rest of the control part of the frame and then writes it out. It + * also starts the acknowledgement timer and keeps the iframe in the + * Retransmit queue (Retxq) just in case we have to do this again. + * + * Note: This routine is also called from hd_input.c when retransmission + * of old frames is required. + */ + +hd_send_iframe (hdp, buf, poll_bit) +register struct hdcb *hdp; +register struct mbuf *buf; +int poll_bit; +{ + register struct Hdlc_iframe *iframe; + struct mbuf *m; + + KILL_TIMER (hdp); + + if (buf == 0) { + printf ("hd_send_iframe: zero arg\n"); +#ifdef HDLCDEBUG + hd_status (hdp); + hd_dumptrace (hdp); +#endif + hdp->hd_vs = (hdp->hd_vs + 7) % MODULUS; + return; + } + iframe = mtod (buf, struct Hdlc_iframe *); + + iframe -> hdlc_0 = 0; + iframe -> nr = hdp->hd_vr; + iframe -> pf = poll_bit; + iframe -> ns = hdp->hd_vs; + iframe -> address = ADDRESS_B; + hdp->hd_lasttxnr = hdp->hd_vr; + hdp->hd_rrtimer = 0; + + if (hdp->hd_vs == hdp->hd_retxqi) { + /* Check for retransmissions. */ + /* Put iframe only once in the Retransmission queue. 
*/ + hdp->hd_retxq[hdp->hd_retxqi] = buf; + hdp->hd_retxqi = (hdp->hd_retxqi + 1) % MODULUS; + hdp->hd_iframes_out++; + } + + hdp->hd_vs = (hdp->hd_vs + 1) % MODULUS; + + hd_trace (hdp, TX, (struct Hdlc_frame *)iframe); + + /* Write buffer on device. */ + m = hdp->hd_dontcopy ? buf : m_copy(buf, 0, (int)M_COPYALL); + if (m == 0) { + printf("hdlc: out of mbufs\n"); + /* + * The timer was killed on entry. Restart it so that the + * frame, which stays on the retransmit queue, is retried + * when the timer expires instead of stalling the link. + */ + SET_TIMER (hdp); + return; + } + (*hdp->hd_output)(hdp, m); + SET_TIMER (hdp); +} + +/* + * Hand a frame to the interface send queue of hdp->hd_ifp. If the + * queue is full the frame is counted as dropped and freed; otherwise + * it is enqueued and the interface start routine is called unless the + * interface is already marked active. Runs at splimp. + */ + +hd_ifoutput(hdp, m) +register struct mbuf *m; +register struct hdcb *hdp; +{ + /* + * Queue message on interface, and start output if interface + * not yet active. + */ + register struct ifnet *ifp = hdp->hd_ifp; + int s = splimp(); + + if (IF_QFULL(&ifp->if_snd)) { + IF_DROP(&ifp->if_snd); + /* printf("%s%d: HDLC says OK to send but queue full, may hang\n", + ifp->if_name, ifp->if_unit);*/ + m_freem(m); + } else { + IF_ENQUEUE(&ifp->if_snd, m); + if ((ifp->if_flags & IFF_OACTIVE) == 0) + (*ifp->if_start)(ifp); + } + splx(s); +} + + +/* + * This routine gets control when the timer expires because we have not + * received an acknowledgement for an iframe. + */ + +hd_resend_iframe (hdp) +register struct hdcb *hdp; +{ + + if (hdp->hd_retxcnt++ < hd_n2) { + if (!(hdp->hd_condition & TIMER_RECOVERY_CONDITION)) { + hdp->hd_xx = hdp->hd_vs; + hdp->hd_condition |= TIMER_RECOVERY_CONDITION; + } + + hdp->hd_vs = hdp->hd_lastrxnr; + hd_send_iframe (hdp, hdp->hd_retxq[hdp->hd_vs], POLLON); + } else { + /* At this point we have not received a RR even after N2 + retries - attempt to reset link.
*/ + + hd_initvars (hdp); + hd_writeinternal (hdp, SABM, POLLOFF); + hdp->hd_state = WAIT_UA; + SET_TIMER (hdp); + hd_message (hdp, "Timer recovery failed: link down"); + (void) pk_ctlinput (PRC_LINKDOWN, hdp->hd_pkp); + } +} diff --git a/sys/netccitt/hd_subr.c b/sys/netccitt/hd_subr.c new file mode 100644 index 00000000000..c75ab07568e --- /dev/null +++ b/sys/netccitt/hd_subr.c @@ -0,0 +1,391 @@ +/* + * Copyright (c) University of British Columbia, 1984 + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Laboratory for Computation Vision and the Computer Science Department + * of the University of British Columbia. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)hd_subr.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +hd_init () +{ + + hdintrq.ifq_maxlen = IFQ_MAXLEN; +} + +hd_ctlinput (prc, addr) +struct sockaddr *addr; +{ + register struct x25config *xcp = (struct x25config *)addr; + register struct hdcb *hdp; + register struct ifaddr *ifa; + struct ifnet *ifp; + caddr_t pk_newlink(); + + if (addr->sa_family != AF_CCITT) + return (EAFNOSUPPORT); + if (xcp->xc_lptype != HDLCPROTO_LAPB) + return (EPROTONOSUPPORT); + ifa = ifa_ifwithaddr(addr); + if (ifa == 0 || ifa->ifa_addr->sa_family != AF_CCITT || + (ifp = ifa->ifa_ifp) == 0) + panic ("hd_ctlinput"); + for (hdp = hdcbhead; hdp; hdp = hdp->hd_next) + if (hdp->hd_ifp == ifp) + break; + + if (hdp == 0) { /* new interface */ + int error, hd_ifoutput(), hd_output(); + + /* an hdcb is now too big to fit in an mbuf */ + MALLOC(hdp, struct hdcb *, sizeof (*hdp), M_PCB, M_DONTWAIT); + if (hdp == 0) + return (ENOBUFS); + bzero((caddr_t)hdp, sizeof(*hdp)); + hdp->hd_pkp = + (caddr_t) pk_newlink ((struct x25_ifaddr *) ifa, + (caddr_t) hdp); + ((struct x25_ifaddr *)ifa)->ia_pkcb = + (struct pkcb *) hdp->hd_pkp; + if (hdp -> hd_pkp == 0) { + free(hdp, M_PCB); + return (ENOBUFS); + } + hdp->hd_ifp = ifp; + hdp->hd_ifa = ifa; + hdp->hd_xcp = xcp; + hdp->hd_state = INIT; + hdp->hd_output = hd_ifoutput; + 
hdp->hd_next = hdcbhead; + hdcbhead = hdp; + } else if (hdp->hd_pkp == 0) { /* interface got reconfigured */ + hdp->hd_pkp = + (caddr_t) pk_newlink ((struct x25_ifaddr *) ifa, + (caddr_t) hdp); + ((struct x25_ifaddr *)ifa)->ia_pkcb = + (struct pkcb *) hdp->hd_pkp; + if (hdp -> hd_pkp == 0) { + /* + * Do not free hdp here: it was found on the hdcbhead + * list and is still linked there, so releasing it would + * leave a dangling list entry. Leave hd_pkp zero so a + * later call can retry. + */ + return (ENOBUFS); + } + } + + switch (prc) { + case PRC_IFUP: + if (xcp->xc_lwsize == 0 || + xcp->xc_lwsize > MAX_WINDOW_SIZE) + xcp->xc_lwsize = MAX_WINDOW_SIZE; + if (hdp->hd_state == INIT) + SET_TIMER (hdp); + break; + + case PRC_IFDOWN: + if (hdp->hd_state == ABM) + hd_message (hdp, "Operator shutdown: link closed"); + (void) pk_ctlinput (PRC_LINKDOWN, hdp->hd_pkp); + + /* fall thru to ... */ + + case PRC_DISCONNECT_REQUEST: + /* drop reference to pkcb --- it's dead meat */ + hdp->hd_pkp = (caddr_t) 0; + ((struct x25_ifaddr *)ifa)->ia_pkcb = (struct pkcb *) 0; + + hd_writeinternal (hdp, DISC, POLLON); + hdp->hd_state = DISC_SENT; + SET_TIMER (hdp); + } + return (0); +} + +hd_initvars (hdp) +register struct hdcb *hdp; +{ + register struct mbuf *m; + register int i; + + /* Clear Transmit queue. */ + while ((m = hd_remove (&hdp->hd_txq)) != NULL) + m_freem (m); + + /* Clear Retransmit queue.
*/ + i = hdp->hd_lastrxnr; + while (i != hdp->hd_retxqi) { + m_freem (hdp->hd_retxq[i]); + /* Do not leave a pointer to a freed mbuf in the slot. */ + hdp->hd_retxq[i] = 0; + i = (i + 1) % MODULUS; + } + hdp->hd_retxqi = 0; + + hdp->hd_vs = hdp->hd_vr = 0; + hdp->hd_lasttxnr = hdp->hd_lastrxnr = 0; + hdp->hd_rrtimer = 0; + KILL_TIMER(hdp); + hdp->hd_retxcnt = 0; + hdp->hd_condition = 0; +} + +/* + * Classify a received frame from its control field and bump the + * matching input counter in hdp. Returns IFRAME, RR, RNR, REJ, DM, + * SABM, DISC, UA or FRMR, or ILLEGAL when no known encoding matches. + */ + +hd_decode (hdp, frame) +register struct hdcb *hdp; +struct Hdlc_frame *frame; +{ + register int frametype = ILLEGAL; + register struct Hdlc_iframe *iframe = (struct Hdlc_iframe *) frame; + register struct Hdlc_sframe *sframe = (struct Hdlc_sframe *) frame; + register struct Hdlc_uframe *uframe = (struct Hdlc_uframe *) frame; + + if (iframe -> hdlc_0 == 0) { + frametype = IFRAME; + hdp->hd_iframes_in++; + } + + else if (sframe -> hdlc_01 == 1) { + /* Supervisory format. */ + switch (sframe -> s2) { + case 0: + frametype = RR; + hdp->hd_rrs_in++; + break; + + case 1: + frametype = RNR; + hdp->hd_rnrs_in++; + break; + + case 2: + frametype = REJ; + hdp->hd_rejs_in++; + } + } + else if (uframe -> hdlc_11 == 3) { + /* Unnumbered format. */ + switch (uframe -> m3) { + case 0: + frametype = DM; + break; + + case 1: + frametype = SABM; + break; + + case 2: + frametype = DISC; + break; + + case 3: + frametype = UA; + break; + + case 4: + frametype = FRMR; + hdp->hd_frmrs_in++; + } + } + return (frametype); +} + +/* + * This routine is called when the HDLC layer internally generates a + * command or response for the remote machine ( eg. RR, UA etc. ). + * Only supervisory or unnumbered frames are processed.
+ */ + +hd_writeinternal (hdp, frametype, pf) +register struct hdcb *hdp; +register int frametype, pf; +{ + register struct mbuf *buf; + struct Hdlc_frame *frame; + register struct Hdlc_sframe *sframe; + register struct Hdlc_uframe *uframe; + + MGETHDR (buf, M_DONTWAIT, MT_HEADER); + if (buf == 0) + return; + frame = mtod (buf, struct Hdlc_frame *); + sframe = mtod (buf, struct Hdlc_sframe *); + uframe = mtod (buf, struct Hdlc_uframe *); + + /* Assume a response - address structure for DTE */ + frame -> address = ADDRESS_A; + buf -> m_len = 2; + buf -> m_act = buf -> m_next = NULL; + + switch (frametype) { + case RR: + frame -> control = RR_CONTROL; + hdp->hd_rrs_out++; + break; + + case RNR: + frame -> control = RNR_CONTROL; + hdp->hd_rnrs_out++; + break; + + case REJ: + frame -> control = REJ_CONTROL; + hdp->hd_rejs_out++; + break; + + case SABM: + frame -> control = SABM_CONTROL; + frame -> address = ADDRESS_B; + break; + + case DISC: + if ((hdp->hd_ifp->if_flags & IFF_UP) == 0) { + hdp->hd_state = DISCONNECTED; + (void) m_freem (buf); + hd_flush (hdp->hd_ifp); + return; + } + frame -> control = DISC_CONTROL; + frame -> address = ADDRESS_B; + break; + + case DM: + frame -> control = DM_CONTROL; + break; + + case UA: + frame -> control = UA_CONTROL; + break; + + case FRMR: + frame -> control = FRMR_CONTROL; + bcopy ((caddr_t)&hd_frmr, (caddr_t)frame -> info, 3); + buf -> m_len = 5; + hdp->hd_frmrs_out++; + + } + + if (sframe -> hdlc_01 == 1) { + /* Supervisory format - RR, REJ, or RNR. 
*/ + sframe -> nr = hdp->hd_vr; + sframe -> pf = pf; + hdp->hd_lasttxnr = hdp->hd_vr; + hdp->hd_rrtimer = 0; + } + else + uframe -> pf = pf; + + hd_trace (hdp, TX, frame); + buf -> m_pkthdr.len = buf -> m_len; + (*hdp->hd_output) (hdp, buf); +} + +struct mbuf * +hd_remove (q) +struct hdtxq *q; +{ + register struct mbuf *m; + + m = q -> head; + if (m) { + if ((q -> head = m -> m_act) == NULL) + q -> tail = NULL; + m -> m_act = 0; + } + return (m); +} + +hd_append (q, m) +register struct hdtxq *q; +register struct mbuf *m; +{ + + m -> m_act = NULL; + if (q -> tail == NULL) + q -> head = m; + else + q -> tail -> m_act = m; + q -> tail = m; +} + +hd_flush (ifp) +struct ifnet *ifp; +{ + register struct mbuf *m; + register int s; + + while (1) { + s = splimp (); + IF_DEQUEUE (&ifp->if_snd, m); + splx (s); + if (m == 0) + break; + m_freem (m); + } +} + +hd_message (hdp, msg) +struct hdcb *hdp; +char *msg; +{ + char *format_ntn (); + + if (hdcbhead -> hd_next) + printf ("HDLC(%s): %s\n", format_ntn (hdp->hd_xcp), msg); + else + printf ("HDLC: %s\n", msg); +} + +#ifdef HDLCDEBUG +hd_status (hdp) +struct hdcb *hdp; +{ + printf ("HDLC STATUS:\n V(S)=%d, V(R)=%d, retxqi=%d,\n", + hdp->hd_vs, hdp->hd_vr, hdp->hd_retxqi); + + printf ("Last_rx_nr=%d, Last_tx_nr=%d,\n Condition=%d, Xx=%d\n", + hdp->hd_lastrxnr, hdp->hd_lasttxnr, hdp->hd_condition, hdp->hd_xx); +} +#endif diff --git a/sys/netccitt/hd_timer.c b/sys/netccitt/hd_timer.c new file mode 100644 index 00000000000..a3bf12addf0 --- /dev/null +++ b/sys/netccitt/hd_timer.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) University of British Columbia, 1984 + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Laboratory for Computation Vision and the Computer Science Department + * of the University of British Columbia. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * @(#)hd_timer.c 8.1 (Berkeley) 6/10/93
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+
+/*
+ * these can be patched with adb if the
+ * default values are inappropriate
+ */
+
+int hd_t1 = T1;	/* value SET_TIMER() loads into hd_timer */
+int hd_t3 = T3;	/* not referenced in this file */
+int hd_n2 = N2;	/* retry limit compared against hd_retxcnt below */
+
+/*
+ * HDLC TIMER
+ *
+ * This routine is called every 500ms by the kernel. Decrement timer by this
+ * amount - if expired then process the event.
+ *
+ * Every control block on the hdcbhead list is visited with the
+ * priority raised by splimp() for the whole walk.  Two counters are
+ * kept per link: hd_rrtimer, which on expiry sends an RR if the last
+ * N(R) transmitted differs from V(R), and hd_timer, which on expiry
+ * drives the per-state action in the switch below and is then
+ * re-armed with SET_TIMER().  A counter whose value is already zero
+ * is not running and is left alone.
+ */
+
+hd_timer ()
+{
+	register struct hdcb *hdp;
+	register int s = splimp ();
+
+	for (hdp = hdcbhead; hdp; hdp = hdp->hd_next) {
+		if (hdp->hd_rrtimer && (--hdp->hd_rrtimer == 0)) {
+			if (hdp->hd_lasttxnr != hdp->hd_vr)
+				hd_writeinternal (hdp, RR, POLLOFF);
+		}
+
+		/* Skip this link unless hd_timer was running and just hit 0. */
+		if (!(hdp->hd_timer && --hdp->hd_timer == 0))
+			continue;
+
+		switch (hdp->hd_state) {
+		case INIT:
+		case DISC_SENT:
+			hd_writeinternal (hdp, DISC, POLLON);
+			break;
+
+		case ABM:
+			if (hdp->hd_lastrxnr != hdp->hd_vs) { /* XXX */
+				hdp->hd_timeouts++;
+				hd_resend_iframe (hdp);
+			}
+			break;
+
+		case WAIT_SABM:
+			/* Repeat the FRMR; after hd_n2 tries send SABM ourselves. */
+			hd_writeinternal (hdp, FRMR, POLLOFF);
+			if (++hdp->hd_retxcnt == hd_n2) {
+				hdp->hd_retxcnt = 0;
+				hd_writeinternal (hdp, SABM, POLLOFF);
+				hdp->hd_state = WAIT_UA;
+			}
+			break;
+
+		case DM_SENT:
+			if (++hdp->hd_retxcnt == hd_n2) {
+				/* Notify the packet level. */
+				(void) pk_ctlinput (PRC_LINKDOWN, hdp->hd_pkp);
+				hdp->hd_retxcnt = 0;
+				hdp->hd_state = SABM_SENT;
+				hd_writeinternal (hdp, SABM, POLLOFF);
+			} else
+				hd_writeinternal (hdp, DM, POLLOFF);
+			break;
+
+		case WAIT_UA:
+			/* Repeat the SABM; after hd_n2 tries fall back to DM. */
+			if (++hdp->hd_retxcnt == hd_n2) {
+				hdp->hd_retxcnt = 0;
+				hd_writeinternal (hdp, DM, POLLOFF);
+				hdp->hd_state = DM_SENT;
+			} else
+				hd_writeinternal (hdp, SABM, POLLOFF);
+			break;
+
+		case SABM_SENT:
+			/* Do this indefinitely. */
+			hd_writeinternal (hdp, SABM, POLLON);
+			break;
+
+		case DISCONNECTED:
+			/*
+			 * Poll the interface driver flags waiting
+			 * for the IFF_UP bit to come on.
+			 */
+			if (hdp->hd_ifp->if_flags & IFF_UP)
+				hdp->hd_state = INIT;
+
+		}
+		SET_TIMER (hdp);	/* re-arm for the next expiry */
+	}
+
+	splx (s);
+}
diff --git a/sys/netccitt/hd_var.h b/sys/netccitt/hd_var.h
new file mode 100644
index 00000000000..5fefe0869a4
--- /dev/null
+++ b/sys/netccitt/hd_var.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) University of British Columbia, 1984
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Laboratory for Computation Vision and the Computer Science Department
+ * of the University of British Columbia.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)hd_var.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * + * hdlc control block + * + */ + +struct hdtxq { + struct mbuf *head; + struct mbuf *tail; +}; + +struct hdcb { + struct hdcb *hd_next; /* pointer to next hdlc control block */ + char hd_state; /* link state */ + char hd_vs; /* send state variable */ + char hd_vr; /* receive state variable */ + char hd_lastrxnr; /* last received N(R) */ + char hd_lasttxnr; /* last transmitted N(R) */ + char hd_condition; +#define TIMER_RECOVERY_CONDITION 0x01 +#define REJ_CONDITION 0x02 +#define REMOTE_RNR_CONDITION 0X04 + char hd_retxcnt; + char hd_xx; + struct hdtxq hd_txq; + struct mbuf *hd_retxq[MODULUS]; + char hd_retxqi; + char hd_rrtimer; + char hd_timer; +#define SET_TIMER(hdp) hdp->hd_timer = hd_t1 +#define KILL_TIMER(hdp) hdp->hd_timer = 0 + char hd_dontcopy; /* if-driver doesn't free I-frames */ + struct ifnet *hd_ifp; /* device's network visible interface */ + struct ifaddr *hd_ifa; /* device's X.25 network address */ + struct x25config *hd_xcp; + caddr_t hd_pkp; /* Level III junk */ + int (*hd_output)(); /* separate entry for HDLC direct output */ + + /* link statistics */ + + long hd_iframes_in; + long hd_iframes_out; + long hd_rrs_in; + long hd_rrs_out; + short hd_rejs_in; + short hd_rejs_out; + long hd_window_condition; + short hd_invalid_ns; + short hd_invalid_nr; + short hd_timeouts; + short hd_resets; + short hd_unknown; + short hd_frmrs_in; + short hd_frmrs_out; + short 
hd_rnrs_in; + short hd_rnrs_out; +}; + +#ifdef KERNEL +struct hdcb *hdcbhead; /* head of linked list of hdcb's */ +struct Frmr_frame hd_frmr; /* rejected frame diagnostic info */ +struct ifqueue hdintrq; /* hdlc packet input queue */ + +int hd_t1; /* timer T1 value */ +int hd_t3; /* RR send timer */ +int hd_n2; /* frame retransmission limit */ +#endif diff --git a/sys/netccitt/hdlc.h b/sys/netccitt/hdlc.h new file mode 100644 index 00000000000..60cf7adf072 --- /dev/null +++ b/sys/netccitt/hdlc.h @@ -0,0 +1,156 @@ +/*- + * Copyright (c) University of British Columbia, 1984 + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by the + * Laboratory for Computation Vision and the Computer Science Department + * of the University of British Columbia. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)hdlc.h 8.1 (Berkeley) 6/10/93 + */ + +#ifndef ORDER4 +#define FALSE 0 +#define TRUE 1 +typedef u_char octet; +typedef char bool; + +/* + * HDLC Packet format definitions + * This will eventually have to be rewritten without reference + * to bit fields, to be compliant with ANSI C and alignment safe. 
+ */ + +#if BYTE_ORDER == BIG_ENDIAN +#define ORDER4(a, b, c, d) a , b , c , d +#define ORDER5(a, b, c, d, e) a , b , c , d , e +#endif + +#if BYTE_ORDER == LITTLE_ENDIAN +#define ORDER4(a, b, c, d) d , c , b , a +#define ORDER5(a, b, c, d, e) e , d , c , b , a +#endif +#endif + +#define MAX_INFO_LEN 4096+3+4 +#define ADDRESS_A 3 /* B'00000011' */ +#define ADDRESS_B 1 /* B'00000001' */ + +struct Hdlc_iframe { + octet address; + octet ORDER4(nr:3, pf:1, ns:3, hdlc_0:1); + octet i_field[MAX_INFO_LEN]; +}; + +struct Hdlc_sframe { + octet address; + octet ORDER4(nr:3, pf:1, s2:2, hdlc_01:2); +}; + +struct Hdlc_uframe { + octet address; + octet ORDER4(m3:3, pf:1, m2:2, hdlc_11:2); +}; + +struct Frmr_frame { + octet address; + octet control; + octet frmr_control; + octet ORDER4(frmr_nr:3, frmr_f1_0:1, frmr_ns:3, frmr_f2_0:1); + octet ORDER5(frmr_0000:4, frmr_z:1, frmr_y:1, frmr_x:1, frmr_w:1); +}; + +#define HDHEADERLN 2 +#define MINFRLN 2 /* Minimum frame length. */ + +struct Hdlc_frame { + octet address; + octet control; + octet info[3]; /* min for FRMR */ +}; + +#define SABM_CONTROL 057 /* B'00101111' */ +#define UA_CONTROL 0143 /* B'01100011' */ +#define DISC_CONTROL 0103 /* B'01000011' */ +#define DM_CONTROL 017 /* B'00001111' */ +#define FRMR_CONTROL 0207 /* B'10000111' */ +#define RR_CONTROL 01 /* B'00000001' */ +#define RNR_CONTROL 05 /* B'00000101' */ +#define REJ_CONTROL 011 /* B'00001001' */ + +#define POLLOFF 0 +#define POLLON 1 + +/* Define Link State constants. */ + +#define INIT 0 +#define DM_SENT 1 +#define SABM_SENT 2 +#define ABM 3 +#define WAIT_SABM 4 +#define WAIT_UA 5 +#define DISC_SENT 6 +#define DISCONNECTED 7 +#define MAXSTATE 8 + +/* The following constants are used in a switch statement to process + frames read from the communications line. 
*/ + +#define SABM 0 * MAXSTATE +#define DM 1 * MAXSTATE +#define DISC 2 * MAXSTATE +#define UA 3 * MAXSTATE +#define FRMR 4 * MAXSTATE +#define RR 5 * MAXSTATE +#define RNR 6 * MAXSTATE +#define REJ 7 * MAXSTATE +#define IFRAME 8 * MAXSTATE +#define ILLEGAL 9 * MAXSTATE + +#define T1 (3 * PR_SLOWHZ) /* IFRAME TIMEOUT - 3 seconds */ +#define T3 (T1 / 2) /* RR generate timeout - 1.5 seconds */ +#define N2 10 +#define MODULUS 8 +#define MAX_WINDOW_SIZE 7 + +#define Z 0 +#define Y 1 +#define X 2 +#define W 3 +#define A 4 + +#define TX 0 +#define RX 1 + +bool range_check (); +bool valid_nr (); +struct mbuf *hd_remove (); diff --git a/sys/netccitt/if_x25subr.c b/sys/netccitt/if_x25subr.c new file mode 100644 index 00000000000..6f00496a18b --- /dev/null +++ b/sys/netccitt/if_x25subr.c @@ -0,0 +1,801 @@ +/* + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)if_x25subr.c 8.1 (Berkeley) 6/10/93
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#ifdef INET
+#include
+#include
+#endif
+
+#ifdef NS
+#include
+#include
+#endif
+
+#ifdef ISO
+int tp_incoming();
+#include
+#include
+#include
+#endif
+
+extern struct ifnet loif;
+struct llinfo_x25 llinfo_x25 = {&llinfo_x25, &llinfo_x25};	/* list head, linked to itself */
+#ifndef _offsetof
+#define _offsetof(t, m) ((int)((caddr_t)&((t *)0)->m))
+#endif
+struct sockaddr *x25_dgram_sockmask;	/* set on first call of x25_rtrequest() */
+struct sockaddr_x25 x25_dgmask = {
+ _offsetof(struct sockaddr_x25, x25_udata[1]), /* _len */
+ 0, /* _family */
+ 0, /* _net */
+ { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* _addr */
+ {0}, /* opts */
+ -1, /* _udlen */
+ {-1} /* _udata */
+};
+
+struct if_x25stats {
+	int ifx_wrongplen;	/* m_pkthdr.len disagreed with the summed m_len */
+	int ifx_nophdr;		/* output chain arrived without M_PKTHDR */
+} if_x25stats;
+int x25_autoconnect = 0;	/* when set, x25_ifoutput() calls pk_connect() */
+
+#define senderr(x) {error = x; goto bad;}
+/*
+ * Ancillary routines
+ *
+ * x25_lxalloc() creates a zeroed llinfo_x25 for route rt, taking a
+ * reference on rt.  It is queued after the route's existing entry if
+ * there is one; otherwise it becomes rt_llinfo and is queued after
+ * the global llinfo_x25 head.  lx_ia is set to the last AF_CCITT
+ * address found on the route's interface.  Returns 0 when the
+ * M_NOWAIT allocation fails.
+ *
+ * x25_lxfree() undoes this: it disconnects any attached circuit,
+ * repairs rt_llinfo, drops the route reference and frees the entry.
+ */
+static struct llinfo_x25 *
+x25_lxalloc(rt)
+register struct rtentry *rt;
+{
+	register struct llinfo_x25 *lx;
+	register struct sockaddr *dst = rt_key(rt);
+	register struct ifaddr *ifa;
+
+	MALLOC(lx, struct llinfo_x25 *, sizeof (*lx), M_PCB, M_NOWAIT);
+	if (lx == 0)
+		return lx;
+	Bzero(lx, sizeof(*lx));
+	lx->lx_rt = rt;
+	lx->lx_family = dst->sa_family;
+	rt->rt_refcnt++;
+	if (rt->rt_llinfo)
+		insque(lx, (struct llinfo_x25 *)rt->rt_llinfo);
+	else {
+		rt->rt_llinfo = (caddr_t)lx;
+		insque(lx, &llinfo_x25);
+	}
+	for (ifa = rt->rt_ifp->if_addrlist; ifa; ifa = ifa->ifa_next) {
+		if (ifa->ifa_addr->sa_family == AF_CCITT)
+			lx->lx_ia = (struct x25_ifaddr *)ifa;
+	}
+	return lx;
+}
+x25_lxfree(lx)
+register struct llinfo_x25 *lx;
+{
+	register struct rtentry *rt = lx->lx_rt;
+	register struct pklcd *lcp = lx->lx_lcd;
+
+	if (lcp) {
+		lcp->lcd_upper = 0;	/* no upcall for this disconnect */
+		pk_disconnect(lcp);
+	}
+	/*
+	 * If lx was the route's first entry and the next one on the
+	 * list belongs to the same route, promote it; in every other
+	 * case rt_llinfo is cleared.
+	 */
+	if ((rt->rt_llinfo == (caddr_t)lx) && (lx->lx_next->lx_rt == rt))
+		rt->rt_llinfo = (caddr_t)lx->lx_next;
+	else
+		rt->rt_llinfo = 0;
+	RTFREE(rt);
+	remque(lx);
+	FREE(lx, M_PCB);
+}
+/*
+ * Process a x25 packet as datagram;
+ *
+ * Installed as lcd_upper once a call is accepted.  A null m, or a
+ * circuit that is not in DATA_TRANSFER, is handed to
+ * x25_connect_callback() as a refusal.  Otherwise an MT_DATA packet is
+ * queued on the input queue of the protocol family recorded in the
+ * llinfo_x25 (lcd_upnext) and that protocol's software interrupt is
+ * scheduled; anything else is freed.
+ */
+x25_ifinput(lcp, m)
+struct pklcd *lcp;
+register struct mbuf *m;
+{
+	struct llinfo_x25 *lx = (struct llinfo_x25 *)lcp->lcd_upnext;
+	register struct ifnet *ifp;
+	struct ifqueue *inq;
+	extern struct timeval time;
+	int s, len, isr;	/* NOTE(review): len is never used */
+
+	if (m == 0 || lcp->lcd_state != DATA_TRANSFER) {
+		x25_connect_callback(lcp, 0);
+		return;
+	}
+	pk_flowcontrol(lcp, 0, 1); /* Generate RR */
+	ifp = m->m_pkthdr.rcvif;
+	ifp->if_lastchange = time;
+	switch (m->m_type) {
+	default:
+		if (m)
+			m_freem(m);
+		return;
+
+	case MT_DATA:
+		/* FALLTHROUGH */;
+	}
+	switch (lx->lx_family) {
+#ifdef INET
+	case AF_INET:
+		isr = NETISR_IP;
+		inq = &ipintrq;
+		break;
+
+#endif
+#ifdef NS
+	case AF_NS:
+		isr = NETISR_NS;
+		inq = &nsintrq;
+		break;
+
+#endif
+#ifdef ISO
+	case AF_ISO:
+		isr = NETISR_ISO;
+		inq = &clnlintrq;
+		break;
+#endif
+	default:
+		m_freem(m);
+		ifp->if_noproto++;
+		return;
+	}
+	s = splimp();
+	schednetisr(isr);
+	if (IF_QFULL(inq)) {
+		IF_DROP(inq);
+		m_freem(m);
+	} else {
+		IF_ENQUEUE(inq, m);
+		ifp->if_ibytes += m->m_pkthdr.len;
+	}
+	splx(s);
+}
+x25_connect_callback(lcp, m)	/* lcd_upper hook set by x25_ifoutput() for a new circuit */
+register struct pklcd *lcp;
+register struct mbuf *m;
+{
+	register struct llinfo_x25 *lx = (struct llinfo_x25 *)lcp->lcd_upnext;
+	int do_clear = 1;	/* disconnect on refusal unless a decoded
+				 * packet other than CALL_ACCEPTED arrived */
+	if (m == 0)
+		goto refused;
+	if (m->m_type != MT_CONTROL) {
+		printf("x25_connect_callback: should panic\n");
+		goto refused;
+	}
+	switch (pk_decode(mtod(m, struct x25_packet *))) {
+	case CALL_ACCEPTED:
+		lcp->lcd_upper = x25_ifinput;	/* data now goes to x25_ifinput() */
+		if (lcp->lcd_sb.sb_mb)
+			lcp->lcd_send(lcp); /* XXX start queued packets */
+		return;
+	default:
+		do_clear = 0;
+	refused:
+		/* Unhook the circuit from its llinfo_x25 in both directions. */
+		lcp->lcd_upper = 0;
+		lx->lx_lcd = 0;
+		if (do_clear)
+			pk_disconnect(lcp);
+		return;
+	}
+}
+#define SA(p) ((struct sockaddr *)(p))
+#define RT(p) ((struct rtentry *)(p))
+/*
+ * Incoming datagram call on circuit lcp.  The caller's address
+ * (lcd_faddr) is looked up in the routing table; the call is refused
+ * (lcd_upper cleared, pk_close) unless the entry found carries
+ * x25_dgram_sockmask and its rt_llinfo leads to a route whose
+ * interface address uses x25_rtrequest.  If that route is no longer
+ * RTF_UP it is looked up again through rt_gateway.  On success the
+ * call is confirmed, the circuit is attached to the route and m (the
+ * chain after m0) is freed.
+ * NOTE(review): m0 itself is not freed here, and nothing is freed on
+ * the refuse paths - confirm the caller owns those mbufs.
+ */
+x25_dgram_incoming(lcp, m0)
+register struct pklcd *lcp;
+struct mbuf *m0;
+{
+	register struct rtentry *rt, *nrt;
+	register struct mbuf *m = m0->m_next; /* m0 has calling sockaddr_x25 */
+	void x25_rtrequest();
+
+	rt = rtalloc1(SA(&lcp->lcd_faddr), 0);
+	if (rt == 0) {
+refuse:		lcp->lcd_upper = 0;
+		pk_close(lcp);
+		return;
+	}
+	rt->rt_refcnt--;
+	if ((nrt = RT(rt->rt_llinfo)) == 0 || rt_mask(rt) != x25_dgram_sockmask)
+		goto refuse;
+	if ((nrt->rt_flags & RTF_UP) == 0) {
+		rt->rt_llinfo = (caddr_t)rtalloc1(rt->rt_gateway, 0);
+		rtfree(nrt);
+		if ((nrt = RT(rt->rt_llinfo)) == 0)
+			goto refuse;
+		nrt->rt_refcnt--;
+	}
+	if (nrt->rt_ifa == 0 || nrt->rt_ifa->ifa_rtrequest != x25_rtrequest)
+		goto refuse;
+	lcp->lcd_send(lcp); /* confirm call */
+	x25_rtattach(lcp, nrt);
+	m_freem(m);
+}
+
+/*
+ * X.25 output routine.
+ */ +x25_ifoutput(ifp, m0, dst, rt) +struct ifnet *ifp; +struct mbuf *m0; +struct sockaddr *dst; +register struct rtentry *rt; +{ + register struct mbuf *m = m0; + register struct llinfo_x25 *lx; + struct pklcd *lcp; + int s, error = 0; + +int plen; +for (plen = 0; m; m = m->m_next) + plen += m->m_len; +m = m0; + + if ((ifp->if_flags & IFF_UP) == 0) + senderr(ENETDOWN); + while (rt == 0 || (rt->rt_flags & RTF_GATEWAY)) { + if (rt) { + if (rt->rt_llinfo) { + rt = (struct rtentry *)rt->rt_llinfo; + continue; + } + dst = rt->rt_gateway; + } + if ((rt = rtalloc1(dst, 1)) == 0) + senderr(EHOSTUNREACH); + rt->rt_refcnt--; + } + /* + * Sanity checks. + */ + if ((rt->rt_ifp != ifp) || + (rt->rt_flags & (RTF_CLONING | RTF_GATEWAY)) || + ((lx = (struct llinfo_x25 *)rt->rt_llinfo) == 0)) { + senderr(ENETUNREACH); + } +if ((m->m_flags & M_PKTHDR) == 0) { + if_x25stats.ifx_nophdr++; + m = m_gethdr(M_NOWAIT, MT_HEADER); + if (m == 0) + senderr(ENOBUFS); + m->m_pkthdr.len = plen; + m->m_next = m0; +} +if (plen != m->m_pkthdr.len) { + if_x25stats.ifx_wrongplen++; + m->m_pkthdr.len = plen; +} +next_circuit: + lcp = lx->lx_lcd; + if (lcp == 0) { + lx->lx_lcd = lcp = pk_attach((struct socket *)0); + if (lcp == 0) + senderr(ENOBUFS); + lcp->lcd_upper = x25_connect_callback; + lcp->lcd_upnext = (caddr_t)lx; + lcp->lcd_packetsize = lx->lx_ia->ia_xc.xc_psize; + lcp->lcd_flags = X25_MBS_HOLD; + } + switch (lcp->lcd_state) { + case READY: + if (dst->sa_family == AF_INET && + ifp->if_type == IFT_X25DDN && + rt->rt_gateway->sa_family != AF_CCITT) + x25_ddnip_to_ccitt(dst, rt); + if (rt->rt_gateway->sa_family != AF_CCITT) { + if ((rt->rt_flags & RTF_XRESOLVE) == 0) + senderr(EHOSTUNREACH); + } else if (x25_autoconnect) + error = pk_connect(lcp, + (struct sockaddr_x25 *)rt->rt_gateway); + if (error) + senderr(error); + /* FALLTHROUGH */ + case SENT_CALL: + case DATA_TRANSFER: + if (sbspace(&lcp->lcd_sb) < 0) { + lx = lx->lx_next; + if (lx->lx_rt != rt) + senderr(ENOSPC); + goto next_circuit; 
+ } + if (lx->lx_ia) + lcp->lcd_dg_timer = + lx->lx_ia->ia_xc.xc_dg_idletimo; + pk_send(lcp, m); + break; + default: + /* + * We count on the timer routine to close idle + * connections, if there are not enough circuits to go + * around. + * + * So throw away data for now. + * After we get it all working, we'll rewrite to handle + * actively closing connections (other than by timers), + * when circuits get tight. + * + * In the DDN case, the imp itself closes connections + * under heavy load. + */ + error = ENOBUFS; + bad: + if (m) + m_freem(m); + } + return (error); +} + +/* + * Simpleminded timer routine. + */ +x25_iftimeout(ifp) +struct ifnet *ifp; +{ + register struct pkcb *pkcb = 0; + register struct pklcd **lcpp, *lcp; + int s = splimp(); + + FOR_ALL_PKCBS(pkcb) + if (pkcb->pk_ia->ia_ifp == ifp) + for (lcpp = pkcb->pk_chan + pkcb->pk_maxlcn; + --lcpp > pkcb->pk_chan;) + if ((lcp = *lcpp) && + lcp->lcd_state == DATA_TRANSFER && + (lcp->lcd_flags & X25_DG_CIRCUIT) && + (lcp->lcd_dg_timer && --lcp->lcd_dg_timer == 0)) { + lcp->lcd_upper(lcp, 0); + } + splx(s); +} +/* + * This routine gets called when validating additions of new routes + * or deletions of old ones. + */ +x25_rtrequest(cmd, rt, dst) +register struct rtentry *rt; +struct sockaddr *dst; +{ + register struct llinfo_x25 *lx = (struct llinfo_x25 *)rt->rt_llinfo; + register struct sockaddr_x25 *sa =(struct sockaddr_x25 *)rt->rt_gateway; + register struct pklcd *lcp; + + /* would put this pk_init, except routing table doesn't + exist yet. */ + if (x25_dgram_sockmask == 0) { + struct radix_node *rn_addmask(); + x25_dgram_sockmask = + SA(rn_addmask((caddr_t)&x25_dgmask, 0, 4)->rn_key); + } + if (rt->rt_flags & RTF_GATEWAY) { + if (rt->rt_llinfo) + RTFREE((struct rtentry *)rt->rt_llinfo); + rt->rt_llinfo = (cmd == RTM_ADD) ? 
+ (caddr_t)rtalloc1(rt->rt_gateway, 1) : 0; + return; + } + if ((rt->rt_flags & RTF_HOST) == 0) + return; + if (cmd == RTM_DELETE) { + while (rt->rt_llinfo) + x25_lxfree((struct llinfo *)rt->rt_llinfo); + x25_rtinvert(RTM_DELETE, rt->rt_gateway, rt); + return; + } + if (lx == 0 && (lx = x25_lxalloc(rt)) == 0) + return; + if ((lcp = lx->lx_lcd) && lcp->lcd_state != READY) { + /* + * This can only happen on a RTM_CHANGE operation + * though cmd will be RTM_ADD. + */ + if (lcp->lcd_ceaddr && + Bcmp(rt->rt_gateway, lcp->lcd_ceaddr, + lcp->lcd_ceaddr->x25_len) != 0) { + x25_rtinvert(RTM_DELETE, lcp->lcd_ceaddr, rt); + lcp->lcd_upper = 0; + pk_disconnect(lcp); + } + lcp = 0; + } + x25_rtinvert(RTM_ADD, rt->rt_gateway, rt); +} + +int x25_dont_rtinvert = 0; + +x25_rtinvert(cmd, sa, rt) +register struct sockaddr *sa; +register struct rtentry *rt; +{ + struct rtentry *rt2 = 0; + /* + * rt_gateway contains PID indicating which proto + * family on the other end, so will be different + * from general host route via X.25. + */ + if (rt->rt_ifp->if_type == IFT_X25DDN || x25_dont_rtinvert) + return; + if (sa->sa_family != AF_CCITT) + return; + if (cmd != RTM_DELETE) { + rtrequest(RTM_ADD, sa, rt_key(rt), x25_dgram_sockmask, + RTF_PROTO2, &rt2); + if (rt2) { + rt2->rt_llinfo = (caddr_t) rt; + rt->rt_refcnt++; + } + return; + } + rt2 = rt; + if ((rt = rtalloc1(sa, 0)) == 0 || + (rt->rt_flags & RTF_PROTO2) == 0 || + rt->rt_llinfo != (caddr_t)rt2) { + printf("x25_rtchange: inverse route screwup\n"); + return; + } else + rt2->rt_refcnt--; + rtrequest(RTM_DELETE, sa, rt_key(rt2), x25_dgram_sockmask, + 0, (struct rtentry **) 0); +} + +static struct sockaddr_x25 blank_x25 = {sizeof blank_x25, AF_CCITT}; +/* + * IP to X25 address routine copyright ACC, used by permission. 
+ */ +union imp_addr { + struct in_addr ip; + struct imp { + u_char s_net; + u_char s_host; + u_char s_lh; + u_char s_impno; + } imp; +}; + +/* + * The following is totally bogus and here only to preserve + * the IP to X.25 translation. + */ +x25_ddnip_to_ccitt(src, rt) +struct sockaddr_in *src; +register struct rtentry *rt; +{ + register struct sockaddr_x25 *dst = (struct sockaddr_x25 *)rt->rt_gateway; + union imp_addr imp_addr; + int imp_no, imp_port, temp; + char *x25addr = dst->x25_addr; + + + imp_addr.ip = src->sin_addr; + *dst = blank_x25; + if ((imp_addr.imp.s_net & 0x80) == 0x00) { /* class A */ + imp_no = imp_addr.imp.s_impno; + imp_port = imp_addr.imp.s_host; + } else if ((imp_addr.imp.s_net & 0xc0) == 0x80) { /* class B */ + imp_no = imp_addr.imp.s_impno; + imp_port = imp_addr.imp.s_lh; + } else { /* class C */ + imp_no = imp_addr.imp.s_impno / 32; + imp_port = imp_addr.imp.s_impno % 32; + } + + x25addr[0] = 12; /* length */ + /* DNIC is cleared by struct copy above */ + + if (imp_port < 64) { /* Physical: 0000 0 IIIHH00 [SS] *//* s_impno + * -> III, s_host -> HH */ + x25addr[5] = 0; /* set flag bit */ + x25addr[6] = imp_no / 100; + x25addr[7] = (imp_no % 100) / 10; + x25addr[8] = imp_no % 10; + x25addr[9] = imp_port / 10; + x25addr[10] = imp_port % 10; + } else { /* Logical: 0000 1 RRRRR00 [SS] *//* s + * _host * 256 + s_impno -> RRRRR */ + temp = (imp_port << 8) + imp_no; + x25addr[5] = 1; + x25addr[6] = temp / 10000; + x25addr[7] = (temp % 10000) / 1000; + x25addr[8] = (temp % 1000) / 100; + x25addr[9] = (temp % 100) / 10; + x25addr[10] = temp % 10; + } +} + +/* + * This routine is a sketch and is not to be believed!!!!! + * + * This is a utility routine to be called by x25 devices when a + * call request is honored with the intent of starting datagram forwarding. 
+ */ +x25_dg_rtinit(dst, ia, af) +struct sockaddr_x25 *dst; +register struct x25_ifaddr *ia; +{ + struct sockaddr *sa = 0; + struct rtentry *rt; + struct in_addr my_addr; + static struct sockaddr_in sin = {sizeof(sin), AF_INET}; + + if (ia->ia_ifp->if_type == IFT_X25DDN && af == AF_INET) { + /* + * Inverse X25 to IP mapping copyright and courtesy ACC. + */ + int imp_no, imp_port, temp; + union imp_addr imp_addr; + { + /* + * First determine our IP addr for network + */ + register struct in_ifaddr *ina; + extern struct in_ifaddr *in_ifaddr; + + for (ina = in_ifaddr; ina; ina = ina->ia_next) + if (ina->ia_ifp == ia->ia_ifp) { + my_addr = ina->ia_addr.sin_addr; + break; + } + } + { + + register char *x25addr = dst->x25_addr; + + switch (x25addr[5] & 0x0f) { + case 0: /* Physical: 0000 0 IIIHH00 [SS] */ + imp_no = + ((int) (x25addr[6] & 0x0f) * 100) + + ((int) (x25addr[7] & 0x0f) * 10) + + ((int) (x25addr[8] & 0x0f)); + + + imp_port = + ((int) (x25addr[9] & 0x0f) * 10) + + ((int) (x25addr[10] & 0x0f)); + break; + case 1: /* Logical: 0000 1 RRRRR00 [SS] */ + temp = ((int) (x25addr[6] & 0x0f) * 10000) + + ((int) (x25addr[7] & 0x0f) * 1000) + + ((int) (x25addr[8] & 0x0f) * 100) + + ((int) (x25addr[9] & 0x0f) * 10) + + ((int) (x25addr[10] & 0x0f)); + + imp_port = temp >> 8; + imp_no = temp & 0xff; + break; + default: + return (0L); + } + imp_addr.ip = my_addr; + if ((imp_addr.imp.s_net & 0x80) == 0x00) { + /* class A */ + imp_addr.imp.s_host = imp_port; + imp_addr.imp.s_impno = imp_no; + imp_addr.imp.s_lh = 0; + } else if ((imp_addr.imp.s_net & 0xc0) == 0x80) { + /* class B */ + imp_addr.imp.s_lh = imp_port; + imp_addr.imp.s_impno = imp_no; + } else { + /* class C */ + imp_addr.imp.s_impno = (imp_no << 5) + imp_port; + } + } + sin.sin_addr = imp_addr.ip; + sa = (struct sockaddr *)&sin; + } else { + /* + * This uses the X25 routing table to do inverse + * lookup of x25 address to sockaddr. 
+ */ + if (rt = rtalloc1(SA(dst), 0)) { + sa = rt->rt_gateway; + rt->rt_refcnt--; + } + } + /* + * Call to rtalloc1 will create rtentry for reverse path + * to callee by virtue of cloning magic and will allocate + * space for local control block. + */ + if (sa && (rt = rtalloc1(sa, 1))) + rt->rt_refcnt--; +} +int x25_startproto = 1; + +pk_init() +{ + /* + * warning, sizeof (struct sockaddr_x25) > 32, + * but contains no data of interest beyond 32 + */ + if (x25_startproto) { + pk_protolisten(0xcc, 1, x25_dgram_incoming); + pk_protolisten(0x81, 1, x25_dgram_incoming); + } +} + +struct x25_dgproto { + u_char spi; + u_char spilen; + int (*f)(); +} x25_dgprototab[] = { +#if defined(ISO) && defined(TPCONS) +{ 0x0, 0, tp_incoming}, +#endif +{ 0xcc, 1, x25_dgram_incoming}, +{ 0xcd, 1, x25_dgram_incoming}, +{ 0x81, 1, x25_dgram_incoming}, +}; + +pk_user_protolisten(info) +register u_char *info; +{ + register struct x25_dgproto *dp = x25_dgprototab + + ((sizeof x25_dgprototab) / (sizeof *dp)); + register struct pklcd *lcp; + + while (dp > x25_dgprototab) + if ((--dp)->spi == info[0]) + goto gotspi; + return ESRCH; + +gotspi: if (info[1]) + return pk_protolisten(dp->spi, dp->spilen, dp->f); + for (lcp = pk_listenhead; lcp; lcp = lcp->lcd_listen) + if (lcp->lcd_laddr.x25_udlen == dp->spilen && + Bcmp(&dp->spi, lcp->lcd_laddr.x25_udata, dp->spilen) == 0) { + pk_disconnect(lcp); + return 0; + } + return ESRCH; +} + +/* + * This routine transfers an X.25 circuit to or from a routing entry. + * If the supplied circuit is * in DATA_TRANSFER state, it is added to the + * routing entry. If freshly allocated, it glues back the vc from + * the rtentry to the socket. 
+ */ +pk_rtattach(so, m0) +register struct socket *so; +struct mbuf *m0; +{ + register struct pklcd *lcp = (struct pklcd *)so->so_pcb; + register struct mbuf *m = m0; + struct sockaddr *dst = mtod(m, struct sockaddr *); + register struct rtentry *rt = rtalloc1(dst, 0); + register struct llinfo_x25 *lx; + caddr_t cp; +#define ROUNDUP(a) \ + ((a) > 0 ? (1 + (((a) - 1) | (sizeof(long) - 1))) : sizeof(long)) +#define transfer_sockbuf(s, f, l) \ + while (m = (s)->sb_mb)\ + {(s)->sb_mb = m->m_act; m->m_act = 0; sbfree((s), m); f(l, m);} + + if (rt) + rt->rt_refcnt--; + cp = (dst->sa_len < m->m_len) ? ROUNDUP(dst->sa_len) + (caddr_t)dst : 0; + while (rt && + ((cp == 0 && rt_mask(rt) != 0) || + (cp != 0 && (rt_mask(rt) == 0 || + Bcmp(cp, rt_mask(rt), rt_mask(rt)->sa_len)) != 0))) + rt = (struct rtentry *)rt->rt_nodes->rn_dupedkey; + if (rt == 0 || (rt->rt_flags & RTF_GATEWAY) || + (lx = (struct llinfo_x25 *)rt->rt_llinfo) == 0) + return ESRCH; + if (lcp == 0) + return ENOTCONN; + switch (lcp->lcd_state) { + default: + return ENOTCONN; + + case READY: + /* Detach VC from rtentry */ + if (lx->lx_lcd == 0) + return ENOTCONN; + lcp->lcd_so = 0; + pk_close(lcp); + lcp = lx->lx_lcd; + if (lx->lx_next->lx_rt == rt) + x25_lxfree(lx); + lcp->lcd_so = so; + lcp->lcd_upper = 0; + lcp->lcd_upnext = 0; + transfer_sockbuf(&lcp->lcd_sb, sbappendrecord, &so->so_snd); + soisconnected(so); + return 0; + + case DATA_TRANSFER: + /* Add VC to rtentry */ + lcp->lcd_so = 0; + lcp->lcd_sb = so->so_snd; /* structure copy */ + bzero((caddr_t)&so->so_snd, sizeof(so->so_snd)); /* XXXXXX */ + so->so_pcb = 0; + x25_rtattach(lcp, rt); + transfer_sockbuf(&so->so_rcv, x25_ifinput, lcp); + soisdisconnected(so); + } + return 0; +} +x25_rtattach(lcp0, rt) +register struct pklcd *lcp0; +struct rtentry *rt; +{ + register struct llinfo_x25 *lx = (struct llinfo_x25 *)rt->rt_llinfo; + register struct pklcd *lcp; + register struct mbuf *m; + if (lcp = lx->lx_lcd) { /* adding an additional VC */ + if 
(lcp->lcd_state == READY) { /* the recorded circuit is still READY: hand its queued mbufs to pk_output() for lcp0, then close it */ + transfer_sockbuf(&lcp->lcd_sb, pk_output, lcp0); + lcp->lcd_upper = 0; + pk_close(lcp); + } else { /* otherwise obtain a further llinfo_x25 for this route */ + lx = x25_lxalloc(rt); + if (lx == 0) + return ENOBUFS; + } + } + lx->lx_lcd = lcp = lcp0; + lcp->lcd_upper = x25_ifinput; + lcp->lcd_upnext = (caddr_t)lx; /* NOTE(review): control reaches the closing brace without a return value although the failure path returns ENOBUFS; the caller visible in this file ignores the result */ +} diff --git a/sys/netccitt/llc_input.c b/sys/netccitt/llc_input.c new file mode 100644 index 00000000000..7a01973d979 --- /dev/null +++ b/sys/netccitt/llc_input.c @@ -0,0 +1,468 @@ +/* + * Copyright (C) Dirk Husemann, Computer Science Department IV, + * University of Erlangen-Nuremberg, Germany, 1990, 1991, 1992 + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Dirk Husemann and the Computer Science Department (IV) of + * the University of Erlangen-Nuremberg, Germany. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)llc_input.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +/* + * This module implements LLC as specified by ISO 8802-2. + */ + + +/* + * llcintr() handles all LLC frames (except ISO CLNS ones for the time being) + * and tries to pass them on to the appropriate network layer entity. 
+ */ +void +llcintr() +{ + register struct mbuf *m; + register int i; + register int frame_kind; + register u_char cmdrsp; + struct llc_linkcb *linkp; + struct rtentry *sirt; /* NOTE(review): not referenced in the body of this function */ + struct npaidbentry *sapinfo; + struct sdl_hdr *sdlhdr; + struct llc *frame; + char *c; /* NOTE(review): not referenced in the body of this function */ + long expected_len; + + struct ifnet *ifp; + struct rtentry *llrt; + struct rtentry *nlrt; + /* Take frames off llcintrq one at a time (the dequeue itself runs at splimp) until the queue is empty. */ + for (;;) { + i = splimp(); + IF_DEQUEUE(&llcintrq, m); + splx(i); + if (m == 0) + break; +#ifdef DIAGNOSTIC + if ((m->m_flags & M_PKTHDR) == 0) + panic("llcintr no HDR"); +#endif + /* + * Get ifp this packet was received on + */ + ifp = m->m_pkthdr.rcvif; + + sdlhdr = mtod(m, struct sdl_hdr *); + + /* + * [Copied from net/ip_input.c] + * + * Check that the amount of data in the buffers is + * at least as much as the LLC header tells us. + * Trim mbufs if longer than expected. + * Drop packets if shorter than we think they are. + * + * Layout of mbuf chain at this point: + * + * +-------------------------------+----+ -\ + * | sockaddr_dl src - sdlhdr_src | 20 | \ + * +-------------------------------+----+ | + * | sockaddr_dl dst - sdlhdr_dst | 20 | > sizeof(struct sdl_hdr) == 44 + * +-------------------------------+----+ | + * | LLC frame len - sdlhdr_len | 04 | / + * +-------------------------------+----+ -/ + * / + * | m_next + * \ + * +----------------------------+----+ -\ + * | llc DSAP | 01 | \ + * +----------------------------+----+ | + * | llc SSAP | 01 | | + * +----------------------------+----+ > sdlhdr_len + * | llc control | 01 | | + * +----------------------------+----+ | + * | ...
| | / + * -/ + * + * Thus we expect to have exactly + * (sdlhdr->sdlhdr_len+sizeof(struct sdl_hdr)) in the mbuf chain + */ + expected_len = sdlhdr->sdlhdr_len + sizeof(struct sdl_hdr); + + if (m->m_pkthdr.len < expected_len) { + m_freem(m); + continue; + } + if (m->m_pkthdr.len > expected_len) { + if (m->m_len == m->m_pkthdr.len) { + m->m_len = expected_len; + m->m_pkthdr.len = expected_len; + } else + m_adj(m, expected_len - m->m_pkthdr.len); /* the count is negative on this path */ + } + + /* + * Get llc header (NOTE(review): the m+1 form used in the first branch looks unusual --- confirm it really addresses the bytes behind the sdl_hdr) + */ + if (m->m_len > sizeof(struct sdl_hdr)) + frame = mtod((struct mbuf *)((struct sdl_hdr*)(m+1)), + struct llc *); + else frame = mtod(m->m_next, struct llc *); + if (frame == (struct llc *) NULL) + panic("llcintr no llc header"); + + /* + * Now check for bogus I/S frame, i.e. those with a control + * field telling us they're an I/S frame yet their length + * is less than the established I/S frame length (DSAP + SSAP + + * control + N(R)&P/F = 4) --- we drop those suckers + */ + if (((frame->llc_control & 0x03) != 0x03) + && ((expected_len - sizeof(struct sdl_hdr)) < LLC_ISFRAMELEN)) { + m_freem(m); + printf("llc: hurz error\n"); + continue; + } + + /* + * Get link control block for the addressed link connection. + * If there is none we take care of it later on. + */ + cmdrsp = (frame->llc_ssap & 0x01); /* low SSAP bit is saved here and cleared from the frame on the next line */ + frame->llc_ssap &= ~0x01; + if (llrt = rtalloc1((struct sockaddr *)&sdlhdr->sdlhdr_src, 0)) + llrt->rt_refcnt--; +#ifdef notyet + else llrt = npaidb_enter(&sdlhdr->sdlhdr_src, 0, 0, 0); +#endif /* notyet */ + else { + /* + * We cannot do anything currently here as we + * don't `know' this link --- drop it + */ + m_freem(m); + continue; + } + linkp = ((struct npaidbentry *)(llrt->rt_llinfo))->np_link; + nlrt = ((struct npaidbentry *)(llrt->rt_llinfo))->np_rt; + + /* + * If the link is not existing right now, we can try and look up + * the SAP info block.
+ */ + sapinfo = (struct npaidbentry *) 0; /* start every frame with a defined value: the tests below must never see an uninitialized pointer or one left over from the previous frame */ + if ((linkp == 0) && frame->llc_ssap) + sapinfo = llc_getsapinfo(frame->llc_dsap, ifp); + /* + * Handle XID and TEST frames + * XID: if DLSAP == 0, return type-of-services + * window-0 + * DLSAP-0 + * format-identifier-? + * if DLSAP != 0, locate sapcb and return + * type-of-services + * SAP-window + * format-identifier-? + * TEST: swap (snpah_dst, snpah_src) and return frame + * + * Also toggle the CMD/RESP bit + * + * Is this behaviour correct? Check ISO 8802-2 (90)! + */ + frame_kind = llc_decode(frame, (struct llc_linkcb *)0); + switch(frame_kind) { + case LLCFT_XID: + if (linkp || sapinfo) { + if (linkp) /* no lookup was done for an existing link, so take the SAP info recorded in the link control block (llc_input() relies on the same field) */ + sapinfo = linkp->llcl_sapinfo; + frame->llc_window = linkp ? linkp->llcl_window : sapinfo->si_window; + frame->llc_fid = 9; /* XXX */ + frame->llc_class = sapinfo->si_class; + frame->llc_ssap = frame->llc_dsap; + } else { + frame->llc_window = 0; + frame->llc_fid = 9; + frame->llc_class = 1; + frame->llc_dsap = frame->llc_ssap = 0; + } + + /* fall thru to */ + case LLCFT_TEST: + sdl_swapaddr(&(mtod(m, struct sdl_hdr *)->sdlhdr_dst), + &(mtod(m, struct sdl_hdr *)->sdlhdr_src)); + + /* Now set the CMD/RESP bit */ + frame->llc_ssap |= (cmdrsp == 0x0 ? 0x1 : 0x0); + + /* Ship it out again */ + (*ifp->if_output)(ifp, m, + (struct sockaddr *) &(mtod(m, struct sdl_hdr *)->sdlhdr_dst), + (struct rtentry *) 0); + continue; + } + + /* + * Create link control block in case it is not existing + */ + if (linkp == 0 && sapinfo) { + if ((linkp = llc_newlink(&sdlhdr->sdlhdr_src, ifp, nlrt, + (nlrt == 0) ?
0 : nlrt->rt_llinfo, + llrt)) == 0) { + printf("llcintr: couldn't create new link\n"); + m_freem(m); + continue; + } + ((struct npaidbentry *)llrt->rt_llinfo)->np_link = linkp; /* remember the new link in the link-level route entry */ + } else if (linkp == 0) { + /* The link is not known to us, drop the frame and continue */ + m_freem(m); + continue; + } + + /* + * Drop SNPA header and get rid of empty mbuf at the + * front of the mbuf chain (I don't like 'em) + */ + m_adj(m, sizeof(struct sdl_hdr)); + /* + * LLC_UFRAMELEN is sufficient, m_pullup() will pull up + * the min(m->m_len, maxprotohdr_len [=40]) thus doing + * the trick ... + */ + if ((m = m_pullup(m, LLC_UFRAMELEN))) + /* + * Pass it on thru the elements of procedure + */ + llc_input(linkp, m, cmdrsp); + } + return; +} + +/* + * llc_input() --- We deal with the various incoming frames here. + * Basically we (indirectly) call the appropriate + * state handler function that's pointed to by + * llcl_statehandler. + * + * The statehandler returns an action code --- + * further actions like + * o notify network layer + * o block further sending + * o deblock link + * o ... + * are then enacted accordingly. The function itself always returns 0. + */ +llc_input(struct llc_linkcb *linkp, struct mbuf *m, u_char cmdrsp) +{ + int frame_kind; + int pollfinal; + int action = 0; + struct llc *frame; + struct ifnet *ifp = linkp->llcl_if; /* NOTE(review): not referenced again in this function */ + + if ((frame = mtod(m, struct llc *)) == (struct llc *) 0) { + m_freem(m); + return 0; + } + pollfinal = ((frame->llc_control & 0x03) == 0x03) ?
+ LLCGBITS(frame->llc_control, u_pf) : + LLCGBITS(frame->llc_control_ext, s_pf); /* poll/final bit: read from the control byte when its two low bits are both set, otherwise from the extended control byte */ + + /* + * first decode the frame + */ + frame_kind = llc_decode(frame, linkp); + + switch (action = llc_statehandler(linkp, frame, frame_kind, cmdrsp, + pollfinal)) { + case LLC_DATA_INDICATION: + m_adj(m, LLC_ISFRAMELEN); + if (m = m_pullup(m, NLHDRSIZEGUESS)) { + m->m_pkthdr.rcvif = (struct ifnet *)linkp->llcl_nlnext; /* rcvif is reused to carry llcl_nlnext to the upper layer input routine */ + (*linkp->llcl_sapinfo->si_input)(m); + } + break; + } + + /* release mbuf if not an info frame (for LLC_DATA_INDICATION it was either passed to si_input or m is null) */ + if (action != LLC_DATA_INDICATION && m) + m_freem(m); + + /* try to get frames out ... */ + llc_start(linkp); + + return 0; +} + +/* + * This routine is called by configuration setup. It sets up a station control + * block and notifies all registered upper level protocols. + */ +caddr_t +llc_ctlinput(int prc, struct sockaddr *addr, caddr_t info) +{ + struct ifnet *ifp; + struct ifaddr *ifa; + struct dll_ctlinfo *ctlinfo = (struct dll_ctlinfo *)info; + u_char sap; + struct dllconfig *config; + caddr_t pcb; + struct rtentry *nlrt; + struct rtentry *llrt; /* assigned only when prc is neither PRC_IFUP nor PRC_IFDOWN */ + struct llc_linkcb *linkp; /* assigned only when prc is neither PRC_IFUP nor PRC_IFDOWN */ + register int i; + + /* info must point to something valid at all times */ + if (info == 0) + return 0; + + if (prc == PRC_IFUP || prc == PRC_IFDOWN) { + /* we use either this set ... */ + ifa = ifa_ifwithaddr(addr); + ifp = ifa ?
ifa->ifa_ifp : 0; + if (ifp == 0) + return 0; + + sap = ctlinfo->dlcti_lsap; + config = ctlinfo->dlcti_cfg; + pcb = (caddr_t) 0; + nlrt = (struct rtentry *) 0; + } else { + /* or this one */ + sap = 0; + config = (struct dllconfig *) 0; + pcb = ctlinfo->dlcti_pcb; + nlrt = ctlinfo->dlcti_rt; + + if ((llrt = rtalloc1(nlrt->rt_gateway, 0))) + llrt->rt_refcnt--; + else return 0; + + linkp = ((struct npaidbentry *)llrt->rt_llinfo)->np_link; + } + + switch (prc) { + case PRC_IFUP: + (void) llc_setsapinfo(ifp, addr->sa_family, sap, config); + return 0; + + case PRC_IFDOWN: { + register struct llc_linkcb *linkp; + register struct llc_linkcb *nlinkp; + register int i; + + /* + * All links are accessible over the doubly linked list llccb_q + */ + if (!LQEMPTY) { + /* + * A for-loop is not that great an idea as the linkp + * will get deleted by llc_timer() + */ + linkp = LQFIRST; + while (LQVALID(linkp)) { + nlinkp = LQNEXT(linkp); + if (linkp->llcl_if == ifp) { /* compare, do not assign: only links on the interface going down are disconnected */ + i = splimp(); + (void)llc_statehandler(linkp, (struct llc *)0, + NL_DISCONNECT_REQUEST, + 0, 1); + splx(i); + } + linkp = nlinkp; + } + } + } + break; /* must not fall into PRC_CONNECT_REQUEST: the outer linkp and llrt are unset and nlrt is null on the interface path */ + case PRC_CONNECT_REQUEST: + if (linkp == 0) { + if ((linkp = llc_newlink((struct sockaddr_dl *) nlrt->rt_gateway, + nlrt->rt_ifp, nlrt, + pcb, llrt)) == 0) + return (0); + ((struct npaidbentry *)llrt->rt_llinfo)->np_link = linkp; + i = splimp(); + (void)llc_statehandler(linkp, (struct llc *) 0, + NL_CONNECT_REQUEST, 0, 1); + splx(i); + } + return ((caddr_t)linkp); + + case PRC_DISCONNECT_REQUEST: + if (linkp == 0) + panic("no link control block!"); + + i = splimp(); + (void)llc_statehandler(linkp, (struct llc *) 0, + NL_DISCONNECT_REQUEST, 0, 1); + splx(i); + + /* + * The actual removal of the link control block is done by the + * cleaning neutrum (i.e. llc_timer()).
+ */ + break; + /* Reset request: the link must already exist; the state machine is run with NL_RESET_REQUEST at splimp. */ + case PRC_RESET_REQUEST: + if (linkp == 0) + panic("no link control block!"); + + i = splimp(); + (void)llc_statehandler(linkp, (struct llc *) 0, + NL_RESET_REQUEST, 0, 1); + splx(i); + + break; + + } + + return 0; +} diff --git a/sys/netccitt/llc_output.c b/sys/netccitt/llc_output.c new file mode 100644 index 00000000000..98d0328a5f5 --- /dev/null +++ b/sys/netccitt/llc_output.c @@ -0,0 +1,304 @@ +/* + * Copyright (C) Dirk Husemann, Computer Science Department IV, + * University of Erlangen-Nuremberg, Germany, 1990, 1991, 1992 + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Dirk Husemann and the Computer Science Department (IV) of + * the University of Erlangen-Nuremberg, Germany. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)llc_output.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +/* + * llc_output() --- called by an upper layer (network layer) entity whenever + * there is an INFO frame to be transmitted. We enqueue the + * info frame and call llc_start() to do the actual sending. Always returns 0. + */ + +llc_output(struct llc_linkcb *linkp, struct mbuf *m) +{ + register int i; + + i = splimp(); + LLC_ENQUEUE(linkp, m); + llc_start(linkp); + splx(i); + return 0; /* the function is implicitly int; give callers a defined value as llc_send() and llc_resend() do */ +} + + +/* + * llc_start() --- We try to subsequently dequeue all the frames available and + * send them out. + */ +void +llc_start(struct llc_linkcb *linkp) +{ + register int i; + register struct mbuf *m; + int action; + /* Keep sending while the link is in NORMAL, BUSY or REJECT state, a window slot is free and the remote side is not flagged busy. */ + while ((LLC_STATEEQ(linkp, NORMAL) || LLC_STATEEQ(linkp, BUSY) || + LLC_STATEEQ(linkp, REJECT)) && + (linkp->llcl_slotsfree > 0) && + (LLC_GETFLAG(linkp, REMOTE_BUSY) == 0)) { + LLC_DEQUEUE(linkp, m); + if (m == NULL) + break; + LLC_SETFRAME(linkp, m); + (void)llc_statehandler(linkp, (struct llc *) 0, NL_DATA_REQUEST, + 0, 0); + } +} + + +/* + * llc_send() --- Handles single frames.
If dealing with INFO frames we need to + * prepend the LLC header, otherwise we just allocate an mbuf. + * In both cases the actual send is done by llc_rawsend(). Always returns 0. + */ +llc_send(struct llc_linkcb *linkp, int frame_kind, int cmdrsp, int pollfinal) +{ + register struct mbuf *m = (struct mbuf *)0; + register struct llc *frame; + + if (frame_kind == LLCFT_INFO) + m = linkp->llcl_output_buffers[llc_seq2slot(linkp, + linkp->llcl_vs)]; + LLC_GETHDR(frame, m); + + /* pass it on to llc_rawsend() */ + llc_rawsend(linkp, m, frame, frame_kind, linkp->llcl_vs, cmdrsp, pollfinal); + + if (frame_kind == LLCFT_INFO) + LLC_INC(linkp->llcl_vs); /* V(S) advances only for INFO frames */ + + return 0; +} + +/* + * llc_resend() --- llc_resend() retransmits all unacknowledged INFO frames. Always returns 0. + */ +llc_resend(struct llc_linkcb *linkp, int cmdrsp, int pollfinal) +{ + register struct llc *frame; + register struct mbuf *m; + register int seq, slot; /* NOTE(review): seq is not referenced in this function */ + + if (linkp->llcl_slotsfree < linkp->llcl_window) + /* assert lock between nr_received & V(S) */ + if (linkp->llcl_nr_received != linkp->llcl_vs) + panic("llc: V(S) != N(R) received\n"); + /* Walk from the slot of V(S) up to the free slot, advancing V(S) once per frame sent. */ + for (slot = llc_seq2slot(linkp, linkp->llcl_vs); + slot != linkp->llcl_freeslot; + LLC_INC(linkp->llcl_vs), + slot = llc_seq2slot(linkp, linkp->llcl_vs)) { + m = linkp->llcl_output_buffers[slot]; + LLC_GETHDR(frame, m); + llc_rawsend(linkp, m, frame, LLCFT_INFO, linkp->llcl_vs, + cmdrsp, pollfinal); + pollfinal = 0; /* the caller's poll/final value goes out on the first retransmitted frame only */ + } + + return 0; +} + +/* + * llc_rawsend() --- constructs an LLC frame and sends it out via the + * associated interface of the link control block. + * + * We need to make sure that outgoing frames have the correct length, + * in particular the 4 byte ones (RR, RNR, REJ) as LLC_GETHDR() will + * set the mbuf len to 3 as default len for non INFO frames ... + * + * Frame kind Length (w/o MAC header, {D,S}SAP incl.)
+ * -------------------------------------------------------------- + * DISC, SABME, UA, DM 3 bytes ({D,S}SAP + CONTROL) + * RR, RNR, REJ 4 bytes ({D,S}SAP + CONTROL0 + CONTROL1) + * XID 6 bytes ({D,S}SAP + CONTROL0 + FI,CLASS,WINDOW) + * FRMR 7 bytes ({D,S}SAP + CONTROL0 + REJ CONTROL,V(S),V(R),CAUSE) + * INFO 4 -- MTU + * UI, TEST 3 -- MTU + * + */ +#define LLC_SETLEN(m, l) (m)->m_pkthdr.len = (m)->m_len = (l) /* sets the packet header length and the mbuf length together */ + +llc_rawsend(struct llc_linkcb *linkp, struct mbuf *m, struct llc *frame, + int frame_kind, int vs, int cmdrsp, int pollfinal) +{ + register short adjust = LLC_UFRAMELEN; /* header length of the frame format; later selects which control byte receives the poll/final bit */ + struct ifnet *ifp; + + switch (frame_kind) { + /* supervisory and information frames */ + case LLCFT_INFO: + frame->llc_control = LLC_INFO; + LLCSBITS(frame->llc_control, i_ns, vs); + LLCSBITS(frame->llc_control_ext, i_nr, linkp->llcl_vr); + adjust = LLC_ISFRAMELEN; + break; + case LLCFT_RR: + frame->llc_control = LLC_RR; + LLC_SETLEN(m, LLC_ISFRAMELEN); + LLCSBITS(frame->llc_control_ext, s_nr, linkp->llcl_vr); + adjust = LLC_ISFRAMELEN; + break; + case LLCFT_RNR: + frame->llc_control = LLC_RNR; + LLC_SETLEN(m, LLC_ISFRAMELEN); + LLCSBITS(frame->llc_control_ext, s_nr, linkp->llcl_vr); + adjust = LLC_ISFRAMELEN; + break; + case LLCFT_REJ: + frame->llc_control = LLC_REJ; + LLC_SETLEN(m, LLC_ISFRAMELEN); + LLCSBITS(frame->llc_control_ext, s_nr, linkp->llcl_vr); + adjust = LLC_ISFRAMELEN; + break; + /* unnumbered frames */ + case LLCFT_DM: + frame->llc_control = LLC_DM; + break; + case LLCFT_SABME: + frame->llc_control = LLC_SABME; + break; + case LLCFT_DISC: + frame->llc_control = LLC_DISC; + break; + case LLCFT_UA: + frame->llc_control = LLC_UA; + break; + case LLCFT_UI: + frame->llc_control = LLC_UI; + break; + case LLCFT_FRMR: + frame->llc_control = LLC_FRMR; + /* get more space --- FRMR frames are longer than usual */ + LLC_SETLEN(m, LLC_FRMRLEN); + bcopy((caddr_t) &linkp->llcl_frmrinfo, + (caddr_t) &frame->llc_frmrinfo, + sizeof(struct frmrinfo)); + break; + default: + /* + * We don't
send {XID, TEST} frames + */ + if (m) + m_freem(m); + return; + } + + /* + * Fill in DSAP/SSAP + */ + frame->llc_dsap = frame->llc_ssap = LLSAPADDR(&linkp->llcl_addr); + frame->llc_ssap |= cmdrsp; + + /* + * Check for delayed action pending. ISO 8802-2, 7.9.2 (5) + * and ISO 8802-2, 7.9.2.3 (32), (34), (36) pertain to this + * piece of code --- hopefully we got it right here (i.e. + * in the spirit of (32), (34), and (36)) ... + */ + switch (frame_kind) { + case LLCFT_RR: + case LLCFT_RNR: + case LLCFT_REJ: + case LLCFT_INFO: + switch (LLC_GETFLAG(linkp, DACTION)) { + case LLC_DACKCMD: + case LLC_DACKRSP: + LLC_STOPTIMER(linkp, DACTION); + break; + case LLC_DACKCMDPOLL: + if (cmdrsp == LLC_CMD) { + pollfinal = 1; + LLC_STOPTIMER(linkp, DACTION); + } + break; + case LLC_DACKRSPFINAL: + if (cmdrsp == LLC_RSP) { + pollfinal = 1; + LLC_STOPTIMER(linkp, DACTION); + } + break; + } + break; + } + + if (adjust == LLC_UFRAMELEN) + LLCSBITS(frame->llc_control, u_pf, pollfinal); + else LLCSBITS(frame->llc_control_ext, s_pf, pollfinal); + + /* + * Get interface to send frame onto + */ + ifp = linkp->llcl_if; + if (frame_kind == LLCFT_INFO) { + /* + * send out a copy of the frame, retain the + * original (NOTE(review): the m_copy() result is handed to if_output unchecked) + */ + (*ifp->if_output)(ifp, m_copy(m, 0, (int)M_COPYALL), + rt_key(linkp->llcl_nlrt), + linkp->llcl_nlrt); + /* + * Account for the LLC header and let it ``disappear'' + * as the raw info frame payload is what we hold in + * the output_buffers of the link. + */ + m_adj(m, LLC_ISFRAMELEN); + } else (*ifp->if_output)(ifp, m, + rt_key(linkp->llcl_nlrt), + linkp->llcl_nlrt); +} + diff --git a/sys/netccitt/llc_subr.c b/sys/netccitt/llc_subr.c new file mode 100644 index 00000000000..46848fdf5bc --- /dev/null +++ b/sys/netccitt/llc_subr.c @@ -0,0 +1,2356 @@ +/* + * Copyright (C) Dirk Husemann, Computer Science Department IV, + * University of Erlangen-Nuremberg, Germany, 1990, 1991, 1992 + * Copyright (c) 1992, 1993 + * The Regents of the University of California.
All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Dirk Husemann and the Computer Science Department (IV) of + * the University of Erlangen-Nuremberg, Germany. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)llc_subr.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +/* + * Frame names for diagnostic messages + */ +char *frame_names[] = { "INFO", "RR", "RNR", "REJ", "DM", "SABME", "DISC", + "UA", "FRMR", "UI", "XID", "TEST", "ILLEGAL", "TIMER", "N2xT1"}; + + +/* + * Trace level + */ +int llc_tracelevel = LLCTR_URGENT; + +/* + * Values for accessing various bitfields + */ +struct bitslice llc_bitslice[] = { +/* mask, shift value */ + { 0x1, 0x0 }, + { 0xfe, 0x1 }, + { 0x3, 0x0 }, + { 0xc, 0x2 }, + { 0x10, 0x4 }, + { 0xe0, 0x5 }, + { 0x1f, 0x0 } +}; + +/* + * We keep the link control blocks on a doubly linked list - + * primarily for checking in llc_timer() + */ + +struct llccb_q llccb_q = { &llccb_q, &llccb_q }; + +/* + * Flag for signalling whether route tree for AF_LINK has been + * initialized yet. + */ + +int af_link_rts_init_done = 0; + + +/* + * Functions dealing with struct sockaddr_dl */ + +/* Compare sdl_a w/ sdl_b: returns 0 when both have the same LLADDRLEN and their sdl_data agree over that length, nonzero otherwise */ + +sdl_cmp(struct sockaddr_dl *sdl_a, struct sockaddr_dl *sdl_b) +{ + if (LLADDRLEN(sdl_a) != LLADDRLEN(sdl_b)) + return(1); + return(bcmp((caddr_t) sdl_a->sdl_data, (caddr_t) sdl_b->sdl_data, + LLADDRLEN(sdl_a))); +} + +/* Copy sdl_f to sdl_t (sdl_f->sdl_len bytes; the caller provides the room) */ + +sdl_copy(struct sockaddr_dl *sdl_f, struct sockaddr_dl *sdl_t) +{ + bcopy((caddr_t) sdl_f, (caddr_t) sdl_t, sdl_f->sdl_len); +} + +/* Swap sdl_a w/ sdl_b, going through a temporary on the stack */ + +sdl_swapaddr(struct sockaddr_dl *sdl_a, struct sockaddr_dl *sdl_b) +{ + struct sockaddr_dl sdl_tmp; + + sdl_copy(sdl_a, &sdl_tmp); + sdl_copy(sdl_b, sdl_a); + sdl_copy(&sdl_tmp, sdl_b); +} + +/* Fetch the sdl of the associated if: the first AF_LINK address on if_addrlist, or 0 when there is none */ + +struct sockaddr_dl * +sdl_getaddrif(struct ifnet *ifp) +{ + register struct ifaddr *ifa; + + for(ifa = ifp->if_addrlist; ifa; ifa = ifa->ifa_next) + if (ifa->ifa_addr->sa_family == AF_LINK ) + return((struct sockaddr_dl *)(ifa->ifa_addr)); + + return((struct
sockaddr_dl *)0); +} + +/* Check addr of interface with the one given: returns 1 when some AF_LINK address of ifp compares equal to sdl_c under sdl_cmp(), else 0 */ + +sdl_checkaddrif(struct ifnet *ifp, struct sockaddr_dl *sdl_c) +{ + register struct ifaddr *ifa; + + for(ifa = ifp->if_addrlist; ifa; ifa = ifa->ifa_next) + if ((ifa->ifa_addr->sa_family == AF_LINK ) && + !sdl_cmp((struct sockaddr_dl *)(ifa->ifa_addr), sdl_c)) + return(1); + + return(0); +} + +/* Build an sdl from MAC addr, DLSAP addr, and interface: the interface's sdl is copied, then the MAC address followed by the DLSAP byte is stored as the link address. Returns 1, or 0 when ifp has no AF_LINK address */ + +sdl_setaddrif(struct ifnet *ifp, u_char *mac_addr, u_char dlsap_addr, + u_char mac_len, struct sockaddr_dl *sdl_to) +{ + register struct sockaddr_dl *sdl_tmp; + + if ((sdl_tmp = sdl_getaddrif(ifp)) ) { + sdl_copy(sdl_tmp, sdl_to); + bcopy((caddr_t) mac_addr, (caddr_t) LLADDR(sdl_to), mac_len); + *(LLADDR(sdl_to)+mac_len) = dlsap_addr; + sdl_to->sdl_alen = mac_len+1; + return(1); + } else return(0); +} + +/* Fill out the sdl header aggregate: source and destination are both built with sdl_setaddrif(); returns 1 only when both succeed */ + +sdl_sethdrif(struct ifnet *ifp, u_char *mac_src, u_char dlsap_src, u_char *mac_dst, + u_char dlsap_dst, u_char mac_len, struct sdl_hdr *sdlhdr_to) +{ + if ( !sdl_setaddrif(ifp, mac_src, dlsap_src, mac_len, + &sdlhdr_to->sdlhdr_src) || + !sdl_setaddrif(ifp, mac_dst, dlsap_dst, mac_len, + &sdlhdr_to->sdlhdr_dst) ) + return(0); + else return(1); +} + +static struct sockaddr_dl sap_saddr; +static struct sockaddr_dl sap_sgate = { + sizeof(struct sockaddr_dl), /* _len */ + AF_LINK /* _af */ +}; + +/* + * Set sapinfo for SAP address, llcconfig, af, and interface + */ +struct npaidbentry * +llc_setsapinfo(struct ifnet *ifp, u_char af, u_char sap, struct dllconfig *llconf) +{ + struct protosw *pp; + struct sockaddr_dl *ifdl_addr; + struct rtentry *sirt = (struct rtentry *)0; + struct npaidbentry *sapinfo; + u_char saploc; + int size = sizeof(struct npaidbentry); + + USES_AF_LINK_RTS; + + /* + * We rely/assume that only STREAM protocols will make use of + * connection oriented LLC2. If this will one day not be the + * case this will obviously fail.
+ */ + pp = pffindtype (af, SOCK_STREAM); + if (pp == 0 || pp->pr_input == 0 || pp->pr_ctlinput == 0) { + printf("network level protosw error"); + return 0; + } + + /* + * We need a way to jot down the LLC2 configuration for + * a certain LSAP address. To do this we enter + * a "route" for the SAP. + */ + ifdl_addr = sdl_getaddrif(ifp); + sdl_copy(ifdl_addr, &sap_saddr); + sdl_copy(ifdl_addr, &sap_sgate); + saploc = LLSAPLOC(&sap_saddr, ifp); + sap_saddr.sdl_data[saploc] = sap; + sap_saddr.sdl_alen++; + + /* now enter it */ + rtrequest(RTM_ADD, (struct sockaddr *)&sap_saddr, + (struct sockaddr *)&sap_sgate, 0, 0, &sirt); + if (sirt == 0) + return 0; + + /* Plug in config information in rt->rt_llinfo */ + + sirt->rt_llinfo = malloc(size , M_PCB, M_WAITOK); + sapinfo = (struct npaidbentry *) sirt->rt_llinfo; + if (sapinfo) { + bzero ((caddr_t)sapinfo, size); + /* + * For the time being we support LLC CLASS II here + * only + */ + sapinfo->si_class = LLC_CLASS_II; + sapinfo->si_window = llconf->dllcfg_window; + sapinfo->si_trace = llconf->dllcfg_trace; + if (sapinfo->si_trace) + llc_tracelevel--; + else llc_tracelevel++; + sapinfo->si_input = pp->pr_input; + sapinfo->si_ctlinput = (caddr_t (*)())pp->pr_ctlinput; + + return (sapinfo); + } + + return 0; +} + +/* + * Get sapinfo for SAP address and interface + */ +struct npaidbentry * +llc_getsapinfo(u_char sap, struct ifnet *ifp) +{ + struct sockaddr_dl *ifdl_addr; + struct sockaddr_dl si_addr; + struct rtentry *sirt; + u_char saploc; + + USES_AF_LINK_RTS; + + ifdl_addr = sdl_getaddrif(ifp); + sdl_copy(ifdl_addr, &si_addr); + saploc = LLSAPLOC(&si_addr, ifp); + si_addr.sdl_data[saploc] = sap; + si_addr.sdl_alen++; + + if ((sirt = rtalloc1((struct sockaddr *)&si_addr, 0))) + sirt->rt_refcnt--; + else return(0); + + return((struct npaidbentry *)sirt->rt_llinfo); +} + +/* + * llc_seq2slot() --- We only allocate enough memory to hold the window. 
This + * introduces the necessity to keep track of two ``pointers'' + * + * o llcl_freeslot the next free slot to be used + * this one advances modulo llcl_window + * o llcl_projvs the V(S) associated with the next frame + * to be set via llcl_freeslot + * this one advances modulo LLC_MAX_SEQUENCE + * + * A new frame is inserted at llcl_output_buffers[llcl_freeslot], after + * which both llcl_freeslot and llcl_projvs are incremented. + * + * The slot sl(sn) for any given sequence number sn is given by + * + * sl(sn) = (llcl_freeslot + llcl_window - 1 - (llcl_projvs + + * LLC_MAX_SEQUENCE- sn) % LLC_MAX_SEQUENCE) % + * llcl_window + * + * i.e. we first calculate the number of frames we need to ``go back'' + * from the current one (really the next one, but that doesn't matter as + * llcl_projvs is likewise off by plus one) and subtract that from the + * pointer to the most recently taken frame (llcl_freeslot - 1). + */ + +short +llc_seq2slot(struct llc_linkcb *linkp, short seqn) +{ + register sn = 0; /* implicit int */ + /* NOTE(review): unlike the formula in the header comment, no 1 is subtracted here --- confirm which of the two is intended. */ + sn = (linkp->llcl_freeslot + linkp->llcl_window - + (linkp->llcl_projvs + LLC_MAX_SEQUENCE - seqn) % + LLC_MAX_SEQUENCE) % linkp->llcl_window; + + return sn; +} + +/* + * LLC2 link state handler + * + * There is in most cases one function per LLC2 state. The LLC2 standard + * ISO 8802-2 allows in some cases for ambiguities, i.e. we have the choice + * to do one thing or the other. Right now I have just chosen one but have also + * indicated the spot by "multiple possibilities". One could make the behavior + * in those cases configurable, allowing the superuser to enter a profile word + * (32/64 bits, whatever is needed) that would suit her needs [I quite like + * that idea, perhaps I'll get around to it]. + * + * [Preceding each state handler function is the description as taken from + * ISO 8802-2, section 7.9.2.1] + */ + +/* + * ADM --- The connection component is in the asynchronous disconnected mode.
+ * It can accept an SABME PDU from a remote LLC SSAP or, at the request + * of the service access point user, can initiate an SABME PDU + * transmission to a remote LLC DSAP, to establish a data link + * connection. It also responds to a DISC command PDU and to any + * command PDU with the P bit set to ``1''. + */ +int +llc_state_ADM(struct llc_linkcb *linkp, struct llc *frame, int frame_kind, + int cmdrsp, int pollfinal) +{ + int action = 0; /* action code returned to the caller; stays 0 unless an indication is raised */ + /* Events are dispatched on the sum of frame kind and command/response value. */ + switch(frame_kind + cmdrsp) { + case NL_CONNECT_REQUEST: /* NOTE(review): no LLC_START_ACK_TIMER() here, whereas the RESET_WAIT handler starts it after sending SABME --- confirm */ + llc_send(linkp, LLCFT_SABME, LLC_CMD, pollfinal); + LLC_SETFLAG(linkp, P, pollfinal); + LLC_SETFLAG(linkp, S, 0); + linkp->llcl_retry = 0; + LLC_NEWSTATE(linkp, SETUP); + break; + case LLCFT_SABME + LLC_CMD: + /* + * ISO 8802-2, table 7-1, ADM state says to set + * the P flag, yet this will cause an SABME [P] to be + * answered with an UA only, not an UA [F], all + * other `disconnected' states set the F flag, so ... + */ + LLC_SETFLAG(linkp, F, pollfinal); + LLC_NEWSTATE(linkp, CONN); + action = LLC_CONNECT_INDICATION; + break; + case LLCFT_DISC + LLC_CMD: + llc_send(linkp, LLCFT_DM, LLC_RSP, pollfinal); + break; + default: + if (cmdrsp == LLC_CMD && pollfinal == 1) + llc_send(linkp, LLCFT_DM, LLC_RSP, 1); + /* remain in ADM state */ + } + + return action; +} + +/* + * CONN --- The local connection component has received an SABME PDU from a + * remote LLC SSAP, and it is waiting for the local user to accept or + * refuse the connection.
+ */ +int +llc_state_CONN(struct llc_linkcb *linkp, struct llc *frame, int frame_kind, + int cmdrsp, int pollfinal) +{ + int action = 0; + + switch(frame_kind + cmdrsp) { + case NL_CONNECT_RESPONSE: + llc_send(linkp, LLCFT_UA, LLC_RSP, LLC_GETFLAG(linkp, F)); + LLC_RESETCOUNTER(linkp); + LLC_SETFLAG(linkp, P, 0); + LLC_SETFLAG(linkp, REMOTE_BUSY, 0); + LLC_NEWSTATE(linkp, NORMAL); + break; + case NL_DISCONNECT_REQUEST: + llc_send(linkp, LLCFT_DM, LLC_RSP, LLC_GETFLAG(linkp, F)); + LLC_NEWSTATE(linkp, ADM); + break; + case LLCFT_SABME + LLC_CMD: + LLC_SETFLAG(linkp, F, pollfinal); + break; + case LLCFT_DM + LLC_RSP: + LLC_NEWSTATE(linkp, ADM); + action = LLC_DISCONNECT_INDICATION; + break; + /* all other frames effect nothing here */ + } + + return action; +} + +/* + * RESET_WAIT --- The local connection component is waiting for the local user + * to indicate a RESET_REQUEST or a DISCONNECT_REQUEST. + */ +int +llc_state_RESET_WAIT(struct llc_linkcb *linkp, struct llc *frame, int frame_kind, + int cmdrsp, int pollfinal) +{ + int action = 0; + + switch(frame_kind + cmdrsp) { + case NL_RESET_REQUEST: + if (LLC_GETFLAG(linkp, S) == 0) { + llc_send(linkp, LLCFT_SABME, LLC_CMD, pollfinal); + LLC_SETFLAG(linkp, P, pollfinal); + LLC_START_ACK_TIMER(linkp); + linkp->llcl_retry = 0; + LLC_NEWSTATE(linkp, RESET); + } else { + llc_send(linkp, LLCFT_UA, LLC_RSP, + LLC_GETFLAG(linkp, F)); + LLC_RESETCOUNTER(linkp); + LLC_SETFLAG(linkp, P, 0); + LLC_SETFLAG(linkp, REMOTE_BUSY, 0); + LLC_NEWSTATE(linkp, NORMAL); + action = LLC_RESET_CONFIRM; + } + break; + case NL_DISCONNECT_REQUEST: + if (LLC_GETFLAG(linkp, S) == 0) { + llc_send(linkp, LLCFT_DISC, LLC_CMD, pollfinal); + LLC_SETFLAG(linkp, P, pollfinal); + LLC_START_ACK_TIMER(linkp); + linkp->llcl_retry = 0; + LLC_NEWSTATE(linkp, D_CONN); + } else { + llc_send(linkp, LLCFT_DM, LLC_RSP, + LLC_GETFLAG(linkp, F)); + LLC_NEWSTATE(linkp, ADM); + } + break; + case LLCFT_DM + LLC_RSP: + LLC_NEWSTATE(linkp, ADM); + action = 
LLC_DISCONNECT_INDICATION; + break; + case LLCFT_SABME + LLC_CMD: + LLC_SETFLAG(linkp, S, 1); + LLC_SETFLAG(linkp, F, pollfinal); + break; + case LLCFT_DISC + LLC_CMD: + llc_send(linkp, LLCFT_DM, LLC_RSP, pollfinal); + LLC_NEWSTATE(linkp, ADM); + action = LLC_DISCONNECT_INDICATION; + break; + } + + return action; +} + +/* + * RESET_CHECK --- The local connection component is waiting for the local user + * to accept or refuse a remote reset request. + */ +int +llc_state_RESET_CHECK(struct llc_linkcb *linkp, struct llc *frame, int frame_kind, + int cmdrsp, int pollfinal) +{ + int action = 0; + + switch(frame_kind + cmdrsp) { + case NL_RESET_RESPONSE: + llc_send(linkp, LLCFT_UA, LLC_RSP, LLC_GETFLAG(linkp, F)); + LLC_RESETCOUNTER(linkp); + LLC_SETFLAG(linkp, P, 0); + LLC_SETFLAG(linkp, REMOTE_BUSY, 0); + LLC_NEWSTATE(linkp, NORMAL); + break; + case NL_DISCONNECT_REQUEST: + llc_send(linkp, LLCFT_DM, LLC_RSP, LLC_GETFLAG(linkp, F)); + LLC_NEWSTATE(linkp, ADM); + break; + case LLCFT_DM + LLC_RSP: + action = LLC_DISCONNECT_INDICATION; + break; + case LLCFT_SABME + LLC_CMD: + LLC_SETFLAG(linkp, F, pollfinal); + break; + case LLCFT_DISC + LLC_CMD: + llc_send(linkp, LLCFT_DM, LLC_RSP, pollfinal); + LLC_NEWSTATE(linkp, ADM); + action = LLC_DISCONNECT_INDICATION; + break; + } + + return action; +} + +/* + * SETUP --- The connection component has transmitted an SABME command PDU to a + * remote LLC DSAP and is waiting for a reply. 
+ */
+/*
+ * SETUP handler.  The event is frame_kind + cmdrsp; frame is not examined.
+ * Returns LLC_CONNECT_CONFIRM when the link reaches NORMAL,
+ * LLC_DISCONNECT_INDICATION when it drops back to ADM, 0 otherwise.
+ */
+int
+llc_state_SETUP(struct llc_linkcb *linkp, struct llc *frame, int frame_kind,
+		int cmdrsp, int pollfinal)
+{
+	int action = 0;
+
+	switch(frame_kind + cmdrsp) {
+	case LLCFT_SABME + LLC_CMD:
+		/* the peer is setting up too: acknowledge and remember in S */
+		LLC_RESETCOUNTER(linkp);
+		llc_send(linkp, LLCFT_UA, LLC_RSP, pollfinal);
+		LLC_SETFLAG(linkp, S, 1);
+		break;
+	case LLCFT_UA + LLC_RSP:
+		/* only a UA whose F bit equals our P flag is acted on */
+		if (LLC_GETFLAG(linkp, P) == pollfinal) {
+			LLC_STOP_ACK_TIMER(linkp);
+			LLC_RESETCOUNTER(linkp);
+			LLC_UPDATE_P_FLAG(linkp, cmdrsp, pollfinal);
+			LLC_SETFLAG(linkp, REMOTE_BUSY, 0);
+			LLC_NEWSTATE(linkp, NORMAL);
+			action = LLC_CONNECT_CONFIRM;
+		}
+		break;
+	case LLC_ACK_TIMER_EXPIRED:
+		if (LLC_GETFLAG(linkp, S) == 1) {
+			/* a SABME was seen and answered while waiting */
+			LLC_SETFLAG(linkp, P, 0);
+			LLC_SETFLAG(linkp, REMOTE_BUSY, 0);
+			LLC_NEWSTATE(linkp, NORMAL);
+			action = LLC_CONNECT_CONFIRM;
+		} else if (linkp->llcl_retry < llc_n2) {
+			/* try again, at most llc_n2 times */
+			llc_send(linkp, LLCFT_SABME, LLC_CMD, pollfinal);
+			LLC_SETFLAG(linkp, P, pollfinal);
+			LLC_START_ACK_TIMER(linkp);
+			linkp->llcl_retry++;
+		} else {
+			LLC_NEWSTATE(linkp, ADM);
+			action = LLC_DISCONNECT_INDICATION;
+		}
+		break;
+	case LLCFT_DISC + LLC_CMD:
+		llc_send(linkp, LLCFT_DM, LLC_RSP, pollfinal);
+		LLC_STOP_ACK_TIMER(linkp);
+		LLC_NEWSTATE(linkp, ADM);
+		action = LLC_DISCONNECT_INDICATION;
+		break;
+	case LLCFT_DM + LLC_RSP:
+		LLC_STOP_ACK_TIMER(linkp);
+		LLC_NEWSTATE(linkp, ADM);
+		action = LLC_DISCONNECT_INDICATION;
+		break;
+	}
+
+	return action;
+}
+
+/*
+ * RESET --- As a result of a service access point user request or the receipt
+ *	     of a FRMR response PDU, the local connection component has sent an
+ *	     SABME command PDU to the remote LLC DSAP to reset the data link
+ *	     connection and is waiting for a reply.
+ */
+/*
+ * RESET handler.  The event is frame_kind + cmdrsp; frame is not examined.
+ * Returns LLC_RESET_CONFIRM when the link is back in NORMAL,
+ * LLC_DISCONNECT_INDICATION when it ends up in ADM, 0 otherwise.
+ */
+int
+llc_state_RESET(struct llc_linkcb *linkp, struct llc *frame, int frame_kind,
+		int cmdrsp, int pollfinal)
+{
+	switch (frame_kind + cmdrsp) {
+	case LLCFT_SABME + LLC_CMD:
+		/* the peer resets as well: note it in S and acknowledge */
+		LLC_RESETCOUNTER(linkp);
+		LLC_SETFLAG(linkp, S, 1);
+		llc_send(linkp, LLCFT_UA, LLC_RSP, pollfinal);
+		return 0;
+
+	case LLCFT_UA + LLC_RSP:
+		if (LLC_GETFLAG(linkp, P) != pollfinal)
+			return 0;	/* F bit does not answer our P */
+		LLC_STOP_ACK_TIMER(linkp);
+		LLC_RESETCOUNTER(linkp);
+		LLC_UPDATE_P_FLAG(linkp, cmdrsp, pollfinal);
+		LLC_SETFLAG(linkp, REMOTE_BUSY, 0);
+		LLC_NEWSTATE(linkp, NORMAL);
+		return LLC_RESET_CONFIRM;
+
+	case LLC_ACK_TIMER_EXPIRED:
+		if (LLC_GETFLAG(linkp, S) == 1) {
+			/* a SABME was seen and answered while waiting */
+			LLC_SETFLAG(linkp, P, 0);
+			LLC_SETFLAG(linkp, REMOTE_BUSY, 0);
+			LLC_NEWSTATE(linkp, NORMAL);
+			return LLC_RESET_CONFIRM;
+		}
+		if (linkp->llcl_retry >= llc_n2) {
+			/* out of retries */
+			LLC_NEWSTATE(linkp, ADM);
+			return LLC_DISCONNECT_INDICATION;
+		}
+		llc_send(linkp, LLCFT_SABME, LLC_CMD, pollfinal);
+		LLC_SETFLAG(linkp, P, pollfinal);
+		LLC_START_ACK_TIMER(linkp);
+		linkp->llcl_retry++;
+		return 0;
+
+	case LLCFT_DISC + LLC_CMD:
+		llc_send(linkp, LLCFT_DM, LLC_RSP, pollfinal);
+		/* FALLTHROUGH */
+	case LLCFT_DM + LLC_RSP:
+		LLC_STOP_ACK_TIMER(linkp);
+		LLC_NEWSTATE(linkp, ADM);
+		return LLC_DISCONNECT_INDICATION;
+	}
+
+	return 0;
+}
+
+/*
+ * D_CONN --- At the request of the service access point user, the local LLC
+ *	      has sent a DISC command PDU to the remote LLC DSAP and is waiting
+ *	      for a reply.
+ */
+/*
+ * D_CONN handler.  The event is frame_kind + cmdrsp; frame is not examined.
+ * No event in this state produces an indication, so the result is always 0.
+ */
+int
+llc_state_D_CONN(struct llc_linkcb *linkp, struct llc *frame, int frame_kind,
+		 int cmdrsp, int pollfinal)
+{
+	switch (frame_kind + cmdrsp) {
+	case LLCFT_DISC + LLC_CMD:
+		/* both sides disconnecting: acknowledge, keep waiting */
+		llc_send(linkp, LLCFT_UA, LLC_RSP, pollfinal);
+		break;
+
+	case LLCFT_SABME + LLC_CMD:
+		llc_send(linkp, LLCFT_DM, LLC_RSP, pollfinal);
+		LLC_STOP_ACK_TIMER(linkp);
+		LLC_NEWSTATE(linkp, ADM);
+		break;
+
+	case LLCFT_UA + LLC_RSP:
+		/* a UA only counts when its F bit equals our P flag */
+		if (LLC_GETFLAG(linkp, P) != pollfinal)
+			break;
+		/* FALLTHROUGH */
+	case LLCFT_DM + LLC_RSP:
+		LLC_STOP_ACK_TIMER(linkp);
+		LLC_NEWSTATE(linkp, ADM);
+		break;
+
+	case LLC_ACK_TIMER_EXPIRED:
+		if (linkp->llcl_retry >= llc_n2) {
+			/* out of retries, consider the link gone */
+			LLC_NEWSTATE(linkp, ADM);
+		} else {
+			llc_send(linkp, LLCFT_DISC, LLC_CMD, pollfinal);
+			LLC_SETFLAG(linkp, P, pollfinal);
+			LLC_START_ACK_TIMER(linkp);
+			linkp->llcl_retry++;
+		}
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * ERROR --- The local connection component has detected an error in a received
+ *	     PDU and has sent a FRMR response PDU. It is waiting for a reply from
+ *	     the remote connection component.
+ */
+/*
+ * ERROR handler.  The event is frame_kind + cmdrsp; frame is not examined.
+ * Returns LLC_RESET_INDICATION_REMOTE, LLC_DISCONNECT_INDICATION,
+ * LLC_FRMR_RECEIVED or LLC_RESET_INDICATION_LOCAL according to the event,
+ * 0 when there is nothing to report.
+ */
+int
+llc_state_ERROR(struct llc_linkcb *linkp, struct llc *frame, int frame_kind,
+		int cmdrsp, int pollfinal)
+{
+	switch (frame_kind + cmdrsp) {
+	case LLCFT_SABME + LLC_CMD:
+		/* the peer asks for a reset, let the user decide */
+		LLC_STOP_ACK_TIMER(linkp);
+		LLC_NEWSTATE(linkp, RESET_CHECK);
+		return LLC_RESET_INDICATION_REMOTE;
+
+	case LLCFT_DISC + LLC_CMD:
+		llc_send(linkp, LLCFT_UA, LLC_RSP, pollfinal);
+		/* FALLTHROUGH */
+	case LLCFT_DM + LLC_RSP:
+		LLC_STOP_ACK_TIMER(linkp);
+		LLC_NEWSTATE(linkp, ADM);
+		return LLC_DISCONNECT_INDICATION;
+
+	case LLCFT_FRMR + LLC_RSP:
+		LLC_STOP_ACK_TIMER(linkp);
+		LLC_SETFLAG(linkp, S, 0);
+		LLC_NEWSTATE(linkp, RESET_WAIT);
+		return LLC_FRMR_RECEIVED;
+
+	case LLC_ACK_TIMER_EXPIRED:
+		if (linkp->llcl_retry >= llc_n2) {
+			/* the FRMR went unanswered llc_n2 times */
+			LLC_SETFLAG(linkp, S, 0);
+			LLC_NEWSTATE(linkp, RESET_WAIT);
+			return LLC_RESET_INDICATION_LOCAL;
+		}
+		llc_send(linkp, LLCFT_FRMR, LLC_RSP, 0);
+		LLC_START_ACK_TIMER(linkp);
+		linkp->llcl_retry++;
+		return 0;
+	}
+
+	/* any other command is answered by repeating the FRMR */
+	if (cmdrsp == LLC_CMD) {
+		llc_send(linkp, LLCFT_FRMR, LLC_RSP, pollfinal);
+		LLC_START_ACK_TIMER(linkp);
+	}
+
+	return 0;
+}
+
+/*
+ * NORMAL, BUSY, REJECT, AWAIT, AWAIT_BUSY, and AWAIT_REJECT all share
+ * a common core state handler.
+ */
+int
+llc_state_NBRAcore(struct llc_linkcb *linkp, struct llc *frame, int frame_kind,
+		   int cmdrsp, int pollfinal)
+{	/*
+	 * Fallback for the per-state handlers: they call this function when
+	 * their own switch has left the action at LLC_PASSITON.  The event
+	 * is frame_kind + cmdrsp.  Returns the indication picked by the
+	 * matching case, or 0 when the event needs no report.
+	 */
+	int action = 0;
+
+	switch(frame_kind + cmdrsp) {
+	case NL_DISCONNECT_REQUEST:	/* user closes: send DISC, wait in D_CONN */
+		llc_send(linkp, LLCFT_DISC, LLC_CMD, pollfinal);
+		LLC_SETFLAG(linkp, P, pollfinal);
+		LLC_STOP_ALL_TIMERS(linkp);
+		LLC_START_ACK_TIMER(linkp);
+		linkp->llcl_retry = 0;
+		LLC_NEWSTATE(linkp, D_CONN);
+		break;
+	case NL_RESET_REQUEST:		/* user resets: send SABME, wait in RESET */
+		llc_send(linkp, LLCFT_SABME, LLC_CMD, pollfinal);
+		LLC_SETFLAG(linkp, P, pollfinal);
+		LLC_STOP_ALL_TIMERS(linkp);
+		LLC_START_ACK_TIMER(linkp);
+		linkp->llcl_retry = 0;
+		LLC_SETFLAG(linkp, S, 0);
+		LLC_NEWSTATE(linkp, RESET);
+		break;
+	case LLCFT_SABME + LLC_CMD:	/* peer resets: remember its P bit in F */
+		LLC_SETFLAG(linkp, F, pollfinal);
+		LLC_STOP_ALL_TIMERS(linkp);
+		LLC_NEWSTATE(linkp, RESET_CHECK);
+		action = LLC_RESET_INDICATION_REMOTE;
+		break;
+	case LLCFT_DISC + LLC_CMD:	/* peer closes: acknowledge with UA */
+		llc_send(linkp, LLCFT_UA, LLC_RSP, pollfinal);
+		LLC_STOP_ALL_TIMERS(linkp);
+		LLC_NEWSTATE(linkp, ADM);
+		action = LLC_DISCONNECT_INDICATION;
+		break;
+	case LLCFT_FRMR + LLC_RSP:	/* peer rejected one of our frames */
+		LLC_STOP_ALL_TIMERS(linkp);
+		LLC_SETFLAG(linkp, S, 0);
+		LLC_NEWSTATE(linkp, RESET_WAIT);
+		action = LLC_FRMR_RECEIVED;
+		break;
+	case LLCFT_DM + LLC_RSP:
+		LLC_STOP_ALL_TIMERS(linkp);
+		LLC_NEWSTATE(linkp, ADM);
+		action = LLC_DISCONNECT_INDICATION;
+		break;
+	case LLC_INVALID_NR + LLC_CMD:	/* bad sequence number in a command: */
+	case LLC_INVALID_NS + LLC_CMD:	/* FRMR goes out with F = received P */
+		LLC_SETFRMR(linkp, frame, cmdrsp,
+			 (frame_kind == LLC_INVALID_NR ? LLC_FRMR_Z :
+			  (LLC_FRMR_V | LLC_FRMR_W)));
+		llc_send(linkp, LLCFT_FRMR, LLC_RSP, pollfinal);
+		LLC_STOP_ALL_TIMERS(linkp);
+		LLC_START_ACK_TIMER(linkp);
+		linkp->llcl_retry = 0;
+		LLC_NEWSTATE(linkp, ERROR);
+		action = LLC_FRMR_SENT;
+		break;
+	case LLC_INVALID_NR + LLC_RSP:	/* same for responses, an unexpected */
+	case LLC_INVALID_NS + LLC_RSP:	/* UA and bad PDUs, but the FRMR is */
+	case LLCFT_UA + LLC_RSP:	/* always sent with F = 0 */
+	case LLC_BAD_PDU: {
+		char frmrcause = 0;
+
+		switch (frame_kind) {	/* same cause bits as the command case */
+		case LLC_INVALID_NR: frmrcause = LLC_FRMR_Z; break;
+		case LLC_INVALID_NS: frmrcause = LLC_FRMR_V | LLC_FRMR_W; break;
+		default: frmrcause = LLC_FRMR_W;
+		}
+		LLC_SETFRMR(linkp, frame, cmdrsp, frmrcause);
+		llc_send(linkp, LLCFT_FRMR, LLC_RSP, 0);
+		LLC_STOP_ALL_TIMERS(linkp);
+		LLC_START_ACK_TIMER(linkp);
+		linkp->llcl_retry = 0;
+		LLC_NEWSTATE(linkp, ERROR);
+		action = LLC_FRMR_SENT;
+		break;
+	}
+	default:	/*
+			 * A response with F = 1 while no poll of ours is
+			 * outstanding (P flag 0).  The position of default
+			 * ahead of the timer cases has no effect on which
+			 * case is selected.
+			 * NOTE(review): unlike the two FRMR cases above
+			 * this one does not call llc_send(), although it
+			 * reports LLC_FRMR_SENT; the FRMR would then first
+			 * go out when the ack timer expires in the ERROR
+			 * state.  Confirm whether that is intended.
+			 */
+		if (cmdrsp == LLC_RSP && pollfinal == 1 &&
+		    LLC_GETFLAG(linkp, P) == 0) {
+			LLC_SETFRMR(linkp, frame, cmdrsp, LLC_FRMR_W);
+			LLC_STOP_ALL_TIMERS(linkp);
+			LLC_START_ACK_TIMER(linkp);
+			linkp->llcl_retry = 0;
+			LLC_NEWSTATE(linkp, ERROR);
+			action = LLC_FRMR_SENT;
+		}
+		break;
+	case LLC_P_TIMER_EXPIRED:	/* a timer ran out and the state */
+	case LLC_ACK_TIMER_EXPIRED:	/* handler did not retry: give up */
+	case LLC_REJ_TIMER_EXPIRED:	/* once llc_n2 retries are used */
+	case LLC_BUSY_TIMER_EXPIRED:
+		if (linkp->llcl_retry >= llc_n2) {
+			LLC_STOP_ALL_TIMERS(linkp);
+			LLC_SETFLAG(linkp, S, 0);
+			LLC_NEWSTATE(linkp, RESET_WAIT);
+			action = LLC_RESET_INDICATION_LOCAL;
+		}
+		break;
+	}
+
+	return action;
+}
+
+/*
+ * NORMAL --- A data link connection exists between the local LLC service access
+ *	      point and the remote LLC service access point. Sending and
+ *	      reception of information and supervisory PDUs can be performed.
+ */
+int
+llc_state_NORMAL(struct llc_linkcb *linkp, struct llc *frame, int frame_kind,
+		 int cmdrsp, int pollfinal)
+{	/*
+	 * The event is frame_kind + cmdrsp.  action starts out as
+	 * LLC_PASSITON; a branch that deals with the event replaces it.  If
+	 * it is still LLC_PASSITON after the switch, the event is handed to
+	 * llc_state_NBRAcore() and that function's result is returned.
+	 * Several branches only pass action to LLC_CLEAR_REMOTE_BUSY() or
+	 * LLC_SET_REMOTE_BUSY(); whether those macros modify it cannot be
+	 * seen from here -- TODO confirm against their definitions.
+	 */
+	int action = LLC_PASSITON;
+
+	switch(frame_kind + cmdrsp) {
+	case NL_DATA_REQUEST:	/* user data; not handled here while the peer is busy */
+		if (LLC_GETFLAG(linkp, REMOTE_BUSY) == 0) {
+#ifdef not_now	/* polling variant, compiled only when not_now is defined */
+			if (LLC_GETFLAG(linkp, P) == 0) {
+				/* multiple possibilities */
+				llc_send(linkp, LLCFT_INFO, LLC_CMD, 1);
+				LLC_START_P_TIMER(linkp);
+				if (LLC_TIMERXPIRED(linkp, ACK) != LLC_TIMER_RUNNING)
+					LLC_START_ACK_TIMER(linkp);
+			} else {
+#endif
+				/* multiple possibilities */
+				llc_send(linkp, LLCFT_INFO, LLC_CMD, 0);
+				if (LLC_TIMERXPIRED(linkp, ACK) != LLC_TIMER_RUNNING)
+					LLC_START_ACK_TIMER(linkp);
+#ifdef not_now
+			}
+#endif
+			action = 0;
+		}
+		break;
+	case LLC_LOCAL_BUSY_DETECTED:	/* we ran out of room: RNR, go to BUSY */
+		if (LLC_GETFLAG(linkp, P) == 0) {
+			/* multiple possibilities --- action-wise */
+			/* multiple possibilities --- CMD/RSP-wise */
+			llc_send(linkp, LLCFT_RNR, LLC_CMD, 0);
+			LLC_START_P_TIMER(linkp);	/* the only difference between the two arms */
+			LLC_SETFLAG(linkp, DATA, 0);
+			LLC_NEWSTATE(linkp, BUSY);
+			action = 0;
+		} else {
+			/* multiple possibilities --- CMD/RSP-wise */
+			llc_send(linkp, LLCFT_RNR, LLC_CMD, 0);
+			LLC_SETFLAG(linkp, DATA, 0);
+			LLC_NEWSTATE(linkp, BUSY);
+			action = 0;
+		}
+		break;
+	case LLC_INVALID_NS + LLC_CMD:	/* I PDU out of sequence: send REJ, */
+	case LLC_INVALID_NS + LLC_RSP: {	/* continue in REJECT */
+		register int p = LLC_GETFLAG(linkp, P);
+		register int nr = LLCGBITS(frame->llc_control_ext, s_nr);
+
+		if (cmdrsp == LLC_CMD && pollfinal == 1) {	/* polled: REJ response, F = 1 */
+			llc_send(linkp, LLCFT_REJ, LLC_RSP, 1);
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			LLC_START_REJ_TIMER(linkp);
+			LLC_NEWSTATE(linkp, REJECT);
+			action = 0;
+		} else if (pollfinal == 0 && p == 1) {	/* our poll is pending: REJ without P */
+			llc_send(linkp, LLCFT_REJ, LLC_CMD, 0);
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			LLC_START_REJ_TIMER(linkp);
+			LLC_NEWSTATE(linkp, REJECT);
+			action = 0;
+		} else if ((pollfinal == 0 && p == 0) ||
+			   (pollfinal == 1 && p == 1 && cmdrsp == LLC_RSP)) {	/* free to poll: REJ with P = 1 */
+			llc_send(linkp, LLCFT_REJ, LLC_CMD, 1);
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			LLC_START_P_TIMER(linkp);
+			LLC_START_REJ_TIMER(linkp);
+			if (cmdrsp == LLC_RSP && pollfinal == 1) {
+				LLC_CLEAR_REMOTE_BUSY(linkp, action);
+			} else action = 0;
+			LLC_NEWSTATE(linkp, REJECT);
+		}
+		break;
+	}
+	case LLCFT_INFO + LLC_CMD:	/* in-sequence I PDU: count it in V(R), */
+	case LLCFT_INFO + LLC_RSP: {	/* acknowledge, report the data */
+		register int p = LLC_GETFLAG(linkp, P);
+		register int nr = LLCGBITS(frame->llc_control_ext, s_nr);
+
+		if (cmdrsp == LLC_CMD && pollfinal == 1) {
+			LLC_INC(linkp->llcl_vr);
+			LLC_SENDACKNOWLEDGE(linkp, LLC_RSP, 1);
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			action = LLC_DATA_INDICATION;
+		} else if (pollfinal == 0 && p == 1) {
+			LLC_INC(linkp->llcl_vr);
+			LLC_SENDACKNOWLEDGE(linkp, LLC_CMD, 0);
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			action = LLC_DATA_INDICATION;
+		} else if ((pollfinal == 0 && p == 0 && cmdrsp == LLC_CMD) ||
+			   (pollfinal == p && cmdrsp == LLC_RSP)) {
+			LLC_INC(linkp->llcl_vr);
+			LLC_UPDATE_P_FLAG(linkp, cmdrsp, pollfinal);
+			LLC_SENDACKNOWLEDGE(linkp, LLC_CMD, 0);
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			if (cmdrsp == LLC_RSP && pollfinal == 1)
+				LLC_CLEAR_REMOTE_BUSY(linkp, action);	/* action is overwritten right below */
+			action = LLC_DATA_INDICATION;
+		}
+		break;
+	}
+	case LLCFT_RR + LLC_CMD:	/* peer is ready: take its N(R), clear */
+	case LLCFT_RR + LLC_RSP: {	/* the remote busy condition */
+		register int p = LLC_GETFLAG(linkp, P);
+		register int nr = LLCGBITS(frame->llc_control_ext, s_nr);
+
+		if (cmdrsp == LLC_CMD && pollfinal == 1) {
+			LLC_SENDACKNOWLEDGE(linkp, LLC_RSP, 1);
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			LLC_CLEAR_REMOTE_BUSY(linkp, action);
+		} else if ((pollfinal == 0) ||
+			   (cmdrsp == LLC_RSP && pollfinal == 1 && p == 1)) {
+			LLC_UPDATE_P_FLAG(linkp, cmdrsp, pollfinal);
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			LLC_CLEAR_REMOTE_BUSY(linkp, action);
+		}
+		break;
+	}
+	case LLCFT_RNR + LLC_CMD:	/* peer is not ready: take its N(R), */
+	case LLCFT_RNR + LLC_RSP: {	/* set the remote busy condition */
+		register int p = LLC_GETFLAG(linkp, P);
+		register int nr = LLCGBITS(frame->llc_control_ext, s_nr);
+
+		if (cmdrsp == LLC_CMD && pollfinal == 1) {
+			llc_send(linkp, LLCFT_RR, LLC_RSP, 1);
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			LLC_SET_REMOTE_BUSY(linkp, action);
+		} else if ((pollfinal == 0) ||
+			   (cmdrsp == LLC_RSP && pollfinal == 1 && p == 1)) {
+			LLC_UPDATE_P_FLAG(linkp, cmdrsp, pollfinal);
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			LLC_SET_REMOTE_BUSY(linkp, action);
+		}
+		break;
+	}
+	case LLCFT_REJ + LLC_CMD:	/* peer wants a resend: back V(S) up to */
+	case LLCFT_REJ + LLC_RSP: {	/* its N(R) and send again from there */
+		register int p = LLC_GETFLAG(linkp, P);
+		register int nr = LLCGBITS(frame->llc_control_ext, s_nr);
+
+		if (cmdrsp == LLC_CMD && pollfinal == 1) {
+			linkp->llcl_vs = nr;
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			llc_resend(linkp, LLC_RSP, 1);
+			LLC_CLEAR_REMOTE_BUSY(linkp, action);
+		} else if (pollfinal == 0 && p == 1) {
+			linkp->llcl_vs = nr;
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			llc_resend(linkp, LLC_CMD, 0);
+			LLC_CLEAR_REMOTE_BUSY(linkp, action);
+		} else if ((pollfinal == 0 && p == 0 && cmdrsp == LLC_CMD) ||
+			   (pollfinal == p && cmdrsp == LLC_RSP)) {
+			linkp->llcl_vs = nr;
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			LLC_START_P_TIMER(linkp);
+			llc_resend(linkp, LLC_CMD, 1);
+			LLC_CLEAR_REMOTE_BUSY(linkp, action);
+		}
+		break;
+	}
+	case NL_INITIATE_PF_CYCLE:	/* poll the peer unless a poll is pending */
+		if (LLC_GETFLAG(linkp, P) == 0) {
+			llc_send(linkp, LLCFT_RR, LLC_CMD, 1);
+			LLC_START_P_TIMER(linkp);
+			action = 0;
+		}
+		break;
+	case LLC_P_TIMER_EXPIRED:	/* poll unanswered: poll again from AWAIT */
+		if (linkp->llcl_retry < llc_n2) {
+			llc_send(linkp, LLCFT_RR, LLC_CMD, 1);
+			LLC_START_P_TIMER(linkp);
+			linkp->llcl_retry++;
+			LLC_NEWSTATE(linkp, AWAIT);
+			action = 0;
+		}
+		break;
+	case LLC_ACK_TIMER_EXPIRED:	/* as above, but only if no poll of */
+	case LLC_BUSY_TIMER_EXPIRED:	/* ours is pending */
+		if ((LLC_GETFLAG(linkp, P) == 0)
+		    && (linkp->llcl_retry < llc_n2)) {
+			llc_send(linkp, LLCFT_RR, LLC_CMD, 1);
+			LLC_START_P_TIMER(linkp);
+			linkp->llcl_retry++;
+			LLC_NEWSTATE(linkp, AWAIT);
+			action = 0;
+		}
+		break;
+	}
+	if (action == LLC_PASSITON)	/* nothing above dealt with the event */
+		action = llc_state_NBRAcore(linkp, frame, frame_kind,
+					    cmdrsp, pollfinal);
+
+	return action;
+}
+
+/*
+ * BUSY --- A data link connection exists between the local LLC service access
+ *	    point and the remote LLC service access point. I PDUs may be sent.
+ * Local conditions make it likely that the information feld of + * received I PDUs will be ignored. Supervisory PDUs may be both sent + * and received. + */ +int +llc_state_BUSY(struct llc_linkcb *linkp, struct llc *frame, int frame_kind, + int cmdrsp, int pollfinal) +{ + int action = LLC_PASSITON; + + switch(frame_kind + cmdrsp) { + case NL_DATA_REQUEST: + if (LLC_GETFLAG(linkp, REMOTE_BUSY) == 0) + if (LLC_GETFLAG(linkp, P) == 0) { + llc_send(linkp, LLCFT_INFO, LLC_CMD, 1); + LLC_START_P_TIMER(linkp); + if (LLC_TIMERXPIRED(linkp, ACK) != LLC_TIMER_RUNNING) + LLC_START_ACK_TIMER(linkp); + action = 0; + } else { + llc_send(linkp, LLCFT_INFO, LLC_CMD, 0); + if (LLC_TIMERXPIRED(linkp, ACK) != LLC_TIMER_RUNNING) + LLC_START_ACK_TIMER(linkp); + action = 0; + } + break; + case LLC_LOCAL_BUSY_CLEARED: { + register int p = LLC_GETFLAG(linkp, P); + register int df = LLC_GETFLAG(linkp, DATA); + + switch (df) { + case 1: + if (p == 0) { + /* multiple possibilities */ + llc_send(linkp, LLCFT_REJ, LLC_CMD, 1); + LLC_START_REJ_TIMER(linkp); + LLC_START_P_TIMER(linkp); + LLC_NEWSTATE(linkp, REJECT); + action = 0; + } else { + llc_send(linkp, LLCFT_REJ, LLC_CMD, 0); + LLC_START_REJ_TIMER(linkp); + LLC_NEWSTATE(linkp, REJECT); + action = 0; + } + break; + case 0: + if (p == 0) { + /* multiple possibilities */ + llc_send(linkp, LLCFT_RR, LLC_CMD, 1); + LLC_START_P_TIMER(linkp); + LLC_NEWSTATE(linkp, NORMAL); + action = 0; + } else { + llc_send(linkp, LLCFT_RR, LLC_CMD, 0); + LLC_NEWSTATE(linkp, NORMAL); + action = 0; + } + break; + case 2: + if (p == 0) { + /* multiple possibilities */ + llc_send(linkp, LLCFT_RR, LLC_CMD, 1); + LLC_START_P_TIMER(linkp); + LLC_NEWSTATE(linkp, REJECT); + action = 0; + } else { + llc_send(linkp, LLCFT_RR, LLC_CMD, 0); + LLC_NEWSTATE(linkp, REJECT); + action =0; + } + break; + } + break; + } + case LLC_INVALID_NS + LLC_CMD: + case LLC_INVALID_NS + LLC_RSP: { + register int p = LLC_GETFLAG(linkp, P); + register int nr = 
LLCGBITS(frame->llc_control_ext, s_nr); + + if (cmdrsp == LLC_CMD && pollfinal == 1) { + llc_send(linkp, LLCFT_RNR, LLC_RSP, 1); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + if (LLC_GETFLAG(linkp, DATA) == 0) + LLC_SETFLAG(linkp, DATA, 1); + action = 0; + } else if ((cmdrsp == LLC_CMD && pollfinal == 0 && p == 0) || + (cmdrsp == LLC_RSP && pollfinal == p)) { + llc_send(linkp, LLCFT_RNR, LLC_CMD, 0); + LLC_UPDATE_P_FLAG(linkp, cmdrsp, pollfinal); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + if (LLC_GETFLAG(linkp, DATA) == 0) + LLC_SETFLAG(linkp, DATA, 1); + if (cmdrsp == LLC_RSP && pollfinal == 1) { + LLC_CLEAR_REMOTE_BUSY(linkp, action); + } else action = 0; + } else if (pollfinal == 0 && p == 1) { + llc_send(linkp, LLCFT_RNR, LLC_RSP, 1); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + if (LLC_GETFLAG(linkp, DATA) == 0) + LLC_SETFLAG(linkp, DATA, 1); + action = 0; + } + break; + } + case LLCFT_INFO + LLC_CMD: + case LLCFT_INFO + LLC_RSP: { + register int p = LLC_GETFLAG(linkp, P); + register int nr = LLCGBITS(frame->llc_control_ext, s_nr); + + if (cmdrsp == LLC_CMD && pollfinal == 1) { + LLC_INC(linkp->llcl_vr); + llc_send(linkp, LLCFT_RNR, LLC_RSP, 1); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + if (LLC_GETFLAG(linkp, DATA) == 2) + LLC_STOP_REJ_TIMER(linkp); + LLC_SETFLAG(linkp, DATA, 0); + action = LLC_DATA_INDICATION; + } else if ((cmdrsp == LLC_CMD && pollfinal == 0 && p == 0) || + (cmdrsp == LLC_RSP && pollfinal == p)) { + LLC_INC(linkp->llcl_vr); + llc_send(linkp, LLCFT_RNR, LLC_CMD, 1); + LLC_START_P_TIMER(linkp); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + if (LLC_GETFLAG(linkp, DATA) == 2) + LLC_STOP_REJ_TIMER(linkp); + if (cmdrsp == LLC_RSP && pollfinal == 1) + LLC_CLEAR_REMOTE_BUSY(linkp, action); + action = LLC_DATA_INDICATION; + } else if (pollfinal == 0 && p == 1) { + LLC_INC(linkp->llcl_vr); + llc_send(linkp, LLCFT_RNR, LLC_CMD, 0); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + if (LLC_GETFLAG(linkp, DATA) == 2) + LLC_STOP_REJ_TIMER(linkp); + LLC_SETFLAG(linkp, DATA, 0); + action = 
LLC_DATA_INDICATION; + } + break; + } + case LLCFT_RR + LLC_CMD: + case LLCFT_RR + LLC_RSP: + case LLCFT_RNR + LLC_CMD: + case LLCFT_RNR + LLC_RSP: { + register int p = LLC_GETFLAG(linkp, P); + register int nr = LLCGBITS(frame->llc_control_ext, s_nr); + + if (cmdrsp == LLC_CMD && pollfinal == 1) { + llc_send(linkp, LLCFT_RNR, LLC_RSP, 1); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + if (frame_kind == LLCFT_RR) { + LLC_CLEAR_REMOTE_BUSY(linkp, action); + } else { + LLC_SET_REMOTE_BUSY(linkp, action); + } + } else if (pollfinal = 0 || + (cmdrsp == LLC_RSP && pollfinal == 1)) { + LLC_UPDATE_P_FLAG(linkp, cmdrsp, pollfinal); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + if (frame_kind == LLCFT_RR) { + LLC_CLEAR_REMOTE_BUSY(linkp, action); + } else { + LLC_SET_REMOTE_BUSY(linkp, action); + } + } + break; + } + case LLCFT_REJ + LLC_CMD: + case LLCFT_REJ + LLC_RSP: { + register int p = LLC_GETFLAG(linkp, P); + register int nr = LLCGBITS(frame->llc_control_ext, s_nr); + + if (cmdrsp == LLC_CMD && pollfinal == 1) { + linkp->llcl_vs = nr; + LLC_UPDATE_NR_RECEIVED(linkp, nr); + llc_send(linkp, LLCFT_RNR, LLC_RSP, 1); + llc_resend(linkp, LLC_CMD, 0); + LLC_CLEAR_REMOTE_BUSY(linkp, action); + } else if ((cmdrsp == LLC_CMD && pollfinal == 0 && p == 0) || + (cmdrsp == LLC_RSP && pollfinal == p)) { + linkp->llcl_vs = nr; + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_UPDATE_P_FLAG(linkp, cmdrsp, pollfinal); + llc_resend(linkp, LLC_CMD, 0); + LLC_CLEAR_REMOTE_BUSY(linkp, action); + } else if (pollfinal == 0 && p == 1) { + linkp->llcl_vs = nr; + LLC_UPDATE_NR_RECEIVED(linkp, nr); + llc_resend(linkp, LLC_CMD, 0); + LLC_CLEAR_REMOTE_BUSY(linkp, action); + } + break; + } + case NL_INITIATE_PF_CYCLE: + if (LLC_GETFLAG(linkp, P) == 0) { + llc_send(linkp, LLCFT_RNR, LLC_CMD, 1); + LLC_START_P_TIMER(linkp); + action = 0; + } + break; + case LLC_P_TIMER_EXPIRED: + /* multiple possibilities */ + if (linkp->llcl_retry < llc_n2) { + llc_send(linkp, LLCFT_RNR, LLC_CMD, 1); + LLC_START_P_TIMER(linkp); + 
linkp->llcl_retry++; + LLC_NEWSTATE(linkp, AWAIT_BUSY); + action = 0; + } + break; + case LLC_ACK_TIMER_EXPIRED: + case LLC_BUSY_TIMER_EXPIRED: + if (LLC_GETFLAG(linkp, P) == 0 && linkp->llcl_retry < llc_n2) { + llc_send(linkp, LLCFT_RNR, LLC_CMD, 1); + LLC_START_P_TIMER(linkp); + linkp->llcl_retry++; + LLC_NEWSTATE(linkp, AWAIT_BUSY); + action = 0; + } + break; + case LLC_REJ_TIMER_EXPIRED: + if (linkp->llcl_retry < llc_n2) + if (LLC_GETFLAG(linkp, P) == 0) { + /* multiple possibilities */ + llc_send(linkp, LLCFT_RNR, LLC_CMD, 1); + LLC_START_P_TIMER(linkp); + linkp->llcl_retry++; + LLC_SETFLAG(linkp, DATA, 1); + LLC_NEWSTATE(linkp, AWAIT_BUSY); + action = 0; + } else{ + LLC_SETFLAG(linkp, DATA, 1); + LLC_NEWSTATE(linkp, BUSY); + action = 0; + } + + break; + } + if (action == LLC_PASSITON) + action = llc_state_NBRAcore(linkp, frame, frame_kind, + cmdrsp, pollfinal); + + return action; +} + +/* + * REJECT --- A data link connection exists between the local LLC service + * access point and the remote LLC service access point. The local + * connection component has requested that the remote connection + * component resend a specific I PDU that the local connection + * componnent has detected as being out of sequence. Both I PDUs and + * supervisory PDUs may be sent and received. 
+ */
+/*
+ * REJECT handler.  The event is frame_kind + cmdrsp.  action starts out as
+ * LLC_PASSITON; if no branch below replaces it, the event is handed on to
+ * llc_state_NBRAcore() and that function's result is returned.
+ */
+int
+llc_state_REJECT(struct llc_linkcb *linkp, struct llc *frame, int frame_kind,
+		 int cmdrsp, int pollfinal)
+{
+	int action = LLC_PASSITON;
+
+	switch(frame_kind + cmdrsp) {
+	case NL_DATA_REQUEST:
+		if (LLC_GETFLAG(linkp, P) == 0) {
+			llc_send(linkp, LLCFT_INFO, LLC_CMD, 1);
+			LLC_START_P_TIMER(linkp);
+			if (LLC_TIMERXPIRED(linkp, ACK) != LLC_TIMER_RUNNING)
+				LLC_START_ACK_TIMER(linkp);
+			LLC_NEWSTATE(linkp, REJECT);
+			action = 0;
+		} else {
+			llc_send(linkp, LLCFT_INFO, LLC_CMD, 0);
+			if (LLC_TIMERXPIRED(linkp, ACK) != LLC_TIMER_RUNNING)
+				LLC_START_ACK_TIMER(linkp);
+			LLC_NEWSTATE(linkp, REJECT);
+			action = 0;
+		}
+		break;
+	/*
+	 * NOTE(review): the other handlers in this file spell this event
+	 * LLC_LOCAL_BUSY_DETECTED -- confirm that both names are defined
+	 * and mean the same event.
+	 */
+	case NL_LOCAL_BUSY_DETECTED:
+		if (LLC_GETFLAG(linkp, P) == 0) {
+			llc_send(linkp, LLCFT_RNR, LLC_CMD, 1);
+			LLC_START_P_TIMER(linkp);
+			LLC_SETFLAG(linkp, DATA, 2);
+			LLC_NEWSTATE(linkp, BUSY);
+			action = 0;
+		} else {
+			llc_send(linkp, LLCFT_RNR, LLC_CMD, 0);
+			LLC_SETFLAG(linkp, DATA, 2);
+			LLC_NEWSTATE(linkp, BUSY);
+			action = 0;
+		}
+		break;
+	case LLC_INVALID_NS + LLC_CMD:
+	case LLC_INVALID_NS + LLC_RSP: {
+		register int p = LLC_GETFLAG(linkp, P);
+		register int nr = LLCGBITS(frame->llc_control_ext, s_nr);
+
+		/* a REJ is already outstanding, no further REJ is sent */
+		if (cmdrsp == LLC_CMD && pollfinal == 1) {
+			llc_send(linkp, LLCFT_RR, LLC_RSP, 1);
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			action = 0;
+		} else if (pollfinal == 0 ||
+			   (cmdrsp == LLC_RSP && pollfinal == 1 && p == 1)) {
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			LLC_UPDATE_P_FLAG(linkp, cmdrsp, pollfinal);
+			if (cmdrsp == LLC_RSP && pollfinal == 1) {
+				LLC_CLEAR_REMOTE_BUSY(linkp, action);
+			} else action = 0;
+		}
+		break;
+	}
+	case LLCFT_INFO + LLC_CMD:
+	case LLCFT_INFO + LLC_RSP: {
+		register int p = LLC_GETFLAG(linkp, P);
+		register int nr = LLCGBITS(frame->llc_control_ext, s_nr);
+
+		/* an in-sequence I PDU ends the reject condition */
+		if (cmdrsp == LLC_CMD && pollfinal == 1) {
+			LLC_INC(linkp->llcl_vr);
+			LLC_SENDACKNOWLEDGE(linkp, LLC_RSP, 1);
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			LLC_STOP_REJ_TIMER(linkp);
+			LLC_NEWSTATE(linkp, NORMAL);
+			action = LLC_DATA_INDICATION;
+		/*
+		 * This used to read "cmdrsp = LLC_RSP && ...", an
+		 * assignment: the branch was taken for commands too and
+		 * cmdrsp was overwritten for the code that follows.
+		 */
+		} else if ((cmdrsp == LLC_RSP && pollfinal == p) ||
+			   (cmdrsp == LLC_CMD && pollfinal == 0 && p == 0)) {
+			LLC_INC(linkp->llcl_vr);
+			LLC_SENDACKNOWLEDGE(linkp, LLC_CMD, 1);
+			LLC_START_P_TIMER(linkp);
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			if (cmdrsp == LLC_RSP && pollfinal == 1)
+				LLC_CLEAR_REMOTE_BUSY(linkp, action);
+			LLC_STOP_REJ_TIMER(linkp);
+			LLC_NEWSTATE(linkp, NORMAL);
+			action = LLC_DATA_INDICATION;
+		} else if (pollfinal == 0 && p == 1) {
+			LLC_INC(linkp->llcl_vr);
+			LLC_SENDACKNOWLEDGE(linkp, LLC_CMD, 0);
+			LLC_STOP_REJ_TIMER(linkp);
+			LLC_NEWSTATE(linkp, NORMAL);
+			action = LLC_DATA_INDICATION;
+		}
+		break;
+	}
+	case LLCFT_RR + LLC_CMD:
+	case LLCFT_RR + LLC_RSP: {
+		register int p = LLC_GETFLAG(linkp, P);
+		register int nr = LLCGBITS(frame->llc_control_ext, s_nr);
+
+		if (cmdrsp == LLC_CMD && pollfinal == 1) {
+			LLC_SENDACKNOWLEDGE(linkp, LLC_RSP, 1);
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			LLC_CLEAR_REMOTE_BUSY(linkp, action);
+		} else if (pollfinal == 0 ||
+			   (cmdrsp == LLC_RSP && pollfinal == 1 && p == 1)) {
+			LLC_UPDATE_P_FLAG(linkp, cmdrsp, pollfinal);
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			LLC_CLEAR_REMOTE_BUSY(linkp, action);
+		}
+		break;
+	}
+	case LLCFT_RNR + LLC_CMD:
+	case LLCFT_RNR + LLC_RSP: {
+		register int p = LLC_GETFLAG(linkp, P);
+		register int nr = LLCGBITS(frame->llc_control_ext, s_nr);
+
+		if (cmdrsp == LLC_CMD && pollfinal == 1) {
+			llc_send(linkp, LLCFT_RR, LLC_RSP, 1);
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			LLC_SET_REMOTE_BUSY(linkp, action);
+		} else if (pollfinal == 0 ||
+			   (cmdrsp == LLC_RSP && pollfinal == 1 && p == 1)) {
+			LLC_UPDATE_P_FLAG(linkp, cmdrsp, pollfinal);
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			action = 0;
+		}
+		break;
+	}
+	case LLCFT_REJ + LLC_CMD:
+	case LLCFT_REJ + LLC_RSP: {
+		register int p = LLC_GETFLAG(linkp, P);
+		register int nr = LLCGBITS(frame->llc_control_ext, s_nr);
+
+		/* back V(S) up to the peer's N(R) and send again from there */
+		if (cmdrsp == LLC_CMD && pollfinal == 1) {
+			linkp->llcl_vs = nr;
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			llc_resend(linkp, LLC_RSP, 1);
+			LLC_CLEAR_REMOTE_BUSY(linkp, action);
+		} else if ((cmdrsp == LLC_CMD && pollfinal == 0 && p == 0) ||
+			   (cmdrsp == LLC_RSP && pollfinal == p)) {
+			linkp->llcl_vs = nr;
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			LLC_UPDATE_P_FLAG(linkp, cmdrsp, pollfinal);
+			llc_resend(linkp, LLC_CMD, 0);
+			LLC_CLEAR_REMOTE_BUSY(linkp, action);
+		} else if (pollfinal == 0 && p == 1) {
+			linkp->llcl_vs = nr;
+			LLC_UPDATE_NR_RECEIVED(linkp, nr);
+			llc_resend(linkp, LLC_CMD, 0);
+			LLC_CLEAR_REMOTE_BUSY(linkp, action);
+		}
+		break;
+	}
+	case NL_INITIATE_PF_CYCLE:
+		if (LLC_GETFLAG(linkp, P) == 0) {
+			llc_send(linkp, LLCFT_RR, LLC_CMD, 1);
+			LLC_START_P_TIMER(linkp);
+			action = 0;
+		}
+		break;
+	case LLC_REJ_TIMER_EXPIRED:
+		/* repeat the REJ, this time polling, and stay in REJECT */
+		if (LLC_GETFLAG(linkp, P) == 0 && linkp->llcl_retry < llc_n2) {
+			llc_send(linkp, LLCFT_REJ, LLC_CMD, 1);
+			LLC_START_P_TIMER(linkp);
+			LLC_START_REJ_TIMER(linkp);
+			linkp->llcl_retry++;
+			action = 0;
+		}
+		/*
+		 * Without this break control ran on into the P timer case:
+		 * an RR poll followed the REJ at once, the retry counter
+		 * was bumped twice and the link went to AWAIT_REJECT.
+		 */
+		break;
+	case LLC_P_TIMER_EXPIRED:
+		if (linkp->llcl_retry < llc_n2) {
+			llc_send(linkp, LLCFT_RR, LLC_CMD, 1);
+			LLC_START_P_TIMER(linkp);
+			LLC_START_REJ_TIMER(linkp);
+			linkp->llcl_retry++;
+			LLC_NEWSTATE(linkp, AWAIT_REJECT);
+			action = 0;
+		}
+		break;
+	case LLC_ACK_TIMER_EXPIRED:
+	case LLC_BUSY_TIMER_EXPIRED:
+		if (LLC_GETFLAG(linkp, P) == 0 && linkp->llcl_retry < llc_n2) {
+			llc_send(linkp, LLCFT_RR, LLC_CMD, 1);
+			LLC_START_P_TIMER(linkp);
+			LLC_START_REJ_TIMER(linkp);
+			linkp->llcl_retry++;
+			/*
+			 * I cannot locate the description of RESET_V(S)
+			 * in ISO 8802-2, table 7-1, state REJECT, last event,
+			 * and assume they meant to set V(S) to 0 ...
+			 */
+			linkp->llcl_vs = 0; /* XXX */
+			LLC_NEWSTATE(linkp, AWAIT_REJECT);
+			action = 0;
+		}
+
+		break;
+	}
+	if (action == LLC_PASSITON)
+		action = llc_state_NBRAcore(linkp, frame, frame_kind,
+					    cmdrsp, pollfinal);
+
+	return action;
+}
+
+/*
+ * AWAIT --- A data link connection exists between the local LLC service access
+ *	     point and the remote LLC service access point.
The local LLC is + * performing a timer recovery operation and has sent a command PDU + * with the P bit set to ``1'', and is awaiting an acknowledgement + * from the remote LLC. I PDUs may be received but not sent. + * Supervisory PDUs may be both sent and received. + */ +int +llc_state_AWAIT(struct llc_linkcb *linkp, struct llc *frame, int frame_kind, + int cmdrsp, int pollfinal) +{ + int action = LLC_PASSITON; + + switch(frame_kind + cmdrsp) { + case LLC_LOCAL_BUSY_DETECTED: + llc_send(linkp, LLCFT_RNR, LLC_CMD, 0); + LLC_SETFLAG(linkp, DATA, 0); + LLC_NEWSTATE(linkp, AWAIT_BUSY); + action = 0; + break; + case LLC_INVALID_NS + LLC_CMD: + case LLC_INVALID_NS + LLC_RSP: { + register int p = LLC_GETFLAG(linkp, P); + register int nr = LLCGBITS(frame->llc_control_ext, s_nr); + + if (cmdrsp == LLC_CMD && pollfinal == 1) { + llc_send(linkp, LLCFT_REJ, LLC_RSP, 1); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_START_REJ_TIMER(linkp); + LLC_NEWSTATE(linkp, AWAIT_REJECT); + action = 0; + } else if (cmdrsp == LLC_RSP && pollfinal == 1) { + llc_send(linkp, LLCFT_REJ, LLC_CMD, 0); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + linkp->llcl_vs = nr; + LLC_STOP_P_TIMER(linkp); + llc_resend(linkp, LLC_CMD, 0); + LLC_START_REJ_TIMER(linkp); + LLC_CLEAR_REMOTE_BUSY(linkp, action); + LLC_NEWSTATE(linkp, REJECT); + } else if (pollfinal == 0) { + llc_send(linkp, LLCFT_REJ, LLC_CMD, 0); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_START_REJ_TIMER(linkp); + LLC_NEWSTATE(linkp, AWAIT_REJECT); + action = 0; + } + break; + } + case LLCFT_INFO + LLC_RSP: + case LLCFT_INFO + LLC_CMD: { + register int p = LLC_GETFLAG(linkp, P); + register int nr = LLCGBITS(frame->llc_control_ext, s_nr); + + LLC_INC(linkp->llcl_vr); + if (cmdrsp == LLC_CMD && pollfinal == 1) { + llc_send(linkp, LLCFT_RR, LLC_RSP, 1); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + action = LLC_DATA_INDICATION; + } else if (cmdrsp == LLC_RSP && pollfinal == 1) { + LLC_UPDATE_NR_RECEIVED(linkp, nr); + linkp->llcl_vs = nr; + llc_resend(linkp, 
LLC_CMD, 1); + LLC_START_P_TIMER(linkp); + LLC_CLEAR_REMOTE_BUSY(linkp, action); + LLC_NEWSTATE(linkp, NORMAL); + action = LLC_DATA_INDICATION; + } else if (pollfinal == 0) { + llc_send(linkp, LLCFT_RR, LLC_CMD, 0); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + action = LLC_DATA_INDICATION; + } + break; + } + case LLCFT_RR + LLC_CMD: + case LLCFT_RR + LLC_RSP: + case LLCFT_REJ + LLC_CMD: + case LLCFT_REJ + LLC_RSP: { + register int p = LLC_GETFLAG(linkp, P); + register int nr = LLCGBITS(frame->llc_control_ext, s_nr); + + if (cmdrsp == LLC_CMD && pollfinal == 1) { + llc_send(linkp, LLCFT_RR, LLC_RSP, 1); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_CLEAR_REMOTE_BUSY(linkp, action); + } else if (cmdrsp == LLC_RSP && pollfinal == 1) { + LLC_UPDATE_NR_RECEIVED(linkp, nr); + linkp->llcl_vs = nr; + LLC_STOP_P_TIMER(linkp); + llc_resend(linkp, LLC_CMD, 0); + LLC_CLEAR_REMOTE_BUSY(linkp, action); + LLC_NEWSTATE(linkp, NORMAL); + } else if (pollfinal == 0) { + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_CLEAR_REMOTE_BUSY(linkp, action); + } + break; + } + case LLCFT_RNR + LLC_CMD: + case LLCFT_RNR + LLC_RSP: { + register int p = LLC_GETFLAG(linkp, P); + register int nr = LLCGBITS(frame->llc_control_ext, s_nr); + + if (pollfinal == 1 && cmdrsp == LLC_CMD) { + llc_send(linkp, LLCFT_RR, LLC_RSP, 1); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_SET_REMOTE_BUSY(linkp, action); + } else if (pollfinal == 1 && cmdrsp == LLC_RSP) { + LLC_UPDATE_NR_RECEIVED(linkp, nr); + linkp->llcl_vs = nr; + LLC_STOP_P_TIMER(linkp); + LLC_SET_REMOTE_BUSY(linkp, action); + LLC_NEWSTATE(linkp, NORMAL); + } else if (pollfinal == 0) { + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_SET_REMOTE_BUSY(linkp, action); + } + break; + } + case LLC_P_TIMER_EXPIRED: + if (linkp->llcl_retry < llc_n2) { + llc_send(linkp, LLCFT_RR, LLC_CMD, 1); + LLC_START_P_TIMER(linkp); + linkp->llcl_retry++; + action = 0; + } + break; + } + if (action == LLC_PASSITON) + action = llc_state_NBRAcore(linkp, frame, frame_kind, + cmdrsp, pollfinal); + 
+ return action; +} + +/* + * AWAIT_BUSY --- A data link connection exists between the local LLC service + * access point and the remote LLC service access point. The + * local LLC is performing a timer recovery operation and has + * sent a command PDU with the P bit set to ``1'', and is + * awaiting an acknowledgement from the remote LLC. I PDUs may + * not be sent. Local conditions make it likely that the + * information feld of receoved I PDUs will be ignored. + * Supervisory PDUs may be both sent and received. + */ +int +llc_state_AWAIT_BUSY(struct llc_linkcb *linkp, struct llc *frame, int frame_kind, + int cmdrsp, int pollfinal) +{ + int action = LLC_PASSITON; + + switch(frame_kind + cmdrsp) { + case LLC_LOCAL_BUSY_CLEARED: + switch (LLC_GETFLAG(linkp, DATA)) { + case 1: + llc_send(linkp, LLCFT_REJ, LLC_CMD, 0); + LLC_START_REJ_TIMER(linkp); + LLC_NEWSTATE(linkp, AWAIT_REJECT); + action = 0; + break; + case 0: + llc_send(linkp, LLCFT_RR, LLC_CMD, 0); + LLC_NEWSTATE(linkp, AWAIT); + action = 0; + break; + case 2: + llc_send(linkp, LLCFT_RR, LLC_CMD, 0); + LLC_NEWSTATE(linkp, AWAIT_REJECT); + action = 0; + break; + } + break; + case LLC_INVALID_NS + LLC_CMD: + case LLC_INVALID_NS + LLC_RSP: { + register int p = LLC_GETFLAG(linkp, P); + register int nr = LLCGBITS(frame->llc_control_ext, s_nr); + + if (cmdrsp == LLC_CMD && pollfinal == 1) { + llc_send(linkp, LLCFT_RNR, LLC_RSP, 1); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_SETFLAG(linkp, DATA, 1); + action = 0; + } else if (cmdrsp == LLC_RSP && pollfinal == 1) { + /* optionally */ + llc_send(linkp, LLCFT_RNR, LLC_CMD, 0); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + linkp->llcl_vs = nr; + LLC_STOP_P_TIMER(linkp); + LLC_SETFLAG(linkp, DATA, 1); + LLC_CLEAR_REMOTE_BUSY(linkp, action); + llc_resend(linkp, LLC_CMD, 0); + LLC_NEWSTATE(linkp, BUSY); + } else if (pollfinal == 0) { + /* optionally */ + llc_send(linkp, LLCFT_RNR, LLC_CMD, 0); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_SETFLAG(linkp, DATA, 1); + action = 0; + } 
+ } + case LLCFT_INFO + LLC_CMD: + case LLCFT_INFO + LLC_RSP: { + register int p = LLC_GETFLAG(linkp, P); + register int nr = LLCGBITS(frame->llc_control_ext, s_nr); + + if (cmdrsp == LLC_CMD && pollfinal == 1) { + llc_send(linkp, LLCFT_RNR, LLC_RSP, 1); + LLC_INC(linkp->llcl_vr); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_SETFLAG(linkp, DATA, 0); + action = LLC_DATA_INDICATION; + } else if (cmdrsp == LLC_RSP && pollfinal == 1) { + llc_send(linkp, LLCFT_RNR, LLC_CMD, 1); + LLC_INC(linkp->llcl_vr); + LLC_START_P_TIMER(linkp); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + linkp->llcl_vs = nr; + LLC_SETFLAG(linkp, DATA, 0); + LLC_CLEAR_REMOTE_BUSY(linkp, action); + llc_resend(linkp, LLC_CMD, 0); + LLC_NEWSTATE(linkp, BUSY); + action = LLC_DATA_INDICATION; + } else if (pollfinal == 0) { + llc_send(linkp, LLCFT_RNR, LLC_CMD, 0); + LLC_INC(linkp->llcl_vr); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_SETFLAG(linkp, DATA, 0); + action = LLC_DATA_INDICATION; + } + break; + } + case LLCFT_RR + LLC_CMD: + case LLCFT_REJ + LLC_CMD: + case LLCFT_RR + LLC_RSP: + case LLCFT_REJ + LLC_RSP: { + register int p = LLC_GETFLAG(linkp, P); + register int nr = LLCGBITS(frame->llc_control_ext, s_nr); + + if (cmdrsp == LLC_CMD && pollfinal == 1) { + llc_send(linkp, LLCFT_RNR, LLC_RSP, 1); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_CLEAR_REMOTE_BUSY(linkp, action); + } else if (cmdrsp == LLC_RSP && pollfinal == 1) { + LLC_UPDATE_NR_RECEIVED(linkp, nr); + linkp->llcl_vs = nr; + LLC_STOP_P_TIMER(linkp); + llc_resend(linkp, LLC_CMD, 0); + LLC_CLEAR_REMOTE_BUSY(linkp, action); + LLC_NEWSTATE(linkp, BUSY); + } else if (pollfinal == 0) { + LLC_UPDATE_NR_RECEIVED(linkp, nr); + linkp->llcl_vs = nr; + LLC_STOP_P_TIMER(linkp); + llc_resend(linkp, LLC_CMD, 0); + LLC_CLEAR_REMOTE_BUSY(linkp, action); + } + break; + } + case LLCFT_RNR + LLC_CMD: + case LLCFT_RNR + LLC_RSP: { + register int p = LLC_GETFLAG(linkp, P); + register int nr = LLCGBITS(frame->llc_control_ext, s_nr); + + if (cmdrsp == LLC_CMD && 
pollfinal == 1) { + llc_send(linkp, LLCFT_RNR, LLC_RSP, 1); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_SET_REMOTE_BUSY(linkp, action); + } else if (cmdrsp == LLC_RSP && pollfinal == 1) { + LLC_UPDATE_NR_RECEIVED(linkp, nr); + linkp->llcl_vs = nr; + LLC_STOP_P_TIMER(linkp); + LLC_SET_REMOTE_BUSY(linkp, action); + LLC_NEWSTATE(linkp, BUSY); + } else if (pollfinal == 0) { + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_SET_REMOTE_BUSY(linkp, action); + } + break; + } + case LLC_P_TIMER_EXPIRED: + if (linkp->llcl_retry < llc_n2) { + llc_send(linkp, LLCFT_RNR, LLC_CMD, 1); + LLC_START_P_TIMER(linkp); + linkp->llcl_retry++; + action = 0; + } + break; + } + if (action == LLC_PASSITON) + action = llc_state_NBRAcore(linkp, frame, frame_kind, + cmdrsp, pollfinal); + + return action; +} + +/* + * AWAIT_REJECT --- A data link connection exists between the local LLC service + * access point and the remote LLC service access point. The + * local connection component has requested that the remote + * connection component re-transmit a specific I PDU that the + * local connection component has detected as being out of + * sequence. Before the local LLC entered this state it was + * performing a timer recovery operation and had sent a + * command PDU with the P bit set to ``1'', and is still + * awaiting an acknowledgment from the remote LLC. I PDUs may + * be received but not transmitted. Supervisory PDUs may be + * both transmitted and received. 
+ */ +int +llc_state_AWAIT_REJECT(struct llc_linkcb *linkp, struct llc *frame, int frame_kind, + int cmdrsp, int pollfinal) +{ + int action = LLC_PASSITON; + + switch(frame_kind + cmdrsp) { + case LLC_LOCAL_BUSY_DETECTED: + llc_send(linkp, LLCFT_RNR, LLC_CMD, 0); + LLC_SETFLAG(linkp, DATA, 2); + LLC_NEWSTATE(linkp, AWAIT_BUSY); + action = 0; + break; + case LLC_INVALID_NS + LLC_CMD: + case LLC_INVALID_NS + LLC_RSP: { + register int nr = LLCGBITS(frame->llc_control_ext, s_nr); + + if (cmdrsp == LLC_CMD && pollfinal == 1) { + llc_send(linkp, LLCFT_RR, LLC_RSP, 1); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + action = 0; + } else if (cmdrsp == LLC_RSP && pollfinal == 1) { + LLC_UPDATE_NR_RECEIVED(linkp, nr); + linkp->llcl_vs = nr; + llc_resend(linkp, LLC_CMD, 1); + LLC_START_P_TIMER(linkp); + LLC_CLEAR_REMOTE_BUSY(linkp, action); + LLC_NEWSTATE(linkp, REJECT); + } else if (pollfinal == 0) { + LLC_UPDATE_NR_RECEIVED(linkp, nr); + action = 0; + } + break; + } + case LLCFT_INFO + LLC_CMD: + case LLCFT_INFO + LLC_RSP: { + register int nr = LLCGBITS(frame->llc_control_ext, s_nr); + + if (cmdrsp == LLC_CMD && pollfinal == 1) { + LLC_INC(linkp->llcl_vr); + llc_send(linkp, LLCFT_RR, LLC_RSP, 1); + LLC_STOP_REJ_TIMER(linkp); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_NEWSTATE(linkp, AWAIT); + action = LLC_DATA_INDICATION; + } else if (cmdrsp == LLC_RSP && pollfinal == 1) { + LLC_INC(linkp->llcl_vr); + LLC_STOP_P_TIMER(linkp); + LLC_STOP_REJ_TIMER(linkp); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + linkp->llcl_vs = nr; + llc_resend(linkp, LLC_CMD, 0); + LLC_CLEAR_REMOTE_BUSY(linkp, action); + LLC_NEWSTATE(linkp, NORMAL); + action = LLC_DATA_INDICATION; + } else if (pollfinal == 0) { + LLC_INC(linkp->llcl_vr); + llc_send(linkp, LLCFT_RR, LLC_CMD, 0); + LLC_STOP_REJ_TIMER(linkp); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_NEWSTATE(linkp, AWAIT); + action = LLC_DATA_INDICATION; + } + break; + } + case LLCFT_RR + LLC_CMD: + case LLCFT_REJ + LLC_CMD: + case LLCFT_RR + LLC_RSP: + case LLCFT_REJ 
+ LLC_RSP: { + register int nr = LLCGBITS(frame->llc_control_ext, s_nr); + + if (cmdrsp == LLC_CMD && pollfinal == 1) { + llc_send(linkp, LLCFT_RR, LLC_RSP, 1); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_CLEAR_REMOTE_BUSY(linkp, action); + } else if (cmdrsp == LLC_RSP && pollfinal == 1) { + LLC_UPDATE_NR_RECEIVED(linkp, nr); + linkp->llcl_vs = nr; + llc_resend(linkp, LLC_CMD, 1); + LLC_START_P_TIMER(linkp); + LLC_CLEAR_REMOTE_BUSY(linkp, action); + LLC_NEWSTATE(linkp, REJECT); + } else if (pollfinal == 0) { + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_CLEAR_REMOTE_BUSY(linkp, action); + } + break; + } + case LLCFT_RNR + LLC_CMD: + case LLCFT_RNR + LLC_RSP: { + register int nr = LLCGBITS(frame->llc_control_ext, s_nr); + + if (cmdrsp == LLC_CMD && pollfinal == 1) { + llc_send(linkp, LLCFT_RR, LLC_RSP, 1); + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_SET_REMOTE_BUSY(linkp, action); + } else if (cmdrsp == LLC_RSP && pollfinal == 1) { + LLC_UPDATE_NR_RECEIVED(linkp, nr); + linkp->llcl_vs = nr; + LLC_STOP_P_TIMER(linkp); + LLC_SET_REMOTE_BUSY(linkp, action); + LLC_NEWSTATE(linkp, REJECT); + } else if (pollfinal == 0) { + LLC_UPDATE_NR_RECEIVED(linkp, nr); + LLC_SET_REMOTE_BUSY(linkp, action); + } + break; + } + case LLC_P_TIMER_EXPIRED: + if (linkp->llcl_retry < llc_n2) { + llc_send(linkp, LLCFT_REJ, LLC_CMD, 1); + LLC_START_P_TIMER(linkp); + linkp->llcl_retry++; + action = 0; + } + break; + } + if (action == LLC_PASSITON) + action = llc_state_NBRAcore(linkp, frame, frame_kind, + cmdrsp, pollfinal); + + return action; +} + + +/* + * llc_statehandler() --- Wrapper for llc_state_*() functions. + * Deals with action codes and checks for + * ``stuck'' links. + */ + +int +llc_statehandler(struct llc_linkcb *linkp, struct llc *frame, int frame_kind, + int cmdrsp, int pollfinal) +{ + register int action = 0; + + /* + * To check for ``zombie'' links each time llc_statehandler() gets called + * the AGE timer of linkp is reset. 
If it expires llc_timer() will + * take care of the link --- i.e. kill it 8=) + */ + LLC_STARTTIMER(linkp, AGE); + + /* + * Now call the current statehandler function. + */ + action = (*linkp->llcl_statehandler)(linkp, frame, frame_kind, + cmdrsp, pollfinal); +once_more_and_again: + switch (action) { + case LLC_CONNECT_INDICATION: { + int naction; + + LLC_TRACE(linkp, LLCTR_INTERESTING, "CONNECT INDICATION"); + linkp->llcl_nlnext = + (*linkp->llcl_sapinfo->si_ctlinput) + (PRC_CONNECT_INDICATION, + (struct sockaddr *) &linkp->llcl_addr, (caddr_t) linkp); + if (linkp->llcl_nlnext == 0) + naction = NL_DISCONNECT_REQUEST; + else naction = NL_CONNECT_RESPONSE; + action = (*linkp->llcl_statehandler)(linkp, frame, naction, 0, 0); + goto once_more_and_again; + } + case LLC_CONNECT_CONFIRM: + /* llc_resend(linkp, LLC_CMD, 0); */ + llc_start(linkp); + break; + case LLC_DISCONNECT_INDICATION: + LLC_TRACE(linkp, LLCTR_INTERESTING, "DISCONNECT INDICATION"); + (*linkp->llcl_sapinfo->si_ctlinput) + (PRC_DISCONNECT_INDICATION, + (struct sockaddr *) &linkp->llcl_addr, linkp->llcl_nlnext); + break; + /* internally visible only */ + case LLC_RESET_CONFIRM: + case LLC_RESET_INDICATION_LOCAL: + /* + * not much we can do here, the state machine either makes it or + * brakes it ... 
+ */ + break; + case LLC_RESET_INDICATION_REMOTE: + LLC_TRACE(linkp, LLCTR_SHOULDKNOW, "RESET INDICATION (REMOTE)"); + action = (*linkp->llcl_statehandler)(linkp, frame, + NL_RESET_RESPONSE, 0, 0); + goto once_more_and_again; + case LLC_FRMR_SENT: + LLC_TRACE(linkp, LLCTR_URGENT, "FRMR SENT"); + break; + case LLC_FRMR_RECEIVED: + LLC_TRACE(linkp, LLCTR_URGEN, "FRMR RECEIVED"); + action = (*linkp->llcl_statehandler)(linkp, frame, + NL_RESET_REQUEST, 0, 0); + + goto once_more_and_again; + case LLC_REMOTE_BUSY: + LLC_TRACE(linkp, LLCTR_SHOULDKNOW, "REMOTE BUSY"); + break; + case LLC_REMOTE_NOT_BUSY: + LLC_TRACE(linkp, LLCTR_SHOULDKNOW, "REMOTE BUSY CLEARED"); + /* + * try to get queued frames out + */ + llc_start(linkp); + break; + } + + /* + * Only LLC_DATA_INDICATION is for the time being + * passed up to the network layer entity. + * The remaining action codes are for the time + * being visible internally only. + * However, this can/may be changed if necessary. + */ + + return action; +} + + +/* + * Core LLC2 routines + */ + +/* + * The INIT call. This routine is called once after the system is booted. + */ + +llc_init() +{ + llcintrq.ifq_maxlen = IFQ_MAXLEN; +} + + +/* + * In case of a link reset we need to shuffle the frames queued inside the + * LLC2 window. 
+ */ + +void +llc_resetwindow(struct llc_linkcb *linkp) +{ + register struct mbuf *mptr = (struct mbuf *) 0; + register struct mbuf *anchor = (struct mbuf *)0; + register short i; + + /* Pick up all queued frames and collect them in a linked mbuf list */ + if (linkp->llcl_slotsfree != linkp->llcl_window) { + i = llc_seq2slot(linkp, linkp->llcl_nr_received); + anchor = mptr = linkp->llcl_output_buffers[i]; + for (; i != linkp->llcl_freeslot; + i = llc_seq2slot(linkp, i+1)) { + if (linkp->llcl_output_buffers[i]) { + mptr->m_nextpkt = linkp->llcl_output_buffers[i]; + mptr = mptr->m_nextpkt; + } else panic("LLC2 window broken"); + } + } + /* clean closure */ + if (mptr) + mptr->m_nextpkt = (struct mbuf *) 0; + + /* Now --- plug 'em in again */ + if (anchor != (struct mbuf *)0) { + for (i = 0, mptr = anchor; mptr != (struct mbuf *) 0; i++) { + linkp->llcl_output_buffers[i] = mptr; + mptr = mptr->m_nextpkt; + linkp->llcl_output_buffers[i]->m_nextpkt = (struct mbuf *)0; + } + linkp->llcl_freeslot = i; + } else linkp->llcl_freeslot = 0; + + /* We're resetting the link, the next frame to be acknowledged is 0 */ + linkp->llcl_nr_received = 0; + + /* set distance between LLC2 sequence number and the top of window to 0 */ + linkp->llcl_projvs = linkp->llcl_freeslot; + + return; +} + +/* + * llc_newlink() --- We allocate enough memory to contain a link control block + * and initialize it properly. We don't intiate the actual setup + * of the LLC2 link here. 
+ */ +struct llc_linkcb * +llc_newlink(struct sockaddr_dl *dst, struct ifnet *ifp, struct rtentry *nlrt, + caddr_t nlnext, struct rtentry *llrt) +{ + struct llc_linkcb *nlinkp; + u_char sap = LLSAPADDR(dst); + short llcwindow; + + + /* allocate memory for link control block */ + MALLOC(nlinkp, struct llc_linkcb *, sizeof(struct llc_linkcb), + M_PCB, M_DONTWAIT); + if (nlinkp == 0) + return (NULL); + bzero((caddr_t)nlinkp, sizeof(struct llc_linkcb)); + + /* copy link address */ + sdl_copy(dst, &nlinkp->llcl_addr); + + /* hold on to the network layer route entry */ + nlinkp->llcl_nlrt = nlrt; + + /* likewise the network layer control block */ + nlinkp->llcl_nlnext = nlnext; + + /* jot down the link layer route entry */ + nlinkp->llcl_llrt = llrt; + + /* reset writeq */ + nlinkp->llcl_writeqh = nlinkp->llcl_writeqt = NULL; + + /* setup initial state handler function */ + nlinkp->llcl_statehandler = llc_state_ADM; + + /* hold on to interface pointer */ + nlinkp->llcl_if = ifp; + + /* get service access point information */ + nlinkp->llcl_sapinfo = llc_getsapinfo(sap, ifp); + + /* get window size from SAP info block */ + if ((llcwindow = nlinkp->llcl_sapinfo->si_window) == 0) + llcwindow = LLC_MAX_WINDOW; + + /* allocate memory for window buffer */ + MALLOC(nlinkp->llcl_output_buffers, struct mbuf **, + llcwindow*sizeof(struct mbuf *), M_PCB, M_DONTWAIT); + if (nlinkp->llcl_output_buffers == 0) { + FREE(nlinkp, M_PCB); + return(NULL); + } + bzero((caddr_t)nlinkp->llcl_output_buffers, + llcwindow*sizeof(struct mbuf *)); + + /* set window size & slotsfree */ + nlinkp->llcl_slotsfree = nlinkp->llcl_window = llcwindow; + + /* enter into linked listed of link control blocks */ + insque(nlinkp, &llccb_q); + + return(nlinkp); +} + +/* + * llc_dellink() --- farewell to link control block + */ +llc_dellink(struct llc_linkcb *linkp) +{ + register struct mbuf *m; + register struct mbuf *n; + register struct npaidbentry *sapinfo = linkp->llcl_sapinfo; + register i; + + /* notify 
upper layer of imminent death */ + if (linkp->llcl_nlnext && sapinfo->si_ctlinput) + (*sapinfo->si_ctlinput) + (PRC_DISCONNECT_INDICATION, + (struct sockaddr *)&linkp->llcl_addr, linkp->llcl_nlnext); + + /* pull the plug */ + if (linkp->llcl_llrt) + ((struct npaidbentry *)(linkp->llcl_llrt->rt_llinfo))->np_link + = (struct llc_linkcb *) 0; + + /* leave link control block queue */ + remque(linkp); + + /* drop queued packets */ + for (m = linkp->llcl_writeqh; m;) { + n = m->m_act; + m_freem(m); + m = n; + } + + /* drop packets in the window */ + for(i = 0; i < linkp->llcl_window; i++) + if (linkp->llcl_output_buffers[i]) + m_freem(linkp->llcl_output_buffers[i]); + + /* return the window space */ + FREE((caddr_t)linkp->llcl_output_buffers, M_PCB); + + /* return the control block space --- now it's gone ... */ + FREE((caddr_t)linkp, M_PCB); +} + +llc_decode(struct llc* frame, struct llc_linkcb * linkp) +{ + register int ft = LLC_BAD_PDU; + + if ((frame->llc_control & 01) == 0) { + ft = LLCFT_INFO; + /* S or U frame ? */ + } else switch (frame->llc_control) { + + /* U frames */ + case LLC_UI: + case LLC_UI_P: ft = LLC_UI; break; + case LLC_DM: + case LLC_DM_P: ft =LLCFT_DM; break; + case LLC_DISC: + case LLC_DISC_P: ft = LLCFT_DISC; break; + case LLC_UA: + case LLC_UA_P: ft = LLCFT_UA; break; + case LLC_SABME: + case LLC_SABME_P: ft = LLCFT_SABME; break; + case LLC_FRMR: + case LLC_FRMR_P: ft = LLCFT_FRMR; break; + case LLC_XID: + case LLC_XID_P: ft = LLCFT_XID; break; + case LLC_TEST: + case LLC_TEST_P: ft = LLCFT_TEST; break; + + /* S frames */ + case LLC_RR: ft = LLCFT_RR; break; + case LLC_RNR: ft = LLCFT_RNR; break; + case LLC_REJ: ft = LLCFT_REJ; break; + } /* switch */ + + if (linkp) { + switch (ft) { + case LLCFT_INFO: + if (LLCGBITS(frame->llc_control, i_ns) != linkp->llcl_vr) { + ft = LLC_INVALID_NS; + break; + } + /* fall thru --- yeeeeeee */ + case LLCFT_RR: + case LLCFT_RNR: + case LLCFT_REJ: + /* splash! 
*/ + if (LLC_NR_VALID(linkp, LLCGBITS(frame->llc_control_ext, + s_nr)) == 0) + ft = LLC_INVALID_NR; + break; + } + } + + return ft; +} + +/* + * llc_anytimersup() --- Checks if at least one timer is still up and running. + */ +int +llc_anytimersup(struct llc_linkcb * linkp) +{ + register int i; + + FOR_ALL_LLC_TIMERS(i) + if (linkp->llcl_timers[i] > 0) + break; + if (i == LLC_AGE_SHIFT) + return 0; + else return 1; +} + +/* + * llc_link_dump() - dump link info + */ + +#define SAL(s) ((struct sockaddr_dl *)&(s)->llcl_addr) +#define CHECK(l, s) if (LLC_STATEEQ(l, s)) return #s + +char *timer_names[] = {"ACK", "P", "BUSY", "REJ", "AGE"}; + +char * +llc_getstatename(struct llc_linkcb *linkp) +{ + CHECK(linkp, ADM); + CHECK(linkp, CONN); + CHECK(linkp, RESET_WAIT); + CHECK(linkp, RESET_CHECK); + CHECK(linkp, SETUP); + CHECK(linkp, RESET); + CHECK(linkp, D_CONN); + CHECK(linkp, ERROR); + CHECK(linkp, NORMAL); + CHECK(linkp, BUSY); + CHECK(linkp, REJECT); + CHECK(linkp, AWAIT); + CHECK(linkp, AWAIT_BUSY); + CHECK(linkp, AWAIT_REJECT); + + return "UNKNOWN - eh?"; +} + +void +llc_link_dump(struct llc_linkcb* linkp, const char *message) +{ + register int i; + register char *state; + + /* print interface */ + printf("if %s%d\n", linkp->llcl_if->if_name, linkp->llcl_if->if_unit); + + /* print message */ + printf(">> %s <<\n", message); + + /* print MAC and LSAP */ + printf("llc addr "); + for (i = 0; i < (SAL(linkp)->sdl_alen)-2; i++) + printf("%x:", (char)*(LLADDR(SAL(linkp))+i) & 0xff); + printf("%x,", (char)*(LLADDR(SAL(linkp))+i) & 0xff); + printf("%x\n", (char)*(LLADDR(SAL(linkp))+i+1) & 0xff); + + /* print state we're in and timers */ + printf("state %s, ", llc_getstatename(linkp)); + for (i = LLC_ACK_SHIFT; i < LLC_AGE_SHIFT; i++) + printf("%s-%c %d/", timer_names[i], + (linkp->llcl_timerflags & (1<llcl_timers[i]); + printf("%s-%c %d\n", timer_names[i], (linkp->llcl_timerflags & (1<llcl_timers[i]); + + /* print flag values */ + printf("flags P %d/F %d/S %d/DATA 
%d/REMOTE_BUSY %d\n", + LLC_GETFLAG(linkp, P), LLC_GETFLAG(linkp, S), + LLC_GETFLAG(linkp, DATA), LLC_GETFLAG(linkp, REMOTE_BUSY)); + + /* print send and receive state variables, ack, and window */ + printf("V(R) %d/V(S) %d/N(R) received %d/window %d/freeslot %d\n", + linkp->llcl_vs, linkp->llcl_vr, linkp->llcl_nr_received, + linkp->llcl_window, linkp->llcl_freeslot); + + /* further expansions can follow here */ + +} + +void +llc_trace(struct llc_linkcb *linkp, int level, const char *message) +{ + if (linkp->llcl_sapinfo->si_trace && level > llc_tracelevel) + llc_link_dump(linkp, message); + + return; +} diff --git a/sys/netccitt/llc_timer.c b/sys/netccitt/llc_timer.c new file mode 100644 index 00000000000..0aecd08b68d --- /dev/null +++ b/sys/netccitt/llc_timer.c @@ -0,0 +1,180 @@ +/* + * Copyright (C) Dirk Husemann, Computer Science Department IV, + * University of Erlangen-Nuremberg, Germany, 1990, 1991, 1992 + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Dirk Husemann and the Computer Science Department (IV) of + * the University of Erlangen-Nuremberg, Germany. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)llc_timer.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + + +/* + * Various timer values. They can be adjusted + * by patching the binary with adb if necessary. + */ +/* ISO 8802-2 timers */ +int llc_n2 = LLC_N2_VALUE; +int llc_ACK_timer = LLC_ACK_TIMER; +int llc_P_timer = LLC_P_TIMER; +int llc_BUSY_timer = LLC_BUSY_TIMER; +int llc_REJ_timer = LLC_REJ_TIMER; +/* Implementation specific timers */ +int llc_AGE_timer = LLC_AGE_TIMER; +int llc_DACTION_timer = LLC_DACTION_TIMER; + +/* + * The timer routine. We are called every 500ms by the kernel. + * Handle the various virtual timers. 
+ */ + +void +llc_timer() +{ + register struct llc_linkcb *linkp; + register struct llc_linkcb *nlinkp; + register int timer; + register int action; + register int s = splimp(); + + /* + * All links are accessible over the doubly linked list llccb_q + */ + if (!LQEMPTY) { + /* + * A for-loop is not that great an idea as the linkp + * might get deleted if the age timer has expired ... + */ + linkp = LQFIRST; + while (LQVALID(linkp)) { + nlinkp = LQNEXT(linkp); + /* + * Check implementation specific timers first + */ + /* The delayed action/acknowledge idle timer */ + switch (LLC_TIMERXPIRED(linkp, DACTION)) { + case LLC_TIMER_RUNNING: + LLC_AGETIMER(linkp, DACTION); + break; + case LLC_TIMER_EXPIRED: { + register int cmdrsp; + register int pollfinal; + + switch (LLC_GETFLAG(linkp, DACTION)) { + case LLC_DACKCMD: + cmdrsp = LLC_CMD, pollfinal = 0; + break; + case LLC_DACKCMDPOLL: + cmdrsp = LLC_CMD, pollfinal = 1; + break; + case LLC_DACKRSP: + cmdrsp = LLC_RSP, pollfinal = 0; + break; + case LLC_DACKRSPFINAL: + cmdrsp = LLC_RSP, pollfinal = 1; + break; + } + llc_send(linkp, LLCFT_RR, cmdrsp, pollfinal); + LLC_STOPTIMER(linkp, DACTION); + break; + } + } + /* The link idle timer */ + switch (LLC_TIMERXPIRED(linkp, AGE)) { + case LLC_TIMER_RUNNING: + LLC_AGETIMER(linkp, AGE); + break; + case LLC_TIMER_EXPIRED: + /* + * Only crunch the link when really no + * timers are running any more. 
+ */ + if (llc_anytimersup(linkp) == 0) { + llc_dellink(linkp); + LLC_STOPTIMER(linkp, AGE); + goto gone; + } else { + LLC_STARTTIMER(linkp, AGE); + } + break; + } + /* + * Now, check all the ISO 8802-2 timers + */ + FOR_ALL_LLC_TIMERS(timer) { + action = 0; + if ((linkp->llcl_timerflags & (1<llcl_timers[timer] == 0)) { + switch (timer) { + case LLC_ACK_SHIFT: + action = LLC_ACK_TIMER_EXPIRED; + break; + case LLC_P_SHIFT: + action = LLC_P_TIMER_EXPIRED; + break; + case LLC_BUSY_SHIFT: + action = LLC_BUSY_TIMER_EXPIRED; + break; + case LLC_REJ_SHIFT: + action = LLC_REJ_TIMER_EXPIRED; + break; + } + linkp->llcl_timerflags &= ~(1<llcl_timers[timer] > 0) + linkp->llcl_timers[timer]--; + } + +gone: linkp = nlinkp; + } + } + splx (s); +} diff --git a/sys/netccitt/llc_var.h b/sys/netccitt/llc_var.h new file mode 100644 index 00000000000..a27db52d37a --- /dev/null +++ b/sys/netccitt/llc_var.h @@ -0,0 +1,659 @@ +/* + * Copyright (C) Dirk Husemann, Computer Science Department IV, + * University of Erlangen-Nuremberg, Germany, 1990, 1991, 1992 + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Dirk Husemann and the Computer Science Department (IV) of + * the University of Erlangen-Nuremberg, Germany. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)llc_var.h 8.1 (Berkeley) 6/10/93 + */ + +#ifdef __STDC__ +/* + * Forward structure declarations for function prototypes [sic]. 
+ */ +struct llc; +#endif + +#define NPAIDB_LINK 0 + +struct npaidbentry { + union { + /* MAC,DLSAP -> CONS */ + struct { + struct llc_linkcb *NE_link; + struct rtentry *NE_rt; + } NE; + /* SAP info for unconfigured incoming calls */ + struct { + u_short SI_class; +#define LLC_CLASS_I 0x1 +#define LLC_CLASS_II 0x3 +#define LLC_CLASS_III 0x4 /* Future */ +#define LLC_CLASS_IV 0x7 /* Future */ + u_short SI_window; + u_short SI_trace; + u_short SI_xchxid; + void (*SI_input) + __P((struct mbuf *)); + caddr_t (*SI_ctlinput) + __P((int, struct sockaddr *, caddr_t)); + } SI; + } NESIun; +}; +#define np_link NESIun.NE.NE_link +#define np_rt NESIun.NE.NE_rt +#define si_class NESIun.SI.SI_class +#define si_window NESIun.SI.SI_window +#define si_trace NESIun.SI.SI_trace +#define si_xchxid NESIun.SI.SI_xchxid +#define si_input NESIun.SI.SI_input +#define si_ctlinput NESIun.SI.SI_ctlinput + +#define NPDL_SAPNETMASK 0x7e + +/* + * Definitions for accessing bitfields/bitslices inside + * LLC2 headers + */ +struct bitslice { + unsigned int bs_mask; + unsigned int bs_shift; +}; + + +#define i_z 0 +#define i_ns 1 +#define i_pf 0 +#define i_nr 1 +#define s_oz 2 +#define s_selector 3 +#define s_pf 0 +#define s_nr 1 +#define u_bb 2 +#define u_select_other 3 +#define u_pf 4 +#define u_select 5 +#define f_vs 1 +#define f_cr 0 +#define f_vr 1 +#define f_wxyzv 6 + +#define LLCGBITS(Arg, Index) (((Arg) & llc_bitslice[(Index)].bs_mask) >> llc_bitslice[(Index)].bs_shift) +#define LLCSBITS(Arg, Index, Val) (Arg) |= (((Val) << llc_bitslice[(Index)].bs_shift) & llc_bitslice[(Index)].bs_mask) +#define LLCCSBITS(Arg, Index, Val) (Arg) = (((Val) << llc_bitslice[(Index)].bs_shift) & llc_bitslice[(Index)].bs_mask) + +extern struct bitslice llc_bitslice[]; + +#define LLC_CMD 0 +#define LLC_RSP 1 +#define LLC_MAXCMDRSP 2 + +/* + * LLC events --- These events may either be frames received from the + * remote LLC DSAP, request from the network layer user, + * timer events from llc_timer(), or diagnostic 
events from + * llc_input(). + */ + +/* LLC frame types */ +#define LLCFT_INFO 0 * LLC_MAXCMDRSP +#define LLCFT_RR 1 * LLC_MAXCMDRSP +#define LLCFT_RNR 2 * LLC_MAXCMDRSP +#define LLCFT_REJ 3 * LLC_MAXCMDRSP +#define LLCFT_DM 4 * LLC_MAXCMDRSP +#define LLCFT_SABME 5 * LLC_MAXCMDRSP +#define LLCFT_DISC 6 * LLC_MAXCMDRSP +#define LLCFT_UA 7 * LLC_MAXCMDRSP +#define LLCFT_FRMR 8 * LLC_MAXCMDRSP +#define LLCFT_UI 9 * LLC_MAXCMDRSP +#define LLCFT_XID 10 * LLC_MAXCMDRSP +#define LLCFT_TEST 11 * LLC_MAXCMDRSP + +/* LLC2 timer events */ +#define LLC_ACK_TIMER_EXPIRED 12 * LLC_MAXCMDRSP +#define LLC_P_TIMER_EXPIRED 13 * LLC_MAXCMDRSP +#define LLC_REJ_TIMER_EXPIRED 14 * LLC_MAXCMDRSP +#define LLC_BUSY_TIMER_EXPIRED 15 * LLC_MAXCMDRSP + +/* LLC2 diagnostic events */ +#define LLC_INVALID_NR 16 * LLC_MAXCMDRSP +#define LLC_INVALID_NS 17 * LLC_MAXCMDRSP +#define LLC_BAD_PDU 18 * LLC_MAXCMDRSP +#define LLC_LOCAL_BUSY_DETECTED 19 * LLC_MAXCMDRSP +#define LLC_LOCAL_BUSY_CLEARED 20 * LLC_MAXCMDRSP + +/* Network layer user requests */ +/* + * NL_CONNECT_REQUEST --- The user has requested that a data link connection + * be established with a remote LLC DSAP. + */ +#define NL_CONNECT_REQUEST 21 * LLC_MAXCMDRSP +/* + * NL_CONNECT_RESPONSE --- The user has accepted the data link connection. + */ +#define NL_CONNECT_RESPONSE 22 * LLC_MAXCMDRSP +/* + * NL_RESET_REQUEST --- The user has requested that the data link with the + * remote LLC DSAP be reset. + */ +#define NL_RESET_REQUEST 23 * LLC_MAXCMDRSP +/* + * NL_RESET_RESPONSE --- The user has accepted the reset of the data link + * connection. + */ +#define NL_RESET_RESPONSE 24 * LLC_MAXCMDRSP +/* + * NL_DISCONNECT_REQUEST --- The user has requested that the data link + * connection with remote LLC DSAP be terminated. + */ +#define NL_DISCONNECT_REQUEST 25 * LLC_MAXCMDRSP +/* + * NL_DATA_REQUEST --- The user has requested that a data unit be sent ot the + * remote LLC DSAP. 
+ */ +#define NL_DATA_REQUEST 26 * LLC_MAXCMDRSP +/* + * NL_INITIATE_PF_CYCLE --- The local LLC wants to initiate a P/F cycle. + */ +#define NL_INITIATE_PF_CYCLE 27 * LLC_MAXCMDRSP +/* + * NL_LOCAL_BUSY_DETECTED --- The local entity has encountered a busy condition + */ +#define NL_LOCAL_BUSY_DETECTED 28 * LLC_MAXCMDRSP + +#define LLCFT_NONE 255 + +/* return message from state handlers */ + +/* + * LLC_CONNECT_INDICATION --- Inform the user that a connection has been + * requested by a remote LLC SSAP. + */ +#define LLC_CONNECT_INDICATION 1 +/* + * LLC_CONNECT_CONFIRM --- The connection service component indicates that the + * remote network entity has accepted the connection. + */ +#define LLC_CONNECT_CONFIRM 2 +/* + * LLC_DISCONNECT_INDICATION --- Inform the user that the remote network + * entity has intiated disconnection of the data + * link connection. + */ +#define LLC_DISCONNECT_INDICATION 3 +/* + * LLC_RESET_CONFIRM --- The connection service component indicates that the + * remote network entity has accepted the reset. + */ +#define LLC_RESET_CONFIRM 4 +/* + * LLC_RESET_INDICATION_REMOTE --- The remote network entity or remote peer + * has initiated a reset of the data link + * connection. + */ +#define LLC_RESET_INDICATION_REMOTE 5 +/* + * LLC_RESET_INDICATION_LOCAL --- The local LLC has determined that the data + * link connection is in need of + * reinitialization. + */ +#define LLC_RESET_INDICATION_LOCAL 6 +/* + * LLC_FRMR_RECEIVED --- The local connection service component has received a + * FRMR response PDU. + */ +#define LLC_FRMR_RECEIVED 7 +/* + * LLC_FRMR_SENT --- The local connection component has received an ivalid + * PDU, and has sent a FRMR response PDU. + */ +#define LLC_FRMR_SENT 8 +/* + * LLC_DATA_INDICATION --- The connection service component passes the data + * unit from the received I PDU to the user. + */ +#define LLC_DATA_INDICATION 9 +/* + * LLC_REMOTE_NOT_BUSY --- The remote LLC DSAP is no longer busy. 
The local + * connection service component will now accept a + * DATA_REQUEST. + */ +#define LLC_REMOTE_NOT_BUSY 10 +/* + * LLC_REMOTE_BUSY --- The remote LLC DSAP is busy. The local connection + * service component will not accept a DATA_REQUEST. + */ +#define LLC_REMOTE_BUSY 11 + +/* Internal return code */ +#define LLC_PASSITON 255 + +#define INFORMATION_CONTROL 0x00 +#define SUPERVISORY_CONTROL 0x02 +#define UNUMBERED_CONTROL 0x03 + +/* + * Other necessary definitions + */ + +#define LLC_MAX_SEQUENCE 128 +#define LLC_MAX_WINDOW 127 +#define LLC_WINDOW_SIZE 7 + +/* + * Don't we love this one? CCITT likes to suck on bits 8=) + */ +#define NLHDRSIZEGUESS 3 + +/* + * LLC control block + */ + +struct llc_linkcb { + struct llccb_q { + struct llccb_q *q_forw; /* admin chain */ + struct llccb_q *q_backw; + } llcl_q; + struct npaidbentry *llcl_sapinfo; /* SAP information */ + struct sockaddr_dl llcl_addr; /* link snpa address */ + struct rtentry *llcl_nlrt; /* layer 3 -> LLC */ + struct rtentry *llcl_llrt; /* LLC -> layer 3 */ + struct ifnet *llcl_if; /* our interface */ + caddr_t llcl_nlnext; /* cb for network layer */ + struct mbuf *llcl_writeqh; /* Write queue head */ + struct mbuf *llcl_writeqt; /* Write queue tail */ + struct mbuf **llcl_output_buffers; + short llcl_timers[6]; /* timer array */ + long llcl_timerflags; /* flags signalling running timers */ + int (*llcl_statehandler) + __P((struct llc_linkcb *, struct llc *, int, int, int)); + int llcl_P_flag; + int llcl_F_flag; + int llcl_S_flag; + int llcl_DATA_flag; + int llcl_REMOTE_BUSY_flag; + int llcl_DACTION_flag; /* delayed action */ + int llcl_retry; + /* + * The following components deal --- in one way or the other --- + * with the LLC2 window. 
Indicated by either [L] or [W] is the + * domain of the specific component: + * + * [L] The domain is 0--LLC_MAX_WINDOW + * [W] The domain is 0--llcl_window + */ + short llcl_vr; /* next to receive [L] */ + short llcl_vs; /* next to send [L] */ + short llcl_nr_received; /* next frame to b ack'd [L] */ + short llcl_freeslot; /* next free slot [W] */ + short llcl_projvs; /* V(S) associated with freeslot */ + short llcl_slotsfree; /* free slots [W] */ + short llcl_window; /* window size */ + /* + * In llcl_frmrinfo we jot down the last frmr info field, which we + * need to do as we need to be able to resend it in the ERROR state. + */ + struct frmrinfo llcl_frmrinfo; /* last FRMR info field */ +}; +#define llcl_frmr_pdu0 llcl_frmrinfo.rej_pdu_0 +#define llcl_frmr_pdu1 llcl_frmrinfo.rej_pdu_1 +#define llcl_frmr_control llcl_frmrinfo.frmr_control +#define llcl_frmr_control_ext llcl_frmrinfo.frmr_control_ext +#define llcl_frmr_cause llcl_frmrinfo.frmr_cause + +#define LQNEXT(l) (struct llc_linkcb *)((l)->llcl_q.q_forw) +#define LQEMPTY (llccb_q.q_forw == &llccb_q) +#define LQFIRST (struct llc_linkcb *)(llccb_q.q_forw) +#define LQVALID(l) (!((struct llccb_q *)(l) == &llccb_q)) + +#define LLC_ENQUEUE(l, m) if ((l)->llcl_writeqh == NULL) { \ + (l)->llcl_writeqh = (m); \ + (l)->llcl_writeqt = (m); \ + } else { \ + (l)->llcl_writeqt->m_nextpkt = (m); \ + (l)->llcl_writeqt = (m); \ + } + +#define LLC_DEQUEUE(l, m) if ((l)->llcl_writeqh == NULL) \ + (m) = NULL; \ + else { \ + (m) = (l)->llcl_writeqh; \ + (l)->llcl_writeqh = (l)->llcl_writeqh->m_nextpkt; \ + } + +#define LLC_SETFRAME(l, m) { \ + if ((l)->llcl_slotsfree > 0) { \ + (l)->llcl_slotsfree--; \ + (l)->llcl_output_buffers[(l)->llcl_freeslot] = (m); \ + (l)->llcl_freeslot = ((l)->llcl_freeslot+1) % (l)->llcl_window; \ + LLC_INC((l)->llcl_projvs); \ + } \ + } + +/* + * handling of sockaddr_dl's + */ + +#define LLADDRLEN(s) ((s)->sdl_alen + (s)->sdl_nlen) +#define LLSAPADDR(s) ((s)->sdl_data[LLADDRLEN(s)-1] & 0xff) 
+#define LLSAPLOC(s, if) ((s)->sdl_nlen + (if)->if_addrlen) + +struct sdl_hdr { + struct sockaddr_dl sdlhdr_dst; + struct sockaddr_dl sdlhdr_src; + long sdlhdr_len; +}; + +#define LLC_GETHDR(f,m) { \ + struct mbuf *_m = (struct mbuf *) (m); \ + if (_m) { \ + M_PREPEND(_m, LLC_ISFRAMELEN, M_DONTWAIT); \ + bzero(mtod(_m, caddr_t), LLC_ISFRAMELEN); \ + } else { \ + MGETHDR (_m, M_DONTWAIT, MT_HEADER); \ + if (_m != NULL) { \ + _m->m_pkthdr.len = _m->m_len = LLC_UFRAMELEN; \ + _m->m_next = _m->m_act = NULL; \ + bzero(mtod(_m, caddr_t), LLC_UFRAMELEN); \ + } else return; \ + } \ + (m) = _m; \ + (f) = mtod(m, struct llc *); \ + } + +#define LLC_NEWSTATE(l, LLCstate) (l)->llcl_statehandler = llc_state_##LLCstate +#define LLC_STATEEQ(l, LLCstate) ((l)->llcl_statehandler == llc_state_##LLCstate ? 1 : 0) + +#define LLC_ACK_SHIFT 0 +#define LLC_P_SHIFT 1 +#define LLC_BUSY_SHIFT 2 +#define LLC_REJ_SHIFT 3 +#define LLC_AGE_SHIFT 4 +#define LLC_DACTION_SHIFT 5 + +#define LLC_TIMER_NOTRUNNING 0 +#define LLC_TIMER_RUNNING 1 +#define LLC_TIMER_EXPIRED 2 + +#define LLC_STARTTIMER(l, LLCtimer) { \ + (l)->llcl_timers[LLC_##LLCtimer##_SHIFT] = llc_##LLCtimer##_timer; \ + (l)->llcl_timerflags |= (1<llcl_timers[LLC_##LLCtimer##_SHIFT] = 0; \ + (l)->llcl_timerflags &= ~(1<llcl_timers[LLC_##LLCtimer##_SHIFT] > 0) \ + (l)->llcl_timers[LLC_##LLCtimer##_SHIFT]--; + +#define LLC_TIMERXPIRED(l, LLCtimer) \ + (((l)->llcl_timerflags & (1<llcl_timers[LLC_##LLCtimer##_SHIFT] == 0 ) ? 
\ + LLC_TIMER_EXPIRED : LLC_TIMER_RUNNING) : LLC_TIMER_NOTRUNNING) + +#define FOR_ALL_LLC_TIMERS(t) for ((t) = LLC_ACK_SHIFT; (t) < LLC_AGE_SHIFT; (t)++) + +#define LLC_SETFLAG(l, LLCflag, v) (l)->llcl_##LLCflag##_flag = (v) +#define LLC_GETFLAG(l, LLCflag) (l)->llcl_##LLCflag##_flag + +#define LLC_RESETCOUNTER(l) { \ + (l)->llcl_vs = (l)->llcl_vr = (l)->llcl_retry = 0; \ + llc_resetwindow((l)); \ + } + +/* + * LLC2 macro definitions + */ + + +#define LLC_START_ACK_TIMER(l) LLC_STARTTIMER((l), ACK) +#define LLC_STOP_ACK_TIMER(l) LLC_STOPTIMER((l), ACK) +#define LLC_START_REJ_TIMER(l) LLC_STARTTIMER((l), REJ) +#define LLC_STOP_REJ_TIMER(l) LLC_STOPTIMER((l), REJ) +#define LLC_START_P_TIMER(l) { \ + LLC_STARTTIMER((l), P); \ + if (LLC_GETFLAG((l), P) == 0) \ + (l)->llcl_retry = 0; \ + LLC_SETFLAG((l), P, 1); \ + } +#define LLC_STOP_P_TIMER(l) { \ + LLC_STOPTIMER((l), P); \ + LLC_SETFLAG((l), P, 0); \ + } +#define LLC_STOP_ALL_TIMERS(l) { \ + LLC_STOPTIMER((l), ACK); \ + LLC_STOPTIMER((l), REJ); \ + LLC_STOPTIMER((l), BUSY); \ + LLC_STOPTIMER((l), P); \ + } + + +#define LLC_INC(i) (i) = ((i)+1) % LLC_MAX_SEQUENCE + +#define LLC_NR_VALID(l, nr) ((l)->llcl_vs < (l)->llcl_nr_received ? \ + (((nr) >= (l)->llcl_nr_received) || \ + ((nr) <= (l)->llcl_vs) ? 1 : 0) : \ + (((nr) <= (l)->llcl_vs) && \ + ((nr) >= (l)->llcl_nr_received) ? 
1 : 0)) + +#define LLC_UPDATE_P_FLAG(l, cr, pf) { \ + if ((cr) == LLC_RSP && (pf) == 1) { \ + LLC_SETFLAG((l), P, 0); \ + LLC_STOPTIMER((l), P); \ + } \ + } + +#define LLC_UPDATE_NR_RECEIVED(l, nr) { \ + while ((l)->llcl_nr_received != (nr)) { \ + struct mbuf *_m; \ + register short seq; \ + if (_m = (l)->llcl_output_buffers[seq = llc_seq2slot((l), (l)->llcl_nr_received)]) \ + m_freem(_m); \ + (l)->llcl_output_buffers[seq] = NULL; \ + LLC_INC((l)->llcl_nr_received); \ + (l)->llcl_slotsfree++; \ + } \ + (l)->llcl_retry = 0; \ + if ((l)->llcl_slotsfree < (l)->llcl_window) { \ + LLC_START_ACK_TIMER(l); \ + } else LLC_STOP_ACK_TIMER(l); \ + LLC_STARTTIMER((l), DACTION); \ + } + +#define LLC_SET_REMOTE_BUSY(l,a) { \ + if (LLC_GETFLAG((l), REMOTE_BUSY) == 0) { \ + LLC_SETFLAG((l), REMOTE_BUSY, 1); \ + LLC_STARTTIMER((l), BUSY); \ + (a) = LLC_REMOTE_BUSY; \ + } else { \ + (a) = 0; \ + } \ + } +#define LLC_CLEAR_REMOTE_BUSY(l,a) { \ + if (LLC_GETFLAG((l), REMOTE_BUSY) == 1) { \ + LLC_SETFLAG((l), REMOTE_BUSY, 1); \ + LLC_STOPTIMER((l), BUSY); \ + if (LLC_STATEEQ((l), NORMAL) || \ + LLC_STATEEQ((l), REJECT) || \ + LLC_STATEEQ((l), BUSY)) \ + llc_resend((l), LLC_CMD, 0); \ + (a) = LLC_REMOTE_NOT_BUSY; \ + } else { \ + (a) = 0; \ + } \ + } + +#define LLC_DACKCMD 0x1 +#define LLC_DACKCMDPOLL 0x2 +#define LLC_DACKRSP 0x3 +#define LLC_DACKRSPFINAL 0x4 + +#define LLC_SENDACKNOWLEDGE(l, cmd, pf) { \ + if ((cmd) == LLC_CMD) { \ + LLC_SETFLAG((l), DACTION, ((pf) == 0 ? LLC_DACKCMD : LLC_DACKCMDPOLL)); \ + } else { \ + LLC_SETFLAG((l), DACTION, ((pf) == 0 ? 
LLC_DACKRSP : LLC_DACKRSPFINAL)); \ + } \ + } + +#define LLC_FRMR_W (1<<0) +#define LLC_FRMR_X (1<<1) +#define LLC_FRMR_Y (1<<2) +#define LLC_FRMR_Z (1<<3) +#define LLC_FRMR_V (1<<4) + +#define LLC_SETFRMR(l, f, cr, c) { \ + if ((f)->llc_control & 0x3) { \ + (l)->llcl_frmr_pdu0 = (f)->llc_control; \ + (l)->llcl_frmr_pdu1 = 0; \ + } else { \ + (l)->llcl_frmr_pdu0 = (f)->llc_control; \ + (l)->llcl_frmr_pdu1 = (f)->llc_control_ext; \ + } \ + LLCCSBITS((l)->llcl_frmr_control, f_vs, (l)->llcl_vs); \ + LLCCSBITS((l)->llcl_frmr_control_ext, f_cr, (cr)); \ + LLCSBITS((l)->llcl_frmr_control_ext, f_vr, (l)->llcl_vr); \ + LLCCSBITS((l)->llcl_frmr_cause, f_wxyzv, (c)); \ + } + +/* + * LLC tracing levels: + * LLCTR_INTERESTING interesting event, we might care to know about + * it, but then again, we might not ... + * LLCTR_SHOULDKNOW we probably should know about this event + * LLCTR_URGENT something has gone utterly wrong ... + */ +#define LLCTR_INTERESTING 1 +#define LLCTR_SHOULDKNOW 2 +#define LLCTR_URGENT 3 + +#ifdef LLCDEBUG +#define LLC_TRACE(lp, l, msg) llc_trace((lp), (l), (msg)) +#else /* LLCDEBUG */ +#define LLC_TRACE(lp, l, msg) /* NOOP */ +#endif /* LLCDEBUG */ + +#define LLC_N2_VALUE 15 /* up to 15 retries */ +#define LLC_ACK_TIMER 10 /* 5 secs */ +#define LLC_P_TIMER 4 /* 2 secs */ +#define LLC_BUSY_TIMER 12 /* 6 secs */ +#define LLC_REJ_TIMER 12 /* 6 secs */ +#define LLC_AGE_TIMER 40 /* 20 secs */ +#define LLC_DACTION_TIMER 2 /* 1 secs */ + +#if defined (KERNEL) && defined(LLC) +extern int llc_n2; +extern int llc_ACK_timer; +extern int llc_P_timer; +extern int llc_REJ_timer; +extern int llc_BUSY_timer; +extern int llc_AGE_timer; +extern int llc_DACTION_timer; + +extern int af_link_rts_init_done; + +#define USES_AF_LINK_RTS { \ + if (!af_link_rts_init_done) { \ + rn_inithead((void **)&rt_tables[AF_LINK], 32); \ + af_link_rts_init_done++; \ + } \ + } + +struct ifqueue llcintrq; + +extern struct llccb_q llccb_q; +extern char *frame_names[]; + +/* + * Function 
prototypes + */ +int sdl_cmp __P((struct sockaddr_dl *, struct sockaddr_dl *)); +int sdl_copy __P((struct sockaddr_dl *, struct sockaddr_dl *)); +int sdl_swapaddr __P((struct sockaddr_dl *, struct sockaddr_dl *)); +int sdl_checkaddrif __P((struct ifnet *, struct sockaddr_dl *)); +int sdl_setaddrif __P((struct ifnet *, u_char *, u_char, u_char, + struct sockaddr_dl *)); +int sdl_sethdrif __P((struct ifnet *, u_char *, u_char, u_char *, u_char, u_char, + struct sdl_hdr *)); +struct npaidbentry *llc_setsapinfo __P((struct ifnet *, u_char, u_char, + struct dllconfig *)); +struct npaidbentry *llc_getsapinfo __P((u_char, struct ifnet *)); +struct rtentry *npaidb_enrich __P((short, caddr_t, struct sockaddr_dl *)); +int npaidb_destroy __P((struct rtentry *)); +short llc_seq2slot __P((struct llc_linkcb *, short)); +int llc_state_ADM __P((struct llc_linkcb *, struct llc *, int, int, int)); +int llc_state_CONN __P((struct llc_linkcb *, struct llc *, int, int, int)); +int llc_state_RESET_WAIT __P((struct llc_linkcb *, struct llc *, + int, int, int)); +int llc_state_RESET_CHECK __P((struct llc_linkcb *, struct llc *, + int, int, int)); +int llc_state_SETUP __P((struct llc_linkcb *, struct llc *, int, int, int)); +int llc_state_RESET __P((struct llc_linkcb *, struct llc *, int, int, int)); +int llc_state_D_CONN __P((struct llc_linkcb *, struct llc *, int, int, int)); +int llc_state_ERROR __P((struct llc_linkcb *, struct llc *, int, int, int)); +int llc_state_NBRAcore __P((struct llc_linkcb *, struct llc *, int, int, int)); +int llc_state_NORMAL __P((struct llc_linkcb *, struct llc *, int, int, int)); +int llc_state_BUSY __P((struct llc_linkcb *, struct llc *, int, int, int)); +int llc_state_REJECT __P((struct llc_linkcb *, struct llc *, int, int, int)); +int llc_state_AWAIT __P((struct llc_linkcb *, struct llc *, int, int, int)); +int llc_state_AWAIT_BUSY __P((struct llc_linkcb *, struct llc *, int, int, int)); +int llc_state_AWAIT_REJECT __P((struct llc_linkcb *, struct llc *, 
int, int, int)); +int llc_statehandler __P((struct llc_linkcb *, struct llc *, int, int, int)); +int llc_init __P((void)); +struct llc_linkcb *llc_newlink __P((struct sockaddr_dl *, struct ifnet *, + struct rtentry *, caddr_t, struct rtentry *)); +int llc_dellink __P((struct llc_linkcb *)); +int llc_anytimersup __P((struct llc_linkcb *)); +char * llc_getstatename __P((struct llc_linkcb *)); +void llc_link_dump __P((struct llc_linkcb *, const char *)); +void llc_trace __P((struct llc_linkcb *, int, const char *)); +void llc_resetwindow __P((struct llc_linkcb *)); +int llc_decode __P((struct llc *, struct llc_linkcb *)); +void llc_timer __P((void)); +void llcintr __P((void)); +int llc_input __P((struct llc_linkcb *, struct mbuf *, u_char)); +caddr_t llc_ctlinput __P((int, struct sockaddr *, caddr_t)); +int llc_output __P((struct llc_linkcb *, struct mbuf *)); +void llc_start __P((struct llc_linkcb *)); +int llc_send __P((struct llc_linkcb *, int, int, int)); +int llc_resend __P((struct llc_linkcb *, int, int)); +int llc_rawsend __P((struct llc_linkcb *, struct mbuf *, struct llc *, int, int, + int, int)); +int cons_rtrequest __P((int, struct rtentry *, struct sockaddr *)); +int x25_llcglue __P((int, struct sockaddr *)); + +#endif + + diff --git a/sys/netccitt/pk.h b/sys/netccitt/pk.h new file mode 100644 index 00000000000..528e0a68080 --- /dev/null +++ b/sys/netccitt/pk.h @@ -0,0 +1,207 @@ +/* + * Copyright (c) University of British Columbia, 1984 + * Copyright (c) 1990, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Laboratory for Computation Vision and the Computer Science Department + * of the University of British Columbia. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)pk.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * + * X.25 Packet Level Definitions: + * + */ + +/* Packet type identifier field defintions. 
*/ + +#define X25_CALL 11 +#define X25_CALL_ACCEPTED 15 +#define X25_CLEAR 19 +#define X25_CLEAR_CONFIRM 23 +#define X25_DATA 0 +#define X25_INTERRUPT 35 +#define X25_INTERRUPT_CONFIRM 39 + +#define X25_RR 1 +#define X25_RNR 5 +#define X25_REJECT 9 +#define X25_RESET 27 +#define X25_RESET_CONFIRM 31 +#define X25_DIAGNOSTIC 241 + +#define X25_RESTART 251 +#define X25_RESTART_CONFIRM 255 + +/* Restart cause field definitions. */ + +#define X25_RESTART_DTE_ORIGINATED 0 +#define X25_RESTART_LOCAL_PROCEDURE_ERROR 1 +#define X25_RESTART_NETWORK_CONGESTION 3 +#define X25_RESTART_NETWORK_OPERATIONAL 7 +#define X25_RESTART_DTE_ORIGINATED2 128 + + +/* Miscellaneous definitions. */ + +#define DATA_PACKET_DESIGNATOR 0x01 +#define RR_OR_RNR_PACKET_DESIGNATOR 0x02 +#define RR_PACKET_DESIGNATOR 0x04 + +#define DEFAULT_WINDOW_SIZE 2 +#define MODULUS 8 + +#define ADDRLN 1 +#define MAXADDRLN 15 +#define FACILITIESLN 1 +#define MAXFACILITIESLN 10 +#define MAXUSERDATA 16 +#define MAXCALLINFOLN 1+15+1+10+16 + +#define PACKET_OK 0 +#define IGNORE_PACKET 1 +#define ERROR_PACKET 2 + +typedef char bool; +#define FALSE 0 +#define TRUE 1 + +/* + * X.25 Packet format definitions + * This will eventually have to be rewritten without reference + * to bit fields, to be ansi C compliant and allignment safe. 
+ */ + +typedef u_char octet; + +struct x25_calladdr { + octet addrlens; + octet address_field[MAXADDRLN]; +}; + +struct x25_packet { + octet bits; + octet logical_channel_number; + octet packet_type; + octet packet_data; +}; +#define packet_cause packet_data + +struct data_packet { + octet bits; +}; + +#define FACILITIES_REVERSE_CHARGE 0x1 +#define FACILITIES_THROUGHPUT 0x2 +#define FACILITIES_PACKETSIZE 0x42 +#define FACILITIES_WINDOWSIZE 0x43 + +#define PKHEADERLN 3 + +#define DP(xp) (((struct data_packet *)&(xp) -> packet_type) -> bits) +#define PS(xp) X25GBITS(DP(xp), p_s) +#define PR(xp) X25GBITS(DP(xp), p_r) +#define MBIT(xp) X25GBITS(DP(xp), m_bit) +#define SPR(xp, v) X25SBITS(DP(xp), p_r, (v)) +#define SPS(xp, v) X25SBITS(DP(xp), p_s, (v)) +#define SMBIT(xp, v) X25SBITS(DP(xp), m_bit, (v)) + +#define LCN(xp) (xp -> logical_channel_number + \ + (X25GBITS(xp -> bits, lc_group_number) ? (X25GBITS(xp -> bits, lc_group_number) << 8) : 0)) +#define SET_LCN(xp, lcn) ((xp -> logical_channel_number = lcn), \ + (X25SBITS(xp -> bits, lc_group_number, lcn > 255 ? lcn >> 8 : 0))) + +struct mbuf *pk_template (); + +/* Define X.25 packet level states. */ + +/* Call setup and clearing substates. */ + +#define LISTEN 0 +#define READY 1 +#define RECEIVED_CALL 2 +#define SENT_CALL 3 +#define DATA_TRANSFER 4 +#define RECEIVED_CLEAR 5 +#define SENT_CLEAR 6 + +/* DTE states. */ + +#define DTE_WAITING 7 +#define DTE_RECEIVED_RESTART 8 +#define DTE_SENT_RESTART 9 +#define DTE_READY 0 + +/* Cleaning out ... */ + +#define LCN_ZOMBIE 10 + +#define MAXSTATES 11 + +/* + * The following definitions are used in a switch statement after + * determining the packet type. These values are returned by the + * pk_decode procedure. 
+ */ + +#define CALL 0 * MAXSTATES +#define CALL_ACCEPTED 1 * MAXSTATES +#define CLEAR 2 * MAXSTATES +#define CLEAR_CONF 3 * MAXSTATES +#define DATA 4 * MAXSTATES +#define INTERRUPT 5 * MAXSTATES +#define INTERRUPT_CONF 6 * MAXSTATES +#define RR 7 * MAXSTATES +#define RNR 8 * MAXSTATES +#define RESET 9 * MAXSTATES +#define RESET_CONF 10 * MAXSTATES +#define RESTART 11 * MAXSTATES +#define RESTART_CONF 12 * MAXSTATES +#define REJECT 13 * MAXSTATES +#define DIAG_TYPE 14 * MAXSTATES +#define INVALID_PACKET 15 * MAXSTATES +#define DELETE_PACKET INVALID_PACKET + +/* + * The following definitions are used by the restart procedures + * for noting wether the PLE is supposed to behave as DTE or DCE + * (essentially necessary for operation over LLC2) + */ +#define DTE_DXERESOLVING 0x0001 +#define DTE_PLAYDTE 0x0002 +#define DTE_PLAYDCE 0x0004 +#define DTE_CONNECTPENDING 0x0010 +#define DTE_PRETENDDTE 0x0020 + +#define MAXRESTARTCOLLISIONS 10 diff --git a/sys/netccitt/pk_acct.c b/sys/netccitt/pk_acct.c new file mode 100644 index 00000000000..fccd875285e --- /dev/null +++ b/sys/netccitt/pk_acct.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) University of British Columbia, 1984 + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Laboratory for Computation Vision and the Computer Science Department + * of the University of British Columbia. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)pk_acct.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + + +struct vnode *pkacctp; +/* + * Turn on packet accounting + */ + +pk_accton (path) + char *path; +{ + register struct vnode *vp = NULL; + struct nameidata nd; + struct vnode *oacctp = pkacctp; + struct proc *p = curproc; + int error; + + if (path == 0) + goto close; + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, p); + if (error = vn_open (&nd, FWRITE, 0644)) + return (error); + vp = nd.ni_vp; + VOP_UNLOCK(vp); + if (vp -> v_type != VREG) { + vrele (vp); + return (EACCES); + } + pkacctp = vp; + if (oacctp) { + close: + error = vn_close (oacctp, FWRITE, p -> p_ucred, p); + } + return (error); +} + +/* + * Write a record on the accounting file. + */ + +pk_acct (lcp) +register struct pklcd *lcp; +{ + register struct vnode *vp; + register struct sockaddr_x25 *sa; + register char *src, *dst; + register int len; + register long etime; + static struct x25acct acbuf; + + if ((vp = pkacctp) == 0) + return; + bzero ((caddr_t)&acbuf, sizeof (acbuf)); + if (lcp -> lcd_ceaddr != 0) + sa = lcp -> lcd_ceaddr; + else if (lcp -> lcd_craddr != 0) { + sa = lcp -> lcd_craddr; + acbuf.x25acct_callin = 1; + } else + return; + + if (sa -> x25_opts.op_flags & X25_REVERSE_CHARGE) + acbuf.x25acct_revcharge = 1; + acbuf.x25acct_stime = lcp -> lcd_stime; + acbuf.x25acct_etime = time.tv_sec - acbuf.x25acct_stime; + acbuf.x25acct_uid = curproc -> p_cred -> p_ruid; + acbuf.x25acct_psize = sa -> x25_opts.op_psize; + acbuf.x25acct_net = sa -> x25_net; + /* + * Convert address to bcd + */ + src = sa -> x25_addr; + dst = acbuf.x25acct_addr; + for (len = 0; *src; len++) + if (len & 01) + *dst++ |= *src++ & 0xf; + else + *dst = *src++ << 4; + acbuf.x25acct_addrlen = len; + + bcopy (sa -> x25_udata, acbuf.x25acct_udata, + sizeof (acbuf.x25acct_udata)); + acbuf.x25acct_txcnt = lcp -> lcd_txcnt; + 
acbuf.x25acct_rxcnt = lcp -> lcd_rxcnt; + + (void) vn_rdwr(UIO_WRITE, vp, (caddr_t)&acbuf, sizeof (acbuf), + (off_t)0, UIO_SYSSPACE, IO_UNIT|IO_APPEND, + curproc -> p_ucred, (int *)0, + (struct proc *)0); +} diff --git a/sys/netccitt/pk_debug.c b/sys/netccitt/pk_debug.c new file mode 100644 index 00000000000..b5103557c56 --- /dev/null +++ b/sys/netccitt/pk_debug.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) University of British Columbia, 1984 + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Laboratory for Computation Vision and the Computer Science Department + * of the University of British Columbia. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)pk_debug.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +char *pk_state[] = { + "Listen", "Ready", "Received-Call", + "Sent-Call", "Data-Transfer","Received-Clear", + "Sent-Clear", +}; + +char *pk_name[] = { + "Call", "Call-Conf", "Clear", + "Clear-Conf", "Data", "Intr", "Intr-Conf", + "Rr", "Rnr", "Reset", "Reset-Conf", + "Restart", "Restart-Conf", "Reject", "Diagnostic", + "Invalid" +}; + +pk_trace (xcp, m, dir) +struct x25config *xcp; +register struct mbuf *m; +char *dir; +{ + register char *s; + struct x25_packet *xp = mtod(m, struct x25_packet *); + register int i, len = 0, cnt = 0; + + if (xcp -> xc_ptrace == 0) + return; + + i = pk_decode (xp) / MAXSTATES; + for (; m; m = m -> m_next) { + len = len + m -> m_len; + ++cnt; + } + printf ("LCN=%d %s: %s #=%d, len=%d ", + LCN(xp), dir, pk_name[i], cnt, len); + for (s = (char *) xp, i = 0; i < 5; ++i, ++s) + printf ("%x ", (int) * s & 0xff); + printf ("\n"); +} + +mbuf_cache(c, m) +register struct mbuf_cache *c; +struct mbuf *m; +{ + register struct mbuf **mp; + + if (c->mbc_size != c->mbc_oldsize) { + unsigned zero_size, copy_size; + unsigned new_size = c->mbc_size * sizeof(m); + caddr_t cache = (caddr_t)c->mbc_cache; + + if (new_size) { + c->mbc_cache = (struct mbuf **) + malloc(new_size, M_MBUF, M_NOWAIT); + if (c->mbc_cache == 0) { + c->mbc_cache = (struct mbuf **)cache; + 
return; + } + c->mbc_num %= c->mbc_size; + } else + c->mbc_cache = 0; + if (c->mbc_size < c->mbc_oldsize) { + register struct mbuf **mplim; + mp = c->mbc_size + (struct mbuf **)cache; + mplim = c->mbc_oldsize + (struct mbuf **)cache; + while (mp < mplim) + m_freem(*mp++); + zero_size = 0; + } else + zero_size = (c->mbc_size - c->mbc_oldsize) * sizeof(m); + copy_size = new_size - zero_size; + c->mbc_oldsize = c->mbc_size; + if (copy_size) + bcopy(cache, (caddr_t)c->mbc_cache, copy_size); + if (cache) + free(cache, M_MBUF); + if (zero_size) + bzero(copy_size + (caddr_t)c->mbc_cache, zero_size); + } + if (c->mbc_size == 0) + return; + mp = c->mbc_cache + c->mbc_num; + c->mbc_num = (1 + c->mbc_num) % c->mbc_size; + if (*mp) + m_freem(*mp); + if (*mp = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) + (*mp)->m_flags |= m->m_flags & 0x08; +} diff --git a/sys/netccitt/pk_input.c b/sys/netccitt/pk_input.c new file mode 100644 index 00000000000..1f8f0bc7127 --- /dev/null +++ b/sys/netccitt/pk_input.c @@ -0,0 +1,1119 @@ +/* + * Copyright (c) University of British Columbia, 1984 + * Copyright (C) Computer Science Department IV, + * University of Erlangen-Nuremberg, Germany, 1992 + * Copyright (c) 1991, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by the + * Laboratory for Computation Vision and the Computer Science Department + * of the the University of British Columbia and the Computer Science + * Department (IV) of the University of Erlangen-Nuremberg, Germany. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)pk_input.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +struct pkcb_q pkcb_q = {&pkcb_q, &pkcb_q}; + +/* + * ccittintr() is the generic interrupt handler for HDLC, LLC2, and X.25. This + * allows to have kernel running X.25 but no HDLC or LLC2 or both (in case we + * employ boards that do all the stuff themselves, e.g. ADAX X.25 or TPS ISDN.) 
+ */ +void +ccittintr () +{ + extern struct ifqueue pkintrq; + extern struct ifqueue hdintrq; + extern struct ifqueue llcintrq; + +#ifdef HDLC + if (hdintrq.ifq_len) + hdintr (); +#endif +#ifdef LLC + if (llcintrq.ifq_len) + llcintr (); +#endif + if (pkintrq.ifq_len) + pkintr (); +} + +struct pkcb * +pk_newlink (ia, llnext) +struct x25_ifaddr *ia; +caddr_t llnext; +{ + register struct x25config *xcp = &ia -> ia_xc; + register struct pkcb *pkp; + register struct pklcd *lcp; + register struct protosw *pp; + unsigned size; + + pp = pffindproto (AF_CCITT, (int) xcp -> xc_lproto, 0); + if (pp == 0 || pp -> pr_output == 0) { + pk_message (0, xcp, "link level protosw error"); + return ((struct pkcb *)0); + } + /* + * Allocate a network control block structure + */ + size = sizeof (struct pkcb); + pkp = (struct pkcb *) malloc (size, M_PCB, M_WAITOK); + if (pkp == 0) + return ((struct pkcb *)0); + bzero ((caddr_t) pkp, size); + pkp -> pk_lloutput = pp -> pr_output; + pkp -> pk_llctlinput = (caddr_t (*)()) pp -> pr_ctlinput; + pkp -> pk_xcp = xcp; + pkp -> pk_ia = ia; + pkp -> pk_state = DTE_WAITING; + pkp -> pk_llnext = llnext; + insque (pkp, &pkcb_q); + + /* + * set defaults + */ + + if (xcp -> xc_pwsize == 0) + xcp -> xc_pwsize = DEFAULT_WINDOW_SIZE; + if (xcp -> xc_psize == 0) + xcp -> xc_psize = X25_PS128; + /* + * Allocate logical channel descriptor vector + */ + + (void) pk_resize (pkp); + return (pkp); +} + + +pk_dellink (pkp) +register struct pkcb *pkp; +{ + register int i; + register struct protosw *pp; + + /* + * Essentially we have the choice to + * (a) go ahead and let the route be deleted and + * leave the pkcb associated with that route + * as it is, i.e. the connections stay open + * (b) do a pk_disconnect() on all channels associated + * with the route via the pkcb and then proceed. 
+ * + * For the time being we stick with (b) + */ + + for (i = 1; i < pkp -> pk_maxlcn; ++i) + if (pkp -> pk_chan[i]) + pk_disconnect (pkp -> pk_chan[i]); + + /* + * Free the pkcb + */ + + /* + * First find the protoswitch to get hold of the link level + * protocol to be notified that the packet level entity is + * dissolving ... + */ + pp = pffindproto (AF_CCITT, (int) pkp -> pk_xcp -> xc_lproto, 0); + if (pp == 0 || pp -> pr_output == 0) { + pk_message (0, pkp -> pk_xcp, "link level protosw error"); + return (EPROTONOSUPPORT); + } + + pkp -> pk_refcount--; + if (!pkp -> pk_refcount) { + struct dll_ctlinfo ctlinfo; + + remque (pkp); + if (pkp -> pk_rt -> rt_llinfo == (caddr_t) pkp) + pkp -> pk_rt -> rt_llinfo = (caddr_t) NULL; + + /* + * Tell the link level that the pkcb is dissolving + */ + if (pp -> pr_ctlinput && pkp -> pk_llnext) { + ctlinfo.dlcti_pcb = pkp -> pk_llnext; + ctlinfo.dlcti_rt = pkp -> pk_rt; + (pp -> pr_ctlinput)(PRC_DISCONNECT_REQUEST, + pkp -> pk_xcp, &ctlinfo); + } + free ((caddr_t) pkp -> pk_chan, M_IFADDR); + free ((caddr_t) pkp, M_PCB); + } + + return (0); +} + + +pk_resize (pkp) +register struct pkcb *pkp; +{ + struct pklcd *dev_lcp = 0; + struct x25config *xcp = pkp -> pk_xcp; + if (pkp -> pk_chan && + (pkp -> pk_maxlcn != xcp -> xc_maxlcn)) { + pk_restart (pkp, X25_RESTART_NETWORK_CONGESTION); + dev_lcp = pkp -> pk_chan[0]; + free ((caddr_t) pkp -> pk_chan, M_IFADDR); + pkp -> pk_chan = 0; + } + if (pkp -> pk_chan == 0) { + unsigned size; + pkp -> pk_maxlcn = xcp -> xc_maxlcn; + size = (pkp -> pk_maxlcn + 1) * sizeof (struct pklcd *); + pkp -> pk_chan = + (struct pklcd **) malloc (size, M_IFADDR, M_WAITOK); + if (pkp -> pk_chan) { + bzero ((caddr_t) pkp -> pk_chan, size); + /* + * Allocate a logical channel descriptor for lcn 0 + */ + if (dev_lcp == 0 && + (dev_lcp = pk_attach ((struct socket *)0)) == 0) + return (ENOBUFS); + dev_lcp -> lcd_state = READY; + dev_lcp -> lcd_pkp = pkp; + pkp -> pk_chan[0] = dev_lcp; + } else { + if 
(dev_lcp) + pk_close (dev_lcp); + return (ENOBUFS); + } + } + return 0; +} + +/* + * This procedure is called by the link level whenever the link + * becomes operational, is reset, or when the link goes down. + */ +/*VARARGS*/ +caddr_t +pk_ctlinput (code, src, addr) + struct sockaddr *src; + caddr_t addr; +{ + register struct pkcb *pkp = (struct pkcb *) addr; + + switch (code) { + case PRC_LINKUP: + if (pkp -> pk_state == DTE_WAITING) + pk_restart (pkp, X25_RESTART_NETWORK_CONGESTION); + break; + + case PRC_LINKDOWN: + pk_restart (pkp, -1); /* Clear all active circuits */ + pkp -> pk_state = DTE_WAITING; + break; + + case PRC_LINKRESET: + pk_restart (pkp, X25_RESTART_NETWORK_CONGESTION); + break; + + case PRC_CONNECT_INDICATION: { + struct rtentry *llrt; + + if ((llrt = rtalloc1(src, 0)) == 0) + return 0; + else llrt -> rt_refcnt--; + + pkp = (((struct npaidbentry *) llrt -> rt_llinfo) -> np_rt) ? + (struct pkcb *)(((struct npaidbentry *) llrt -> rt_llinfo) -> np_rt -> rt_llinfo) : (struct pkcb *) 0; + if (pkp == (struct pkcb *) 0) + return 0; + pkp -> pk_llnext = addr; + + return ((caddr_t) pkp); + } + case PRC_DISCONNECT_INDICATION: + pk_restart (pkp, -1) ; /* Clear all active circuits */ + pkp -> pk_state = DTE_WAITING; + pkp -> pk_llnext = (caddr_t) 0; + } + return (0); +} +struct ifqueue pkintrq; +/* + * This routine is called if there are semi-smart devices that do HDLC + * in hardware and want to queue the packet and call level 3 directly + */ +pkintr () +{ + register struct mbuf *m; + register struct ifaddr *ifa; + register struct ifnet *ifp; + register int s; + + for (;;) { + s = splimp (); + IF_DEQUEUE (&pkintrq, m); + splx (s); + if (m == 0) + break; + if (m -> m_len < PKHEADERLN) { + printf ("pkintr: packet too short (len=%d)\n", + m -> m_len); + m_freem (m); + continue; + } + pk_input (m); + } +} +struct mbuf *pk_bad_packet; +struct mbuf_cache pk_input_cache = {0 }; +/* + * X.25 PACKET INPUT + * + * This procedure is called by a link level procedure 
whenever + * an information frame is received. It decodes the packet and + * demultiplexes based on the logical channel number. + * + * We change the original conventions of the UBC code here -- + * since there may be multiple pkcb's for a given interface + * of type 802.2 class 2, we retrieve which one it is from + * m_pkthdr.rcvif (which has been overwritten by lower layers); + * That field is then restored for the benefit of upper layers which + * may make use of it, such as CLNP. + * + */ + +#define RESTART_DTE_ORIGINATED(xp) (((xp) -> packet_cause == X25_RESTART_DTE_ORIGINATED) || \ + ((xp) -> packet_cause >= X25_RESTART_DTE_ORIGINATED2)) + +pk_input (m) +register struct mbuf *m; +{ + register struct x25_packet *xp; + register struct pklcd *lcp; + register struct socket *so = 0; + register struct pkcb *pkp; + int ptype, lcn, lcdstate = LISTEN; + + if (pk_input_cache.mbc_size || pk_input_cache.mbc_oldsize) + mbuf_cache (&pk_input_cache, m); + if ((m -> m_flags & M_PKTHDR) == 0) + panic ("pkintr"); + + if ((pkp = (struct pkcb *) m -> m_pkthdr.rcvif) == 0) + return; + xp = mtod (m, struct x25_packet *); + ptype = pk_decode (xp); + lcn = LCN(xp); + lcp = pkp -> pk_chan[lcn]; + + /* + * If the DTE is in Restart state, then it will ignore data, + * interrupt, call setup and clearing, flow control and reset + * packets. 
+ */ + if (lcn < 0 || lcn > pkp -> pk_maxlcn) { + pk_message (lcn, pkp -> pk_xcp, "illegal lcn"); + m_freem (m); + return; + } + + pk_trace (pkp -> pk_xcp, m, "P-In"); + + if (pkp -> pk_state != DTE_READY && ptype != RESTART && ptype != RESTART_CONF) { + m_freem (m); + return; + } + if (lcp) { + so = lcp -> lcd_so; + lcdstate = lcp -> lcd_state; + } else { + if (ptype == CLEAR) { /* idle line probe (Datapac specific) */ + /* send response on lcd 0's output queue */ + lcp = pkp -> pk_chan[0]; + lcp -> lcd_template = pk_template (lcn, X25_CLEAR_CONFIRM); + pk_output (lcp); + m_freem (m); + return; + } + if (ptype != CALL) + ptype = INVALID_PACKET; + } + + if (lcn == 0 && ptype != RESTART && ptype != RESTART_CONF) { + pk_message (0, pkp -> pk_xcp, "illegal ptype (%d, %s) on lcn 0", + ptype, pk_name[ptype / MAXSTATES]); + if (pk_bad_packet) + m_freem (pk_bad_packet); + pk_bad_packet = m; + return; + } + + m -> m_pkthdr.rcvif = pkp -> pk_ia -> ia_ifp; + + switch (ptype + lcdstate) { + /* + * Incoming Call packet received. + */ + case CALL + LISTEN: + pk_incoming_call (pkp, m); + break; + + /* + * Call collision: Just throw this "incoming call" away since + * the DCE will ignore it anyway. + */ + case CALL + SENT_CALL: + pk_message ((int) lcn, pkp -> pk_xcp, + "incoming call collision"); + break; + + /* + * Call confirmation packet received. This usually means our + * previous connect request is now complete. + */ + case CALL_ACCEPTED + SENT_CALL: + MCHTYPE(m, MT_CONTROL); + pk_call_accepted (lcp, m); + break; + + /* + * This condition can only happen if the previous state was + * SENT_CALL. Just ignore the packet, eventually a clear + * confirmation should arrive. + */ + case CALL_ACCEPTED + SENT_CLEAR: + break; + + /* + * Clear packet received. This requires a complete tear down + * of the virtual circuit. Free buffers and control blocks. + * and send a clear confirmation. 
+ */ + case CLEAR + READY: + case CLEAR + RECEIVED_CALL: + case CLEAR + SENT_CALL: + case CLEAR + DATA_TRANSFER: + lcp -> lcd_state = RECEIVED_CLEAR; + lcp -> lcd_template = pk_template (lcp -> lcd_lcn, X25_CLEAR_CONFIRM); + pk_output (lcp); + pk_clearcause (pkp, xp); + if (lcp -> lcd_upper) { + MCHTYPE(m, MT_CONTROL); + lcp -> lcd_upper (lcp, m); + } + pk_close (lcp); + lcp = 0; + break; + + /* + * Clear collision: Treat this clear packet as a confirmation. + */ + case CLEAR + SENT_CLEAR: + pk_close (lcp); + break; + + /* + * Clear confirmation received. This usually means the virtual + * circuit is now completely removed. + */ + case CLEAR_CONF + SENT_CLEAR: + pk_close (lcp); + break; + + /* + * A clear confirmation on an unassigned logical channel - just + * ignore it. Note: All other packets on an unassigned channel + * results in a clear. + */ + case CLEAR_CONF + READY: + case CLEAR_CONF + LISTEN: + break; + + /* + * Data packet received. Pass on to next level. Move the Q and M + * bits into the data portion for the next level. + */ + case DATA + DATA_TRANSFER: + if (lcp -> lcd_reset_condition) { + ptype = DELETE_PACKET; + break; + } + + /* + * Process the P(S) flow control information in this Data packet. + * Check that the packets arrive in the correct sequence and that + * they are within the "lcd_input_window". Input window rotation is + * initiated by the receive interface. 
+ */ + + if (PS(xp) != ((lcp -> lcd_rsn + 1) % MODULUS) || + PS(xp) == ((lcp -> lcd_input_window + lcp -> lcd_windowsize) % MODULUS)) { + m_freem (m); + pk_procerror (RESET, lcp, "p(s) flow control error", 1); + break; + } + lcp -> lcd_rsn = PS(xp); + + if (pk_ack (lcp, PR(xp)) != PACKET_OK) { + m_freem (m); + break; + } + m -> m_data += PKHEADERLN; + m -> m_len -= PKHEADERLN; + m -> m_pkthdr.len -= PKHEADERLN; + + lcp -> lcd_rxcnt++; + if (lcp -> lcd_flags & X25_MBS_HOLD) { + register struct mbuf *n = lcp -> lcd_cps; + int mbit = MBIT(xp); + octet q_and_d_bits; + + if (n) { + n -> m_pkthdr.len += m -> m_pkthdr.len; + while (n -> m_next) + n = n -> m_next; + n -> m_next = m; + m = lcp -> lcd_cps; + + if (lcp -> lcd_cpsmax && + n -> m_pkthdr.len > lcp -> lcd_cpsmax) { + pk_procerror (RESET, lcp, + "C.P.S. overflow", 128); + return; + } + q_and_d_bits = 0xc0 & *(octet *) xp; + xp = (struct x25_packet *) + (mtod (m, octet *) - PKHEADERLN); + *(octet *) xp |= q_and_d_bits; + } + if (mbit) { + lcp -> lcd_cps = m; + pk_flowcontrol (lcp, 0, 1); + return; + } + lcp -> lcd_cps = 0; + } + if (so == 0) + break; + if (lcp -> lcd_flags & X25_MQBIT) { + octet t = (X25GBITS(xp -> bits, q_bit)) ? t = 0x80 : 0; + + if (MBIT(xp)) + t |= 0x40; + m -> m_data -= 1; + m -> m_len += 1; + m -> m_pkthdr.len += 1; + *mtod (m, octet *) = t; + } + + /* + * Discard Q-BIT packets if the application + * doesn't want to be informed of M and Q bit status + */ + if (X25GBITS(xp -> bits, q_bit) + && (lcp -> lcd_flags & X25_MQBIT) == 0) { + m_freem (m); + /* + * NB. This is dangerous: sending a RR here can + * cause sequence number errors if a previous data + * packet has not yet been passed up to the application + * (RR's are normally generated via PRU_RCVD). + */ + pk_flowcontrol (lcp, 0, 1); + } else { + sbappendrecord (&so -> so_rcv, m); + sorwakeup (so); + } + break; + + /* + * Interrupt packet received. 
+ */ + case INTERRUPT + DATA_TRANSFER: + if (lcp -> lcd_reset_condition) + break; + lcp -> lcd_intrdata = xp -> packet_data; + lcp -> lcd_template = pk_template (lcp -> lcd_lcn, X25_INTERRUPT_CONFIRM); + pk_output (lcp); + m -> m_data += PKHEADERLN; + m -> m_len -= PKHEADERLN; + m -> m_pkthdr.len -= PKHEADERLN; + MCHTYPE(m, MT_OOBDATA); + if (so) { + if (so -> so_options & SO_OOBINLINE) + sbinsertoob (&so -> so_rcv, m); + else + m_freem (m); + sohasoutofband (so); + } + break; + + /* + * Interrupt confirmation packet received. + */ + case INTERRUPT_CONF + DATA_TRANSFER: + if (lcp -> lcd_reset_condition) + break; + if (lcp -> lcd_intrconf_pending == TRUE) + lcp -> lcd_intrconf_pending = FALSE; + else + pk_procerror (RESET, lcp, "unexpected packet", 43); + break; + + /* + * Receiver ready received. Rotate the output window and output + * any data packets waiting transmission. + */ + case RR + DATA_TRANSFER: + if (lcp -> lcd_reset_condition || + pk_ack (lcp, PR(xp)) != PACKET_OK) { + ptype = DELETE_PACKET; + break; + } + if (lcp -> lcd_rnr_condition == TRUE) + lcp -> lcd_rnr_condition = FALSE; + pk_output (lcp); + break; + + /* + * Receiver Not Ready received. Packets up to the P(R) can be + * be sent. Condition is cleared with a RR. + */ + case RNR + DATA_TRANSFER: + if (lcp -> lcd_reset_condition || + pk_ack (lcp, PR(xp)) != PACKET_OK) { + ptype = DELETE_PACKET; + break; + } + lcp -> lcd_rnr_condition = TRUE; + break; + + /* + * Reset packet received. Set state to FLOW_OPEN. The Input and + * Output window edges ar set to zero. Both the send and receive + * numbers are reset. A confirmation is returned. + */ + case RESET + DATA_TRANSFER: + if (lcp -> lcd_reset_condition) + /* Reset collision. Just ignore packet. 
*/ + break; + + pk_resetcause (pkp, xp); + lcp -> lcd_window_condition = lcp -> lcd_rnr_condition = + lcp -> lcd_intrconf_pending = FALSE; + lcp -> lcd_output_window = lcp -> lcd_input_window = + lcp -> lcd_last_transmitted_pr = 0; + lcp -> lcd_ssn = 0; + lcp -> lcd_rsn = MODULUS - 1; + + lcp -> lcd_template = pk_template (lcp -> lcd_lcn, X25_RESET_CONFIRM); + pk_output (lcp); + + pk_flush (lcp); + if (so == 0) + break; + wakeup ((caddr_t) & so -> so_timeo); + sorwakeup (so); + sowwakeup (so); + break; + + /* + * Reset confirmation received. + */ + case RESET_CONF + DATA_TRANSFER: + if (lcp -> lcd_reset_condition) { + lcp -> lcd_reset_condition = FALSE; + pk_output (lcp); + } + else + pk_procerror (RESET, lcp, "unexpected packet", 32); + break; + + case DATA + SENT_CLEAR: + ptype = DELETE_PACKET; + case RR + SENT_CLEAR: + case RNR + SENT_CLEAR: + case INTERRUPT + SENT_CLEAR: + case INTERRUPT_CONF + SENT_CLEAR: + case RESET + SENT_CLEAR: + case RESET_CONF + SENT_CLEAR: + /* Just ignore p if we have sent a CLEAR already. + */ + break; + + /* + * Restart sets all the permanent virtual circuits to the "Data + * Transfer" stae and all the switched virtual circuits to the + * "Ready" state. + */ + case RESTART + READY: + switch (pkp -> pk_state) { + case DTE_SENT_RESTART: + /* + * Restart collision. 
+ * If case the restart cause is "DTE originated" we + * have a DTE-DTE situation and are trying to resolve + * who is going to play DTE/DCE [ISO 8208:4.2-4.5] + */ + if (RESTART_DTE_ORIGINATED(xp)) { + pk_restart (pkp, X25_RESTART_DTE_ORIGINATED); + pk_message (0, pkp -> pk_xcp, + "RESTART collision"); + if ((pkp -> pk_restartcolls++) > MAXRESTARTCOLLISIONS) { + pk_message (0, pkp -> pk_xcp, + "excessive RESTART collisions"); + pkp -> pk_restartcolls = 0; + } + break; + } + pkp -> pk_state = DTE_READY; + pkp -> pk_dxerole |= DTE_PLAYDTE; + pkp -> pk_dxerole &= ~DTE_PLAYDCE; + pk_message (0, pkp -> pk_xcp, + "Packet level operational"); + pk_message (0, pkp -> pk_xcp, + "Assuming DTE role"); + if (pkp -> pk_dxerole & DTE_CONNECTPENDING) + pk_callcomplete (pkp); + break; + + default: + pk_restart (pkp, -1); + pk_restartcause (pkp, xp); + pkp -> pk_chan[0] -> lcd_template = pk_template (0, + X25_RESTART_CONFIRM); + pk_output (pkp -> pk_chan[0]); + pkp -> pk_state = DTE_READY; + pkp -> pk_dxerole |= RESTART_DTE_ORIGINATED(xp) ? DTE_PLAYDCE : + DTE_PLAYDTE; + if (pkp -> pk_dxerole & DTE_PLAYDTE) { + pkp -> pk_dxerole &= ~DTE_PLAYDCE; + pk_message (0, pkp -> pk_xcp, + "Assuming DTE role"); + } else { + pkp -> pk_dxerole &= ~DTE_PLAYDTE; + pk_message (0, pkp -> pk_xcp, + "Assuming DCE role"); + } + if (pkp -> pk_dxerole & DTE_CONNECTPENDING) + pk_callcomplete (pkp); + } + break; + + /* + * Restart confirmation received. All logical channels are set + * to READY. + */ + case RESTART_CONF + READY: + switch (pkp -> pk_state) { + case DTE_SENT_RESTART: + pkp -> pk_state = DTE_READY; + pkp -> pk_dxerole |= DTE_PLAYDTE; + pkp -> pk_dxerole &= ~DTE_PLAYDCE; + pk_message (0, pkp -> pk_xcp, + "Packet level operational"); + pk_message (0, pkp -> pk_xcp, + "Assuming DTE role"); + if (pkp -> pk_dxerole & DTE_CONNECTPENDING) + pk_callcomplete (pkp); + break; + + default: + /* Restart local procedure error. 
*/ + pk_restart (pkp, X25_RESTART_LOCAL_PROCEDURE_ERROR); + pkp -> pk_state = DTE_SENT_RESTART; + pkp -> pk_dxerole &= ~(DTE_PLAYDTE | DTE_PLAYDCE); + } + break; + + default: + if (lcp) { + pk_procerror (CLEAR, lcp, "unknown packet error", 33); + pk_message (lcn, pkp -> pk_xcp, + "\"%s\" unexpected in \"%s\" state", + pk_name[ptype/MAXSTATES], pk_state[lcdstate]); + } else + pk_message (lcn, pkp -> pk_xcp, + "packet arrived on unassigned lcn"); + break; + } + if (so == 0 && lcp && lcp -> lcd_upper && lcdstate == DATA_TRANSFER) { + if (ptype != DATA && ptype != INTERRUPT) + MCHTYPE(m, MT_CONTROL); + lcp -> lcd_upper (lcp, m); + } else if (ptype != DATA && ptype != INTERRUPT) + m_freem (m); +} + +static +prune_dnic (from, to, dnicname, xcp) +char *from, *to, *dnicname; +register struct x25config *xcp; +{ + register char *cp1 = from, *cp2 = from; + if (xcp -> xc_prepnd0 && *cp1 == '0') { + from = ++cp1; + goto copyrest; + } + if (xcp -> xc_nodnic) { + for (cp1 = dnicname; *cp2 = *cp1++;) + cp2++; + cp1 = from; + } +copyrest: + for (cp1 = dnicname; *cp2 = *cp1++;) + cp2++; +} +/* static */ +pk_simple_bsd (from, to, lower, len) +register octet *from, *to; +register len, lower; +{ + register int c; + while (--len >= 0) { + c = *from; + if (lower & 0x01) + *from++; + else + c >>= 4; + c &= 0x0f; c |= 0x30; *to++ = c; lower++; + } + *to = 0; +} + +/*static octet * */ +pk_from_bcd (a, iscalling, sa, xcp) +register struct x25_calladdr *a; +register struct sockaddr_x25 *sa; +register struct x25config *xcp; +{ + octet buf[MAXADDRLN+1]; + octet *cp; + unsigned count; + + bzero ((caddr_t) sa, sizeof (*sa)); + sa -> x25_len = sizeof (*sa); + sa -> x25_family = AF_CCITT; + if (iscalling) { + cp = a -> address_field + (X25GBITS(a -> addrlens, called_addrlen) / 2); + count = X25GBITS(a -> addrlens, calling_addrlen); + pk_simple_bsd (cp, buf, X25GBITS(a -> addrlens, called_addrlen), count); + } else { + count = X25GBITS(a -> addrlens, called_addrlen); + pk_simple_bsd (a -> 
address_field, buf, 0, count); + } + if (xcp -> xc_addr.x25_net && (xcp -> xc_nodnic || xcp -> xc_prepnd0)) { + octet dnicname[sizeof (long) * NBBY/3 + 2]; + + sprintf ((char *) dnicname, "%d", xcp -> xc_addr.x25_net); + prune_dnic ((char *) buf, sa -> x25_addr, dnicname, xcp); + } else + bcopy ((caddr_t) buf, (caddr_t) sa -> x25_addr, count + 1); +} + +static +save_extra (m0, fp, so) +struct mbuf *m0; +octet *fp; +struct socket *so; +{ + register struct mbuf *m; + struct cmsghdr cmsghdr; + if (m = m_copy (m, 0, (int)M_COPYALL)) { + int off = fp - mtod (m0, octet *); + int len = m -> m_pkthdr.len - off + sizeof (cmsghdr); + cmsghdr.cmsg_len = len; + cmsghdr.cmsg_level = AF_CCITT; + cmsghdr.cmsg_type = PK_FACILITIES; + m_adj (m, off); + M_PREPEND (m, sizeof (cmsghdr), M_DONTWAIT); + if (m == 0) + return; + bcopy ((caddr_t)&cmsghdr, mtod (m, caddr_t), sizeof (cmsghdr)); + MCHTYPE(m, MT_CONTROL); + sbappendrecord (&so -> so_rcv, m); + } +} + +/* + * This routine handles incoming call packets. It matches the protocol + * field on the Call User Data field (usually the first four bytes) with + * sockets awaiting connections. + */ + +pk_incoming_call (pkp, m0) +struct mbuf *m0; +struct pkcb *pkp; +{ + register struct pklcd *lcp = 0, *l; + register struct sockaddr_x25 *sa; + register struct x25_calladdr *a; + register struct socket *so = 0; + struct x25_packet *xp = mtod (m0, struct x25_packet *); + struct mbuf *m; + struct x25config *xcp = pkp -> pk_xcp; + int len = m0 -> m_pkthdr.len; + unsigned udlen; + char *errstr = "server unavailable"; + octet *u, *facp; + int lcn = LCN(xp); + + /* First, copy the data from the incoming call packet to a X25 address + descriptor. 
It is to be regretted that you have + to parse the facilities into a sockaddr to determine + if reverse charging is being requested */ + if ((m = m_get (M_DONTWAIT, MT_SONAME)) == 0) + return; + sa = mtod (m, struct sockaddr_x25 *); + a = (struct x25_calladdr *) &xp -> packet_data; + facp = u = (octet *) (a -> address_field + + ((X25GBITS(a -> addrlens, called_addrlen) + X25GBITS(a -> addrlens, calling_addrlen) + 1) / 2)); + u += *u + 1; + udlen = min (16, ((octet *) xp) + len - u); + if (udlen < 0) + udlen = 0; + pk_from_bcd (a, 1, sa, pkp -> pk_xcp); /* get calling address */ + pk_parse_facilities (facp, sa); + bcopy ((caddr_t) u, sa -> x25_udata, udlen); + sa -> x25_udlen = udlen; + + /* + * Now, loop through the listen sockets looking for a match on the + * PID. That is the first few octets of the user data field. + * This is the closest thing to a port number for X.25 packets. + * It does provide a way of multiplexing services at the user level. + */ + + for (l = pk_listenhead; l; l = l -> lcd_listen) { + struct sockaddr_x25 *sxp = l -> lcd_ceaddr; + + if (bcmp (sxp -> x25_udata, u, sxp -> x25_udlen)) + continue; + if (sxp -> x25_net && + sxp -> x25_net != xcp -> xc_addr.x25_net) + continue; + /* + * don't accept incoming calls with the D-Bit on + * unless the server agrees + */ + if (X25GBITS(xp -> bits, d_bit) && !(sxp -> x25_opts.op_flags & X25_DBIT)) { + errstr = "incoming D-Bit mismatch"; + break; + } + /* + * don't accept incoming collect calls unless + * the server sets the reverse charging option. + */ + if ((sxp -> x25_opts.op_flags & (X25_OLDSOCKADDR|X25_REVERSE_CHARGE)) == 0 && + sa -> x25_opts.op_flags & X25_REVERSE_CHARGE) { + errstr = "incoming collect call refused"; + break; + } + if (l -> lcd_so) { + if (so = sonewconn (l -> lcd_so, SS_ISCONNECTED)) + lcp = (struct pklcd *) so -> so_pcb; + } else + lcp = pk_attach ((struct socket *) 0); + if (lcp == 0) { + /* + * Insufficient space or too many unaccepted + * connections. 
Just throw the call away. + */ + errstr = "server malfunction"; + break; + } + lcp -> lcd_upper = l -> lcd_upper; + lcp -> lcd_upnext = l -> lcd_upnext; + lcp -> lcd_lcn = lcn; + lcp -> lcd_state = RECEIVED_CALL; + sa -> x25_opts.op_flags |= (sxp -> x25_opts.op_flags & + ~X25_REVERSE_CHARGE) | l -> lcd_flags; + pk_assoc (pkp, lcp, sa); + lcp -> lcd_faddr = *sa; + lcp -> lcd_laddr.x25_udlen = sxp -> x25_udlen; + lcp -> lcd_craddr = &lcp -> lcd_faddr; + lcp -> lcd_template = pk_template (lcp -> lcd_lcn, X25_CALL_ACCEPTED); + if (lcp -> lcd_flags & X25_DBIT) { + if (X25GBITS(xp -> bits, d_bit)) + X25SBITS(mtod (lcp -> lcd_template, + struct x25_packet *) -> bits, d_bit, 1); + else + lcp -> lcd_flags &= ~X25_DBIT; + } + if (so) { + pk_output (lcp); + soisconnected (so); + if (so -> so_options & SO_OOBINLINE) + save_extra (m0, facp, so); + } else if (lcp -> lcd_upper) { + (*lcp -> lcd_upper) (lcp, m0); + } + (void) m_free (m); + return; + } + + /* + * If the call fails for whatever reason, we still need to build a + * skeleton LCD in order to be able to properly receive the CLEAR + * CONFIRMATION. 
+ */ +#ifdef WATERLOO /* be explicit */ + if (l == 0 && bcmp (sa -> x25_udata, "ean", 3) == 0) + pk_message (lcn, pkp -> pk_xcp, "host=%s ean%c: %s", + sa -> x25_addr, sa -> x25_udata[3] & 0xff, errstr); + else if (l == 0 && bcmp (sa -> x25_udata, "\1\0\0\0", 4) == 0) + pk_message (lcn, pkp -> pk_xcp, "host=%s x29d: %s", + sa -> x25_addr, errstr); + else +#endif + pk_message (lcn, pkp -> pk_xcp, "host=%s pid=%x %x %x %x: %s", + sa -> x25_addr, sa -> x25_udata[0] & 0xff, + sa -> x25_udata[1] & 0xff, sa -> x25_udata[2] & 0xff, + sa -> x25_udata[3] & 0xff, errstr); + if ((lcp = pk_attach ((struct socket *)0)) == 0) { + (void) m_free (m); + return; + } + lcp -> lcd_lcn = lcn; + lcp -> lcd_state = RECEIVED_CALL; + pk_assoc (pkp, lcp, sa); + (void) m_free (m); + pk_clear (lcp, 0, 1); +} + +pk_call_accepted (lcp, m) +struct pklcd *lcp; +struct mbuf *m; +{ + register struct x25_calladdr *ap; + register octet *fcp; + struct x25_packet *xp = mtod (m, struct x25_packet *); + int len = m -> m_len; + + lcp -> lcd_state = DATA_TRANSFER; + if (lcp -> lcd_so) + soisconnected (lcp -> lcd_so); + if ((lcp -> lcd_flags & X25_DBIT) && (X25GBITS(xp -> bits, d_bit) == 0)) + lcp -> lcd_flags &= ~X25_DBIT; + if (len > 3) { + ap = (struct x25_calladdr *) &xp -> packet_data; + fcp = (octet *) ap -> address_field + (X25GBITS(ap -> addrlens, calling_addrlen) + + X25GBITS(ap -> addrlens, called_addrlen) + 1) / 2; + if (fcp + *fcp <= ((octet *) xp) + len) + pk_parse_facilities (fcp, lcp -> lcd_ceaddr); + } + pk_assoc (lcp -> lcd_pkp, lcp, lcp -> lcd_ceaddr); + if (lcp -> lcd_so == 0 && lcp -> lcd_upper) + lcp -> lcd_upper (lcp, m); +} + +pk_parse_facilities (fcp, sa) +register octet *fcp; +register struct sockaddr_x25 *sa; +{ + register octet *maxfcp; + + maxfcp = fcp + *fcp; + fcp++; + while (fcp < maxfcp) { + /* + * Ignore national DCE or DTE facilities + */ + if (*fcp == 0 || *fcp == 0xff) + break; + switch (*fcp) { + case FACILITIES_WINDOWSIZE: + sa -> x25_opts.op_wsize = fcp[1]; + fcp += 3; 
+ break; + + case FACILITIES_PACKETSIZE: + sa -> x25_opts.op_psize = fcp[1]; + fcp += 3; + break; + + case FACILITIES_THROUGHPUT: + sa -> x25_opts.op_speed = fcp[1]; + fcp += 2; + break; + + case FACILITIES_REVERSE_CHARGE: + if (fcp[1] & 01) + sa -> x25_opts.op_flags |= X25_REVERSE_CHARGE; + /* + * Datapac specific: for a X.25(1976) DTE, bit 2 + * indicates a "hi priority" (eg. international) call. + */ + if (fcp[1] & 02 && sa -> x25_opts.op_psize == 0) + sa -> x25_opts.op_psize = X25_PS128; + fcp += 2; + break; + + default: +/*printf("unknown facility %x, class=%d\n", *fcp, (*fcp & 0xc0) >> 6);*/ + switch ((*fcp & 0xc0) >> 6) { + case 0: /* class A */ + fcp += 2; + break; + + case 1: + fcp += 3; + break; + + case 2: + fcp += 4; + break; + + case 3: + fcp++; + fcp += *fcp; + } + } + } +} diff --git a/sys/netccitt/pk_llcsubr.c b/sys/netccitt/pk_llcsubr.c new file mode 100644 index 00000000000..d8cc5016a28 --- /dev/null +++ b/sys/netccitt/pk_llcsubr.c @@ -0,0 +1,369 @@ +/* + * Copyright (C) Dirk Husemann, Computer Science Department IV, + * University of Erlangen-Nuremberg, Germany, 1990, 1991, 1992 + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Dirk Husemann and the Computer Science Department (IV) of + * the University of Erlangen-Nuremberg, Germany. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)pk_llcsubr.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +/* + * Routing support for X.25 + * + * We distinguish between two cases: + * RTF_HOST: + * rt_key(rt) X.25 address of host + * rt_gateway SNPA (MAC+DLSAP) address of host + * rt_llinfo pkcb for rt_key(rt) + * + * RTF_GATEWAY + * rt_key(rt) X.25 address of host or suitably masked network + * rt_gateway X.25 address of next X.25 gateway (switch) + * rt_llinfo rtentry for rt_gateway address + * ought to be of type RTF_HOST + * + * + * Mapping of X.121 to pkcbs: + * + * HDLC uses the DTE-DCE model of X.25, therefore we need a many-to-one + * relationship, i.e.: + * + * {X.121_a, X.121_b, X.121_c, ..., X.121_i} -> pkcb_0 + * + * LLC2 utilizes the DTE-DTE model of X.25, resulting effectively in a + * one-to-one relationship, i.e.: + * + * {X.121_j} -> pkcb_1a + * {X.121_k} -> pkcb_1b + * ... + * {X.121_q} -> pkcb_1q + * + * It might make sense to allow a many-to-one relation for LLC2 also, + * + * {X.121_r, X.121_s, X.121_t, X.121_u} -> pkcb_2a + * + * This would make addresses X.121_[r-u] essentially aliases of one + * address ({X.121_[r-u]} would constitute a representative set). + * + * Each one-to-one relation must obviously be entered individually with + * a route add command, whereas a many-to-one relationship can be + * either entered individually or generated by using a netmask. + * + * To facilitate dealings the many-to-one case for LLC2 can only be + * established via a netmask. + * + */ + +#define XTRACTPKP(rt) ((rt)->rt_flags & RTF_GATEWAY ? \ + ((rt)->rt_llinfo ? 
\
+		(struct pkcb *) ((struct rtentry *)((rt)->rt_llinfo))->rt_llinfo : \
+		(struct pkcb *) NULL) : \
+	(struct pkcb *)((rt)->rt_llinfo))
+
+#define equal(a1, a2) (bcmp((caddr_t)(a1), \
+			       (caddr_t)(a2), \
+			       (a1)->sa_len) == 0)
+#define XIFA(rt) ((struct x25_ifaddr *)((rt)->rt_ifa))
+/* The argument is parenthesized so that any expression is cast as a whole. */
+#define SA(s) ((struct sockaddr *)(s))
+
+/*
+ * Route request hook for X.25 routes.
+ *
+ * RTM_ADD / RTM_RESOLVE:
+ *	returns EEXIST when the route already resolves to a pkcb;
+ *	for RTF_GATEWAY routes the route to rt_gateway is looked up and
+ *	cached in rt_llinfo; otherwise a pkcb is bound to the route
+ *	(ENETDOWN when the route has no interface address).
+ * RTM_DELETE:
+ *	releases what RTM_ADD set up, when there is a pkcb.
+ *
+ * Every other request, and RTM_DELETE without a pkcb, is a no-op that
+ * returns 0.  The dst argument is not used.
+ */
+int
+cons_rtrequest(int cmd, struct rtentry *rt, struct sockaddr *dst)
+{
+	register struct pkcb *pkp;
+	register char one_to_one;
+	struct pkcb *pk_newlink();
+	struct rtentry *npaidb_enter();
+
+	pkp = XTRACTPKP(rt);
+
+	switch(cmd) {
+	case RTM_RESOLVE:
+	case RTM_ADD:
+		if (pkp)
+			return(EEXIST);
+
+		if (rt->rt_flags & RTF_GATEWAY) {
+			if (rt->rt_llinfo)
+				RTFREE((struct rtentry *)rt->rt_llinfo);
+			rt->rt_llinfo = (caddr_t) rtalloc1(rt->rt_gateway, 1);
+			return(0);
+		}
+		/*
+		 * Assumptions:	(1) ifnet structure is filled in
+		 *		(2) at least the pkcb created via
+		 *		    x25config (ifconfig?) has been
+		 *		    set up already.
+		 *		(3) HDLC interfaces have an if_type of
+		 *		    IFT_X25{,DDN}, LLC2 interfaces
+		 *		    anything else (any better way to
+		 *		    do this?)
+		 *
+		 */
+		if (!rt->rt_ifa)
+			return (ENETDOWN);
+
+		/*
+		 * We differentiate between dealing with a many-to-one
+		 * (HDLC: DTE-DCE) and a one-to-one (LLC2: DTE-DTE)
+		 * relationship (by looking at the if type).
+		 *
+		 * Only in case of the many-to-one relationship (HDLC)
+		 * we set the ia->ia_pkcb pointer to the pkcb allocated
+		 * via pk_newlink() as we will use just that one pkcb for
+		 * future route additions (the rtentry->rt_llinfo pointer
+		 * points to the pkcb allocated for that route).
+		 *
+		 * In case of the one-to-one relationship (LLC2) we
+		 * create a new pkcb (via pk_newlink()) for each new rtentry.
+		 *
+		 * NOTE: Only in case of HDLC does ia->ia_pkcb point
+		 * to a pkcb, in the LLC2 case it doesn't (as we don't
+		 * need it here)!
+		 */
+		one_to_one = ISISO8802(rt->rt_ifp);
+
+		if (!(pkp = XIFA(rt)->ia_pkcb) && !one_to_one)
+			XIFA(rt)->ia_pkcb = pkp =
+				pk_newlink(XIFA(rt), (caddr_t) 0);
+		else if (one_to_one &&
+			 !equal(rt->rt_gateway, rt->rt_ifa->ifa_addr)) {
+			pkp = pk_newlink(XIFA(rt), (caddr_t) 0);
+			/*
+			 * We also need another route entry for mapping
+			 * MAC+LSAP->X.25 address.  The result of
+			 * pk_newlink() is tested for NULL just below, so
+			 * it must not be dereferenced unchecked here.
+			 */
+			if (pkp)
+				pkp->pk_llrt = npaidb_enter(rt->rt_gateway,
+							    rt_key(rt), rt, 0);
+		}
+		if (pkp) {
+			if (!pkp->pk_rt)
+				pkp->pk_rt = rt;
+			pkp->pk_refcount++;
+		}
+		rt->rt_llinfo = (caddr_t) pkp;
+
+		return(0);
+
+	case RTM_DELETE:
+		/*
+		 * The pkp might be empty if we are dealing
+		 * with an interface route entry for LLC2, in this
+		 * case we don't need to do anything ...
+		 */
+		if (pkp) {
+			if ( rt->rt_flags & RTF_GATEWAY ) {
+				if (rt->rt_llinfo)
+					RTFREE((struct rtentry *)rt->rt_llinfo);
+				return(0);
+			}
+
+			if (pkp->pk_llrt)
+				npaidb_destroy(pkp->pk_llrt);
+
+			pk_dellink (pkp);
+
+			return(0);
+		}
+		break;
+	}
+
+	/* Nothing to do: give the caller a defined result. */
+	return(0);
+}
+
+/*
+ * Network Protocol Addressing Information DataBase (npaidb)
+ *
+ * To speed up locating the entity dealing with an LLC packet use is made
+ * of a routing tree. This npaidb routing tree is handled
+ * by the normal rn_*() routines just like (almost) any other routing tree.
+ *
+ * The mapping being done by the npaidb_*() routines is as follows:
+ *
+ *	Key:		MAC,LSAP (enhancing struct sockaddr_dl)
+ *	Gateway:	sockaddr_x25 (i.e. 
X.25 address - X.121 or NSAP)
+ *	Llinfo:		npaidbentry {
+ *				struct llc_linkcb *np_link;
+ *				struct rtentry *np_rt;
+ *			}
+ *
+ * Using the npaidbentry provided by llinfo we can then access
+ *
+ *	o the pkcb by using (struct pkcb *) (np_rt->rt_llinfo)
+ *	o the linkcb via np_link
+ *
+ * The following functions are provided
+ *
+ *	o npaidb_enter(struct sockaddr_dl *key, struct sockaddr *value,
+ *		       struct rtentry *rt, struct llc_linkcb *link)
+ *
+ *	o npaidb_enrich(short type, caddr_t info, struct sockaddr_dl *sdl)
+ *
+ *	o npaidb_destroy(struct rtentry *rt)
+ *
+ */
+
+struct sockaddr_dl npdl_netmask = {
+ sizeof(struct sockaddr_dl),					/* _len */
+ 0,								/* _family */
+ 0,								/* _index */
+ 0,								/* _type */
+ -1,								/* _nlen */
+ -1,								/* _alen */
+ -1,								/* _slen */
+ { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},		/* _data */
+};
+struct sockaddr npdl_dummy;
+
+int npdl_datasize = sizeof(struct sockaddr_dl)-
+		((int)((caddr_t)&((struct sockaddr_dl *)0)->sdl_data[0]));
+
+/*
+ * Look up the npaidb route for key and create it when it is missing.
+ *
+ * An existing entry is returned with the reference taken by the lookup
+ * dropped again.  For a new entry the route is added with a netmask that
+ * ignores the lowest bit of the SAP byte, and an npaidbentry whose np_rt
+ * points at rt is attached as rt_llinfo.  Returns 0 when the route could
+ * not be added.  The link argument is not used here.
+ */
+struct rtentry *
+npaidb_enter(struct sockaddr_dl *key, struct sockaddr *value,
+	     struct rtentry *rt, struct llc_linkcb *link)
+{
+	struct rtentry *nprt;
+	register int i;
+	int error;
+
+	USES_AF_LINK_RTS;
+
+	if ((nprt = rtalloc1(SA(key), 0)) == 0) {
+		register u_int size = sizeof(struct npaidbentry);
+		register u_char saploc = LLSAPLOC(key, rt->rt_ifp);
+
+		/*
+		 * set up netmask: LLC2 packets have the lowest bit set in
+		 * response packets (e.g. 0x7e for command packets, 0x7f for
+		 * response packets), to facilitate the lookup we use a netmask
+		 * of 11111110 for the SAP position. The remaining positions
+		 * are zeroed out.
+		 */
+		npdl_netmask.sdl_data[saploc] = NPDL_SAPNETMASK;
+		bzero((caddr_t)&npdl_netmask.sdl_data[saploc+1],
+		      npdl_datasize-saploc-1);
+
+		if (value == 0)
+			value = &npdl_dummy;
+
+		/* now enter it */
+		error = rtrequest(RTM_ADD, SA(key), SA(value),
+				  SA(&npdl_netmask), 0, &nprt);
+
+		/* and reset npdl_netmask */
+		for (i = saploc; i < npdl_datasize; i++)
+			npdl_netmask.sdl_data[i] = -1;
+
+		/*
+		 * The add may fail; nprt is then still the null pointer
+		 * from the lookup above and must not be touched.
+		 */
+		if (error || nprt == 0)
+			return ((struct rtentry *) 0);
+
+		nprt->rt_llinfo = malloc(size , M_PCB, M_WAITOK);
+		if (nprt->rt_llinfo) {
+			bzero (nprt->rt_llinfo, size);
+			((struct npaidbentry *) (nprt->rt_llinfo))->np_rt = rt;
+		}
+	} else nprt->rt_refcnt--;
+	return nprt;
+}
+
+/*
+ * Attach additional information to the npaidb entry found for sdl.
+ * For type NPAIDB_LINK info is stored as the entry's np_link.
+ * Returns the route found, or 0 when the lookup fails.
+ */
+struct rtentry *
+npaidb_enrich(short type, caddr_t info, struct sockaddr_dl *sdl)
+{
+	struct rtentry *rt;
+
+	USES_AF_LINK_RTS;
+
+	if (rt = rtalloc1((struct sockaddr *)sdl, 0)) {
+		rt->rt_refcnt--;
+		switch (type) {
+		case NPAIDB_LINK:
+			/*
+			 * npaidb_enter() tolerates a failed allocation of
+			 * the npaidbentry, so rt_llinfo may be empty.
+			 */
+			if (rt->rt_llinfo)
+				((struct npaidbentry *)(rt->rt_llinfo))->np_link =
+					(struct llc_linkcb *) info;
+			break;
+		}
+		return rt;
+	}
+
+	return ((struct rtentry *) 0);
+
+}
+
+/*
+ * Release the npaidbentry hanging off rt and delete the route.
+ * Returns the result of the RTM_DELETE request.
+ */
+int
+npaidb_destroy(struct rtentry *rt)
+{
+	USES_AF_LINK_RTS;
+
+	if (rt->rt_llinfo) {
+		free((caddr_t) rt->rt_llinfo, M_PCB);
+		/* do not leave a pointer to freed memory in the route */
+		rt->rt_llinfo = 0;
+	}
+	return(rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt),
+			 0, 0));
+}
+
+
+#ifdef LLC
+/*
+ * Glue between X.25 and LLC2
+ */
+int
+x25_llcglue(int prc, struct sockaddr *addr)
+{
+	register struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)addr;
+	register struct x25_ifaddr *x25ifa;
+	struct dll_ctlinfo ctlinfo;
+
+	if((x25ifa = (struct x25_ifaddr *)ifa_ifwithaddr(addr)) == 0)
+		return 0;
+
+	/* the dllconfig is taken from right behind the sockaddr_x25 in ia_xc */
+	ctlinfo.dlcti_cfg =
+		(struct dllconfig *)(((struct sockaddr_x25 *)(&x25ifa->ia_xc))+1);
+	ctlinfo.dlcti_lsap = LLC_X25_LSAP;
+
+	return ((int)llc_ctlinput(prc, addr, (caddr_t)&ctlinfo));
+}
+#endif /* LLC */
diff --git a/sys/netccitt/pk_output.c b/sys/netccitt/pk_output.c
new file mode 100644
index 00000000000..ccc02a4c327
--- /dev/null
+++ b/sys/netccitt/pk_output.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) University of British Columbia, 1984
+ * Copyright (C) 
Computer Science Department IV, + * University of Erlangen-Nuremberg, Germany, 1992 + * Copyright (c) 1991, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by the + * Laboratory for Computation Vision and the Computer Science Department + * of the the University of British Columbia and the Computer Science + * Department (IV) of the University of Erlangen-Nuremberg, Germany. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)pk_output.c	8.1 (Berkeley) 6/10/93
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+
+struct	mbuf_cache pk_output_cache = {0 }, pk_input_cache;
+struct	mbuf *nextpk ();
+
+/*
+ * Send everything that is ready to go on logical channel lcp.
+ *
+ * Each packet obtained from nextpk () is classified by pk_decode ()
+ * combined with the current channel state; the channel's state, timers
+ * and sequence numbers are updated accordingly and the packet is handed
+ * to the link level output routine of the pkcb.  A packet that is not
+ * valid in the current state is freed and transmission stops.
+ */
+pk_output (lcp)
+register struct pklcd *lcp;
+{
+	register struct x25_packet *xp;
+	register struct mbuf *m;
+	register struct pkcb *pkp;
+
+	/* lcp has to be checked before lcd_pkp can be fetched from it */
+	if (lcp == 0 || (pkp = lcp -> lcd_pkp) == 0) {
+		printf ("pk_output: zero arg\n");
+		return;
+	}
+
+	while ((m = nextpk (lcp)) != NULL) {
+		xp = mtod (m, struct x25_packet *);
+
+		switch (pk_decode (xp) + lcp -> lcd_state) {
+		/*
+		 *  All the work is already done - just set the state and
+		 *  pass to peer.
+		 */
+		case CALL + READY:
+			lcp -> lcd_state = SENT_CALL;
+			lcp -> lcd_timer = pk_t21;
+			break;
+
+		/*
+		 *  Just set the state to allow packet to flow and send the
+		 *  confirmation.
+		 */
+		case CALL_ACCEPTED + RECEIVED_CALL:
+			lcp -> lcd_state = DATA_TRANSFER;
+			break;
+
+		/*
+		 *  Just set the state. Keep the LCD around till the clear
+		 *  confirmation is returned.
+		 */
+		case CLEAR + RECEIVED_CALL:
+		case CLEAR + SENT_CALL:
+		case CLEAR + DATA_TRANSFER:
+			lcp -> lcd_state = SENT_CLEAR;
+			lcp -> lcd_retry = 0;
+			/* fall through */
+
+		case CLEAR + SENT_CLEAR:
+			lcp -> lcd_timer = pk_t23;
+			lcp -> lcd_retry++;
+			break;
+
+		case CLEAR_CONF + RECEIVED_CLEAR:
+		case CLEAR_CONF + SENT_CLEAR:
+		case CLEAR_CONF + READY:
+			lcp -> lcd_state = READY;
+			break;
+
+		case DATA + DATA_TRANSFER:
+			SPS(xp, lcp -> lcd_ssn);
+			lcp -> lcd_input_window =
+				(lcp -> lcd_rsn + 1) % MODULUS;
+			SPR(xp, lcp -> lcd_input_window);
+			lcp -> lcd_last_transmitted_pr = lcp -> lcd_input_window;
+			lcp -> lcd_ssn = (lcp -> lcd_ssn + 1) % MODULUS;
+			/* window is full once P(S) reaches its upper edge */
+			if (lcp -> lcd_ssn == ((lcp -> lcd_output_window + lcp -> lcd_windowsize) % MODULUS))
+				lcp -> lcd_window_condition = TRUE;
+			break;
+
+		case INTERRUPT + DATA_TRANSFER:
+#ifdef ancient_history
+			xp -> packet_data = 0;
+#endif
+			lcp -> lcd_intrconf_pending = TRUE;
+			break;
+
+		case INTERRUPT_CONF + DATA_TRANSFER:
+			break;
+
+		case RR + DATA_TRANSFER:
+		case RNR + DATA_TRANSFER:
+			lcp -> lcd_input_window =
+				(lcp -> lcd_rsn + 1) % MODULUS;
+			SPR(xp, lcp -> lcd_input_window);
+			lcp -> lcd_last_transmitted_pr = lcp -> lcd_input_window;
+			break;
+
+		case RESET + DATA_TRANSFER:
+			lcp -> lcd_reset_condition = TRUE;
+			break;
+
+		case RESET_CONF + DATA_TRANSFER:
+			lcp -> lcd_reset_condition = FALSE;
+			break;
+
+		/*
+		 *  A restart should be only generated internally. Therefore
+		 *  all logic for restart is in the pk_restart routine.
+		 */
+		case RESTART + READY:
+			lcp -> lcd_timer = pk_t20;
+			break;
+
+		/*
+		 *  Restarts are all handled internally.  Therefore all the
+		 *  logic for the incoming restart packet is handled in the
+		 *  pk_input routine.
+		 */
+		case RESTART_CONF + READY:
+			break;
+
+		default:
+			m_freem (m);
+			return;
+		}
+
+		/* Trace the packet. */
+		pk_trace (pkp -> pk_xcp, m, "P-Out");
+
+		/* Pass the packet on down to the link layer */
+		if (pk_input_cache.mbc_size || pk_input_cache.mbc_oldsize) {
+			m->m_flags |= 0x08;
+			mbuf_cache(&pk_input_cache, m);
+		}
+		(*pkp -> pk_lloutput) (pkp -> pk_llnext, m, pkp -> pk_rt);
+	}
+}
+
+/*
+ *  This procedure returns the next packet to send or null. A
+ *  packet is composed of one or more mbufs.
+ *
+ *  A pending template packet always goes first.  Otherwise the head
+ *  record of the send buffer (the socket's, or the channel's own when
+ *  there is no socket) is dequeued, unless an RNR, window or reset
+ *  condition currently blocks transmission.
+ */
+
+struct mbuf *
+nextpk (lcp)
+struct pklcd *lcp;
+{
+	register struct mbuf *m, *n;
+	struct socket *so = lcp -> lcd_so;
+	/* pick the address per branch; a conditional is not an lvalue */
+	register struct sockbuf *sb = so ? & so -> so_snd : & lcp -> lcd_sb;
+
+	if (lcp -> lcd_template) {
+		m = lcp -> lcd_template;
+		lcp -> lcd_template = NULL;
+	} else {
+		if (lcp -> lcd_rnr_condition || lcp -> lcd_window_condition ||
+				lcp -> lcd_reset_condition)
+			return (NULL);
+
+		if ((m = sb -> sb_mb) == 0)
+			return (NULL);
+
+		/* unlink the record and give its space back to the buffer */
+ 		sb -> sb_mb = m -> m_nextpkt;
+ 		m->m_act = 0;
+		for (n = m; n; n = n -> m_next)
+			sbfree (sb, n);
+	}
+	return (m);
+}
diff --git a/sys/netccitt/pk_subr.c b/sys/netccitt/pk_subr.c
new file mode 100644
index 00000000000..44c43b6f3f6
--- /dev/null
+++ b/sys/netccitt/pk_subr.c
@@ -0,0 +1,1192 @@
+/*
+ * Copyright (c) University of British Columbia, 1984
+ * Copyright (C) Computer Science Department IV,
+ * 		 University of Erlangen-Nuremberg, Germany, 1992
+ * Copyright (c) 1991, 1992, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by the
+ * Laboratory for Computation Vision and the Computer Science Department
+ * of the the University of British Columbia and the Computer Science
+ * Department (IV) of the University of Erlangen-Nuremberg, Germany.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ *	@(#)pk_subr.c	8.1 (Berkeley) 6/10/93
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+int     pk_sendspace = 1024 * 2 + 8;
+int     pk_recvspace = 1024 * 2 + 8;
+
+struct pklcd_q pklcd_q = {&pklcd_q, &pklcd_q};
+
+struct x25bitslice x25_bitslice[] = {
+/*	  mask, shift value */
+	{ 0xf0, 0x4 },
+	{ 0xf,  0x0 },
+	{ 0x80, 0x7 },
+	{ 0x40, 0x6 },
+	{ 0x30, 0x4 },
+	{ 0xe0, 0x5 },
+	{ 0x10, 0x4 },
+	{ 0xe,  0x1 },
+	{ 0x1,  0x0 }
+};
+
+
+/*
+ *  Allocate a logical channel descriptor, put it on the global
+ *  descriptor queue in READY state and reserve buffer space for it.
+ *  With a socket, the space is reserved on the socket, the descriptor
+ *  becomes the socket's pcb and starts out in LISTEN state when the
+ *  socket accepts connections; the outcome (ENOBUFS when nothing could
+ *  be allocated) is left in so_error.  Returns the descriptor or null.
+ */
+
+struct pklcd *
+pk_attach (so)
+struct socket *so;
+{
+	register struct pklcd *lcd;
+	register int status = ENOBUFS;
+	int pk_output ();
+
+	MALLOC(lcd, struct pklcd *, sizeof (*lcd), M_PCB, M_NOWAIT);
+	if (lcd != NULL) {
+		bzero ((caddr_t)lcd, sizeof (*lcd));
+		insque (&lcd -> lcd_q, &pklcd_q);
+		lcd -> lcd_state = READY;
+		lcd -> lcd_send = pk_output;
+		if (so == NULL)
+			sbreserve (&lcd -> lcd_sb, pk_sendspace);
+		else {
+			status = soreserve (so, pk_sendspace, pk_recvspace);
+			lcd -> lcd_so = so;
+			if (so -> so_options & SO_ACCEPTCONN)
+				lcd -> lcd_state = LISTEN;
+		}
+	}
+	if (so != NULL) {
+		so -> so_pcb = (caddr_t) lcd;
+		so -> so_error = status;
+	}
+	return (lcd);
+}
+
+/*
+ *  Disconnect X.25 protocol from socket.
+ */
+
+pk_disconnect (lcp)
+register struct pklcd *lcp;
+{
+	register struct socket *sock = lcp -> lcd_so;
+	register struct pklcd **linkp;
+
+	switch (lcp -> lcd_state) {
+	case LISTEN:
+		/* take the descriptor off the listen list, if it is on it */
+		linkp = &pk_listenhead;
+		while (*linkp != 0 && *linkp != lcp)
+			linkp = &(*linkp) -> lcd_listen;
+		if (*linkp != 0)
+			*linkp = (*linkp) -> lcd_listen;
+		pk_close (lcp);
+		break;
+
+	case READY:
+		pk_acct (lcp);
+		pk_close (lcp);
+		break;
+
+	case SENT_CLEAR:
+	case RECEIVED_CLEAR:
+		/* a clear is already under way */
+		break;
+
+	default:
+		pk_acct (lcp);
+		if (sock != 0) {
+			soisdisconnecting (sock);
+			sbflush (&sock -> so_rcv);
+		}
+		pk_clear (lcp, 241, 0); /* Normal Disconnect */
+
+	}
+}
+
+/*
+ *  Close an X.25 Logical Channel.  Discard all space held by the
+ *  connection and internal descriptors. Wake up any sleepers.
+ */
+
+pk_close (lcp)
+struct pklcd *lcp;
+{
+	register struct socket *sock = lcp -> lcd_so;
+
+	/*
+	 * The connection may be torn down by a link level failure
+	 * (e.g. LLC2 FRMR) while the user level is still filling the
+	 * socket send buffer, which is locked at that time.  Calling
+	 * sbflush () on a locked send buffer leads to a panic, so in
+	 * that case the lcp is only marked as zombie and the cleaning
+	 * is left to pk_timer ().
+	 */
+
+	if (sock != NULL && (sock -> so_snd.sb_flags & SB_LOCK) != 0)
+		lcp -> lcd_state = LCN_ZOMBIE;
+	else
+		pk_freelcd (lcp);
+
+	if (sock != NULL) {
+		sock -> so_pcb = 0;
+		soisdisconnected (sock);
+		/* sofree (so);	/* gak!!! you can't do that here */
+	}
+}
+
+/*
+ *  Create a template to be used to send X.25 packets on a logical
+ *  channel. It allocates an mbuf and fills in a skeletal packet
+ *  depending on its type. This packet is passed to pk_output where
+ *  the remainer of the packet is filled in.
+*/
+
+struct mbuf *
+pk_template (lcn, type)
+int lcn, type;
+{
+	register struct mbuf *pkt;
+	register struct x25_packet *hdr;
+
+	MGETHDR (pkt, M_DONTWAIT, MT_HEADER);
+	if (pkt == 0)
+		panic ("pk_template");
+	pkt -> m_act = 0;
+
+	/*
+	 * Efficiency hack: skip max_linkhdr bytes in front of the
+	 * packet level header, hoping that this leaves the link level
+	 * enough room to insert its own header.
+	 */
+	pkt -> m_data += max_linkhdr;
+	pkt -> m_len = PKHEADERLN;
+	pkt -> m_pkthdr.len = pkt -> m_len;
+
+	hdr = mtod (pkt, struct x25_packet *);
+	*(long *)hdr = 0;		/* ugly, but fast */
+/*	hdr -> q_bit = 0;*/
+	X25SBITS(hdr -> bits, fmt_identifier, 1);
+/*	hdr -> lc_group_number = 0;*/
+
+	SET_LCN(hdr, lcn);
+	hdr -> packet_type = type;
+
+	return (pkt);
+}
+
+/*
+ *  This routine restarts all the virtual circuits.  The circuits are
+ *  not "restarted" as such: channels with a socket get ENETRESET and
+ *  are closed, the others are flushed, returned to READY state and
+ *  their upper layer, if any, is called.  Unless restart_cause is
+ *  negative a RESTART packet carrying that cause is then sent on
+ *  channel 0.
+ */
+
+pk_restart (pkp, restart_cause)
+register struct pkcb *pkp;
+int restart_cause;
+{
+	register struct mbuf *pkt;
+	register struct pklcd *chan;
+	register int lcn;
+
+	/* Without a channel table there is nothing to restart. */
+	if (pkp -> pk_chan == 0)
+		return;
+
+	/*
+	 * Leave the channels alone when the restart was issued from
+	 * inside pk_connect (), which happens if and only if the
+	 * X.25 link is down and a RESTART is needed to get it up.
+	 */
+	if ((pkp -> pk_dxerole & DTE_CONNECTPENDING) == 0) {
+		for (lcn = 1; lcn <= pkp -> pk_maxlcn; lcn++) {
+			chan = pkp -> pk_chan[lcn];
+			if (chan == NULL)
+				continue;
+			if (chan -> lcd_so == 0) {
+				pk_flush (chan);
+				chan -> lcd_state = READY;
+				if (chan -> lcd_upper)
+					chan -> lcd_upper (chan, 0);
+				continue;
+			}
+			chan -> lcd_so -> so_error = ENETRESET;
+			pk_close (chan);
+		}
+	}
+
+	if (restart_cause < 0)
+		return;
+
+	pkp -> pk_state = DTE_SENT_RESTART;
+	pkp -> pk_dxerole &= ~(DTE_PLAYDCE | DTE_PLAYDTE);
+
+	/* build the RESTART request and send it on channel 0 */
+	chan = pkp -> pk_chan[0];
+	pkt = pk_template (chan -> lcd_lcn, X25_RESTART);
+	chan -> lcd_template = pkt;
+	pkt -> m_len += 2;
+	pkt -> m_pkthdr.len = pkt -> m_len;
+	mtod (pkt, struct x25_packet *) -> packet_data = 0;	/* DTE only */
+	mtod (pkt, octet *)[4] = restart_cause;
+	pk_output (chan);
+}
+
+
+/*
+ *  This procedure frees up the Logical Channel Descriptor: its slot
+ *  in the channel table is cleared, queued data is flushed and the
+ *  descriptor is taken off the global queue.  A null lcp is ignored.
+ */
+
+pk_freelcd (lcp)
+register struct pklcd *lcp;
+{
+	if (lcp != NULL) {
+		if (lcp -> lcd_lcn > 0)
+			lcp -> lcd_pkp -> pk_chan[lcp -> lcd_lcn] = NULL;
+
+		pk_flush (lcp);
+		remque (&lcp -> lcd_q);
+		free ((caddr_t)lcp, M_PCB);
+	}
+}
+
+/*
+ * Search all interfaces for an AF_CCITT address whose X.25 address
+ * matches that of sx (the first 16 bytes are compared).
+ */
+static struct x25_ifaddr *
+pk_ifwithaddr (sx)
+	struct sockaddr_x25 *sx;
+{
+	struct ifnet *ifp;
+	struct ifaddr *ifa;
+	register struct x25_ifaddr *xia;
+	char *wanted = sx -> x25_addr;
+
+	for (ifp = ifnet; ifp; ifp = ifp -> if_next) {
+		for (ifa = ifp -> if_addrlist; ifa; ifa = ifa -> ifa_next) {
+			if (ifa -> ifa_addr -> sa_family != AF_CCITT)
+				continue;
+			xia = (struct x25_ifaddr *)ifa;
+			if (bcmp (wanted, xia -> ia_xc.xc_addr.x25_addr,
+				  16) == 0)
+				return (xia);
+		}
+	}
+	return ((struct x25_ifaddr *)0);
+}
+
+
+/*
+ *  Bind a address and protocol value to a socket.  The important
+ *  part is the protocol value - the first four characters of the
+ *  Call User Data field.
+ */
+
+#define XTRACTPKP(rt)	((rt) -> rt_flags & RTF_GATEWAY ? \
+			 ((rt) -> rt_llinfo ? 
\
+			  (struct pkcb *) ((struct rtentry *)((rt) -> rt_llinfo)) -> rt_llinfo : \
+			  (struct pkcb *) NULL) : \
+			 (struct pkcb *)((rt) -> rt_llinfo))
+
+/*
+ * Returns EADDRNOTAVAIL without a name, EADDRINUSE when the channel is
+ * already bound or an equivalent listener exists, EINVAL for a
+ * malformed address, ENETUNREACH for an unknown address or net, and 0
+ * once the address has been copied into the channel.
+ */
+pk_bind (lcp, nam)
+struct pklcd *lcp;
+struct mbuf *nam;
+{
+	register struct pklcd *lp;
+	register struct sockaddr_x25 *sa;
+
+	if (nam == NULL)
+		return (EADDRNOTAVAIL);
+	if (lcp -> lcd_ceaddr)				/* XXX */
+		return (EADDRINUSE);
+	if (pk_checksockaddr (nam))
+		return (EINVAL);
+	sa = mtod (nam, struct sockaddr_x25 *);
+
+	/*
+	 * If the user wishes to accept calls only from a particular
+	 * net (net != 0), make sure the net is known
+	 */
+
+	if (sa -> x25_addr[0] != 0) {
+		if (pk_ifwithaddr (sa) == 0)
+			return (ENETUNREACH);
+	} else if (sa -> x25_net != 0 &&
+		   ifa_ifwithnet ((struct sockaddr *)sa) == 0)
+		return (ENETUNREACH);
+
+	/*
+	 * For ISO's sake permit default listeners, but only one such . . .
+	 * Two listeners collide when their user data has the same length
+	 * and, for a non-empty length, the same contents.
+	 */
+	for (lp = pk_listenhead; lp != 0; lp = lp -> lcd_listen) {
+		register struct sockaddr_x25 *other = lp -> lcd_ceaddr;
+
+		if (other -> x25_udlen != sa -> x25_udlen)
+			continue;
+		if (other -> x25_udlen == 0 ||
+		    bcmp (other -> x25_udata, sa -> x25_udata,
+			  min (other -> x25_udlen, sa -> x25_udlen)) == 0)
+			return (EADDRINUSE);
+	}
+	lcp -> lcd_laddr = *sa;
+	lcp -> lcd_ceaddr = &lcp -> lcd_laddr;
+	return (0);
+}
+
+/*
+ * Include a bound control block in the list of listeners.
+ * Returns EDESTADDRREQ when the block has not been bound yet.
+ */
+pk_listen (lcp)
+register struct pklcd *lcp;
+{
+	register struct pklcd **tailp;
+
+	if (lcp -> lcd_ceaddr == 0)
+		return (EDESTADDRREQ);
+
+	lcp -> lcd_state = LISTEN;
+	/*
+	 * Listeners with user data go to the start of the list,
+	 * a default listener (no user data) to its end.
+	 */
+	if (lcp -> lcd_ceaddr -> x25_udlen != 0) {
+		lcp -> lcd_listen = pk_listenhead;
+		pk_listenhead = lcp;
+	} else {
+		tailp = &pk_listenhead;
+		while (*tailp != 0)
+			tailp = &((*tailp) -> lcd_listen);
+		*tailp = lcp;
+	}
+	return (0);
+}
+/*
+ * Include a listening control block for the benefit of other protocols.
+ */
+pk_protolisten (spi, spilen, callee)
+int (*callee) ();
+{
+	register struct pklcd *lcp = pk_attach ((struct socket *)0);
+	register struct mbuf *nam;
+	register struct sockaddr_x25 *sa;
+	int error = ENOBUFS;
+
+	if (lcp) {
+		if (nam = m_getclr (MT_SONAME, M_DONTWAIT)) {
+			/* listen for calls whose user data starts with spi */
+			sa = mtod (nam, struct sockaddr_x25 *);
+			sa -> x25_family = AF_CCITT;
+			sa -> x25_len = nam -> m_len = sizeof (*sa);
+			sa -> x25_udlen = spilen;
+			sa -> x25_udata[0] = spi;
+			lcp -> lcd_upper = callee;
+			lcp -> lcd_flags = X25_MBS_HOLD;
+			if ((error = pk_bind (lcp, nam)) == 0)
+				error = pk_listen (lcp);
+			(void) m_free (nam);
+		}
+		if (error)
+			pk_freelcd (lcp);
+	}
+	return error; /* Hopefully Zero !*/
+}
+
+/*
+ * Associate a logical channel descriptor with a network.
+ * Fill in the default network specific parameters and then
+ * set any parameters explicitly specified by the user or
+ * by the remote DTE.
+ */
+
+pk_assoc (pkp, lcp, sa)
+register struct pkcb *pkp;
+register struct pklcd *lcp;
+register struct sockaddr_x25 *sa;
+{
+
+	lcp -> lcd_pkp = pkp;
+	lcp -> lcd_packetsize = pkp -> pk_xcp -> xc_psize;
+	lcp -> lcd_windowsize = pkp -> pk_xcp -> xc_pwsize;
+	lcp -> lcd_rsn = MODULUS - 1;
+	pkp -> pk_chan[lcp -> lcd_lcn] = lcp;
+
+	/* explicit options win, defaults are written back into sa */
+	if (sa -> x25_opts.op_psize)
+		lcp -> lcd_packetsize = sa -> x25_opts.op_psize;
+	else
+		sa -> x25_opts.op_psize = lcp -> lcd_packetsize;
+	if (sa -> x25_opts.op_wsize)
+		lcp -> lcd_windowsize = sa -> x25_opts.op_wsize;
+	else
+		sa -> x25_opts.op_wsize = lcp -> lcd_windowsize;
+	sa -> x25_net = pkp -> pk_xcp -> xc_addr.x25_net;
+	lcp -> lcd_flags |= sa -> x25_opts.op_flags;
+	lcp -> lcd_stime = time.tv_sec;
+}
+
+/*
+ * Start a call to the address in sa on channel lcp.
+ *
+ * Returns EDESTADDRREQ for an empty address, ENETUNREACH when there is
+ * no route, ENOBUFS when no pkcb could be obtained, ENETDOWN when the
+ * link is neither ready nor waiting and EMFILE when no logical channel
+ * number is free.  If the link is still waiting, an X.25 restart is
+ * initiated and 0 is returned; the call itself is then issued by
+ * pk_callcomplete ().  Otherwise the CALL packet is built and the
+ * result of the interface start routine is returned.
+ */
+pk_connect (lcp, sa)
+register struct pklcd *lcp;
+register struct sockaddr_x25 *sa;
+{
+	register struct pkcb *pkp;
+	register struct rtentry *rt;
+	register struct rtentry *nrt;
+
+	struct rtentry *npaidb_enter ();
+	struct pkcb *pk_newlink ();
+
+	if (sa -> x25_addr[0] == '\0')
+		return (EDESTADDRREQ);
+
+	/*
+	 * Is the destination address known?
+	 */
+	if (!(rt = rtalloc1 ((struct sockaddr *)sa, 1)))
+		return (ENETUNREACH);
+
+	if (!(pkp = XTRACTPKP(rt)))
+		pkp = pk_newlink ((struct x25_ifaddr *) (rt -> rt_ifa),
+				 (caddr_t) 0);
+
+	/*
+	 * Everything below dereferences pkp; give up when no pkcb
+	 * could be obtained for this route.
+	 */
+	if (pkp == 0)
+		return (ENOBUFS);
+
+	/*
+	 * Have we entered the LLC address?
+	 */
+	if (nrt = npaidb_enter (rt -> rt_gateway, rt_key (rt), rt, 0))
+		pkp -> pk_llrt = nrt;
+
+	/*
+	 * Have we allocated an LLC2 link yet?
+	 */
+	if (pkp -> pk_llnext == (caddr_t)0 && pkp -> pk_llctlinput) {
+		struct dll_ctlinfo ctlinfo;
+
+		ctlinfo.dlcti_rt = rt;
+		ctlinfo.dlcti_pcb = (caddr_t) pkp;
+		ctlinfo.dlcti_conf =
+			(struct dllconfig *) (&((struct x25_ifaddr *)(rt -> rt_ifa)) -> ia_xc);
+		pkp -> pk_llnext =
+			(pkp -> pk_llctlinput) (PRC_CONNECT_REQUEST, 0, &ctlinfo);
+	}
+
+	if (pkp -> pk_state != DTE_READY && pkp -> pk_state != DTE_WAITING)
+		return (ENETDOWN);
+	if ((lcp -> lcd_lcn = pk_getlcn (pkp)) == 0)
+		return (EMFILE);
+
+	lcp -> lcd_faddr = *sa;
+	lcp -> lcd_ceaddr = & lcp -> lcd_faddr;
+	pk_assoc (pkp, lcp, lcp -> lcd_ceaddr);
+
+	/*
+	 * If the link is not up yet, initiate an X.25 RESTART
+	 */
+	if (pkp -> pk_state == DTE_WAITING) {
+		pkp -> pk_dxerole |= DTE_CONNECTPENDING;
+		pk_ctlinput (PRC_LINKUP, (struct sockaddr *)0, pkp);
+		if (lcp -> lcd_so)
+			soisconnecting (lcp -> lcd_so);
+		return 0;
+	}
+
+	if (lcp -> lcd_so)
+		soisconnecting (lcp -> lcd_so);
+	lcp -> lcd_template = pk_template (lcp -> lcd_lcn, X25_CALL);
+	pk_callrequest (lcp, lcp -> lcd_ceaddr, pkp -> pk_xcp);
+	return (*pkp -> pk_ia -> ia_start) (lcp);
+}
+
+/*
+ * Complete all pending X.25 call requests --- this gets called after
+ * the X.25 link has been restarted.
+ */
+/* Mirror image of lcn within 1 .. maxlcn (1 <-> maxlcn, 2 <-> maxlcn-1, ...). */
+#define RESHUFFLELCN(maxlcn, lcn) ((maxlcn) - (lcn) + 1)
+
+/*
+ * Does nothing unless a connect is pending (DTE_CONNECTPENDING); the
+ * flag is cleared.  If we turned out to play DCE, the channel table is
+ * mirrored first; then a CALL request is built and started on every
+ * channel in use.
+ */
+pk_callcomplete (pkp)
+	register struct pkcb *pkp;
+{
+	register struct pklcd *lcp;
+	register int i;
+	register int ni;
+
+
+	if (pkp -> pk_dxerole & DTE_CONNECTPENDING)
+		pkp -> pk_dxerole &= ~DTE_CONNECTPENDING;
+	else return;
+
+	if (pkp -> pk_chan == 0)
+		return;
+
+	/*
+	 * We pretended to be a DTE for allocating lcns, if
+	 * it turns out that we are in reality performing as a
+	 * DCE we need to reshuffle the lcps.
+	 *
+	 *             /+---------------+-------- -
+	 *            / | a  (maxlcn-1) |          \
+	 *           /  +---------------+           \
+	 *     +--- *   | b  (maxlcn-2) |            \
+	 *     |     \  +---------------+             \
+	 *   r |      \ | c  (maxlcn-3) |              \
+	 *   e |       \+---------------+               |
+	 *   s |        |       .       |               |
+	 *   h |        |       .       |               | m
+	 *   u |        |       .       |               | a
+	 *   f |        |       .       |               | x
+	 *   f |        |       .       |               | l
+	 *   l |       /+---------------+               | c
+	 *   e |      / | c' (   3    ) |               | n
+	 *     |     /  +---------------+               |
+	 *     +--> *   | b' (   2    ) |              /
+	 *           \  +---------------+             /
+	 *            \ | a' (   1    ) |            /
+	 *             \+---------------+           /
+	 *              |       0       |          /
+	 *              +---------------+-------- -
+	 *
+	 */
+	if (pkp -> pk_dxerole & DTE_PLAYDCE) {
+		/*
+		 * Sigh, reshuffle it.  Each slot of the upper half is
+		 * exchanged with its mirror slot exactly once: walking
+		 * the whole table would move every entry back to where
+		 * it came from, and an entry already sitting in the
+		 * mirror slot must not be overwritten.  The middle slot
+		 * of an odd-sized table mirrors onto itself and stays.
+		 */
+		for (i = pkp -> pk_maxlcn; i > 0; --i) {
+			ni = RESHUFFLELCN(pkp -> pk_maxlcn, i);
+			if (ni >= i)
+				break;
+			lcp = pkp -> pk_chan[i];
+			pkp -> pk_chan[i] = pkp -> pk_chan[ni];
+			pkp -> pk_chan[ni] = lcp;
+			if (pkp -> pk_chan[i])
+				pkp -> pk_chan[i] -> lcd_lcn = i;
+			if (pkp -> pk_chan[ni])
+				pkp -> pk_chan[ni] -> lcd_lcn = ni;
+		}
+	}
+
+	for (i = 1; i <= pkp -> pk_maxlcn; ++i)
+		if ((lcp = pkp -> pk_chan[i]) != NULL) {
+			/* if (lcp -> lcd_so)
+				soisconnecting (lcp -> lcd_so); */
+			lcp -> lcd_template = pk_template (lcp -> lcd_lcn, X25_CALL);
+			pk_callrequest (lcp, lcp -> lcd_ceaddr, pkp -> pk_xcp);
+			(*pkp -> pk_ia -> ia_start) (lcp);
+		}
+}
+
+/* Write cursor for packing address digits two per octet. */
+struct bcdinfo {
+	octet *cp;		/* octet currently being filled */
+	unsigned posn;		/* number of digits written so far */
+};
+/*
+ * Build the rest of the CALL REQUEST packet. Fill in calling
+ * address, facilities fields and the user data field.
+ */
+
+pk_callrequest (lcp, sa, xcp)
+struct pklcd *lcp;
+register struct sockaddr_x25 *sa;
+register struct x25config *xcp;
+{
+	register struct x25_calladdr *ca;
+	register struct mbuf *pkt = lcp -> lcd_template;
+	register struct x25_packet *hdr = mtod (pkt, struct x25_packet *);
+	struct bcdinfo bcd;
+
+	if (lcp -> lcd_flags & X25_DBIT)
+		X25SBITS(hdr -> bits, d_bit, 1);
+
+	/* called address first, calling address right behind it */
+	ca = (struct x25_calladdr *) &hdr -> packet_data;
+	bcd.cp = (octet *) ca -> address_field;
+	bcd.posn = 0;
+	X25SBITS(ca -> addrlens, called_addrlen, to_bcd (&bcd, sa, xcp));
+	X25SBITS(ca -> addrlens, calling_addrlen, to_bcd (&bcd, &xcp -> xc_addr, xcp));
+	if (bcd.posn & 0x01) {
+		/* odd number of digits: clear the unused low half */
+		*bcd.cp &= 0xf0;
+		bcd.cp++;
+	}
+	pkt -> m_len += bcd.cp - (octet *) ca;
+	pkt -> m_pkthdr.len = pkt -> m_len;
+
+	if (lcp -> lcd_facilities == 0)
+		pk_build_facilities (pkt, sa, (int)xcp -> xc_type);
+	else {
+		/* caller supplied facilities are chained on as they are */
+		pkt -> m_next = lcp -> lcd_facilities;
+		pkt -> m_pkthdr.len += pkt -> m_next -> m_pkthdr.len;
+		lcp -> lcd_facilities = 0;
+	}
+
+	m_copyback (pkt, pkt -> m_pkthdr.len, sa -> x25_udlen, sa -> x25_udata);
+}
+
+/*
+ * Append the facilities field to the packet in m: a length octet
+ * followed by the reverse charging facility, if requested, and, for
+ * X25_1980 and X25_1984 networks, the packet and window size
+ * facilities taken from sa.
+ */
+pk_build_facilities (m, sa, type)
+register struct mbuf *m;
+struct sockaddr_x25 *sa;
+{
+	register octet *lenp;
+	register octet *fp;
+	register int charging;
+
+	lenp = mtod (m, octet *) + m -> m_len;
+	fp = lenp + 1;
+	if (sa -> x25_opts.op_flags & X25_REVERSE_CHARGE)
+		charging = 1;
+	else
+		charging = 0;
+	/*
+	 * This is specific to Datapac X.25(1976) DTEs.  International
+	 * calls must have the "hi priority" bit on.
+	 */
+	if (type == X25_1976 && sa -> x25_opts.op_psize == X25_PS128)
+		charging |= 02;
+	if (charging != 0) {
+		*fp++ = FACILITIES_REVERSE_CHARGE;
+		*fp++ = charging;
+	}
+	if (type == X25_1980 || type == X25_1984) {
+		*fp++ = FACILITIES_PACKETSIZE;
+		*fp++ = sa -> x25_opts.op_psize;
+		*fp++ = sa -> x25_opts.op_psize;
+
+		*fp++ = FACILITIES_WINDOWSIZE;
+		*fp++ = sa -> x25_opts.op_wsize;
+		*fp++ = sa -> x25_opts.op_wsize;
+	}
+	/* the length octet counts what follows it */
+	*lenp = fp - lenp - 1;
+	m -> m_len += *lenp + 1;
+	m -> m_pkthdr.len = m -> m_len;
+}
+
+/*
+ * Pack the digits of the address in sa at the cursor b, two digits
+ * per octet with the first digit in the high half.  Returns the
+ * number of digits written.
+ */
+to_bcd (b, sa, xcp)
+register struct bcdinfo *b;
+struct sockaddr_x25 *sa;
+register struct x25config *xcp;
+{
+	register char *digit = sa -> x25_addr;
+	unsigned first = b -> posn;
+	/*
+	 * The nodnic and prepnd0 stuff looks tedious,
+	 * but it does allow full X.121 addresses to be used,
+	 * which is handy for routing info (& OSI type 37 addresses).
+	 */
+	if (xcp -> xc_addr.x25_net && (xcp -> xc_nodnic || xcp -> xc_prepnd0)) {
+		char dnicname[sizeof (long) * NBBY/3 + 2];
+		register char *dp = dnicname;
+
+		sprintf (dp, "%d", xcp -> xc_addr.x25_net & 0x7fff);
+		/* compare the leading digits against our own dnic */
+		while (*dp) {		/* *dp == 0 means dnic matched */
+			if ((*dp ^ *digit++) & 0x0f)
+				break;
+			dp++;
+		}
+		if (*dp || xcp -> xc_nodnic == 0)
+			digit = sa -> x25_addr;
+		if (*dp && xcp -> xc_prepnd0) {
+			/* emit a leading zero digit */
+			if (b -> posn & 0x01)
+				b -> cp++;
+			else
+				*(b -> cp) = 0;
+			b -> posn++;
+		}
+	}
+	for (; *digit; digit++, b -> posn++) {
+		if (b -> posn & 0x01) {
+			*(b -> cp) |= *digit & 0x0F;
+			b -> cp++;
+		} else
+			*(b -> cp) = *digit << 4;
+	}
+	return ((b -> posn) - first);
+}
+
+/*
+ *  This routine gets the first available logical channel number.  The
+ *  search is
+ *  		- from the highest number to lowest number if playing DTE, and
+ *		- from lowest to highest number if playing DCE.
+ */
+
+pk_getlcn (pkp)
+register struct pkcb *pkp;
+{
+	register int lcn;
+
+	if (pkp -> pk_chan == 0)
+		return (0);
+
+	if (pkp -> pk_dxerole & DTE_PLAYDCE) {
+		/* DCE: lowest free number, 0 when the table is full */
+		for (lcn = 1; lcn <= pkp -> pk_maxlcn; lcn++)
+			if (pkp -> pk_chan[lcn] == NULL)
+				return (lcn);
+		return (0);
+	}
+
+	/* DTE: highest free number; the scan ends at 0 when none is free */
+	lcn = pkp -> pk_maxlcn;
+	while (lcn > 0 && pkp -> pk_chan[lcn] != NULL)
+		lcn--;
+	return (lcn);
+}
+
+/*
+ *  This procedure sends a CLEAR request packet. The lc state is
+ *  set to "SENT_CLEAR".  An abortive clear is sent ahead of any
+ *  queued data, a normal one is appended to the send buffer.
+ */
+
+pk_clear (lcp, diagnostic, abortive)
+register struct pklcd *lcp;
+{
+	register struct mbuf *pkt;
+
+	pkt = pk_template (lcp -> lcd_lcn, X25_CLEAR);
+
+	/* two more octets: cause (zero) and diagnostic */
+	pkt -> m_len += 2;
+	pkt -> m_pkthdr.len += 2;
+	mtod (pkt, struct x25_packet *) -> packet_data = 0;
+	mtod (pkt, octet *)[4] = diagnostic;
+	if (lcp -> lcd_facilities) {
+		pkt -> m_next = lcp -> lcd_facilities;
+		pkt -> m_pkthdr.len += pkt -> m_next -> m_len;
+		lcp -> lcd_facilities = 0;
+	}
+	if (!abortive) {
+		struct socket *so = lcp -> lcd_so;
+		struct sockbuf *sb;
+
+		if (so)
+			sb = & so -> so_snd;
+		else
+			sb = & lcp -> lcd_sb;
+		sbappendrecord (sb, pkt);
+	} else
+		lcp -> lcd_template = pkt;
+	pk_output (lcp);
+
+}
+
+/*
+ *  This procedure generates RNR's or RR's to inhibit or enable
+ *  inward data flow, if the current state changes (blocked ==> open or
+ *  vice versa), or if forced to generate one.  One forces RNR's to ack data.
+ */
+pk_flowcontrol (lcp, inhibit, forced)
+register struct pklcd *lcp;
+{
+	inhibit = (inhibit != 0);
+
+	/* only meaningful on a channel that is transferring data */
+	if (lcp == 0 || lcp -> lcd_state != DATA_TRANSFER)
+		return;
+	/* nothing changes and nobody insists */
+	if (forced == 0 && lcp -> lcd_rxrnr_condition == inhibit)
+		return;
+
+	lcp -> lcd_rxrnr_condition = inhibit;
+	if (inhibit)
+		lcp -> lcd_template = pk_template (lcp -> lcd_lcn, X25_RNR);
+	else
+		lcp -> lcd_template = pk_template (lcp -> lcd_lcn, X25_RR);
+	pk_output (lcp);
+}
+
+/*
+ *  This procedure sends a RESET request packet. It re-intializes
+ *  virtual circuit.
+ */ + +static +pk_reset (lcp, diagnostic) +register struct pklcd *lcp; +{ + register struct mbuf *m; + register struct socket *so = lcp -> lcd_so; + + if (lcp -> lcd_state != DATA_TRANSFER) + return; + + if (so) + so -> so_error = ECONNRESET; + lcp -> lcd_reset_condition = TRUE; + + /* Reset all the control variables for the channel. */ + pk_flush (lcp); + lcp -> lcd_window_condition = lcp -> lcd_rnr_condition = + lcp -> lcd_intrconf_pending = FALSE; + lcp -> lcd_rsn = MODULUS - 1; + lcp -> lcd_ssn = 0; + lcp -> lcd_output_window = lcp -> lcd_input_window = + lcp -> lcd_last_transmitted_pr = 0; + m = lcp -> lcd_template = pk_template (lcp -> lcd_lcn, X25_RESET); + m -> m_pkthdr.len = m -> m_len += 2; + mtod (m, struct x25_packet *) -> packet_data = 0; + mtod (m, octet *)[4] = diagnostic; + pk_output (lcp); + +} + +/* + * This procedure frees all data queued for output or delivery on a + * virtual circuit. + */ + +pk_flush (lcp) +register struct pklcd *lcp; +{ + register struct socket *so; + + if (lcp -> lcd_template) + m_freem (lcp -> lcd_template); + + if (lcp -> lcd_cps) { + m_freem (lcp -> lcd_cps); + lcp -> lcd_cps = 0; + } + if (lcp -> lcd_facilities) { + m_freem (lcp -> lcd_facilities); + lcp -> lcd_facilities = 0; + } + if (so = lcp -> lcd_so) + sbflush (&so -> so_snd); + else + sbflush (&lcp -> lcd_sb); +} + +/* + * This procedure handles all local protocol procedure errors. + */ + +pk_procerror (error, lcp, errstr, diagnostic) +register struct pklcd *lcp; +char *errstr; +{ + + pk_message (lcp -> lcd_lcn, lcp -> lcd_pkp -> pk_xcp, errstr); + + switch (error) { + case CLEAR: + if (lcp -> lcd_so) { + lcp -> lcd_so -> so_error = ECONNABORTED; + soisdisconnecting (lcp -> lcd_so); + } + pk_clear (lcp, diagnostic, 1); + break; + + case RESET: + pk_reset (lcp, diagnostic); + } +} + +/* + * This procedure is called during the DATA TRANSFER state to check + * and process the P(R) values received in the DATA, RR OR RNR + * packets. 
+ */ + +pk_ack (lcp, pr) +struct pklcd *lcp; +unsigned pr; +{ + register struct socket *so = lcp -> lcd_so; + + if (lcp -> lcd_output_window == pr) + return (PACKET_OK); + if (lcp -> lcd_output_window < lcp -> lcd_ssn) { + if (pr < lcp -> lcd_output_window || pr > lcp -> lcd_ssn) { + pk_procerror (RESET, lcp, + "p(r) flow control error", 2); + return (ERROR_PACKET); + } + } + else { + if (pr < lcp -> lcd_output_window && pr > lcp -> lcd_ssn) { + pk_procerror (RESET, lcp, + "p(r) flow control error #2", 2); + return (ERROR_PACKET); + } + } + + lcp -> lcd_output_window = pr; /* Rotate window. */ + if (lcp -> lcd_window_condition == TRUE) + lcp -> lcd_window_condition = FALSE; + + if (so && ((so -> so_snd.sb_flags & SB_WAIT) || + (so -> so_snd.sb_flags & SB_NOTIFY))) + sowwakeup (so); + + return (PACKET_OK); +} + +/* + * This procedure decodes the X.25 level 3 packet returning a + * code to be used in switchs or arrays. + */ + +pk_decode (xp) +register struct x25_packet *xp; +{ + register int type; + + if (X25GBITS(xp -> bits, fmt_identifier) != 1) + return (INVALID_PACKET); +#ifdef ancient_history + /* + * Make sure that the logical channel group number is 0. + * This restriction may be removed at some later date. + */ + if (xp -> lc_group_number != 0) + return (INVALID_PACKET); +#endif + /* + * Test for data packet first. + */ + if (!(xp -> packet_type & DATA_PACKET_DESIGNATOR)) + return (DATA); + + /* + * Test if flow control packet (RR or RNR). + */ + if (!(xp -> packet_type & RR_OR_RNR_PACKET_DESIGNATOR)) + switch (xp -> packet_type & 0x1f) { + case X25_RR: + return (RR); + case X25_RNR: + return (RNR); + case X25_REJECT: + return (REJECT); + } + + /* + * Determine the rest of the packet types. 
+ */ + switch (xp -> packet_type) { + case X25_CALL: + type = CALL; + break; + + case X25_CALL_ACCEPTED: + type = CALL_ACCEPTED; + break; + + case X25_CLEAR: + type = CLEAR; + break; + + case X25_CLEAR_CONFIRM: + type = CLEAR_CONF; + break; + + case X25_INTERRUPT: + type = INTERRUPT; + break; + + case X25_INTERRUPT_CONFIRM: + type = INTERRUPT_CONF; + break; + + case X25_RESET: + type = RESET; + break; + + case X25_RESET_CONFIRM: + type = RESET_CONF; + break; + + case X25_RESTART: + type = RESTART; + break; + + case X25_RESTART_CONFIRM: + type = RESTART_CONF; + break; + + case X25_DIAGNOSTIC: + type = DIAG_TYPE; + break; + + default: + type = INVALID_PACKET; + } + return (type); +} + +/* + * A restart packet has been received. Print out the reason + * for the restart. + */ + +pk_restartcause (pkp, xp) +struct pkcb *pkp; +register struct x25_packet *xp; +{ + register struct x25config *xcp = pkp -> pk_xcp; + register int lcn = LCN(xp); + + switch (xp -> packet_data) { + case X25_RESTART_LOCAL_PROCEDURE_ERROR: + pk_message (lcn, xcp, "restart: local procedure error"); + break; + + case X25_RESTART_NETWORK_CONGESTION: + pk_message (lcn, xcp, "restart: network congestion"); + break; + + case X25_RESTART_NETWORK_OPERATIONAL: + pk_message (lcn, xcp, "restart: network operational"); + break; + + default: + pk_message (lcn, xcp, "restart: unknown cause"); + } +} + +#define MAXRESETCAUSE 7 + +int Reset_cause[] = { + EXRESET, EXROUT, 0, EXRRPE, 0, EXRLPE, 0, EXRNCG +}; + +/* + * A reset packet has arrived. Return the cause to the user. 
+ */ + +pk_resetcause (pkp, xp) +struct pkcb *pkp; +register struct x25_packet *xp; +{ + register struct pklcd *lcp = + pkp -> pk_chan[LCN(xp)]; + register int code = xp -> packet_data; + + if (code > MAXRESETCAUSE) + code = 7; /* EXRNCG */ + + pk_message (LCN(xp), lcp -> lcd_pkp, "reset code 0x%x, diagnostic 0x%x", + xp -> packet_data, 4[(u_char *)xp]); + + if (lcp -> lcd_so) + lcp -> lcd_so -> so_error = Reset_cause[code]; +} + +#define MAXCLEARCAUSE 25 + +int Clear_cause[] = { + EXCLEAR, EXCBUSY, 0, EXCINV, 0, EXCNCG, 0, + 0, 0, EXCOUT, 0, EXCAB, 0, EXCNOB, 0, 0, 0, EXCRPE, + 0, EXCLPE, 0, 0, 0, 0, 0, EXCRRC +}; + +/* + * A clear packet has arrived. Return the cause to the user. + */ + +pk_clearcause (pkp, xp) +struct pkcb *pkp; +register struct x25_packet *xp; +{ + register struct pklcd *lcp = + pkp -> pk_chan[LCN(xp)]; + register int code = xp -> packet_data; + + if (code > MAXCLEARCAUSE) + code = 5; /* EXRNCG */ + if (lcp -> lcd_so) + lcp -> lcd_so -> so_error = Clear_cause[code]; +} + +char * +format_ntn (xcp) +register struct x25config *xcp; +{ + + return (xcp -> xc_addr.x25_addr); +} + +/* VARARGS1 */ +pk_message (lcn, xcp, fmt, a1, a2, a3, a4, a5, a6) +struct x25config *xcp; +char *fmt; +{ + + if (lcn) + if (!PQEMPTY) + printf ("X.25(%s): lcn %d: ", format_ntn (xcp), lcn); + else + printf ("X.25: lcn %d: ", lcn); + else + if (!PQEMPTY) + printf ("X.25(%s): ", format_ntn (xcp)); + else + printf ("X.25: "); + + printf (fmt, a1, a2, a3, a4, a5, a6); + printf ("\n"); +} + +pk_fragment (lcp, m0, qbit, mbit, wait) +struct mbuf *m0; +register struct pklcd *lcp; +{ + register struct mbuf *m = m0; + register struct x25_packet *xp; + register struct sockbuf *sb; + struct mbuf *head = 0, *next, **mp = &head, *m_split (); + int totlen, psize = 1 << (lcp -> lcd_packetsize); + + if (m == 0) + return 0; + if (m -> m_flags & M_PKTHDR == 0) + panic ("pk_fragment"); + totlen = m -> m_pkthdr.len; + m -> m_act = 0; + sb = lcp -> lcd_so ? 
&lcp -> lcd_so -> so_snd : & lcp -> lcd_sb; + do { + if (totlen > psize) { + if ((next = m_split (m, psize, wait)) == 0) + goto abort; + totlen -= psize; + } else + next = 0; + M_PREPEND(m, PKHEADERLN, wait); + if (m == 0) + goto abort; + *mp = m; + mp = & m -> m_act; + *mp = 0; + xp = mtod (m, struct x25_packet *); + 0[(char *)xp] = 0; + if (qbit) + X25SBITS(xp -> bits, q_bit, 1); + if (lcp -> lcd_flags & X25_DBIT) + X25SBITS(xp -> bits, d_bit, 1); + X25SBITS(xp -> bits, fmt_identifier, 1); + xp -> packet_type = X25_DATA; + SET_LCN(xp, lcp -> lcd_lcn); + if (next || (mbit && (totlen == psize || + (lcp -> lcd_flags & X25_DBIT)))) + SMBIT(xp, 1); + } while (m = next); + for (m = head; m; m = next) { + next = m -> m_act; + m -> m_act = 0; + sbappendrecord (sb, m); + } + return 0; +abort: + if (wait) + panic ("pk_fragment null mbuf after wait"); + if (next) + m_freem (next); + for (m = head; m; m = next) { + next = m -> m_act; + m_freem (m); + } + return ENOBUFS; +} diff --git a/sys/netccitt/pk_timer.c b/sys/netccitt/pk_timer.c new file mode 100644 index 00000000000..52c1860b4b4 --- /dev/null +++ b/sys/netccitt/pk_timer.c @@ -0,0 +1,126 @@ +/* + * Copyright (c) Computing Centre, University of British Columbia, 1984 + * Copyright (C) Computer Science Department IV, + * University of Erlangen-Nuremberg, Germany, 1990, 1992 + * Copyright (c) 1990, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by the + * Laboratory for Computation Vision and the Computer Science Department + * of the the University of British Columbia and the Computer Science + * Department (IV) of the University of Erlangen-Nuremberg, Germany. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)pk_timer.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +/* + * Various timer values. They can be adjusted + * by patching the binary with adb if necessary. 
+ */ +int pk_t20 = 18 * PR_SLOWHZ; /* restart timer */ +int pk_t21 = 20 * PR_SLOWHZ; /* call timer */ +/* XXX pk_t22 is never used */ +int pk_t22 = 18 * PR_SLOWHZ; /* reset timer */ +int pk_t23 = 18 * PR_SLOWHZ; /* clear timer */ + +pk_timer () +{ + register struct pkcb *pkp; + register struct pklcd *lcp, **pp; + register int lcns_jammed, cant_restart; + + FOR_ALL_PKCBS(pkp) { + switch (pkp -> pk_state) { + case DTE_SENT_RESTART: + lcp = pkp -> pk_chan[0]; + /* + * If restart failures are common, a link level + * reset should be initiated here. + */ + if (lcp -> lcd_timer && --lcp -> lcd_timer == 0) { + pk_message (0, pkp -> pk_xcp, + "packet level restart failed"); + pkp -> pk_state = DTE_WAITING; + } + break; + + case DTE_READY: + lcns_jammed = cant_restart = 0; + for (pp = &pkp -> pk_chan[1]; pp <= &pkp -> pk_chan[pkp -> pk_maxlcn]; pp++) { + if ((lcp = *pp) == 0) + continue; + switch (lcp -> lcd_state) { + case SENT_CALL: + if (--lcp -> lcd_timer == 0) { + if (lcp -> lcd_so) + lcp -> lcd_so -> so_error = ETIMEDOUT; + pk_clear (lcp, 49, 1); + } + break; + + case SENT_CLEAR: + if (lcp -> lcd_retry >= 3) + lcns_jammed++; + else + if (--lcp -> lcd_timer == 0) + pk_clear (lcp, 50, 1); + break; + + case DATA_TRANSFER: /* lcn active */ + cant_restart++; + break; + + case LCN_ZOMBIE: /* zombie state */ + pk_freelcd (lcp); + break; + } + } + if (lcns_jammed > pkp -> pk_maxlcn / 2 && cant_restart == 0) { + pk_message (0, pkp -> pk_xcp, "%d lcns jammed: attempting restart", lcns_jammed); + pk_restart (pkp, 0); + } + } + } +} diff --git a/sys/netccitt/pk_usrreq.c b/sys/netccitt/pk_usrreq.c new file mode 100644 index 00000000000..d0dc42c0d40 --- /dev/null +++ b/sys/netccitt/pk_usrreq.c @@ -0,0 +1,604 @@ +/* + * Copyright (c) University of British Columbia, 1984 + * Copyright (C) Computer Science Department IV, + * University of Erlangen-Nuremberg, Germany, 1992 + * Copyright (c) 1991, 1992, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by the + * Laboratory for Computation Vision and the Computer Science Department + * of the the University of British Columbia and the Computer Science + * Department (IV) of the University of Erlangen-Nuremberg, Germany. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)pk_usrreq.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +static old_to_new(); +static new_to_old(); +/* + * + * X.25 Packet level protocol interface to socket abstraction. + * + * Process an X.25 user request on a logical channel. If this is a send + * request then m is the mbuf chain of the send data. If this is a timer + * expiration (called from the software clock routine) them timertype is + * the particular timer. + * + */ + +pk_usrreq (so, req, m, nam, control) +struct socket *so; +int req; +register struct mbuf *m, *nam; +struct mbuf *control; +{ + register struct pklcd *lcp = (struct pklcd *) so -> so_pcb; + register int error = 0; + + if (req == PRU_CONTROL) + return (pk_control (so, (int)m, (caddr_t)nam, + (struct ifnet *)control)); + if (control && control -> m_len) { + error = EINVAL; + goto release; + } + if (lcp == NULL && req != PRU_ATTACH) { + error = EINVAL; + goto release; + } + +/* + pk_trace (pkcbhead, TR_USER, (struct pklcd *)0, + req, (struct x25_packet *)0); +*/ + + switch (req) { + /* + * X.25 attaches to socket via PRU_ATTACH and allocates a logical + * channel descriptor. If the socket is to receive connections, + * then the LISTEN state is entered. + */ + case PRU_ATTACH: + if (lcp) { + error = EISCONN; + /* Socket already connected. 
*/ + break; + } + lcp = pk_attach (so); + if (lcp == 0) + error = ENOBUFS; + break; + + /* + * Detach a logical channel from the socket. If the state of the + * channel is embryonic, simply discard it. Otherwise we have to + * initiate a PRU_DISCONNECT which will finish later. + */ + case PRU_DETACH: + pk_disconnect (lcp); + break; + + /* + * Give the socket an address. + */ + case PRU_BIND: + if (nam -> m_len == sizeof (struct x25_sockaddr)) + old_to_new (nam); + error = pk_bind (lcp, nam); + break; + + /* + * Prepare to accept connections. + */ + case PRU_LISTEN: + error = pk_listen (lcp); + break; + + /* + * Initiate a CALL REQUEST to peer entity. Enter state SENT_CALL + * and mark the socket as connecting. Set timer waiting for + * CALL ACCEPT or CLEAR. + */ + case PRU_CONNECT: + if (nam -> m_len == sizeof (struct x25_sockaddr)) + old_to_new (nam); + if (pk_checksockaddr (nam)) + return (EINVAL); + error = pk_connect (lcp, mtod (nam, struct sockaddr_x25 *)); + break; + + /* + * Initiate a disconnect to peer entity via a CLEAR REQUEST packet. + * The socket will be disconnected when we receive a confirmation + * or a clear collision. + */ + case PRU_DISCONNECT: + pk_disconnect (lcp); + break; + + /* + * Accept an INCOMING CALL. Most of the work has already been done + * by pk_input. Just return the callers address to the user. + */ + case PRU_ACCEPT: + if (lcp -> lcd_craddr == NULL) + break; + bcopy ((caddr_t)lcp -> lcd_craddr, mtod (nam, caddr_t), + sizeof (struct sockaddr_x25)); + nam -> m_len = sizeof (struct sockaddr_x25); + if (lcp -> lcd_flags & X25_OLDSOCKADDR) + new_to_old (nam); + break; + + /* + * After a receive, we should send a RR. + */ + case PRU_RCVD: + pk_flowcontrol (lcp, /*sbspace (&so -> so_rcv) <= */ 0, 1); + break; + + /* + * Send INTERRUPT packet. 
+ */ + case PRU_SENDOOB: + if (m == 0) { + MGETHDR(m, M_WAITOK, MT_OOBDATA); + m -> m_pkthdr.len = m -> m_len = 1; + *mtod (m, octet *) = 0; + } + if (m -> m_pkthdr.len > 32) { + m_freem (m); + error = EMSGSIZE; + break; + } + MCHTYPE(m, MT_OOBDATA); + /* FALLTHROUGH */ + + /* + * Do send by placing data on the socket output queue. + */ + case PRU_SEND: + if (control) { + register struct cmsghdr *ch = mtod (m, struct cmsghdr *); + control -> m_len -= sizeof (*ch); + control -> m_data += sizeof (*ch); + error = pk_ctloutput (PRCO_SETOPT, so, ch -> cmsg_level, + ch -> cmsg_type, &control); + } + if (error == 0 && m) + error = pk_send (lcp, m); + break; + + /* + * Abort a virtual circuit. For example all completed calls + * waiting acceptance. + */ + case PRU_ABORT: + pk_disconnect (lcp); + break; + + /* Begin unimplemented hooks. */ + + case PRU_SHUTDOWN: + error = EOPNOTSUPP; + break; + + case PRU_CONTROL: + error = EOPNOTSUPP; + break; + + case PRU_SENSE: +#ifdef BSD4_3 + ((struct stat *)m) -> st_blksize = so -> so_snd.sb_hiwat; +#else + error = EOPNOTSUPP; +#endif + break; + + /* End unimplemented hooks. */ + + case PRU_SOCKADDR: + if (lcp -> lcd_ceaddr == 0) + return (EADDRNOTAVAIL); + nam -> m_len = sizeof (struct sockaddr_x25); + bcopy ((caddr_t)lcp -> lcd_ceaddr, mtod (nam, caddr_t), + sizeof (struct sockaddr_x25)); + if (lcp -> lcd_flags & X25_OLDSOCKADDR) + new_to_old (nam); + break; + + case PRU_PEERADDR: + if (lcp -> lcd_state != DATA_TRANSFER) + return (ENOTCONN); + nam -> m_len = sizeof (struct sockaddr_x25); + bcopy (lcp -> lcd_craddr ? (caddr_t)lcp -> lcd_craddr : + (caddr_t)lcp -> lcd_ceaddr, + mtod (nam, caddr_t), sizeof (struct sockaddr_x25)); + if (lcp -> lcd_flags & X25_OLDSOCKADDR) + new_to_old (nam); + break; + + /* + * Receive INTERRUPT packet. 
+ */ + case PRU_RCVOOB: + if (so -> so_options & SO_OOBINLINE) { + register struct mbuf *n = so -> so_rcv.sb_mb; + if (n && n -> m_type == MT_OOBDATA) { + unsigned len = n -> m_pkthdr.len; + so -> so_rcv.sb_mb = n -> m_nextpkt; + if (len != n -> m_len && + (n = m_pullup (n, len)) == 0) + break; + m -> m_len = len; + bcopy (mtod (m, caddr_t), mtod (n, caddr_t), len); + m_freem (n); + } + break; + } + m -> m_len = 1; + *mtod (m, char *) = lcp -> lcd_intrdata; + break; + + default: + panic ("pk_usrreq"); + } +release: + if (control != NULL) + m_freem (control); + return (error); +} + +/* + * If you want to use UBC X.25 level 3 in conjunction with some + * other X.25 level 2 driver, have the ifp -> if_ioctl routine + * assign pk_start to ia -> ia_start when called with SIOCSIFCONF_X25. + */ +/* ARGSUSED */ +pk_start (lcp) +register struct pklcd *lcp; +{ + pk_output (lcp); + return (0); /* XXX pk_output should return a value */ +} + +#ifndef _offsetof +#define _offsetof(t, m) ((int)((caddr_t)&((t *)0)->m)) +#endif +struct sockaddr_x25 pk_sockmask = { + _offsetof(struct sockaddr_x25, x25_addr[0]), /* x25_len */ + 0, /* x25_family */ + -1, /* x25_net id */ +}; + +/*ARGSUSED*/ +pk_control (so, cmd, data, ifp) +struct socket *so; +int cmd; +caddr_t data; +register struct ifnet *ifp; +{ + register struct ifreq_x25 *ifr = (struct ifreq_x25 *)data; + register struct ifaddr *ifa = 0; + register struct x25_ifaddr *ia = 0; + struct pklcd *dev_lcp = 0; + int error, s, old_maxlcn; + unsigned n; + + /* + * Find address for this interface, if it exists. 
+ */ + if (ifp) + for (ifa = ifp -> if_addrlist; ifa; ifa = ifa -> ifa_next) + if (ifa -> ifa_addr -> sa_family == AF_CCITT) + break; + + ia = (struct x25_ifaddr *)ifa; + switch (cmd) { + case SIOCGIFCONF_X25: + if (ifa == 0) + return (EADDRNOTAVAIL); + ifr -> ifr_xc = ia -> ia_xc; + return (0); + + case SIOCSIFCONF_X25: + if ((so->so_state & SS_PRIV) == 0) + return (EPERM); + if (ifp == 0) + panic ("pk_control"); + if (ifa == (struct ifaddr *)0) { + register struct mbuf *m; + + MALLOC(ia, struct x25_ifaddr *, sizeof (*ia), + M_IFADDR, M_WAITOK); + if (ia == 0) + return (ENOBUFS); + bzero ((caddr_t)ia, sizeof (*ia)); + if (ifa = ifp -> if_addrlist) { + for ( ; ifa -> ifa_next; ifa = ifa -> ifa_next) + ; + ifa -> ifa_next = &ia -> ia_ifa; + } else + ifp -> if_addrlist = &ia -> ia_ifa; + ifa = &ia -> ia_ifa; + ifa -> ifa_netmask = (struct sockaddr *)&pk_sockmask; + ifa -> ifa_addr = (struct sockaddr *)&ia -> ia_xc.xc_addr; + ifa -> ifa_dstaddr = (struct sockaddr *)&ia -> ia_dstaddr; /* XXX */ + ia -> ia_ifp = ifp; + ia -> ia_dstaddr.x25_family = AF_CCITT; + ia -> ia_dstaddr.x25_len = pk_sockmask.x25_len; + } else if (ISISO8802(ifp) == 0) { + rtinit (ifa, (int)RTM_DELETE, 0); + } + old_maxlcn = ia -> ia_maxlcn; + ia -> ia_xc = ifr -> ifr_xc; + ia -> ia_dstaddr.x25_net = ia -> ia_xc.xc_addr.x25_net; + if (ia -> ia_maxlcn != old_maxlcn && old_maxlcn != 0) { + /* VERY messy XXX */ + register struct pkcb *pkp; + FOR_ALL_PKCBS(pkp) + if (pkp -> pk_ia == ia) + pk_resize (pkp); + } + /* + * Give the interface a chance to initialize if this +p * is its first address, and to validate the address. 
+ */ + ia -> ia_start = pk_start; + s = splimp(); + if (ifp -> if_ioctl) + error = (*ifp -> if_ioctl)(ifp, SIOCSIFCONF_X25, + (caddr_t) ifa); + if (error) + ifp -> if_flags &= ~IFF_UP; + else if (ISISO8802(ifp) == 0) + error = rtinit (ifa, (int)RTM_ADD, RTF_UP); + splx (s); + return (error); + + default: + if (ifp == 0 || ifp -> if_ioctl == 0) + return (EOPNOTSUPP); + return ((*ifp -> if_ioctl)(ifp, cmd, data)); + } +} + +pk_ctloutput (cmd, so, level, optname, mp) +struct socket *so; +struct mbuf **mp; +int cmd, level, optname; +{ + register struct mbuf *m = *mp; + register struct pklcd *lcp = (struct pklcd *) so -> so_pcb; + int error = EOPNOTSUPP; + + if (m == 0) + return (EINVAL); + if (cmd == PRCO_SETOPT) switch (optname) { + case PK_FACILITIES: + if (m == 0) + return (EINVAL); + lcp -> lcd_facilities = m; + *mp = 0; + return (0); + + case PK_ACCTFILE: + if ((so->so_state & SS_PRIV) == 0) + error = EPERM; + else if (m -> m_len) + error = pk_accton (mtod (m, char *)); + else + error = pk_accton ((char *)0); + break; + + case PK_RTATTACH: + error = pk_rtattach (so, m); + break; + + case PK_PRLISTEN: + error = pk_user_protolisten (mtod (m, u_char *)); + } + if (*mp) { + (void) m_freem (*mp); + *mp = 0; + } + return (error); + +} + + +/* + * Do an in-place conversion of an "old style" + * socket address to the new style + */ + +static +old_to_new (m) +register struct mbuf *m; +{ + register struct x25_sockaddr *oldp; + register struct sockaddr_x25 *newp; + register char *ocp, *ncp; + struct sockaddr_x25 new; + + oldp = mtod (m, struct x25_sockaddr *); + newp = &new; + bzero ((caddr_t)newp, sizeof (*newp)); + + newp -> x25_family = AF_CCITT; + newp -> x25_len = sizeof(*newp); + newp -> x25_opts.op_flags = (oldp -> xaddr_facilities & X25_REVERSE_CHARGE) + | X25_MQBIT | X25_OLDSOCKADDR; + if (oldp -> xaddr_facilities & XS_HIPRIO) /* Datapac specific */ + newp -> x25_opts.op_psize = X25_PS128; + bcopy ((caddr_t)oldp -> xaddr_addr, newp -> x25_addr, + (unsigned)min (oldp 
-> xaddr_len, sizeof (newp -> x25_addr) - 1)); + if (bcmp ((caddr_t)oldp -> xaddr_proto, newp -> x25_udata, 4) != 0) { + bcopy ((caddr_t)oldp -> xaddr_proto, newp -> x25_udata, 4); + newp -> x25_udlen = 4; + } + ocp = (caddr_t)oldp -> xaddr_userdata; + ncp = newp -> x25_udata + 4; + while (*ocp && ocp < (caddr_t)oldp -> xaddr_userdata + 12) { + if (newp -> x25_udlen == 0) + newp -> x25_udlen = 4; + *ncp++ = *ocp++; + newp -> x25_udlen++; + } + bcopy ((caddr_t)newp, mtod (m, char *), sizeof (*newp)); + m -> m_len = sizeof (*newp); +} + +/* + * Do an in-place conversion of a new style + * socket address to the old style + */ + +static +new_to_old (m) +register struct mbuf *m; +{ + register struct x25_sockaddr *oldp; + register struct sockaddr_x25 *newp; + register char *ocp, *ncp; + struct x25_sockaddr old; + + oldp = &old; + newp = mtod (m, struct sockaddr_x25 *); + bzero ((caddr_t)oldp, sizeof (*oldp)); + + oldp -> xaddr_facilities = newp -> x25_opts.op_flags & X25_REVERSE_CHARGE; + if (newp -> x25_opts.op_psize == X25_PS128) + oldp -> xaddr_facilities |= XS_HIPRIO; /* Datapac specific */ + ocp = (char *)oldp -> xaddr_addr; + ncp = newp -> x25_addr; + while (*ncp) { + *ocp++ = *ncp++; + oldp -> xaddr_len++; + } + + bcopy (newp -> x25_udata, (caddr_t)oldp -> xaddr_proto, 4); + if (newp -> x25_udlen > 4) + bcopy (newp -> x25_udata + 4, (caddr_t)oldp -> xaddr_userdata, + (unsigned)(newp -> x25_udlen - 4)); + + bcopy ((caddr_t)oldp, mtod (m, char *), sizeof (*oldp)); + m -> m_len = sizeof (*oldp); +} + + +pk_checksockaddr (m) +struct mbuf *m; +{ + register struct sockaddr_x25 *sa = mtod (m, struct sockaddr_x25 *); + register char *cp; + + if (m -> m_len != sizeof (struct sockaddr_x25)) + return (1); + if (sa -> x25_family != AF_CCITT || + sa -> x25_udlen > sizeof (sa -> x25_udata)) + return (1); + for (cp = sa -> x25_addr; *cp; cp++) { + if (*cp < '0' || *cp > '9' || + cp >= &sa -> x25_addr[sizeof (sa -> x25_addr) - 1]) + return (1); + } + return (0); +} + +pk_send 
(lcp, m) +struct pklcd *lcp; +register struct mbuf *m; +{ + int mqbit = 0, error = 0; + register struct x25_packet *xp; + register struct socket *so; + + if (m -> m_type == MT_OOBDATA) { + if (lcp -> lcd_intrconf_pending) + error = ETOOMANYREFS; + if (m -> m_pkthdr.len > 32) + error = EMSGSIZE; + M_PREPEND(m, PKHEADERLN, M_WAITOK); + if (m == 0 || error) + goto bad; + *(mtod (m, octet *)) = 0; + xp = mtod (m, struct x25_packet *); + X25SBITS(xp -> bits, fmt_identifier, 1); + xp -> packet_type = X25_INTERRUPT; + SET_LCN(xp, lcp -> lcd_lcn); + sbinsertoob ( (so = lcp -> lcd_so) ? + &so -> so_snd : &lcp -> lcd_sb, m); + goto send; + } + /* + * Application has elected (at call setup time) to prepend + * a control byte to each packet written indicating m-bit + * and q-bit status. Examine and then discard this byte. + */ + if (lcp -> lcd_flags & X25_MQBIT) { + if (m -> m_len < 1) { + m_freem (m); + return (EMSGSIZE); + } + mqbit = *(mtod (m, u_char *)); + m -> m_len--; + m -> m_data++; + m -> m_pkthdr.len--; + } + error = pk_fragment (lcp, m, mqbit & 0x80, mqbit & 0x40, 1); +send: + if (error == 0 && lcp -> lcd_state == DATA_TRANSFER) + lcp -> lcd_send (lcp); /* XXXXXXXXX fix pk_output!!! */ + return (error); +bad: + if (m) + m_freem (m); + return (error); +} diff --git a/sys/netccitt/pk_var.h b/sys/netccitt/pk_var.h new file mode 100644 index 00000000000..beda05dc375 --- /dev/null +++ b/sys/netccitt/pk_var.h @@ -0,0 +1,231 @@ +/* + * Copyright (c) Computing Centre, University of British Columbia, 1985 + * Copyright (C) Computer Science Department IV, + * University of Erlangen-Nuremberg, Germany, 1990, 1991, 1992 + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by the + * Laboratory for Computation Vision and the Computer Science Department + * of the the University of British Columbia and the Computer Science + * Department (IV) of the University of Erlangen-Nuremberg, Germany. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)pk_var.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * + * X.25 Logical Channel Descriptor + * + * lcd_pkp points at the struct pkcb this channel is attached to; that + * pkcb's pk_chan member is a struct pklcd ** table. + * + */ + +struct pklcd { + struct pklcd_q { + struct pklcd_q *q_forw; /* debugging chain */ + struct pklcd_q *q_back; /* debugging chain */ + } lcd_q; + int (*lcd_upper)(); /* switch to socket vs datagram vs ...*/ + caddr_t lcd_upnext; /* reference for lcd_upper() */ + int (*lcd_send)(); /* if X.25 front end, direct connect */ + caddr_t lcd_downnext; /* reference for lcd_send() */ + short lcd_lcn; /* Logical channel number */ + short lcd_state; /* Logical Channel state */ + short lcd_timer; /* Various timer values */ + short lcd_dg_timer; /* to reclaim idle datagram circuits */ + bool lcd_intrconf_pending; /* Interrupt confirmation pending */ + octet lcd_intrdata; /* Octet of incoming intr data */ + char lcd_retry; /* Timer retry count */ + char lcd_rsn; /* Seq no of last received packet */ + char lcd_ssn; /* Seq no of next packet to send */ + char lcd_output_window; /* Output flow control window */ + char lcd_input_window; /* Input flow control window */ + char lcd_last_transmitted_pr;/* Last Pr value transmitted */ + bool lcd_rnr_condition; /* Remote in busy condition */ + bool lcd_window_condition; /* Output window size exceeded */ + bool lcd_reset_condition; /* True, if waiting reset confirm */ + bool lcd_rxrnr_condition; /* True, if we have sent rnr */ + char lcd_packetsize; /* Maximum packet size */ + char lcd_windowsize; /* Window size - 
both directions */ + octet lcd_closed_user_group; /* Closed user group specification */ + char lcd_flags; /* copy of sockaddr_x25 op_flags */ + struct mbuf *lcd_facilities; /* user supplied facilities for cr */ + struct mbuf *lcd_template; /* Address of response packet */ + struct socket *lcd_so; /* Socket addr for connection */ + struct sockaddr_x25 *lcd_craddr;/* Calling address pointer */ + struct sockaddr_x25 *lcd_ceaddr;/* Called address pointer */ + time_t lcd_stime; /* time circuit established */ + long lcd_txcnt; /* Data packet transmit count */ + long lcd_rxcnt; /* Data packet receive count */ + short lcd_intrcnt; /* Interrupt packet transmit count */ + struct pklcd *lcd_listen; /* Next lcd on listen queue */ + struct pkcb *lcd_pkp; /* Network this lcd is attached to */ + struct mbuf *lcd_cps; /* Complete Packet Sequence reassembly*/ + long lcd_cpsmax; /* Max length for CPS */ + struct sockaddr_x25 lcd_faddr; /* Remote Address (Calling) */ + struct sockaddr_x25 lcd_laddr; /* Local Address (Called) */ + struct sockbuf lcd_sb; /* alternate for datagram service */ +}; + +/* + * Per network information, allocated dynamically + * when a new network is configured. 
+ */ + +struct pkcb { + struct pkcb_q { + struct pkcb_q *q_forw; + struct pkcb_q *q_backw; + } pk_q; + short pk_state; /* packet level status */ + short pk_maxlcn; /* local copy of xc_maxlcn */ + int (*pk_lloutput) (); /* link level output procedure */ + caddr_t (*pk_llctlinput) (); /* link level ctloutput procedure */ + caddr_t pk_llnext; /* handle for next level down */ + struct x25config *pk_xcp; /* network specific configuration */ + struct x25_ifaddr *pk_ia; /* backpointer to ifaddr */ + struct pklcd **pk_chan; /* actual size == xc_maxlcn+1 */ + short pk_dxerole; /* DXE role of PLE over LLC2 */ + short pk_restartcolls; /* counting RESTART collisions til resolved */ + struct rtentry *pk_rt; /* back pointer to route */ + struct rtentry *pk_llrt; /* pointer to reverse mapping */ + u_short pk_refcount; /* ref count */ +}; + +/* + * Walk every pkcb linked on pkcb_q: p starts at pkcb_q.q_forw and follows + * pk_q.q_forw until it reaches the list head again. The casts between + * struct pkcb * and struct pkcb_q * depend on pk_q being the first member + * of struct pkcb. + */ +#define FOR_ALL_PKCBS(p) for((p) = (struct pkcb *)(pkcb_q.q_forw); \ + (pkcb_q.q_forw != &pkcb_q) && ((struct pkcb_q *)(p) != &pkcb_q); \ + (p) = (struct pkcb *)((p) -> pk_q.q_forw)) + +/* True when the pkcb_q list head points at itself, i.e. nothing is linked. */ +#define PQEMPTY (pkcb_q.q_forw == &pkcb_q) + +/* + * Interface address, x25 version. Exactly one of these structures is + * allocated for each interface with an x25 address. + * + * The ifaddr structure contains the protocol-independent part + * of the structure, and is assumed to be first. + */ +struct x25_ifaddr { + struct ifaddr ia_ifa; /* protocol-independent info */ +#define ia_ifp ia_ifa.ifa_ifp +#define ia_flags ia_ifa.ifa_flags + struct x25config ia_xc; /* network specific configuration */ + struct pkcb *ia_pkcb; +#define ia_maxlcn ia_xc.xc_maxlcn + int (*ia_start) (); /* connect, confirm method */ + struct sockaddr_x25 ia_dstaddr; /* reserve space for route dst */ +}; + +/* + * ``Link-Level'' extension to Routing Entry for upper level + * packet switching via X.25 virtual circuits. 
+ */ +struct llinfo_x25 { + struct llinfo_x25 *lx_next; /* chain together in linked list */ + struct llinfo_x25 *lx_prev; /* chain together in linked list */ + struct rtentry *lx_rt; /* back pointer to route */ + struct pklcd *lx_lcd; /* local connection block */ + struct x25_ifaddr *lx_ia; /* may not be same as rt_ifa */ + int lx_state; /* can't trust lcd->lcd_state */ + int lx_flags; + int lx_timer; /* for idle timeout */ + int lx_family; /* for dispatch */ +}; + +/* States for lx_state */ +#define LXS_NEWBORN 0 +#define LXS_RESOLVING 1 +#define LXS_FREE 2 +#define LXS_CONNECTING 3 +#define LXS_CONNECTED 4 +#define LXS_DISCONNECTING 5 +#define LXS_LISTENING 6 + +/* flags */ +#define LXF_VALID 0x1 /* Circuit is live, etc. */ +#define LXF_RTHELD 0x2 /* this lcb references rtentry */ +#define LXF_LISTEN 0x4 /* accepting incoming calls */ + +/* + * Definitions for accessing bitfields/bitslices inside X.25 structs + * + * The names below are indices into x25_bitslice[]; each entry supplies the + * mask and shift that the X25?BITS macros apply. X25GBITS extracts a field + * from Arg, X25SBITS ORs a shifted, masked value into Arg, and X25CSBITS + * overwrites Arg with the shifted, masked value. + * NOTE(review): lc_group_number and called_addrlen are both index 1 -- + * confirm that sharing one table slot is intentional. + */ + + +struct x25bitslice { + unsigned int bs_mask; + unsigned int bs_shift; +}; + +#define calling_addrlen 0 +#define called_addrlen 1 +#define q_bit 2 +#define d_bit 3 +#define fmt_identifier 4 +#define lc_group_number 1 +#define p_r 5 +#define m_bit 6 +#define p_s 7 +#define zilch 8 + +#define X25GBITS(Arg, Index) (((Arg) & x25_bitslice[(Index)].bs_mask) >> x25_bitslice[(Index)].bs_shift) +#define X25SBITS(Arg, Index, Val) (Arg) |= (((Val) << x25_bitslice[(Index)].bs_shift) & x25_bitslice[(Index)].bs_mask) +#define X25CSBITS(Arg, Index, Val) (Arg) = (((Val) << x25_bitslice[(Index)].bs_shift) & x25_bitslice[(Index)].bs_mask) + +extern struct x25bitslice x25_bitslice[]; + + +#define ISOFIFTTYPE(i,t) ((i)->if_type == (t)) +#define ISISO8802(i) ((ISOFIFTTYPE(i, IFT_ETHER) || \ + ISOFIFTTYPE(i, IFT_ISO88023) || \ + ISOFIFTTYPE(i, IFT_ISO88024) || \ + ISOFIFTTYPE(i, IFT_ISO88025) || \ + ISOFIFTTYPE(i, IFT_ISO88026) || \ + ISOFIFTTYPE(i, IFT_P10) || \ + ISOFIFTTYPE(i, IFT_P80) || \ + ISOFIFTTYPE(i, IFT_FDDI))) + +/* + * miscellaneous debugging info + */ 
+struct mbuf_cache { + int mbc_size; + int mbc_num; + int mbc_oldsize; + struct mbuf **mbc_cache; +}; + +#if defined(KERNEL) && defined(CCITT) +/* + * NOTE(review): pk_listenhead and pk_t20..pk_t23 are written without + * extern, so every file that includes this header gets its own tentative + * definition -- confirm the build relies on those being merged at link time. + */ +extern struct pkcb_q pkcb_q; +struct pklcd *pk_listenhead; +struct pklcd *pk_attach(); + +extern char *pk_name[], *pk_state[]; +int pk_t20, pk_t21, pk_t22, pk_t23; +#endif diff --git a/sys/netccitt/x25.h b/sys/netccitt/x25.h new file mode 100644 index 00000000000..e86af39a1a6 --- /dev/null +++ b/sys/netccitt/x25.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) University of British Columbia, 1984 + * Copyright (c) 1990, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * University of Erlangen-Nuremberg, Germany, 1992 + * + * This code is derived from software contributed to Berkeley by the + * Laboratory for Computation Vision and the Computer Science Department + * of the the University of British Columbia and the Computer Science + * Department (IV) of the University of Erlangen-Nuremberg, Germany. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)x25.h 8.1 (Berkeley) 6/10/93 + */ + +#ifdef KERNEL +#define PRC_IFUP 3 +#define PRC_LINKUP 4 +#define PRC_LINKDOWN 5 +#define PRC_LINKRESET 6 +#define PRC_LINKDONTCOPY 7 +#ifndef PRC_DISCONNECT_REQUEST +#define PRC_DISCONNECT_REQUEST 10 +#endif +#endif + +#define CCITTPROTO_HDLC 1 +#define CCITTPROTO_X25 2 /* packet level protocol */ +#define IEEEPROTO_802LLC 3 /* doesn't belong here */ + +#define HDLCPROTO_LAP 1 +#define HDLCPROTO_LAPB 2 +#define HDLCPROTO_UNSET 3 +#define HDLCPROTO_LAPD 4 + +/* socket options */ +#define PK_ACCTFILE 1 /* use level = CCITTPROTO_X25 */ +#define PK_FACILITIES 2 /* use level = CCITTPROTO_X25 */ +#define PK_RTATTACH 3 /* use level = CCITTPROTO_X25 */ +#define PK_PRLISTEN 4 /* use level = CCITTPROTO_X25 */ + +#define MAX_FACILITIES 109 /* maximum size for facilities */ + +/* + * X.25 Socket address structure. It contains the X.121 or variation of + * X.121, facilities information, higher level protocol value (first four + * bytes of the User Data field), and the last 12 characters of the User + * Data field. + */ + +struct x25_sockaddr { /* obsolete - use sockaddr_x25 */ + short xaddr_len; /* Length of xaddr_addr. 
*/ + u_char xaddr_addr[15]; /* Network dependent or X.121 address. */ + u_char xaddr_facilities; /* Facilities information. */ +#define XS_REVERSE_CHARGE 0x01 +#define XS_HIPRIO 0x02 + u_char xaddr_proto[4]; /* Protocol ID (4 bytes of user data). */ + u_char xaddr_userdata[12]; /* Remaining User data field. */ +}; + +/* + * X.25 Socket address structure. It contains the network id, X.121 + * address, facilities information, higher level protocol value (first four + * bytes of the User Data field), and up to 12 characters of User Data. + */ + +struct sockaddr_x25 { + u_char x25_len; + u_char x25_family; /* must be AF_CCITT */ + short x25_net; /* network id code (usually a dnic) */ + char x25_addr[16]; /* X.121 address (null terminated) */ + struct x25opts { + char op_flags; /* miscellaneous options */ + /* pk_var.h defines other lcd_flags */ +#define X25_REVERSE_CHARGE 0x01 /* remote DTE pays for call */ +#define X25_DBIT 0x02 /* not yet supported */ +#define X25_MQBIT 0x04 /* prepend M&Q bit status byte to packet data */ +#define X25_OLDSOCKADDR 0x08 /* uses old sockaddr structure */ +#define X25_DG_CIRCUIT 0x10 /* lcd_flag: used for datagrams */ +#define X25_DG_ROUTING 0x20 /* lcd_flag: peer addr not yet known */ +#define X25_MBS_HOLD 0x40 /* lcd_flag: collect m-bit sequences */ + char op_psize; /* requested packet size */ +#define X25_PS128 7 +#define X25_PS256 8 +#define X25_PS512 9 + char op_wsize; /* window size (1 .. 7) */ + char op_speed; /* throughput class */ + } x25_opts; + short x25_udlen; /* user data field length */ + char x25_udata[16]; /* user data field */ +}; + +/* + * network configuration info + * this structure must be 16 bytes long + * NOTE(review): as declared here the structure embeds a struct sockaddr_x25, + * which alone carries two 16-byte arrays, so the 16-byte figure looks + * stale -- confirm before relying on it. + */ + +struct x25config { + struct sockaddr_x25 xc_addr; + /* link level parameters */ + u_short xc_lproto:4, /* link level protocol eg. CCITTPROTO_HDLC */ + xc_lptype:4, /* protocol type eg. 
HDLCPROTO_LAPB */ + xc_ltrace:1, /* link level tracing flag */ + xc_lwsize:7; /* link level window size */ + u_short xc_lxidxchg:1, /* link level XID exchange flag - NOT YET */ + /* packet level parameters */ + xc_rsvd1:2, + xc_pwsize:3, /* default window size */ + xc_psize:4, /* default packet size 7=128, 8=256, ... */ + xc_type:3, /* network type */ +#define X25_1976 0 +#define X25_1980 1 +#define X25_1984 2 +#define X25_DDN 3 +#define X25_BASIC 4 + xc_ptrace:1, /* packet level tracing flag */ + xc_nodnic:1, /* remove our dnic when calling on net */ + xc_prepnd0:1; /* prepend 0 when making offnet calls */ + u_short xc_maxlcn; /* max logical channels */ + u_short xc_dg_idletimo; /* timeout for idle datagram circuits. */ +}; + +#ifdef IFNAMSIZ +struct ifreq_x25 { + char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */ + struct x25config ifr_xc; +}; +#define SIOCSIFCONF_X25 _IOW('i', 12, struct ifreq_x25) /* set ifnet config */ +#define SIOCGIFCONF_X25 _IOWR('i',13, struct ifreq_x25) /* get ifnet config */ +#endif diff --git a/sys/netccitt/x25acct.h b/sys/netccitt/x25acct.h new file mode 100644 index 00000000000..71f3fd89603 --- /dev/null +++ b/sys/netccitt/x25acct.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) University of British Columbia, 1984 + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Laboratory for Computation Vision and the Computer Science Department + * of the University of British Columbia. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)x25acct.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Format of X.25 accounting record written + * to X25ACCTF whenever a circuit is closed. 
+ */ + +#ifdef waterloo +#define X25ACCTF "/usr/adm/logs/x25acct" +#else +#define X25ACCTF "/usr/adm/x25acct" +#endif + +struct x25acct { + time_t x25acct_stime; /* start time */ +#ifdef waterloo + u_long x25acct_etime; /* elapsed time (seconds) */ +#else + u_short x25acct_etime; /* elapsed time (seconds) */ +#endif + short x25acct_uid; /* user id */ + short x25acct_net; /* network id */ + /* the five bit-fields below total 16 bits (4+4+1+1+6) */ + u_short x25acct_psize:4, /* packet size */ + x25acct_addrlen:4, /* x25acct_addr length */ + x25acct_revcharge:1, /* reverse charging */ + x25acct_callin:1, /* incoming call */ + x25acct_unused:6; + char x25acct_addr[8]; /* remote DTE address (in bcd) */ + char x25acct_udata[4]; /* protocol id */ + long x25acct_txcnt; /* packets transmitted */ + long x25acct_rxcnt; /* packets received */ +}; diff --git a/sys/netccitt/x25err.h b/sys/netccitt/x25err.h new file mode 100644 index 00000000000..44d5490b422 --- /dev/null +++ b/sys/netccitt/x25err.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) University of British Columbia, 1984 + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Laboratory for Computation Vision and the Computer Science Department + * of the University of British Columbia. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)x25err.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * + * X.25 Reset and Clear errors and diagnostics. These values are + * returned in the u_error field of the u structure. 
+ * + */ + +/* Reset diagnostics occupy 100-104; clear diagnostics occupy 110-119. */ +#define EXRESET 100 /* Reset: call reset */ +#define EXROUT 101 /* Reset: out of order */ +#define EXRRPE 102 /* Reset: remote procedure error */ +#define EXRLPE 103 /* Reset: local procedure error */ +#define EXRNCG 104 /* Reset: network congestion */ + +#define EXCLEAR 110 /* Clear: call cleared */ +#define EXCBUSY 111 /* Clear: number busy */ +#define EXCOUT 112 /* Clear: out of order */ +#define EXCRPE 113 /* Clear: remote procedure error */ +#define EXCRRC 114 /* Clear: collect call refused */ +#define EXCINV 115 /* Clear: invalid call */ +#define EXCAB 116 /* Clear: access barred */ +#define EXCLPE 117 /* Clear: local procedure error */ +#define EXCNCG 118 /* Clear: network congestion */ +#define EXCNOB 119 /* Clear: not obtainable */ + diff --git a/sys/netinet/icmp_var.h b/sys/netinet/icmp_var.h new file mode 100644 index 00000000000..beef16e1836 --- /dev/null +++ b/sys/netinet/icmp_var.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)icmp_var.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Variables related to this implementation + * of the internet control message protocol. + */ +struct icmpstat { +/* statistics related to icmp packets generated */ + u_long icps_error; /* # of calls to icmp_error */ + u_long icps_oldshort; /* no error 'cuz old ip too short */ + u_long icps_oldicmp; /* no error 'cuz old was icmp */ + u_long icps_outhist[ICMP_MAXTYPE + 1]; +/* statistics related to input messages processed */ + u_long icps_badcode; /* icmp_code out of range */ + u_long icps_tooshort; /* packet < ICMP_MINLEN */ + u_long icps_checksum; /* bad checksum */ + u_long icps_badlen; /* calculated bound mismatch */ + u_long icps_reflect; /* number of responses */ + u_long icps_inhist[ICMP_MAXTYPE + 1]; +}; + +/* + * Names for ICMP sysctl objects + * + * ICMPCTL_NAMES is ordered by the ICMPCTL_* values: entry 0 is an empty + * placeholder and entry ICMPCTL_MASKREPL names "maskrepl". + */ +#define ICMPCTL_MASKREPL 1 /* allow replies to netmask requests */ +#define ICMPCTL_MAXID 2 + +#define ICMPCTL_NAMES { \ + { 0, 0 }, \ + { "maskrepl", CTLTYPE_INT }, \ +} + +#ifdef KERNEL +struct icmpstat icmpstat; +#endif diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c new file mode 100644 index 00000000000..41f07c017b8 --- /dev/null +++ b/sys/netinet/if_ether.c @@ 
-0,0 +1,554 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_ether.c 8.1 (Berkeley) 6/10/93 + */ + +/* + * Ethernet address resolution protocol. 
+ * TODO: + * add "inuse/lock" bit (or ref. count) along with valid bit + */ + +/* NOTE(review): the #include directives below carry no header names in this copy -- restore them from the original source. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#define SIN(s) ((struct sockaddr_in *)s) +#define SDL(s) ((struct sockaddr_dl *)s) +#define SRP(s) ((struct sockaddr_inarp *)s) + +/* + * ARP trailer negotiation. Trailer protocol is not IP specific, + * but ARP request/response use IP addresses. + */ +#define ETHERTYPE_IPTRAILERS ETHERTYPE_TRAIL + + +/* timer values */ +int arpt_prune = (5*60*1); /* walk list every 5 minutes */ +int arpt_keep = (20*60); /* once resolved, good for 20 more minutes */ +int arpt_down = 20; /* once declared down, don't send for 20 secs */ +#define rt_expire rt_rmx.rmx_expire + +static void arprequest __P((struct arpcom *, u_long *, u_long *, u_char *)); +static void arptfree __P((struct llinfo_arp *)); +static void arptimer __P((void *)); +static struct llinfo_arp *arplookup __P((u_long, int, int)); +static void in_arpinput __P((struct mbuf *)); + +extern struct ifnet loif; +extern struct timeval time; +struct llinfo_arp llinfo_arp = {&llinfo_arp, &llinfo_arp}; +struct ifqueue arpintrq = {0, 0, 0, 50}; +int arp_inuse, arp_allocated, arp_intimer; +int arp_maxtries = 5; +int useloopback = 1; /* use loopback interface for local traffic */ +int arpinit_done = 0; + +/* + * Timeout routine. Age arp_tab entries periodically. + * + * Re-arms itself with timeout() for arpt_prune * hz ticks, then walks the + * llinfo_arp list between splnet() and splx() and calls arptfree() on each + * entry whose route has a non-zero rt_expire that is not later than + * time.tv_sec. + * la is advanced before the expiry test, so the entry under test is + * la->la_prev; presumably this keeps the walk valid if arptfree() unlinks + * the entry -- confirm against arptfree(). + */ +/* ARGSUSED */ +static void +arptimer(ignored_arg) + void *ignored_arg; +{ + int s = splnet(); + register struct llinfo_arp *la = llinfo_arp.la_next; + + timeout(arptimer, (caddr_t)0, arpt_prune * hz); + while (la != &llinfo_arp) { + register struct rtentry *rt = la->la_rt; + la = la->la_next; + if (rt->rt_expire && rt->rt_expire <= time.tv_sec) + arptfree(la->la_prev); /* timer has expired, clear */ + } + splx(s); +} + +/* + * Parallel to llc_rtrequest. 
+ */ +void +arp_rtrequest(req, rt, sa) + int req; + register struct rtentry *rt; + struct sockaddr *sa; +{ + register struct sockaddr *gate = rt->rt_gateway; + register struct llinfo_arp *la = (struct llinfo_arp *)rt->rt_llinfo; + static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; + + if (!arpinit_done) { + arpinit_done = 1; + timeout(arptimer, (caddr_t)0, hz); + } + if (rt->rt_flags & RTF_GATEWAY) + return; + switch (req) { + + case RTM_ADD: + /* + * XXX: If this is a manually added route to interface + * such as older version of routed or gated might provide, + * restore cloning bit. + */ + if ((rt->rt_flags & RTF_HOST) == 0 && + SIN(rt_mask(rt))->sin_addr.s_addr != 0xffffffff) + rt->rt_flags |= RTF_CLONING; + if (rt->rt_flags & RTF_CLONING) { + /* + * Case 1: This route should come from a route to iface. + */ + rt_setgate(rt, rt_key(rt), + (struct sockaddr *)&null_sdl); + gate = rt->rt_gateway; + SDL(gate)->sdl_type = rt->rt_ifp->if_type; + SDL(gate)->sdl_index = rt->rt_ifp->if_index; + rt->rt_expire = time.tv_sec; + break; + } + /* Announce a new entry if requested. */ + if (rt->rt_flags & RTF_ANNOUNCE) + arprequest((struct arpcom *)rt->rt_ifp, + &SIN(rt_key(rt))->sin_addr.s_addr, + &SIN(rt_key(rt))->sin_addr.s_addr, + (u_char *)LLADDR(SDL(gate))); + /*FALLTHROUGH*/ + case RTM_RESOLVE: + if (gate->sa_family != AF_LINK || + gate->sa_len < sizeof(null_sdl)) { + log(LOG_DEBUG, "arp_rtrequest: bad gateway value"); + break; + } + SDL(gate)->sdl_type = rt->rt_ifp->if_type; + SDL(gate)->sdl_index = rt->rt_ifp->if_index; + if (la != 0) + break; /* This happens on a route change */ + /* + * Case 2: This route may come from cloning, or a manual route + * add with a LL address. 
+ */ + R_Malloc(la, struct llinfo_arp *, sizeof(*la)); + rt->rt_llinfo = (caddr_t)la; + if (la == 0) { + log(LOG_DEBUG, "arp_rtrequest: malloc failed\n"); + break; + } + arp_inuse++, arp_allocated++; + Bzero(la, sizeof(*la)); + la->la_rt = rt; + rt->rt_flags |= RTF_LLINFO; + insque(la, &llinfo_arp); + if (SIN(rt_key(rt))->sin_addr.s_addr == + (IA_SIN(rt->rt_ifa))->sin_addr.s_addr) { + /* + * This test used to be + * if (loif.if_flags & IFF_UP) + * It allowed local traffic to be forced + * through the hardware by configuring the loopback down. + * However, it causes problems during network configuration + * for boards that can't receive packets they send. + * It is now necessary to clear "useloopback" and remove + * the route to force traffic out to the hardware. + */ + rt->rt_expire = 0; + Bcopy(((struct arpcom *)rt->rt_ifp)->ac_enaddr, + LLADDR(SDL(gate)), SDL(gate)->sdl_alen = 6); + if (useloopback) + rt->rt_ifp = &loif; + + } + break; + + case RTM_DELETE: + if (la == 0) + break; + arp_inuse--; + remque(la); + rt->rt_llinfo = 0; + rt->rt_flags &= ~RTF_LLINFO; + if (la->la_hold) + m_freem(la->la_hold); + Free((caddr_t)la); + } +} + +/* + * Broadcast an ARP packet, asking who has addr on interface ac. + */ +void +arpwhohas(ac, addr) + register struct arpcom *ac; + register struct in_addr *addr; +{ + arprequest(ac, &ac->ac_ipaddr.s_addr, &addr->s_addr, ac->ac_enaddr); +} + +/* + * Broadcast an ARP request. 
Caller specifies: + * - arp header source ip address + * - arp header target ip address + * - arp header source ethernet address + */ +static void +arprequest(ac, sip, tip, enaddr) + register struct arpcom *ac; + register u_long *sip, *tip; + register u_char *enaddr; +{ + register struct mbuf *m; + register struct ether_header *eh; + register struct ether_arp *ea; + struct sockaddr sa; + + if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL) + return; + m->m_len = sizeof(*ea); + m->m_pkthdr.len = sizeof(*ea); + MH_ALIGN(m, sizeof(*ea)); + ea = mtod(m, struct ether_arp *); + eh = (struct ether_header *)sa.sa_data; + bzero((caddr_t)ea, sizeof (*ea)); + bcopy((caddr_t)etherbroadcastaddr, (caddr_t)eh->ether_dhost, + sizeof(eh->ether_dhost)); + eh->ether_type = ETHERTYPE_ARP; /* if_output will swap */ + ea->arp_hrd = htons(ARPHRD_ETHER); + ea->arp_pro = htons(ETHERTYPE_IP); + ea->arp_hln = sizeof(ea->arp_sha); /* hardware address length */ + ea->arp_pln = sizeof(ea->arp_spa); /* protocol address length */ + ea->arp_op = htons(ARPOP_REQUEST); + bcopy((caddr_t)enaddr, (caddr_t)ea->arp_sha, sizeof(ea->arp_sha)); + bcopy((caddr_t)sip, (caddr_t)ea->arp_spa, sizeof(ea->arp_spa)); + bcopy((caddr_t)tip, (caddr_t)ea->arp_tpa, sizeof(ea->arp_tpa)); + sa.sa_family = AF_UNSPEC; + sa.sa_len = sizeof(sa); + (*ac->ac_if.if_output)(&ac->ac_if, m, &sa, (struct rtentry *)0); +} + +/* + * Resolve an IP address into an ethernet address. If success, + * desten is filled in. If there is no entry in arptab, + * set one up and broadcast a request for the IP address. + * Hold onto this mbuf and resend it once the address + * is finally resolved. A return value of 1 indicates + * that desten has been filled in and the packet should be sent + * normally; a 0 return indicates that the packet has been + * taken over here, either now or for later transmission. 
+ */ +int +arpresolve(ac, rt, m, dst, desten) + register struct arpcom *ac; + register struct rtentry *rt; + struct mbuf *m; + register struct sockaddr *dst; + register u_char *desten; +{ + register struct llinfo_arp *la; + struct sockaddr_dl *sdl; + + if (m->m_flags & M_BCAST) { /* broadcast */ + bcopy((caddr_t)etherbroadcastaddr, (caddr_t)desten, + sizeof(etherbroadcastaddr)); + return (1); + } + if (m->m_flags & M_MCAST) { /* multicast */ + ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten); + return(1); + } + if (rt) + la = (struct llinfo_arp *)rt->rt_llinfo; + else { + if (la = arplookup(SIN(dst)->sin_addr.s_addr, 1, 0)) + rt = la->la_rt; + } + if (la == 0 || rt == 0) { + log(LOG_DEBUG, "arpresolve: can't allocate llinfo"); + m_freem(m); + return (0); + } + sdl = SDL(rt->rt_gateway); + /* + * Check the address family and length is valid, the address + * is resolved; otherwise, try to resolve. + */ + if ((rt->rt_expire == 0 || rt->rt_expire > time.tv_sec) && + sdl->sdl_family == AF_LINK && sdl->sdl_alen != 0) { + bcopy(LLADDR(sdl), desten, sdl->sdl_alen); + return 1; + } + /* + * There is an arptab entry, but no ethernet address + * response yet. Replace the held mbuf with this + * latest one. + */ + if (la->la_hold) + m_freem(la->la_hold); + la->la_hold = m; + if (rt->rt_expire) { + rt->rt_flags &= ~RTF_REJECT; + if (la->la_asked == 0 || rt->rt_expire != time.tv_sec) { + rt->rt_expire = time.tv_sec; + if (la->la_asked++ < arp_maxtries) + arpwhohas(ac, &(SIN(dst)->sin_addr)); + else { + rt->rt_flags |= RTF_REJECT; + rt->rt_expire += arpt_down; + la->la_asked = 0; + } + + } + } + return (0); +} + +/* + * Common length and type checks are done here, + * then the protocol-specific routine is called. 
+ */ +void +arpintr() +{ + register struct mbuf *m; + register struct arphdr *ar; + int s; + + while (arpintrq.ifq_head) { + s = splimp(); + IF_DEQUEUE(&arpintrq, m); + splx(s); + if (m == 0 || (m->m_flags & M_PKTHDR) == 0) + panic("arpintr"); + if (m->m_len >= sizeof(struct arphdr) && + (ar = mtod(m, struct arphdr *)) && + ntohs(ar->ar_hrd) == ARPHRD_ETHER && + m->m_len >= + sizeof(struct arphdr) + 2 * ar->ar_hln + 2 * ar->ar_pln) + + switch (ntohs(ar->ar_pro)) { + + case ETHERTYPE_IP: + case ETHERTYPE_IPTRAILERS: + in_arpinput(m); + continue; + } + m_freem(m); + } +} + +/* + * ARP for Internet protocols on 10 Mb/s Ethernet. + * Algorithm is that given in RFC 826. + * In addition, a sanity check is performed on the sender + * protocol address, to catch impersonators. + * We no longer handle negotiations for use of trailer protocol: + * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent + * along with IP replies if we wanted trailers sent to us, + * and also sent them in response to IP replies. + * This allowed either end to announce the desire to receive + * trailer packets. + * We no longer reply to requests for ETHERTYPE_TRAIL protocol either, + * but formerly didn't normally send requests. 
+ */ +static void +in_arpinput(m) + struct mbuf *m; +{ + register struct ether_arp *ea; + register struct arpcom *ac = (struct arpcom *)m->m_pkthdr.rcvif; + struct ether_header *eh; + register struct llinfo_arp *la = 0; + register struct rtentry *rt; + struct in_ifaddr *ia, *maybe_ia = 0; + struct sockaddr_dl *sdl; + struct sockaddr sa; + struct in_addr isaddr, itaddr, myaddr; + int op; + + ea = mtod(m, struct ether_arp *); + op = ntohs(ea->arp_op); + bcopy((caddr_t)ea->arp_spa, (caddr_t)&isaddr, sizeof (isaddr)); + bcopy((caddr_t)ea->arp_tpa, (caddr_t)&itaddr, sizeof (itaddr)); + for (ia = in_ifaddr; ia; ia = ia->ia_next) + if (ia->ia_ifp == &ac->ac_if) { + maybe_ia = ia; + if ((itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) || + (isaddr.s_addr == ia->ia_addr.sin_addr.s_addr)) + break; + } + if (maybe_ia == 0) + goto out; + myaddr = ia ? ia->ia_addr.sin_addr : maybe_ia->ia_addr.sin_addr; + if (!bcmp((caddr_t)ea->arp_sha, (caddr_t)ac->ac_enaddr, + sizeof (ea->arp_sha))) + goto out; /* it's from me, ignore it. */ + if (!bcmp((caddr_t)ea->arp_sha, (caddr_t)etherbroadcastaddr, + sizeof (ea->arp_sha))) { + log(LOG_ERR, + "arp: ether address is broadcast for IP address %x!\n", + ntohl(isaddr.s_addr)); + goto out; + } + if (isaddr.s_addr == myaddr.s_addr) { + log(LOG_ERR, + "duplicate IP address %x!! 
sent from ethernet address: %s\n", + ntohl(isaddr.s_addr), ether_sprintf(ea->arp_sha)); + itaddr = myaddr; + goto reply; + } + la = arplookup(isaddr.s_addr, itaddr.s_addr == myaddr.s_addr, 0); + if (la && (rt = la->la_rt) && (sdl = SDL(rt->rt_gateway))) { + if (sdl->sdl_alen && + bcmp((caddr_t)ea->arp_sha, LLADDR(sdl), sdl->sdl_alen)) + log(LOG_INFO, "arp info overwritten for %x by %s\n", + isaddr.s_addr, ether_sprintf(ea->arp_sha)); + bcopy((caddr_t)ea->arp_sha, LLADDR(sdl), + sdl->sdl_alen = sizeof(ea->arp_sha)); + if (rt->rt_expire) + rt->rt_expire = time.tv_sec + arpt_keep; + rt->rt_flags &= ~RTF_REJECT; + la->la_asked = 0; + if (la->la_hold) { + (*ac->ac_if.if_output)(&ac->ac_if, la->la_hold, + rt_key(rt), rt); + la->la_hold = 0; + } + } +reply: + if (op != ARPOP_REQUEST) { + out: + m_freem(m); + return; + } + if (itaddr.s_addr == myaddr.s_addr) { + /* I am the target */ + bcopy((caddr_t)ea->arp_sha, (caddr_t)ea->arp_tha, + sizeof(ea->arp_sha)); + bcopy((caddr_t)ac->ac_enaddr, (caddr_t)ea->arp_sha, + sizeof(ea->arp_sha)); + } else { + la = arplookup(itaddr.s_addr, 0, SIN_PROXY); + if (la == NULL) + goto out; + rt = la->la_rt; + bcopy((caddr_t)ea->arp_sha, (caddr_t)ea->arp_tha, + sizeof(ea->arp_sha)); + sdl = SDL(rt->rt_gateway); + bcopy(LLADDR(sdl), (caddr_t)ea->arp_sha, sizeof(ea->arp_sha)); + } + + bcopy((caddr_t)ea->arp_spa, (caddr_t)ea->arp_tpa, sizeof(ea->arp_spa)); + bcopy((caddr_t)&itaddr, (caddr_t)ea->arp_spa, sizeof(ea->arp_spa)); + ea->arp_op = htons(ARPOP_REPLY); + ea->arp_pro = htons(ETHERTYPE_IP); /* let's be sure! */ + eh = (struct ether_header *)sa.sa_data; + bcopy((caddr_t)ea->arp_tha, (caddr_t)eh->ether_dhost, + sizeof(eh->ether_dhost)); + eh->ether_type = ETHERTYPE_ARP; + sa.sa_family = AF_UNSPEC; + sa.sa_len = sizeof(sa); + (*ac->ac_if.if_output)(&ac->ac_if, m, &sa, (struct rtentry *)0); + return; +} + +/* + * Free an arp entry. 
+ */ +static void +arptfree(la) + register struct llinfo_arp *la; +{ + register struct rtentry *rt = la->la_rt; + register struct sockaddr_dl *sdl; + if (rt == 0) + panic("arptfree"); + /* + * While the route is still referenced, only invalidate the cached + * link-level address and reset the request state; otherwise + * delete the route itself. + */ + if (rt->rt_refcnt > 0 && (sdl = SDL(rt->rt_gateway)) && + sdl->sdl_family == AF_LINK) { + sdl->sdl_alen = 0; + la->la_asked = 0; + rt->rt_flags &= ~RTF_REJECT; + return; + } + rtrequest(RTM_DELETE, rt_key(rt), (struct sockaddr *)0, rt_mask(rt), + 0, (struct rtentry **)0); +} +/* + * Lookup or enter a new address in arptab. + * Returns the llinfo_arp hanging off the route found for addr, or 0 when + * rtalloc1 finds no route or the route is not a link-level entry + * (RTF_GATEWAY set, RTF_LLINFO clear, or gateway not AF_LINK). + * The static sockaddr is reused on every call. + */ +static struct llinfo_arp * +arplookup(addr, create, proxy) + u_long addr; + int create, proxy; +{ + register struct rtentry *rt; + static struct sockaddr_inarp sin = {sizeof(sin), AF_INET }; + + sin.sin_addr.s_addr = addr; + sin.sin_other = proxy ? SIN_PROXY : 0; + rt = rtalloc1((struct sockaddr *)&sin, create); + if (rt == 0) + return (0); + /* NOTE(review): presumably gives back the reference taken by rtalloc1 -- confirm */ + rt->rt_refcnt--; + if ((rt->rt_flags & RTF_GATEWAY) || (rt->rt_flags & RTF_LLINFO) == 0 || + rt->rt_gateway->sa_family != AF_LINK) { + if (create) + log(LOG_DEBUG, "arptnew failed on %x\n", ntohl(addr)); + return (0); + } + return ((struct llinfo_arp *)rt->rt_llinfo); +} + +/* + * ARP ioctl entry point; both arguments are ignored and EOPNOTSUPP + * is always returned. + */ +int +arpioctl(cmd, data) + int cmd; + caddr_t data; +{ + return (EOPNOTSUPP); +} diff --git a/sys/netinet/if_ether.h b/sys/netinet/if_ether.h new file mode 100644 index 00000000000..6b4def054f7 --- /dev/null +++ b/sys/netinet/if_ether.h @@ -0,0 +1,224 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_ether.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Structure of a 10Mb/s Ethernet header. + */ +struct ether_header { + u_char ether_dhost[6]; + u_char ether_shost[6]; + u_short ether_type; +}; + +#define ETHERTYPE_PUP 0x0200 /* PUP protocol */ +#define ETHERTYPE_IP 0x0800 /* IP protocol */ +#define ETHERTYPE_ARP 0x0806 /* Addr. resolution protocol */ +#define ETHERTYPE_REVARP 0x8035 /* reverse Addr. 
resolution protocol */ + +/* + * The ETHERTYPE_NTRAILER packet types starting at ETHERTYPE_TRAIL have + * (type-ETHERTYPE_TRAIL)*512 bytes of data followed + * by an ETHER type (as given above) and then the (variable-length) header. + */ +#define ETHERTYPE_TRAIL 0x1000 /* Trailer packet */ +#define ETHERTYPE_NTRAILER 16 + +#define ETHERMTU 1500 +#define ETHERMIN (60-14) + +#ifdef KERNEL +/* + * Macro to map an IP multicast address to an Ethernet multicast address. + * The high-order 25 bits of the Ethernet address are statically assigned, + * and the low-order 23 bits are taken from the low end of the IP address. + */ +#define ETHER_MAP_IP_MULTICAST(ipaddr, enaddr) \ + /* struct in_addr *ipaddr; */ \ + /* u_char enaddr[6]; */ \ +{ \ + (enaddr)[0] = 0x01; \ + (enaddr)[1] = 0x00; \ + (enaddr)[2] = 0x5e; \ + (enaddr)[3] = ((u_char *)ipaddr)[1] & 0x7f; \ + (enaddr)[4] = ((u_char *)ipaddr)[2]; \ + (enaddr)[5] = ((u_char *)ipaddr)[3]; \ +} +#endif + +/* + * Ethernet Address Resolution Protocol. + * + * See RFC 826 for protocol description. Structure below is adapted + * to resolving internet addresses. Field names used correspond to + * RFC 826. + */ +struct ether_arp { + struct arphdr ea_hdr; /* fixed-size header */ + u_char arp_sha[6]; /* sender hardware address */ + u_char arp_spa[4]; /* sender protocol address */ + u_char arp_tha[6]; /* target hardware address */ + u_char arp_tpa[4]; /* target protocol address */ +}; +#define arp_hrd ea_hdr.ar_hrd +#define arp_pro ea_hdr.ar_pro +#define arp_hln ea_hdr.ar_hln +#define arp_pln ea_hdr.ar_pln +#define arp_op ea_hdr.ar_op + + +/* + * Structure shared between the ethernet driver modules and + * the address resolution code. For example, each ec_softc or il_softc + * begins with this structure. 
+ */ +struct arpcom { + struct ifnet ac_if; /* network-visible interface */ + u_char ac_enaddr[6]; /* ethernet hardware address */ + struct in_addr ac_ipaddr; /* copy of ip address- XXX */ + struct ether_multi *ac_multiaddrs; /* list of ether multicast addrs */ + int ac_multicnt; /* length of ac_multiaddrs list */ +}; + +struct llinfo_arp { + struct llinfo_arp *la_next; + struct llinfo_arp *la_prev; + struct rtentry *la_rt; + struct mbuf *la_hold; /* last packet until resolved/timeout */ + long la_asked; /* count of requests sent for this addr; arpresolve compares it with arp_maxtries, reset to 0 on reply */ +#define la_timer la_rt->rt_rmx.rmx_expire /* deletion time in seconds */ +}; + +struct sockaddr_inarp { + u_char sin_len; + u_char sin_family; + u_short sin_port; + struct in_addr sin_addr; + struct in_addr sin_srcaddr; + u_short sin_tos; + u_short sin_other; +#define SIN_PROXY 1 +}; +/* + * IP and ethernet specific routing flags + */ +#define RTF_USETRAILERS RTF_PROTO1 /* use trailers */ +#define RTF_ANNOUNCE RTF_PROTO2 /* announce new arp entry */ + +#ifdef KERNEL +u_char etherbroadcastaddr[6]; +u_char ether_ipmulticast_min[6]; +u_char ether_ipmulticast_max[6]; +struct ifqueue arpintrq; + +struct llinfo_arp *arptnew __P((struct in_addr *)); +struct llinfo_arp llinfo_arp; /* head of the llinfo queue */ + +void arpwhohas __P((struct arpcom *, struct in_addr *)); +void arpintr __P((void)); +int arpresolve __P((struct arpcom *, + struct rtentry *, struct mbuf *, struct sockaddr *, u_char *)); +void arp_rtrequest __P((int, struct rtentry *, struct sockaddr *)); +void arpwhohas __P((struct arpcom *, struct in_addr *)); + +int ether_addmulti __P((struct ifreq *, struct arpcom *)); +int ether_delmulti __P((struct ifreq *, struct arpcom *)); + +/* + * Ethernet multicast address structure. There is one of these for each + * multicast address or range of multicast addresses that we are supposed + * to listen to on a particular interface. They are kept in a linked list, + * rooted in the interface's arpcom structure. 
(This really has nothing to + * do with ARP, or with the Internet address family, but this appears to be + * the minimally-disrupting place to put it.) + */ +struct ether_multi { + u_char enm_addrlo[6]; /* low or only address of range */ + u_char enm_addrhi[6]; /* high or only address of range */ + struct arpcom *enm_ac; /* back pointer to arpcom */ + u_int enm_refcount; /* no. claims to this addr/range */ + struct ether_multi *enm_next; /* ptr to next ether_multi */ +}; + +/* + * Structure used by macros below to remember position when stepping through + * all of the ether_multi records. + */ +struct ether_multistep { + struct ether_multi *e_enm; +}; + +/* + * Macro for looking up the ether_multi record for a given range of Ethernet + * multicast addresses connected to a given arpcom structure. If no matching + * record is found, "enm" returns NULL. + * The for statement has an empty body: the search is done entirely by its + * condition, which stops at the first record whose low and high addresses + * both compare equal to the ones given. + */ +#define ETHER_LOOKUP_MULTI(addrlo, addrhi, ac, enm) \ + /* u_char addrlo[6]; */ \ + /* u_char addrhi[6]; */ \ + /* struct arpcom *ac; */ \ + /* struct ether_multi *enm; */ \ +{ \ + for ((enm) = (ac)->ac_multiaddrs; \ + (enm) != NULL && \ + (bcmp((enm)->enm_addrlo, (addrlo), 6) != 0 || \ + bcmp((enm)->enm_addrhi, (addrhi), 6) != 0); \ + (enm) = (enm)->enm_next); \ +} + +/* + * Macro to step through all of the ether_multi records, one at a time. + * The current position is remembered in "step", which the caller must + * provide. ETHER_FIRST_MULTI(), below, must be called to initialize "step" + * and get the first record. Both macros return a NULL "enm" when there + * are no remaining records. 
+ */ +#define ETHER_NEXT_MULTI(step, enm) \ + /* struct ether_multistep step; */ \ + /* struct ether_multi *enm; */ \ +{ \ + if (((enm) = (step).e_enm) != NULL) \ + (step).e_enm = (enm)->enm_next; \ +} + +#define ETHER_FIRST_MULTI(step, ac, enm) \ + /* struct ether_multistep step; */ \ + /* struct arpcom *ac; */ \ + /* struct ether_multi *enm; */ \ +{ \ + (step).e_enm = (ac)->ac_multiaddrs; \ + ETHER_NEXT_MULTI((step), (enm)); \ +} + +#endif diff --git a/sys/netinet/igmp.c b/sys/netinet/igmp.c new file mode 100644 index 00000000000..78b426c49ea --- /dev/null +++ b/sys/netinet/igmp.c @@ -0,0 +1,313 @@ +/* + * Copyright (c) 1988 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)igmp.c 8.1 (Berkeley) 7/19/93 + */ + +/* Internet Group Management Protocol (IGMP) routines. */ + + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +extern struct ifnet loif; + +static int igmp_timers_are_running = 0; +static u_long igmp_all_hosts_group; + +static void igmp_sendreport __P((struct in_multi *)); + +void +igmp_init() +{ + /* + * To avoid byte-swapping the same value over and over again. 
+ */ + igmp_all_hosts_group = htonl(INADDR_ALLHOSTS_GROUP); +} + +/* + * Process a received IGMP message. The mbuf data starts at the IP + * header and iphlen is the length of that header in bytes. Packets that + * fail validation are counted and freed here; everything else is handed + * to rip_input() at the end. + */ +void +igmp_input(m, iphlen) + register struct mbuf *m; + register int iphlen; +{ + register struct igmp *igmp; + register struct ip *ip; + register int igmplen; + register struct ifnet *ifp = m->m_pkthdr.rcvif; + register int minlen; + register struct in_multi *inm; + register struct in_ifaddr *ia; + struct in_multistep step; + + ++igmpstat.igps_rcv_total; + + ip = mtod(m, struct ip *); + /* NOTE(review): ip_len is used as the IGMP message length, so it presumably excludes the IP header by now -- confirm against the caller */ + igmplen = ip->ip_len; + + /* + * Validate lengths + */ + if (igmplen < IGMP_MINLEN) { + ++igmpstat.igps_rcv_tooshort; + m_freem(m); + return; + } + minlen = iphlen + IGMP_MINLEN; + if ((m->m_flags & M_EXT || m->m_len < minlen) && + (m = m_pullup(m, minlen)) == 0) { + ++igmpstat.igps_rcv_tooshort; + return; + } + + /* + * Validate checksum + * (m_data and m_len are stepped past the IP header so that in_cksum + * sums the IGMP message only, and are put back afterwards) + */ + m->m_data += iphlen; + m->m_len -= iphlen; + igmp = mtod(m, struct igmp *); + if (in_cksum(m, igmplen)) { + ++igmpstat.igps_rcv_badsum; + m_freem(m); + return; + } + m->m_data -= iphlen; + m->m_len += iphlen; + ip = mtod(m, struct ip *); + + switch (igmp->igmp_type) { + + case IGMP_HOST_MEMBERSHIP_QUERY: + ++igmpstat.igps_rcv_queries; + + if (ifp == &loif) + break; + + if (ip->ip_dst.s_addr != igmp_all_hosts_group) { + ++igmpstat.igps_rcv_badqueries; + m_freem(m); + return; + } + + /* + * Start the timers in all of our membership records for + * the interface on which the query arrived, except those + * that are already running and those that belong to the + * "all-hosts" group. 
+ */ + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + if (inm->inm_ifp == ifp && inm->inm_timer == 0 && + inm->inm_addr.s_addr != igmp_all_hosts_group) { + inm->inm_timer = + IGMP_RANDOM_DELAY(inm->inm_addr); + igmp_timers_are_running = 1; + } + IN_NEXT_MULTI(step, inm); + } + + break; + + case IGMP_HOST_MEMBERSHIP_REPORT: + ++igmpstat.igps_rcv_reports; + + if (ifp == &loif) + break; + + if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || + igmp->igmp_group.s_addr != ip->ip_dst.s_addr) { + ++igmpstat.igps_rcv_badreports; + m_freem(m); + return; + } + + /* + * KLUDGE: if the IP source address of the report has an + * unspecified (i.e., zero) subnet number, as is allowed for + * a booting host, replace it with the correct subnet number + * so that a process-level multicast routing demon can + * determine which subnet it arrived from. This is necessary + * to compensate for the lack of any way for a process to + * determine the arrival interface of an incoming packet. + */ + if ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) == 0) { + IFP_TO_IA(ifp, ia); + if (ia) ip->ip_src.s_addr = htonl(ia->ia_subnet); + } + + /* + * If we belong to the group being reported, stop + * our timer for that group. + */ + IN_LOOKUP_MULTI(igmp->igmp_group, ifp, inm); + if (inm != NULL) { + inm->inm_timer = 0; + ++igmpstat.igps_rcv_ourreports; + } + + break; + } + + /* + * Pass all valid IGMP packets up to any process(es) listening + * on a raw IGMP socket. + */ + rip_input(m); +} + +/* + * Called when a group is joined: send a membership report at once and + * arm the group's timer with a random delay. No report is sent for the + * all-hosts group or for memberships on the loopback interface. + */ +void +igmp_joingroup(inm) + struct in_multi *inm; +{ + register int s = splnet(); + + if (inm->inm_addr.s_addr == igmp_all_hosts_group || + inm->inm_ifp == &loif) + inm->inm_timer = 0; + else { + igmp_sendreport(inm); + inm->inm_timer = IGMP_RANDOM_DELAY(inm->inm_addr); + igmp_timers_are_running = 1; + } + splx(s); +} + +void +igmp_leavegroup(inm) + struct in_multi *inm; +{ + /* + * No action required on leaving a group. 
+ */ +} + +void +igmp_fasttimo() +{ + register struct in_multi *inm; + register int s; + struct in_multistep step; + + /* + * Quick check to see if any work needs to be done, in order + * to minimize the overhead of fasttimo processing. + */ + if (!igmp_timers_are_running) + return; + + s = splnet(); + igmp_timers_are_running = 0; + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + if (inm->inm_timer == 0) { + /* do nothing */ + } else if (--inm->inm_timer == 0) { + igmp_sendreport(inm); + } else { + igmp_timers_are_running = 1; + } + IN_NEXT_MULTI(step, inm); + } + splx(s); +} + +static void +igmp_sendreport(inm) + register struct in_multi *inm; +{ + register struct mbuf *m; + register struct igmp *igmp; + register struct ip *ip; + register struct ip_moptions *imo; + struct ip_moptions simo; + + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) + return; + /* + * Assume max_linkhdr + sizeof(struct ip) + IGMP_MINLEN + * is smaller than mbuf size returned by MGETHDR. + */ + m->m_data += max_linkhdr; + m->m_len = sizeof(struct ip) + IGMP_MINLEN; + m->m_pkthdr.len = sizeof(struct ip) + IGMP_MINLEN; + + ip = mtod(m, struct ip *); + ip->ip_tos = 0; + ip->ip_len = sizeof(struct ip) + IGMP_MINLEN; + ip->ip_off = 0; + ip->ip_p = IPPROTO_IGMP; + ip->ip_src.s_addr = INADDR_ANY; + ip->ip_dst = inm->inm_addr; + + igmp = (struct igmp *)(ip + 1); + igmp->igmp_type = IGMP_HOST_MEMBERSHIP_REPORT; + igmp->igmp_code = 0; + igmp->igmp_group = inm->inm_addr; + igmp->igmp_cksum = 0; + igmp->igmp_cksum = in_cksum(m, IGMP_MINLEN); + + imo = &simo; + bzero((caddr_t)imo, sizeof(*imo)); + imo->imo_multicast_ifp = inm->inm_ifp; + imo->imo_multicast_ttl = 1; + /* + * Request loopback of the report if we are acting as a multicast + * router, so that the process-level routing demon can hear it. 
+ */ +#ifdef MROUTING + { + extern struct socket *ip_mrouter; + imo->imo_multicast_loop = (ip_mrouter != NULL); + } +#endif + ip_output(m, NULL, NULL, 0, imo); + + ++igmpstat.igps_snd_reports; +} diff --git a/sys/netinet/igmp.h b/sys/netinet/igmp.h new file mode 100644 index 00000000000..29ce21dee6f --- /dev/null +++ b/sys/netinet/igmp.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 1988 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)igmp.h 8.1 (Berkeley) 6/10/93 + */ + +/* Internet Group Management Protocol (IGMP) definitions. */ + +/* + * IGMP packet format. + */ +struct igmp { + u_char igmp_type; /* version & type of IGMP message */ + u_char igmp_code; /* unused, should be zero */ + u_short igmp_cksum; /* IP-style checksum */ + struct in_addr igmp_group; /* group address being reported */ +}; /* (zero for queries) */ + +#define IGMP_MINLEN 8 + +#define IGMP_HOST_MEMBERSHIP_QUERY 0x11 /* message types, incl. version */ +#define IGMP_HOST_MEMBERSHIP_REPORT 0x12 +#define IGMP_DVMRP 0x13 /* for experimental multicast */ + /* routing protocol */ + +#define IGMP_MAX_HOST_REPORT_DELAY 10 /* max delay for response to */ diff --git a/sys/netinet/igmp_var.h b/sys/netinet/igmp_var.h new file mode 100644 index 00000000000..ff70f70e2b3 --- /dev/null +++ b/sys/netinet/igmp_var.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 1988 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)igmp_var.h 8.1 (Berkeley) 7/19/93 + */ + +/* + * Internet Group Management Protocol (IGMP), + * implementation-specific definitions. + * + * Written by Steve Deering, Stanford, May 1988. 
+ * + * MULTICAST 1.1 + */ + +struct igmpstat { + u_long igps_rcv_total; /* total IGMP messages received */ + u_long igps_rcv_tooshort; /* received with too few bytes */ + u_long igps_rcv_badsum; /* received with bad checksum */ + u_long igps_rcv_queries; /* received membership queries */ + u_long igps_rcv_badqueries; /* received invalid queries */ + u_long igps_rcv_reports; /* received membership reports */ + u_long igps_rcv_badreports; /* received invalid reports */ + u_long igps_rcv_ourreports; /* received reports for our groups */ + u_long igps_snd_reports; /* sent membership reports */ +}; + +#ifdef KERNEL +struct igmpstat igmpstat; + +/* + * Macro to compute a random timer value between 1 and + * (IGMP_MAX_HOST_REPORT_DELAY * countdown frequency). We generate a + * "random" number by adding + * the total number of IP packets received, our primary IP address, and the + * multicast address being timed-out. The 4.3 random() routine really + * ought to be available in the kernel! + */ +#define IGMP_RANDOM_DELAY(multiaddr) \ + /* struct in_addr multiaddr; */ \ + ( (ipstat.ips_total + \ + ntohl(IA_SIN(in_ifaddr)->sin_addr.s_addr) + \ + ntohl((multiaddr).s_addr) \ + ) \ + % (IGMP_MAX_HOST_REPORT_DELAY * PR_FASTHZ) + 1 \ + ) + +void igmp_init __P(()); +void igmp_input __P((struct mbuf *, int)); +void igmp_joingroup __P((struct in_multi *)); +void igmp_leavegroup __P((struct in_multi *)); +void igmp_fasttimo __P(()); +#endif diff --git a/sys/netinet/in.c b/sys/netinet/in.c new file mode 100644 index 00000000000..e8b481b4005 --- /dev/null +++ b/sys/netinet/in.c @@ -0,0 +1,622 @@ +/* + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in.c 8.2 (Berkeley) 11/15/93 + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#ifdef INET +/* + * Return the network number from an internet address. 
+ */ +u_long +in_netof(in) + struct in_addr in; +{ + register u_long i = ntohl(in.s_addr); + register u_long net; + register struct in_ifaddr *ia; + + if (IN_CLASSA(i)) + net = i & IN_CLASSA_NET; + else if (IN_CLASSB(i)) + net = i & IN_CLASSB_NET; + else if (IN_CLASSC(i)) + net = i & IN_CLASSC_NET; + else if (IN_CLASSD(i)) + net = i & IN_CLASSD_NET; + else + return (0); + + /* + * Check whether network is a subnet; + * if so, return subnet number. + */ + for (ia = in_ifaddr; ia; ia = ia->ia_next) + if (net == ia->ia_net) + return (i & ia->ia_subnetmask); + return (net); +} + +#ifndef SUBNETSARELOCAL +#define SUBNETSARELOCAL 1 +#endif +int subnetsarelocal = SUBNETSARELOCAL; +/* + * Return 1 if an internet address is for a ``local'' host + * (one to which we have a connection). If subnetsarelocal + * is true, this includes other subnets of the local net. + * Otherwise, it includes only the directly-connected (sub)nets. + */ +in_localaddr(in) + struct in_addr in; +{ + register u_long i = ntohl(in.s_addr); + register struct in_ifaddr *ia; + + if (subnetsarelocal) { + for (ia = in_ifaddr; ia; ia = ia->ia_next) + if ((i & ia->ia_netmask) == ia->ia_net) + return (1); + } else { + for (ia = in_ifaddr; ia; ia = ia->ia_next) + if ((i & ia->ia_subnetmask) == ia->ia_subnet) + return (1); + } + return (0); +} + +/* + * Determine whether an IP address is in a reserved set of addresses + * that may not be forwarded, or whether datagrams to that destination + * may be forwarded. 
+ */ +in_canforward(in) + struct in_addr in; +{ + register u_long i = ntohl(in.s_addr); + register u_long net; + + if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i)) + return (0); + if (IN_CLASSA(i)) { + net = i & IN_CLASSA_NET; + if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT)) + return (0); + } + return (1); +} + +/* + * Trim a mask in a sockaddr + */ +void +in_socktrim(ap) +struct sockaddr_in *ap; +{ + register char *cplim = (char *) &ap->sin_addr; + register char *cp = (char *) (&ap->sin_addr + 1); + + ap->sin_len = 0; + while (--cp > cplim) + if (*cp) { + (ap)->sin_len = cp - (char *) (ap) + 1; + break; + } +} + +int in_interfaces; /* number of external internet interfaces */ +extern struct ifnet loif; + +/* + * Generic internet control operations (ioctl's). + * Ifp is 0 if not an interface-specific ioctl. + */ +/* ARGSUSED */ +in_control(so, cmd, data, ifp) + struct socket *so; + int cmd; + caddr_t data; + register struct ifnet *ifp; +{ + register struct ifreq *ifr = (struct ifreq *)data; + register struct in_ifaddr *ia = 0; + register struct ifaddr *ifa; + struct in_ifaddr *oia; + struct in_aliasreq *ifra = (struct in_aliasreq *)data; + struct sockaddr_in oldaddr; + int error, hostIsNew, maskIsNew; + u_long i; + + /* + * Find address for this interface, if it exists. 
+ */ + if (ifp) + for (ia = in_ifaddr; ia; ia = ia->ia_next) + if (ia->ia_ifp == ifp) + break; + + switch (cmd) { + + case SIOCAIFADDR: + case SIOCDIFADDR: + if (ifra->ifra_addr.sin_family == AF_INET) + for (oia = ia; ia; ia = ia->ia_next) { + if (ia->ia_ifp == ifp && + ia->ia_addr.sin_addr.s_addr == + ifra->ifra_addr.sin_addr.s_addr) + break; + } + if (cmd == SIOCDIFADDR && ia == 0) + return (EADDRNOTAVAIL); + /* FALLTHROUGH */ + case SIOCSIFADDR: + case SIOCSIFNETMASK: + case SIOCSIFDSTADDR: + if ((so->so_state & SS_PRIV) == 0) + return (EPERM); + + if (ifp == 0) + panic("in_control"); + if (ia == (struct in_ifaddr *)0) { + oia = (struct in_ifaddr *) + malloc(sizeof *oia, M_IFADDR, M_WAITOK); + if (oia == (struct in_ifaddr *)NULL) + return (ENOBUFS); + bzero((caddr_t)oia, sizeof *oia); + if (ia = in_ifaddr) { + for ( ; ia->ia_next; ia = ia->ia_next) + continue; + ia->ia_next = oia; + } else + in_ifaddr = oia; + ia = oia; + if (ifa = ifp->if_addrlist) { + for ( ; ifa->ifa_next; ifa = ifa->ifa_next) + continue; + ifa->ifa_next = (struct ifaddr *) ia; + } else + ifp->if_addrlist = (struct ifaddr *) ia; + ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; + ia->ia_ifa.ifa_dstaddr + = (struct sockaddr *)&ia->ia_dstaddr; + ia->ia_ifa.ifa_netmask + = (struct sockaddr *)&ia->ia_sockmask; + ia->ia_sockmask.sin_len = 8; + if (ifp->if_flags & IFF_BROADCAST) { + ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr); + ia->ia_broadaddr.sin_family = AF_INET; + } + ia->ia_ifp = ifp; + if (ifp != &loif) + in_interfaces++; + } + break; + + case SIOCSIFBRDADDR: + if ((so->so_state & SS_PRIV) == 0) + return (EPERM); + /* FALLTHROUGH */ + + case SIOCGIFADDR: + case SIOCGIFNETMASK: + case SIOCGIFDSTADDR: + case SIOCGIFBRDADDR: + if (ia == (struct in_ifaddr *)0) + return (EADDRNOTAVAIL); + break; + } + switch (cmd) { + + case SIOCGIFADDR: + *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_addr; + break; + + case SIOCGIFBRDADDR: + if ((ifp->if_flags & IFF_BROADCAST) == 0) + return 
(EINVAL); + *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_broadaddr; + break; + + case SIOCGIFDSTADDR: + if ((ifp->if_flags & IFF_POINTOPOINT) == 0) + return (EINVAL); + *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_dstaddr; + break; + + case SIOCGIFNETMASK: + *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_sockmask; + break; + + case SIOCSIFDSTADDR: + if ((ifp->if_flags & IFF_POINTOPOINT) == 0) + return (EINVAL); + oldaddr = ia->ia_dstaddr; + ia->ia_dstaddr = *(struct sockaddr_in *)&ifr->ifr_dstaddr; + if (ifp->if_ioctl && (error = (*ifp->if_ioctl) + (ifp, SIOCSIFDSTADDR, (caddr_t)ia))) { + ia->ia_dstaddr = oldaddr; + return (error); + } + if (ia->ia_flags & IFA_ROUTE) { + ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&oldaddr; + rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); + ia->ia_ifa.ifa_dstaddr = + (struct sockaddr *)&ia->ia_dstaddr; + rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_HOST|RTF_UP); + } + break; + + case SIOCSIFBRDADDR: + if ((ifp->if_flags & IFF_BROADCAST) == 0) + return (EINVAL); + ia->ia_broadaddr = *(struct sockaddr_in *)&ifr->ifr_broadaddr; + break; + + case SIOCSIFADDR: + return (in_ifinit(ifp, ia, + (struct sockaddr_in *) &ifr->ifr_addr, 1)); + + case SIOCSIFNETMASK: + i = ifra->ifra_addr.sin_addr.s_addr; + ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr = i); + break; + + case SIOCAIFADDR: + maskIsNew = 0; + hostIsNew = 1; + error = 0; + if (ia->ia_addr.sin_family == AF_INET) { + if (ifra->ifra_addr.sin_len == 0) { + ifra->ifra_addr = ia->ia_addr; + hostIsNew = 0; + } else if (ifra->ifra_addr.sin_addr.s_addr == + ia->ia_addr.sin_addr.s_addr) + hostIsNew = 0; + } + if (ifra->ifra_mask.sin_len) { + in_ifscrub(ifp, ia); + ia->ia_sockmask = ifra->ifra_mask; + ia->ia_subnetmask = + ntohl(ia->ia_sockmask.sin_addr.s_addr); + maskIsNew = 1; + } + if ((ifp->if_flags & IFF_POINTOPOINT) && + (ifra->ifra_dstaddr.sin_family == AF_INET)) { + in_ifscrub(ifp, ia); + ia->ia_dstaddr = ifra->ifra_dstaddr; + maskIsNew = 1; /* We lie; but 
the effect's the same */ + } + if (ifra->ifra_addr.sin_family == AF_INET && + (hostIsNew || maskIsNew)) + error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0); + if ((ifp->if_flags & IFF_BROADCAST) && + (ifra->ifra_broadaddr.sin_family == AF_INET)) + ia->ia_broadaddr = ifra->ifra_broadaddr; + return (error); + + case SIOCDIFADDR: + in_ifscrub(ifp, ia); + if ((ifa = ifp->if_addrlist) == (struct ifaddr *)ia) + ifp->if_addrlist = ifa->ifa_next; + else { + while (ifa->ifa_next && + (ifa->ifa_next != (struct ifaddr *)ia)) + ifa = ifa->ifa_next; + if (ifa->ifa_next) + ifa->ifa_next = ((struct ifaddr *)ia)->ifa_next; + else + printf("Couldn't unlink inifaddr from ifp\n"); + } + oia = ia; + if (oia == (ia = in_ifaddr)) + in_ifaddr = ia->ia_next; + else { + while (ia->ia_next && (ia->ia_next != oia)) + ia = ia->ia_next; + if (ia->ia_next) + ia->ia_next = oia->ia_next; + else + printf("Didn't unlink inifadr from list\n"); + } + IFAFREE((&oia->ia_ifa)); + break; + + default: + if (ifp == 0 || ifp->if_ioctl == 0) + return (EOPNOTSUPP); + return ((*ifp->if_ioctl)(ifp, cmd, data)); + } + return (0); +} + +/* + * Delete any existing route for an interface. + */ +void +in_ifscrub(ifp, ia) + register struct ifnet *ifp; + register struct in_ifaddr *ia; +{ + + if ((ia->ia_flags & IFA_ROUTE) == 0) + return; + if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) + rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); + else + rtinit(&(ia->ia_ifa), (int)RTM_DELETE, 0); + ia->ia_flags &= ~IFA_ROUTE; +} + +/* + * Initialize an interface's internet address + * and routing table entry. 
+ */
+in_ifinit(ifp, ia, sin, scrub)
+	register struct ifnet *ifp;
+	register struct in_ifaddr *ia;
+	struct sockaddr_in *sin;
+	int scrub;
+{
+	register u_long i = ntohl(sin->sin_addr.s_addr);
+	struct sockaddr_in oldaddr;
+	int s = splimp(), flags = RTF_UP, error, ether_output();
+	/*
+	 * Install the new address first and remember the old one, so it
+	 * can be restored if the driver rejects the change and, when
+	 * scrub is nonzero, so the route to the old address can be removed.
+	 * Returns 0 or an error from the driver's ioctl or from rtinit().
+	 */
+	oldaddr = ia->ia_addr;
+	ia->ia_addr = *sin;
+	/*
+	 * Give the interface a chance to initialize
+	 * if this is its first address,
+	 * and to validate the address if necessary.
+	 */
+	if (ifp->if_ioctl &&
+	    (error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia))) {
+		splx(s);
+		ia->ia_addr = oldaddr;
+		return (error);
+	}
+	if (ifp->if_output == ether_output) { /* XXX: Another Kludge */
+		ia->ia_ifa.ifa_rtrequest = arp_rtrequest;
+		ia->ia_ifa.ifa_flags |= RTF_CLONING;
+	}
+	splx(s);
+	if (scrub) {
+		/* in_ifscrub() must see the old address while deleting. */
+		ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr;
+		in_ifscrub(ifp, ia);
+		ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr;
+	}
+	if (IN_CLASSA(i))
+		ia->ia_netmask = IN_CLASSA_NET;
+	else if (IN_CLASSB(i))
+		ia->ia_netmask = IN_CLASSB_NET;
+	else
+		ia->ia_netmask = IN_CLASSC_NET;
+	/*
+	 * The subnet mask usually includes at least the standard network part,
+	 * but may be smaller in the case of supernetting.
+	 * If it is set, we believe it.
+	 */
+	if (ia->ia_subnetmask == 0) {
+		ia->ia_subnetmask = ia->ia_netmask;
+		ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask);
+	} else
+		ia->ia_netmask &= ia->ia_subnetmask;
+	ia->ia_net = i & ia->ia_netmask;
+	ia->ia_subnet = i & ia->ia_subnetmask;
+	in_socktrim(&ia->ia_sockmask);
+	/*
+	 * Add route for the network.
+	 * Broadcast interfaces get a network route and have their
+	 * broadcast addresses derived from the masks; loopback and
+	 * point-to-point interfaces get a host route.  A point-to-point
+	 * interface without an AF_INET destination returns 0 here, before
+	 * any route is added.
+	 */
+	ia->ia_ifa.ifa_metric = ifp->if_metric;
+	if (ifp->if_flags & IFF_BROADCAST) {
+		ia->ia_broadaddr.sin_addr.s_addr =
+			htonl(ia->ia_subnet | ~ia->ia_subnetmask);
+		ia->ia_netbroadcast.s_addr =
+			htonl(ia->ia_net | ~ ia->ia_netmask);
+	} else if (ifp->if_flags & IFF_LOOPBACK) {
+		ia->ia_ifa.ifa_dstaddr = ia->ia_ifa.ifa_addr;
+		flags |= RTF_HOST;
+	} else if (ifp->if_flags & IFF_POINTOPOINT) {
+		if (ia->ia_dstaddr.sin_family != AF_INET)
+			return (0);
+		flags |= RTF_HOST;
+	}
+	if ((error = rtinit(&(ia->ia_ifa), (int)RTM_ADD, flags)) == 0)
+		ia->ia_flags |= IFA_ROUTE;
+	/*
+	 * If the interface supports multicast, join the "all hosts"
+	 * multicast group on that interface.
+	 * The result of in_addmulti() is not checked.
+	 */
+	if (ifp->if_flags & IFF_MULTICAST) {
+		struct in_addr addr;
+
+		addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP);
+		in_addmulti(&addr, ifp);
+	}
+	return (error);
+}
+
+
+/*
+ * Return 1 if the address might be a local broadcast address.
+ *
+ * INADDR_BROADCAST and INADDR_ANY always count as broadcast.  Any other
+ * address can only match on an interface with IFF_BROADCAST set, where
+ * it is compared with each AF_INET address's subnet and network
+ * broadcast addresses and with the bare subnet/network numbers.
+ * Returns 0 otherwise.  Defined without a return type (implicit int).
+ */
+in_broadcast(in, ifp)
+	struct in_addr in;
+        struct ifnet *ifp;
+{
+	register struct ifaddr *ifa;
+	u_long t;
+
+	if (in.s_addr == INADDR_BROADCAST ||
+	    in.s_addr == INADDR_ANY)
+		return 1;
+	if ((ifp->if_flags & IFF_BROADCAST) == 0)
+		return 0;
+	t = ntohl(in.s_addr);
+	/*
+	 * Look through the list of addresses for a match
+	 * with a broadcast address.
+	 * "ia" below is a macro that views the current ifa as an
+	 * in_ifaddr; it is only dereferenced after the family check.
+	 */
+#define ia ((struct in_ifaddr *)ifa)
+	for (ifa = ifp->if_addrlist; ifa; ifa = ifa->ifa_next)
+		if (ifa->ifa_addr->sa_family == AF_INET &&
+		    (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr ||
+		     in.s_addr == ia->ia_netbroadcast.s_addr ||
+		     /*
+		      * Check for old-style (host 0) broadcast.
+		      */
+		     t == ia->ia_subnet || t == ia->ia_net))
+			    return 1;
+	return (0);
+#undef ia
+}
+/*
+ * Add an address to the list of IP multicast addresses for a given interface.
+ *
+ * If the group is already on the interface's list, only its reference
+ * count is bumped.  Otherwise a new in_multi record is allocated
+ * (M_NOWAIT), linked onto the interface's in_ifaddr, the driver is told
+ * via SIOCADDMULTI, and IGMP is notified.
+ * Returns the record, or NULL if allocation fails, the interface has no
+ * in_ifaddr, it has no if_ioctl routine, or the driver refuses.
+ * Runs at splnet; the previous level is restored on every return path.
+ */
+struct in_multi *
+in_addmulti(ap, ifp)
+	register struct in_addr *ap;
+	register struct ifnet *ifp;
+{
+	register struct in_multi *inm;
+	struct ifreq ifr;
+	struct in_ifaddr *ia;
+	int s = splnet();
+
+	/*
+	 * See if address already in list.
+	 */
+	IN_LOOKUP_MULTI(*ap, ifp, inm);
+	if (inm != NULL) {
+		/*
+		 * Found it; just increment the reference count.
+		 */
+		++inm->inm_refcount;
+	}
+	else {
+		/*
+		 * New address; allocate a new multicast record
+		 * and link it into the interface's multicast list.
+		 */
+		inm = (struct in_multi *)malloc(sizeof(*inm),
+		    M_IPMADDR, M_NOWAIT);
+		if (inm == NULL) {
+			splx(s);
+			return (NULL);
+		}
+		inm->inm_addr = *ap;
+		inm->inm_ifp = ifp;
+		inm->inm_refcount = 1;
+		IFP_TO_IA(ifp, ia);
+		if (ia == NULL) {
+			free(inm, M_IPMADDR);
+			splx(s);
+			return (NULL);
+		}
+		inm->inm_ia = ia;
+		inm->inm_next = ia->ia_multiaddrs;
+		ia->ia_multiaddrs = inm;
+		/*
+		 * Ask the network driver to update its multicast reception
+		 * filter appropriately for the new address.
+		 * On failure the record is unlinked again and freed.
+		 */
+		((struct sockaddr_in *)&ifr.ifr_addr)->sin_family = AF_INET;
+		((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr = *ap;
+		if ((ifp->if_ioctl == NULL) ||
+		    (*ifp->if_ioctl)(ifp, SIOCADDMULTI,(caddr_t)&ifr) != 0) {
+			ia->ia_multiaddrs = inm->inm_next;
+			free(inm, M_IPMADDR);
+			splx(s);
+			return (NULL);
+		}
+		/*
+		 * Let IGMP know that we have joined a new IP multicast group.
+		 */
+		igmp_joingroup(inm);
+	}
+	splx(s);
+	return (inm);
+}
+
+/*
+ * Delete a multicast address record.
+ *
+ * Drops one reference; when the count reaches zero IGMP is told, the
+ * record is unlinked from its in_ifaddr's list, the driver is notified
+ * with SIOCDELMULTI and the record is freed.
+ * NOTE(review): the function is declared int but has no return
+ * statement, so callers must not use its value.
+ */
+int
+in_delmulti(inm)
+	register struct in_multi *inm;
+{
+	register struct in_multi **p;
+	struct ifreq ifr;
+	int s = splnet();
+
+	if (--inm->inm_refcount == 0) {
+		/*
+		 * No remaining claims to this record; let IGMP know that
+		 * we are leaving the multicast group.
+		 */
+		igmp_leavegroup(inm);
+		/*
+		 * Unlink from list.
+		 * The loop has no end-of-list test: it relies on inm
+		 * being present on ia_multiaddrs.
+		 */
+		for (p = &inm->inm_ia->ia_multiaddrs;
+		     *p != inm;
+		     p = &(*p)->inm_next)
+			 continue;
+		*p = (*p)->inm_next;
+		/*
+		 * Notify the network driver to update its multicast reception
+		 * filter.
+		 */
+		((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET;
+		((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr =
+								inm->inm_addr;
+		(*inm->inm_ifp->if_ioctl)(inm->inm_ifp, SIOCDELMULTI,
+							     (caddr_t)&ifr);
+		free(inm, M_IPMADDR);
+	}
+	splx(s);
+}
+#endif
diff --git a/sys/netinet/in.h b/sys/netinet/in.h
new file mode 100644
index 00000000000..1ce9948f6e3
--- /dev/null
+++ b/sys/netinet/in.h
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 1982, 1986, 1990, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)in.h	8.3 (Berkeley) 1/3/94
+ */
+
+/*
+ * Constants and structures defined by the internet system,
+ * Per RFC 790, September 1981, and numerous additions.
+ */
+
+/*
+ * Protocols
+ * (values of the IP header's protocol field; the table is sparse)
+ */
+#define	IPPROTO_IP		0		/* dummy for IP */
+#define	IPPROTO_ICMP		1		/* control message protocol */
+#define	IPPROTO_IGMP		2		/* group mgmt protocol */
+#define	IPPROTO_GGP		3		/* gateway^2 (deprecated) */
+#define	IPPROTO_TCP		6		/* tcp */
+#define	IPPROTO_EGP		8		/* exterior gateway protocol */
+#define	IPPROTO_PUP		12		/* pup */
+#define	IPPROTO_UDP		17		/* user datagram protocol */
+#define	IPPROTO_IDP		22		/* xns idp */
+#define	IPPROTO_TP		29 		/* tp-4 w/ class negotiation */
+#define	IPPROTO_EON		80		/* ISO clnp */
+#define	IPPROTO_ENCAP		98		/* encapsulation header */
+
+#define	IPPROTO_RAW		255		/* raw IP packet */
+#define	IPPROTO_MAX		256		/* one past the largest protocol number */
+
+
+/*
+ * Local port number conventions:
+ * Ports < IPPORT_RESERVED are reserved for
+ * privileged processes (e.g. root).
+ * Ports > IPPORT_USERRESERVED are reserved
+ * for servers, not necessarily privileged.
+ */
+#define	IPPORT_RESERVED		1024
+#define	IPPORT_USERRESERVED	5000
+
+/*
+ * Internet address (a structure for historical reasons)
+ */
+struct in_addr {
+	u_long s_addr;		/* the address; kernel code applies ntohl() before doing arithmetic on it */
+};
+
+/*
+ * Definitions of bits in internet address integers.
+ * On subnets, the decomposition of addresses to host and net parts
+ * is done according to subnet mask, not the masks here.
+ */
+#define	IN_CLASSA(i)		(((long)(i) & 0x80000000) == 0)	/* top bit 0 */
+#define	IN_CLASSA_NET		0xff000000
+#define	IN_CLASSA_NSHIFT	24
+#define	IN_CLASSA_HOST		0x00ffffff
+#define	IN_CLASSA_MAX		128
+
+#define	IN_CLASSB(i)		(((long)(i) & 0xc0000000) == 0x80000000)	/* top bits 10 */
+#define	IN_CLASSB_NET		0xffff0000
+#define	IN_CLASSB_NSHIFT	16
+#define	IN_CLASSB_HOST		0x0000ffff
+#define	IN_CLASSB_MAX		65536
+
+#define	IN_CLASSC(i)		(((long)(i) & 0xe0000000) == 0xc0000000)	/* top bits 110 */
+#define	IN_CLASSC_NET		0xffffff00
+#define	IN_CLASSC_NSHIFT	8
+#define	IN_CLASSC_HOST		0x000000ff
+
+#define	IN_CLASSD(i)		(((long)(i) & 0xf0000000) == 0xe0000000)	/* top bits 1110 */
+#define	IN_CLASSD_NET		0xf0000000	/* These ones aren't really */
+#define	IN_CLASSD_NSHIFT	28		/* net and host fields, but */
+#define	IN_CLASSD_HOST		0x0fffffff	/* routing needn't know. */
+#define	IN_MULTICAST(i)		IN_CLASSD(i)
+
+#define	IN_EXPERIMENTAL(i)	(((long)(i) & 0xf0000000) == 0xf0000000)	/* top bits 1111 */
+#define	IN_BADCLASS(i)		(((long)(i) & 0xf0000000) == 0xf0000000)	/* same test as IN_EXPERIMENTAL */
+
+#define	INADDR_ANY		(u_long)0x00000000
+#define	INADDR_BROADCAST	(u_long)0xffffffff	/* must be masked */
+#ifndef KERNEL
+#define	INADDR_NONE		0xffffffff		/* -1 return */
+#endif
+
+#define	INADDR_UNSPEC_GROUP	(u_long)0xe0000000	/* 224.0.0.0 */
+#define	INADDR_ALLHOSTS_GROUP	(u_long)0xe0000001	/* 224.0.0.1 */
+#define	INADDR_MAX_LOCAL_GROUP	(u_long)0xe00000ff	/* 224.0.0.255 */
+
+#define	IN_LOOPBACKNET		127			/* official! */
+
+/*
+ * Socket address, internet style.
+ */
+struct sockaddr_in {
+	u_char	sin_len;		/* length in bytes of the valid part of this sockaddr */
+	u_char	sin_family;		/* address family; AF_INET for this structure */
+	u_short	sin_port;		/* port; in_pcbbind applies ntohs() before comparing */
+	struct	in_addr sin_addr;	/* internet address */
+	char	sin_zero[8];		/* trailing 8 bytes; no field use visible in this header */
+};
+
+/*
+ * Structure used to describe IP options.
+ * Used to store options internally, to pass them to a process,
+ * or to restore options retrieved earlier.
+ * The ip_dst is used for the first-hop gateway when using a source route
+ * (this gets put into the header proper).
+ */
+struct ip_opts {
+	struct	in_addr ip_dst;		/* first hop, 0 w/o src rt */
+	char	ip_opts[40];		/* actually variable in size */
+					/* NOTE(review): 40 presumably is the maximum IP option length -- confirm */
+};
+
+/*
+ * Options for use with [gs]etsockopt at the IP level.
+ * First word of comment is data type; bool is stored in int.
+ * Options 9-13 are the IP multicast options; their defaults and limits
+ * are defined further down.
+ */
+#define	IP_OPTIONS		1    /* buf/ip_opts; set/get IP options */
+#define	IP_HDRINCL		2    /* int; header is included with data */
+#define	IP_TOS			3    /* int; IP type of service and preced. */
+#define	IP_TTL			4    /* int; IP time to live */
+#define	IP_RECVOPTS		5    /* bool; receive all IP opts w/dgram */
+#define	IP_RECVRETOPTS		6    /* bool; receive IP opts for response */
+#define	IP_RECVDSTADDR		7    /* bool; receive IP dst addr w/dgram */
+#define	IP_RETOPTS		8    /* ip_opts; set/get IP options */
+#define	IP_MULTICAST_IF		9    /* u_char; set/get IP multicast i/f  */
+#define	IP_MULTICAST_TTL	10   /* u_char; set/get IP multicast ttl */
+#define	IP_MULTICAST_LOOP	11   /* u_char; set/get IP multicast loopback */
+#define	IP_ADD_MEMBERSHIP	12   /* ip_mreq; add an IP group membership */
+#define	IP_DROP_MEMBERSHIP	13   /* ip_mreq; drop an IP group membership */
+
+/*
+ * Defaults and limits for options
+ */
+#define	IP_DEFAULT_MULTICAST_TTL  1	/* normally limit m'casts to 1 hop  */
+#define	IP_DEFAULT_MULTICAST_LOOP 1	/* normally hear sends if a member  */
+#define	IP_MAX_MEMBERSHIPS	20	/* per socket; must fit in one mbuf */
+
+/*
+ * Argument structure for IP_ADD_MEMBERSHIP and IP_DROP_MEMBERSHIP.
+ */
+struct ip_mreq {
+	struct	in_addr imr_multiaddr;	/* IP multicast address of group */
+	struct	in_addr imr_interface;	/* local IP address of interface */
+};
+
+/*
+ * Definitions for inet sysctl operations.
+ *
+ * Third level is protocol number.
+ * Fourth level is desired variable within that protocol.
+ */
+#define	IPPROTO_MAXID	(IPPROTO_IDP + 1)	/* don't list to IPPROTO_MAX */
+/*
+ * Name table indexed by protocol number, covering 0 .. IPPROTO_IDP
+ * (IPPROTO_MAXID entries).  A { 0, 0 } entry marks a protocol number
+ * that has no sysctl node.
+ */
+#define	CTL_IPPROTO_NAMES { \
+	{ "ip", CTLTYPE_NODE }, \
+	{ "icmp", CTLTYPE_NODE }, \
+	{ "igmp", CTLTYPE_NODE }, \
+	{ "ggp", CTLTYPE_NODE }, \
+	{ 0, 0 }, \
+	{ 0, 0 }, \
+	{ "tcp", CTLTYPE_NODE }, \
+	{ 0, 0 }, \
+	{ "egp", CTLTYPE_NODE }, \
+	{ 0, 0 }, \
+	{ 0, 0 }, \
+	{ 0, 0 }, \
+	{ "pup", CTLTYPE_NODE }, \
+	{ 0, 0 }, \
+	{ 0, 0 }, \
+	{ 0, 0 }, \
+	{ 0, 0 }, \
+	{ "udp", CTLTYPE_NODE }, \
+	{ 0, 0 }, \
+	{ 0, 0 }, \
+	{ 0, 0 }, \
+	{ 0, 0 }, \
+	{ "idp", CTLTYPE_NODE }, \
+}
+
+/*
+ * Names for IP sysctl objects
+ * IPCTL_NAMES below is indexed by these values; slot 0 is unused.
+ * Slot 4 ("mtu") is listed in the table even though IPCTL_DEFMTU is
+ * only defined under "notyet".
+ */
+#define	IPCTL_FORWARDING	1	/* act as router */
+#define	IPCTL_SENDREDIRECTS	2	/* may send redirects when forwarding */
+#define	IPCTL_DEFTTL		3	/* default TTL */
+#ifdef notyet
+#define	IPCTL_DEFMTU		4	/* default MTU */
+#endif
+#define	IPCTL_MAXID		5	/* number of entries in IPCTL_NAMES */
+
+#define	IPCTL_NAMES { \
+	{ 0, 0 }, \
+	{ "forwarding", CTLTYPE_INT }, \
+	{ "redirect", CTLTYPE_INT }, \
+	{ "ttl", CTLTYPE_INT }, \
+	{ "mtu", CTLTYPE_INT }, \
+}
+
+
+#ifdef KERNEL
+int	in_broadcast __P((struct in_addr, struct ifnet *));	/* 1 if possibly a broadcast address */
+int	in_canforward __P((struct in_addr));			/* 1 if datagrams may be forwarded */
+int	in_cksum __P((struct mbuf *, int));			/* Internet checksum over an mbuf chain */
+int	in_localaddr __P((struct in_addr));			/* 1 if address is on a local net */
+u_long	in_netof __P((struct in_addr));				/* (sub)network number, host order */
+void	in_socktrim __P((struct sockaddr_in *));		/* trim sin_len of a mask */
+#endif
diff --git a/sys/netinet/in_cksum.c b/sys/netinet/in_cksum.c
new file mode 100644
index 00000000000..c19a9200836
--- /dev/null
+++ b/sys/netinet/in_cksum.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 1988, 1992, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)in_cksum.c	8.1 (Berkeley) 6/10/93
+ */
+
+#include	/* NOTE(review): the header names are missing on these two
+		 * #include lines (presumably lost in extraction); restore
+		 * them from the upstream file. */
+#include
+
+/*
+ * Checksum routine for Internet Protocol family headers (Portable Version).
+ *
+ * This routine is very heavily used in the network
+ * code and should be modified for each CPU to be as fast as possible.
+ *
+ * in_cksum(m, len) sums the first len bytes of the mbuf chain m as
+ * 16-bit words and returns the complement of the folded sum in the low
+ * 16 bits.  If the chain ends before len bytes have been consumed,
+ * "cksum: out of data" is printed and the partial result is returned.
+ *
+ * ADDCARRY folds a carry out of bit 16 back into the sum; its argument
+ * is not parenthesized and is evaluated more than once, so it must only
+ * be given a plain variable.  REDUCE folds the accumulator to 16 bits
+ * via l_util and depends on the locals "sum" and "l_util".
+ */
+
+#define ADDCARRY(x)  (x > 65535 ? x -= 65535 : x)
+#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);}
+
+int
+in_cksum(m, len)
+	register struct mbuf *m;
+	register int len;
+{
+	register u_short *w;
+	register int sum = 0;
+	register int mlen = 0;		/* -1 while an odd byte is pending in s_util.c[0] */
+	int byte_swapped = 0;		/* set while summing from an odd address */
+
+	union {
+		char	c[2];
+		u_short	s;
+	} s_util;
+	union {
+		u_short s[2];
+		long	l;
+	} l_util;
+
+	for (;m && len; m = m->m_next) {
+		if (m->m_len == 0)
+			continue;
+		w = mtod(m, u_short *);
+		if (mlen == -1) {
+			/*
+			 * The first byte of this mbuf is the continuation
+			 * of a word spanning between this mbuf and the
+			 * last mbuf.
+			 *
+			 * s_util.c[0] is already saved when scanning previous
+			 * mbuf.
+			 */
+			s_util.c[1] = *(char *)w;
+			sum += s_util.s;
+			w = (u_short *)((char *)w + 1);
+			mlen = m->m_len - 1;
+			len--;
+		} else
+			mlen = m->m_len;
+		if (len < mlen)
+			mlen = len;
+		len -= mlen;
+		/*
+		 * Force to even boundary.
+		 * The leading odd byte is parked in s_util.c[0] and the
+		 * running sum is shifted by 8 so the following words line
+		 * up; the shift is undone after this mbuf's data.
+		 */
+		if ((1 & (int) w) && (mlen > 0)) {
+			REDUCE;
+			sum <<= 8;
+			s_util.c[0] = *(u_char *)w;
+			w = (u_short *)((char *)w + 1);
+			mlen--;
+			byte_swapped = 1;
+		}
+		/*
+		 * Unroll the loop to make overhead from
+		 * branches &c small.
+		 * (32 bytes per pass, then 8 bytes per pass.)
+		 */
+		while ((mlen -= 32) >= 0) {
+			sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
+			sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7];
+			sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11];
+			sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15];
+			w += 16;
+		}
+		mlen += 32;
+		while ((mlen -= 8) >= 0) {
+			sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3];
+			w += 4;
+		}
+		mlen += 8;
+		if (mlen == 0 && byte_swapped == 0)
+			continue;
+		REDUCE;
+		while ((mlen -= 2) >= 0) {	/* remaining whole words */
+			sum += *w++;
+		}
+		/* mlen is now -1 if one byte is left over, -2 if none. */
+		if (byte_swapped) {
+			REDUCE;
+			sum <<= 8;
+			byte_swapped = 0;
+			if (mlen == -1) {
+				s_util.c[1] = *(char *)w;
+				sum += s_util.s;
+				mlen = 0;
+			} else
+				mlen = -1;
+		} else if (mlen == -1)
+			s_util.c[0] = *(char *)w;
+	}
+	if (len)
+		printf("cksum: out of data\n");
+	if (mlen == -1) {
+		/* The last mbuf has odd # of bytes. Follow the
+		   standard (the odd byte may be shifted left by 8 bits
+		   or not as determined by endian-ness of the machine) */
+		s_util.c[1] = 0;
+		sum += s_util.s;
+	}
+	REDUCE;
+	return (~sum & 0xffff);
+}
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
new file mode 100644
index 00000000000..01b6b17961c
--- /dev/null
+++ b/sys/netinet/in_pcb.c
@@ -0,0 +1,497 @@
+/*
+ * Copyright (c) 1982, 1986, 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)in_pcb.c	8.2 (Berkeley) 1/4/94
+ */
+
+#include	/* NOTE(review): the header names are missing on every
+		 * #include line below (presumably lost in extraction);
+		 * restore them from the upstream file. */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct	in_addr zeroin_addr;	/* never assigned here; passed as the wildcard foreign address */
+/*
+ * Allocate a zeroed protocol control block for socket so, insert it on
+ * the queue headed by head and attach it to the socket (so_pcb).
+ * Returns 0, or ENOBUFS if the allocation yields NULL.
+ */
+int
+in_pcballoc(so, head)
+	struct socket *so;
+	struct inpcb *head;
+{
+	register struct inpcb *inp;
+
+	MALLOC(inp, struct inpcb *, sizeof(*inp), M_PCB, M_WAITOK);
+	if (inp == NULL)
+		return (ENOBUFS);
+	bzero((caddr_t)inp, sizeof(*inp));
+	inp->inp_head = head;
+	inp->inp_socket = so;
+	insque(inp, head);
+	so->so_pcb = (caddr_t)inp;
+	return (0);
+}
+/*
+ * Bind a local address and/or port to a PCB.
+ *
+ * nam, if non-null, is an mbuf holding a sockaddr_in with the requested
+ * address and port; a null nam or a zero port makes the routine pick a
+ * port itself, cycling head->inp_lport through
+ * IPPORT_RESERVED .. IPPORT_USERRESERVED until in_pcblookup() finds no
+ * user of it.
+ *
+ * Returns 0 on success, otherwise:
+ *   EADDRNOTAVAIL  no internet address is configured, or the requested
+ *                  non-multicast address is not one of ours;
+ *   EINVAL         the PCB is already bound, or nam has the wrong length;
+ *   EADDRINUSE     the port is taken and the reuse options do not allow
+ *                  sharing it;
+ *   the error from suser() for a port below IPPORT_RESERVED.
+ */
+int
+in_pcbbind(inp, nam)
+	register struct inpcb *inp;
+	struct mbuf *nam;
+{
+	register struct socket *so = inp->inp_socket;
+	register struct inpcb *head = inp->inp_head;
+	register struct sockaddr_in *sin;
+	struct proc *p = curproc;		/* XXX */
+	u_short lport = 0;
+	int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
+	int error;
+
+	if (in_ifaddr == 0)
+		return (EADDRNOTAVAIL);
+	if (inp->inp_lport || inp->inp_laddr.s_addr != INADDR_ANY)
+		return (EINVAL);
+	/* Without a reuse option, conflicts are looked up with wildcard matching. */
+	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 &&
+	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 ||
+	     (so->so_options & SO_ACCEPTCONN) == 0))
+		wild = INPLOOKUP_WILDCARD;
+	if (nam) {
+		sin = mtod(nam, struct sockaddr_in *);
+		if (nam->m_len != sizeof (*sin))
+			return (EINVAL);
+#ifdef notdef
+		/*
+		 * We should check the family, but old programs
+		 * incorrectly fail to initialize it.
+		 */
+		if (sin->sin_family != AF_INET)
+			return (EAFNOSUPPORT);
+#endif
+		lport = sin->sin_port;
+		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
+			/*
+			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
+			 * allow complete duplication of binding if
+			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
+			 * and a multicast address is bound on both
+			 * new and duplicated sockets.
+			 */
+			if (so->so_options & SO_REUSEADDR)
+				reuseport = SO_REUSEADDR|SO_REUSEPORT;
+		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
+			sin->sin_port = 0;		/* yech... */
+			if (ifa_ifwithaddr((struct sockaddr *)sin) == 0)
+				return (EADDRNOTAVAIL);
+		}
+		if (lport) {
+			struct inpcb *t;
+
+			/* GROSS */
+			if (ntohs(lport) < IPPORT_RESERVED &&
+			    (error = suser(p->p_ucred, &p->p_acflag)))
+				return (error);
+			t = in_pcblookup(head, zeroin_addr, 0,
+			    sin->sin_addr, lport, wild);
+			if (t && (reuseport & t->inp_socket->so_options) == 0)
+				return (EADDRINUSE);
+		}
+		inp->inp_laddr = sin->sin_addr;
+	}
+	if (lport == 0)
+		do {
+			/* head->inp_lport is kept in host order; lport in network order. */
+			if (head->inp_lport++ < IPPORT_RESERVED ||
+			    head->inp_lport > IPPORT_USERRESERVED)
+				head->inp_lport = IPPORT_RESERVED;
+			lport = htons(head->inp_lport);
+		} while (in_pcblookup(head,
+			    zeroin_addr, 0, inp->inp_laddr, lport, wild));
+	inp->inp_lport = lport;
+	return (0);
+}
+
+/*
+ * Connect from a socket to a specified address.
+ * Both address and port must be specified in argument sin.
+ * If don't have a local address for this socket yet,
+ * then pick one.
+ */
+int
+in_pcbconnect(inp, nam)
+	register struct inpcb *inp;
+	struct mbuf *nam;
+{
+	struct in_ifaddr *ia;
+	struct sockaddr_in *ifaddr;	/* only set, and only used, when inp has no local address */
+	register struct sockaddr_in *sin = mtod(nam, struct sockaddr_in *);
+	/*
+	 * Returns 0 on success; EINVAL for a wrong-sized nam, EAFNOSUPPORT
+	 * for a non-AF_INET family, EADDRNOTAVAIL for a zero port or when
+	 * no source address can be chosen, EADDRINUSE when the resulting
+	 * address/port pair is already in use.
+	 */
+	if (nam->m_len != sizeof (*sin))
+		return (EINVAL);
+	if (sin->sin_family != AF_INET)
+		return (EAFNOSUPPORT);
+	if (sin->sin_port == 0)
+		return (EADDRNOTAVAIL);
+	if (in_ifaddr) {
+		/*
+		 * If the destination address is INADDR_ANY,
+		 * use the primary local address.
+		 * If the supplied address is INADDR_BROADCAST,
+		 * and the primary interface supports broadcast,
+		 * choose the broadcast address for that interface.
+		 */
+#define	satosin(sa)	((struct sockaddr_in *)(sa))
+#define sintosa(sin)	((struct sockaddr *)(sin))
+#define ifatoia(ifa)	((struct in_ifaddr *)(ifa))
+		if (sin->sin_addr.s_addr == INADDR_ANY)
+		    sin->sin_addr = IA_SIN(in_ifaddr)->sin_addr;
+		else if (sin->sin_addr.s_addr == (u_long)INADDR_BROADCAST &&
+		  (in_ifaddr->ia_ifp->if_flags & IFF_BROADCAST))
+		    sin->sin_addr = satosin(&in_ifaddr->ia_broadaddr)->sin_addr;
+	}
+	if (inp->inp_laddr.s_addr == INADDR_ANY) {
+		register struct route *ro;
+
+		ia = (struct in_ifaddr *)0;
+		/*
+		 * If route is known or can be allocated now,
+		 * our src addr is taken from the i/f, else punt.
+		 * A cached route is dropped first if it is for another
+		 * destination or SO_DONTROUTE is set.
+		 */
+		ro = &inp->inp_route;
+		if (ro->ro_rt &&
+		    (satosin(&ro->ro_dst)->sin_addr.s_addr !=
+			sin->sin_addr.s_addr ||
+		    inp->inp_socket->so_options & SO_DONTROUTE)) {
+			RTFREE(ro->ro_rt);
+			ro->ro_rt = (struct rtentry *)0;
+		}
+		if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0 && /*XXX*/
+		    (ro->ro_rt == (struct rtentry *)0 ||
+		    ro->ro_rt->rt_ifp == (struct ifnet *)0)) {
+			/* No route yet, so try to acquire one */
+			ro->ro_dst.sa_family = AF_INET;
+			ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
+			((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
+				sin->sin_addr;
+			rtalloc(ro);
+		}
+		/*
+		 * If we found a route, use the address
+		 * corresponding to the outgoing interface
+		 * unless it is the loopback (in case a route
+		 * to our address on another net goes to loopback).
+		 */
+		if (ro->ro_rt && !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))
+			ia = ifatoia(ro->ro_rt->rt_ifa);
+		if (ia == 0) {
+			u_short fport = sin->sin_port;
+
+			/* The port is zeroed for the address lookups and restored after. */
+			sin->sin_port = 0;
+			ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin)));
+			if (ia == 0)
+				ia = ifatoia(ifa_ifwithnet(sintosa(sin)));
+			sin->sin_port = fport;
+			if (ia == 0)
+				ia = in_ifaddr;
+			if (ia == 0)
+				return (EADDRNOTAVAIL);
+		}
+		/*
+		 * If the destination address is multicast and an outgoing
+		 * interface has been set as a multicast option, use the
+		 * address of that interface as our source address.
+		 */
+		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
+		    inp->inp_moptions != NULL) {
+			struct ip_moptions *imo;
+			struct ifnet *ifp;
+
+			imo = inp->inp_moptions;
+			if (imo->imo_multicast_ifp != NULL) {
+				ifp = imo->imo_multicast_ifp;
+				for (ia = in_ifaddr; ia; ia = ia->ia_next)
+					if (ia->ia_ifp == ifp)
+						break;
+				if (ia == 0)
+					return (EADDRNOTAVAIL);
+			}
+		}
+		ifaddr = (struct sockaddr_in *)&ia->ia_addr;
+	}
+	if (in_pcblookup(inp->inp_head,
+	    sin->sin_addr,
+	    sin->sin_port,
+	    inp->inp_laddr.s_addr ? inp->inp_laddr : ifaddr->sin_addr,
+	    inp->inp_lport,
+	    0))
+		return (EADDRINUSE);
+	if (inp->inp_laddr.s_addr == INADDR_ANY) {
+		/* The result of the implicit bind is deliberately discarded. */
+		if (inp->inp_lport == 0)
+			(void)in_pcbbind(inp, (struct mbuf *)0);
+		inp->inp_laddr = ifaddr->sin_addr;
+	}
+	inp->inp_faddr = sin->sin_addr;
+	inp->inp_fport = sin->sin_port;
+	return (0);
+}
+/*
+ * Forget the foreign address and port of a PCB; if the socket no longer
+ * has a file descriptor reference (SS_NOFDREF), detach the PCB as well.
+ * Declared int but returns no value.
+ */
+int
+in_pcbdisconnect(inp)
+	struct inpcb *inp;
+{
+
+	inp->inp_faddr.s_addr = INADDR_ANY;
+	inp->inp_fport = 0;
+	if (inp->inp_socket->so_state & SS_NOFDREF)
+		in_pcbdetach(inp);
+}
+/*
+ * Detach a PCB from its socket and release it: the socket is freed,
+ * then the PCB's options mbuf, cached route and multicast options,
+ * and finally the PCB itself after removal from its queue.
+ * Declared int but returns no value.
+ */
+int
+in_pcbdetach(inp)
+	struct inpcb *inp;
+{
+	struct socket *so = inp->inp_socket;
+
+	so->so_pcb = 0;
+	sofree(so);
+	if (inp->inp_options)
+		(void)m_free(inp->inp_options);
+	if (inp->inp_route.ro_rt)
+		rtfree(inp->inp_route.ro_rt);
+	ip_freemoptions(inp->inp_moptions);
+	remque(inp);
+	FREE(inp, M_PCB);
+}
+/*
+ * Fill the mbuf nam with a sockaddr_in describing the PCB's local
+ * address and port.  Declared int but returns no value.
+ */
+int
+in_setsockaddr(inp, nam)
+	register struct inpcb *inp;
+	struct mbuf *nam;
+{
+	register struct sockaddr_in *sin;
+
+	nam->m_len = sizeof (*sin);
+	sin = mtod(nam, struct sockaddr_in *);
+	bzero((caddr_t)sin, sizeof (*sin));
+	sin->sin_family = AF_INET;
+	sin->sin_len = sizeof(*sin);
+	sin->sin_port = inp->inp_lport;
+	sin->sin_addr = inp->inp_laddr;
+}
+/*
+ * Fill the mbuf nam with a sockaddr_in describing the PCB's foreign
+ * (peer) address and port.  Declared int but returns no value.
+ */
+int
+in_setpeeraddr(inp, nam)
+	struct inpcb *inp;
+	struct mbuf *nam;
+{
+	register struct sockaddr_in *sin;
+
+	nam->m_len = sizeof (*sin);
+	sin = mtod(nam, struct sockaddr_in *);
+	bzero((caddr_t)sin, sizeof (*sin));
+	sin->sin_family = AF_INET;
+	sin->sin_len = sizeof(*sin);
+	sin->sin_port = inp->inp_fport;
+	sin->sin_addr = inp->inp_faddr;
+}
+
+/*
+ * Pass some notification to all connections of a protocol
+ * associated with address dst.  The local address and/or port numbers
+ * may be specified to limit the search.  The "usual action" will be
+ * taken, depending on the ctlinput cmd.  The caller must filter any
+ * cmds that are uninteresting (e.g., no error in the map).
+ * Call the protocol specific routine (if any) to report
+ * any errors for each matching socket.
+ *
+ * Must be called at splnet.
+ *
+ * A cmd outside the table range, a dst that is not AF_INET, or a dst
+ * address of INADDR_ANY is ignored.  Returns 0 in every case; the int
+ * return type is kept for the existing prototype.
+ */
+int
+in_pcbnotify(head, dst, fport_arg, laddr, lport_arg, cmd, notify)
+	struct inpcb *head;
+	struct sockaddr *dst;
+	u_int fport_arg, lport_arg;
+	struct in_addr laddr;
+	int cmd;
+	void (*notify) __P((struct inpcb *, int));
+{
+	extern u_char inetctlerrmap[];
+	register struct inpcb *inp, *oinp;
+	struct in_addr faddr;
+	u_short fport = fport_arg, lport = lport_arg;
+	int error;
+
+	/*
+	 * cmd is used below as an index into inetctlerrmap[], which has
+	 * PRC_NCMDS entries (indices 0 .. PRC_NCMDS - 1), so PRC_NCMDS
+	 * itself has to be rejected as well.
+	 */
+	if ((unsigned)cmd >= PRC_NCMDS || dst->sa_family != AF_INET)
+		return (0);
+	faddr = ((struct sockaddr_in *)dst)->sin_addr;
+	if (faddr.s_addr == INADDR_ANY)
+		return (0);
+
+	/*
+	 * Redirects go to all references to the destination,
+	 * and use in_rtchange to invalidate the route cache.
+	 * Dead host indications: notify all references to the destination.
+	 * Otherwise, if we have knowledge of the local port and address,
+	 * deliver only to that socket.
+	 */
+	if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) {
+		fport = 0;
+		lport = 0;
+		laddr.s_addr = 0;
+		if (cmd != PRC_HOSTDEAD)
+			notify = in_rtchange;
+	}
+	error = inetctlerrmap[cmd];
+	for (inp = head->inp_next; inp != head;) {
+		/* A zero lport, laddr or fport acts as "match any". */
+		if (inp->inp_faddr.s_addr != faddr.s_addr ||
+		    inp->inp_socket == 0 ||
+		    (lport && inp->inp_lport != lport) ||
+		    (laddr.s_addr && inp->inp_laddr.s_addr != laddr.s_addr) ||
+		    (fport && inp->inp_fport != fport)) {
+			inp = inp->inp_next;
+			continue;
+		}
+		/*
+		 * Advance before invoking the callback so the walk never
+		 * looks at oinp again (presumably because notify may
+		 * unlink or free it).
+		 */
+		oinp = inp;
+		inp = inp->inp_next;
+		if (notify)
+			(*notify)(oinp, error);
+	}
+	return (0);
+}
+
+/*
+ * Check for alternatives when higher level complains
+ * about service problems. For now, invalidate cached
+ * routing information. If the route was created dynamically
+ * (by a redirect), time to try a default gateway again.
+ */
+int
+in_losing(inp)
+	struct inpcb *inp;
+{
+	register struct rtentry *rt;
+	struct rt_addrinfo info;
+
+	/*
+	 * Nothing to do without a cached route.  Otherwise forget the
+	 * cached pointer, send an RTM_LOSING routing message describing
+	 * the route, and then delete the route (RTF_DYNAMIC) or release
+	 * our reference to it.
+	 * Always returns 0; the int return type is kept for the existing
+	 * prototype.
+	 */
+	if ((rt = inp->inp_route.ro_rt)) {
+		inp->inp_route.ro_rt = 0;
+		bzero((caddr_t)&info, sizeof(info));
+		info.rti_info[RTAX_DST] =
+			(struct sockaddr *)&inp->inp_route.ro_dst;
+		info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
+		info.rti_info[RTAX_NETMASK] = rt_mask(rt);
+		rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0);
+		/*
+		 * NOTE(review): on the RTF_DYNAMIC path the pcb's own
+		 * reference to rt is never passed to rtfree(); whether
+		 * rtrequest(RTM_DELETE) accounts for it cannot be told
+		 * from here -- confirm against rtrequest().
+		 */
+		if (rt->rt_flags & RTF_DYNAMIC)
+			(void) rtrequest(RTM_DELETE, rt_key(rt),
+				rt->rt_gateway, rt_mask(rt), rt->rt_flags,
+				(struct rtentry **)0);
+		else
+		/*
+		 * A new route can be allocated
+		 * the next time output is attempted.
+		 */
+			rtfree(rt);
+	}
+	return (0);
+}
+
+/*
+ * After a routing change, flush old routing
+ * and allocate a (hopefully) better one.
+ */
+void
+in_rtchange(inp, errno)
+	register struct inpcb *inp;
+	int errno;
+{
+	if (inp->inp_route.ro_rt) {
+		rtfree(inp->inp_route.ro_rt);
+		inp->inp_route.ro_rt = 0;
+		/*
+		 * A new route can be allocated the next time
+		 * output is attempted.
+		 */
+	}
+}
+
+/*
+ * Search the pcb chain rooted at head for the entry that best matches
+ * the given foreign and local address/port pairs.
+ *
+ * The local port must match exactly.  For the local and for the foreign
+ * address, a value of INADDR_ANY on exactly one side (pcb or argument)
+ * counts as one wildcard; two specific addresses must be equal, and for
+ * the foreign side the foreign ports must be equal as well.  Candidates
+ * that need a wildcard are considered only if flags has
+ * INPLOOKUP_WILDCARD set.  The candidate needing the fewest wildcards
+ * wins (the first one found on a tie) and a candidate needing none ends
+ * the search at once.  Returns that pcb, or 0 if nothing qualifies.
+ */
+struct inpcb *
+in_pcblookup(head, faddr, fport_arg, laddr, lport_arg, flags)
+	struct inpcb *head;
+	struct in_addr faddr, laddr;
+	u_int fport_arg, lport_arg;
+	int flags;
+{
+	register struct inpcb *cur, *best = 0;
+	int bestwild = 3, nwild;
+	u_short fport = fport_arg, lport = lport_arg;
+
+	for (cur = head->inp_next; cur != head; cur = cur->inp_next) {
+		if (cur->inp_lport != lport)
+			continue;
+		nwild = 0;
+
+		/* Local address. */
+		if (cur->inp_laddr.s_addr == INADDR_ANY) {
+			if (laddr.s_addr != INADDR_ANY)
+				nwild++;
+		} else if (laddr.s_addr == INADDR_ANY)
+			nwild++;
+		else if (cur->inp_laddr.s_addr != laddr.s_addr)
+			continue;
+
+		/* Foreign address and, when both are specific, port. */
+		if (cur->inp_faddr.s_addr == INADDR_ANY) {
+			if (faddr.s_addr != INADDR_ANY)
+				nwild++;
+		} else if (faddr.s_addr == INADDR_ANY)
+			nwild++;
+		else if (cur->inp_faddr.s_addr != faddr.s_addr ||
+		    cur->inp_fport != fport)
+			continue;
+
+		if (nwild != 0 && (flags & INPLOOKUP_WILDCARD) == 0)
+			continue;
+		if (nwild >= bestwild)
+			continue;
+		best = cur;
+		bestwild = nwild;
+		if (bestwild == 0)
+			break;
+	}
+	return (best);
+}
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
new file mode 100644
index 00000000000..c85324702a7
--- /dev/null
+++ b/sys/netinet/in_pcb.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 1982, 1986, 1990, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_pcb.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Common structure pcb for internet protocol implementation. + * Here are stored pointers to local and foreign host table + * entries, local and foreign socket numbers, and pointers + * up (to a socket structure) and down (to a protocol-specific) + * control block. 
+ */
+struct inpcb {
+	struct	inpcb *inp_next,*inp_prev;
+					/* links to the neighboring pcb's on the chain */
+	struct	inpcb *inp_head;	/* pointer back to chain of inpcb's
+					   for this protocol */
+	struct	in_addr inp_faddr;	/* foreign address; INADDR_ANY when not connected */
+	u_short	inp_fport;		/* foreign port, kept in sin_port form; 0 when not connected */
+	struct	in_addr inp_laddr;	/* local address; INADDR_ANY until one is chosen */
+	u_short	inp_lport;		/* local port, kept in sin_port form; 0 until bound */
+	struct	socket *inp_socket;	/* back pointer to socket */
+	caddr_t	inp_ppcb;		/* pointer to per-protocol pcb */
+	struct	route inp_route;	/* cached route (ro_rt) toward ro_dst */
+	int	inp_flags;		/* generic IP/datagram flags (INP_* below) */
+	struct	ip inp_ip;		/* header prototype; should have more */
+	struct	mbuf *inp_options;	/* IP options, held in an mbuf */
+	struct	ip_moptions *inp_moptions; /* IP multicast options */
+};
+
+/* flags in inp_flags: */
+#define	INP_RECVOPTS		0x01	/* receive incoming IP options */
+#define	INP_RECVRETOPTS		0x02	/* receive IP options for reply */
+#define	INP_RECVDSTADDR		0x04	/* receive IP dst address */
+#define	INP_CONTROLOPTS		(INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR)
+#define	INP_HDRINCL		0x08	/* user supplies entire IP header */
+
+#define	INPLOOKUP_WILDCARD	1	/* in_pcblookup flag: accept matches that need a wildcard */
+#define	INPLOOKUP_SETLOCAL	2	/* NOTE(review): meaning not visible in this header; confirm at its users */
+
+#define	sotoinpcb(so)	((struct inpcb *)(so)->so_pcb)	/* a socket's so_pcb viewed as an inpcb */
+
+#ifdef KERNEL
+int	 in_losing __P((struct inpcb *));
+int	 in_pcballoc __P((struct socket *, struct inpcb *));
+int	 in_pcbbind __P((struct inpcb *, struct mbuf *));
+int	 in_pcbconnect __P((struct inpcb *, struct mbuf *));
+int	 in_pcbdetach __P((struct inpcb *));
+int	 in_pcbdisconnect __P((struct inpcb *));
+struct inpcb *
+	 in_pcblookup __P((struct inpcb *,
+	    struct in_addr, u_int, struct in_addr, u_int, int));
+int	 in_pcbnotify __P((struct inpcb *, struct sockaddr *,
+	    u_int, struct in_addr, u_int, int, void (*)(struct inpcb *, int)));
+void	 in_rtchange __P((struct inpcb *, int));
+int	 in_setpeeraddr __P((struct inpcb *, struct mbuf *));
+int	 in_setsockaddr __P((struct inpcb *, struct mbuf *));
+#endif
diff --git a/sys/netinet/in_proto.c
b/sys/netinet/in_proto.c new file mode 100644 index 00000000000..00916b4ce1a --- /dev/null +++ b/sys/netinet/in_proto.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ *	@(#)in_proto.c	8.1 (Berkeley) 6/10/93
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+/*
+ * TCP/IP protocol family: IP, ICMP, UDP, TCP.
+ *
+ * NOTE(review): every #include above has no operand in this copy of the
+ * patch (it looks as if the <...> header names were stripped somewhere
+ * along the way); restore them from the original in_proto.c.
+ *
+ * Layout of each inetsw[] entry below, judging by the handlers that
+ * appear in each slot (confirm against struct protosw):
+ *	row 1: socket type, domain, protocol number, PR_* flags
+ *	row 2: input, output, ctlinput, ctloutput
+ *	row 3: usrreq
+ *	row 4: init, fasttimo, slowtimo, drain, sysctl (omitted when unused)
+ */
+
+#ifdef NSIP
+void	idpip_input(), nsip_ctlinput();
+#endif
+
+#ifdef TPIP
+void	tpip_input(), tpip_ctlinput(), tp_ctloutput();
+int	tp_init(), tp_slowtimo(), tp_drain(), tp_usrreq();
+#endif
+
+#ifdef EON
+void	eoninput(), eonctlinput(), eonprotoinit();
+#endif /* EON */
+
+extern	struct domain inetdomain;
+
+struct protosw inetsw[] = {
+{ 0,		&inetdomain,	0,		0,	/* IP itself: no socket type, protocol 0 */
+  0,		ip_output,	0,		0,
+  0,
+  ip_init,	0,		ip_slowtimo,	ip_drain,	ip_sysctl
+},
+{ SOCK_DGRAM,	&inetdomain,	IPPROTO_UDP,	PR_ATOMIC|PR_ADDR,
+  udp_input,	0,		udp_ctlinput,	ip_ctloutput,
+  udp_usrreq,
+  udp_init,	0,		0,		0,		udp_sysctl
+},
+{ SOCK_STREAM,	&inetdomain,	IPPROTO_TCP,	PR_CONNREQUIRED|PR_WANTRCVD,
+  tcp_input,	0,		tcp_ctlinput,	tcp_ctloutput,
+  tcp_usrreq,
+  tcp_init,	tcp_fasttimo,	tcp_slowtimo,	tcp_drain,
+},
+{ SOCK_RAW,	&inetdomain,	IPPROTO_RAW,	PR_ATOMIC|PR_ADDR,
+  rip_input,	rip_output,	0,		rip_ctloutput,
+  rip_usrreq,
+  0,		0,		0,		0,
+},
+{ SOCK_RAW,	&inetdomain,	IPPROTO_ICMP,	PR_ATOMIC|PR_ADDR,	/* ICMP: own input, raw-IP handlers otherwise */
+  icmp_input,	rip_output,	0,		rip_ctloutput,
+  rip_usrreq,
+  0,		0,		0,		0,		icmp_sysctl
+},
+{ SOCK_RAW,	&inetdomain,	IPPROTO_IGMP,	PR_ATOMIC|PR_ADDR,	/* IGMP: own input/init/fasttimo, raw-IP handlers otherwise */
+  igmp_input,	rip_output,	0,		rip_ctloutput,
+  rip_usrreq,
+  igmp_init,	igmp_fasttimo,	0,		0,
+},
+#ifdef TPIP
+{ SOCK_SEQPACKET,&inetdomain,	IPPROTO_TP,	PR_CONNREQUIRED|PR_WANTRCVD,
+  tpip_input,	0,		tpip_ctlinput,	tp_ctloutput,
+  tp_usrreq,
+  tp_init,	0,		tp_slowtimo,	tp_drain,
+},
+#endif
+/* EON (ISO CLNL over IP) */
+#ifdef EON
+{ SOCK_RAW,	&inetdomain,	IPPROTO_EON,	0,
+  eoninput,	0,		eonctlinput,		0,
+  0,
+  eonprotoinit,	0,		0,		0,
+},
+#endif
+#ifdef NSIP
+{ SOCK_RAW,	&inetdomain,	IPPROTO_IDP,	PR_ATOMIC|PR_ADDR,
+  idpip_input,	rip_output,	nsip_ctlinput,	0,
+  rip_usrreq,
+  0,		0,		0,		0,
+},
+#endif
+	/* raw wildcard */
+{ SOCK_RAW,	&inetdomain,	0,		PR_ATOMIC|PR_ADDR,
+  rip_input,	rip_output,	0,		rip_ctloutput,
+  rip_usrreq,
+  rip_init,	0,		0,		0,
+},
+};
+
+struct domain inetdomain =
+    { AF_INET, "internet", 0, 0, 0,
+      inetsw, &inetsw[sizeof(inetsw)/sizeof(inetsw[0])], 0,	/* inetsw and one past its last entry */
+      rn_inithead, 32, sizeof(struct sockaddr_in) };
+
+#include "imp.h"
+#if NIMP > 0
+extern	struct domain impdomain;
+int	rimp_output(), hostslowtimo();
+
+struct protosw impsw[] = {
+{ SOCK_RAW,	&impdomain,	0,		PR_ATOMIC|PR_ADDR,
+  0,		rimp_output,	0,		0,
+  rip_usrreq,
+  0,		0,		hostslowtimo,	0,
+},
+};
+
+struct domain impdomain =
+    { AF_IMPLINK, "imp", 0, 0, 0,
+      impsw, &impsw[sizeof (impsw)/sizeof(impsw[0])] };
+#endif
+
+#include "hy.h"
+#if NHY > 0
+/*
+ * HYPERchannel protocol family: raw interface.
+ */
+int	rhy_output();
+extern	struct domain hydomain;
+
+struct protosw hysw[] = {
+{ SOCK_RAW,	&hydomain,	0,		PR_ATOMIC|PR_ADDR,
+  0,		rhy_output,	0,		0,
+  rip_usrreq,
+  0,		0,		0,		0,
+},
+};
+
+struct domain hydomain =
+    { AF_HYLINK, "hy", 0, 0, 0, hysw, &hysw[sizeof (hysw)/sizeof(hysw[0])] };
+#endif
diff --git a/sys/netinet/in_systm.h b/sys/netinet/in_systm.h
new file mode 100644
index 00000000000..cbd8e539a1e
--- /dev/null
+++ b/sys/netinet/in_systm.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_systm.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Miscellaneous internetwork + * definitions for kernel. + */ + +/* + * Network types. + * + * Internally the system keeps counters in the headers with the bytes + * swapped so that VAX instructions will work on them. It reverses + * the bytes before transmission at each protocol level. The n_ types + * represent the types with the bytes in ``high-ender'' order. 
+ */
+typedef u_short n_short;		/* short as received from the net (network byte order) */
+typedef u_long	n_long;			/* long as received from the net (network byte order) */
+
+typedef	u_long	n_time;			/* ms since 00:00 GMT, byte rev */
+
+#ifdef KERNEL
+n_time	 iptime __P((void));		/* kernel only; defined outside this header */
+#endif
diff --git a/sys/netinet/in_var.h b/sys/netinet/in_var.h
new file mode 100644
index 00000000000..8218f0b74a3
--- /dev/null
+++ b/sys/netinet/in_var.h
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 1985, 1986, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)in_var.h	8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * Interface address, Internet version. One of these structures
+ * is allocated for each interface with an Internet address.
+ * The ifaddr structure contains the protocol-independent part
+ * of the structure and is assumed to be first.
+ * All in_ifaddr structures are chained through ia_next, starting at
+ * the global in_ifaddr pointer.
+ */
+struct in_ifaddr {
+	struct	ifaddr ia_ifa;		/* protocol-independent info */
+#define	ia_ifp		ia_ifa.ifa_ifp	/* shorthand: the owning interface */
+#define ia_flags	ia_ifa.ifa_flags
+					/* ia_{,sub}net{,mask} in host order */
+	u_long	ia_net;			/* network number of interface */
+	u_long	ia_netmask;		/* mask of net part */
+	u_long	ia_subnet;		/* subnet number, including net */
+	u_long	ia_subnetmask;		/* mask of subnet part */
+	struct	in_addr ia_netbroadcast; /* to recognize net broadcasts */
+	struct	in_ifaddr *ia_next;	/* next in list of internet addresses */
+	struct	sockaddr_in ia_addr;	/* the interface's own address (what IA_SIN returns) */
+	struct	sockaddr_in ia_dstaddr; /* destination address; doubles as the broadcast address */
+#define	ia_broadaddr	ia_dstaddr
+	struct	sockaddr_in ia_sockmask; /* reserve space for general netmask */
+	struct	in_multi *ia_multiaddrs; /* list of multicast addresses */
+};
+
+struct	in_aliasreq {
+	char	ifra_name[IFNAMSIZ];		/* if name, e.g. "en0" */
+	struct	sockaddr_in ifra_addr;		/* NOTE(review): presumably the address being configured; confirm at users */
+	struct	sockaddr_in ifra_broadaddr;	/* broadcast address; also reachable as ifra_dstaddr */
+#define ifra_dstaddr ifra_broadaddr
+	struct	sockaddr_in ifra_mask;		/* NOTE(review): presumably the netmask; confirm at users */
+};
+/*
+ * Given a pointer to an in_ifaddr (ifaddr),
+ * return a pointer to the addr as a sockaddr_in.
+ */
+#define	IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr))
+
+/*
+ * Host part of the in_addr `in': its s_addr converted with ntohl() and
+ * masked with the complement of ifa's ia_subnetmask (which is kept in
+ * host order).  ifa may be any pointer to an in_ifaddr; it is cast
+ * BEFORE the member is selected -- the cast must enclose only (ifa),
+ * otherwise it applies to the ia_subnetmask value and the parentheses
+ * of the expression no longer balance.
+ */
+#define IN_LNAOF(in, ifa) \
+	((ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa))->ia_subnetmask))
+
+
+#ifdef	KERNEL
+extern	struct	in_ifaddr *in_ifaddr;	/* head of the in_ifaddr list */
+extern	struct	ifqueue	ipintrq;	/* ip packet input queue */
+void	in_socktrim __P((struct sockaddr_in *));
+
+
+/*
+ * Macro for finding the interface (ifnet structure) corresponding to one
+ * of our IP addresses.
+ * ifp is set to NULL when no in_ifaddr on the list carries addr.
+ * Expands to a brace-enclosed block that declares its own local `ia'.
+ */
+#define INADDR_TO_IFP(addr, ifp) \
+	/* struct in_addr addr; */ \
+	/* struct ifnet *ifp; */ \
+{ \
+	register struct in_ifaddr *ia; \
+\
+	for (ia = in_ifaddr; \
+	    ia != NULL && IA_SIN(ia)->sin_addr.s_addr != (addr).s_addr; \
+	    ia = ia->ia_next) \
+		 continue; \
+	(ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \
+}
+
+/*
+ * Macro for finding the internet address structure (in_ifaddr) corresponding
+ * to a given interface (ifnet structure).
+ * ia is left NULL when the list holds no entry for ifp; otherwise it is
+ * the first entry on the list whose ia_ifp equals ifp.
+ */
+#define IFP_TO_IA(ifp, ia) \
+	/* struct ifnet *ifp; */ \
+	/* struct in_ifaddr *ia; */ \
+{ \
+	for ((ia) = in_ifaddr; \
+	    (ia) != NULL && (ia)->ia_ifp != (ifp); \
+	    (ia) = (ia)->ia_next) \
+		continue; \
+}
+#endif
+
+/*
+ * Internet multicast address structure. There is one of these for each IP
+ * multicast group to which this host belongs on a given network interface.
+ * They are kept in a linked list, rooted in the interface's in_ifaddr
+ * structure.
+ */
+struct in_multi {
+	struct	in_addr inm_addr;	/* IP multicast address */
+	struct	ifnet *inm_ifp;		/* back pointer to ifnet */
+	struct	in_ifaddr *inm_ia;	/* back pointer to in_ifaddr */
+	u_int	inm_refcount;		/* no. membership claims by sockets */
+	u_int	inm_timer;		/* IGMP membership report timer */
+	struct	in_multi *inm_next;	/* ptr to next multicast address */
+};
+
+#ifdef KERNEL
+/*
+ * Structure used by macros below to remember position when stepping through
+ * all of the in_multi records.
+ */
+struct in_multistep {
+	struct in_ifaddr *i_ia;		/* next in_ifaddr whose list is still to be visited */
+	struct in_multi *i_inm;		/* next record on the list being walked, or NULL */
+};
+
+/*
+ * Macro for looking up the in_multi record for a given IP multicast address
+ * on a given interface. If no matching record is found, "inm" returns NULL.
+ * Expands to a brace-enclosed block that declares its own local `ia'.
+ * Every use of the inm argument is parenthesized so that an expression
+ * such as *pp can be passed safely.
+ */
+#define IN_LOOKUP_MULTI(addr, ifp, inm) \
+	/* struct in_addr addr; */ \
+	/* struct ifnet *ifp; */ \
+	/* struct in_multi *inm; */ \
+{ \
+	register struct in_ifaddr *ia; \
+\
+	IFP_TO_IA((ifp), ia); \
+	if (ia == NULL) \
+		(inm) = NULL; \
+	else \
+		for ((inm) = ia->ia_multiaddrs; \
+		    (inm) != NULL && (inm)->inm_addr.s_addr != (addr).s_addr; \
+		     (inm) = (inm)->inm_next) \
+			 continue; \
+}
+
+/*
+ * Macro to step through all of the in_multi records, one at a time.
+ * The current position is remembered in "step", which the caller must
+ * provide. IN_FIRST_MULTI(), below, must be called to initialize "step"
+ * and get the first record. Both macros return a NULL "inm" when there
+ * are no remaining records.
+ */
+#define IN_NEXT_MULTI(step, inm) \
+	/* struct in_multistep step; */ \
+	/* struct in_multi *inm; */ \
+{ \
+	if (((inm) = (step).i_inm) != NULL) \
+		(step).i_inm = (inm)->inm_next; \
+	else \
+		while ((step).i_ia != NULL) { \
+			(inm) = (step).i_ia->ia_multiaddrs; \
+			(step).i_ia = (step).i_ia->ia_next; \
+			if ((inm) != NULL) { \
+				(step).i_inm = (inm)->inm_next; \
+				break; \
+			} \
+		} \
+}
+
+#define IN_FIRST_MULTI(step, inm) \
+	/* struct in_multistep step; */ \
+	/* struct in_multi *inm; */ \
+{ \
+	(step).i_ia = in_ifaddr; \
+	(step).i_inm = NULL; \
+	IN_NEXT_MULTI((step), (inm)); \
+}
+
+int	in_ifinit __P((struct ifnet *,
+	    struct in_ifaddr *, struct sockaddr_in *, int));
+struct	in_multi *in_addmulti __P((struct in_addr *, struct ifnet *));
+int	in_delmulti __P((struct in_multi *));
+void	in_ifscrub __P((struct ifnet *, struct in_ifaddr *));
+int	in_control __P((struct socket *, int, caddr_t, struct ifnet *));
+#endif
diff --git a/sys/netinet/ip.h b/sys/netinet/ip.h
new file mode 100644
index
00000000000..8a31dfaf13d --- /dev/null +++ b/sys/netinet/ip.h @@ -0,0 +1,168 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ *	@(#)ip.h	8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * Definitions for internet protocol version 4.
+ * Per RFC 791, September 1981.
+ */
+#define	IPVERSION	4
+
+/*
+ * Structure of an internet header, naked of options.
+ *
+ * We declare ip_len and ip_off to be short, rather than u_short
+ * pragmatically since otherwise unsigned comparisons can result
+ * against negative integers quite easily, and fail in subtle ways.
+ *
+ * The two 4-bit fields share the first byte; their declaration order
+ * is switched on BYTE_ORDER.
+ */
+struct ip {
+#if BYTE_ORDER == LITTLE_ENDIAN
+	u_char	ip_hl:4,		/* header length, in 32-bit words */
+		ip_v:4;			/* version */
+#endif
+#if BYTE_ORDER == BIG_ENDIAN
+	u_char	ip_v:4,			/* version */
+		ip_hl:4;		/* header length, in 32-bit words */
+#endif
+	u_char	ip_tos;			/* type of service */
+	short	ip_len;			/* total length */
+	u_short	ip_id;			/* identification */
+	short	ip_off;			/* fragment offset field */
+#define	IP_DF 0x4000			/* dont fragment flag */
+#define	IP_MF 0x2000			/* more fragments flag */
+#define	IP_OFFMASK 0x1fff		/* mask for fragmenting bits */
+	u_char	ip_ttl;			/* time to live */
+	u_char	ip_p;			/* protocol */
+	u_short	ip_sum;			/* checksum */
+	struct	in_addr ip_src,ip_dst;	/* source and dest address */
+};
+
+#define	IP_MAXPACKET	65535		/* maximum packet size */
+
+/*
+ * Definitions for IP type of service (ip_tos)
+ */
+#define	IPTOS_LOWDELAY		0x10
+#define	IPTOS_THROUGHPUT	0x08
+#define	IPTOS_RELIABILITY	0x04
+
+/*
+ * Definitions for IP precedence (also in ip_tos) (hopefully unused)
+ *
+ * Precedence is the top three bits of ip_tos (mask 0xe0), so the eight
+ * levels are the multiples of 0x20.  Routine is level 0 and therefore
+ * 0x00; a value of 0x10 would fall outside the precedence bits and
+ * coincide with IPTOS_LOWDELAY.
+ */
+#define	IPTOS_PREC_NETCONTROL		0xe0
+#define	IPTOS_PREC_INTERNETCONTROL	0xc0
+#define	IPTOS_PREC_CRITIC_ECP		0xa0
+#define	IPTOS_PREC_FLASHOVERRIDE	0x80
+#define	IPTOS_PREC_FLASH		0x60
+#define	IPTOS_PREC_IMMEDIATE		0x40
+#define	IPTOS_PREC_PRIORITY		0x20
+#define	IPTOS_PREC_ROUTINE		0x00
+
+/*
+ * Definitions for options.
+ */
+#define	IPOPT_COPIED(o)		((o)&0x80)	/* top bit of the option type */
+#define	IPOPT_CLASS(o)		((o)&0x60)	/* class bits; compare with the four values below */
+#define	IPOPT_NUMBER(o)		((o)&0x1f)	/* low five bits: option number */
+
+#define	IPOPT_CONTROL		0x00
+#define	IPOPT_RESERVED1		0x20
+#define	IPOPT_DEBMEAS		0x40
+#define	IPOPT_RESERVED2		0x60
+
+#define	IPOPT_EOL		0		/* end of option list */
+#define	IPOPT_NOP		1		/* no operation */
+
+#define	IPOPT_RR		7		/* record packet route */
+#define	IPOPT_TS		68		/* timestamp */
+#define	IPOPT_SECURITY		130		/* provide s,c,h,tcc */
+#define	IPOPT_LSRR		131		/* loose source route */
+#define	IPOPT_SATID		136		/* satnet id */
+#define	IPOPT_SSRR		137		/* strict source route */
+
+/*
+ * Offsets to fields in options other than EOL and NOP.
+ */
+#define	IPOPT_OPTVAL		0		/* option ID */
+#define	IPOPT_OLEN		1		/* option length */
+#define IPOPT_OFFSET		2		/* offset within option */
+#define	IPOPT_MINOFF		4		/* min value of above */
+
+/*
+ * Time stamp option structure.
+ * The two 4-bit fields share one byte; their declaration order is
+ * switched on BYTE_ORDER.
+ */
+struct	ip_timestamp {
+	u_char	ipt_code;		/* IPOPT_TS */
+	u_char	ipt_len;		/* size of structure (variable) */
+	u_char	ipt_ptr;		/* index of current entry */
+#if BYTE_ORDER == LITTLE_ENDIAN
+	u_char	ipt_flg:4,		/* flags, see below */
+		ipt_oflw:4;		/* overflow counter */
+#endif
+#if BYTE_ORDER == BIG_ENDIAN
+	u_char	ipt_oflw:4,		/* overflow counter */
+		ipt_flg:4;		/* flags, see below */
+#endif
+	union ipt_timestamp {
+		n_long	ipt_time[1];		/* bare timestamps (presumably the IPOPT_TS_TSONLY layout) */
+		struct	ipt_ta {
+			struct in_addr ipt_addr;
+			n_long ipt_time;
+		} ipt_ta[1];			/* address + timestamp pairs; declared with one element */
+	} ipt_timestamp;
+};
+
+/* flag bits for ipt_flg */
+#define	IPOPT_TS_TSONLY		0		/* timestamps only */
+#define	IPOPT_TS_TSANDADDR	1		/* timestamps and addresses */
+#define	IPOPT_TS_PRESPEC	3		/* specified modules only */
+
+/* bits for security (not byte swapped) */
+#define	IPOPT_SECUR_UNCLASS	0x0000
+#define	IPOPT_SECUR_CONFID	0xf135
+#define	IPOPT_SECUR_EFTO	0x789a
+#define	IPOPT_SECUR_MMMM	0xbc4d
+#define	IPOPT_SECUR_RESTR	0xaf13
+#define	IPOPT_SECUR_SECRET	0xd788
+#define	IPOPT_SECUR_TOPSECRET	0x6bc5
+
+/*
+ * Internet implementation parameters.
+ */
+#define	MAXTTL		255		/* maximum time to live (seconds) */
+#define	IPDEFTTL	64		/* default ttl, from RFC 1340 */
+#define	IPFRAGTTL	60		/* time to live for frags, slowhz */
+#define	IPTTLDEC	1		/* subtracted when forwarding */
+
+#define	IP_MSS		576		/* default maximum segment size */
diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c
new file mode 100644
index 00000000000..c9b82bca908
--- /dev/null
+++ b/sys/netinet/ip_icmp.c
@@ -0,0 +1,591 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * ICMP routines: error generation, receive packet processing, and + * routines to turnaround packets back to the originator, and + * host table maintenance routines. + */ + +int icmpmaskrepl = 0; +#ifdef ICMPPRINTFS +int icmpprintfs = 0; +#endif + +extern struct protosw inetsw[]; + +/* + * Generate an error packet of type error + * in response to bad packet ip. + */
+/*
+ * n holds the offending datagram and is always freed before returning.
+ * dest is stored as the gateway address for ICMP_REDIRECT; destifp, when
+ * non-null, supplies the MTU reported for ICMP_UNREACH_NEEDFRAG.
+ * The finished message is handed to icmp_reflect().
+ */
+void
+icmp_error(n, type, code, dest, destifp)
+	struct mbuf *n;
+	int type, code;
+	n_long dest;
+	struct ifnet *destifp;
+{
+	register struct ip *orig = mtod(n, struct ip *), *hdr;
+	register unsigned orighlen = orig->ip_hl << 2;
+	register struct icmp *icmph;
+	register struct mbuf *reply;
+	unsigned quoted;
+
+#ifdef ICMPPRINTFS
+	if (icmpprintfs)
+		printf("icmp_error(%x, %d, %d)\n", orig, type, code);
+#endif
+	if (type != ICMP_REDIRECT)
+		icmpstat.icps_error++;
+	/*
+	 * Only the first fragment of a datagram may trigger an error.
+	 */
+	if (orig->ip_off &~ (IP_MF|IP_DF))
+		goto freeit;
+	/*
+	 * An ICMP message that is not one of the known informational
+	 * types is never answered with an error (redirects excepted).
+	 */
+	if (orig->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT &&
+	    n->m_len >= orighlen + ICMP_MINLEN) {
+		struct icmp *inner = (struct icmp *)((caddr_t)orig + orighlen);
+
+		if (!ICMP_INFOTYPE(inner->icmp_type)) {
+			icmpstat.icps_oldicmp++;
+			goto freeit;
+		}
+	}
+	/* Multicast and broadcast packets never get an error reply */
+	if (n->m_flags & (M_BCAST|M_MCAST))
+		goto freeit;
+	/*
+	 * Build the icmp message: the icmp header followed by the
+	 * offending ip header and up to 8 bytes of its data.
+	 */
+	reply = m_gethdr(M_DONTWAIT, MT_HEADER);
+	if (reply == NULL)
+		goto freeit;
+	quoted = orighlen + min(8, orig->ip_len);
+	reply->m_len = quoted + ICMP_MINLEN;
+	MH_ALIGN(reply, reply->m_len);
+	icmph = mtod(reply, struct icmp *);
+	if ((u_int)type > ICMP_MAXTYPE)
+		panic("icmp_error");
+	icmpstat.icps_outhist[type]++;
+	icmph->icmp_type = type;
+	/*
+	 * Every case but the redirect clears icmp_void first; the
+	 * type-specific fields overlay that zeroed word.
+	 */
+	switch (type) {
+	case ICMP_REDIRECT:
+		icmph->icmp_gwaddr.s_addr = dest;
+		break;
+	case ICMP_PARAMPROB:
+		icmph->icmp_void = 0;
+		icmph->icmp_pptr = code;
+		code = 0;
+		break;
+	case ICMP_UNREACH:
+		icmph->icmp_void = 0;
+		if (code == ICMP_UNREACH_NEEDFRAG && destifp)
+			icmph->icmp_nextmtu = htons(destifp->if_mtu);
+		break;
+	default:
+		icmph->icmp_void = 0;
+		break;
+	}
+
+	icmph->icmp_code = code;
+	bcopy((caddr_t)orig, (caddr_t)&icmph->icmp_ip, quoted);
+	hdr = &icmph->icmp_ip;
+	hdr->ip_len = htons((u_short)(hdr->ip_len + orighlen));
+
+	/*
+	 * Put a copy of the old ip header (options left out)
+	 * in front of the icmp message.
+	 */
+	if (reply->m_data - sizeof(struct ip) < reply->m_pktdat)
+		panic("icmp len");
+	reply->m_data -= sizeof(struct ip);
+	reply->m_len += sizeof(struct ip);
+	reply->m_pkthdr.len = reply->m_len;
+	reply->m_pkthdr.rcvif = n->m_pkthdr.rcvif;
+	hdr = mtod(reply, struct ip *);
+	bcopy((caddr_t)orig, (caddr_t)hdr, sizeof(struct ip));
+	hdr->ip_len = reply->m_len;
+	hdr->ip_hl = sizeof(struct ip) >> 2;
+	hdr->ip_p = IPPROTO_ICMP;
+	hdr->ip_tos = 0;
+	icmp_reflect(reply);
+
+freeit:
+	m_freem(n);
+}
+ +static struct sockaddr_in icmpsrc = { sizeof (struct sockaddr_in), AF_INET }; +static struct sockaddr_in icmpdst = { sizeof (struct sockaddr_in), AF_INET }; +static struct sockaddr_in icmpgw = { sizeof (struct sockaddr_in), AF_INET }; +struct sockaddr_in icmpmask = { 8, 0 }; + +/* + * Process a received ICMP message. + */ +void +icmp_input(m, hlen) + register struct mbuf *m; + int hlen; +{ + register struct icmp *icp; + register struct ip *ip = mtod(m, struct ip *); + int icmplen = ip->ip_len; + register int i; + struct in_ifaddr *ia; + void (*ctlfunc) __P((int, struct sockaddr *, struct ip *)); + int code; + extern u_char ip_protox[]; + + /* + * Locate icmp structure in mbuf, and check + * that not corrupted and of at least minimum length. + */ +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("icmp_input from %x to %x, len %d\n", + ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), + icmplen); +#endif + if (icmplen < ICMP_MINLEN) { + icmpstat.icps_tooshort++; + goto freeit; + } + i = hlen + min(icmplen, ICMP_ADVLENMIN); + if (m->m_len < i && (m = m_pullup(m, i)) == 0) { + icmpstat.icps_tooshort++; + return; + } + ip = mtod(m, struct ip *); + m->m_len -= hlen; + m->m_data += hlen; + icp = mtod(m, struct icmp *); + if (in_cksum(m, icmplen)) { + icmpstat.icps_checksum++; + goto freeit; + } + m->m_len += hlen; + m->m_data -= hlen; + +#ifdef ICMPPRINTFS + /* + * Message type specific processing.
+ */ + if (icmpprintfs) + printf("icmp_input, type %d code %d\n", icp->icmp_type, + icp->icmp_code); +#endif + if (icp->icmp_type > ICMP_MAXTYPE) + goto raw; + icmpstat.icps_inhist[icp->icmp_type]++; + code = icp->icmp_code; + switch (icp->icmp_type) { + + case ICMP_UNREACH: + switch (code) { + case ICMP_UNREACH_NET: + case ICMP_UNREACH_HOST: + case ICMP_UNREACH_PROTOCOL: + case ICMP_UNREACH_PORT: + case ICMP_UNREACH_SRCFAIL: + code += PRC_UNREACH_NET; + break; + + case ICMP_UNREACH_NEEDFRAG: + code = PRC_MSGSIZE; + break; + + case ICMP_UNREACH_NET_UNKNOWN: + case ICMP_UNREACH_NET_PROHIB: + case ICMP_UNREACH_TOSNET: + code = PRC_UNREACH_NET; + break; + + case ICMP_UNREACH_HOST_UNKNOWN: + case ICMP_UNREACH_ISOLATED: + case ICMP_UNREACH_HOST_PROHIB: + case ICMP_UNREACH_TOSHOST: + code = PRC_UNREACH_HOST; + break; + + default: + goto badcode; + } + goto deliver; + + case ICMP_TIMXCEED: + if (code > 1) + goto badcode; + code += PRC_TIMXCEED_INTRANS; + goto deliver; + + case ICMP_PARAMPROB: + if (code > 1) + goto badcode; + code = PRC_PARAMPROB; + goto deliver; + + case ICMP_SOURCEQUENCH: + if (code) + goto badcode; + code = PRC_QUENCH; + deliver: + /* + * Problem with datagram; advise higher level routines. 
+ */ + if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || + icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { + icmpstat.icps_badlen++; + goto freeit; + } + NTOHS(icp->icmp_ip.ip_len); +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("deliver to protocol %d\n", icp->icmp_ip.ip_p); +#endif + icmpsrc.sin_addr = icp->icmp_ip.ip_dst; + if (ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput) + (*ctlfunc)(code, (struct sockaddr *)&icmpsrc, + &icp->icmp_ip); + break; + + badcode: + icmpstat.icps_badcode++; + break; + + case ICMP_ECHO: + icp->icmp_type = ICMP_ECHOREPLY; + goto reflect; + + case ICMP_TSTAMP: + if (icmplen < ICMP_TSLEN) { + icmpstat.icps_badlen++; + break; + } + icp->icmp_type = ICMP_TSTAMPREPLY; + icp->icmp_rtime = iptime(); + icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */ + goto reflect; + + case ICMP_MASKREQ: +#define satosin(sa) ((struct sockaddr_in *)(sa)) + if (icmpmaskrepl == 0) + break; + /* + * We are not able to respond with all ones broadcast + * unless we receive it over a point-to-point interface. 
+ */ + if (icmplen < ICMP_MASKLEN) + break; + switch (ip->ip_dst.s_addr) { + + case INADDR_BROADCAST: + case INADDR_ANY: + icmpdst.sin_addr = ip->ip_src; + break; + + default: + icmpdst.sin_addr = ip->ip_dst; + } + ia = (struct in_ifaddr *)ifaof_ifpforaddr( + (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); + if (ia == 0) + break; + icp->icmp_type = ICMP_MASKREPLY; + icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr; + if (ip->ip_src.s_addr == 0) { + if (ia->ia_ifp->if_flags & IFF_BROADCAST) + ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr; + else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) + ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr; + } +reflect: + ip->ip_len += hlen; /* since ip_input deducts this */ + icmpstat.icps_reflect++; + icmpstat.icps_outhist[icp->icmp_type]++; + icmp_reflect(m); + return; + + case ICMP_REDIRECT: + if (code > 3) + goto badcode; + if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || + icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { + icmpstat.icps_badlen++; + break; + } + /* + * Short circuit routing redirects to force + * immediate change in the kernel's routing + * tables. The message is also handed to anyone + * listening on a raw socket (e.g. the routing + * daemon for use in updating its tables). + */ + icmpgw.sin_addr = ip->ip_src; + icmpdst.sin_addr = icp->icmp_gwaddr; +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("redirect dst %x to %x\n", icp->icmp_ip.ip_dst, + icp->icmp_gwaddr); +#endif + icmpsrc.sin_addr = icp->icmp_ip.ip_dst; + rtredirect((struct sockaddr *)&icmpsrc, + (struct sockaddr *)&icmpdst, + (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST, + (struct sockaddr *)&icmpgw, (struct rtentry **)0); + pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc); + break; + + /* + * No kernel processing for the following; + * just fall through to send to raw listener. 
+ */ + case ICMP_ECHOREPLY: + case ICMP_ROUTERADVERT: + case ICMP_ROUTERSOLICIT: + case ICMP_TSTAMPREPLY: + case ICMP_IREQREPLY: + case ICMP_MASKREPLY: + default: + break; + } + +raw: + rip_input(m); + return; + +freeit: + m_freem(m); +}
+
+/*
+ * Reflect the ip packet back to the source.
+ * The source and destination addresses are swapped, the ttl is reset to
+ * MAXTTL, any ip options are stripped from m (record-route, timestamp and
+ * security options are collected into a separate option mbuf), and the
+ * result goes out through icmp_send().
+ */
+void
+icmp_reflect(m)
+	struct mbuf *m;
+{
+	register struct ip *iph = mtod(m, struct ip *);
+	register struct in_ifaddr *ifa;
+	struct in_addr addr;
+	struct mbuf *opts = 0, *ip_srcroute();
+	int optlen = (iph->ip_hl << 2) - sizeof(struct ip);
+
+	if (!in_canforward(iph->ip_src) &&
+	    ((ntohl(iph->ip_src.s_addr) & IN_CLASSA_NET) !=
+	     (IN_LOOPBACKNET << IN_CLASSA_NSHIFT))) {
+		m_freem(m);	/* Bad return address */
+		goto done;	/* Ip_output() will check for broadcast */
+	}
+	addr = iph->ip_dst;
+	iph->ip_dst = iph->ip_src;
+	/*
+	 * A packet sent straight to one of our addresses is answered
+	 * from that address.  For anything else (broadcast or anonymous)
+	 * the reply comes from the address that goes with the
+	 * receiving interface.
+	 */
+	ifa = in_ifaddr;
+	while (ifa) {
+		if (addr.s_addr == IA_SIN(ifa)->sin_addr.s_addr)
+			break;
+		if ((ifa->ia_ifp->if_flags & IFF_BROADCAST) &&
+		    addr.s_addr == satosin(&ifa->ia_broadaddr)->sin_addr.s_addr)
+			break;
+		ifa = ifa->ia_next;
+	}
+	icmpdst.sin_addr = addr;
+	if (ifa == (struct in_ifaddr *)0)
+		ifa = (struct in_ifaddr *)ifaof_ifpforaddr(
+			(struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif);
+	/*
+	 * Still nothing: the packet was not addressed to us and came in
+	 * on an interface with no IP address.  Use the first address.
+	 */
+	if (ifa == (struct in_ifaddr *)0)
+		ifa = in_ifaddr;
+	iph->ip_src = IA_SIN(ifa)->sin_addr;
+	iph->ip_ttl = MAXTTL;
+
+	if (optlen > 0) {
+		register u_char *optp;
+		int opt, cnt;
+		u_int len;
+
+		/*
+		 * Start from any saved source route; failing that, from
+		 * a fresh mbuf holding a zero first-hop address.  Then
+		 * append the record-route, timestamp and security options.
+		 */
+		optp = (u_char *) (iph + 1);
+		opts = ip_srcroute();
+		if (opts == 0) {
+			opts = m_gethdr(M_DONTWAIT, MT_HEADER);
+			if (opts) {
+				opts->m_len = sizeof(struct in_addr);
+				mtod(opts, struct in_addr *)->s_addr = 0;
+			}
+		}
+		if (opts) {
+#ifdef ICMPPRINTFS
+			if (icmpprintfs)
+				printf("icmp_reflect optlen %d rt %d => ",
+				    optlen, opts->m_len);
+#endif
+			for (cnt = optlen; cnt > 0; cnt -= len, optp += len) {
+				opt = optp[IPOPT_OPTVAL];
+				if (opt == IPOPT_EOL)
+					break;
+				if (opt == IPOPT_NOP)
+					len = 1;
+				else {
+					len = optp[IPOPT_OLEN];
+					if (len <= 0 || len > cnt)
+						break;
+				}
+				/*
+				 * Should check for overflow, but it "can't happen"
+				 */
+				if (opt == IPOPT_RR || opt == IPOPT_TS ||
+				    opt == IPOPT_SECURITY) {
+					bcopy((caddr_t)optp,
+					    mtod(opts, caddr_t) + opts->m_len, len);
+					opts->m_len += len;
+				}
+			}
+			/* Pad with EOL up to a multiple of four bytes */
+			while (opts->m_len % 4) {
+				*(mtod(opts, caddr_t) + opts->m_len) = IPOPT_EOL;
+				opts->m_len++;
+			}
+#ifdef ICMPPRINTFS
+			if (icmpprintfs)
+				printf("%d\n", opts->m_len);
+#endif
+		}
+		/*
+		 * Strip the original options: shorten the lengths and
+		 * slide the rest of the first mbuf's data down so that it
+		 * follows the bare ip header.
+		 */
+		iph->ip_len -= optlen;
+		iph->ip_hl = sizeof(struct ip) >> 2;
+		m->m_len -= optlen;
+		if (m->m_flags & M_PKTHDR)
+			m->m_pkthdr.len -= optlen;
+		optlen += sizeof(struct ip);
+		bcopy((caddr_t)iph + optlen, (caddr_t)(iph + 1),
+		    (unsigned)(m->m_len - sizeof(struct ip)));
+	}
+	m->m_flags &= ~(M_BCAST|M_MCAST);
+	icmp_send(m, opts);
+done:
+	if (opts)
+		(void)m_free(opts);
+}
+ +/* + * Send an icmp packet back to the ip level, + * after supplying a checksum.
+ */
+/*
+ * m starts at the ip header; opts is passed through to ip_output()
+ * unchanged.
+ */
+void
+icmp_send(m, opts)
+	register struct mbuf *m;
+	struct mbuf *opts;
+{
+	register struct ip *iph = mtod(m, struct ip *);
+	register int iphlen = iph->ip_hl << 2;
+	register struct icmp *icmph;
+
+	/* Step over the ip header so the mbuf begins at the icmp header. */
+	m->m_data += iphlen;
+	m->m_len -= iphlen;
+	icmph = mtod(m, struct icmp *);
+	/* The checksum field is zeroed before the sum is taken. */
+	icmph->icmp_cksum = 0;
+	icmph->icmp_cksum = in_cksum(m, iph->ip_len - iphlen);
+	/* Uncover the ip header again for ip_output(). */
+	m->m_data -= iphlen;
+	m->m_len += iphlen;
+#ifdef ICMPPRINTFS
+	if (icmpprintfs)
+		printf("icmp_send dst %x src %x\n", iph->ip_dst, iph->ip_src);
+#endif
+	(void) ip_output(m, opts, NULL, 0, NULL);
+}
+
+/*
+ * Return the number of milliseconds elapsed in the current 24-hour
+ * period of the microtime() clock, converted with htonl().
+ */
+n_time
+iptime()
+{
+	struct timeval now;
+	u_long msec;
+
+	microtime(&now);
+	msec = (now.tv_sec % (24*60*60)) * 1000;
+	msec += now.tv_usec / 1000;
+	return (htonl(msec));
+}
+
+/*
+ * Sysctl handler for the icmp level.  ICMPCTL_MASKREPL reads or sets
+ * icmpmaskrepl; any other name yields ENOPROTOOPT.
+ */
+int
+icmp_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
+	int *name;
+	u_int namelen;
+	void *oldp;
+	size_t *oldlenp;
+	void *newp;
+	size_t newlen;
+{
+
+	/* Names at this level are leaves: exactly one component. */
+	if (namelen != 1)
+		return (ENOTDIR);
+
+	if (name[0] == ICMPCTL_MASKREPL)
+		return (sysctl_int(oldp, oldlenp, newp, newlen, &icmpmaskrepl));
+	return (ENOPROTOOPT);
+}
diff --git a/sys/netinet/ip_icmp.h b/sys/netinet/ip_icmp.h new file mode 100644 index 00000000000..3c3462d3266 --- /dev/null +++ b/sys/netinet/ip_icmp.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_icmp.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Internet Control Message Protocol Definitions. + * Per RFC 792, September 1981. + */ + +/* + * Structure of an icmp header.
+ */ +struct icmp { + u_char icmp_type; /* type of message, see below */ + u_char icmp_code; /* type sub code */ + u_short icmp_cksum; /* ones complement cksum of struct */ + union { + u_char ih_pptr; /* ICMP_PARAMPROB */ + struct in_addr ih_gwaddr; /* ICMP_REDIRECT */ + struct ih_idseq { + n_short icd_id; + n_short icd_seq; + } ih_idseq; + int ih_void; /* overlays the other members; icmp_error() zeroes it for non-redirects */ + + /* ICMP_UNREACH_NEEDFRAG -- Path MTU Discovery (RFC1191) */ + struct ih_pmtu { + n_short ipm_void; + n_short ipm_nextmtu; /* icmp_error() stores htons(if_mtu) of the outgoing interface */ + } ih_pmtu; + } icmp_hun; +#define icmp_pptr icmp_hun.ih_pptr +#define icmp_gwaddr icmp_hun.ih_gwaddr +#define icmp_id icmp_hun.ih_idseq.icd_id +#define icmp_seq icmp_hun.ih_idseq.icd_seq +#define icmp_void icmp_hun.ih_void +#define icmp_pmvoid icmp_hun.ih_pmtu.ipm_void +#define icmp_nextmtu icmp_hun.ih_pmtu.ipm_nextmtu + union { + struct id_ts { + n_time its_otime; + n_time its_rtime; /* icmp_input() fills this from iptime() for ICMP_TSTAMP */ + n_time its_ttime; /* icmp_input() currently copies its_rtime here */ + } id_ts; + struct id_ip { + struct ip idi_ip; + /* options and then 64 bits of data */ + } id_ip; + u_long id_mask; /* icmp_input() sets this from ia_sockmask for ICMP_MASKREPLY */ + char id_data[1]; + } icmp_dun; +#define icmp_otime icmp_dun.id_ts.its_otime +#define icmp_rtime icmp_dun.id_ts.its_rtime +#define icmp_ttime icmp_dun.id_ts.its_ttime +#define icmp_ip icmp_dun.id_ip.idi_ip +#define icmp_mask icmp_dun.id_mask +#define icmp_data icmp_dun.id_data +}; + +/* + * Lower bounds on packet lengths for various types. + * For the error advice packets we must first ensure that the + * packet is large enough to contain the returned ip header. + * Only then can we do the check to see if 64 bits of packet + * data have been returned, since we need to check the returned + * ip header length.
+ */ +#define ICMP_MINLEN 8 /* abs minimum */ +#define ICMP_TSLEN (8 + 3 * sizeof (n_time)) /* timestamp */ +#define ICMP_MASKLEN 12 /* address mask */ +#define ICMP_ADVLENMIN (8 + sizeof (struct ip) + 8) /* min */ +#define ICMP_ADVLEN(p) (8 + ((p)->icmp_ip.ip_hl << 2) + 8) + /* N.B.: must separately check that ip_hl >= 5 */ + +/* + * Definition of type and code field values. + */ +#define ICMP_ECHOREPLY 0 /* echo reply */ +#define ICMP_UNREACH 3 /* dest unreachable, codes: */ +#define ICMP_UNREACH_NET 0 /* bad net */ +#define ICMP_UNREACH_HOST 1 /* bad host */ +#define ICMP_UNREACH_PROTOCOL 2 /* bad protocol */ +#define ICMP_UNREACH_PORT 3 /* bad port */ +#define ICMP_UNREACH_NEEDFRAG 4 /* IP_DF caused drop */ +#define ICMP_UNREACH_SRCFAIL 5 /* src route failed */ +#define ICMP_UNREACH_NET_UNKNOWN 6 /* unknown net */ +#define ICMP_UNREACH_HOST_UNKNOWN 7 /* unknown host */ +#define ICMP_UNREACH_ISOLATED 8 /* src host isolated */ +#define ICMP_UNREACH_NET_PROHIB 9 /* prohibited access */ +#define ICMP_UNREACH_HOST_PROHIB 10 /* ditto */ +#define ICMP_UNREACH_TOSNET 11 /* bad tos for net */ +#define ICMP_UNREACH_TOSHOST 12 /* bad tos for host */ +#define ICMP_SOURCEQUENCH 4 /* packet lost, slow down */ +#define ICMP_REDIRECT 5 /* shorter route, codes: */ +#define ICMP_REDIRECT_NET 0 /* for network */ +#define ICMP_REDIRECT_HOST 1 /* for host */ +#define ICMP_REDIRECT_TOSNET 2 /* for tos and net */ +#define ICMP_REDIRECT_TOSHOST 3 /* for tos and host */ +#define ICMP_ECHO 8 /* echo service */ +#define ICMP_ROUTERADVERT 9 /* router advertisement */ +#define ICMP_ROUTERSOLICIT 10 /* router solicitation */ +#define ICMP_TIMXCEED 11 /* time exceeded, code: */ +#define ICMP_TIMXCEED_INTRANS 0 /* ttl==0 in transit */ +#define ICMP_TIMXCEED_REASS 1 /* ttl==0 in reass */ +#define ICMP_PARAMPROB 12 /* ip header bad */ +#define ICMP_PARAMPROB_OPTABSENT 1 /* req. opt. 
absent */ +#define ICMP_TSTAMP 13 /* timestamp request */ +#define ICMP_TSTAMPREPLY 14 /* timestamp reply */ +#define ICMP_IREQ 15 /* information request */ +#define ICMP_IREQREPLY 16 /* information reply */ +#define ICMP_MASKREQ 17 /* address mask request */ +#define ICMP_MASKREPLY 18 /* address mask reply */ + +#define ICMP_MAXTYPE 18 + +#define ICMP_INFOTYPE(type) \ + ((type) == ICMP_ECHOREPLY || (type) == ICMP_ECHO || \ + (type) == ICMP_ROUTERADVERT || (type) == ICMP_ROUTERSOLICIT || \ + (type) == ICMP_TSTAMP || (type) == ICMP_TSTAMPREPLY || \ + (type) == ICMP_IREQ || (type) == ICMP_IREQREPLY || \ + (type) == ICMP_MASKREQ || (type) == ICMP_MASKREPLY) + +#ifdef KERNEL +void icmp_error __P((struct mbuf *, int, int, n_long, struct ifnet *)); +void icmp_input __P((struct mbuf *, int)); +void icmp_reflect __P((struct mbuf *)); +void icmp_send __P((struct mbuf *, struct mbuf *)); +int icmp_sysctl __P((int *, u_int, void *, size_t *, void *, size_t)); +#endif diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c new file mode 100644 index 00000000000..d3bfeac4b19 --- /dev/null +++ b/sys/netinet/ip_input.c @@ -0,0 +1,1166 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifndef IPFORWARDING +#ifdef GATEWAY +#define IPFORWARDING 1 /* forward IP packets not for us */ +#else /* GATEWAY */ +#define IPFORWARDING 0 /* don't forward IP packets not for us */ +#endif /* GATEWAY */ +#endif /* IPFORWARDING */ +#ifndef IPSENDREDIRECTS +#define IPSENDREDIRECTS 1 +#endif +int ipforwarding = IPFORWARDING; +int ipsendredirects = IPSENDREDIRECTS; +int ip_defttl = IPDEFTTL; +#ifdef DIAGNOSTIC +int ipprintfs = 0; +#endif + +extern struct domain inetdomain; +extern struct protosw inetsw[]; +u_char ip_protox[IPPROTO_MAX]; +int ipqmaxlen = IFQ_MAXLEN; +struct in_ifaddr *in_ifaddr; /* first inet address */ +struct ifqueue ipintrq; + +/* + * We need to save the IP options in case a protocol wants to respond + * to an incoming packet over the same route if the packet got here + * using IP source routing. This allows connection establishment and + * maintenance when the remote end is on a network that is not known + * to us. + */ +int ip_nhops = 0; +static struct ip_srcrt { + struct in_addr dst; /* final destination */ + char nop; /* one NOP to align */ + char srcopt[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN and OFFSET */ + struct in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)]; +} ip_srcrt; + +#ifdef GATEWAY +extern int if_index; +u_long *ip_ifmatrix; +#endif + +static void save_rte __P((u_char *, struct in_addr)); +/* + * IP initialization: fill in IP protocol switch table. + * All protocols not implemented in kernel go to raw IP protocol handler. 
+ */
+void
+ip_init()
+{
+	register struct protosw *pr;
+	register int i;
+
+	/* The raw IP handler is the fallback and has to exist. */
+	pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
+	if (pr == 0)
+		panic("ip_init");
+	/* Start with every protocol number routed to the raw handler. */
+	i = 0;
+	while (i < IPPROTO_MAX)
+		ip_protox[i++] = pr - inetsw;
+	/* Then give each protocol in the inet domain its own slot. */
+	for (pr = inetdomain.dom_protosw;
+	    pr < inetdomain.dom_protoswNPROTOSW; pr++) {
+		if (pr->pr_domain->dom_family != PF_INET)
+			continue;
+		if (pr->pr_protocol == 0 || pr->pr_protocol == IPPROTO_RAW)
+			continue;
+		ip_protox[pr->pr_protocol] = pr - inetsw;
+	}
+	/* Empty reassembly queue: the head points at itself. */
+	ipq.next = ipq.prev = &ipq;
+	ip_id = time.tv_sec & 0xffff;
+	ipintrq.ifq_maxlen = ipqmaxlen;
+#ifdef GATEWAY
+	/* Zeroed (if_index + 1) by (if_index + 1) table of u_long. */
+	i = (if_index + 1) * (if_index + 1) * sizeof (u_long);
+	ip_ifmatrix = (u_long *) malloc(i, M_RTABLE, M_WAITOK);
+	bzero((char *)ip_ifmatrix, i);
+#endif
+}
+ +struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; +struct route ipforward_rt; + +/* + * Ip input routine. Checksum and byte swap header. If fragmented + * try to reassemble. Process options. Pass to next level. + */ +void +ipintr() +{ + register struct ip *ip; + register struct mbuf *m; + register struct ipq *fp; + register struct in_ifaddr *ia; + int hlen, s; + +next: + /* + * Get next datagram off input queue and get IP header + * in first mbuf. + */ + s = splimp(); + IF_DEQUEUE(&ipintrq, m); + splx(s); + if (m == 0) + return; +#ifdef DIAGNOSTIC + if ((m->m_flags & M_PKTHDR) == 0) + panic("ipintr no HDR"); +#endif + /* + * If no IP addresses have been set yet but the interfaces + * are receiving, can't do anything with incoming packets yet.
+ */ + if (in_ifaddr == NULL) + goto bad; + ipstat.ips_total++; + if (m->m_len < sizeof (struct ip) && + (m = m_pullup(m, sizeof (struct ip))) == 0) { + ipstat.ips_toosmall++; + goto next; + } + ip = mtod(m, struct ip *); + if (ip->ip_v != IPVERSION) { + ipstat.ips_badvers++; + goto bad; + } + hlen = ip->ip_hl << 2; + if (hlen < sizeof(struct ip)) { /* minimum header length */ + ipstat.ips_badhlen++; + goto bad; + } + if (hlen > m->m_len) { + if ((m = m_pullup(m, hlen)) == 0) { + ipstat.ips_badhlen++; + goto next; + } + ip = mtod(m, struct ip *); + } + if (ip->ip_sum = in_cksum(m, hlen)) { + ipstat.ips_badsum++; + goto bad; + } + + /* + * Convert fields to host representation. + */ + NTOHS(ip->ip_len); + if (ip->ip_len < hlen) { + ipstat.ips_badlen++; + goto bad; + } + NTOHS(ip->ip_id); + NTOHS(ip->ip_off); + + /* + * Check that the amount of data in the buffers + * is as at least much as the IP header would have us expect. + * Trim mbufs if longer than we expect. + * Drop packet if shorter than we expect. + */ + if (m->m_pkthdr.len < ip->ip_len) { + ipstat.ips_tooshort++; + goto bad; + } + if (m->m_pkthdr.len > ip->ip_len) { + if (m->m_len == m->m_pkthdr.len) { + m->m_len = ip->ip_len; + m->m_pkthdr.len = ip->ip_len; + } else + m_adj(m, ip->ip_len - m->m_pkthdr.len); + } + + /* + * Process options and, if not destined for us, + * ship it on. ip_dooptions returns 1 when an + * error was detected (causing an icmp message + * to be sent and the original packet to be freed). + */ + ip_nhops = 0; /* for source routed packets */ + if (hlen > sizeof (struct ip) && ip_dooptions(m)) + goto next; + + /* + * Check our list of addresses, to see if the packet is for us. 
+ */ + for (ia = in_ifaddr; ia; ia = ia->ia_next) { +#define satosin(sa) ((struct sockaddr_in *)(sa)) + + if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr) + goto ours; + if ( +#ifdef DIRECTED_BROADCAST + ia->ia_ifp == m->m_pkthdr.rcvif && +#endif + (ia->ia_ifp->if_flags & IFF_BROADCAST)) { + u_long t; + + if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == + ip->ip_dst.s_addr) + goto ours; + if (ip->ip_dst.s_addr == ia->ia_netbroadcast.s_addr) + goto ours; + /* + * Look for all-0's host part (old broadcast addr), + * either for subnet or net. + */ + t = ntohl(ip->ip_dst.s_addr); + if (t == ia->ia_subnet) + goto ours; + if (t == ia->ia_net) + goto ours; + } + } + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + struct in_multi *inm; +#ifdef MROUTING + extern struct socket *ip_mrouter; + + if (ip_mrouter) { + /* + * If we are acting as a multicast router, all + * incoming multicast packets are passed to the + * kernel-level multicast forwarding function. + * The packet is returned (relatively) intact; if + * ip_mforward() returns a non-zero value, the packet + * must be discarded, else it may be accepted below. + * + * (The IP ident field is put in the same byte order + * as expected when ip_mforward() is called from + * ip_output().) + */ + ip->ip_id = htons(ip->ip_id); + if (ip_mforward(m, m->m_pkthdr.rcvif) != 0) { + ipstat.ips_cantforward++; + m_freem(m); + goto next; + } + ip->ip_id = ntohs(ip->ip_id); + + /* + * The process-level routing demon needs to receive + * all multicast IGMP packets, whether or not this + * host belongs to their destination groups. + */ + if (ip->ip_p == IPPROTO_IGMP) + goto ours; + ipstat.ips_forward++; + } +#endif + /* + * See if we belong to the destination multicast group on the + * arrival interface. 
+ */ + IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm); + if (inm == NULL) { + ipstat.ips_cantforward++; + m_freem(m); + goto next; + } + goto ours; + } + if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) + goto ours; + if (ip->ip_dst.s_addr == INADDR_ANY) + goto ours; + + /* + * Not for us; forward if possible and desirable. + */ + if (ipforwarding == 0) { + ipstat.ips_cantforward++; + m_freem(m); + } else + ip_forward(m, 0); + goto next; + +ours: + /* + * If offset or IP_MF are set, must reassemble. + * Otherwise, nothing need be done. + * (We could look in the reassembly queue to see + * if the packet was previously fragmented, + * but it's not worth the time; just let them time out.) + */ + if (ip->ip_off &~ IP_DF) { + if (m->m_flags & M_EXT) { /* XXX */ + if ((m = m_pullup(m, sizeof (struct ip))) == 0) { + ipstat.ips_toosmall++; + goto next; + } + ip = mtod(m, struct ip *); + } + /* + * Look for queue of fragments + * of this datagram. + */ + for (fp = ipq.next; fp != &ipq; fp = fp->next) + if (ip->ip_id == fp->ipq_id && + ip->ip_src.s_addr == fp->ipq_src.s_addr && + ip->ip_dst.s_addr == fp->ipq_dst.s_addr && + ip->ip_p == fp->ipq_p) + goto found; + fp = 0; +found: + + /* + * Adjust ip_len to not reflect header, + * set ip_mff if more fragments are expected, + * convert offset of this to bytes. + */ + ip->ip_len -= hlen; + ((struct ipasfrag *)ip)->ipf_mff &= ~1; + if (ip->ip_off & IP_MF) + ((struct ipasfrag *)ip)->ipf_mff |= 1; + ip->ip_off <<= 3; + + /* + * If datagram marked as having more fragments + * or if this is not the first fragment, + * attempt reassembly; if it succeeds, proceed. + */ + if (((struct ipasfrag *)ip)->ipf_mff & 1 || ip->ip_off) { + ipstat.ips_fragments++; + ip = ip_reass((struct ipasfrag *)ip, fp); + if (ip == 0) + goto next; + ipstat.ips_reassembled++; + m = dtom(ip); + } else + if (fp) + ip_freef(fp); + } else + ip->ip_len -= hlen; + + /* + * Switch out to protocol's input routine. 
+ */ + ipstat.ips_delivered++; + (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen); + goto next; +bad: + m_freem(m); + goto next; +}
+
+/*
+ * Take incoming datagram fragment and try to
+ * reassemble it into whole datagram.  If a chain for
+ * reassembly of this datagram already exists, then it
+ * is given as fp; otherwise have to make a chain.
+ *
+ * Returns the header of the reassembled datagram once every
+ * fragment is present; returns 0 when the fragment was only
+ * queued, or was dropped.
+ */
+struct ip *
+ip_reass(ip, fp)
+	register struct ipasfrag *ip;
+	register struct ipq *fp;
+{
+	register struct mbuf *m = dtom(ip);
+	register struct ipasfrag *q;
+	struct ipasfrag *covered;
+	struct mbuf *t;
+	int hlen = ip->ip_hl << 2;
+	int i, next;
+
+	/*
+	 * Presence of header sizes in mbufs
+	 * would confuse code below.
+	 */
+	m->m_data += hlen;
+	m->m_len -= hlen;
+
+	/*
+	 * If first fragment to arrive, create a reassembly queue.
+	 */
+	if (fp == 0) {
+		if ((t = m_get(M_DONTWAIT, MT_FTABLE)) == NULL)
+			goto dropfrag;
+		fp = mtod(t, struct ipq *);
+		insque(fp, &ipq);
+		fp->ipq_ttl = IPFRAGTTL;
+		fp->ipq_p = ip->ip_p;
+		fp->ipq_id = ip->ip_id;
+		fp->ipq_next = fp->ipq_prev = (struct ipasfrag *)fp;
+		fp->ipq_src = ((struct ip *)ip)->ip_src;
+		fp->ipq_dst = ((struct ip *)ip)->ip_dst;
+		q = (struct ipasfrag *)fp;
+		goto insert;
+	}
+
+	/*
+	 * Find a segment which begins after this one does.
+	 */
+	for (q = fp->ipq_next; q != (struct ipasfrag *)fp; q = q->ipf_next)
+		if (q->ip_off > ip->ip_off)
+			break;
+
+	/*
+	 * If there is a preceding segment, it may provide some of
+	 * our data already.  If so, drop the data from the incoming
+	 * segment.  If it provides all of our data, drop us.
+	 */
+	if (q->ipf_prev != (struct ipasfrag *)fp) {
+		i = q->ipf_prev->ip_off + q->ipf_prev->ip_len - ip->ip_off;
+		if (i > 0) {
+			if (i >= ip->ip_len)
+				goto dropfrag;
+			m_adj(dtom(ip), i);
+			ip->ip_off += i;
+			ip->ip_len -= i;
+		}
+	}
+
+	/*
+	 * While we overlap succeeding segments trim them or,
+	 * if they are completely covered, dequeue them.
+	 */
+	while (q != (struct ipasfrag *)fp &&
+	    ip->ip_off + ip->ip_len > q->ip_off) {
+		i = (ip->ip_off + ip->ip_len) - q->ip_off;
+		if (i < q->ip_len) {
+			q->ip_len -= i;
+			q->ip_off += i;
+			m_adj(dtom(q), i);
+			break;
+		}
+		/*
+		 * The queue links live inside the fragment's own mbuf,
+		 * so unlink the covered fragment first and free its
+		 * mbuf afterwards (same order as ip_freef()).
+		 */
+		covered = q;
+		q = q->ipf_next;
+		ip_deq(covered);
+		m_freem(dtom(covered));
+	}
+
+insert:
+	/*
+	 * Stick new segment in its place;
+	 * check for complete reassembly.
+	 */
+	ip_enq(ip, q->ipf_prev);
+	next = 0;
+	for (q = fp->ipq_next; q != (struct ipasfrag *)fp; q = q->ipf_next) {
+		if (q->ip_off != next)
+			return (0);
+		next += q->ip_len;
+	}
+	/* The last fragment on the chain must not expect more to follow. */
+	if (q->ipf_prev->ipf_mff & 1)
+		return (0);
+
+	/*
+	 * Reassembly is complete; concatenate fragments.
+	 */
+	q = fp->ipq_next;
+	m = dtom(q);
+	t = m->m_next;
+	m->m_next = 0;
+	m_cat(m, t);
+	q = q->ipf_next;
+	while (q != (struct ipasfrag *)fp) {
+		t = dtom(q);
+		q = q->ipf_next;
+		m_cat(m, t);
+	}
+
+	/*
+	 * Create header for new ip packet by
+	 * modifying header of first packet;
+	 * dequeue and discard fragment reassembly header.
+	 * Make header visible.
+	 */
+	ip = fp->ipq_next;
+	ip->ip_len = next;
+	ip->ipf_mff &= ~1;
+	((struct ip *)ip)->ip_src = fp->ipq_src;
+	((struct ip *)ip)->ip_dst = fp->ipq_dst;
+	remque(fp);
+	(void) m_free(dtom(fp));
+	m = dtom(ip);
+	m->m_len += (ip->ip_hl << 2);
+	m->m_data -= (ip->ip_hl << 2);
+	/* some debugging cruft by sklower, below, will go away soon */
+	if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
+		register int plen = 0;
+		for (t = m; m; m = m->m_next)
+			plen += m->m_len;
+		t->m_pkthdr.len = plen;
+	}
+	return ((struct ip *)ip);
+
+dropfrag:
+	ipstat.ips_fragdropped++;
+	m_freem(m);
+	return (0);
+}
+ +/* + * Free a fragment reassembly header and all + * associated datagrams.
+ */ +void +ip_freef(fp) + struct ipq *fp; +{ + register struct ipasfrag *q, *p; + + for (q = fp->ipq_next; q != (struct ipasfrag *)fp; q = p) { + p = q->ipf_next; + ip_deq(q); + m_freem(dtom(q)); + } + remque(fp); + (void) m_free(dtom(fp)); +} + +/* + * Put an ip fragment on a reassembly chain. + * Like insque, but pointers in middle of structure. + */ +void +ip_enq(p, prev) + register struct ipasfrag *p, *prev; +{ + + p->ipf_prev = prev; + p->ipf_next = prev->ipf_next; + prev->ipf_next->ipf_prev = p; + prev->ipf_next = p; +} + +/* + * To ip_enq as remque is to insque. + */ +void +ip_deq(p) + register struct ipasfrag *p; +{ + + p->ipf_prev->ipf_next = p->ipf_next; + p->ipf_next->ipf_prev = p->ipf_prev; +} + +/* + * IP timer processing; + * if a timer expires on a reassembly + * queue, discard it. + */ +void +ip_slowtimo() +{ + register struct ipq *fp; + int s = splnet(); + + fp = ipq.next; + if (fp == 0) { + splx(s); + return; + } + while (fp != &ipq) { + --fp->ipq_ttl; + fp = fp->next; + if (fp->prev->ipq_ttl == 0) { + ipstat.ips_fragtimeout++; + ip_freef(fp->prev); + } + } + splx(s); +} + +/* + * Drain off all datagram fragments. + */ +void +ip_drain() +{ + + while (ipq.next != &ipq) { + ipstat.ips_fragdropped++; + ip_freef(ipq.next); + } +} + +/* + * Do option processing on a datagram, + * possibly discarding it if bad options are encountered, + * or forwarding it if source-routed. + * Returns 1 if packet has been forwarded/freed, + * 0 if the packet should be processed further. 
+ */ +int +ip_dooptions(m) + struct mbuf *m; +{ + register struct ip *ip = mtod(m, struct ip *); + register u_char *cp; + register struct ip_timestamp *ipt; + register struct in_ifaddr *ia; + int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; + struct in_addr *sin, dst; + n_time ntime; + + dst = ip->ip_dst; + cp = (u_char *)(ip + 1); + cnt = (ip->ip_hl << 2) - sizeof (struct ip); + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + optlen = cp[IPOPT_OLEN]; + if (optlen <= 0 || optlen > cnt) { + code = &cp[IPOPT_OLEN] - (u_char *)ip; + goto bad; + } + } + switch (opt) { + + default: + break; + + /* + * Source routing with record. + * Find interface with current destination address. + * If none on this machine then drop if strictly routed, + * or do nothing if loosely routed. + * Record interface address and bring up next address + * component. If strictly routed make sure next + * address is on directly accessible net. + */ + case IPOPT_LSRR: + case IPOPT_SSRR: + if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + ipaddr.sin_addr = ip->ip_dst; + ia = (struct in_ifaddr *) + ifa_ifwithaddr((struct sockaddr *)&ipaddr); + if (ia == 0) { + if (opt == IPOPT_SSRR) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_SRCFAIL; + goto bad; + } + /* + * Loose routing, and not at next destination + * yet; nothing to do except forward. + */ + break; + } + off--; /* 0 origin */ + if (off > optlen - sizeof(struct in_addr)) { + /* + * End of source route. Should be for us. 
+ */ + save_rte(cp, ip->ip_src); + break; + } + /* + * locate outgoing interface + */ + bcopy((caddr_t)(cp + off), (caddr_t)&ipaddr.sin_addr, + sizeof(ipaddr.sin_addr)); + if (opt == IPOPT_SSRR) { +#define INA struct in_ifaddr * +#define SA struct sockaddr * + if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == 0) + ia = (INA)ifa_ifwithnet((SA)&ipaddr); + } else + ia = ip_rtaddr(ipaddr.sin_addr); + if (ia == 0) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_SRCFAIL; + goto bad; + } + ip->ip_dst = ipaddr.sin_addr; + bcopy((caddr_t)&(IA_SIN(ia)->sin_addr), + (caddr_t)(cp + off), sizeof(struct in_addr)); + cp[IPOPT_OFFSET] += sizeof(struct in_addr); + /* + * Let ip_intr's mcast routing check handle mcast pkts + */ + forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr)); + break; + + case IPOPT_RR: + if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + /* + * If no space remains, ignore. + */ + off--; /* 0 origin */ + if (off > optlen - sizeof(struct in_addr)) + break; + bcopy((caddr_t)(&ip->ip_dst), (caddr_t)&ipaddr.sin_addr, + sizeof(ipaddr.sin_addr)); + /* + * locate outgoing interface; if we're the destination, + * use the incoming interface (should be same). 
+ */ + if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == 0 && + (ia = ip_rtaddr(ipaddr.sin_addr)) == 0) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_HOST; + goto bad; + } + bcopy((caddr_t)&(IA_SIN(ia)->sin_addr), + (caddr_t)(cp + off), sizeof(struct in_addr)); + cp[IPOPT_OFFSET] += sizeof(struct in_addr); + break; + + case IPOPT_TS: + code = cp - (u_char *)ip; + ipt = (struct ip_timestamp *)cp; + if (ipt->ipt_len < 5) + goto bad; + if (ipt->ipt_ptr > ipt->ipt_len - sizeof (long)) { + if (++ipt->ipt_oflw == 0) + goto bad; + break; + } + sin = (struct in_addr *)(cp + ipt->ipt_ptr - 1); + switch (ipt->ipt_flg) { + + case IPOPT_TS_TSONLY: + break; + + case IPOPT_TS_TSANDADDR: + if (ipt->ipt_ptr + sizeof(n_time) + + sizeof(struct in_addr) > ipt->ipt_len) + goto bad; + ipaddr.sin_addr = dst; + ia = (INA)ifaof_ifpforaddr((SA)&ipaddr, + m->m_pkthdr.rcvif); + if (ia == 0) + continue; + bcopy((caddr_t)&IA_SIN(ia)->sin_addr, + (caddr_t)sin, sizeof(struct in_addr)); + ipt->ipt_ptr += sizeof(struct in_addr); + break; + + case IPOPT_TS_PRESPEC: + if (ipt->ipt_ptr + sizeof(n_time) + + sizeof(struct in_addr) > ipt->ipt_len) + goto bad; + bcopy((caddr_t)sin, (caddr_t)&ipaddr.sin_addr, + sizeof(struct in_addr)); + if (ifa_ifwithaddr((SA)&ipaddr) == 0) + continue; + ipt->ipt_ptr += sizeof(struct in_addr); + break; + + default: + goto bad; + } + ntime = iptime(); + bcopy((caddr_t)&ntime, (caddr_t)cp + ipt->ipt_ptr - 1, + sizeof(n_time)); + ipt->ipt_ptr += sizeof(n_time); + } + } + if (forward) { + ip_forward(m, 1); + return (1); + } + return (0); +bad: + ip->ip_len -= ip->ip_hl << 2; /* XXX icmp_error adds in hdr length */ + icmp_error(m, type, code, 0, 0); + ipstat.ips_badoptions++; + return (1); +} + +/* + * Given address of next destination (final or next hop), + * return internet address info of interface to be used to get there. 
+ */ +struct in_ifaddr * +ip_rtaddr(dst) + struct in_addr dst; +{ + register struct sockaddr_in *sin; + + sin = (struct sockaddr_in *) &ipforward_rt.ro_dst; + + if (ipforward_rt.ro_rt == 0 || dst.s_addr != sin->sin_addr.s_addr) { + if (ipforward_rt.ro_rt) { + RTFREE(ipforward_rt.ro_rt); + ipforward_rt.ro_rt = 0; + } + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = dst; + + rtalloc(&ipforward_rt); + } + if (ipforward_rt.ro_rt == 0) + return ((struct in_ifaddr *)0); + return ((struct in_ifaddr *) ipforward_rt.ro_rt->rt_ifa); +} + +/* + * Save incoming source route for use in replies, + * to be picked up later by ip_srcroute if the receiver is interested. + */ +void +save_rte(option, dst) + u_char *option; + struct in_addr dst; +{ + unsigned olen; + + olen = option[IPOPT_OLEN]; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf("save_rte: olen %d\n", olen); +#endif + if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst))) + return; + bcopy((caddr_t)option, (caddr_t)ip_srcrt.srcopt, olen); + ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); + ip_srcrt.dst = dst; +} + +/* + * Retrieve incoming source route for use in replies, + * in the same form used by setsockopt. + * The first hop is placed before the options, will be removed later. 
+ */
+struct mbuf *
+ip_srcroute()
+{
+	register struct in_addr *p, *q;
+	register struct mbuf *m;
+
+	/* ip_nhops is set by save_rte; zero means no hops to reverse. */
+	if (ip_nhops == 0)
+		return ((struct mbuf *)0);
+	/* Non-blocking allocation: a null return also means "no route". */
+	m = m_get(M_DONTWAIT, MT_SOOPTS);
+	if (m == 0)
+		return ((struct mbuf *)0);
+
+#define OPTSIZ	(sizeof(ip_srcrt.nop) + sizeof(ip_srcrt.srcopt))
+
+	/* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
+	m->m_len = ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) +
+	    OPTSIZ;
+#ifdef DIAGNOSTIC
+	if (ipprintfs)
+		printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len);
+#endif
+
+	/*
+	 * First save first hop for return route
+	 * (the last recorded hop; p is left on the hop before it).
+	 */
+	p = &ip_srcrt.route[ip_nhops - 1];
+	*(mtod(m, struct in_addr *)) = *p--;
+#ifdef DIAGNOSTIC
+	if (ipprintfs)
+		printf(" hops %lx", ntohl(mtod(m, struct in_addr *)->s_addr));
+#endif
+
+	/*
+	 * Copy option fields and padding (nop) to mbuf.
+	 * The pointer byte is reset to IPOPT_MINOFF first.
+	 */
+	ip_srcrt.nop = IPOPT_NOP;
+	ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
+	bcopy((caddr_t)&ip_srcrt.nop,
+	    mtod(m, caddr_t) + sizeof(struct in_addr), OPTSIZ);
+	/* q: first address slot after the leading hop and the option header. */
+	q = (struct in_addr *)(mtod(m, caddr_t) +
+	    sizeof(struct in_addr) + OPTSIZ);
+#undef OPTSIZ
+	/*
+	 * Record return path as an IP source route,
+	 * reversing the path (pointers are now aligned).
+	 */
+	while (p >= ip_srcrt.route) {
+#ifdef DIAGNOSTIC
+		/* NOTE(review): prints *q before the store below fills it in. */
+		if (ipprintfs)
+			printf(" %lx", ntohl(q->s_addr));
+#endif
+		*q++ = *p--;
+	}
+	/*
+	 * Last hop goes to final destination.
+	 */
+	*q = ip_srcrt.dst;
+#ifdef DIAGNOSTIC
+	if (ipprintfs)
+		printf(" %lx\n", ntohl(q->s_addr));
+#endif
+	return (m);
+}
+
+/*
+ * Strip out IP options, at higher
+ * level protocol in the kernel.
+ * The options are removed in place; the second argument is
+ * not used and nothing is returned.
+ * XXX should be deleted; last arg currently ignored.
+ */
+void
+ip_stripoptions(m, mopt)
+	register struct mbuf *m;
+	struct mbuf *mopt;
+{
+	struct ip *ip = mtod(m, struct ip *);
+	register caddr_t optstart = (caddr_t)(ip + 1);
+	int olen = (ip->ip_hl<<2) - sizeof (struct ip);
+	register int taillen;
+
+	/* Bytes in this mbuf that follow the options. */
+	taillen = m->m_len - (sizeof (struct ip) + olen);
+
+	/* Slide them down over the options and shrink the lengths to match. */
+	bcopy(optstart + olen, optstart, (unsigned)taillen);
+	m->m_len -= olen;
+	if (m->m_flags & M_PKTHDR)
+		m->m_pkthdr.len -= olen;
+
+	/* The header is now just the fixed part. */
+	ip->ip_hl = sizeof(struct ip) >> 2;
+}
+
+/*
+ * Error number for each of the PRC_NCMDS control commands,
+ * indexed by command; 0 means no error is reported.
+ */
+u_char inetctlerrmap[PRC_NCMDS] = {
+	/*  0 -  3 */
+	0,		0,		0,		0,
+	/*  4 -  7 */
+	0,		EMSGSIZE,	EHOSTDOWN,	EHOSTUNREACH,
+	/*  8 - 11 */
+	EHOSTUNREACH,	EHOSTUNREACH,	ECONNREFUSED,	ECONNREFUSED,
+	/* 12 - 15 */
+	EMSGSIZE,	EHOSTUNREACH,	0,		0,
+	/* 16 - 19 */
+	0,		0,		0,		0,
+	/* 20 */
+	ENOPROTOOPT
+};
+
+/*
+ * Forward a packet.  If some error occurs return the sender
+ * an icmp packet.  Note we can't always generate a meaningful
+ * icmp message because icmp doesn't have a large enough repertoire
+ * of codes and types.
+ *
+ * If not forwarding, just drop the packet.  This could be confusing
+ * if ipforwarding was zero but some routing protocol was advancing
+ * us as a gateway to somewhere.  However, we must let the routing
+ * protocol deal with that.
+ *
+ * The srcrt parameter indicates whether the packet is being forwarded
+ * via a source route.
+ */
+void
+ip_forward(m, srcrt)
+	struct mbuf *m;
+	int srcrt;
+{
+	register struct ip *ip = mtod(m, struct ip *);
+	register struct sockaddr_in *sin;
+	register struct rtentry *rt;
+	int error, type = 0, code;
+	struct mbuf *mcopy;
+	n_long dest;
+	struct ifnet *destifp;
+
+	dest = 0;
+#ifdef DIAGNOSTIC
+	if (ipprintfs)
+		printf("forward: src %x dst %x ttl %x\n", ip->ip_src,
+			ip->ip_dst, ip->ip_ttl);
+#endif
+	/*
+	 * Packets flagged M_BCAST and destinations rejected by
+	 * in_canforward() are counted and dropped.
+	 */
+	if (m->m_flags & M_BCAST || in_canforward(ip->ip_dst) == 0) {
+		ipstat.ips_cantforward++;
+		m_freem(m);
+		return;
+	}
+	HTONS(ip->ip_id);
+	/*
+	 * Not enough time-to-live left for another hop: answer with
+	 * an ICMP time-exceeded error instead.  The mbuf is handed to
+	 * icmp_error() and not touched again here.
+	 */
+	if (ip->ip_ttl <= IPTTLDEC) {
+		icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, dest, 0);
+		return;
+	}
+	ip->ip_ttl -= IPTTLDEC;
+
+	/*
+	 * Reuse the cached forwarding route when it was looked up for
+	 * this destination; otherwise release it and allocate a new one.
+	 */
+	sin = (struct sockaddr_in *)&ipforward_rt.ro_dst;
+	if ((rt = ipforward_rt.ro_rt) == 0 ||
+	    ip->ip_dst.s_addr != sin->sin_addr.s_addr) {
+		if (ipforward_rt.ro_rt) {
+			RTFREE(ipforward_rt.ro_rt);
+			ipforward_rt.ro_rt = 0;
+		}
+		sin->sin_family = AF_INET;
+		sin->sin_len = sizeof(*sin);
+		sin->sin_addr = ip->ip_dst;
+
+		rtalloc(&ipforward_rt);
+		if (ipforward_rt.ro_rt == 0) {
+			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0);
+			return;
+		}
+		rt = ipforward_rt.ro_rt;
+	}
+
+	/*
+	 * Save at most 64 bytes of the packet in case
+	 * we need to generate an ICMP message to the src.
+	 */
+	mcopy = m_copy(m, 0, imin((int)ip->ip_len, 64));
+
+#ifdef GATEWAY
+	ip_ifmatrix[rt->rt_ifp->if_index +
+	     if_index * m->m_pkthdr.rcvif->if_index]++;
+#endif
+	/*
+	 * If forwarding packet using same interface that it came in on,
+	 * perhaps should send a redirect to sender to shortcut a hop.
+	 * Only send redirect if source is sending directly to us,
+	 * and if packet was not source routed (or has any options).
+	 * Also, don't send redirect if forwarding using a default route
+	 * or a route modified by a redirect.
+	 */
+#define	satosin(sa)	((struct sockaddr_in *)(sa))
+	if (rt->rt_ifp == m->m_pkthdr.rcvif &&
+	    (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
+	    satosin(rt_key(rt))->sin_addr.s_addr != 0 &&
+	    ipsendredirects && !srcrt) {
+#define	RTA(rt)	((struct in_ifaddr *)(rt->rt_ifa))
+		u_long src = ntohl(ip->ip_src.s_addr);
+
+		/* Redirect only a sender on the outgoing interface's subnet. */
+		if (RTA(rt) &&
+		    (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) {
+		    if (rt->rt_flags & RTF_GATEWAY)
+			dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
+		    else
+			dest = ip->ip_dst.s_addr;
+		    /* Router requirements says to only send host redirects */
+		    type = ICMP_REDIRECT;
+		    code = ICMP_REDIRECT_HOST;
+#ifdef DIAGNOSTIC
+		    if (ipprintfs)
+		        printf("redirect (%d) to %lx\n", code, (u_long)dest);
+#endif
+		}
+	}
+
+	error = ip_output(m, (struct mbuf *)0, &ipforward_rt, IP_FORWARDING
+#ifdef DIRECTED_BROADCAST
+			    | IP_ALLOWBROADCAST
+#endif
+						, 0);
+	if (error)
+		ipstat.ips_cantforward++;
+	else {
+		ipstat.ips_forward++;
+		if (type)
+			ipstat.ips_redirectsent++;
+		else {
+			/* Sent and no redirect pending: the saved copy is not needed. */
+			if (mcopy)
+				m_freem(mcopy);
+			return;
+		}
+	}
+	/* Without the saved copy there is nothing to build an ICMP message from. */
+	if (mcopy == NULL)
+		return;
+	destifp = NULL;
+
+	/* Translate the ip_output() result into an ICMP type and code. */
+	switch (error) {
+
+	case 0:				/* forwarded, but need redirect */
+		/* type, code set above */
+		break;
+
+	case ENETUNREACH:		/* shouldn't happen, checked above */
+	case EHOSTUNREACH:
+	case ENETDOWN:
+	case EHOSTDOWN:
+	default:
+		type = ICMP_UNREACH;
+		code = ICMP_UNREACH_HOST;
+		break;
+
+	case EMSGSIZE:
+		type = ICMP_UNREACH;
+		code = ICMP_UNREACH_NEEDFRAG;
+		if (ipforward_rt.ro_rt)
+			destifp = ipforward_rt.ro_rt->rt_ifp;
+		ipstat.ips_cantfrag++;
+		break;
+
+	case ENOBUFS:
+		type = ICMP_SOURCEQUENCH;
+		code = 0;
+		break;
+	}
+	icmp_error(mcopy, type, code, dest, destifp);
+}
+
+/*
+ * sysctl handler for the IP layer's integer variables.
+ * name[0] selects the variable; oldp, oldlenp, newp and newlen are
+ * passed through to sysctl_int().  Returns ENOTDIR when namelen is
+ * not 1 and EOPNOTSUPP for a name that is not handled here.
+ */
+int
+ip_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
+	int *name;
+	u_int namelen;
+	void *oldp;
+	size_t *oldlenp;
+	void *newp;
+	size_t newlen;
+{
+	/* All sysctl names at this level are terminal. */
+	if (namelen != 1)
+		return (ENOTDIR);
+
+	switch (name[0]) {
+	case IPCTL_FORWARDING:
+		return (sysctl_int(oldp, oldlenp, newp, newlen, &ipforwarding));
+	case IPCTL_SENDREDIRECTS:
+		return (sysctl_int(oldp, oldlenp, newp, newlen,
+			&ipsendredirects));
+	case IPCTL_DEFTTL:
+		return (sysctl_int(oldp, oldlenp, newp, newlen, &ip_defttl));
+#ifdef notyet
+	case IPCTL_DEFMTU:
+		return (sysctl_int(oldp, oldlenp, newp, newlen, &ip_mtu));
+#endif
+	default:
+		return (EOPNOTSUPP);
+	}
+	/* NOTREACHED */
+}
diff --git a/sys/netinet/ip_mroute.c b/sys/netinet/ip_mroute.c
new file mode 100644
index 00000000000..1744ec17fb6
--- /dev/null
+++ b/sys/netinet/ip_mroute.c
@@ -0,0 +1,834 @@
+/*
+ * Copyright (c) 1989 Stephen Deering
+ * Copyright (c) 1992, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Stephen Deering of Stanford University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 + */ + +/* + * Procedures for the kernel part of DVMRP, + * a Distance-Vector Multicast Routing Protocol. + * (See RFC-1075.) + * + * Written by David Waitzman, BBN Labs, August 1988. + * Modified by Steve Deering, Stanford, February 1989. 
+ * + * MROUTING 1.1 + */ + +#ifndef MROUTING +int ip_mrtproto; /* for netstat only */ +#else + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* Static forwards */ +static int ip_mrouter_init __P((struct socket *)); +static int add_vif __P((struct vifctl *)); +static int del_vif __P((vifi_t *vifip)); +static int add_lgrp __P((struct lgrplctl *)); +static int del_lgrp __P((struct lgrplctl *)); +static int grplst_member __P((struct vif *, struct in_addr)); +static u_long nethash __P((struct in_addr in)); +static int add_mrt __P((struct mrtctl *)); +static int del_mrt __P((struct in_addr *)); +static struct mrt *mrtfind __P((struct in_addr)); +static void phyint_send __P((struct mbuf *, struct vif *)); +static void tunnel_send __P((struct mbuf *, struct vif *)); + +#define INSIZ sizeof(struct in_addr) +#define same(a1, a2) (bcmp((caddr_t)(a1), (caddr_t)(a2), INSIZ) == 0) +#define satosin(sa) ((struct sockaddr_in *)(sa)) + +/* + * Globals. All but ip_mrouter and ip_mrtproto could be static, + * except for netstat or debugging purposes. + */ +struct socket *ip_mrouter = NULL; +int ip_mrtproto = IGMP_DVMRP; /* for netstat only */ + +struct mrt *mrttable[MRTHASHSIZ]; +struct vif viftable[MAXVIFS]; +struct mrtstat mrtstat; + +/* + * Private variables. + */ +static vifi_t numvifs = 0; +static struct mrt *cached_mrt = NULL; +static u_long cached_origin; +static u_long cached_originmask; + +/* + * Handle DVMRP setsockopt commands to modify the multicast routing tables. 
+ */ +int +ip_mrouter_cmd(cmd, so, m) + register int cmd; + register struct socket *so; + register struct mbuf *m; +{ + register int error = 0; + + if (cmd != DVMRP_INIT && so != ip_mrouter) + error = EACCES; + else switch (cmd) { + + case DVMRP_INIT: + error = ip_mrouter_init(so); + break; + + case DVMRP_DONE: + error = ip_mrouter_done(); + break; + + case DVMRP_ADD_VIF: + if (m == NULL || m->m_len < sizeof(struct vifctl)) + error = EINVAL; + else + error = add_vif(mtod(m, struct vifctl *)); + break; + + case DVMRP_DEL_VIF: + if (m == NULL || m->m_len < sizeof(short)) + error = EINVAL; + else + error = del_vif(mtod(m, vifi_t *)); + break; + + case DVMRP_ADD_LGRP: + if (m == NULL || m->m_len < sizeof(struct lgrplctl)) + error = EINVAL; + else + error = add_lgrp(mtod(m, struct lgrplctl *)); + break; + + case DVMRP_DEL_LGRP: + if (m == NULL || m->m_len < sizeof(struct lgrplctl)) + error = EINVAL; + else + error = del_lgrp(mtod(m, struct lgrplctl *)); + break; + + case DVMRP_ADD_MRT: + if (m == NULL || m->m_len < sizeof(struct mrtctl)) + error = EINVAL; + else + error = add_mrt(mtod(m, struct mrtctl *)); + break; + + case DVMRP_DEL_MRT: + if (m == NULL || m->m_len < sizeof(struct in_addr)) + error = EINVAL; + else + error = del_mrt(mtod(m, struct in_addr *)); + break; + + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +/* + * Enable multicast routing + */ +static int +ip_mrouter_init(so) + register struct socket *so; +{ + if (so->so_type != SOCK_RAW || + so->so_proto->pr_protocol != IPPROTO_IGMP) + return (EOPNOTSUPP); + + if (ip_mrouter != NULL) + return (EADDRINUSE); + + ip_mrouter = so; + + return (0); +} + +/* + * Disable multicast routing + */ +int +ip_mrouter_done() +{ + register vifi_t vifi; + register int i; + register struct ifnet *ifp; + register int s; + struct ifreq ifr; + + s = splnet(); + + /* + * For each phyint in use, free its local group list and + * disable promiscuous reception of all IP multicasts. 
+ */ + for (vifi = 0; vifi < numvifs; vifi++) { + if (viftable[vifi].v_lcl_addr.s_addr != 0 && + !(viftable[vifi].v_flags & VIFF_TUNNEL)) { + if (viftable[vifi].v_lcl_grps) + free(viftable[vifi].v_lcl_grps, M_MRTABLE); + satosin(&ifr.ifr_addr)->sin_family = AF_INET; + satosin(&ifr.ifr_addr)->sin_addr.s_addr = INADDR_ANY; + ifp = viftable[vifi].v_ifp; + (*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr); + } + } + bzero((caddr_t)viftable, sizeof(viftable)); + numvifs = 0; + + /* + * Free any multicast route entries. + */ + for (i = 0; i < MRTHASHSIZ; i++) + if (mrttable[i]) + free(mrttable[i], M_MRTABLE); + bzero((caddr_t)mrttable, sizeof(mrttable)); + cached_mrt = NULL; + + ip_mrouter = NULL; + + splx(s); + return (0); +} + +/* + * Add a vif to the vif table + */ +static int +add_vif(vifcp) + register struct vifctl *vifcp; +{ + register struct vif *vifp = viftable + vifcp->vifc_vifi; + register struct ifaddr *ifa; + register struct ifnet *ifp; + struct ifreq ifr; + register int error, s; + static struct sockaddr_in sin = { sizeof(sin), AF_INET }; + + if (vifcp->vifc_vifi >= MAXVIFS) + return (EINVAL); + if (vifp->v_lcl_addr.s_addr != 0) + return (EADDRINUSE); + + /* Find the interface with an address in AF_INET family */ + sin.sin_addr = vifcp->vifc_lcl_addr; + ifa = ifa_ifwithaddr((struct sockaddr *)&sin); + if (ifa == 0) + return (EADDRNOTAVAIL); + + s = splnet(); + + if (vifcp->vifc_flags & VIFF_TUNNEL) + vifp->v_rmt_addr = vifcp->vifc_rmt_addr; + else { + /* Make sure the interface supports multicast */ + ifp = ifa->ifa_ifp; + if ((ifp->if_flags & IFF_MULTICAST) == 0) { + splx(s); + return (EOPNOTSUPP); + } + /* + * Enable promiscuous reception of all IP multicasts + * from the interface. 
+ */ + satosin(&ifr.ifr_addr)->sin_family = AF_INET; + satosin(&ifr.ifr_addr)->sin_addr.s_addr = INADDR_ANY; + error = (*ifp->if_ioctl)(ifp, SIOCADDMULTI, (caddr_t)&ifr); + if (error) { + splx(s); + return (error); + } + } + + vifp->v_flags = vifcp->vifc_flags; + vifp->v_threshold = vifcp->vifc_threshold; + vifp->v_lcl_addr = vifcp->vifc_lcl_addr; + vifp->v_ifp = ifa->ifa_ifp; + + /* Adjust numvifs up if the vifi is higher than numvifs */ + if (numvifs <= vifcp->vifc_vifi) + numvifs = vifcp->vifc_vifi + 1; + + splx(s); + return (0); +} + +/* + * Delete a vif from the vif table + */ +static int +del_vif(vifip) + register vifi_t *vifip; +{ + register struct vif *vifp = viftable + *vifip; + register struct ifnet *ifp; + register int i, s; + struct ifreq ifr; + + if (*vifip >= numvifs) + return (EINVAL); + if (vifp->v_lcl_addr.s_addr == 0) + return (EADDRNOTAVAIL); + + s = splnet(); + + if (!(vifp->v_flags & VIFF_TUNNEL)) { + if (vifp->v_lcl_grps) + free(vifp->v_lcl_grps, M_MRTABLE); + satosin(&ifr.ifr_addr)->sin_family = AF_INET; + satosin(&ifr.ifr_addr)->sin_addr.s_addr = INADDR_ANY; + ifp = vifp->v_ifp; + (*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr); + } + + bzero((caddr_t)vifp, sizeof (*vifp)); + + /* Adjust numvifs down */ + for (i = numvifs - 1; i >= 0; i--) + if (viftable[i].v_lcl_addr.s_addr != 0) + break; + numvifs = i + 1; + + splx(s); + return (0); +} + +/* + * Add the multicast group in the lgrpctl to the list of local multicast + * group memberships associated with the vif indexed by gcp->lgc_vifi. 
+ *
+ * Returns EINVAL for a vif index at or beyond numvifs, EADDRNOTAVAIL
+ * if the vif is unused or is a tunnel, and ENOBUFS if the group list
+ * cannot be enlarged.
+ */
+static int
+add_lgrp(gcp)
+	register struct lgrplctl *gcp;
+{
+	register struct vif *vifp;
+	register int s;
+
+	if (gcp->lgc_vifi >= numvifs)
+		return (EINVAL);
+
+	vifp = viftable + gcp->lgc_vifi;
+	if (vifp->v_lcl_addr.s_addr == 0 || (vifp->v_flags & VIFF_TUNNEL))
+		return (EADDRNOTAVAIL);
+
+	/* If not enough space in existing list, allocate a larger one */
+	s = splnet();
+	if (vifp->v_lcl_grps_n + 1 >= vifp->v_lcl_grps_max) {
+		register int num;
+		register struct in_addr *ip;
+
+		num = vifp->v_lcl_grps_max;
+		if (num <= 0)
+			num = 32;	/* initial number */
+		else
+			num += num;	/* double last number */
+		ip = (struct in_addr *)malloc(num * sizeof(*ip),
+		    M_MRTABLE, M_NOWAIT);
+		if (ip == NULL) {
+			splx(s);
+			return (ENOBUFS);
+		}
+
+		bzero((caddr_t)ip, num * sizeof(*ip));	/* XXX paranoid */
+		/* Nothing to carry over or release on the first allocation. */
+		if (vifp->v_lcl_grps) {
+			bcopy((caddr_t)vifp->v_lcl_grps, (caddr_t)ip,
+			    vifp->v_lcl_grps_n * sizeof(*ip));
+			free(vifp->v_lcl_grps, M_MRTABLE);
+		}
+
+		vifp->v_lcl_grps_max = num;
+		vifp->v_lcl_grps = ip;
+
+		/*
+		 * No splx() here: the append and the cache update below
+		 * belong to the same protected section, and s is restored
+		 * exactly once on the way out.
+		 */
+	}
+
+	vifp->v_lcl_grps[vifp->v_lcl_grps_n++] = gcp->lgc_gaddr;
+
+	/* Keep the one-entry membership cache consistent with the list. */
+	if (gcp->lgc_gaddr.s_addr == vifp->v_cached_group)
+		vifp->v_cached_result = 1;
+
+	splx(s);
+	return (0);
+}
+
+/*
+ * Delete the local multicast group associated with the vif
+ * indexed by gcp->lgc_vifi.
+ */ + +static int +del_lgrp(gcp) + register struct lgrplctl *gcp; +{ + register struct vif *vifp; + register int i, error, s; + + if (gcp->lgc_vifi >= numvifs) + return (EINVAL); + vifp = viftable + gcp->lgc_vifi; + if (vifp->v_lcl_addr.s_addr == 0 || (vifp->v_flags & VIFF_TUNNEL)) + return (EADDRNOTAVAIL); + + s = splnet(); + + if (gcp->lgc_gaddr.s_addr == vifp->v_cached_group) + vifp->v_cached_result = 0; + + error = EADDRNOTAVAIL; + for (i = 0; i < vifp->v_lcl_grps_n; ++i) + if (same(&gcp->lgc_gaddr, &vifp->v_lcl_grps[i])) { + error = 0; + vifp->v_lcl_grps_n--; + bcopy((caddr_t)&vifp->v_lcl_grps[i + 1], + (caddr_t)&vifp->v_lcl_grps[i], + (vifp->v_lcl_grps_n - i) * sizeof(struct in_addr)); + error = 0; + break; + } + + splx(s); + return (error); +} + +/* + * Return 1 if gaddr is a member of the local group list for vifp. + */ +static int +grplst_member(vifp, gaddr) + register struct vif *vifp; + struct in_addr gaddr; +{ + register int i, s; + register u_long addr; + + mrtstat.mrts_grp_lookups++; + + addr = gaddr.s_addr; + if (addr == vifp->v_cached_group) + return (vifp->v_cached_result); + + mrtstat.mrts_grp_misses++; + + for (i = 0; i < vifp->v_lcl_grps_n; ++i) + if (addr == vifp->v_lcl_grps[i].s_addr) { + s = splnet(); + vifp->v_cached_group = addr; + vifp->v_cached_result = 1; + splx(s); + return (1); + } + s = splnet(); + vifp->v_cached_group = addr; + vifp->v_cached_result = 0; + splx(s); + return (0); +} + +/* + * A simple hash function: returns MRTHASHMOD of the low-order octet of + * the argument's network or subnet number. 
+ */ +static u_long +nethash(in) + struct in_addr in; +{ + register u_long n; + + n = in_netof(in); + while ((n & 0xff) == 0) + n >>= 8; + return (MRTHASHMOD(n)); +} + +/* + * Add an mrt entry + */ +static int +add_mrt(mrtcp) + register struct mrtctl *mrtcp; +{ + struct mrt *rt; + u_long hash; + int s; + + if (rt = mrtfind(mrtcp->mrtc_origin)) { + /* Just update the route */ + s = splnet(); + rt->mrt_parent = mrtcp->mrtc_parent; + VIFM_COPY(mrtcp->mrtc_children, rt->mrt_children); + VIFM_COPY(mrtcp->mrtc_leaves, rt->mrt_leaves); + splx(s); + return (0); + } + + s = splnet(); + + rt = (struct mrt *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); + if (rt == NULL) { + splx(s); + return (ENOBUFS); + } + + /* + * insert new entry at head of hash chain + */ + rt->mrt_origin = mrtcp->mrtc_origin; + rt->mrt_originmask = mrtcp->mrtc_originmask; + rt->mrt_parent = mrtcp->mrtc_parent; + VIFM_COPY(mrtcp->mrtc_children, rt->mrt_children); + VIFM_COPY(mrtcp->mrtc_leaves, rt->mrt_leaves); + /* link into table */ + hash = nethash(mrtcp->mrtc_origin); + rt->mrt_next = mrttable[hash]; + mrttable[hash] = rt; + + splx(s); + return (0); +} + +/* + * Delete an mrt entry + */ +static int +del_mrt(origin) + register struct in_addr *origin; +{ + register struct mrt *rt, *prev_rt; + register u_long hash = nethash(*origin); + register int s; + + for (prev_rt = rt = mrttable[hash]; rt; prev_rt = rt, rt = rt->mrt_next) + if (origin->s_addr == rt->mrt_origin.s_addr) + break; + if (!rt) + return (ESRCH); + + s = splnet(); + + if (rt == cached_mrt) + cached_mrt = NULL; + + if (prev_rt == rt) + mrttable[hash] = rt->mrt_next; + else + prev_rt->mrt_next = rt->mrt_next; + free(rt, M_MRTABLE); + + splx(s); + return (0); +} + +/* + * Find a route for a given origin IP address. 
+ * Returns the matching entry or NULL; a hit is remembered in
+ * cached_mrt, cached_origin and cached_originmask.
+ */
+static struct mrt *
+mrtfind(origin)
+	struct in_addr origin;
+{
+	register struct mrt *rt;
+	register u_int hash;
+	register int s;
+
+	mrtstat.mrts_mrt_lookups++;
+
+	/* Fast path: the previous hit also covers this origin. */
+	if (cached_mrt != NULL &&
+	    (origin.s_addr & cached_originmask) == cached_origin)
+		return (cached_mrt);
+
+	mrtstat.mrts_mrt_misses++;
+
+	/* Search the origin's hash chain for an entry whose masked origin matches. */
+	hash = nethash(origin);
+	for (rt = mrttable[hash]; rt; rt = rt->mrt_next)
+		if ((origin.s_addr & rt->mrt_originmask.s_addr) ==
+		    rt->mrt_origin.s_addr) {
+			s = splnet();
+			cached_mrt = rt;
+			cached_origin = rt->mrt_origin.s_addr;
+			cached_originmask = rt->mrt_originmask.s_addr;
+			splx(s);
+			return (rt);
+		}
+	return (NULL);
+}
+
+/*
+ * IP multicast forwarding function. This function assumes that the packet
+ * pointed to by "ip" has arrived on (or is about to be sent to) the interface
+ * pointed to by "ifp", and the packet is to be relayed to other networks
+ * that have members of the packet's destination IP multicast group.
+ *
+ * The packet is returned unscathed to the caller, unless it is tunneled
+ * or erroneous, in which case a non-zero return value tells the caller to
+ * discard it.
+ */
+
+#define IP_HDR_LEN  20	/* # bytes of fixed IP header (excluding options) */
+#define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
+
+int
+ip_mforward(m, ifp)
+	register struct mbuf *m;
+	register struct ifnet *ifp;
+{
+	register struct ip *ip = mtod(m, struct ip *);
+	register struct mrt *rt;
+	register struct vif *vifp;
+	register int vifi;
+	register u_char *ipoptions;
+	u_long tunnel_src;
+
+	/*
+	 * A header too short to hold the tunnel options, or whose second
+	 * option byte is not LSRR, is treated as a non-tunneled packet.
+	 */
+	if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 ||
+	    (ipoptions = (u_char *)(ip + 1))[1] != IPOPT_LSRR ) {
+		/*
+		 * Packet arrived via a physical interface.
+		 */
+		tunnel_src = 0;
+	} else {
+		/*
+		 * Packet arrived through a tunnel.
+		 *
+		 * A tunneled packet has a single NOP option and a
+		 * two-element loose-source-and-record-route (LSRR)
+		 * option immediately following the fixed-size part of
+		 * the IP header.  At this point in processing, the IP
+		 * header should contain the following IP addresses:
+		 *
+		 * original source          - in the source address field
+		 * destination group        - in the destination address field
+		 * remote tunnel end-point  - in the first  element of LSRR
+		 * one of this host's addrs - in the second element of LSRR
+		 *
+		 * NOTE: RFC-1075 would have the original source and
+		 * remote tunnel end-point addresses swapped.  However,
+		 * that could cause delivery of ICMP error messages to
+		 * innocent applications on intermediate routing
+		 * hosts!  Therefore, we hereby change the spec.
+		 */
+
+		/*
+		 * Verify that the tunnel options are well-formed.
+		 */
+		if (ipoptions[0] != IPOPT_NOP ||
+		    ipoptions[2] != 11 ||	/* LSRR option length   */
+		    ipoptions[3] != 12 ||	/* LSRR address pointer */
+		    (tunnel_src = *(u_long *)(&ipoptions[4])) == 0) {
+			mrtstat.mrts_bad_tunnel++;
+			return (1);
+		}
+
+		/*
+		 * Delete the tunnel options from the packet.
+		 */
+		ovbcopy((caddr_t)(ipoptions + TUNNEL_LEN), (caddr_t)ipoptions,
+		    (unsigned)(m->m_len - (IP_HDR_LEN + TUNNEL_LEN)));
+		m->m_len -= TUNNEL_LEN;
+		ip->ip_len -= TUNNEL_LEN;
+		ip->ip_hl -= TUNNEL_LEN >> 2;
+	}
+
+	/*
+	 * From here on the return value is tunnel_src: zero for a
+	 * packet from a physical interface, non-zero for a tunneled one.
+	 */
+
+	/*
+	 * Don't forward a packet with time-to-live of zero or one,
+	 * or a packet destined to a local-only group.
+	 */
+	if (ip->ip_ttl <= 1 ||
+	    ntohl(ip->ip_dst.s_addr) <= INADDR_MAX_LOCAL_GROUP)
+		return ((int)tunnel_src);
+
+	/*
+	 * Don't forward if we don't have a route for the packet's origin.
+	 */
+	if (!(rt = mrtfind(ip->ip_src))) {
+		mrtstat.mrts_no_route++;
+		return ((int)tunnel_src);
+	}
+
+	/*
+	 * Don't forward if it didn't arrive from the parent vif for its origin.
+	 */
+	vifi = rt->mrt_parent;
+	if (tunnel_src == 0 ) {
+		if ((viftable[vifi].v_flags & VIFF_TUNNEL) ||
+		    viftable[vifi].v_ifp != ifp )
+			return ((int)tunnel_src);
+	} else {
+		if (!(viftable[vifi].v_flags & VIFF_TUNNEL) ||
+		    viftable[vifi].v_rmt_addr.s_addr != tunnel_src )
+			return ((int)tunnel_src);
+	}
+
+	/*
+	 * For each vif, decide if a copy of the packet should be forwarded.
+	 * Forward if:
+	 *		- the ttl exceeds the vif's threshold AND
+	 *		- the vif is a child in the origin's route AND
+	 *		- ( the vif is not a leaf in the origin's route OR
+	 *		    the destination group has members on the vif )
+	 *
+	 * (This might be speeded up with some sort of cache -- someday.)
+	 */
+	for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++) {
+		if (ip->ip_ttl > vifp->v_threshold &&
+		    VIFM_ISSET(vifi, rt->mrt_children) &&
+		    (!VIFM_ISSET(vifi, rt->mrt_leaves) ||
+		     grplst_member(vifp, ip->ip_dst))) {
+			if (vifp->v_flags & VIFF_TUNNEL)
+				tunnel_send(m, vifp);
+			else
+				phyint_send(m, vifp);
+		}
+	}
+
+	return ((int)tunnel_src);
+}
+
+/*
+ * Send a copy of the packet out through the interface of a
+ * non-tunnel vif.  The original mbuf chain is left alone; if the
+ * copy cannot be made nothing is sent.
+ */
+static void
+phyint_send(m, vifp)
+	register struct mbuf *m;
+	register struct vif *vifp;
+{
+	register struct ip *ip = mtod(m, struct ip *);
+	register struct mbuf *mb_copy;
+	register struct ip_moptions *imo;
+	register int error;
+	struct ip_moptions simo;
+
+	mb_copy = m_copy(m, 0, M_COPYALL);
+	if (mb_copy == NULL)
+		return;
+
+	/* Only these three fields of the on-stack options are filled in. */
+	imo = &simo;
+	imo->imo_multicast_ifp = vifp->v_ifp;
+	imo->imo_multicast_ttl = ip->ip_ttl - 1;
+	imo->imo_multicast_loop = 1;
+
+	/* The result is stored but not examined. */
+	error = ip_output(mb_copy, NULL, NULL, IP_FORWARDING, imo);
+}
+
+static void
+tunnel_send(m, vifp)
+	register struct mbuf *m;
+	register struct vif *vifp;
+{
+	register struct ip *ip = mtod(m, struct ip *);
+	register struct mbuf *mb_copy, *mb_opts;
+	register struct ip *ip_copy;
+	register int error;
+	register u_char *cp;
+
+	/*
+	 * Make sure that adding the tunnel options won't exceed the
+	 * maximum allowed number of option bytes.
+ */ + if (ip->ip_hl > (60 - TUNNEL_LEN) >> 2) { + mrtstat.mrts_cant_tunnel++; + return; + } + + /* + * Get a private copy of the IP header so that changes to some + * of the IP fields don't damage the original header, which is + * examined later in ip_input.c. + */ + mb_copy = m_copy(m, IP_HDR_LEN, M_COPYALL); + if (mb_copy == NULL) + return; + MGETHDR(mb_opts, M_DONTWAIT, MT_HEADER); + if (mb_opts == NULL) { + m_freem(mb_copy); + return; + } + /* + * Make mb_opts be the new head of the packet chain. + * Any options of the packet were left in the old packet chain head + */ + mb_opts->m_next = mb_copy; + mb_opts->m_len = IP_HDR_LEN + TUNNEL_LEN; + mb_opts->m_data += MSIZE - mb_opts->m_len; + + ip_copy = mtod(mb_opts, struct ip *); + /* + * Copy the base ip header to the new head mbuf. + */ + *ip_copy = *ip; + ip_copy->ip_ttl--; + ip_copy->ip_dst = vifp->v_rmt_addr; /* remote tunnel end-point */ + /* + * Adjust the ip header length to account for the tunnel options. + */ + ip_copy->ip_hl += TUNNEL_LEN >> 2; + ip_copy->ip_len += TUNNEL_LEN; + /* + * Add the NOP and LSRR after the base ip header + */ + cp = (u_char *)(ip_copy + 1); + *cp++ = IPOPT_NOP; + *cp++ = IPOPT_LSRR; + *cp++ = 11; /* LSRR option length */ + *cp++ = 8; /* LSSR pointer to second element */ + *(u_long*)cp = vifp->v_lcl_addr.s_addr; /* local tunnel end-point */ + cp += 4; + *(u_long*)cp = ip->ip_dst.s_addr; /* destination group */ + + error = ip_output(mb_opts, NULL, NULL, IP_FORWARDING, NULL); +} +#endif diff --git a/sys/netinet/ip_mroute.h b/sys/netinet/ip_mroute.h new file mode 100644 index 00000000000..adb40be9552 --- /dev/null +++ b/sys/netinet/ip_mroute.h @@ -0,0 +1,173 @@ +/* + * Copyright (c) 1989 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_mroute.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Definitions for the kernel part of DVMRP, + * a Distance-Vector Multicast Routing Protocol. + * (See RFC-1075.) + * + * Written by David Waitzman, BBN Labs, August 1988. 
+ * Modified by Steve Deering, Stanford, February 1989. + * + * MROUTING 1.0 + */ + + +/* + * DVMRP-specific setsockopt commands. + */ +#define DVMRP_INIT 100 +#define DVMRP_DONE 101 +#define DVMRP_ADD_VIF 102 +#define DVMRP_DEL_VIF 103 +#define DVMRP_ADD_LGRP 104 +#define DVMRP_DEL_LGRP 105 +#define DVMRP_ADD_MRT 106 +#define DVMRP_DEL_MRT 107 + + +/* + * Types and macros for handling bitmaps with one bit per virtual interface. + */ +#define MAXVIFS 32 +typedef u_long vifbitmap_t; +typedef u_short vifi_t; /* type of a vif index */ + +#define VIFM_SET(n, m) ((m) |= (1 << (n))) +#define VIFM_CLR(n, m) ((m) &= ~(1 << (n))) +#define VIFM_ISSET(n, m) ((m) & (1 << (n))) +#define VIFM_CLRALL(m) ((m) = 0x00000000) +#define VIFM_COPY(mfrom, mto) ((mto) = (mfrom)) +#define VIFM_SAME(m1, m2) ((m1) == (m2)) + + +/* + * Agument structure for DVMRP_ADD_VIF. + * (DVMRP_DEL_VIF takes a single vifi_t argument.) + */ +struct vifctl { + vifi_t vifc_vifi; /* the index of the vif to be added */ + u_char vifc_flags; /* VIFF_ flags defined below */ + u_char vifc_threshold; /* min ttl required to forward on vif */ + struct in_addr vifc_lcl_addr; /* local interface address */ + struct in_addr vifc_rmt_addr; /* remote address (tunnels only) */ +}; + +#define VIFF_TUNNEL 0x1 /* vif represents a tunnel end-point */ + + +/* + * Argument structure for DVMRP_ADD_LGRP and DVMRP_DEL_LGRP. + */ +struct lgrplctl { + vifi_t lgc_vifi; + struct in_addr lgc_gaddr; +}; + + +/* + * Argument structure for DVMRP_ADD_MRT. + * (DVMRP_DEL_MRT takes a single struct in_addr argument, containing origin.) + */ +struct mrtctl { + struct in_addr mrtc_origin; /* subnet origin of multicasts */ + struct in_addr mrtc_originmask; /* subnet mask for origin */ + vifi_t mrtc_parent; /* incoming vif */ + vifbitmap_t mrtc_children; /* outgoing children vifs */ + vifbitmap_t mrtc_leaves; /* subset of outgoing children vifs */ +}; + + +#ifdef KERNEL + +/* + * The kernel's virtual-interface structure. 
 */
struct vif {
	u_char	v_flags;		/* VIFF_ flags defined above           */
	u_char	v_threshold;		/* min ttl required to forward on vif  */
	struct	in_addr v_lcl_addr;	/* local interface address             */
	struct	in_addr v_rmt_addr;	/* remote address (tunnels only)       */
	struct	ifnet  *v_ifp;		/* pointer to interface                */
	struct	in_addr *v_lcl_grps;	/* list of local grps (phyints only)   */
	int	v_lcl_grps_max;		/* malloc'ed number of v_lcl_grps      */
	int	v_lcl_grps_n;		/* used number of v_lcl_grps           */
	u_long	v_cached_group;		/* last grp looked-up (phyints only)   */
	int	v_cached_result;	/* last look-up result (phyints only)  */
};

/*
 * The kernel's multicast route structure.
 * Entries that hash to the same value are chained through mrt_next.
 */
struct mrt {
	struct	in_addr mrt_origin;	/* subnet origin of multicasts      */
	struct	in_addr mrt_originmask;	/* subnet mask for origin           */
	vifi_t	mrt_parent;    		/* incoming vif                     */
	vifbitmap_t mrt_children;	/* outgoing children vifs           */
	vifbitmap_t mrt_leaves;		/* subset of outgoing children vifs */
	struct	mrt *mrt_next;		/* forward link                     */
};


/*
 * MRTHASHMOD() reduces a hash value to the range [0, MRTHASHSIZ).
 * When MRTHASHSIZ is a power of two a mask is used instead of a division.
 */
#define	MRTHASHSIZ	64
#if (MRTHASHSIZ & (MRTHASHSIZ - 1)) == 0	  /* from sys:route.h */
#define	MRTHASHMOD(h)	((h) & (MRTHASHSIZ - 1))
#else
#define	MRTHASHMOD(h)	((h) % MRTHASHSIZ)
#endif

/*
 * The kernel's multicast routing statistics.
 */
struct mrtstat {
	u_long	mrts_mrt_lookups;	/* # multicast route lookups       */
	u_long	mrts_mrt_misses;	/* # multicast route cache misses  */
	u_long	mrts_grp_lookups;	/* # group address lookups         */
	u_long	mrts_grp_misses;	/* # group address cache misses    */
	u_long	mrts_no_route;		/* no route for packet's origin    */
	u_long	mrts_bad_tunnel;	/* malformed tunnel options        */
	u_long	mrts_cant_tunnel;	/* no room for tunnel options      */
};


int	ip_mrouter_cmd __P((int, struct socket *, struct mbuf *));
int	ip_mrouter_done __P((void));

#endif /* KERNEL */

diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c
new file mode 100644
index 00000000000..4c22a5e53ec
--- /dev/null
+++ b/sys/netinet/ip_output.c
@@ -0,0 +1,1064 @@
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef vax +#include +#endif + +static struct mbuf *ip_insertoptions __P((struct mbuf *, struct mbuf *, int *)); +static void ip_mloopback + __P((struct ifnet *, struct mbuf *, struct sockaddr_in *)); + +/* + * IP output. The packet in mbuf chain m contains a skeletal IP + * header (with len, off, ttl, proto, tos, src, dst). + * The mbuf chain containing the packet will be freed. + * The mbuf opt, if present, will not be freed. 
+ */ +int +ip_output(m0, opt, ro, flags, imo) + struct mbuf *m0; + struct mbuf *opt; + struct route *ro; + int flags; + struct ip_moptions *imo; +{ + register struct ip *ip, *mhip; + register struct ifnet *ifp; + register struct mbuf *m = m0; + register int hlen = sizeof (struct ip); + int len, off, error = 0; + struct route iproute; + struct sockaddr_in *dst; + struct in_ifaddr *ia; + +#ifdef DIAGNOSTIC + if ((m->m_flags & M_PKTHDR) == 0) + panic("ip_output no HDR"); +#endif + if (opt) { + m = ip_insertoptions(m, opt, &len); + hlen = len; + } + ip = mtod(m, struct ip *); + /* + * Fill in IP header. + */ + if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { + ip->ip_v = IPVERSION; + ip->ip_off &= IP_DF; + ip->ip_id = htons(ip_id++); + ip->ip_hl = hlen >> 2; + ipstat.ips_localout++; + } else { + hlen = ip->ip_hl << 2; + } + /* + * Route packet. + */ + if (ro == 0) { + ro = &iproute; + bzero((caddr_t)ro, sizeof (*ro)); + } + dst = (struct sockaddr_in *)&ro->ro_dst; + /* + * If there is a cached route, + * check that it is to the same destination + * and is still up. If not, free it and try again. + */ + if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || + dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { + RTFREE(ro->ro_rt); + ro->ro_rt = (struct rtentry *)0; + } + if (ro->ro_rt == 0) { + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = ip->ip_dst; + } + /* + * If routing to interface only, + * short circuit routing lookup. 
+ */ +#define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) +#define sintosa(sin) ((struct sockaddr *)(sin)) + if (flags & IP_ROUTETOIF) { + if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0 && + (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) { + ipstat.ips_noroute++; + error = ENETUNREACH; + goto bad; + } + ifp = ia->ia_ifp; + ip->ip_ttl = 1; + } else { + if (ro->ro_rt == 0) + rtalloc(ro); + if (ro->ro_rt == 0) { + ipstat.ips_noroute++; + error = EHOSTUNREACH; + goto bad; + } + ia = ifatoia(ro->ro_rt->rt_ifa); + ifp = ro->ro_rt->rt_ifp; + ro->ro_rt->rt_use++; + if (ro->ro_rt->rt_flags & RTF_GATEWAY) + dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; + } + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + struct in_multi *inm; + extern struct ifnet loif; + + m->m_flags |= M_MCAST; + /* + * IP destination address is multicast. Make sure "dst" + * still points to the address in "ro". (It may have been + * changed to point to a gateway address, above.) + */ + dst = (struct sockaddr_in *)&ro->ro_dst; + /* + * See if the caller provided any multicast options + */ + if (imo != NULL) { + ip->ip_ttl = imo->imo_multicast_ttl; + if (imo->imo_multicast_ifp != NULL) + ifp = imo->imo_multicast_ifp; + } else + ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; + /* + * Confirm that the outgoing interface supports multicast. + */ + if ((ifp->if_flags & IFF_MULTICAST) == 0) { + ipstat.ips_noroute++; + error = ENETUNREACH; + goto bad; + } + /* + * If source address not specified yet, use address + * of outgoing interface. + */ + if (ip->ip_src.s_addr == INADDR_ANY) { + register struct in_ifaddr *ia; + + for (ia = in_ifaddr; ia; ia = ia->ia_next) + if (ia->ia_ifp == ifp) { + ip->ip_src = IA_SIN(ia)->sin_addr; + break; + } + } + + IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); + if (inm != NULL && + (imo == NULL || imo->imo_multicast_loop)) { + /* + * If we belong to the destination multicast group + * on the outgoing interface, and the caller did not + * forbid loopback, loop back a copy. 
+ */ + ip_mloopback(ifp, m, dst); + } +#ifdef MROUTING + else { + /* + * If we are acting as a multicast router, perform + * multicast forwarding as if the packet had just + * arrived on the interface to which we are about + * to send. The multicast forwarding function + * recursively calls this function, using the + * IP_FORWARDING flag to prevent infinite recursion. + * + * Multicasts that are looped back by ip_mloopback(), + * above, will be forwarded by the ip_input() routine, + * if necessary. + */ + extern struct socket *ip_mrouter; + if (ip_mrouter && (flags & IP_FORWARDING) == 0) { + if (ip_mforward(m, ifp) != 0) { + m_freem(m); + goto done; + } + } + } +#endif + /* + * Multicasts with a time-to-live of zero may be looped- + * back, above, but must not be transmitted on a network. + * Also, multicasts addressed to the loopback interface + * are not sent -- the above call to ip_mloopback() will + * loop back a copy if this host actually belongs to the + * destination group on the loopback interface. + */ + if (ip->ip_ttl == 0 || ifp == &loif) { + m_freem(m); + goto done; + } + + goto sendit; + } +#ifndef notdef + /* + * If source address not specified yet, use address + * of outgoing interface. + */ + if (ip->ip_src.s_addr == INADDR_ANY) + ip->ip_src = IA_SIN(ia)->sin_addr; +#endif + /* + * Look for broadcast address and + * and verify user is allowed to send + * such a packet. + */ + if (in_broadcast(dst->sin_addr, ifp)) { + if ((ifp->if_flags & IFF_BROADCAST) == 0) { + error = EADDRNOTAVAIL; + goto bad; + } + if ((flags & IP_ALLOWBROADCAST) == 0) { + error = EACCES; + goto bad; + } + /* don't allow broadcast messages to be fragmented */ + if ((u_short)ip->ip_len > ifp->if_mtu) { + error = EMSGSIZE; + goto bad; + } + m->m_flags |= M_BCAST; + } else + m->m_flags &= ~M_BCAST; + +sendit: + /* + * If small enough for interface, can just send directly. 
+ */ + if ((u_short)ip->ip_len <= ifp->if_mtu) { + ip->ip_len = htons((u_short)ip->ip_len); + ip->ip_off = htons((u_short)ip->ip_off); + ip->ip_sum = 0; + ip->ip_sum = in_cksum(m, hlen); + error = (*ifp->if_output)(ifp, m, + (struct sockaddr *)dst, ro->ro_rt); + goto done; + } + /* + * Too large for interface; fragment if possible. + * Must be able to put at least 8 bytes per fragment. + */ + if (ip->ip_off & IP_DF) { + error = EMSGSIZE; + ipstat.ips_cantfrag++; + goto bad; + } + len = (ifp->if_mtu - hlen) &~ 7; + if (len < 8) { + error = EMSGSIZE; + goto bad; + } + + { + int mhlen, firstlen = len; + struct mbuf **mnext = &m->m_nextpkt; + + /* + * Loop through length of segment after first fragment, + * make new header and copy data of each part and link onto chain. + */ + m0 = m; + mhlen = sizeof (struct ip); + for (off = hlen + len; off < (u_short)ip->ip_len; off += len) { + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == 0) { + error = ENOBUFS; + ipstat.ips_odropped++; + goto sendorfree; + } + m->m_data += max_linkhdr; + mhip = mtod(m, struct ip *); + *mhip = *ip; + if (hlen > sizeof (struct ip)) { + mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); + mhip->ip_hl = mhlen >> 2; + } + m->m_len = mhlen; + mhip->ip_off = ((off - hlen) >> 3) + (ip->ip_off & ~IP_MF); + if (ip->ip_off & IP_MF) + mhip->ip_off |= IP_MF; + if (off + len >= (u_short)ip->ip_len) + len = (u_short)ip->ip_len - off; + else + mhip->ip_off |= IP_MF; + mhip->ip_len = htons((u_short)(len + mhlen)); + m->m_next = m_copy(m0, off, len); + if (m->m_next == 0) { + (void) m_free(m); + error = ENOBUFS; /* ??? 
*/ + ipstat.ips_odropped++; + goto sendorfree; + } + m->m_pkthdr.len = mhlen + len; + m->m_pkthdr.rcvif = (struct ifnet *)0; + mhip->ip_off = htons((u_short)mhip->ip_off); + mhip->ip_sum = 0; + mhip->ip_sum = in_cksum(m, mhlen); + *mnext = m; + mnext = &m->m_nextpkt; + ipstat.ips_ofragments++; + } + /* + * Update first fragment by trimming what's been copied out + * and updating header, then send each fragment (in order). + */ + m = m0; + m_adj(m, hlen + firstlen - (u_short)ip->ip_len); + m->m_pkthdr.len = hlen + firstlen; + ip->ip_len = htons((u_short)m->m_pkthdr.len); + ip->ip_off = htons((u_short)(ip->ip_off | IP_MF)); + ip->ip_sum = 0; + ip->ip_sum = in_cksum(m, hlen); +sendorfree: + for (m = m0; m; m = m0) { + m0 = m->m_nextpkt; + m->m_nextpkt = 0; + if (error == 0) + error = (*ifp->if_output)(ifp, m, + (struct sockaddr *)dst, ro->ro_rt); + else + m_freem(m); + } + + if (error == 0) + ipstat.ips_fragmented++; + } +done: + if (ro == &iproute && (flags & IP_ROUTETOIF) == 0 && ro->ro_rt) + RTFREE(ro->ro_rt); + return (error); +bad: + m_freem(m0); + goto done; +} + +/* + * Insert IP options into preformed packet. + * Adjust IP destination as required for IP source routing, + * as indicated by a non-zero in_addr at the start of the options. 
+ */ +static struct mbuf * +ip_insertoptions(m, opt, phlen) + register struct mbuf *m; + struct mbuf *opt; + int *phlen; +{ + register struct ipoption *p = mtod(opt, struct ipoption *); + struct mbuf *n; + register struct ip *ip = mtod(m, struct ip *); + unsigned optlen; + + optlen = opt->m_len - sizeof(p->ipopt_dst); + if (optlen + (u_short)ip->ip_len > IP_MAXPACKET) + return (m); /* XXX should fail */ + if (p->ipopt_dst.s_addr) + ip->ip_dst = p->ipopt_dst; + if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { + MGETHDR(n, M_DONTWAIT, MT_HEADER); + if (n == 0) + return (m); + n->m_pkthdr.len = m->m_pkthdr.len + optlen; + m->m_len -= sizeof(struct ip); + m->m_data += sizeof(struct ip); + n->m_next = m; + m = n; + m->m_len = optlen + sizeof(struct ip); + m->m_data += max_linkhdr; + bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); + } else { + m->m_data -= optlen; + m->m_len += optlen; + m->m_pkthdr.len += optlen; + ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); + } + ip = mtod(m, struct ip *); + bcopy((caddr_t)p->ipopt_list, (caddr_t)(ip + 1), (unsigned)optlen); + *phlen = sizeof(struct ip) + optlen; + ip->ip_len += optlen; + return (m); +} + +/* + * Copy options from ip to jp, + * omitting those not copied during fragmentation. + */ +int +ip_optcopy(ip, jp) + struct ip *ip, *jp; +{ + register u_char *cp, *dp; + int opt, optlen, cnt; + + cp = (u_char *)(ip + 1); + dp = (u_char *)(jp + 1); + cnt = (ip->ip_hl << 2) - sizeof (struct ip); + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[0]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) { + /* Preserve for IP mcast tunnel's LSRR alignment. 
*/ + *dp++ = IPOPT_NOP; + optlen = 1; + continue; + } else + optlen = cp[IPOPT_OLEN]; + /* bogus lengths should have been caught by ip_dooptions */ + if (optlen > cnt) + optlen = cnt; + if (IPOPT_COPIED(opt)) { + bcopy((caddr_t)cp, (caddr_t)dp, (unsigned)optlen); + dp += optlen; + } + } + for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) + *dp++ = IPOPT_EOL; + return (optlen); +} + +/* + * IP socket option processing. + */ +int +ip_ctloutput(op, so, level, optname, mp) + int op; + struct socket *so; + int level, optname; + struct mbuf **mp; +{ + register struct inpcb *inp = sotoinpcb(so); + register struct mbuf *m = *mp; + register int optval; + int error = 0; + + if (level != IPPROTO_IP) { + error = EINVAL; + if (op == PRCO_SETOPT && *mp) + (void) m_free(*mp); + } else switch (op) { + + case PRCO_SETOPT: + switch (optname) { + case IP_OPTIONS: +#ifdef notyet + case IP_RETOPTS: + return (ip_pcbopts(optname, &inp->inp_options, m)); +#else + return (ip_pcbopts(&inp->inp_options, m)); +#endif + + case IP_TOS: + case IP_TTL: + case IP_RECVOPTS: + case IP_RECVRETOPTS: + case IP_RECVDSTADDR: + if (m->m_len != sizeof(int)) + error = EINVAL; + else { + optval = *mtod(m, int *); + switch (optname) { + + case IP_TOS: + inp->inp_ip.ip_tos = optval; + break; + + case IP_TTL: + inp->inp_ip.ip_ttl = optval; + break; +#define OPTSET(bit) \ + if (optval) \ + inp->inp_flags |= bit; \ + else \ + inp->inp_flags &= ~bit; + + case IP_RECVOPTS: + OPTSET(INP_RECVOPTS); + break; + + case IP_RECVRETOPTS: + OPTSET(INP_RECVRETOPTS); + break; + + case IP_RECVDSTADDR: + OPTSET(INP_RECVDSTADDR); + break; + } + } + break; +#undef OPTSET + + case IP_MULTICAST_IF: + case IP_MULTICAST_TTL: + case IP_MULTICAST_LOOP: + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + error = ip_setmoptions(optname, &inp->inp_moptions, m); + break; + + default: + error = ENOPROTOOPT; + break; + } + if (m) + (void)m_free(m); + break; + + case PRCO_GETOPT: + switch (optname) { + case IP_OPTIONS: + case 
IP_RETOPTS: + *mp = m = m_get(M_WAIT, MT_SOOPTS); + if (inp->inp_options) { + m->m_len = inp->inp_options->m_len; + bcopy(mtod(inp->inp_options, caddr_t), + mtod(m, caddr_t), (unsigned)m->m_len); + } else + m->m_len = 0; + break; + + case IP_TOS: + case IP_TTL: + case IP_RECVOPTS: + case IP_RECVRETOPTS: + case IP_RECVDSTADDR: + *mp = m = m_get(M_WAIT, MT_SOOPTS); + m->m_len = sizeof(int); + switch (optname) { + + case IP_TOS: + optval = inp->inp_ip.ip_tos; + break; + + case IP_TTL: + optval = inp->inp_ip.ip_ttl; + break; + +#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) + + case IP_RECVOPTS: + optval = OPTBIT(INP_RECVOPTS); + break; + + case IP_RECVRETOPTS: + optval = OPTBIT(INP_RECVRETOPTS); + break; + + case IP_RECVDSTADDR: + optval = OPTBIT(INP_RECVDSTADDR); + break; + } + *mtod(m, int *) = optval; + break; + + case IP_MULTICAST_IF: + case IP_MULTICAST_TTL: + case IP_MULTICAST_LOOP: + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + error = ip_getmoptions(optname, inp->inp_moptions, mp); + break; + + default: + error = ENOPROTOOPT; + break; + } + break; + } + return (error); +} + +/* + * Set up IP options in pcb for insertion in output packets. + * Store in mbuf with pointer in pcbopt, adding pseudo-option + * with destination address if source routed. + */ +int +#ifdef notyet +ip_pcbopts(optname, pcbopt, m) + int optname; +#else +ip_pcbopts(pcbopt, m) +#endif + struct mbuf **pcbopt; + register struct mbuf *m; +{ + register cnt, optlen; + register u_char *cp; + u_char opt; + + /* turn off any old options */ + if (*pcbopt) + (void)m_free(*pcbopt); + *pcbopt = 0; + if (m == (struct mbuf *)0 || m->m_len == 0) { + /* + * Only turning off any previous options. + */ + if (m) + (void)m_free(m); + return (0); + } + +#ifndef vax + if (m->m_len % sizeof(long)) + goto bad; +#endif + /* + * IP first-hop destination address will be stored before + * actual options; move other options back + * and clear it when none present. 
+ */ + if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) + goto bad; + cnt = m->m_len; + m->m_len += sizeof(struct in_addr); + cp = mtod(m, u_char *) + sizeof(struct in_addr); + ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt); + bzero(mtod(m, caddr_t), sizeof(struct in_addr)); + + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + optlen = cp[IPOPT_OLEN]; + if (optlen <= IPOPT_OLEN || optlen > cnt) + goto bad; + } + switch (opt) { + + default: + break; + + case IPOPT_LSRR: + case IPOPT_SSRR: + /* + * user process specifies route as: + * ->A->B->C->D + * D must be our final destination (but we can't + * check that since we may not have connected yet). + * A is first hop destination, which doesn't appear in + * actual IP option, but is stored before the options. + */ + if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) + goto bad; + m->m_len -= sizeof(struct in_addr); + cnt -= sizeof(struct in_addr); + optlen -= sizeof(struct in_addr); + cp[IPOPT_OLEN] = optlen; + /* + * Move first hop before start of options. + */ + bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t), + sizeof(struct in_addr)); + /* + * Then copy rest of options back + * to close up the deleted entry. + */ + ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] + + sizeof(struct in_addr)), + (caddr_t)&cp[IPOPT_OFFSET+1], + (unsigned)cnt + sizeof(struct in_addr)); + break; + } + } + if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) + goto bad; + *pcbopt = m; + return (0); + +bad: + (void)m_free(m); + return (EINVAL); +} + +/* + * Set the IP multicast options in response to user setsockopt(). 
+ */ +int +ip_setmoptions(optname, imop, m) + int optname; + struct ip_moptions **imop; + struct mbuf *m; +{ + register int error = 0; + u_char loop; + register int i; + struct in_addr addr; + register struct ip_mreq *mreq; + register struct ifnet *ifp; + register struct ip_moptions *imo = *imop; + struct route ro; + register struct sockaddr_in *dst; + + if (imo == NULL) { + /* + * No multicast option buffer attached to the pcb; + * allocate one and initialize to default values. + */ + imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, + M_WAITOK); + + if (imo == NULL) + return (ENOBUFS); + *imop = imo; + imo->imo_multicast_ifp = NULL; + imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; + imo->imo_num_memberships = 0; + } + + switch (optname) { + + case IP_MULTICAST_IF: + /* + * Select the interface for outgoing multicast packets. + */ + if (m == NULL || m->m_len != sizeof(struct in_addr)) { + error = EINVAL; + break; + } + addr = *(mtod(m, struct in_addr *)); + /* + * INADDR_ANY is used to remove a previous selection. + * When no interface is selected, a default one is + * chosen every time a multicast packet is sent. + */ + if (addr.s_addr == INADDR_ANY) { + imo->imo_multicast_ifp = NULL; + break; + } + /* + * The selected interface is identified by its local + * IP address. Find the interface and confirm that + * it supports multicasting. + */ + INADDR_TO_IFP(addr, ifp); + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { + error = EADDRNOTAVAIL; + break; + } + imo->imo_multicast_ifp = ifp; + break; + + case IP_MULTICAST_TTL: + /* + * Set the IP time-to-live for outgoing multicast packets. + */ + if (m == NULL || m->m_len != 1) { + error = EINVAL; + break; + } + imo->imo_multicast_ttl = *(mtod(m, u_char *)); + break; + + case IP_MULTICAST_LOOP: + /* + * Set the loopback flag for outgoing multicast packets. + * Must be zero or one. 
+ */ + if (m == NULL || m->m_len != 1 || + (loop = *(mtod(m, u_char *))) > 1) { + error = EINVAL; + break; + } + imo->imo_multicast_loop = loop; + break; + + case IP_ADD_MEMBERSHIP: + /* + * Add a multicast group membership. + * Group must be a valid IP multicast address. + */ + if (m == NULL || m->m_len != sizeof(struct ip_mreq)) { + error = EINVAL; + break; + } + mreq = mtod(m, struct ip_mreq *); + if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) { + error = EINVAL; + break; + } + /* + * If no interface address was provided, use the interface of + * the route to the given multicast address. + */ + if (mreq->imr_interface.s_addr == INADDR_ANY) { + ro.ro_rt = NULL; + dst = (struct sockaddr_in *)&ro.ro_dst; + dst->sin_len = sizeof(*dst); + dst->sin_family = AF_INET; + dst->sin_addr = mreq->imr_multiaddr; + rtalloc(&ro); + if (ro.ro_rt == NULL) { + error = EADDRNOTAVAIL; + break; + } + ifp = ro.ro_rt->rt_ifp; + rtfree(ro.ro_rt); + } + else { + INADDR_TO_IFP(mreq->imr_interface, ifp); + } + /* + * See if we found an interface, and confirm that it + * supports multicast. + */ + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { + error = EADDRNOTAVAIL; + break; + } + /* + * See if the membership already exists or if all the + * membership slots are full. + */ + for (i = 0; i < imo->imo_num_memberships; ++i) { + if (imo->imo_membership[i]->inm_ifp == ifp && + imo->imo_membership[i]->inm_addr.s_addr + == mreq->imr_multiaddr.s_addr) + break; + } + if (i < imo->imo_num_memberships) { + error = EADDRINUSE; + break; + } + if (i == IP_MAX_MEMBERSHIPS) { + error = ETOOMANYREFS; + break; + } + /* + * Everything looks good; add a new record to the multicast + * address list for the given interface. + */ + if ((imo->imo_membership[i] = + in_addmulti(&mreq->imr_multiaddr, ifp)) == NULL) { + error = ENOBUFS; + break; + } + ++imo->imo_num_memberships; + break; + + case IP_DROP_MEMBERSHIP: + /* + * Drop a multicast group membership. 
+ * Group must be a valid IP multicast address. + */ + if (m == NULL || m->m_len != sizeof(struct ip_mreq)) { + error = EINVAL; + break; + } + mreq = mtod(m, struct ip_mreq *); + if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) { + error = EINVAL; + break; + } + /* + * If an interface address was specified, get a pointer + * to its ifnet structure. + */ + if (mreq->imr_interface.s_addr == INADDR_ANY) + ifp = NULL; + else { + INADDR_TO_IFP(mreq->imr_interface, ifp); + if (ifp == NULL) { + error = EADDRNOTAVAIL; + break; + } + } + /* + * Find the membership in the membership array. + */ + for (i = 0; i < imo->imo_num_memberships; ++i) { + if ((ifp == NULL || + imo->imo_membership[i]->inm_ifp == ifp) && + imo->imo_membership[i]->inm_addr.s_addr == + mreq->imr_multiaddr.s_addr) + break; + } + if (i == imo->imo_num_memberships) { + error = EADDRNOTAVAIL; + break; + } + /* + * Give up the multicast address record to which the + * membership points. + */ + in_delmulti(imo->imo_membership[i]); + /* + * Remove the gap in the membership array. + */ + for (++i; i < imo->imo_num_memberships; ++i) + imo->imo_membership[i-1] = imo->imo_membership[i]; + --imo->imo_num_memberships; + break; + + default: + error = EOPNOTSUPP; + break; + } + + /* + * If all options have default values, no need to keep the mbuf. + */ + if (imo->imo_multicast_ifp == NULL && + imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && + imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && + imo->imo_num_memberships == 0) { + free(*imop, M_IPMOPTS); + *imop = NULL; + } + + return (error); +} + +/* + * Return the IP multicast options in response to user getsockopt(). 
+ */
+/*
+ * A fresh MT_SOOPTS mbuf is always allocated and stored in *mp, then
+ * filled in according to optname; imo may be NULL, in which case the
+ * default values are reported.  Returns 0, or EOPNOTSUPP for an option
+ * that is not handled here (the mbuf stays in *mp in that case --
+ * presumably released by the caller; verify).
+ */
+int
+ip_getmoptions(optname, imo, mp)
+	int optname;
+	register struct ip_moptions *imo;
+	register struct mbuf **mp;
+{
+	u_char *ttl;
+	u_char *loop;
+	struct in_addr *addr;
+	struct in_ifaddr *ia;
+
+	*mp = m_get(M_WAIT, MT_SOOPTS);
+
+	switch (optname) {
+
+	case IP_MULTICAST_IF:
+		/* Report the address found for the selected interface. */
+		addr = mtod(*mp, struct in_addr *);
+		(*mp)->m_len = sizeof(struct in_addr);
+		if (imo == NULL || imo->imo_multicast_ifp == NULL)
+			addr->s_addr = INADDR_ANY;
+		else {
+			IFP_TO_IA(imo->imo_multicast_ifp, ia);
+			addr->s_addr = (ia == NULL) ? INADDR_ANY
+					: IA_SIN(ia)->sin_addr.s_addr;
+		}
+		return (0);
+
+	case IP_MULTICAST_TTL:
+		ttl = mtod(*mp, u_char *);
+		(*mp)->m_len = 1;
+		*ttl = (imo == NULL) ? IP_DEFAULT_MULTICAST_TTL
+				     : imo->imo_multicast_ttl;
+		return (0);
+
+	case IP_MULTICAST_LOOP:
+		loop = mtod(*mp, u_char *);
+		(*mp)->m_len = 1;
+		*loop = (imo == NULL) ? IP_DEFAULT_MULTICAST_LOOP
+				      : imo->imo_multicast_loop;
+		return (0);
+
+	default:
+		return (EOPNOTSUPP);
+	}
+}
+
+/*
+ * Discard the IP multicast options.
+ * Every recorded group membership is given up with in_delmulti()
+ * before the structure itself is freed; a NULL imo is a no-op.
+ */
+void
+ip_freemoptions(imo)
+	register struct ip_moptions *imo;
+{
+	register int i;
+
+	if (imo != NULL) {
+		for (i = 0; i < imo->imo_num_memberships; ++i)
+			in_delmulti(imo->imo_membership[i]);
+		free(imo, M_IPMOPTS);
+	}
+}
+
+/*
+ * Routine called from ip_output() to loop back a copy of an IP multicast
+ * packet to the input queue of a specified interface.  Note that this
+ * calls the output routine of the loopback "driver", but with an interface
+ * pointer that might NOT be &loif -- easier than replicating that code here.
+ */
+static void
+ip_mloopback(ifp, m, dst)
+	struct ifnet *ifp;
+	register struct mbuf *m;
+	register struct sockaddr_in *dst;
+{
+	register struct ip *ip;
+	struct mbuf *copym;
+
+	/* Work on a copy; if the copy fails the loopback is silently skipped. */
+	copym = m_copy(m, 0, M_COPYALL);
+	if (copym != NULL) {
+		/*
+		 * We don't bother to fragment if the IP length is greater
+		 * than the interface's MTU.  Can this possibly matter?
+		 */
+		ip = mtod(copym, struct ip *);
+		/* Put length and offset through htons(), then redo the checksum. */
+		ip->ip_len = htons((u_short)ip->ip_len);
+		ip->ip_off = htons((u_short)ip->ip_off);
+		ip->ip_sum = 0;
+		ip->ip_sum = in_cksum(copym, ip->ip_hl << 2);
+		(void) looutput(ifp, copym, (struct sockaddr *)dst, NULL);
+	}
+}
diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h
new file mode 100644
index 00000000000..27eda5e67cd
--- /dev/null
+++ b/sys/netinet/ip_var.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ip_var.h 8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * Overlay for ip header used by other protocols (tcp, udp).
+ */
+struct ipovly {
+	caddr_t	ih_next, ih_prev;	/* for protocol sequence q's */
+	u_char	ih_x1;			/* (unused) */
+	u_char	ih_pr;			/* protocol */
+	short	ih_len;			/* protocol length */
+	struct	in_addr ih_src;		/* source internet address */
+	struct	in_addr ih_dst;		/* destination internet address */
+};
+
+/*
+ * Ip reassembly queue structure.  Each fragment
+ * being reassembled is attached to one of these structures.
+ * They are timed out after ipq_ttl drops to 0, and may also
+ * be reclaimed if memory becomes tight.
+ */
+struct ipq {
+	struct	ipq *next,*prev;	/* to other reass headers */
+	u_char	ipq_ttl;		/* time for reass q to live */
+	u_char	ipq_p;			/* protocol of this fragment */
+	u_short	ipq_id;			/* sequence id for reassembly */
+	struct	ipasfrag *ipq_next,*ipq_prev;
+					/* to ip headers of fragments */
+	struct	in_addr ipq_src,ipq_dst;
+};
+
+/*
+ * Ip header, when holding a fragment.
+ *
+ * Note: ipf_next must be at same offset as ipq_next above
+ */
+struct ipasfrag {
+#if BYTE_ORDER == LITTLE_ENDIAN
+	u_char	ip_hl:4,
+		ip_v:4;
+#endif
+#if BYTE_ORDER == BIG_ENDIAN
+	u_char	ip_v:4,
+		ip_hl:4;
+#endif
+	u_char	ipf_mff;		/* XXX overlays ip_tos: use low bit
+					 * to avoid destroying tos;
+					 * copied from (ip_off&IP_MF) */
+	short	ip_len;
+	u_short	ip_id;
+	short	ip_off;
+	u_char	ip_ttl;
+	u_char	ip_p;
+	u_short	ip_sum;
+	struct	ipasfrag *ipf_next;	/* next fragment */
+	struct	ipasfrag *ipf_prev;	/* previous fragment */
+};
+
+/*
+ * Structure stored in mbuf in inpcb.ip_options
+ * and passed to ip_output when ip options are in use.
+ * The actual length of the options (including ipopt_dst)
+ * is in m_len.
+ */
+#define MAX_IPOPTLEN	40
+
+struct ipoption {
+	struct	in_addr ipopt_dst;	/* first-hop dst if source routed */
+	char	ipopt_list[MAX_IPOPTLEN];	/* options proper */
+};
+
+/*
+ * Structure attached to inpcb.ip_moptions and
+ * passed to ip_output when IP multicast options are in use.
+ */
+struct ip_moptions {
+	struct	ifnet *imo_multicast_ifp; /* ifp for outgoing multicasts */
+	u_char	imo_multicast_ttl;	/* TTL for outgoing multicasts */
+	u_char	imo_multicast_loop;	/* 1 => hear sends if a member */
+	u_short	imo_num_memberships;	/* no. memberships this socket */
+	struct	in_multi *imo_membership[IP_MAX_MEMBERSHIPS];
+};
+
+/*
+ * IP event counters.
+ */
+struct ipstat {
+	u_long	ips_total;		/* total packets received */
+	u_long	ips_badsum;		/* checksum bad */
+	u_long	ips_tooshort;		/* packet too short */
+	u_long	ips_toosmall;		/* not enough data */
+	u_long	ips_badhlen;		/* ip header length < data size */
+	u_long	ips_badlen;		/* ip length < ip header length */
+	u_long	ips_fragments;		/* fragments received */
+	u_long	ips_fragdropped;	/* frags dropped (dups, out of space) */
+	u_long	ips_fragtimeout;	/* fragments timed out */
+	u_long	ips_forward;		/* packets forwarded */
+	u_long	ips_cantforward;	/* packets rcvd for unreachable dest */
+	u_long	ips_redirectsent;	/* packets forwarded on same net */
+	u_long	ips_noproto;		/* unknown or unsupported protocol */
+	u_long	ips_delivered;		/* datagrams delivered to upper level*/
+	u_long	ips_localout;		/* total ip packets generated here */
+	u_long	ips_odropped;		/* lost packets due to nobufs, etc. */
+	u_long	ips_reassembled;	/* total packets reassembled ok */
+	u_long	ips_fragmented;		/* datagrams successfully fragmented */
+	u_long	ips_ofragments;		/* output fragments created */
+	u_long	ips_cantfrag;		/* don't fragment flag was set, etc. */
+	u_long	ips_badoptions;		/* error in option processing */
+	u_long	ips_noroute;		/* packets discarded due to no route */
+	u_long	ips_badvers;		/* ip version != 4 */
+	u_long	ips_rawout;		/* total raw ip packets generated */
+};
+
+#ifdef KERNEL
+/* flags passed to ip_output as last parameter */
+#define	IP_FORWARDING		0x1		/* most of ip header exists */
+#define	IP_RAWOUTPUT		0x2		/* raw ip header exists */
+#define	IP_ROUTETOIF		SO_DONTROUTE	/* bypass routing tables */
+#define	IP_ALLOWBROADCAST	SO_BROADCAST	/* can send broadcast packets */
+
+struct	ipstat ipstat;
+struct	ipq ipq;			/* ip reass. queue */
+u_short	ip_id;				/* ip packet ctr, for ids */
+int	ip_defttl;			/* default IP ttl */
+
+/* Kernel-internal entry points of the IP and raw IP code. */
+int	 in_control __P((struct socket *, int, caddr_t, struct ifnet *));
+int	 ip_ctloutput __P((int, struct socket *, int, int, struct mbuf **));
+void	 ip_deq __P((struct ipasfrag *));
+int	 ip_dooptions __P((struct mbuf *));
+void	 ip_drain __P((void));
+void	 ip_enq __P((struct ipasfrag *, struct ipasfrag *));
+void	 ip_forward __P((struct mbuf *, int));
+void	 ip_freef __P((struct ipq *));
+void	 ip_freemoptions __P((struct ip_moptions *));
+int	 ip_getmoptions __P((int, struct ip_moptions *, struct mbuf **));
+void	 ip_init __P((void));
+int	 ip_mforward __P((struct mbuf *, struct ifnet *));
+int	 ip_optcopy __P((struct ip *, struct ip *));
+int	 ip_output __P((struct mbuf *,
+	    struct mbuf *, struct route *, int, struct ip_moptions *));
+int	 ip_pcbopts __P((struct mbuf **, struct mbuf *));
+struct ip *
+	 ip_reass __P((struct ipasfrag *, struct ipq *));
+struct in_ifaddr *
+	 ip_rtaddr __P((struct in_addr));
+int	 ip_setmoptions __P((int, struct ip_moptions **, struct mbuf *));
+void	 ip_slowtimo __P((void));
+struct mbuf *
+	 ip_srcroute __P((void));
+void	 ip_stripoptions __P((struct mbuf *, struct mbuf *));
+int	 ip_sysctl __P((int *, u_int, void *, size_t *, void *, size_t));
+void	 ipintr __P((void));
+int	 rip_ctloutput __P((int, struct socket *, int, int, struct mbuf **));
+void	 rip_init __P((void));
+void	 rip_input __P((struct mbuf *));
+int	 rip_output __P((struct mbuf *, struct socket *, u_long));
+int	 rip_usrreq __P((struct socket *,
+	    int, struct mbuf *, struct mbuf *, struct mbuf *));
+#endif
diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c
new file mode 100644
index 00000000000..c8092ee9ec7
--- /dev/null
+++ b/sys/netinet/raw_ip.c
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)raw_ip.c 8.2 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +struct inpcb rawinpcb; + +/* + * Nominal space allocated to a raw ip socket. + */ +#define RIPSNDQ 8192 +#define RIPRCVQ 8192 + +/* + * Raw interface to IP protocol. + */ + +/* + * Initialize raw connection block q. + */ +void +rip_init() +{ + + rawinpcb.inp_next = rawinpcb.inp_prev = &rawinpcb; +} + +struct sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET }; +/* + * Setup generic address and protocol structures + * for raw_input routine, then pass them along with + * mbuf chain. + */ +void +rip_input(m) + struct mbuf *m; +{ + register struct ip *ip = mtod(m, struct ip *); + register struct inpcb *inp; + struct socket *last = 0; + + ripsrc.sin_addr = ip->ip_src; + for (inp = rawinpcb.inp_next; inp != &rawinpcb; inp = inp->inp_next) { + if (inp->inp_ip.ip_p && inp->inp_ip.ip_p != ip->ip_p) + continue; + if (inp->inp_laddr.s_addr && + inp->inp_laddr.s_addr == ip->ip_dst.s_addr) + continue; + if (inp->inp_faddr.s_addr && + inp->inp_faddr.s_addr == ip->ip_src.s_addr) + continue; + if (last) { + struct mbuf *n; + if (n = m_copy(m, 0, (int)M_COPYALL)) { + if (sbappendaddr(&last->so_rcv, &ripsrc, + n, (struct mbuf *)0) == 0) + /* should notify about lost packet */ + m_freem(n); + else + sorwakeup(last); + } + } + last = inp->inp_socket; + } + if (last) { + if (sbappendaddr(&last->so_rcv, &ripsrc, + m, (struct mbuf *)0) == 0) + m_freem(m); + else + sorwakeup(last); + } else { + m_freem(m); + ipstat.ips_noproto++; + ipstat.ips_delivered--; + } +} + +/* + * Generate IP header and pass packet to ip_output. + * Tack on options user may have setup with control call. 
+ */
+int
+rip_output(m, so, dst)
+	register struct mbuf *m;
+	struct socket *so;
+	u_long dst;
+{
+	register struct inpcb *inp = sotoinpcb(so);
+	register struct ip *iph;
+	struct mbuf *options;
+	int outflags;
+
+	outflags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST;
+
+	if (inp->inp_flags & INP_HDRINCL) {
+		/*
+		 * The caller supplied a complete IP packet: use it as
+		 * is, assigning an id only when none was given.
+		 */
+		iph = mtod(m, struct ip *);
+		if (iph->ip_id == 0)
+			iph->ip_id = htons(ip_id++);
+		options = NULL;
+		/* XXX prevent ip_output from overwriting header fields */
+		outflags |= IP_RAWOUTPUT;
+		ipstat.ips_rawout++;
+	} else {
+		/*
+		 * Only a payload was supplied: make room for a header
+		 * in front of it and fill that in from the pcb.
+		 */
+		M_PREPEND(m, sizeof(struct ip), M_WAIT);
+		iph = mtod(m, struct ip *);
+		iph->ip_tos = 0;
+		iph->ip_off = 0;
+		iph->ip_p = inp->inp_ip.ip_p;
+		iph->ip_len = m->m_pkthdr.len;
+		iph->ip_src = inp->inp_laddr;
+		iph->ip_dst.s_addr = dst;
+		iph->ip_ttl = MAXTTL;
+		options = inp->inp_options;
+	}
+
+	return (ip_output(m, options, &inp->inp_route, outflags,
+	    inp->inp_moptions));
+}
+
+/*
+ * Raw IP socket option processing.
+ */ +int +rip_ctloutput(op, so, level, optname, m) + int op; + struct socket *so; + int level, optname; + struct mbuf **m; +{ + register struct inpcb *inp = sotoinpcb(so); + register int error; + + if (level != IPPROTO_IP) + return (EINVAL); + + switch (optname) { + + case IP_HDRINCL: + if (op == PRCO_SETOPT || op == PRCO_GETOPT) { + if (m == 0 || *m == 0 || (*m)->m_len < sizeof (int)) + return (EINVAL); + if (op == PRCO_SETOPT) { + if (*mtod(*m, int *)) + inp->inp_flags |= INP_HDRINCL; + else + inp->inp_flags &= ~INP_HDRINCL; + (void)m_free(*m); + } else { + (*m)->m_len = sizeof (int); + *mtod(*m, int *) = inp->inp_flags & INP_HDRINCL; + } + return (0); + } + break; + + case DVMRP_INIT: + case DVMRP_DONE: + case DVMRP_ADD_VIF: + case DVMRP_DEL_VIF: + case DVMRP_ADD_LGRP: + case DVMRP_DEL_LGRP: + case DVMRP_ADD_MRT: + case DVMRP_DEL_MRT: +#ifdef MROUTING + if (op == PRCO_SETOPT) { + error = ip_mrouter_cmd(optname, so, *m); + if (*m) + (void)m_free(*m); + } else + error = EINVAL; + return (error); +#else + if (op == PRCO_SETOPT && *m) + (void)m_free(*m); + return (EOPNOTSUPP); +#endif + } + return (ip_ctloutput(op, so, level, optname, m)); +} + +u_long rip_sendspace = RIPSNDQ; +u_long rip_recvspace = RIPRCVQ; + +/*ARGSUSED*/ +int +rip_usrreq(so, req, m, nam, control) + register struct socket *so; + int req; + struct mbuf *m, *nam, *control; +{ + register int error = 0; + register struct inpcb *inp = sotoinpcb(so); +#ifdef MROUTING + extern struct socket *ip_mrouter; +#endif + switch (req) { + + case PRU_ATTACH: + if (inp) + panic("rip_attach"); + if ((so->so_state & SS_PRIV) == 0) { + error = EACCES; + break; + } + if ((error = soreserve(so, rip_sendspace, rip_recvspace)) || + (error = in_pcballoc(so, &rawinpcb))) + break; + inp = (struct inpcb *)so->so_pcb; + inp->inp_ip.ip_p = (int)nam; + break; + + case PRU_DISCONNECT: + if ((so->so_state & SS_ISCONNECTED) == 0) { + error = ENOTCONN; + break; + } + /* FALLTHROUGH */ + case PRU_ABORT: + soisdisconnected(so); + /* 
FALLTHROUGH */ + case PRU_DETACH: + if (inp == 0) + panic("rip_detach"); +#ifdef MROUTING + if (so == ip_mrouter) + ip_mrouter_done(); +#endif + in_pcbdetach(inp); + break; + + case PRU_BIND: + { + struct sockaddr_in *addr = mtod(nam, struct sockaddr_in *); + + if (nam->m_len != sizeof(*addr)) { + error = EINVAL; + break; + } + if ((ifnet == 0) || + ((addr->sin_family != AF_INET) && + (addr->sin_family != AF_IMPLINK)) || + (addr->sin_addr.s_addr && + ifa_ifwithaddr((struct sockaddr *)addr) == 0)) { + error = EADDRNOTAVAIL; + break; + } + inp->inp_laddr = addr->sin_addr; + break; + } + case PRU_CONNECT: + { + struct sockaddr_in *addr = mtod(nam, struct sockaddr_in *); + + if (nam->m_len != sizeof(*addr)) { + error = EINVAL; + break; + } + if (ifnet == 0) { + error = EADDRNOTAVAIL; + break; + } + if ((addr->sin_family != AF_INET) && + (addr->sin_family != AF_IMPLINK)) { + error = EAFNOSUPPORT; + break; + } + inp->inp_faddr = addr->sin_addr; + soisconnected(so); + break; + } + + case PRU_CONNECT2: + error = EOPNOTSUPP; + break; + + /* + * Mark the connection as being incapable of further input. + */ + case PRU_SHUTDOWN: + socantsendmore(so); + break; + + /* + * Ship a packet out. The appropriate raw output + * routine handles any massaging necessary. + */ + case PRU_SEND: + { + register u_long dst; + + if (so->so_state & SS_ISCONNECTED) { + if (nam) { + error = EISCONN; + break; + } + dst = inp->inp_faddr.s_addr; + } else { + if (nam == NULL) { + error = ENOTCONN; + break; + } + dst = mtod(nam, struct sockaddr_in *)->sin_addr.s_addr; + } + error = rip_output(m, so, dst); + m = NULL; + break; + } + + case PRU_SENSE: + /* + * stat: don't bother with a blocksize. + */ + return (0); + + /* + * Not supported. 
+ */ + case PRU_RCVOOB: + case PRU_RCVD: + case PRU_LISTEN: + case PRU_ACCEPT: + case PRU_SENDOOB: + error = EOPNOTSUPP; + break; + + case PRU_SOCKADDR: + in_setsockaddr(inp, nam); + break; + + case PRU_PEERADDR: + in_setpeeraddr(inp, nam); + break; + + default: + panic("rip_usrreq"); + } + if (m != NULL) + m_freem(m); + return (error); +} diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h new file mode 100644 index 00000000000..6b77ff663a4 --- /dev/null +++ b/sys/netinet/tcp.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp.h 8.1 (Berkeley) 6/10/93
+ */
+
+typedef	u_long	tcp_seq;		/* sequence/acknowledgement number */
+/*
+ * TCP header.
+ * Per RFC 793, September, 1981.
+ */
+struct tcphdr {
+	u_short	th_sport;		/* source port */
+	u_short	th_dport;		/* destination port */
+	tcp_seq	th_seq;			/* sequence number */
+	tcp_seq	th_ack;			/* acknowledgement number */
+#if BYTE_ORDER == LITTLE_ENDIAN
+	u_char	th_x2:4,		/* (unused) */
+		th_off:4;		/* data offset */
+#endif
+#if BYTE_ORDER == BIG_ENDIAN
+	u_char	th_off:4,		/* data offset */
+		th_x2:4;		/* (unused) */
+#endif
+	u_char	th_flags;		/* bit mask of the TH_* values below */
+#define	TH_FIN	0x01
+#define	TH_SYN	0x02
+#define	TH_RST	0x04
+#define	TH_PUSH	0x08
+#define	TH_ACK	0x10
+#define	TH_URG	0x20
+	u_short	th_win;			/* window */
+	u_short	th_sum;			/* checksum */
+	u_short	th_urp;			/* urgent pointer */
+};
+
+/* TCP option kinds (TCPOPT_*) and their lengths (TCPOLEN_*). */
+#define	TCPOPT_EOL		0
+#define	TCPOPT_NOP		1
+#define	TCPOPT_MAXSEG		2
+#define    TCPOLEN_MAXSEG		4
+#define TCPOPT_WINDOW		3
+#define    TCPOLEN_WINDOW		3
+#define TCPOPT_SACK_PERMITTED	4		/* Experimental */
+#define    TCPOLEN_SACK_PERMITTED	2
+#define TCPOPT_SACK		5		/* Experimental */
+#define TCPOPT_TIMESTAMP	8
+#define    TCPOLEN_TIMESTAMP		10
+#define    TCPOLEN_TSTAMP_APPA		(TCPOLEN_TIMESTAMP+2) /* appendix A */
+
+/* One 32-bit word: NOP, NOP, timestamp kind, timestamp length. */
+#define TCPOPT_TSTAMP_HDR	\
+    (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)
+
+/*
+ * Default maximum segment size for TCP.
+ * With an IP MSS of 576, this is 536,
+ * but 512 is probably more convenient.
+ * This should be defined as MIN(512, IP_MSS - sizeof (struct tcpiphdr)).
+ */
+#define	TCP_MSS	512
+
+#define	TCP_MAXWIN	65535	/* largest value for (unscaled) window */
+
+#define TCP_MAX_WINSHIFT	14	/* maximum window shift */
+
+/*
+ * User-settable options (used with setsockopt).
+ */
+#define	TCP_NODELAY	0x01	/* don't delay send to coalesce packets */
+#define	TCP_MAXSEG	0x02	/* set maximum segment size */
diff --git a/sys/netinet/tcp_debug.c b/sys/netinet/tcp_debug.c
new file mode 100644
index 00000000000..ddb30927b4a
--- /dev/null
+++ b/sys/netinet/tcp_debug.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_debug.c 8.1 (Berkeley) 6/10/93 + */ + +#ifdef TCPDEBUG +/* load symbolic names */ +#define PRUREQUESTS +#define TCPSTATES +#define TCPTIMERS +#define TANAMES +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef TCPDEBUG +int tcpconsdebug = 0; +#endif +/* + * Tcp debug routines + */ +void +tcp_trace(act, ostate, tp, ti, req) + short act, ostate; + struct tcpcb *tp; + struct tcpiphdr *ti; + int req; +{ + tcp_seq seq, ack; + int len, flags; + struct tcp_debug *td = &tcp_debug[tcp_debx++]; + + if (tcp_debx == TCP_NDEBUG) + tcp_debx = 0; + td->td_time = iptime(); + td->td_act = act; + td->td_ostate = ostate; + td->td_tcb = (caddr_t)tp; + if (tp) + td->td_cb = *tp; + else + bzero((caddr_t)&td->td_cb, sizeof (*tp)); + if (ti) + td->td_ti = *ti; + else + bzero((caddr_t)&td->td_ti, sizeof (*ti)); + td->td_req = req; +#ifdef TCPDEBUG + if (tcpconsdebug == 0) + return; + if (tp) + printf("%x %s:", tp, tcpstates[ostate]); + else + printf("???????? 
"); + printf("%s ", tanames[act]); + switch (act) { + + case TA_INPUT: + case TA_OUTPUT: + case TA_DROP: + if (ti == 0) + break; + seq = ti->ti_seq; + ack = ti->ti_ack; + len = ti->ti_len; + if (act == TA_OUTPUT) { + seq = ntohl(seq); + ack = ntohl(ack); + len = ntohs((u_short)len); + } + if (act == TA_OUTPUT) + len -= sizeof (struct tcphdr); + if (len) + printf("[%x..%x)", seq, seq+len); + else + printf("%x", seq); + printf("@%x, urp=%x", ack, ti->ti_urp); + flags = ti->ti_flags; + if (flags) { +#ifndef lint + char *cp = "<"; +#define pf(f) { if (ti->ti_flags&TH_/**/f) { printf("%s%s", cp, "f"); cp = ","; } } + pf(SYN); pf(ACK); pf(FIN); pf(RST); pf(PUSH); pf(URG); +#endif + printf(">"); + } + break; + + case TA_USER: + printf("%s", prurequests[req&0xff]); + if ((req & 0xff) == PRU_SLOWTIMO) + printf("<%s>", tcptimers[req>>8]); + break; + } + if (tp) + printf(" -> %s", tcpstates[tp->t_state]); + /* print out internal state of tp !?! */ + printf("\n"); + if (tp == 0) + return; + printf("\trcv_(nxt,wnd,up) (%x,%x,%x) snd_(una,nxt,max) (%x,%x,%x)\n", + tp->rcv_nxt, tp->rcv_wnd, tp->rcv_up, tp->snd_una, tp->snd_nxt, + tp->snd_max); + printf("\tsnd_(wl1,wl2,wnd) (%x,%x,%x)\n", + tp->snd_wl1, tp->snd_wl2, tp->snd_wnd); +#endif /* TCPDEBUG */ +} diff --git a/sys/netinet/tcp_debug.h b/sys/netinet/tcp_debug.h new file mode 100644 index 00000000000..c02c0cd521d --- /dev/null +++ b/sys/netinet/tcp_debug.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * @(#)tcp_debug.h 8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * One trace record, as filled in by tcp_trace().
+ */
+struct	tcp_debug {
+	n_time	td_time;		/* iptime() when the record was made */
+	short	td_act;			/* TA_* action code */
+	short	td_ostate;		/* state passed in as "ostate" */
+	caddr_t	td_tcb;			/* address of the traced tcpcb */
+	struct	tcpiphdr td_ti;		/* copy of the header, or zeroes */
+	short	td_req;			/* request code passed to tcp_trace() */
+	struct	tcpcb td_cb;		/* copy of the tcpcb, or zeroes */
+};
+
+#define	TA_INPUT 	0
+#define	TA_OUTPUT	1
+#define	TA_USER		2
+#define	TA_RESPOND	3
+#define	TA_DROP		4
+
+#ifdef TANAMES
+/* Printable names, indexed by TA_* value. */
+char	*tanames[] =
+    { "input", "output", "user", "respond", "drop" };
+#endif
+
+/* Trace buffer; tcp_debx is the next slot and wraps at TCP_NDEBUG. */
+#define	TCP_NDEBUG 100
+struct	tcp_debug tcp_debug[TCP_NDEBUG];
+int	tcp_debx;
diff --git a/sys/netinet/tcp_fsm.h b/sys/netinet/tcp_fsm.h
new file mode 100644
index 00000000000..c5da7fc32d9
--- /dev/null
+++ b/sys/netinet/tcp_fsm.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_fsm.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * TCP FSM state definitions. + * Per RFC793, September, 1981. + */ + +#define TCP_NSTATES 11 + +#define TCPS_CLOSED 0 /* closed */ +#define TCPS_LISTEN 1 /* listening for connection */ +#define TCPS_SYN_SENT 2 /* active, have sent syn */ +#define TCPS_SYN_RECEIVED 3 /* have sent and received syn */ +/* states < TCPS_ESTABLISHED are those where connections not established */ +#define TCPS_ESTABLISHED 4 /* established */ +#define TCPS_CLOSE_WAIT 5 /* rcvd fin, waiting for close */ +/* states > TCPS_CLOSE_WAIT are those where user has closed */ +#define TCPS_FIN_WAIT_1 6 /* have closed, sent fin */ +#define TCPS_CLOSING 7 /* closed xchd FIN; await FIN ACK */ +#define TCPS_LAST_ACK 8 /* had fin and close; await FIN ACK */ +/* states > TCPS_CLOSE_WAIT && < TCPS_FIN_WAIT_2 await ACK of FIN */ +#define TCPS_FIN_WAIT_2 9 /* have closed, fin is acked */ +#define TCPS_TIME_WAIT 10 /* in 2*msl quiet wait after close */ + +#define TCPS_HAVERCVDSYN(s) ((s) >= TCPS_SYN_RECEIVED) /* true for SYN_RECEIVED and every higher-numbered state */ +#define TCPS_HAVERCVDFIN(s) ((s) >= TCPS_TIME_WAIT) /* NOTE(review): true only for TIME_WAIT, although CLOSE_WAIT, CLOSING and LAST_ACK are also described above as having received a FIN -- confirm callers intend this */ + +#ifdef TCPOUTFLAGS +/* + * Flags used when sending segments in tcp_output. + * Basic flags (TH_RST,TH_ACK,TH_SYN,TH_FIN) are totally + * determined by state, with the proviso that TH_FIN is sent only + * if all data queued for output is included in the segment.
+ */ +u_char tcp_outflags[TCP_NSTATES] = { + TH_RST|TH_ACK, 0, TH_SYN, TH_SYN|TH_ACK, /* CLOSED, LISTEN, SYN_SENT, SYN_RECEIVED */ + TH_ACK, TH_ACK, /* ESTABLISHED, CLOSE_WAIT */ + TH_FIN|TH_ACK, TH_FIN|TH_ACK, TH_FIN|TH_ACK, TH_ACK, TH_ACK, /* FIN_WAIT_1, CLOSING, LAST_ACK, FIN_WAIT_2, TIME_WAIT */ +}; +#endif + +#ifdef KPROF +int tcp_acounts[TCP_NSTATES][PRU_NREQ]; +#endif + +#ifdef TCPSTATES +char *tcpstates[] = { /* printable state names, in TCPS_* numeric order */ + "CLOSED", "LISTEN", "SYN_SENT", "SYN_RCVD", + "ESTABLISHED", "CLOSE_WAIT", "FIN_WAIT_1", "CLOSING", + "LAST_ACK", "FIN_WAIT_2", "TIME_WAIT", +}; +#endif diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c new file mode 100644 index 00000000000..2dd1d749c40 --- /dev/null +++ b/sys/netinet/tcp_input.c @@ -0,0 +1,1647 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_input.c 8.5 (Berkeley) 4/10/94 + */ + +#ifndef TUBA_INCLUDE +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/errno.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/in_pcb.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#include <netinet/tcp_debug.h> + +int tcprexmtthresh = 3; /* duplicate-ACK count at which tcp_input retransmits */ +struct tcpiphdr tcp_saveti; /* copy of the incoming header, saved when SO_DEBUG is set */ +struct inpcb *tcp_last_inpcb = &tcb; /* one-entry cache of the last PCB matched at findpcb */ + +extern u_long sb_max; + +#endif /* TUBA_INCLUDE */ +#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ) /* 24 days; ts_recent older than this is invalidated */ + +/* for modulo comparisons of timestamps */ +#define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) +#define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) + + +/* + * Insert segment ti into reassembly queue of tcp with + * control block tp. Return TH_FIN if reassembly now includes + * a segment with FIN. The macro form does the common case inline + * (segment is the next to be received on an established connection, + * and the queue is empty), avoiding linkage into and removal + * from the queue and repetition of various conversions. + * Set DELACK for segments received in order, but ack immediately + * when segments are out of order (so fast retransmit can work).
+ */ +#define TCP_REASS(tp, ti, m, so, flags) { \ + if ((ti)->ti_seq == (tp)->rcv_nxt && \ + (tp)->seg_next == (struct tcpiphdr *)(tp) && \ + (tp)->t_state == TCPS_ESTABLISHED) { \ + (tp)->t_flags |= TF_DELACK; \ + (tp)->rcv_nxt += (ti)->ti_len; \ + (flags) = (ti)->ti_flags & TH_FIN; \ + tcpstat.tcps_rcvpack++;\ + tcpstat.tcps_rcvbyte += (ti)->ti_len;\ + sbappend(&(so)->so_rcv, (m)); \ + sorwakeup(so); \ + } else { \ + (flags) = tcp_reass((tp), (ti), (m)); \ + (tp)->t_flags |= TF_ACKNOW; \ + } \ +} +#ifndef TUBA_INCLUDE + +int +tcp_reass(tp, ti, m) + register struct tcpcb *tp; + register struct tcpiphdr *ti; + struct mbuf *m; +{ + register struct tcpiphdr *q; + struct socket *so = tp->t_inpcb->inp_socket; + int flags; + + /* + * Call with ti==0 after the connection becomes established to + * force pre-ESTABLISHED data up to user socket. + */ + if (ti == 0) + goto present; + + /* + * Find a segment which begins after this one does. + */ + for (q = tp->seg_next; q != (struct tcpiphdr *)tp; + q = (struct tcpiphdr *)q->ti_next) + if (SEQ_GT(q->ti_seq, ti->ti_seq)) + break; + + /* + * If there is a preceding segment, it may provide some of + * our data already. If so, drop the data from the incoming + * segment. If it provides all of our data, drop us. + */ + if ((struct tcpiphdr *)q->ti_prev != (struct tcpiphdr *)tp) { + register int i; + q = (struct tcpiphdr *)q->ti_prev; + /* conversion to int (in i) handles seq wraparound */ + i = q->ti_seq + q->ti_len - ti->ti_seq; + if (i > 0) { + if (i >= ti->ti_len) { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += ti->ti_len; + m_freem(m); + return (0); + } + m_adj(m, i); + ti->ti_len -= i; + ti->ti_seq += i; + } + q = (struct tcpiphdr *)(q->ti_next); + } + tcpstat.tcps_rcvoopack++; + tcpstat.tcps_rcvoobyte += ti->ti_len; + REASS_MBUF(ti) = m; /* XXX */ + + /* + * While we overlap succeeding segments trim them or, + * if they are completely covered, dequeue them.
+ */ + while (q != (struct tcpiphdr *)tp) { + register int i = (ti->ti_seq + ti->ti_len) - q->ti_seq; + if (i <= 0) + break; + if (i < q->ti_len) { + q->ti_seq += i; + q->ti_len -= i; + m_adj(REASS_MBUF(q), i); + break; + } + q = (struct tcpiphdr *)q->ti_next; + m = REASS_MBUF((struct tcpiphdr *)q->ti_prev); + remque(q->ti_prev); + m_freem(m); + } + + /* + * Stick new segment in its place. + */ + insque(ti, q->ti_prev); + +present: + /* + * Present data to user, advancing rcv_nxt through + * completed sequence space. + */ + if (TCPS_HAVERCVDSYN(tp->t_state) == 0) + return (0); + ti = tp->seg_next; + if (ti == (struct tcpiphdr *)tp || ti->ti_seq != tp->rcv_nxt) + return (0); + if (tp->t_state == TCPS_SYN_RECEIVED && ti->ti_len) + return (0); + do { + tp->rcv_nxt += ti->ti_len; + flags = ti->ti_flags & TH_FIN; + remque(ti); + m = REASS_MBUF(ti); + ti = (struct tcpiphdr *)ti->ti_next; + if (so->so_state & SS_CANTRCVMORE) + m_freem(m); + else + sbappend(&so->so_rcv, m); + } while (ti != (struct tcpiphdr *)tp && ti->ti_seq == tp->rcv_nxt); + sorwakeup(so); + return (flags); +} + +/* + * TCP input routine, follows pages 65-76 of the + * protocol specification dated September, 1981 very closely. + */ +void +tcp_input(m, iphlen) + register struct mbuf *m; + int iphlen; +{ + register struct tcpiphdr *ti; + register struct inpcb *inp; + caddr_t optp = NULL; + int optlen; + int len, tlen, off; + register struct tcpcb *tp = 0; + register int tiflags; + struct socket *so; + int todrop, acked, ourfinisacked, needoutput = 0; + short ostate; + struct in_addr laddr; + int dropsocket = 0; + int iss = 0; + u_long tiwin, ts_val, ts_ecr; + int ts_present = 0; + + tcpstat.tcps_rcvtotal++; + /* + * Get IP and TCP header together in first mbuf. + * Note: IP leaves IP header in first mbuf. 
+ */ + ti = mtod(m, struct tcpiphdr *); + if (iphlen > sizeof (struct ip)) + ip_stripoptions(m, (struct mbuf *)0); + if (m->m_len < sizeof (struct tcpiphdr)) { + if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { + tcpstat.tcps_rcvshort++; + return; + } + ti = mtod(m, struct tcpiphdr *); + } + + /* + * Checksum extended TCP header and data. + */ + tlen = ((struct ip *)ti)->ip_len; + len = sizeof (struct ip) + tlen; + ti->ti_next = ti->ti_prev = 0; + ti->ti_x1 = 0; + ti->ti_len = (u_short)tlen; + HTONS(ti->ti_len); + if (ti->ti_sum = in_cksum(m, len)) { + tcpstat.tcps_rcvbadsum++; + goto drop; + } +#endif /* TUBA_INCLUDE */ + + /* + * Check that TCP offset makes sense, + * pull out TCP options and adjust length. XXX + */ + off = ti->ti_off << 2; + if (off < sizeof (struct tcphdr) || off > tlen) { + tcpstat.tcps_rcvbadoff++; + goto drop; + } + tlen -= off; + ti->ti_len = tlen; + if (off > sizeof (struct tcphdr)) { + if (m->m_len < sizeof(struct ip) + off) { + if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { + tcpstat.tcps_rcvshort++; + return; + } + ti = mtod(m, struct tcpiphdr *); + } + optlen = off - sizeof (struct tcphdr); + optp = mtod(m, caddr_t) + sizeof (struct tcpiphdr); + /* + * Do quick retrieval of timestamp options ("options + * prediction?"). If timestamp is the only option and it's + * formatted as recommended in RFC 1323 appendix A, we + * quickly get the values now and not bother calling + * tcp_dooptions(), etc. + */ + if ((optlen == TCPOLEN_TSTAMP_APPA || + (optlen > TCPOLEN_TSTAMP_APPA && + optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && + *(u_long *)optp == htonl(TCPOPT_TSTAMP_HDR) && + (ti->ti_flags & TH_SYN) == 0) { + ts_present = 1; + ts_val = ntohl(*(u_long *)(optp + 4)); + ts_ecr = ntohl(*(u_long *)(optp + 8)); + optp = NULL; /* we've parsed the options */ + } + } + tiflags = ti->ti_flags; + + /* + * Convert TCP protocol specific fields to host format. 
+ */ + NTOHL(ti->ti_seq); + NTOHL(ti->ti_ack); + NTOHS(ti->ti_win); + NTOHS(ti->ti_urp); + + /* + * Locate pcb for segment. + */ +findpcb: + inp = tcp_last_inpcb; + if (inp->inp_lport != ti->ti_dport || + inp->inp_fport != ti->ti_sport || + inp->inp_faddr.s_addr != ti->ti_src.s_addr || + inp->inp_laddr.s_addr != ti->ti_dst.s_addr) { + inp = in_pcblookup(&tcb, ti->ti_src, ti->ti_sport, + ti->ti_dst, ti->ti_dport, INPLOOKUP_WILDCARD); + if (inp) + tcp_last_inpcb = inp; + ++tcpstat.tcps_pcbcachemiss; + } + + /* + * If the state is CLOSED (i.e., TCB does not exist) then + * all data in the incoming segment is discarded. + * If the TCB exists but is in CLOSED state, it is embryonic, + * but should either do a listen or a connect soon. + */ + if (inp == 0) + goto dropwithreset; + tp = intotcpcb(inp); + if (tp == 0) + goto dropwithreset; + if (tp->t_state == TCPS_CLOSED) + goto drop; + + /* Unscale the window into a 32-bit value. */ + if ((tiflags & TH_SYN) == 0) + tiwin = ti->ti_win << tp->snd_scale; + else + tiwin = ti->ti_win; + + so = inp->inp_socket; + if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { + if (so->so_options & SO_DEBUG) { + ostate = tp->t_state; + tcp_saveti = *ti; + } + if (so->so_options & SO_ACCEPTCONN) { + so = sonewconn(so, 0); + if (so == 0) + goto drop; + /* + * This is ugly, but .... + * + * Mark socket as temporary until we're + * committed to keeping it. The code at + * ``drop'' and ``dropwithreset'' check the + * flag dropsocket to see if the temporary + * socket created here should be discarded. + * We mark the socket as discardable until + * we're committed to it below in TCPS_LISTEN. 
+ */ + dropsocket++; + inp = (struct inpcb *)so->so_pcb; + inp->inp_laddr = ti->ti_dst; + inp->inp_lport = ti->ti_dport; +#if BSD>=43 + inp->inp_options = ip_srcroute(); +#endif + tp = intotcpcb(inp); + tp->t_state = TCPS_LISTEN; + + /* Compute proper scaling value from buffer space + */ + while (tp->request_r_scale < TCP_MAX_WINSHIFT && + TCP_MAXWIN << tp->request_r_scale < so->so_rcv.sb_hiwat) + tp->request_r_scale++; + } + } + + /* + * Segment received on connection. + * Reset idle time and keep-alive timer. + */ + tp->t_idle = 0; + tp->t_timer[TCPT_KEEP] = tcp_keepidle; + + /* + * Process options if not in LISTEN state, + * else do it below (after getting remote address). + */ + if (optp && tp->t_state != TCPS_LISTEN) + tcp_dooptions(tp, optp, optlen, ti, + &ts_present, &ts_val, &ts_ecr); + + /* + * Header prediction: check for the two common cases + * of a uni-directional data xfer. If the packet has + * no control flags, is in-sequence, the window didn't + * change and we're not retransmitting, it's a + * candidate. If the length is zero and the ack moved + * forward, we're the sender side of the xfer. Just + * free the data acked & wake any higher level process + * that was blocked waiting for space. If the length + * is non-zero and the ack didn't move, we're the + * receiver side. If we're getting packets in-order + * (the reassembly queue is empty), add the data to + * the socket buffer and note that we need a delayed ack. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && + (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) && + ti->ti_seq == tp->rcv_nxt && + tiwin && tiwin == tp->snd_wnd && + tp->snd_nxt == tp->snd_max) { + + /* + * If last ACK falls within this segment's sequence numbers, + * record the timestamp. 
+ */ + if (ts_present && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) && + SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len)) { + tp->ts_recent_age = tcp_now; + tp->ts_recent = ts_val; + } + + if (ti->ti_len == 0) { + if (SEQ_GT(ti->ti_ack, tp->snd_una) && + SEQ_LEQ(ti->ti_ack, tp->snd_max) && + tp->snd_cwnd >= tp->snd_wnd) { + /* + * this is a pure ack for outstanding data. + */ + ++tcpstat.tcps_predack; + if (ts_present) + tcp_xmit_timer(tp, tcp_now-ts_ecr+1); + else if (tp->t_rtt && + SEQ_GT(ti->ti_ack, tp->t_rtseq)) + tcp_xmit_timer(tp, tp->t_rtt); + acked = ti->ti_ack - tp->snd_una; + tcpstat.tcps_rcvackpack++; + tcpstat.tcps_rcvackbyte += acked; + sbdrop(&so->so_snd, acked); + tp->snd_una = ti->ti_ack; + m_freem(m); + + /* + * If all outstanding data are acked, stop + * retransmit timer, otherwise restart timer + * using current (possibly backed-off) value. + * If process is waiting for space, + * wakeup/selwakeup/signal. If data + * are ready to send, let tcp_output + * decide between more output or persist. + */ + if (tp->snd_una == tp->snd_max) + tp->t_timer[TCPT_REXMT] = 0; + else if (tp->t_timer[TCPT_PERSIST] == 0) + tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + + if (so->so_snd.sb_flags & SB_NOTIFY) + sowwakeup(so); + if (so->so_snd.sb_cc) + (void) tcp_output(tp); + return; + } + } else if (ti->ti_ack == tp->snd_una && + tp->seg_next == (struct tcpiphdr *)tp && + ti->ti_len <= sbspace(&so->so_rcv)) { + /* + * this is a pure, in-sequence data packet + * with nothing on the reassembly queue and + * we have enough buffer space to take it. + */ + ++tcpstat.tcps_preddat; + tp->rcv_nxt += ti->ti_len; + tcpstat.tcps_rcvpack++; + tcpstat.tcps_rcvbyte += ti->ti_len; + /* + * Drop TCP, IP headers and TCP options then add data + * to socket buffer. 
+ */ + m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); + m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); + sbappend(&so->so_rcv, m); + sorwakeup(so); + tp->t_flags |= TF_DELACK; + return; + } + } + + /* + * Drop TCP, IP headers and TCP options. + */ + m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); + m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); + + /* + * Calculate amount of space in receive window, + * and then do TCP input processing. + * Receive window is amount of space in rcv queue, + * but not less than advertised window. + */ + { int win; + + win = sbspace(&so->so_rcv); + if (win < 0) + win = 0; + tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt)); + } + + switch (tp->t_state) { + + /* + * If the state is LISTEN then ignore segment if it contains an RST. + * If the segment contains an ACK then it is bad and send a RST. + * If it does not contain a SYN then it is not interesting; drop it. + * Don't bother responding if the destination was a broadcast. + * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial + * tp->iss, and send a segment: + * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> + * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss. + * Fill in remote peer address fields if not previously specified. + * Enter SYN_RECEIVED state, and process any other fields of this + * segment in this state. + */ + case TCPS_LISTEN: { + struct mbuf *am; + register struct sockaddr_in *sin; + + if (tiflags & TH_RST) + goto drop; + if (tiflags & TH_ACK) + goto dropwithreset; + if ((tiflags & TH_SYN) == 0) + goto drop; + /* + * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN + * in_broadcast() should never return true on a received + * packet with M_BCAST not set.
+ */ + if (m->m_flags & (M_BCAST|M_MCAST) || + IN_MULTICAST(ti->ti_dst.s_addr)) + goto drop; + am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ + if (am == NULL) + goto drop; + am->m_len = sizeof (struct sockaddr_in); + sin = mtod(am, struct sockaddr_in *); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = ti->ti_src; + sin->sin_port = ti->ti_sport; + bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); + laddr = inp->inp_laddr; + if (inp->inp_laddr.s_addr == INADDR_ANY) + inp->inp_laddr = ti->ti_dst; + if (in_pcbconnect(inp, am)) { + inp->inp_laddr = laddr; + (void) m_free(am); + goto drop; + } + (void) m_free(am); + tp->t_template = tcp_template(tp); + if (tp->t_template == 0) { + tp = tcp_drop(tp, ENOBUFS); + dropsocket = 0; /* socket is already gone */ + goto drop; + } + if (optp) + tcp_dooptions(tp, optp, optlen, ti, + &ts_present, &ts_val, &ts_ecr); + if (iss) + tp->iss = iss; + else + tp->iss = tcp_iss; + tcp_iss += TCP_ISSINCR/2; + tp->irs = ti->ti_seq; + tcp_sendseqinit(tp); + tcp_rcvseqinit(tp); + tp->t_flags |= TF_ACKNOW; + tp->t_state = TCPS_SYN_RECEIVED; + tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT; + dropsocket = 0; /* committed to socket */ + tcpstat.tcps_accepts++; + goto trimthenstep6; + } + + /* + * If the state is SYN_SENT: + * if seg contains an ACK, but not for our SYN, drop the input. + * if seg contains a RST, then drop the connection. + * if seg does not contain SYN, then drop it. 
+ * Otherwise this is an acceptable SYN segment + * initialize tp->rcv_nxt and tp->irs + * if seg contains ack then advance tp->snd_una + * if SYN has been acked change to ESTABLISHED else SYN_RCVD state + * arrange for segment to be acked (eventually) + * continue processing rest of data/controls, beginning with URG + */ + case TCPS_SYN_SENT: + if ((tiflags & TH_ACK) && + (SEQ_LEQ(ti->ti_ack, tp->iss) || + SEQ_GT(ti->ti_ack, tp->snd_max))) + goto dropwithreset; + if (tiflags & TH_RST) { + if (tiflags & TH_ACK) + tp = tcp_drop(tp, ECONNREFUSED); + goto drop; + } + if ((tiflags & TH_SYN) == 0) + goto drop; + if (tiflags & TH_ACK) { + tp->snd_una = ti->ti_ack; + if (SEQ_LT(tp->snd_nxt, tp->snd_una)) + tp->snd_nxt = tp->snd_una; + } + tp->t_timer[TCPT_REXMT] = 0; + tp->irs = ti->ti_seq; + tcp_rcvseqinit(tp); + tp->t_flags |= TF_ACKNOW; + if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) { + tcpstat.tcps_connects++; + soisconnected(so); + tp->t_state = TCPS_ESTABLISHED; + /* Do window scaling on this connection? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + (void) tcp_reass(tp, (struct tcpiphdr *)0, + (struct mbuf *)0); + /* + * if we didn't have to retransmit the SYN, + * use its rtt as our initial srtt & rtt var. + */ + if (tp->t_rtt) + tcp_xmit_timer(tp, tp->t_rtt); + } else + tp->t_state = TCPS_SYN_RECEIVED; + +trimthenstep6: + /* + * Advance ti->ti_seq to correspond to first data byte. + * If data, trim to stay within window, + * dropping FIN if necessary. + */ + ti->ti_seq++; + if (ti->ti_len > tp->rcv_wnd) { + todrop = ti->ti_len - tp->rcv_wnd; + m_adj(m, -todrop); + ti->ti_len = tp->rcv_wnd; + tiflags &= ~TH_FIN; + tcpstat.tcps_rcvpackafterwin++; + tcpstat.tcps_rcvbyteafterwin += todrop; + } + tp->snd_wl1 = ti->ti_seq - 1; + tp->rcv_up = ti->ti_seq; + goto step6; + } + + /* + * States other than LISTEN or SYN_SENT. 
+ * First check timestamp, if present. + * Then check that at least some bytes of segment are within + * receive window. If segment begins before rcv_nxt, + * drop leading data (and SYN); if nothing left, just ack. + * + * RFC 1323 PAWS: If we have a timestamp reply on this segment + * and it's less than ts_recent, drop it. + */ + if (ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && + TSTMP_LT(ts_val, tp->ts_recent)) { + + /* Check to see if ts_recent is over 24 days old. */ + if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { + /* + * Invalidate ts_recent. If this segment updates + * ts_recent, the age will be reset later and ts_recent + * will get a valid value. If it does not, setting + * ts_recent to zero will at least satisfy the + * requirement that zero be placed in the timestamp + * echo reply when ts_recent isn't valid. The + * age isn't reset until we get a valid ts_recent + * because we don't want out-of-order segments to be + * dropped when ts_recent is old. + */ + tp->ts_recent = 0; + } else { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += ti->ti_len; + tcpstat.tcps_pawsdrop++; + goto dropafterack; + } + } + + todrop = tp->rcv_nxt - ti->ti_seq; + if (todrop > 0) { + if (tiflags & TH_SYN) { + tiflags &= ~TH_SYN; + ti->ti_seq++; + if (ti->ti_urp > 1) + ti->ti_urp--; + else + tiflags &= ~TH_URG; + todrop--; + } + if (todrop >= ti->ti_len) { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += ti->ti_len; + /* + * If segment is just one to the left of the window, + * check two special cases: + * 1. Don't toss RST in response to 4.2-style keepalive. + * 2. If the only thing to drop is a FIN, we can drop + * it, but check the ACK or we will get into FIN + * wars if our FINs crossed (both CLOSING). + * In either case, send ACK to resynchronize, + * but keep on processing for RST or ACK. 
+ */ + if ((tiflags & TH_FIN && todrop == ti->ti_len + 1) +#ifdef TCP_COMPAT_42 + || (tiflags & TH_RST && ti->ti_seq == tp->rcv_nxt - 1) +#endif + ) { + todrop = ti->ti_len; + tiflags &= ~TH_FIN; + tp->t_flags |= TF_ACKNOW; + } else { + /* + * Handle the case when a bound socket connects + * to itself. Allow packets with a SYN and + * an ACK to continue with the processing. + */ + if (todrop != 0 || (tiflags & TH_ACK) == 0) + goto dropafterack; + } + } else { + tcpstat.tcps_rcvpartduppack++; + tcpstat.tcps_rcvpartdupbyte += todrop; + } + m_adj(m, todrop); + ti->ti_seq += todrop; + ti->ti_len -= todrop; + if (ti->ti_urp > todrop) + ti->ti_urp -= todrop; + else { + tiflags &= ~TH_URG; + ti->ti_urp = 0; + } + } + + /* + * If new data are received on a connection after the + * user processes are gone, then RST the other end. + */ + if ((so->so_state & SS_NOFDREF) && + tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) { + tp = tcp_close(tp); + tcpstat.tcps_rcvafterclose++; + goto dropwithreset; + } + + /* + * If segment ends after window, drop trailing data + * (and PUSH and FIN); if nothing left, just ACK. + */ + todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd); + if (todrop > 0) { + tcpstat.tcps_rcvpackafterwin++; + if (todrop >= ti->ti_len) { + tcpstat.tcps_rcvbyteafterwin += ti->ti_len; + /* + * If a new connection request is received + * while in TIME_WAIT, drop the old connection + * and start over if the sequence numbers + * are above the previous ones. + */ + if (tiflags & TH_SYN && + tp->t_state == TCPS_TIME_WAIT && + SEQ_GT(ti->ti_seq, tp->rcv_nxt)) { + iss = tp->rcv_nxt + TCP_ISSINCR; + tp = tcp_close(tp); + goto findpcb; + } + /* + * If window is closed can only take segments at + * window edge, and have to drop data and PUSH from + * incoming segments. Continue processing, but + * remember to ack. Otherwise, drop segment + * and ack. 
+ */ + if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) { + tp->t_flags |= TF_ACKNOW; + tcpstat.tcps_rcvwinprobe++; + } else + goto dropafterack; + } else + tcpstat.tcps_rcvbyteafterwin += todrop; + m_adj(m, -todrop); + ti->ti_len -= todrop; + tiflags &= ~(TH_PUSH|TH_FIN); + } + + /* + * If last ACK falls within this segment's sequence numbers, + * record its timestamp. + */ + if (ts_present && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) && + SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len + + ((tiflags & (TH_SYN|TH_FIN)) != 0))) { + tp->ts_recent_age = tcp_now; + tp->ts_recent = ts_val; + } + + /* + * If the RST bit is set examine the state: + * SYN_RECEIVED STATE: + * If passive open, return to LISTEN state. + * If active open, inform user that connection was refused. + * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: + * Inform user that connection was reset, and close tcb. + * CLOSING, LAST_ACK, TIME_WAIT STATES + * Close the tcb. + */ + if (tiflags&TH_RST) switch (tp->t_state) { + + case TCPS_SYN_RECEIVED: + so->so_error = ECONNREFUSED; + goto close; + + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + so->so_error = ECONNRESET; + close: + tp->t_state = TCPS_CLOSED; + tcpstat.tcps_drops++; + tp = tcp_close(tp); + goto drop; + + case TCPS_CLOSING: + case TCPS_LAST_ACK: + case TCPS_TIME_WAIT: + tp = tcp_close(tp); + goto drop; + } + + /* + * If a SYN is in the window, then this is an + * error and we send an RST and drop the connection. + */ + if (tiflags & TH_SYN) { + tp = tcp_drop(tp, ECONNRESET); + goto dropwithreset; + } + + /* + * If the ACK bit is off we drop the segment and return. + */ + if ((tiflags & TH_ACK) == 0) + goto drop; + + /* + * Ack processing. + */ + switch (tp->t_state) { + + /* + * In SYN_RECEIVED state if the ack ACKs our SYN then enter + * ESTABLISHED state and continue processing, otherwise + * send an RST. 
+ */ + case TCPS_SYN_RECEIVED: + if (SEQ_GT(tp->snd_una, ti->ti_ack) || + SEQ_GT(ti->ti_ack, tp->snd_max)) + goto dropwithreset; + tcpstat.tcps_connects++; + soisconnected(so); + tp->t_state = TCPS_ESTABLISHED; + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + (void) tcp_reass(tp, (struct tcpiphdr *)0, (struct mbuf *)0); + tp->snd_wl1 = ti->ti_seq - 1; + /* fall into ... */ + + /* + * In ESTABLISHED state: drop duplicate ACKs; ACK out of range + * ACKs. If the ack is in the range + * tp->snd_una < ti->ti_ack <= tp->snd_max + * then advance tp->snd_una to ti->ti_ack and drop + * data from the retransmission queue. If this ACK reflects + * more up to date window information we update our window information. + */ + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + case TCPS_CLOSING: + case TCPS_LAST_ACK: + case TCPS_TIME_WAIT: + + if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) { + if (ti->ti_len == 0 && tiwin == tp->snd_wnd) { + tcpstat.tcps_rcvdupack++; + /* + * If we have outstanding data (other than + * a window probe), this is a completely + * duplicate ack (i.e., window info didn't + * change), the ack is the biggest we've + * seen and we've seen exactly our rexmt + * threshold of them, assume a packet + * has been dropped and retransmit it. + * Kludge snd_nxt & the congestion + * window so we send only this one + * packet. + * + * We know we're losing at the current + * window size so do congestion avoidance + * (set ssthresh to half the current window + * and pull our congestion window back to + * the new ssthresh). + * + * Dup acks mean that packets have left the + * network (they're now cached at the receiver) + * so bump cwnd by the amount in the receiver + * to keep a constant cwnd packets in the + * network.
+ */ + if (tp->t_timer[TCPT_REXMT] == 0 || + ti->ti_ack != tp->snd_una) + tp->t_dupacks = 0; + else if (++tp->t_dupacks == tcprexmtthresh) { + tcp_seq onxt = tp->snd_nxt; + u_int win = + min(tp->snd_wnd, tp->snd_cwnd) / 2 / + tp->t_maxseg; + + if (win < 2) + win = 2; + tp->snd_ssthresh = win * tp->t_maxseg; + tp->t_timer[TCPT_REXMT] = 0; + tp->t_rtt = 0; + tp->snd_nxt = ti->ti_ack; + tp->snd_cwnd = tp->t_maxseg; + (void) tcp_output(tp); + tp->snd_cwnd = tp->snd_ssthresh + + tp->t_maxseg * tp->t_dupacks; + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + goto drop; + } else if (tp->t_dupacks > tcprexmtthresh) { + tp->snd_cwnd += tp->t_maxseg; + (void) tcp_output(tp); + goto drop; + } + } else + tp->t_dupacks = 0; + break; + } + /* + * If the congestion window was inflated to account + * for the other side's cached packets, retract it. + */ + if (tp->t_dupacks > tcprexmtthresh && + tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + tp->t_dupacks = 0; + if (SEQ_GT(ti->ti_ack, tp->snd_max)) { + tcpstat.tcps_rcvacktoomuch++; + goto dropafterack; + } + acked = ti->ti_ack - tp->snd_una; + tcpstat.tcps_rcvackpack++; + tcpstat.tcps_rcvackbyte += acked; + + /* + * If we have a timestamp reply, update smoothed + * round trip time. If no timestamp is present but + * transmit timer is running and timed sequence + * number was acked, update smoothed round trip time. + * Since we now have an rtt measurement, cancel the + * timer backoff (cf., Phil Karn's retransmit alg.). + * Recompute the initial retransmit timer. + */ + if (ts_present) + tcp_xmit_timer(tp, tcp_now-ts_ecr+1); + else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq)) + tcp_xmit_timer(tp,tp->t_rtt); + + /* + * If all outstanding data is acked, stop retransmit + * timer and remember to restart (more output or persist). + * If there is more data to be acked, restart retransmit + * timer, using current (possibly backed-off) value. 
+ */ + if (ti->ti_ack == tp->snd_max) { + tp->t_timer[TCPT_REXMT] = 0; + needoutput = 1; + } else if (tp->t_timer[TCPT_PERSIST] == 0) + tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + /* + * When new data is acked, open the congestion window. + * If the window gives us less than ssthresh packets + * in flight, open exponentially (maxseg per packet). + * Otherwise open linearly: maxseg per window + * (maxseg^2 / cwnd per packet), plus a constant + * fraction of a packet (maxseg/8) to help larger windows + * open quickly enough. + */ + { + register u_int cw = tp->snd_cwnd; + register u_int incr = tp->t_maxseg; + + if (cw > tp->snd_ssthresh) + incr = incr * incr / cw + incr / 8; + tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale); /* cap at the largest window the send scale can express */ + } + if (acked > so->so_snd.sb_cc) { + tp->snd_wnd -= so->so_snd.sb_cc; + sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); + ourfinisacked = 1; + } else { + sbdrop(&so->so_snd, acked); + tp->snd_wnd -= acked; + ourfinisacked = 0; + } + if (so->so_snd.sb_flags & SB_NOTIFY) + sowwakeup(so); + tp->snd_una = ti->ti_ack; + if (SEQ_LT(tp->snd_nxt, tp->snd_una)) + tp->snd_nxt = tp->snd_una; + + switch (tp->t_state) { + + /* + * In FIN_WAIT_1 STATE in addition to the processing + * for the ESTABLISHED state if our FIN is now acknowledged + * then enter FIN_WAIT_2. + */ + case TCPS_FIN_WAIT_1: + if (ourfinisacked) { + /* + * If we can't receive any more + * data, then closing user can proceed. + * Starting the timer is contrary to the + * specification, but if we don't get a FIN + * we'll hang forever. + */ + if (so->so_state & SS_CANTRCVMORE) { + soisdisconnected(so); + tp->t_timer[TCPT_2MSL] = tcp_maxidle; + } + tp->t_state = TCPS_FIN_WAIT_2; + } + break; + + /* + * In CLOSING STATE in addition to the processing for + * the ESTABLISHED state if the ACK acknowledges our FIN + * then enter the TIME-WAIT state, otherwise ignore + * the segment.
+ */ + case TCPS_CLOSING: + if (ourfinisacked) { + tp->t_state = TCPS_TIME_WAIT; + tcp_canceltimers(tp); + tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + soisdisconnected(so); + } + break; + + /* + * In LAST_ACK, we may still be waiting for data to drain + * and/or to be acked, as well as for the ack of our FIN. + * If our FIN is now acknowledged, delete the TCB, + * enter the closed state and return. + */ + case TCPS_LAST_ACK: + if (ourfinisacked) { + tp = tcp_close(tp); + goto drop; + } + break; + + /* + * In TIME_WAIT state the only thing that should arrive + * is a retransmission of the remote FIN. Acknowledge + * it and restart the finack timer. + */ + case TCPS_TIME_WAIT: + tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + goto dropafterack; + } + } + +step6: + /* + * Update window information. + * Don't look at window if no ACK: TAC's send garbage on first SYN. + */ + if ((tiflags & TH_ACK) && + (SEQ_LT(tp->snd_wl1, ti->ti_seq) || tp->snd_wl1 == ti->ti_seq && + (SEQ_LT(tp->snd_wl2, ti->ti_ack) || + tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd))) { + /* keep track of pure window updates */ + if (ti->ti_len == 0 && + tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd) + tcpstat.tcps_rcvwinupd++; + tp->snd_wnd = tiwin; + tp->snd_wl1 = ti->ti_seq; + tp->snd_wl2 = ti->ti_ack; + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; + needoutput = 1; + } + + /* + * Process segments with URG. + */ + if ((tiflags & TH_URG) && ti->ti_urp && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + /* + * This is a kludge, but if we receive and accept + * random urgent pointers, we'll crash in + * soreceive. It's hard to imagine someone + * actually wanting to send this much urgent data. + */ + if (ti->ti_urp + so->so_rcv.sb_cc > sb_max) { + ti->ti_urp = 0; /* XXX */ + tiflags &= ~TH_URG; /* XXX */ + goto dodata; /* XXX */ + } + /* + * If this segment advances the known urgent pointer, + * then mark the data stream. 
This should not happen + * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since + * a FIN has been received from the remote side. + * In these states we ignore the URG. + * + * According to RFC961 (Assigned Protocols), + * the urgent pointer points to the last octet + * of urgent data. We continue, however, + * to consider it to indicate the first octet + * of data past the urgent section as the original + * spec states (in one of two places). + */ + if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) { + tp->rcv_up = ti->ti_seq + ti->ti_urp; + so->so_oobmark = so->so_rcv.sb_cc + + (tp->rcv_up - tp->rcv_nxt) - 1; + if (so->so_oobmark == 0) + so->so_state |= SS_RCVATMARK; + sohasoutofband(so); + tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); + } + /* + * Remove out of band data so doesn't get presented to user. + * This can happen independent of advancing the URG pointer, + * but if two URG's are pending at once, some out-of-band + * data may creep in... ick. + */ + if (ti->ti_urp <= ti->ti_len +#ifdef SO_OOBINLINE + && (so->so_options & SO_OOBINLINE) == 0 +#endif + ) + tcp_pulloutofband(so, ti, m); + } else + /* + * If no out of band data is expected, + * pull receive urgent pointer along + * with the receive window. + */ + if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) + tp->rcv_up = tp->rcv_nxt; +dodata: /* XXX */ + + /* + * Process the segment text, merging it into the TCP sequencing queue, + * and arranging for acknowledgment of receipt if necessary. + * This process logically involves adjusting tp->rcv_wnd as data + * is presented to the user (this happens in tcp_usrreq.c, + * case PRU_RCVD). If a FIN has already been received on this + * connection then we just ignore the text. + */ + if ((ti->ti_len || (tiflags&TH_FIN)) && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + TCP_REASS(tp, ti, m, so, tiflags); + /* + * Note the amount of data that peer has sent into + * our window, in order to estimate the sender's + * buffer size. 
+ */ + len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); + } else { + m_freem(m); + tiflags &= ~TH_FIN; + } + + /* + * If FIN is received ACK the FIN and let the user know + * that the connection is closing. + */ + if (tiflags & TH_FIN) { + if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { + socantrcvmore(so); + tp->t_flags |= TF_ACKNOW; + tp->rcv_nxt++; + } + switch (tp->t_state) { + + /* + * In SYN_RECEIVED and ESTABLISHED STATES + * enter the CLOSE_WAIT state. + */ + case TCPS_SYN_RECEIVED: + case TCPS_ESTABLISHED: + tp->t_state = TCPS_CLOSE_WAIT; + break; + + /* + * If still in FIN_WAIT_1 STATE FIN has not been acked so + * enter the CLOSING state. + */ + case TCPS_FIN_WAIT_1: + tp->t_state = TCPS_CLOSING; + break; + + /* + * In FIN_WAIT_2 state enter the TIME_WAIT state, + * starting the time-wait timer, turning off the other + * standard timers. + */ + case TCPS_FIN_WAIT_2: + tp->t_state = TCPS_TIME_WAIT; + tcp_canceltimers(tp); + tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + soisdisconnected(so); + break; + + /* + * In TIME_WAIT state restart the 2 MSL time_wait timer. + */ + case TCPS_TIME_WAIT: + tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + break; + } + } + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, &tcp_saveti, 0); + + /* + * Return any desired output. + */ + if (needoutput || (tp->t_flags & TF_ACKNOW)) + (void) tcp_output(tp); + return; + +dropafterack: + /* + * Generate an ACK dropping incoming segment if it occupies + * sequence space, where the ACK reflects our state. + */ + if (tiflags & TH_RST) + goto drop; + m_freem(m); + tp->t_flags |= TF_ACKNOW; + (void) tcp_output(tp); + return; + +dropwithreset: + /* + * Generate a RST, dropping incoming segment. + * Make ACK acceptable to originator of segment. + * Don't bother to respond if destination was broadcast/multicast. 
+ */ + if ((tiflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST) || + IN_MULTICAST(ti->ti_dst.s_addr)) + goto drop; + if (tiflags & TH_ACK) + tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST); + else { + if (tiflags & TH_SYN) + ti->ti_len++; + tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0, + TH_RST|TH_ACK); + } + /* destroy temporarily created socket */ + if (dropsocket) + (void) soabort(so); + return; + +drop: + /* + * Drop space held by incoming segment and return. + */ + if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); + m_freem(m); + /* destroy temporarily created socket */ + if (dropsocket) + (void) soabort(so); + return; +#ifndef TUBA_INCLUDE +} + +/* + * Parse the cnt bytes of TCP options that start at cp and belong to + * the segment whose header is ti. + * + * A maximum segment size option is handed to tcp_mss() and a window + * scale option is recorded in requested_s_scale (capped at + * TCP_MAX_WINSHIFT); both are honoured only on segments carrying SYN. + * A timestamp option sets *ts_present and stores its two values, + * converted with NTOHL, in *ts_val and *ts_ecr. + * + * Parsing stops at TCPOPT_EOL, or at the first option whose length + * byte is missing, is smaller than the two byte kind/length header, + * or exceeds the bytes remaining, so an option body is never read + * past cnt. Options of unknown kind or unexpected length are skipped. + */ +void +tcp_dooptions(tp, cp, cnt, ti, ts_present, ts_val, ts_ecr) + struct tcpcb *tp; + u_char *cp; + int cnt; + struct tcpiphdr *ti; + int *ts_present; + u_long *ts_val, *ts_ecr; +{ + u_short mss; + int opt, optlen; + + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + /* the length byte itself must be inside the options */ + if (cnt < 2) + break; + optlen = cp[1]; + /* reject lengths that are impossible or overrun cnt */ + if (optlen < 2 || optlen > cnt) + break; + } + switch (opt) { + + default: + continue; + + case TCPOPT_MAXSEG: + if (optlen != TCPOLEN_MAXSEG) + continue; + if (!(ti->ti_flags & TH_SYN)) + continue; + bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); + NTOHS(mss); + (void) tcp_mss(tp, mss); /* sets t_maxseg */ + break; + + case TCPOPT_WINDOW: + if (optlen != TCPOLEN_WINDOW) + continue; + if (!(ti->ti_flags & TH_SYN)) + continue; + tp->t_flags |= TF_RCVD_SCALE; + tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); + break; + + case TCPOPT_TIMESTAMP: + if (optlen != TCPOLEN_TIMESTAMP) + continue; + *ts_present = 1; + bcopy((char *)cp + 2, (char *) ts_val, sizeof(*ts_val)); + NTOHL(*ts_val); + bcopy((char *)cp + 6, (char *) ts_ecr, sizeof(*ts_ecr)); + NTOHL(*ts_ecr); + + /* + * A timestamp received in a SYN makes + * it ok to send timestamp requests and replies.
+ */ + if (ti->ti_flags & TH_SYN) { + tp->t_flags |= TF_RCVD_TSTMP; + tp->ts_recent = *ts_val; + tp->ts_recent_age = tcp_now; + } + break; + } + } +} + +/* + * Pull the out-of-band byte out of a segment so + * it doesn't appear in the user's data queue. + * It is still reflected in the segment length for + * sequencing purposes. + * + * The byte at offset ti_urp - 1 within the mbuf chain m is saved in + * the connection's t_iobc and TCPOOB_HAVEDATA is set; the mbuf that + * held it is shortened by one byte. Panics if the chain ends before + * that offset is reached. + */ +void +tcp_pulloutofband(so, ti, m) + struct socket *so; + struct tcpiphdr *ti; + register struct mbuf *m; +{ + int cnt = ti->ti_urp - 1; /* offset of the urgent byte */ + + while (cnt >= 0) { + if (m->m_len > cnt) { /* the byte lives in this mbuf */ + char *cp = mtod(m, caddr_t) + cnt; + struct tcpcb *tp = sototcpcb(so); + + tp->t_iobc = *cp; + tp->t_oobflags |= TCPOOB_HAVEDATA; + bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); /* slide the rest down over it */ + m->m_len--; + return; + } + cnt -= m->m_len; /* make the offset relative to the next mbuf */ + m = m->m_next; + if (m == 0) + break; + } + panic("tcp_pulloutofband"); +} + +/* + * Collect new round-trip time estimate + * and update averages and current timeout. + * The smoothing step below subtracts 1 from rtt before using it. + */ +void +tcp_xmit_timer(tp, rtt) + register struct tcpcb *tp; + short rtt; +{ + register short delta; + + tcpstat.tcps_rttupdated++; + if (tp->t_srtt != 0) { + /* + * srtt is stored as fixed point with 3 bits after the + * binary point (i.e., scaled by 8). The following magic + * is equivalent to the smoothing algorithm in rfc793 with + * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed + * point). Adjust rtt to origin 0. + */ + delta = rtt - 1 - (tp->t_srtt >> TCP_RTT_SHIFT); + if ((tp->t_srtt += delta) <= 0) + tp->t_srtt = 1; + /* + * We accumulate a smoothed rtt variance (actually, a + * smoothed mean difference), then set the retransmit + * timer to smoothed rtt + 4 times the smoothed variance. + * rttvar is stored as fixed point with 2 bits after the + * binary point (scaled by 4). The following is + * equivalent to rfc793 smoothing with an alpha of .75 + * (rttvar = rttvar*3/4 + |delta| / 4). This replaces + * rfc793's wired-in beta.
+ */ + if (delta < 0) + delta = -delta; + delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); + if ((tp->t_rttvar += delta) <= 0) + tp->t_rttvar = 1; + } else { + /* + * No rtt measurement yet - use the unsmoothed rtt. + * Set the variance to half the rtt (so our first + * retransmit happens at 3*rtt). + */ + tp->t_srtt = rtt << TCP_RTT_SHIFT; + tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); + } + tp->t_rtt = 0; /* measurement consumed; no segment is being timed */ + tp->t_rxtshift = 0; /* cancel any retransmit backoff */ + + /* + * The retransmit should happen at rtt + 4 * rttvar. + * Because of the way we do the smoothing, srtt and rttvar + * will each average +1/2 tick of bias. When we compute + * the retransmit timer, we want 1/2 tick of rounding and + * 1 extra tick because of +-1/2 tick uncertainty in the + * firing of the timer. The bias will give us exactly the + * 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below + * the minimum feasible timer (which is 2 ticks). + */ + TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), + tp->t_rttmin, TCPTV_REXMTMAX); + + /* + * We received an ack for a packet that wasn't retransmitted; + * it is probably safe to discard any error indications we've + * received recently. This isn't quite right, but close enough + * for now (a route might have failed after we sent a segment, + * and the return path might not be symmetrical). + */ + tp->t_softerror = 0; +} + +/* + * Determine a reasonable value for maxseg size. + * If the route is known, check route for mtu. + * If none, use an mss that can be handled on the outgoing + * interface without forcing IP to fragment; if bigger than + * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES + * to utilize large mbufs. If no route is found, route has no mtu, + * or the destination isn't local, use a default, hopefully conservative + * size (usually 512 or the default IP max size, but no more than the mtu + * of the interface), as we can't discover anything about intervening + * gateways or networks.
We also initialize the congestion/slow start + * window to be a single segment (the code below does this whether or + * not the destination is local). + * While looking at the routing entry, we also initialize other path-dependent + * parameters from pre-set or cached values in the routing entry. + * + * offer is the mss announced by the peer, or 0 when there is none + * (tcp_output passes 0 to obtain the value to put in its own option). + * Returns the computed mss, or tcp_mssdflt when no route can be found. + */ +int +tcp_mss(tp, offer) + register struct tcpcb *tp; + u_int offer; +{ + struct route *ro; + register struct rtentry *rt; + struct ifnet *ifp; + register int rtt, mss; + u_long bufsize; + struct inpcb *inp; + struct socket *so; + extern int tcp_mssdflt; + + inp = tp->t_inpcb; + ro = &inp->inp_route; + + if ((rt = ro->ro_rt) == (struct rtentry *)0) { + /* No route yet, so try to acquire one */ + if (inp->inp_faddr.s_addr != INADDR_ANY) { + ro->ro_dst.sa_family = AF_INET; + ro->ro_dst.sa_len = sizeof(ro->ro_dst); + ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = + inp->inp_faddr; + rtalloc(ro); + } + if ((rt = ro->ro_rt) == (struct rtentry *)0) + return (tcp_mssdflt); + } + ifp = rt->rt_ifp; + so = inp->inp_socket; + +#ifdef RTV_MTU /* if route characteristics exist ... */ + /* + * While we're here, check if there's an initial rtt + * or rttvar. Convert from the route-table units + * to scaled multiples of the slow timeout timer. + */ + if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) { + /* + * XXX the lock bit for RTT indicates that the value + * is also a minimum value; this is subject to time.
+ */ + if (rt->rt_rmx.rmx_locks & RTV_RTT) + tp->t_rttmin = rtt / (RTM_RTTUNIT / PR_SLOWHZ); + tp->t_srtt = rtt / (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE)); + if (rt->rt_rmx.rmx_rttvar) + tp->t_rttvar = rt->rt_rmx.rmx_rttvar / + (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE)); + else + /* default variation is +- 1 rtt */ + tp->t_rttvar = + tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; + TCPT_RANGESET(tp->t_rxtcur, + ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, + tp->t_rttmin, TCPTV_REXMTMAX); + } + /* + * If there's an mtu associated with the route, use it. + */ + if (rt->rt_rmx.rmx_mtu) + mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr); + else +#endif /* RTV_MTU */ + { + mss = ifp->if_mtu - sizeof(struct tcpiphdr); +#if (MCLBYTES & (MCLBYTES - 1)) == 0 + if (mss > MCLBYTES) + mss &= ~(MCLBYTES-1); +#else + if (mss > MCLBYTES) + mss = mss / MCLBYTES * MCLBYTES; +#endif + if (!in_localaddr(inp->inp_faddr)) + mss = min(mss, tcp_mssdflt); /* non-local peer: cap at the default */ + } + /* + * The current mss, t_maxseg, is initialized to the default value. + * If we compute a smaller value, reduce the current mss. + * If we compute a larger value, return it for use in sending + * a max seg size option, but don't store it for use + * unless we received an offer at least that large from peer. + * However, do not accept offers under 32 bytes. + */ + if (offer) + mss = min(mss, offer); + mss = max(mss, 32); /* sanity */ + if (mss < tp->t_maxseg || offer != 0) { + /* + * If there's a pipesize, change the socket buffer + * to that size. Make the socket buffers an integral + * number of mss units; if the mss is larger than + * the socket buffer, decrease the mss.
+ */ +#ifdef RTV_SPIPE + if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0) +#endif + bufsize = so->so_snd.sb_hiwat; + if (bufsize < mss) + mss = bufsize; + else { + bufsize = roundup(bufsize, mss); + if (bufsize > sb_max) + bufsize = sb_max; + (void)sbreserve(&so->so_snd, bufsize); + } + tp->t_maxseg = mss; + +#ifdef RTV_RPIPE + if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0) +#endif + bufsize = so->so_rcv.sb_hiwat; + if (bufsize > mss) { + bufsize = roundup(bufsize, mss); + if (bufsize > sb_max) + bufsize = sb_max; + (void)sbreserve(&so->so_rcv, bufsize); + } + } + tp->snd_cwnd = mss; /* congestion window starts at one segment */ + +#ifdef RTV_SSTHRESH + if (rt->rt_rmx.rmx_ssthresh) { + /* + * There's some sort of gateway or interface + * buffer limit on the path. Use this to set + * the slow start threshold, but set the + * threshold to no less than 2*mss. + */ + tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); + } +#endif /* RTV_SSTHRESH */ + return (mss); +} +#endif /* TUBA_INCLUDE */ diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c new file mode 100644 index 00000000000..667579fc0ed --- /dev/null +++ b/sys/netinet/tcp_output.c @@ -0,0 +1,599 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_output.c 8.3 (Berkeley) 12/30/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#define TCPOUTFLAGS +#include +#include +#include +#include +#include +#include + +#ifdef notyet +extern struct mbuf *m_copypack(); +#endif + + +#define MAX_TCPOPTLEN 32 /* max # bytes that go in options */ + +/* + * Tcp output routine: figure out what should be sent and send it. 
+ */ +int +tcp_output(tp) + register struct tcpcb *tp; +{ + register struct socket *so = tp->t_inpcb->inp_socket; + register long len, win; + int off, flags, error; + register struct mbuf *m; + register struct tcpiphdr *ti; + u_char opt[MAX_TCPOPTLEN]; + unsigned optlen, hdrlen; + int idle, sendalot; + + /* + * Determine length of data that should be transmitted, + * and flags that will be used. + * If there is some data or critical controls (SYN, RST) + * to send, then transmit; otherwise, investigate further. + */ + idle = (tp->snd_max == tp->snd_una); + if (idle && tp->t_idle >= tp->t_rxtcur) + /* + * We have been idle for "a while" and no acks are + * expected to clock out any data we send -- + * slow start to get ack "clock" running again. + */ + tp->snd_cwnd = tp->t_maxseg; +again: + sendalot = 0; + off = tp->snd_nxt - tp->snd_una; + win = min(tp->snd_wnd, tp->snd_cwnd); + + flags = tcp_outflags[tp->t_state]; + /* + * If in persist timeout with window of 0, send 1 byte. + * Otherwise, if window is small but nonzero + * and timer expired, we will send what we can + * and go to transmit state. + */ + if (tp->t_force) { + if (win == 0) { + /* + * If we still have some data to send, then + * clear the FIN bit. Usually this would + * happen below when it realizes that we + * aren't sending all the data. However, + * if we have exactly 1 byte of unset data, + * then it won't clear the FIN bit below, + * and if we are in persist state, we wind + * up sending the packet without recording + * that we sent the FIN bit. + * + * We can't just blindly clear the FIN bit, + * because if we don't have any more data + * to send then the probe will be the FIN + * itself. 
+ */ + if (off < so->so_snd.sb_cc) + flags &= ~TH_FIN; + win = 1; + } else { + tp->t_timer[TCPT_PERSIST] = 0; + tp->t_rxtshift = 0; + } + } + + len = min(so->so_snd.sb_cc, win) - off; + + if (len < 0) { + /* + * If FIN has been sent but not acked, + * but we haven't been called to retransmit, + * len will be -1. Otherwise, window shrank + * after we sent into it. If window shrank to 0, + * cancel pending retransmit and pull snd_nxt + * back to (closed) window. We will enter persist + * state below. If the window didn't close completely, + * just wait for an ACK. + */ + len = 0; + if (win == 0) { + tp->t_timer[TCPT_REXMT] = 0; + tp->snd_nxt = tp->snd_una; + } + } + if (len > tp->t_maxseg) { + len = tp->t_maxseg; + sendalot = 1; + } + if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) + flags &= ~TH_FIN; + + win = sbspace(&so->so_rcv); + + /* + * Sender silly window avoidance. If connection is idle + * and can send all data, a maximum segment, + * at least a maximum default-size segment do it, + * or are forced, do it; otherwise don't bother. + * If peer's buffer is tiny, then send + * when window is at least half open. + * If retransmitting (possibly after persist timer forced us + * to send into a small window), then must resend. + */ + if (len) { + if (len == tp->t_maxseg) + goto send; + if ((idle || tp->t_flags & TF_NODELAY) && + len + off >= so->so_snd.sb_cc) + goto send; + if (tp->t_force) + goto send; + if (len >= tp->max_sndwnd / 2) + goto send; + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) + goto send; + } + + /* + * Compare available window to amount of window + * known to peer (as advertised window less + * next expected input). If the difference is at least two + * max size segments, or at least 50% of the maximum possible + * window, then want to send a window update to peer. + */ + if (win > 0) { + /* + * "adv" is the amount we can increase the window, + * taking into account that we are limited by + * TCP_MAXWIN << tp->rcv_scale. 
+ */ + long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) - + (tp->rcv_adv - tp->rcv_nxt); + + if (adv >= (long) (2 * tp->t_maxseg)) + goto send; + if (2 * adv >= (long) so->so_rcv.sb_hiwat) + goto send; + } + + /* + * Send if we owe peer an ACK. + */ + if (tp->t_flags & TF_ACKNOW) + goto send; + if (flags & (TH_SYN|TH_RST)) + goto send; + if (SEQ_GT(tp->snd_up, tp->snd_una)) + goto send; + /* + * If our state indicates that FIN should be sent + * and we have not yet done so, or we're retransmitting the FIN, + * then we need to send. + */ + if (flags & TH_FIN && + ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) + goto send; + + /* + * TCP window updates are not reliable, rather a polling protocol + * using ``persist'' packets is used to insure receipt of window + * updates. The three ``states'' for the output side are: + * idle not doing retransmits or persists + * persisting to move a small or zero window + * (re)transmitting and thereby not persisting + * + * tp->t_timer[TCPT_PERSIST] + * is set when we are in persist state. + * tp->t_force + * is set when we are called to send a persist packet. + * tp->t_timer[TCPT_REXMT] + * is set when we are retransmitting + * The output side is idle when both timers are zero. + * + * If send window is too small, there is data to transmit, and no + * retransmit or persist is pending, then go to persist state. + * If nothing happens soon, send when timer expires: + * if window is nonzero, transmit what we can, + * otherwise force out a byte. + */ + if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && + tp->t_timer[TCPT_PERSIST] == 0) { + tp->t_rxtshift = 0; + tcp_setpersist(tp); + } + + /* + * No reason to send a segment, just return. + */ + return (0); + +send: + /* + * Before ESTABLISHED, force sending of initial options + * unless TCP set not to do any options. + * NOTE: we assume that the IP/TCP header plus TCP options + * always fit in a single mbuf, leaving room for a maximum + * link header, i.e. 
+ * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN + */ + optlen = 0; + hdrlen = sizeof (struct tcpiphdr); + if (flags & TH_SYN) { + tp->snd_nxt = tp->iss; + if ((tp->t_flags & TF_NOOPT) == 0) { + u_short mss; + + opt[0] = TCPOPT_MAXSEG; + opt[1] = 4; + mss = htons((u_short) tcp_mss(tp, 0)); + bcopy((caddr_t)&mss, (caddr_t)(opt + 2), sizeof(mss)); + optlen = 4; + + if ((tp->t_flags & TF_REQ_SCALE) && + ((flags & TH_ACK) == 0 || + (tp->t_flags & TF_RCVD_SCALE))) { + *((u_long *) (opt + optlen)) = htonl( + TCPOPT_NOP << 24 | + TCPOPT_WINDOW << 16 | + TCPOLEN_WINDOW << 8 | + tp->request_r_scale); + optlen += 4; + } + } + } + + /* + * Send a timestamp and echo-reply if this is a SYN and our side + * wants to use timestamps (TF_REQ_TSTMP is set) or both our side + * and our peer have sent timestamps in our SYN's. + */ + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (flags & TH_RST) == 0 && + ((flags & (TH_SYN|TH_ACK)) == TH_SYN || + (tp->t_flags & TF_RCVD_TSTMP))) { + u_long *lp = (u_long *)(opt + optlen); + + /* Form timestamp option as shown in appendix A of RFC 1323. */ + *lp++ = htonl(TCPOPT_TSTAMP_HDR); + *lp++ = htonl(tcp_now); + *lp = htonl(tp->ts_recent); + optlen += TCPOLEN_TSTAMP_APPA; + } + + hdrlen += optlen; + + /* + * Adjust data length if insertion of options will + * bump the packet length beyond the t_maxseg length. + */ + if (len > tp->t_maxseg - optlen) { + len = tp->t_maxseg - optlen; + sendalot = 1; + } + + +#ifdef DIAGNOSTIC + if (max_linkhdr + hdrlen > MHLEN) + panic("tcphdr too big"); +#endif + + /* + * Grab a header mbuf, attaching a copy of data to + * be transmitted, and initialize the header from + * the template for sends on this connection. 
+ */ + if (len) { + if (tp->t_force && len == 1) + tcpstat.tcps_sndprobe++; + else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { + tcpstat.tcps_sndrexmitpack++; + tcpstat.tcps_sndrexmitbyte += len; + } else { + tcpstat.tcps_sndpack++; + tcpstat.tcps_sndbyte += len; + } +#ifdef notyet + if ((m = m_copypack(so->so_snd.sb_mb, off, + (int)len, max_linkhdr + hdrlen)) == 0) { + error = ENOBUFS; + goto out; + } + /* + * m_copypack left space for our hdr; use it. + */ + m->m_len += hdrlen; + m->m_data -= hdrlen; +#else + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) { + error = ENOBUFS; + goto out; + } + m->m_data += max_linkhdr; + m->m_len = hdrlen; + if (len <= MHLEN - hdrlen - max_linkhdr) { + m_copydata(so->so_snd.sb_mb, off, (int) len, + mtod(m, caddr_t) + hdrlen); + m->m_len += len; + } else { + m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); + if (m->m_next == 0) + len = 0; + } +#endif + /* + * If we're sending everything we've got, set PUSH. + * (This will keep happy those implementations which only + * give data to the user when a buffer fills or + * a PUSH comes in.) + */ + if (off + len == so->so_snd.sb_cc) + flags |= TH_PUSH; + } else { + if (tp->t_flags & TF_ACKNOW) + tcpstat.tcps_sndacks++; + else if (flags & (TH_SYN|TH_FIN|TH_RST)) + tcpstat.tcps_sndctrl++; + else if (SEQ_GT(tp->snd_up, tp->snd_una)) + tcpstat.tcps_sndurg++; + else + tcpstat.tcps_sndwinup++; + + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) { + error = ENOBUFS; + goto out; + } + m->m_data += max_linkhdr; + m->m_len = hdrlen; + } + m->m_pkthdr.rcvif = (struct ifnet *)0; + ti = mtod(m, struct tcpiphdr *); + if (tp->t_template == 0) + panic("tcp_output"); + bcopy((caddr_t)tp->t_template, (caddr_t)ti, sizeof (struct tcpiphdr)); + + /* + * Fill in fields, remembering maximum advertised + * window for use in delaying messages about window sizes. + * If resending a FIN, be sure not to use a new sequence number. 
+ */ + if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && + tp->snd_nxt == tp->snd_max) + tp->snd_nxt--; + /* + * If we are doing retransmissions, then snd_nxt will + * not reflect the first unsent octet. For ACK only + * packets, we do not want the sequence number of the + * retransmitted packet, we want the sequence number + * of the next unsent octet. So, if there is no data + * (and no SYN or FIN), use snd_max instead of snd_nxt + * when filling in ti_seq. But if we are in persist + * state, snd_max might reflect one byte beyond the + * right edge of the window, so use snd_nxt in that + * case, since we know we aren't doing a retransmission. + * (retransmit and persist are mutually exclusive...) + */ + if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST]) + ti->ti_seq = htonl(tp->snd_nxt); + else + ti->ti_seq = htonl(tp->snd_max); + ti->ti_ack = htonl(tp->rcv_nxt); + if (optlen) { + bcopy((caddr_t)opt, (caddr_t)(ti + 1), optlen); + ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2; + } + ti->ti_flags = flags; + /* + * Calculate receive window. Don't shrink window, + * but avoid silly window syndrome. + */ + if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) + win = 0; + if (win > (long)TCP_MAXWIN << tp->rcv_scale) + win = (long)TCP_MAXWIN << tp->rcv_scale; + if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) + win = (long)(tp->rcv_adv - tp->rcv_nxt); + ti->ti_win = htons((u_short) (win>>tp->rcv_scale)); + if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { + ti->ti_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); + ti->ti_flags |= TH_URG; + } else + /* + * If no urgent pointer to send, then we pull + * the urgent pointer to the left edge of the send window + * so that it doesn't drift into the send window on sequence + * number wraparound. + */ + tp->snd_up = tp->snd_una; /* drag it along */ + + /* + * Put TCP length in extended header, and then + * checksum extended header and data. 
+ */ + if (len + optlen) + ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + + optlen + len)); + ti->ti_sum = in_cksum(m, (int)(hdrlen + len)); + + /* + * In transmit state, time the transmission and arrange for + * the retransmit. In persist state, just set snd_max. + */ + if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) { + tcp_seq startseq = tp->snd_nxt; + + /* + * Advance snd_nxt over sequence space of this segment. + */ + if (flags & (TH_SYN|TH_FIN)) { + if (flags & TH_SYN) + tp->snd_nxt++; + if (flags & TH_FIN) { + tp->snd_nxt++; + tp->t_flags |= TF_SENTFIN; + } + } + tp->snd_nxt += len; + if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { + tp->snd_max = tp->snd_nxt; + /* + * Time this transmission if not a retransmission and + * not currently timing anything. + */ + if (tp->t_rtt == 0) { + tp->t_rtt = 1; + tp->t_rtseq = startseq; + tcpstat.tcps_segstimed++; + } + } + + /* + * Set retransmit timer if not currently set, + * and not doing an ack or a keep-alive probe. + * Initial value for retransmit timer is smoothed + * round-trip time + 2 * round-trip time variance. + * Initialize shift counter which is used for backoff + * of retransmit time. + */ + if (tp->t_timer[TCPT_REXMT] == 0 && + tp->snd_nxt != tp->snd_una) { + tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + if (tp->t_timer[TCPT_PERSIST]) { + tp->t_timer[TCPT_PERSIST] = 0; + tp->t_rxtshift = 0; + } + } + } else + if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) + tp->snd_max = tp->snd_nxt + len; + + /* + * Trace. + */ + if (so->so_options & SO_DEBUG) + tcp_trace(TA_OUTPUT, tp->t_state, tp, ti, 0); + + /* + * Fill in IP length and desired time to live and + * send to IP level. There should be a better way + * to handle ttl and tos; we could keep them in + * the template, but need a way to checksum without them. 
+ */ + m->m_pkthdr.len = hdrlen + len; +#ifdef TUBA + if (tp->t_tuba_pcb) + error = tuba_output(m, tp); + else +#endif + { + ((struct ip *)ti)->ip_len = m->m_pkthdr.len; + ((struct ip *)ti)->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl; /* XXX */ + ((struct ip *)ti)->ip_tos = tp->t_inpcb->inp_ip.ip_tos; /* XXX */ +#if BSD >= 43 + error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, + so->so_options & SO_DONTROUTE, 0); +#else + error = ip_output(m, (struct mbuf *)0, &tp->t_inpcb->inp_route, + so->so_options & SO_DONTROUTE); +#endif + } + if (error) { +out: + if (error == ENOBUFS) { + tcp_quench(tp->t_inpcb, 0); + return (0); + } + if ((error == EHOSTUNREACH || error == ENETDOWN) + && TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_softerror = error; + return (0); + } + return (error); + } + tcpstat.tcps_sndtotal++; + + /* + * Data sent (as far as we can tell). + * If this advertises a larger window than any other segment, + * then remember the size of the advertised window. + * Any pending ACK has now been sent. + */ + if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) + tp->rcv_adv = tp->rcv_nxt + win; + tp->last_ack_sent = tp->rcv_nxt; + tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); + if (sendalot) + goto again; + return (0); +} + +void +tcp_setpersist(tp) + register struct tcpcb *tp; +{ + register t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; + + if (tp->t_timer[TCPT_REXMT]) + panic("tcp_output REXMT"); + /* + * Start/restart persistance timer. + */ + TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], + t * tcp_backoff[tp->t_rxtshift], + TCPTV_PERSMIN, TCPTV_PERSMAX); + if (tp->t_rxtshift < TCP_MAXRXTSHIFT) + tp->t_rxtshift++; +} diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c new file mode 100644 index 00000000000..2dd1d749c40 --- /dev/null +++ b/sys/netinet/tcp_reass.c @@ -0,0 +1,1647 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tcp_input.c 8.5 (Berkeley) 4/10/94 + */ + +#ifndef TUBA_INCLUDE +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/errno.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/in_pcb.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#include <netinet/tcp_debug.h> + +int tcprexmtthresh = 3; +struct tcpiphdr tcp_saveti; +struct inpcb *tcp_last_inpcb = &tcb; + +extern u_long sb_max; + +#endif /* TUBA_INCLUDE */ +/* PAWS idle limit: 24 days' worth of seconds multiplied by PR_SLOWHZ */ +#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ) + +/* for modulo comparisons of timestamps */ +#define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) +#define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) + + +/* + * Insert segment ti into reassembly queue of tcp with + * control block tp. Return TH_FIN if reassembly now includes + * a segment with FIN. The macro form does the common case inline + * (segment is the next to be received on an established connection, + * and the queue is empty), avoiding linkage into and removal + * from the queue and repetition of various conversions. + * Set DELACK for segments received in order, but ack immediately + * when segments are out of order (so fast retransmit can work). + * + * flags must be an lvalue; it receives the result. The tp, ti, m + * and so arguments are expanded more than once, so pass expressions + * without side effects. + */ +#define TCP_REASS(tp, ti, m, so, flags) { \ + if ((ti)->ti_seq == (tp)->rcv_nxt && \ + (tp)->seg_next == (struct tcpiphdr *)(tp) && \ + (tp)->t_state == TCPS_ESTABLISHED) { \ + (tp)->t_flags |= TF_DELACK; \ + (tp)->rcv_nxt += (ti)->ti_len; \ + (flags) = (ti)->ti_flags & TH_FIN; \ + tcpstat.tcps_rcvpack++;\ + tcpstat.tcps_rcvbyte += (ti)->ti_len;\ + sbappend(&(so)->so_rcv, (m)); \ + sorwakeup(so); \ + } else { \ + (flags) = tcp_reass((tp), (ti), (m)); \ + (tp)->t_flags |= TF_ACKNOW; \ + } \ +} +#ifndef TUBA_INCLUDE + +/* + * Queue segment ti (carried in mbuf chain m) on tp's reassembly list, + * which is kept in ti_seq order. Data already held by the preceding + * queued segment is trimmed from the front of the new one (a fully + * covered segment is freed and 0 is returned); queued segments that + * the new one overlaps are trimmed or dequeued and freed. Then, + * starting at the head of the list, every segment that begins exactly + * at rcv_nxt is unlinked and appended to the socket receive buffer + * (or freed if SS_CANTRCVMORE is set), advancing rcv_nxt. + * + * Returns the TH_FIN bit of the last segment presented, or 0 when + * nothing was presented. + */ +int +tcp_reass(tp, ti, m) + register struct tcpcb *tp; + register struct tcpiphdr *ti; + struct mbuf *m; +{ + register struct tcpiphdr *q; + struct socket *so = tp->t_inpcb->inp_socket; + int flags; + + /* + * Call with ti==0 after becoming established to + * force pre-ESTABLISHED data up to user socket.
+ */ + if (ti == 0) + goto present; + + /* + * Find a segment which begins after this one does. + * The loop leaves q at that segment, or at the list head + * (tp itself) when no queued segment starts later. + */ + for (q = tp->seg_next; q != (struct tcpiphdr *)tp; + q = (struct tcpiphdr *)q->ti_next) + if (SEQ_GT(q->ti_seq, ti->ti_seq)) + break; + + /* + * If there is a preceding segment, it may provide some of + * our data already. If so, drop the data from the incoming + * segment. If it provides all of our data, drop us. + */ + if ((struct tcpiphdr *)q->ti_prev != (struct tcpiphdr *)tp) { + register int i; + q = (struct tcpiphdr *)q->ti_prev; + /* conversion to int (in i) handles seq wraparound */ + i = q->ti_seq + q->ti_len - ti->ti_seq; + if (i > 0) { + if (i >= ti->ti_len) { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += ti->ti_len; + m_freem(m); + return (0); + } + m_adj(m, i); + ti->ti_len -= i; + ti->ti_seq += i; + } + q = (struct tcpiphdr *)(q->ti_next); + } + tcpstat.tcps_rcvoopack++; + tcpstat.tcps_rcvoobyte += ti->ti_len; + REASS_MBUF(ti) = m; /* XXX */ + + /* + * While we overlap succeeding segments trim them or, + * if they are completely covered, dequeue them. + */ + while (q != (struct tcpiphdr *)tp) { + register int i = (ti->ti_seq + ti->ti_len) - q->ti_seq; + if (i <= 0) + break; + if (i < q->ti_len) { + q->ti_seq += i; + q->ti_len -= i; + m_adj(REASS_MBUF(q), i); + break; + } + q = (struct tcpiphdr *)q->ti_next; + m = REASS_MBUF((struct tcpiphdr *)q->ti_prev); + remque(q->ti_prev); + m_freem(m); + } + + /* + * Stick new segment in its place. + */ + insque(ti, q->ti_prev); + +present: + /* + * Present data to user, advancing rcv_nxt through + * completed sequence space. + * Nothing is presented before a SYN has been received, when the + * first queued segment does not start at rcv_nxt, or when the + * connection is still in SYN_RECEIVED and that segment has data. + * The value returned is the TH_FIN bit of the last segment + * passed on (0 if none was passed on).
+ */ + if (TCPS_HAVERCVDSYN(tp->t_state) == 0) + return (0); + ti = tp->seg_next; + if (ti == (struct tcpiphdr *)tp || ti->ti_seq != tp->rcv_nxt) + return (0); + if (tp->t_state == TCPS_SYN_RECEIVED && ti->ti_len) + return (0); + do { + tp->rcv_nxt += ti->ti_len; + flags = ti->ti_flags & TH_FIN; + remque(ti); + m = REASS_MBUF(ti); + ti = (struct tcpiphdr *)ti->ti_next; + if (so->so_state & SS_CANTRCVMORE) + m_freem(m); + else + sbappend(&so->so_rcv, m); + } while (ti != (struct tcpiphdr *)tp && ti->ti_seq == tp->rcv_nxt); + sorwakeup(so); + return (flags); +} + +/* + * TCP input routine, follows pages 65-76 of the + * protocol specification dated September, 1981 very closely. + */ +void +tcp_input(m, iphlen) + register struct mbuf *m; + int iphlen; +{ + register struct tcpiphdr *ti; + register struct inpcb *inp; + caddr_t optp = NULL; + int optlen; + int len, tlen, off; + register struct tcpcb *tp = 0; + register int tiflags; + struct socket *so; + int todrop, acked, ourfinisacked, needoutput = 0; + short ostate; + struct in_addr laddr; + int dropsocket = 0; + int iss = 0; + u_long tiwin, ts_val, ts_ecr; + int ts_present = 0; + + tcpstat.tcps_rcvtotal++; + /* + * Get IP and TCP header together in first mbuf. + * Note: IP leaves IP header in first mbuf. + */ + ti = mtod(m, struct tcpiphdr *); + if (iphlen > sizeof (struct ip)) + ip_stripoptions(m, (struct mbuf *)0); + if (m->m_len < sizeof (struct tcpiphdr)) { + if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { + tcpstat.tcps_rcvshort++; + return; + } + ti = mtod(m, struct tcpiphdr *); + } + + /* + * Checksum extended TCP header and data. + */ + tlen = ((struct ip *)ti)->ip_len; + len = sizeof (struct ip) + tlen; + ti->ti_next = ti->ti_prev = 0; + ti->ti_x1 = 0; + ti->ti_len = (u_short)tlen; + HTONS(ti->ti_len); + if (ti->ti_sum = in_cksum(m, len)) { + tcpstat.tcps_rcvbadsum++; + goto drop; + } +#endif /* TUBA_INCLUDE */ + + /* + * Check that TCP offset makes sense, + * pull out TCP options and adjust length.
XXX + */ + off = ti->ti_off << 2; + if (off < sizeof (struct tcphdr) || off > tlen) { + tcpstat.tcps_rcvbadoff++; + goto drop; + } + tlen -= off; + ti->ti_len = tlen; + if (off > sizeof (struct tcphdr)) { + if (m->m_len < sizeof(struct ip) + off) { + if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { + tcpstat.tcps_rcvshort++; + return; + } + ti = mtod(m, struct tcpiphdr *); + } + optlen = off - sizeof (struct tcphdr); + optp = mtod(m, caddr_t) + sizeof (struct tcpiphdr); + /* + * Do quick retrieval of timestamp options ("options + * prediction?"). If timestamp is the only option and it's + * formatted as recommended in RFC 1323 appendix A, we + * quickly get the values now and not bother calling + * tcp_dooptions(), etc. + */ + if ((optlen == TCPOLEN_TSTAMP_APPA || + (optlen > TCPOLEN_TSTAMP_APPA && + optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && + *(u_long *)optp == htonl(TCPOPT_TSTAMP_HDR) && + (ti->ti_flags & TH_SYN) == 0) { + ts_present = 1; + ts_val = ntohl(*(u_long *)(optp + 4)); + ts_ecr = ntohl(*(u_long *)(optp + 8)); + optp = NULL; /* we've parsed the options */ + } + } + tiflags = ti->ti_flags; + + /* + * Convert TCP protocol specific fields to host format. + */ + NTOHL(ti->ti_seq); + NTOHL(ti->ti_ack); + NTOHS(ti->ti_win); + NTOHS(ti->ti_urp); + + /* + * Locate pcb for segment. + */ +findpcb: + inp = tcp_last_inpcb; + if (inp->inp_lport != ti->ti_dport || + inp->inp_fport != ti->ti_sport || + inp->inp_faddr.s_addr != ti->ti_src.s_addr || + inp->inp_laddr.s_addr != ti->ti_dst.s_addr) { + inp = in_pcblookup(&tcb, ti->ti_src, ti->ti_sport, + ti->ti_dst, ti->ti_dport, INPLOOKUP_WILDCARD); + if (inp) + tcp_last_inpcb = inp; + ++tcpstat.tcps_pcbcachemiss; + } + + /* + * If the state is CLOSED (i.e., TCB does not exist) then + * all data in the incoming segment is discarded. + * If the TCB exists but is in CLOSED state, it is embryonic, + * but should either do a listen or a connect soon. 
+ */ + if (inp == 0) + goto dropwithreset; + tp = intotcpcb(inp); + if (tp == 0) + goto dropwithreset; + if (tp->t_state == TCPS_CLOSED) + goto drop; + + /* Unscale the window into a 32-bit value. */ + if ((tiflags & TH_SYN) == 0) + tiwin = ti->ti_win << tp->snd_scale; + else + tiwin = ti->ti_win; + + so = inp->inp_socket; + if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { + if (so->so_options & SO_DEBUG) { + ostate = tp->t_state; + tcp_saveti = *ti; + } + if (so->so_options & SO_ACCEPTCONN) { + so = sonewconn(so, 0); + if (so == 0) + goto drop; + /* + * This is ugly, but .... + * + * Mark socket as temporary until we're + * committed to keeping it. The code at + * ``drop'' and ``dropwithreset'' check the + * flag dropsocket to see if the temporary + * socket created here should be discarded. + * We mark the socket as discardable until + * we're committed to it below in TCPS_LISTEN. + */ + dropsocket++; + inp = (struct inpcb *)so->so_pcb; + inp->inp_laddr = ti->ti_dst; + inp->inp_lport = ti->ti_dport; +#if BSD>=43 + inp->inp_options = ip_srcroute(); +#endif + tp = intotcpcb(inp); + tp->t_state = TCPS_LISTEN; + + /* Compute proper scaling value from buffer space + */ + while (tp->request_r_scale < TCP_MAX_WINSHIFT && + TCP_MAXWIN << tp->request_r_scale < so->so_rcv.sb_hiwat) + tp->request_r_scale++; + } + } + + /* + * Segment received on connection. + * Reset idle time and keep-alive timer. + */ + tp->t_idle = 0; + tp->t_timer[TCPT_KEEP] = tcp_keepidle; + + /* + * Process options if not in LISTEN state, + * else do it below (after getting remote address). + */ + if (optp && tp->t_state != TCPS_LISTEN) + tcp_dooptions(tp, optp, optlen, ti, + &ts_present, &ts_val, &ts_ecr); + + /* + * Header prediction: check for the two common cases + * of a uni-directional data xfer. If the packet has + * no control flags, is in-sequence, the window didn't + * change and we're not retransmitting, it's a + * candidate. 
If the length is zero and the ack moved + * forward, we're the sender side of the xfer. Just + * free the data acked & wake any higher level process + * that was blocked waiting for space. If the length + * is non-zero and the ack didn't move, we're the + * receiver side. If we're getting packets in-order + * (the reassembly queue is empty), add the data to + * the socket buffer and note that we need a delayed ack. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && + (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) && + ti->ti_seq == tp->rcv_nxt && + tiwin && tiwin == tp->snd_wnd && + tp->snd_nxt == tp->snd_max) { + + /* + * If last ACK falls within this segment's sequence numbers, + * record the timestamp. + */ + if (ts_present && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) && + SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len)) { + tp->ts_recent_age = tcp_now; + tp->ts_recent = ts_val; + } + + if (ti->ti_len == 0) { + if (SEQ_GT(ti->ti_ack, tp->snd_una) && + SEQ_LEQ(ti->ti_ack, tp->snd_max) && + tp->snd_cwnd >= tp->snd_wnd) { + /* + * this is a pure ack for outstanding data. + */ + ++tcpstat.tcps_predack; + if (ts_present) + tcp_xmit_timer(tp, tcp_now-ts_ecr+1); + else if (tp->t_rtt && + SEQ_GT(ti->ti_ack, tp->t_rtseq)) + tcp_xmit_timer(tp, tp->t_rtt); + acked = ti->ti_ack - tp->snd_una; + tcpstat.tcps_rcvackpack++; + tcpstat.tcps_rcvackbyte += acked; + sbdrop(&so->so_snd, acked); + tp->snd_una = ti->ti_ack; + m_freem(m); + + /* + * If all outstanding data are acked, stop + * retransmit timer, otherwise restart timer + * using current (possibly backed-off) value. + * If process is waiting for space, + * wakeup/selwakeup/signal. If data + * are ready to send, let tcp_output + * decide between more output or persist. 
+ */ + if (tp->snd_una == tp->snd_max) + tp->t_timer[TCPT_REXMT] = 0; + else if (tp->t_timer[TCPT_PERSIST] == 0) + tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + + if (so->so_snd.sb_flags & SB_NOTIFY) + sowwakeup(so); + if (so->so_snd.sb_cc) + (void) tcp_output(tp); + return; + } + } else if (ti->ti_ack == tp->snd_una && + tp->seg_next == (struct tcpiphdr *)tp && + ti->ti_len <= sbspace(&so->so_rcv)) { + /* + * this is a pure, in-sequence data packet + * with nothing on the reassembly queue and + * we have enough buffer space to take it. + */ + ++tcpstat.tcps_preddat; + tp->rcv_nxt += ti->ti_len; + tcpstat.tcps_rcvpack++; + tcpstat.tcps_rcvbyte += ti->ti_len; + /* + * Drop TCP, IP headers and TCP options then add data + * to socket buffer. + */ + m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); + m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); + sbappend(&so->so_rcv, m); + sorwakeup(so); + tp->t_flags |= TF_DELACK; + return; + } + } + + /* + * Drop TCP, IP headers and TCP options. + */ + m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); + m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); + + /* + * Calculate amount of space in receive window, + * and then do TCP input processing. + * Receive window is amount of space in rcv queue, + * but not less than advertised window. + */ + { int win; + + win = sbspace(&so->so_rcv); + if (win < 0) + win = 0; + tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt)); + } + + switch (tp->t_state) { + + /* + * If the state is LISTEN then ignore segment if it contains an RST. + * If the segment contains an ACK then it is bad and send a RST. + * If it does not contain a SYN then it is not interesting; drop it. + * Don't bother responding if the destination was a broadcast. + * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial + * tp->iss, and send a segment: + * + * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss. 
+ * Fill in remote peer address fields if not previously specified. + * Enter SYN_RECEIVED state, and process any other fields of this + * segment in this state. + */ + case TCPS_LISTEN: { + struct mbuf *am; + register struct sockaddr_in *sin; + + if (tiflags & TH_RST) + goto drop; + if (tiflags & TH_ACK) + goto dropwithreset; + if ((tiflags & TH_SYN) == 0) + goto drop; + /* + * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN + * in_broadcast() should never return true on a received + * packet with M_BCAST not set. + */ + if (m->m_flags & (M_BCAST|M_MCAST) || + IN_MULTICAST(ti->ti_dst.s_addr)) + goto drop; + am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */ + if (am == NULL) + goto drop; + am->m_len = sizeof (struct sockaddr_in); + sin = mtod(am, struct sockaddr_in *); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = ti->ti_src; + sin->sin_port = ti->ti_sport; + bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); + laddr = inp->inp_laddr; + if (inp->inp_laddr.s_addr == INADDR_ANY) + inp->inp_laddr = ti->ti_dst; + if (in_pcbconnect(inp, am)) { + inp->inp_laddr = laddr; + (void) m_free(am); + goto drop; + } + (void) m_free(am); + tp->t_template = tcp_template(tp); + if (tp->t_template == 0) { + tp = tcp_drop(tp, ENOBUFS); + dropsocket = 0; /* socket is already gone */ + goto drop; + } + if (optp) + tcp_dooptions(tp, optp, optlen, ti, + &ts_present, &ts_val, &ts_ecr); + if (iss) + tp->iss = iss; + else + tp->iss = tcp_iss; + tcp_iss += TCP_ISSINCR/2; + tp->irs = ti->ti_seq; + tcp_sendseqinit(tp); + tcp_rcvseqinit(tp); + tp->t_flags |= TF_ACKNOW; + tp->t_state = TCPS_SYN_RECEIVED; + tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT; + dropsocket = 0; /* committed to socket */ + tcpstat.tcps_accepts++; + goto trimthenstep6; + } + + /* + * If the state is SYN_SENT: + * if seg contains an ACK, but not for our SYN, drop the input. + * if seg contains a RST, then drop the connection. + * if seg does not contain SYN, then drop it. 
+ * Otherwise this is an acceptable SYN segment + * initialize tp->rcv_nxt and tp->irs + * if seg contains ack then advance tp->snd_una + * if SYN has been acked change to ESTABLISHED else SYN_RCVD state + * arrange for segment to be acked (eventually) + * continue processing rest of data/controls, beginning with URG + */ + case TCPS_SYN_SENT: + if ((tiflags & TH_ACK) && + (SEQ_LEQ(ti->ti_ack, tp->iss) || + SEQ_GT(ti->ti_ack, tp->snd_max))) + goto dropwithreset; + if (tiflags & TH_RST) { + if (tiflags & TH_ACK) + tp = tcp_drop(tp, ECONNREFUSED); + goto drop; + } + if ((tiflags & TH_SYN) == 0) + goto drop; + if (tiflags & TH_ACK) { + tp->snd_una = ti->ti_ack; + if (SEQ_LT(tp->snd_nxt, tp->snd_una)) + tp->snd_nxt = tp->snd_una; + } + tp->t_timer[TCPT_REXMT] = 0; + tp->irs = ti->ti_seq; + tcp_rcvseqinit(tp); + tp->t_flags |= TF_ACKNOW; + if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) { + tcpstat.tcps_connects++; + soisconnected(so); + tp->t_state = TCPS_ESTABLISHED; + /* Do window scaling on this connection? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + (void) tcp_reass(tp, (struct tcpiphdr *)0, + (struct mbuf *)0); + /* + * if we didn't have to retransmit the SYN, + * use its rtt as our initial srtt & rtt var. + */ + if (tp->t_rtt) + tcp_xmit_timer(tp, tp->t_rtt); + } else + tp->t_state = TCPS_SYN_RECEIVED; + +trimthenstep6: + /* + * Advance ti->ti_seq to correspond to first data byte. + * If data, trim to stay within window, + * dropping FIN if necessary. + */ + ti->ti_seq++; + if (ti->ti_len > tp->rcv_wnd) { + todrop = ti->ti_len - tp->rcv_wnd; + m_adj(m, -todrop); + ti->ti_len = tp->rcv_wnd; + tiflags &= ~TH_FIN; + tcpstat.tcps_rcvpackafterwin++; + tcpstat.tcps_rcvbyteafterwin += todrop; + } + tp->snd_wl1 = ti->ti_seq - 1; + tp->rcv_up = ti->ti_seq; + goto step6; + } + + /* + * States other than LISTEN or SYN_SENT. 
+ * First check timestamp, if present. + * Then check that at least some bytes of segment are within + * receive window. If segment begins before rcv_nxt, + * drop leading data (and SYN); if nothing left, just ack. + * + * RFC 1323 PAWS: If we have a timestamp reply on this segment + * and it's less than ts_recent, drop it. + */ + if (ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent && + TSTMP_LT(ts_val, tp->ts_recent)) { + + /* Check to see if ts_recent is over 24 days old. */ + if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { + /* + * Invalidate ts_recent. If this segment updates + * ts_recent, the age will be reset later and ts_recent + * will get a valid value. If it does not, setting + * ts_recent to zero will at least satisfy the + * requirement that zero be placed in the timestamp + * echo reply when ts_recent isn't valid. The + * age isn't reset until we get a valid ts_recent + * because we don't want out-of-order segments to be + * dropped when ts_recent is old. + */ + tp->ts_recent = 0; + } else { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += ti->ti_len; + tcpstat.tcps_pawsdrop++; + goto dropafterack; + } + } + + todrop = tp->rcv_nxt - ti->ti_seq; + if (todrop > 0) { + if (tiflags & TH_SYN) { + tiflags &= ~TH_SYN; + ti->ti_seq++; + if (ti->ti_urp > 1) + ti->ti_urp--; + else + tiflags &= ~TH_URG; + todrop--; + } + if (todrop >= ti->ti_len) { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += ti->ti_len; + /* + * If segment is just one to the left of the window, + * check two special cases: + * 1. Don't toss RST in response to 4.2-style keepalive. + * 2. If the only thing to drop is a FIN, we can drop + * it, but check the ACK or we will get into FIN + * wars if our FINs crossed (both CLOSING). + * In either case, send ACK to resynchronize, + * but keep on processing for RST or ACK. 
+ */ + if ((tiflags & TH_FIN && todrop == ti->ti_len + 1) +#ifdef TCP_COMPAT_42 + || (tiflags & TH_RST && ti->ti_seq == tp->rcv_nxt - 1) +#endif + ) { + todrop = ti->ti_len; + tiflags &= ~TH_FIN; + tp->t_flags |= TF_ACKNOW; + } else { + /* + * Handle the case when a bound socket connects + * to itself. Allow packets with a SYN and + * an ACK to continue with the processing. + */ + if (todrop != 0 || (tiflags & TH_ACK) == 0) + goto dropafterack; + } + } else { + tcpstat.tcps_rcvpartduppack++; + tcpstat.tcps_rcvpartdupbyte += todrop; + } + m_adj(m, todrop); + ti->ti_seq += todrop; + ti->ti_len -= todrop; + if (ti->ti_urp > todrop) + ti->ti_urp -= todrop; + else { + tiflags &= ~TH_URG; + ti->ti_urp = 0; + } + } + + /* + * If new data are received on a connection after the + * user processes are gone, then RST the other end. + */ + if ((so->so_state & SS_NOFDREF) && + tp->t_state > TCPS_CLOSE_WAIT && ti->ti_len) { + tp = tcp_close(tp); + tcpstat.tcps_rcvafterclose++; + goto dropwithreset; + } + + /* + * If segment ends after window, drop trailing data + * (and PUSH and FIN); if nothing left, just ACK. + */ + todrop = (ti->ti_seq+ti->ti_len) - (tp->rcv_nxt+tp->rcv_wnd); + if (todrop > 0) { + tcpstat.tcps_rcvpackafterwin++; + if (todrop >= ti->ti_len) { + tcpstat.tcps_rcvbyteafterwin += ti->ti_len; + /* + * If a new connection request is received + * while in TIME_WAIT, drop the old connection + * and start over if the sequence numbers + * are above the previous ones. + */ + if (tiflags & TH_SYN && + tp->t_state == TCPS_TIME_WAIT && + SEQ_GT(ti->ti_seq, tp->rcv_nxt)) { + iss = tp->rcv_nxt + TCP_ISSINCR; + tp = tcp_close(tp); + goto findpcb; + } + /* + * If window is closed can only take segments at + * window edge, and have to drop data and PUSH from + * incoming segments. Continue processing, but + * remember to ack. Otherwise, drop segment + * and ack. 
+ */ + if (tp->rcv_wnd == 0 && ti->ti_seq == tp->rcv_nxt) { + tp->t_flags |= TF_ACKNOW; + tcpstat.tcps_rcvwinprobe++; + } else + goto dropafterack; + } else + tcpstat.tcps_rcvbyteafterwin += todrop; + m_adj(m, -todrop); + ti->ti_len -= todrop; + tiflags &= ~(TH_PUSH|TH_FIN); + } + + /* + * If last ACK falls within this segment's sequence numbers, + * record its timestamp. + */ + if (ts_present && SEQ_LEQ(ti->ti_seq, tp->last_ack_sent) && + SEQ_LT(tp->last_ack_sent, ti->ti_seq + ti->ti_len + + ((tiflags & (TH_SYN|TH_FIN)) != 0))) { + tp->ts_recent_age = tcp_now; + tp->ts_recent = ts_val; + } + + /* + * If the RST bit is set examine the state: + * SYN_RECEIVED STATE: + * If passive open, return to LISTEN state. + * If active open, inform user that connection was refused. + * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: + * Inform user that connection was reset, and close tcb. + * CLOSING, LAST_ACK, TIME_WAIT STATES + * Close the tcb. + */ + if (tiflags&TH_RST) switch (tp->t_state) { + + case TCPS_SYN_RECEIVED: + so->so_error = ECONNREFUSED; + goto close; + + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + so->so_error = ECONNRESET; + close: + tp->t_state = TCPS_CLOSED; + tcpstat.tcps_drops++; + tp = tcp_close(tp); + goto drop; + + case TCPS_CLOSING: + case TCPS_LAST_ACK: + case TCPS_TIME_WAIT: + tp = tcp_close(tp); + goto drop; + } + + /* + * If a SYN is in the window, then this is an + * error and we send an RST and drop the connection. + */ + if (tiflags & TH_SYN) { + tp = tcp_drop(tp, ECONNRESET); + goto dropwithreset; + } + + /* + * If the ACK bit is off we drop the segment and return. + */ + if ((tiflags & TH_ACK) == 0) + goto drop; + + /* + * Ack processing. + */ + switch (tp->t_state) { + + /* + * In SYN_RECEIVED state if the ack ACKs our SYN then enter + * ESTABLISHED state and continue processing, otherwise + * send an RST. 
+ */ + case TCPS_SYN_RECEIVED: + if (SEQ_GT(tp->snd_una, ti->ti_ack) || + SEQ_GT(ti->ti_ack, tp->snd_max)) + goto dropwithreset; + tcpstat.tcps_connects++; + soisconnected(so); + tp->t_state = TCPS_ESTABLISHED; + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + (void) tcp_reass(tp, (struct tcpiphdr *)0, (struct mbuf *)0); + tp->snd_wl1 = ti->ti_seq - 1; + /* fall into ... */ + + /* + * In ESTABLISHED state: drop duplicate ACKs; ACK out of range + * ACKs. If the ack is in the range + * tp->snd_una < ti->ti_ack <= tp->snd_max + * then advance tp->snd_una to ti->ti_ack and drop + * data from the retransmission queue. If this ACK reflects + * more up to date window information we update our window information. + */ + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + case TCPS_CLOSING: + case TCPS_LAST_ACK: + case TCPS_TIME_WAIT: + + if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) { + if (ti->ti_len == 0 && tiwin == tp->snd_wnd) { + tcpstat.tcps_rcvdupack++; + /* + * If we have outstanding data (other than + * a window probe), this is a completely + * duplicate ack (ie, window info didn't + * change), the ack is the biggest we've + * seen and we've seen exactly our rexmt + * threshhold of them, assume a packet + * has been dropped and retransmit it. + * Kludge snd_nxt & the congestion + * window so we send only this one + * packet. + * + * We know we're losing at the current + * window size so do congestion avoidance + * (set ssthresh to half the current window + * and pull our congestion window back to + * the new ssthresh). + * + * Dup acks mean that packets have left the + * network (they're now cached at the receiver) + * so bump cwnd by the amount in the receiver + * to keep a constant cwnd packets in the + * network. 
+ */ + if (tp->t_timer[TCPT_REXMT] == 0 || + ti->ti_ack != tp->snd_una) + tp->t_dupacks = 0; + else if (++tp->t_dupacks == tcprexmtthresh) { + tcp_seq onxt = tp->snd_nxt; + u_int win = + min(tp->snd_wnd, tp->snd_cwnd) / 2 / + tp->t_maxseg; + + if (win < 2) + win = 2; + tp->snd_ssthresh = win * tp->t_maxseg; + tp->t_timer[TCPT_REXMT] = 0; + tp->t_rtt = 0; + tp->snd_nxt = ti->ti_ack; + tp->snd_cwnd = tp->t_maxseg; + (void) tcp_output(tp); + tp->snd_cwnd = tp->snd_ssthresh + + tp->t_maxseg * tp->t_dupacks; + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + goto drop; + } else if (tp->t_dupacks > tcprexmtthresh) { + tp->snd_cwnd += tp->t_maxseg; + (void) tcp_output(tp); + goto drop; + } + } else + tp->t_dupacks = 0; + break; + } + /* + * If the congestion window was inflated to account + * for the other side's cached packets, retract it. + */ + if (tp->t_dupacks > tcprexmtthresh && + tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + tp->t_dupacks = 0; + if (SEQ_GT(ti->ti_ack, tp->snd_max)) { + tcpstat.tcps_rcvacktoomuch++; + goto dropafterack; + } + acked = ti->ti_ack - tp->snd_una; + tcpstat.tcps_rcvackpack++; + tcpstat.tcps_rcvackbyte += acked; + + /* + * If we have a timestamp reply, update smoothed + * round trip time. If no timestamp is present but + * transmit timer is running and timed sequence + * number was acked, update smoothed round trip time. + * Since we now have an rtt measurement, cancel the + * timer backoff (cf., Phil Karn's retransmit alg.). + * Recompute the initial retransmit timer. + */ + if (ts_present) + tcp_xmit_timer(tp, tcp_now-ts_ecr+1); + else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq)) + tcp_xmit_timer(tp,tp->t_rtt); + + /* + * If all outstanding data is acked, stop retransmit + * timer and remember to restart (more output or persist). + * If there is more data to be acked, restart retransmit + * timer, using current (possibly backed-off) value. 
+ */ + if (ti->ti_ack == tp->snd_max) { + tp->t_timer[TCPT_REXMT] = 0; + needoutput = 1; + } else if (tp->t_timer[TCPT_PERSIST] == 0) + tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + /* + * When new data is acked, open the congestion window. + * If the window gives us less than ssthresh packets + * in flight, open exponentially (maxseg per packet). + * Otherwise open linearly: maxseg per window + * (maxseg^2 / cwnd per packet), plus a constant + * fraction of a packet (maxseg/8) to help larger windows + * open quickly enough. + * The result is capped at TCP_MAXWIN shifted left by + * this connection's snd_scale. + */ + { + register u_int cw = tp->snd_cwnd; + register u_int incr = tp->t_maxseg; + + if (cw > tp->snd_ssthresh) + incr = incr * incr / cw + incr / 8; + tp->snd_cwnd = min(cw + incr, TCP_MAXWIN<<tp->snd_scale); + } + if (acked > so->so_snd.sb_cc) { + tp->snd_wnd -= so->so_snd.sb_cc; + sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); + ourfinisacked = 1; + } else { + sbdrop(&so->so_snd, acked); + tp->snd_wnd -= acked; + ourfinisacked = 0; + } + if (so->so_snd.sb_flags & SB_NOTIFY) + sowwakeup(so); + tp->snd_una = ti->ti_ack; + if (SEQ_LT(tp->snd_nxt, tp->snd_una)) + tp->snd_nxt = tp->snd_una; + + switch (tp->t_state) { + + /* + * In FIN_WAIT_1 STATE in addition to the processing + * for the ESTABLISHED state if our FIN is now acknowledged + * then enter FIN_WAIT_2. + */ + case TCPS_FIN_WAIT_1: + if (ourfinisacked) { + /* + * If we can't receive any more + * data, then closing user can proceed. + * Starting the timer is contrary to the + * specification, but if we don't get a FIN + * we'll hang forever. + */ + if (so->so_state & SS_CANTRCVMORE) { + soisdisconnected(so); + tp->t_timer[TCPT_2MSL] = tcp_maxidle; + } + tp->t_state = TCPS_FIN_WAIT_2; + } + break; + + /* + * In CLOSING STATE in addition to the processing for + * the ESTABLISHED state if the ACK acknowledges our FIN + * then enter the TIME-WAIT state, otherwise ignore + * the segment.
+ */ + case TCPS_CLOSING: + if (ourfinisacked) { + tp->t_state = TCPS_TIME_WAIT; + tcp_canceltimers(tp); + tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + soisdisconnected(so); + } + break; + + /* + * In LAST_ACK, we may still be waiting for data to drain + * and/or to be acked, as well as for the ack of our FIN. + * If our FIN is now acknowledged, delete the TCB, + * enter the closed state and return. + */ + case TCPS_LAST_ACK: + if (ourfinisacked) { + tp = tcp_close(tp); + goto drop; + } + break; + + /* + * In TIME_WAIT state the only thing that should arrive + * is a retransmission of the remote FIN. Acknowledge + * it and restart the finack timer. + */ + case TCPS_TIME_WAIT: + tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + goto dropafterack; + } + } + +step6: + /* + * Update window information. + * Don't look at window if no ACK: TAC's send garbage on first SYN. + */ + if ((tiflags & TH_ACK) && + (SEQ_LT(tp->snd_wl1, ti->ti_seq) || tp->snd_wl1 == ti->ti_seq && + (SEQ_LT(tp->snd_wl2, ti->ti_ack) || + tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd))) { + /* keep track of pure window updates */ + if (ti->ti_len == 0 && + tp->snd_wl2 == ti->ti_ack && tiwin > tp->snd_wnd) + tcpstat.tcps_rcvwinupd++; + tp->snd_wnd = tiwin; + tp->snd_wl1 = ti->ti_seq; + tp->snd_wl2 = ti->ti_ack; + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; + needoutput = 1; + } + + /* + * Process segments with URG. + */ + if ((tiflags & TH_URG) && ti->ti_urp && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + /* + * This is a kludge, but if we receive and accept + * random urgent pointers, we'll crash in + * soreceive. It's hard to imagine someone + * actually wanting to send this much urgent data. + */ + if (ti->ti_urp + so->so_rcv.sb_cc > sb_max) { + ti->ti_urp = 0; /* XXX */ + tiflags &= ~TH_URG; /* XXX */ + goto dodata; /* XXX */ + } + /* + * If this segment advances the known urgent pointer, + * then mark the data stream. 
This should not happen + * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since + * a FIN has been received from the remote side. + * In these states we ignore the URG. + * + * According to RFC961 (Assigned Protocols), + * the urgent pointer points to the last octet + * of urgent data. We continue, however, + * to consider it to indicate the first octet + * of data past the urgent section as the original + * spec states (in one of two places). + */ + if (SEQ_GT(ti->ti_seq+ti->ti_urp, tp->rcv_up)) { + tp->rcv_up = ti->ti_seq + ti->ti_urp; + so->so_oobmark = so->so_rcv.sb_cc + + (tp->rcv_up - tp->rcv_nxt) - 1; + if (so->so_oobmark == 0) + so->so_state |= SS_RCVATMARK; + sohasoutofband(so); + tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); + } + /* + * Remove out of band data so doesn't get presented to user. + * This can happen independent of advancing the URG pointer, + * but if two URG's are pending at once, some out-of-band + * data may creep in... ick. + */ + if (ti->ti_urp <= ti->ti_len +#ifdef SO_OOBINLINE + && (so->so_options & SO_OOBINLINE) == 0 +#endif + ) + tcp_pulloutofband(so, ti, m); + } else + /* + * If no out of band data is expected, + * pull receive urgent pointer along + * with the receive window. + */ + if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) + tp->rcv_up = tp->rcv_nxt; +dodata: /* XXX */ + + /* + * Process the segment text, merging it into the TCP sequencing queue, + * and arranging for acknowledgment of receipt if necessary. + * This process logically involves adjusting tp->rcv_wnd as data + * is presented to the user (this happens in tcp_usrreq.c, + * case PRU_RCVD). If a FIN has already been received on this + * connection then we just ignore the text. + */ + if ((ti->ti_len || (tiflags&TH_FIN)) && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + TCP_REASS(tp, ti, m, so, tiflags); + /* + * Note the amount of data that peer has sent into + * our window, in order to estimate the sender's + * buffer size. 
+ */ + len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); + } else { + m_freem(m); + tiflags &= ~TH_FIN; + } + + /* + * If FIN is received ACK the FIN and let the user know + * that the connection is closing. + */ + if (tiflags & TH_FIN) { + if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { + socantrcvmore(so); + tp->t_flags |= TF_ACKNOW; + tp->rcv_nxt++; + } + switch (tp->t_state) { + + /* + * In SYN_RECEIVED and ESTABLISHED STATES + * enter the CLOSE_WAIT state. + */ + case TCPS_SYN_RECEIVED: + case TCPS_ESTABLISHED: + tp->t_state = TCPS_CLOSE_WAIT; + break; + + /* + * If still in FIN_WAIT_1 STATE FIN has not been acked so + * enter the CLOSING state. + */ + case TCPS_FIN_WAIT_1: + tp->t_state = TCPS_CLOSING; + break; + + /* + * In FIN_WAIT_2 state enter the TIME_WAIT state, + * starting the time-wait timer, turning off the other + * standard timers. + */ + case TCPS_FIN_WAIT_2: + tp->t_state = TCPS_TIME_WAIT; + tcp_canceltimers(tp); + tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + soisdisconnected(so); + break; + + /* + * In TIME_WAIT state restart the 2 MSL time_wait timer. + */ + case TCPS_TIME_WAIT: + tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + break; + } + } + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, &tcp_saveti, 0); + + /* + * Return any desired output. + */ + if (needoutput || (tp->t_flags & TF_ACKNOW)) + (void) tcp_output(tp); + return; + +dropafterack: + /* + * Generate an ACK dropping incoming segment if it occupies + * sequence space, where the ACK reflects our state. + */ + if (tiflags & TH_RST) + goto drop; + m_freem(m); + tp->t_flags |= TF_ACKNOW; + (void) tcp_output(tp); + return; + +dropwithreset: + /* + * Generate a RST, dropping incoming segment. + * Make ACK acceptable to originator of segment. + * Don't bother to respond if destination was broadcast/multicast. 
+ */ + if ((tiflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST) || + IN_MULTICAST(ti->ti_dst.s_addr)) + goto drop; + if (tiflags & TH_ACK) + tcp_respond(tp, ti, m, (tcp_seq)0, ti->ti_ack, TH_RST); + else { + if (tiflags & TH_SYN) + ti->ti_len++; + tcp_respond(tp, ti, m, ti->ti_seq+ti->ti_len, (tcp_seq)0, + TH_RST|TH_ACK); + } + /* destroy temporarily created socket */ + if (dropsocket) + (void) soabort(so); + return; + +drop: + /* + * Drop space held by incoming segment and return. + */ + if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_DROP, ostate, tp, &tcp_saveti, 0); + m_freem(m); + /* destroy temporarily created socket */ + if (dropsocket) + (void) soabort(so); + return; +#ifndef TUBA_INCLUDE +} + +void +tcp_dooptions(tp, cp, cnt, ti, ts_present, ts_val, ts_ecr) + struct tcpcb *tp; + u_char *cp; + int cnt; + struct tcpiphdr *ti; + int *ts_present; + u_long *ts_val, *ts_ecr; +{ + u_short mss; + int opt, optlen; + + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + optlen = cp[1]; + if (optlen <= 0) + break; + } + switch (opt) { + + default: + continue; + + case TCPOPT_MAXSEG: + if (optlen != TCPOLEN_MAXSEG) + continue; + if (!(ti->ti_flags & TH_SYN)) + continue; + bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); + NTOHS(mss); + (void) tcp_mss(tp, mss); /* sets t_maxseg */ + break; + + case TCPOPT_WINDOW: + if (optlen != TCPOLEN_WINDOW) + continue; + if (!(ti->ti_flags & TH_SYN)) + continue; + tp->t_flags |= TF_RCVD_SCALE; + tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); + break; + + case TCPOPT_TIMESTAMP: + if (optlen != TCPOLEN_TIMESTAMP) + continue; + *ts_present = 1; + bcopy((char *)cp + 2, (char *) ts_val, sizeof(*ts_val)); + NTOHL(*ts_val); + bcopy((char *)cp + 6, (char *) ts_ecr, sizeof(*ts_ecr)); + NTOHL(*ts_ecr); + + /* + * A timestamp received in a SYN makes + * it ok to send timestamp requests and replies. 
+ */ + if (ti->ti_flags & TH_SYN) { + tp->t_flags |= TF_RCVD_TSTMP; + tp->ts_recent = *ts_val; + tp->ts_recent_age = tcp_now; + } + break; + } + } +} + +/* + * Pull out of band byte out of a segment so + * it doesn't appear in the user's data queue. + * It is still reflected in the segment length for + * sequencing purposes. + */ +void +tcp_pulloutofband(so, ti, m) + struct socket *so; + struct tcpiphdr *ti; + register struct mbuf *m; +{ + int cnt = ti->ti_urp - 1; + + while (cnt >= 0) { + if (m->m_len > cnt) { + char *cp = mtod(m, caddr_t) + cnt; + struct tcpcb *tp = sototcpcb(so); + + tp->t_iobc = *cp; + tp->t_oobflags |= TCPOOB_HAVEDATA; + bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); + m->m_len--; + return; + } + cnt -= m->m_len; + m = m->m_next; + if (m == 0) + break; + } + panic("tcp_pulloutofband"); +} + +/* + * Collect new round-trip time estimate + * and update averages and current timeout. + */ +void +tcp_xmit_timer(tp, rtt) + register struct tcpcb *tp; + short rtt; +{ + register short delta; + + tcpstat.tcps_rttupdated++; + if (tp->t_srtt != 0) { + /* + * srtt is stored as fixed point with 3 bits after the + * binary point (i.e., scaled by 8). The following magic + * is equivalent to the smoothing algorithm in rfc793 with + * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed + * point). Adjust rtt to origin 0. + */ + delta = rtt - 1 - (tp->t_srtt >> TCP_RTT_SHIFT); + if ((tp->t_srtt += delta) <= 0) + tp->t_srtt = 1; + /* + * We accumulate a smoothed rtt variance (actually, a + * smoothed mean difference), then set the retransmit + * timer to smoothed rtt + 4 times the smoothed variance. + * rttvar is stored as fixed point with 2 bits after the + * binary point (scaled by 4). The following is + * equivalent to rfc793 smoothing with an alpha of .75 + * (rttvar = rttvar*3/4 + |delta| / 4). This replaces + * rfc793's wired-in beta. 
+ */ + if (delta < 0) + delta = -delta; + delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT); + if ((tp->t_rttvar += delta) <= 0) + tp->t_rttvar = 1; + } else { + /* + * No rtt measurement yet - use the unsmoothed rtt. + * Set the variance to half the rtt (so our first + * retransmit happens at 3*rtt). + */ + tp->t_srtt = rtt << TCP_RTT_SHIFT; + tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); + } + tp->t_rtt = 0; + tp->t_rxtshift = 0; + + /* + * the retransmit should happen at rtt + 4 * rttvar. + * Because of the way we do the smoothing, srtt and rttvar + * will each average +1/2 tick of bias. When we compute + * the retransmit timer, we want 1/2 tick of rounding and + * 1 extra tick because of +-1/2 tick uncertainty in the + * firing of the timer. The bias will give us exactly the + * 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below + * the minimum feasible timer (which is 2 ticks). + */ + TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), + tp->t_rttmin, TCPTV_REXMTMAX); + + /* + * We received an ack for a packet that wasn't retransmitted; + * it is probably safe to discard any error indications we've + * received recently. This isn't quite right, but close enough + * for now (a route might have failed after we sent a segment, + * and the return path might not be symmetrical). + */ + tp->t_softerror = 0; +} + +/* + * Determine a reasonable value for maxseg size. + * If the route is known, check route for mtu. + * If none, use an mss that can be handled on the outgoing + * interface without forcing IP to fragment; if bigger than + * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES + * to utilize large mbufs. If no route is found, route has no mtu, + * or the destination isn't local, use a default, hopefully conservative + * size (usually 512 or the default IP max size, but no more than the mtu + * of the interface), as we can't discover anything about intervening + * gateways or networks. 
We also initialize the congestion/slow start
+ * window to be a single segment if the destination isn't local.
+ * While looking at the routing entry, we also initialize other path-dependent
+ * parameters from pre-set or cached values in the routing entry.
+ *
+ * offer is the mss taken from the peer's MAXSEG option, or 0 when there
+ * is none.  Returns the mss computed here, or tcp_mssdflt when no route
+ * to the peer can be obtained (in which case nothing in tp is changed).
+ */
+int
+tcp_mss(tp, offer)
+	register struct tcpcb *tp;
+	u_int offer;
+{
+	struct route *ro;
+	register struct rtentry *rt;
+	struct ifnet *ifp;
+	register int rtt, mss;
+	u_long bufsize;
+	struct inpcb *inp;
+	struct socket *so;
+	extern int tcp_mssdflt;
+
+	inp = tp->t_inpcb;
+	ro = &inp->inp_route;
+
+	if ((rt = ro->ro_rt) == (struct rtentry *)0) {
+		/* No route yet, so try to acquire one */
+		if (inp->inp_faddr.s_addr != INADDR_ANY) {
+			ro->ro_dst.sa_family = AF_INET;
+			ro->ro_dst.sa_len = sizeof(ro->ro_dst);
+			((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
+				inp->inp_faddr;
+			rtalloc(ro);
+		}
+		/* still no route: give up and report the default */
+		if ((rt = ro->ro_rt) == (struct rtentry *)0)
+			return (tcp_mssdflt);
+	}
+	ifp = rt->rt_ifp;
+	so = inp->inp_socket;
+
+#ifdef RTV_MTU	/* if route characteristics exist ... */
+	/*
+	 * While we're here, check if there's an initial rtt
+	 * or rttvar. Convert from the route-table units
+	 * to scaled multiples of the slow timeout timer.
+	 */
+	if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
+		/*
+		 * XXX the lock bit for RTT (RTV_RTT) indicates that the
+		 * value is also a minimum value; this is subject to time.
+		 */
+		if (rt->rt_rmx.rmx_locks & RTV_RTT)
+			tp->t_rttmin = rtt / (RTM_RTTUNIT / PR_SLOWHZ);
+		tp->t_srtt = rtt / (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE));
+		if (rt->rt_rmx.rmx_rttvar)
+			tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
+			    (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE));
+		else
+			/* default variation is +- 1 rtt */
+			tp->t_rttvar =
+			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
+		TCPT_RANGESET(tp->t_rxtcur,
+		    ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
+		    tp->t_rttmin, TCPTV_REXMTMAX);
+	}
+	/*
+	 * if there's an mtu associated with the route, use it
+	 */
+	if (rt->rt_rmx.rmx_mtu)
+		mss = rt->rt_rmx.rmx_mtu - sizeof(struct tcpiphdr);
+	else
+#endif /* RTV_MTU */
+	{
+		/*
+		 * No route mtu (or no route metrics at all): derive the
+		 * mss from the interface mtu.  When RTV_MTU is defined
+		 * this compound statement is the body of the else above.
+		 */
+		mss = ifp->if_mtu - sizeof(struct tcpiphdr);
+#if	(MCLBYTES & (MCLBYTES - 1)) == 0
+		/* MCLBYTES is a power of two: round down with a mask */
+		if (mss > MCLBYTES)
+			mss &= ~(MCLBYTES-1);
+#else
+		if (mss > MCLBYTES)
+			mss = mss / MCLBYTES * MCLBYTES;
+#endif
+		if (!in_localaddr(inp->inp_faddr))
+			mss = min(mss, tcp_mssdflt);
+	}
+	/*
+	 * The current mss, t_maxseg, is initialized to the default value.
+	 * If we compute a smaller value, reduce the current mss.
+	 * If we compute a larger value, return it for use in sending
+	 * a max seg size option, but don't store it for use
+	 * unless we received an offer at least that large from peer.
+	 * However, do not accept offers under 32 bytes.
+	 */
+	if (offer)
+		mss = min(mss, offer);
+	mss = max(mss, 32);		/* sanity */
+	if (mss < tp->t_maxseg || offer != 0) {
+		/*
+		 * If there's a pipesize, change the socket buffer
+		 * to that size. Make the socket buffers an integral
+		 * number of mss units; if the mss is larger than
+		 * the socket buffer, decrease the mss.
+		 */
+#ifdef RTV_SPIPE
+		if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
+#endif
+			bufsize = so->so_snd.sb_hiwat;
+		if (bufsize < mss)
+			mss = bufsize;
+		else {
+			bufsize = roundup(bufsize, mss);
+			if (bufsize > sb_max)
+				bufsize = sb_max;
+			(void)sbreserve(&so->so_snd, bufsize);
+		}
+		tp->t_maxseg = mss;
+
+		/* same treatment for the receive buffer, mss left alone */
+#ifdef RTV_RPIPE
+		if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
+#endif
+			bufsize = so->so_rcv.sb_hiwat;
+		if (bufsize > mss) {
+			bufsize = roundup(bufsize, mss);
+			if (bufsize > sb_max)
+				bufsize = sb_max;
+			(void)sbreserve(&so->so_rcv, bufsize);
+		}
+	}
+	/*
+	 * NOTE(review): the congestion window is set to one segment
+	 * unconditionally here, although the header comment speaks of
+	 * doing so only when the destination isn't local.
+	 */
+	tp->snd_cwnd = mss;
+
+#ifdef RTV_SSTHRESH
+	if (rt->rt_rmx.rmx_ssthresh) {
+		/*
+		 * There's some sort of gateway or interface
+		 * buffer limit on the path. Use this to set
+		 * the slow start threshhold, but set the
+		 * threshold to no less than 2*mss.
+		 */
+		tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
+	}
+#endif /* RTV_SSTHRESH */
+	return (mss);
+}
+#endif /* TUBA_INCLUDE */
diff --git a/sys/netinet/tcp_seq.h b/sys/netinet/tcp_seq.h
new file mode 100644
index 00000000000..8912299ff79
--- /dev/null
+++ b/sys/netinet/tcp_seq.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4.
Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_seq.h 8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * TCP sequence numbers are 32 bit integers operated
+ * on with modular arithmetic. These macros can be
+ * used to compare such integers.
+ *
+ * Each macro subtracts b from a and looks at the sign of the
+ * difference after casting it to int, so "a before b" is decided by
+ * the wrapped distance between them rather than by their raw values.
+ * NOTE(review): this relies on int being as wide as the sequence
+ * number type -- confirm for the target.
+ */
+#define	SEQ_LT(a,b)	((int)((a)-(b)) < 0)
+#define	SEQ_LEQ(a,b)	((int)((a)-(b)) <= 0)
+#define	SEQ_GT(a,b)	((int)((a)-(b)) > 0)
+#define	SEQ_GEQ(a,b)	((int)((a)-(b)) >= 0)
+
+/*
+ * Macros to initialize tcp sequence numbers for
+ * send and receive from initial send and receive
+ * sequence numbers.
+ *
+ * Both expand to a bare chain of assignments that is not wrapped in
+ * parentheses, so use them only as complete statements.
+ */
+#define	tcp_rcvseqinit(tp) \
+	(tp)->rcv_adv = (tp)->rcv_nxt = (tp)->irs + 1
+
+#define	tcp_sendseqinit(tp) \
+	(tp)->snd_una = (tp)->snd_nxt = (tp)->snd_max = (tp)->snd_up = \
+	    (tp)->iss
+
+#define	TCP_ISSINCR	(125*1024)	/* increment for tcp_iss each second */
+
+#ifdef KERNEL
+/*
+ * NOTE(review): this is written without "extern", i.e. every file that
+ * includes this header with KERNEL defined gets its own tentative
+ * definition of tcp_iss.
+ */
+tcp_seq	tcp_iss;		/* tcp initial send seq # */
+#endif
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
new file mode 100644
index 00000000000..8edb853bede
--- /dev/null
+++ b/sys/netinet/tcp_subr.c
@@ -0,0 +1,445 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_subr.c 8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * NOTE(review): the header names are missing from every #include line
+ * below in this copy of the patch; they must be restored from the
+ * original source before this file can be built.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* patchable/settable parameters for tcp */
+int 	tcp_mssdflt = TCP_MSS;		/* mss used when nothing better is known */
+int 	tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
+int	tcp_do_rfc1323 = 1;		/* nonzero: request scale and timestamp options */
+
+extern	struct inpcb *tcp_last_inpcb;
+
+/*
+ * Tcp initialization
+ *
+ * Seeds tcp_iss, makes the global pcb list head tcb an empty circular
+ * list, and makes sure max_protohdr is at least the size of a
+ * struct tcpiphdr.  Panics if a link header plus a struct tcpiphdr
+ * cannot fit in MHLEN bytes.
+ */
+void
+tcp_init()
+{
+
+	tcp_iss = 1;		/* wrong */
+	/* empty list: the head points at itself in both directions */
+	tcb.inp_next = tcb.inp_prev = &tcb;
+	if (max_protohdr < sizeof(struct tcpiphdr))
+		max_protohdr = sizeof(struct tcpiphdr);
+	if (max_linkhdr + sizeof(struct tcpiphdr) > MHLEN)
+		panic("tcp_init");
+}
+
+/*
+ * Create template to be used to send tcp packets on a connection.
+ * Call after host entry created, allocates an mbuf and fills
+ * in a skeletal tcp/ip header, minimizing the amount of work
+ * necessary when the connection is used.
+ */
+struct tcpiphdr *
+tcp_template(tp)
+	struct tcpcb *tp;
+{
+	register struct inpcb *pcb = tp->t_inpcb;
+	register struct tcpiphdr *hdr = tp->t_template;
+
+	/*
+	 * Reuse the connection's existing template if there is one;
+	 * otherwise take a fresh header mbuf and build the template in
+	 * its data area.  The result is only returned, not stored in tp.
+	 * Returns 0 if no mbuf is available.
+	 */
+	if (hdr == 0) {
+		register struct mbuf *mb = m_get(M_DONTWAIT, MT_HEADER);
+
+		if (mb == NULL)
+			return (0);
+		mb->m_len = sizeof (struct tcpiphdr);
+		hdr = mtod(mb, struct tcpiphdr *);
+	}
+
+	/* fixed leading fields */
+	hdr->ti_next = 0;
+	hdr->ti_prev = 0;
+	hdr->ti_x1 = 0;
+	hdr->ti_pr = IPPROTO_TCP;
+	hdr->ti_len = htons(sizeof (struct tcpiphdr) - sizeof (struct ip));
+
+	/* addresses and ports come from the protocol control block */
+	hdr->ti_src = pcb->inp_laddr;
+	hdr->ti_dst = pcb->inp_faddr;
+	hdr->ti_sport = pcb->inp_lport;
+	hdr->ti_dport = pcb->inp_fport;
+
+	/* per-segment fields start out cleared */
+	hdr->ti_seq = 0;
+	hdr->ti_ack = 0;
+	hdr->ti_x2 = 0;
+	hdr->ti_off = 5;
+	hdr->ti_flags = 0;
+	hdr->ti_win = 0;
+	hdr->ti_sum = 0;
+	hdr->ti_urp = 0;
+	return (hdr);
+}
+
+/*
+ * Send a single message to the TCP at address specified by
+ * the given TCP/IP header. If m == 0, then we make a copy
+ * of the tcpiphdr at ti and send directly to the addressed host.
+ * This is used to force keep alive messages out using the TCP
+ * template for a connection tp->t_template. If flags are given
+ * then we send a message back to the TCP which originated the
+ * segment ti, and discard the mbuf containing it and any other
+ * attached mbufs.
+ *
+ * In any case the ack and sequence number of the transmitted
+ * segment are as specified by the parameters.
+ */
+void
+tcp_respond(tp, ti, m, ack, seq, flags)
+	struct tcpcb *tp;
+	register struct tcpiphdr *ti;
+	register struct mbuf *m;
+	tcp_seq ack, seq;
+	int flags;
+{
+	register int tlen;
+	int win = 0;
+	struct route *ro = 0;
+
+	/* tp may be null; then no window is advertised and no route cached */
+	if (tp) {
+		win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
+		ro = &tp->t_inpcb->inp_route;
+	}
+	if (m == 0) {
+		/*
+		 * No incoming mbuf: build the segment in a new header
+		 * mbuf from a copy of *ti.  The caller's flags argument
+		 * is ignored on this path and TH_ACK is sent.
+		 */
+		m = m_gethdr(M_DONTWAIT, MT_HEADER);
+		if (m == NULL)
+			return;
+#ifdef TCP_COMPAT_42
+		tlen = 1;
+#else
+		tlen = 0;
+#endif
+		/* leave room in front for the link-level header */
+		m->m_data += max_linkhdr;
+		*mtod(m, struct tcpiphdr *) = *ti;
+		ti = mtod(m, struct tcpiphdr *);
+		flags = TH_ACK;
+	} else {
+		/*
+		 * Reuse the received mbuf in place: drop everything
+		 * chained after it, trim it to just the header at ti,
+		 * and swap source and destination so the reply goes
+		 * back to the sender.
+		 */
+		m_freem(m->m_next);
+		m->m_next = 0;
+		m->m_data = (caddr_t)ti;
+		m->m_len = sizeof (struct tcpiphdr);
+		tlen = 0;
+#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
+		xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, u_long);
+		xchg(ti->ti_dport, ti->ti_sport, u_short);
+#undef xchg
+	}
+	/* ti_len covers the TCP header plus tlen data bytes */
+	ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + tlen));
+	/* from here on tlen is the length of the whole packet */
+	tlen += sizeof (struct tcpiphdr);
+	m->m_len = tlen;
+	m->m_pkthdr.len = tlen;
+	m->m_pkthdr.rcvif = (struct ifnet *) 0;
+	ti->ti_next = ti->ti_prev = 0;
+	ti->ti_x1 = 0;
+	ti->ti_seq = htonl(seq);
+	ti->ti_ack = htonl(ack);
+	ti->ti_x2 = 0;
+	ti->ti_off = sizeof (struct tcphdr) >> 2;
+	ti->ti_flags = flags;
+	if (tp)
+		ti->ti_win = htons((u_short) (win >> tp->rcv_scale));
+	else
+		ti->ti_win = htons((u_short)win);
+	ti->ti_urp = 0;
+	/* the checksum field must be zero while the sum is computed */
+	ti->ti_sum = 0;
+	ti->ti_sum = in_cksum(m, tlen);
+	/* these two stores go through the struct ip view of the same header */
+	((struct ip *)ti)->ip_len = tlen;
+	((struct ip *)ti)->ip_ttl = ip_defttl;
+	(void) ip_output(m, NULL, ro, 0, NULL);
+}
+
+/*
+ * Create a new TCP control block, making an
+ * empty reassembly queue and hooking it to the argument
+ * protocol control block.
+ */
+struct tcpcb *
+tcp_newtcpcb(inp)
+	struct inpcb *inp;
+{
+	register struct tcpcb *cb;
+
+	/* Allocate without sleeping; a null return tells the caller we failed. */
+	cb = malloc(sizeof(*cb), M_PCB, M_NOWAIT);
+	if (cb == NULL)
+		return ((struct tcpcb *)0);
+	bzero((char *) cb, sizeof(struct tcpcb));
+
+	/* Empty reassembly queue: both links point back at the block itself. */
+	cb->seg_prev = (struct tcpiphdr *)cb;
+	cb->seg_next = (struct tcpiphdr *)cb;
+	cb->t_maxseg = tcp_mssdflt;
+
+	/* Ask for window scaling and timestamps only when enabled globally. */
+	if (tcp_do_rfc1323)
+		cb->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
+	else
+		cb->t_flags = 0;
+	cb->t_inpcb = inp;
+
+	/*
+	 * srtt starts at TCPTV_SRTTBASE (0), which is how the rest of the
+	 * code recognizes "no rtt estimate yet".  rttvar is chosen so that
+	 * srtt + 2 * rttvar yields a sensible first retransmit time.
+	 */
+	cb->t_srtt = TCPTV_SRTTBASE;
+	cb->t_rttvar = tcp_rttdflt * PR_SLOWHZ << 2;
+	cb->t_rttmin = TCPTV_MIN;
+	TCPT_RANGESET(cb->t_rxtcur,
+	    ((TCPTV_SRTTBASE >> 2) + (TCPTV_SRTTDFLT << 2)) >> 1,
+	    TCPTV_MIN, TCPTV_REXMTMAX);
+
+	/* Congestion window and threshold both start at the same maximum. */
+	cb->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+	cb->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+
+	/* Hook the new block to its protocol control block. */
+	inp->inp_ip.ip_ttl = ip_defttl;
+	inp->inp_ppcb = (caddr_t)cb;
+	return (cb);
+}
+
+/*
+ * Drop a TCP connection, reporting
+ * the specified error. If connection is synchronized,
+ * then send a RST to peer.
+ */ +struct tcpcb * +tcp_drop(tp, errno) + register struct tcpcb *tp; + int errno; +{ + struct socket *so = tp->t_inpcb->inp_socket; + + if (TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_state = TCPS_CLOSED; + (void) tcp_output(tp); + tcpstat.tcps_drops++; + } else + tcpstat.tcps_conndrops++; + if (errno == ETIMEDOUT && tp->t_softerror) + errno = tp->t_softerror; + so->so_error = errno; + return (tcp_close(tp)); +} + +/* + * Close a TCP control block: + * discard all space held by the tcp + * discard internet protocol block + * wake up any sleepers + */ +struct tcpcb * +tcp_close(tp) + register struct tcpcb *tp; +{ + register struct tcpiphdr *t; + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + register struct mbuf *m; +#ifdef RTV_RTT + register struct rtentry *rt; + + /* + * If we sent enough data to get some meaningful characteristics, + * save them in the routing entry. 'Enough' is arbitrarily + * defined as the sendpipesize (default 4K) * 16. This would + * give us 16 rtt samples assuming we only get one sample per + * window (the usual case on a long haul net). 16 samples is + * enough for the srtt filter to converge to within 5% of the correct + * value; fewer samples and we could save a very bogus rtt. + * + * Don't update the default route's characteristics and don't + * update anything that the user "locked". + */ + if (SEQ_LT(tp->iss + so->so_snd.sb_hiwat * 16, tp->snd_max) && + (rt = inp->inp_route.ro_rt) && + ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr != INADDR_ANY) { + register u_long i; + + if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { + i = tp->t_srtt * + (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE)); + if (rt->rt_rmx.rmx_rtt && i) + /* + * filter this update to half the old & half + * the new values, converting scale. + * See route.h and tcp_var.h for a + * description of the scaling constants. 
+ */ + rt->rt_rmx.rmx_rtt = + (rt->rt_rmx.rmx_rtt + i) / 2; + else + rt->rt_rmx.rmx_rtt = i; + } + if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { + i = tp->t_rttvar * + (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE)); + if (rt->rt_rmx.rmx_rttvar && i) + rt->rt_rmx.rmx_rttvar = + (rt->rt_rmx.rmx_rttvar + i) / 2; + else + rt->rt_rmx.rmx_rttvar = i; + } + /* + * update the pipelimit (ssthresh) if it has been updated + * already or if a pipesize was specified & the threshhold + * got below half the pipesize. I.e., wait for bad news + * before we start updating, then update on both good + * and bad news. + */ + if ((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && + (i = tp->snd_ssthresh) && rt->rt_rmx.rmx_ssthresh || + i < (rt->rt_rmx.rmx_sendpipe / 2)) { + /* + * convert the limit from user data bytes to + * packets then to packet data bytes. + */ + i = (i + tp->t_maxseg / 2) / tp->t_maxseg; + if (i < 2) + i = 2; + i *= (u_long)(tp->t_maxseg + sizeof (struct tcpiphdr)); + if (rt->rt_rmx.rmx_ssthresh) + rt->rt_rmx.rmx_ssthresh = + (rt->rt_rmx.rmx_ssthresh + i) / 2; + else + rt->rt_rmx.rmx_ssthresh = i; + } + } +#endif /* RTV_RTT */ + /* free the reassembly queue, if any */ + t = tp->seg_next; + while (t != (struct tcpiphdr *)tp) { + t = (struct tcpiphdr *)t->ti_next; + m = REASS_MBUF((struct tcpiphdr *)t->ti_prev); + remque(t->ti_prev); + m_freem(m); + } + if (tp->t_template) + (void) m_free(dtom(tp->t_template)); + free(tp, M_PCB); + inp->inp_ppcb = 0; + soisdisconnected(so); + /* clobber input pcb cache if we're closing the cached connection */ + if (inp == tcp_last_inpcb) + tcp_last_inpcb = &tcb; + in_pcbdetach(inp); + tcpstat.tcps_closed++; + return ((struct tcpcb *)0); +} + +void +tcp_drain() +{ + +} + +/* + * Notify a tcp user of an asynchronous error; + * store error as soft error, but wake up user + * (for now, won't do anything until can select for soft error). 
+ */ +void +tcp_notify(inp, error) + struct inpcb *inp; + int error; +{ + register struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; + register struct socket *so = inp->inp_socket; + + /* + * Ignore some errors if we are hooked up. + * If connection hasn't completed, has retransmitted several times, + * and receives a second error, give up now. This is better + * than waiting a long time to establish a connection that + * can never complete. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (error == EHOSTUNREACH || error == ENETUNREACH || + error == EHOSTDOWN)) { + return; + } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && + tp->t_softerror) + so->so_error = error; + else + tp->t_softerror = error; + wakeup((caddr_t) &so->so_timeo); + sorwakeup(so); + sowwakeup(so); +} + +void +tcp_ctlinput(cmd, sa, ip) + int cmd; + struct sockaddr *sa; + register struct ip *ip; +{ + register struct tcphdr *th; + extern struct in_addr zeroin_addr; + extern u_char inetctlerrmap[]; + void (*notify) __P((struct inpcb *, int)) = tcp_notify; + + if (cmd == PRC_QUENCH) + notify = tcp_quench; + else if (!PRC_IS_REDIRECT(cmd) && + ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)) + return; + if (ip) { + th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); + in_pcbnotify(&tcb, sa, th->th_dport, ip->ip_src, th->th_sport, + cmd, notify); + } else + in_pcbnotify(&tcb, sa, 0, zeroin_addr, 0, cmd, notify); +} + +/* + * When a source quench is received, close congestion window + * to one segment. We will gradually open it again as we proceed. + */ +void +tcp_quench(inp, errno) + struct inpcb *inp; + int errno; +{ + struct tcpcb *tp = intotcpcb(inp); + + if (tp) + tp->snd_cwnd = tp->t_maxseg; +} diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c new file mode 100644 index 00000000000..0c0f0f8c2f1 --- /dev/null +++ b/sys/netinet/tcp_timer.c @@ -0,0 +1,312 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * @(#)tcp_timer.c 8.1 (Berkeley) 6/10/93
+ */
+
+#ifndef TUBA_INCLUDE
+/*
+ * NOTE(review): the header names are missing from every #include line
+ * below in this copy of the patch; they must be restored from the
+ * original source before this file can be built.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+int	tcp_keepidle = TCPTV_KEEP_IDLE;
+int	tcp_keepintvl = TCPTV_KEEPINTVL;
+int	tcp_maxidle;		/* recomputed on every tcp_slowtimo() call */
+#endif /* TUBA_INCLUDE */
+/*
+ * Fast timeout routine for processing delayed acks
+ *
+ * Walks the pcb list at splnet and, for every connection with
+ * TF_DELACK set, turns the delayed ack into an immediate one
+ * (TF_ACKNOW) and calls tcp_output().
+ */
+void
+tcp_fasttimo()
+{
+	register struct inpcb *inp;
+	register struct tcpcb *tp;
+	int s = splnet();
+
+	inp = tcb.inp_next;
+	/* a null head link means the list has not been set up yet */
+	if (inp)
+	for (; inp != &tcb; inp = inp->inp_next)
+		if ((tp = (struct tcpcb *)inp->inp_ppcb) &&
+		    (tp->t_flags & TF_DELACK)) {
+			tp->t_flags &= ~TF_DELACK;
+			tp->t_flags |= TF_ACKNOW;
+			tcpstat.tcps_delack++;
+			(void) tcp_output(tp);
+		}
+	splx(s);
+}
+
+/*
+ * Tcp protocol timeout routine called every 500 ms.
+ * Updates the timers in all active tcb's and
+ * causes finite state machine actions if timers expire.
+ * Also advances tcp_iss and tcp_now once per call.
+ */
+void
+tcp_slowtimo()
+{
+	register struct inpcb *ip, *ipnxt;
+	register struct tcpcb *tp;
+	int s = splnet();
+	register int i;
+
+	tcp_maxidle = TCPTV_KEEPCNT * tcp_keepintvl;
+	/*
+	 * Search through tcb's and update active timers.
+	 */
+	ip = tcb.inp_next;
+	if (ip == 0) {
+		splx(s);
+		return;
+	}
+	for (; ip != &tcb; ip = ipnxt) {
+		/* fetch the successor now; ip may not survive the calls below */
+		ipnxt = ip->inp_next;
+		tp = intotcpcb(ip);
+		if (tp == 0)
+			continue;
+		for (i = 0; i < TCPT_NTIMERS; i++) {
+			/* count each running timer down; act when it reaches 0 */
+			if (tp->t_timer[i] && --tp->t_timer[i] == 0) {
+				(void) tcp_usrreq(tp->t_inpcb->inp_socket,
+					PRU_SLOWTIMO, (struct mbuf *)0,
+					(struct mbuf *)i, (struct mbuf *)0);
+				/*
+				 * If ip is no longer linked in front of its
+				 * old successor, presumably the timeout
+				 * closed the connection; tp must not be
+				 * touched again.
+				 */
+				if (ipnxt->inp_prev != ip)
+					goto tpgone;
+			}
+		}
+		tp->t_idle++;
+		/* t_rtt is only advanced while it is nonzero */
+		if (tp->t_rtt)
+			tp->t_rtt++;
+tpgone:
+		;
+	}
+	tcp_iss += TCP_ISSINCR/PR_SLOWHZ;		/* increment iss */
+#ifdef TCP_COMPAT_42
+	if ((int)tcp_iss < 0)
+		tcp_iss = 0;				/* XXX */
+#endif
+	tcp_now++;					/* for timestamps */
+	splx(s);
+}
+#ifndef TUBA_INCLUDE
+
+/*
+ * Cancel all timers for TCP tp.
+ */
+void
+tcp_canceltimers(tp)
+	struct tcpcb *tp;
+{
+	register int i;
+
+	for (i = 0; i < TCPT_NTIMERS; i++)
+		tp->t_timer[i] = 0;
+}
+
+/*
+ * Retransmit backoff multipliers, indexed by t_rxtshift.
+ * NOTE(review): 13 initializers are given, so TCP_MAXRXTSHIFT is
+ * presumably 12 -- confirm against its definition.
+ */
+int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
+    { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
+
+/*
+ * TCP timer processing.
+ */
+struct tcpcb *
+tcp_timers(tp, timer)
+	register struct tcpcb *tp;
+	int timer;
+{
+	register int rexmt;
+
+	switch (timer) {
+
+	/*
+	 * 2 MSL timeout in shutdown went off. If we're closed but
+	 * still waiting for peer to close and connection has been idle
+	 * too long, or if 2MSL time is up from TIME_WAIT, delete connection
+	 * control block. Otherwise, check again in a bit.
+	 */
+	case TCPT_2MSL:
+		if (tp->t_state != TCPS_TIME_WAIT &&
+		    tp->t_idle <= tcp_maxidle)
+			tp->t_timer[TCPT_2MSL] = tcp_keepintvl;
+		else
+			tp = tcp_close(tp);
+		break;
+
+	/*
+	 * Retransmission timer went off. Message has not
+	 * been acked within retransmit interval. Back off
+	 * to a longer retransmit interval and retransmit one segment.
+	 */
+	case TCPT_REXMT:
+		if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
+			tp->t_rxtshift = TCP_MAXRXTSHIFT;
+			tcpstat.tcps_timeoutdrop++;
+			tp = tcp_drop(tp, tp->t_softerror ?
+ tp->t_softerror : ETIMEDOUT); + break; + } + tcpstat.tcps_rexmttimeo++; + rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; + TCPT_RANGESET(tp->t_rxtcur, rexmt, + tp->t_rttmin, TCPTV_REXMTMAX); + tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + /* + * If losing, let the lower level know and try for + * a better route. Also, if we backed off this far, + * our srtt estimate is probably bogus. Clobber it + * so we'll take the next rtt measurement as our srtt; + * move the current srtt into rttvar to keep the current + * retransmit times until then. + */ + if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { + in_losing(tp->t_inpcb); + tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); + tp->t_srtt = 0; + } + tp->snd_nxt = tp->snd_una; + /* + * If timing a segment in this window, stop the timer. + */ + tp->t_rtt = 0; + /* + * Close the congestion window down to one segment + * (we'll open it by one segment for each ack we get). + * Since we probably have a window's worth of unacked + * data accumulated, this "slow start" keeps us from + * dumping all that data as back-to-back packets (which + * might overwhelm an intermediate gateway). + * + * There are two phases to the opening: Initially we + * open by one mss on each ack. This makes the window + * size increase exponentially with time. If the + * window is larger than the path can handle, this + * exponential growth results in dropped packet(s) + * almost immediately. To get more time between + * drops but still "push" the network to take advantage + * of improving conditions, we switch from exponential + * to linear window opening at some threshhold size. + * For a threshhold, we use half the current window + * size, truncated to a multiple of the mss. + * + * (the minimum cwnd that will give us exponential + * growth is 2 mss. We don't allow the threshhold + * to go below this.) 
+ */ + { + u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; + if (win < 2) + win = 2; + tp->snd_cwnd = tp->t_maxseg; + tp->snd_ssthresh = win * tp->t_maxseg; + tp->t_dupacks = 0; + } + (void) tcp_output(tp); + break; + + /* + * Persistance timer into zero window. + * Force a byte to be output, if possible. + */ + case TCPT_PERSIST: + tcpstat.tcps_persisttimeo++; + tcp_setpersist(tp); + tp->t_force = 1; + (void) tcp_output(tp); + tp->t_force = 0; + break; + + /* + * Keep-alive timer went off; send something + * or drop connection if idle for too long. + */ + case TCPT_KEEP: + tcpstat.tcps_keeptimeo++; + if (tp->t_state < TCPS_ESTABLISHED) + goto dropit; + if (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE && + tp->t_state <= TCPS_CLOSE_WAIT) { + if (tp->t_idle >= tcp_keepidle + tcp_maxidle) + goto dropit; + /* + * Send a packet designed to force a response + * if the peer is up and reachable: + * either an ACK if the connection is still alive, + * or an RST if the peer has closed the connection + * due to timeout or reboot. + * Using sequence number tp->snd_una-1 + * causes the transmitted zero-length segment + * to lie outside the receive window; + * by the protocol spec, this requires the + * correspondent TCP to respond. + */ + tcpstat.tcps_keepprobe++; +#ifdef TCP_COMPAT_42 + /* + * The keepalive packet must have nonzero length + * to get a 4.2 host to respond. 
+ */ + tcp_respond(tp, tp->t_template, (struct mbuf *)NULL, + tp->rcv_nxt - 1, tp->snd_una - 1, 0); +#else + tcp_respond(tp, tp->t_template, (struct mbuf *)NULL, + tp->rcv_nxt, tp->snd_una - 1, 0); +#endif + tp->t_timer[TCPT_KEEP] = tcp_keepintvl; + } else + tp->t_timer[TCPT_KEEP] = tcp_keepidle; + break; + dropit: + tcpstat.tcps_keepdrops++; + tp = tcp_drop(tp, ETIMEDOUT); + break; + } + return (tp); +} +#endif /* TUBA_INCLUDE */ diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h new file mode 100644 index 00000000000..301a10f4034 --- /dev/null +++ b/sys/netinet/tcp_timer.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_timer.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Definitions of the TCP timers. These timers are counted + * down PR_SLOWHZ times a second. + */ +#define TCPT_NTIMERS 4 + +#define TCPT_REXMT 0 /* retransmit */ +#define TCPT_PERSIST 1 /* retransmit persistance */ +#define TCPT_KEEP 2 /* keep alive */ +#define TCPT_2MSL 3 /* 2*msl quiet time timer */ + +/* + * The TCPT_REXMT timer is used to force retransmissions. + * The TCP has the TCPT_REXMT timer set whenever segments + * have been sent for which ACKs are expected but not yet + * received. If an ACK is received which advances tp->snd_una, + * then the retransmit timer is cleared (if there are no more + * outstanding segments) or reset to the base value (if there + * are more ACKs expected). Whenever the retransmit timer goes off, + * we retransmit one unacknowledged segment, and do a backoff + * on the retransmit timer. + * + * The TCPT_PERSIST timer is used to keep window size information + * flowing even if the window goes shut. If all previous transmissions + * have been acknowledged (so that there are no retransmissions in progress), + * and the window is too small to bother sending anything, then we start + * the TCPT_PERSIST timer. When it expires, if the window is nonzero, + * we go to transmit state. Otherwise, at intervals send a single byte + * into the peer's window to force him to update our window information. 
+ * We do this at most as often as TCPTV_PERSMIN time intervals, + * but no more frequently than the current estimate of round-trip + * packet time. The TCPT_PERSIST timer is cleared whenever we receive + * a window update from the peer. + * + * The TCPT_KEEP timer is used to keep connections alive. If a + * connection is idle (no segments received) for TCPTV_KEEP_INIT amount of time, + * but not yet established, then we drop the connection. Once the connection + * is established, if the connection is idle for TCPTV_KEEP_IDLE time + * (and keepalives have been enabled on the socket), we begin to probe + * the connection. We force the peer to send us a segment by sending + * a zero-length ACK segment whose sequence number is snd_una - 1 and + * whose acknowledgment number is rcv_nxt. + * This segment is (deliberately) outside the window, and should elicit + * an ack segment in response from the peer. If, despite the TCPT_KEEP + * initiated segments we cannot elicit a response from a peer in tcp_maxidle + * amount of time probing, then we drop the connection. + */ + +/* + * Time constants. + */ +#define TCPTV_MSL ( 30*PR_SLOWHZ) /* max seg lifetime (hah!)
*/ +#define TCPTV_SRTTBASE 0 /* base roundtrip time; + if 0, no idea yet */ +#define TCPTV_SRTTDFLT ( 3*PR_SLOWHZ) /* assumed RTT if no info */ + +#define TCPTV_PERSMIN ( 5*PR_SLOWHZ) /* retransmit persistance */ +#define TCPTV_PERSMAX ( 60*PR_SLOWHZ) /* maximum persist interval */ + +#define TCPTV_KEEP_INIT ( 75*PR_SLOWHZ) /* initial connect keep alive */ +#define TCPTV_KEEP_IDLE (120*60*PR_SLOWHZ) /* dflt time before probing */ +#define TCPTV_KEEPINTVL ( 75*PR_SLOWHZ) /* default probe interval */ +#define TCPTV_KEEPCNT 8 /* max probes before drop */ + +#define TCPTV_MIN ( 1*PR_SLOWHZ) /* minimum allowable value */ +#define TCPTV_REXMTMAX ( 64*PR_SLOWHZ) /* max allowable REXMT value */ + +#define TCP_LINGERTIME 120 /* linger at most 2 minutes */ + +#define TCP_MAXRXTSHIFT 12 /* maximum retransmits */ + +#ifdef TCPTIMERS +char *tcptimers[] = + { "REXMT", "PERSIST", "KEEP", "2MSL" }; +#endif + +/* + * Force a time value to be in a certain range. + */ +#define TCPT_RANGESET(tv, value, tvmin, tvmax) { \ + (tv) = (value); \ + if ((tv) < (tvmin)) \ + (tv) = (tvmin); \ + else if ((tv) > (tvmax)) \ + (tv) = (tvmax); \ +} + +#ifdef KERNEL +extern int tcp_keepidle; /* time before keepalive probes begin */ +extern int tcp_keepintvl; /* time between keepalive probes */ +extern int tcp_maxidle; /* time to drop after starting probes */ +extern int tcp_ttl; /* time to live for TCP segs */ +extern int tcp_backoff[]; +#endif diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c new file mode 100644 index 00000000000..8edb853bede --- /dev/null +++ b/sys/netinet/tcp_timewait.c @@ -0,0 +1,445 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tcp_subr.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* patchable/settable parameters for tcp */ +int tcp_mssdflt = TCP_MSS; +int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; +int tcp_do_rfc1323 = 1; + +extern struct inpcb *tcp_last_inpcb; + +/* + * Tcp initialization + */ +void +tcp_init() +{ + + tcp_iss = 1; /* wrong */ + tcb.inp_next = tcb.inp_prev = &tcb; + if (max_protohdr < sizeof(struct tcpiphdr)) + max_protohdr = sizeof(struct tcpiphdr); + if (max_linkhdr + sizeof(struct tcpiphdr) > MHLEN) + panic("tcp_init"); +} + +/* + * Create template to be used to send tcp packets on a connection. + * Call after host entry created, allocates an mbuf and fills + * in a skeletal tcp/ip header, minimizing the amount of work + * necessary when the connection is used. + */ +struct tcpiphdr * +tcp_template(tp) + struct tcpcb *tp; +{ + register struct inpcb *inp = tp->t_inpcb; + register struct mbuf *m; + register struct tcpiphdr *n; + + if ((n = tp->t_template) == 0) { + m = m_get(M_DONTWAIT, MT_HEADER); + if (m == NULL) + return (0); + m->m_len = sizeof (struct tcpiphdr); + n = mtod(m, struct tcpiphdr *); + } + n->ti_next = n->ti_prev = 0; + n->ti_x1 = 0; + n->ti_pr = IPPROTO_TCP; + n->ti_len = htons(sizeof (struct tcpiphdr) - sizeof (struct ip)); + n->ti_src = inp->inp_laddr; + n->ti_dst = inp->inp_faddr; + n->ti_sport = inp->inp_lport; + n->ti_dport = inp->inp_fport; + n->ti_seq = 0; + n->ti_ack = 0; + n->ti_x2 = 0; + n->ti_off = 5; + n->ti_flags = 0; + n->ti_win = 0; + n->ti_sum = 0; + n->ti_urp = 0; + return (n); +} + +/* + * Send a single message to the TCP at address specified by + * the given TCP/IP header. If m == 0, then we make a copy + * of the tcpiphdr at ti and send directly to the addressed host. 
+ * This is used to force keep alive messages out using the TCP + * template for a connection tp->t_template. If flags are given + * then we send a message back to the TCP which originated the + * segment ti, and discard the mbuf containing it and any other + * attached mbufs. + * + * In any case the ack and sequence number of the transmitted + * segment are as specified by the parameters. + */ +void +tcp_respond(tp, ti, m, ack, seq, flags) + struct tcpcb *tp; + register struct tcpiphdr *ti; + register struct mbuf *m; + tcp_seq ack, seq; + int flags; +{ + register int tlen; + int win = 0; + struct route *ro = 0; + + if (tp) { + win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); + ro = &tp->t_inpcb->inp_route; + } + if (m == 0) { + m = m_gethdr(M_DONTWAIT, MT_HEADER); + if (m == NULL) + return; +#ifdef TCP_COMPAT_42 + tlen = 1; +#else + tlen = 0; +#endif + m->m_data += max_linkhdr; + *mtod(m, struct tcpiphdr *) = *ti; + ti = mtod(m, struct tcpiphdr *); + flags = TH_ACK; + } else { + m_freem(m->m_next); + m->m_next = 0; + m->m_data = (caddr_t)ti; + m->m_len = sizeof (struct tcpiphdr); + tlen = 0; +#define xchg(a,b,type) { type t; t=a; a=b; b=t; } + xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, u_long); + xchg(ti->ti_dport, ti->ti_sport, u_short); +#undef xchg + } + ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + tlen)); + tlen += sizeof (struct tcpiphdr); + m->m_len = tlen; + m->m_pkthdr.len = tlen; + m->m_pkthdr.rcvif = (struct ifnet *) 0; + ti->ti_next = ti->ti_prev = 0; + ti->ti_x1 = 0; + ti->ti_seq = htonl(seq); + ti->ti_ack = htonl(ack); + ti->ti_x2 = 0; + ti->ti_off = sizeof (struct tcphdr) >> 2; + ti->ti_flags = flags; + if (tp) + ti->ti_win = htons((u_short) (win >> tp->rcv_scale)); + else + ti->ti_win = htons((u_short)win); + ti->ti_urp = 0; + ti->ti_sum = 0; + ti->ti_sum = in_cksum(m, tlen); + ((struct ip *)ti)->ip_len = tlen; + ((struct ip *)ti)->ip_ttl = ip_defttl; + (void) ip_output(m, NULL, ro, 0, NULL); +} + +/* + * Create a new TCP control block, 
making an + * empty reassembly queue and hooking it to the argument + * protocol control block. + */ +struct tcpcb * +tcp_newtcpcb(inp) + struct inpcb *inp; +{ + register struct tcpcb *tp; + + tp = malloc(sizeof(*tp), M_PCB, M_NOWAIT); + if (tp == NULL) + return ((struct tcpcb *)0); + bzero((char *) tp, sizeof(struct tcpcb)); + tp->seg_next = tp->seg_prev = (struct tcpiphdr *)tp; + tp->t_maxseg = tcp_mssdflt; + + tp->t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; + tp->t_inpcb = inp; + /* + * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no + * rtt estimate. Set rttvar so that srtt + 2 * rttvar gives + * reasonable initial retransmit time. + */ + tp->t_srtt = TCPTV_SRTTBASE; + tp->t_rttvar = tcp_rttdflt * PR_SLOWHZ << 2; + tp->t_rttmin = TCPTV_MIN; + TCPT_RANGESET(tp->t_rxtcur, + ((TCPTV_SRTTBASE >> 2) + (TCPTV_SRTTDFLT << 2)) >> 1, + TCPTV_MIN, TCPTV_REXMTMAX); + tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; + tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; + inp->inp_ip.ip_ttl = ip_defttl; + inp->inp_ppcb = (caddr_t)tp; + return (tp); +} + +/* + * Drop a TCP connection, reporting + * the specified error. If connection is synchronized, + * then send a RST to peer. 
+ */ +struct tcpcb * +tcp_drop(tp, errno) + register struct tcpcb *tp; + int errno; +{ + struct socket *so = tp->t_inpcb->inp_socket; + + if (TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_state = TCPS_CLOSED; + (void) tcp_output(tp); + tcpstat.tcps_drops++; + } else + tcpstat.tcps_conndrops++; + if (errno == ETIMEDOUT && tp->t_softerror) + errno = tp->t_softerror; + so->so_error = errno; + return (tcp_close(tp)); +} + +/* + * Close a TCP control block: + * discard all space held by the tcp + * discard internet protocol block + * wake up any sleepers + */ +struct tcpcb * +tcp_close(tp) + register struct tcpcb *tp; +{ + register struct tcpiphdr *t; + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + register struct mbuf *m; +#ifdef RTV_RTT + register struct rtentry *rt; + + /* + * If we sent enough data to get some meaningful characteristics, + * save them in the routing entry. 'Enough' is arbitrarily + * defined as the sendpipesize (default 4K) * 16. This would + * give us 16 rtt samples assuming we only get one sample per + * window (the usual case on a long haul net). 16 samples is + * enough for the srtt filter to converge to within 5% of the correct + * value; fewer samples and we could save a very bogus rtt. + * + * Don't update the default route's characteristics and don't + * update anything that the user "locked". + */ + if (SEQ_LT(tp->iss + so->so_snd.sb_hiwat * 16, tp->snd_max) && + (rt = inp->inp_route.ro_rt) && + ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr != INADDR_ANY) { + register u_long i; + + if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { + i = tp->t_srtt * + (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTT_SCALE)); + if (rt->rt_rmx.rmx_rtt && i) + /* + * filter this update to half the old & half + * the new values, converting scale. + * See route.h and tcp_var.h for a + * description of the scaling constants. 
+ */ + rt->rt_rmx.rmx_rtt = + (rt->rt_rmx.rmx_rtt + i) / 2; + else + rt->rt_rmx.rmx_rtt = i; + } + if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { + i = tp->t_rttvar * + (RTM_RTTUNIT / (PR_SLOWHZ * TCP_RTTVAR_SCALE)); + if (rt->rt_rmx.rmx_rttvar && i) + rt->rt_rmx.rmx_rttvar = + (rt->rt_rmx.rmx_rttvar + i) / 2; + else + rt->rt_rmx.rmx_rttvar = i; + } + /* + * update the pipelimit (ssthresh) if it has been updated + * already or if a pipesize was specified & the threshhold + * got below half the pipesize. I.e., wait for bad news + * before we start updating, then update on both good + * and bad news. + */ + if ((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && + (i = tp->snd_ssthresh) && rt->rt_rmx.rmx_ssthresh || + i < (rt->rt_rmx.rmx_sendpipe / 2)) { + /* + * convert the limit from user data bytes to + * packets then to packet data bytes. + */ + i = (i + tp->t_maxseg / 2) / tp->t_maxseg; + if (i < 2) + i = 2; + i *= (u_long)(tp->t_maxseg + sizeof (struct tcpiphdr)); + if (rt->rt_rmx.rmx_ssthresh) + rt->rt_rmx.rmx_ssthresh = + (rt->rt_rmx.rmx_ssthresh + i) / 2; + else + rt->rt_rmx.rmx_ssthresh = i; + } + } +#endif /* RTV_RTT */ + /* free the reassembly queue, if any */ + t = tp->seg_next; + while (t != (struct tcpiphdr *)tp) { + t = (struct tcpiphdr *)t->ti_next; + m = REASS_MBUF((struct tcpiphdr *)t->ti_prev); + remque(t->ti_prev); + m_freem(m); + } + if (tp->t_template) + (void) m_free(dtom(tp->t_template)); + free(tp, M_PCB); + inp->inp_ppcb = 0; + soisdisconnected(so); + /* clobber input pcb cache if we're closing the cached connection */ + if (inp == tcp_last_inpcb) + tcp_last_inpcb = &tcb; + in_pcbdetach(inp); + tcpstat.tcps_closed++; + return ((struct tcpcb *)0); +} + +void +tcp_drain() +{ + +} + +/* + * Notify a tcp user of an asynchronous error; + * store error as soft error, but wake up user + * (for now, won't do anything until can select for soft error). 
+ */ +void +tcp_notify(inp, error) + struct inpcb *inp; + int error; +{ + register struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; + register struct socket *so = inp->inp_socket; + + /* + * Ignore some errors if we are hooked up. + * If connection hasn't completed, has retransmitted several times, + * and receives a second error, give up now. This is better + * than waiting a long time to establish a connection that + * can never complete. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (error == EHOSTUNREACH || error == ENETUNREACH || + error == EHOSTDOWN)) { + return; + } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && + tp->t_softerror) + so->so_error = error; + else + tp->t_softerror = error; + wakeup((caddr_t) &so->so_timeo); + sorwakeup(so); + sowwakeup(so); +} + +void +tcp_ctlinput(cmd, sa, ip) + int cmd; + struct sockaddr *sa; + register struct ip *ip; +{ + register struct tcphdr *th; + extern struct in_addr zeroin_addr; + extern u_char inetctlerrmap[]; + void (*notify) __P((struct inpcb *, int)) = tcp_notify; + + if (cmd == PRC_QUENCH) + notify = tcp_quench; + else if (!PRC_IS_REDIRECT(cmd) && + ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)) + return; + if (ip) { + th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); + in_pcbnotify(&tcb, sa, th->th_dport, ip->ip_src, th->th_sport, + cmd, notify); + } else + in_pcbnotify(&tcb, sa, 0, zeroin_addr, 0, cmd, notify); +} + +/* + * When a source quench is received, close congestion window + * to one segment. We will gradually open it again as we proceed. + */ +void +tcp_quench(inp, errno) + struct inpcb *inp; + int errno; +{ + struct tcpcb *tp = intotcpcb(inp); + + if (tp) + tp->snd_cwnd = tp->t_maxseg; +} diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c new file mode 100644 index 00000000000..38a08d6d0c2 --- /dev/null +++ b/sys/netinet/tcp_usrreq.c @@ -0,0 +1,517 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * TCP protocol interface to socket abstraction. + */ +extern char *tcpstates[]; + +/* + * Process a TCP user request for TCP tb. If this is a send request + * then m is the mbuf chain of send data. If this is a timer expiration + * (called from the software clock routine), then timertype tells which timer. + */ +/*ARGSUSED*/ +int +tcp_usrreq(so, req, m, nam, control) + struct socket *so; + int req; + struct mbuf *m, *nam, *control; +{ + register struct inpcb *inp; + register struct tcpcb *tp; + int s; + int error = 0; + int ostate; + + if (req == PRU_CONTROL) + return (in_control(so, (int)m, (caddr_t)nam, + (struct ifnet *)control)); + if (control && control->m_len) { + m_freem(control); + if (m) + m_freem(m); + return (EINVAL); + } + + s = splnet(); + inp = sotoinpcb(so); + /* + * When a TCP is attached to a socket, then there will be + * a (struct inpcb) pointed at by the socket, and this + * structure will point at a subsidary (struct tcpcb). + */ + if (inp == 0 && req != PRU_ATTACH) { + splx(s); + return (EINVAL); /* XXX */ + } + if (inp) { + tp = intotcpcb(inp); + /* WHAT IF TP IS 0? */ +#ifdef KPROF + tcp_acounts[tp->t_state][req]++; +#endif + ostate = tp->t_state; + } else + ostate = 0; + switch (req) { + + /* + * TCP attaches to socket via PRU_ATTACH, reserving space, + * and an internet control block. + */ + case PRU_ATTACH: + if (inp) { + error = EISCONN; + break; + } + error = tcp_attach(so); + if (error) + break; + if ((so->so_options & SO_LINGER) && so->so_linger == 0) + so->so_linger = TCP_LINGERTIME; + tp = sototcpcb(so); + break; + + /* + * PRU_DETACH detaches the TCP protocol from the socket. 
+ * If the protocol state is non-embryonic, then can't + * do this directly: have to initiate a PRU_DISCONNECT, + * which may finish later; embryonic TCB's can just + * be discarded here. + */ + case PRU_DETACH: + if (tp->t_state > TCPS_LISTEN) + tp = tcp_disconnect(tp); + else + tp = tcp_close(tp); + break; + + /* + * Give the socket an address. + */ + case PRU_BIND: + error = in_pcbbind(inp, nam); + if (error) + break; + break; + + /* + * Prepare to accept connections. + */ + case PRU_LISTEN: + if (inp->inp_lport == 0) + error = in_pcbbind(inp, (struct mbuf *)0); + if (error == 0) + tp->t_state = TCPS_LISTEN; + break; + + /* + * Initiate connection to peer. + * Create a template for use in transmissions on this connection. + * Enter SYN_SENT state, and mark socket as connecting. + * Start keep-alive timer, and seed output sequence space. + * Send initial segment on connection. + */ + case PRU_CONNECT: + if (inp->inp_lport == 0) { + error = in_pcbbind(inp, (struct mbuf *)0); + if (error) + break; + } + error = in_pcbconnect(inp, nam); + if (error) + break; + tp->t_template = tcp_template(tp); + if (tp->t_template == 0) { + in_pcbdisconnect(inp); + error = ENOBUFS; + break; + } + /* Compute window scaling to request. */ + while (tp->request_r_scale < TCP_MAX_WINSHIFT && + (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) + tp->request_r_scale++; + soisconnecting(so); + tcpstat.tcps_connattempt++; + tp->t_state = TCPS_SYN_SENT; + tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT; + tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2; + tcp_sendseqinit(tp); + error = tcp_output(tp); + break; + + /* + * Create a TCP connection between two sockets. + */ + case PRU_CONNECT2: + error = EOPNOTSUPP; + break; + + /* + * Initiate disconnect from peer. 
+ * If connection never passed embryonic stage, just drop; + * else if don't need to let data drain, then can just drop anyways, + * else have to begin TCP shutdown process: mark socket disconnecting, + * drain unread data, state switch to reflect user close, and + * send segment (e.g. FIN) to peer. Socket will be really disconnected + * when peer sends FIN and acks ours. + * + * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. + */ + case PRU_DISCONNECT: + tp = tcp_disconnect(tp); + break; + + /* + * Accept a connection. Essentially all the work is + * done at higher levels; just return the address + * of the peer, storing through addr. + */ + case PRU_ACCEPT: + in_setpeeraddr(inp, nam); + break; + + /* + * Mark the connection as being incapable of further output. + */ + case PRU_SHUTDOWN: + socantsendmore(so); + tp = tcp_usrclosed(tp); + if (tp) + error = tcp_output(tp); + break; + + /* + * After a receive, possibly send window update to peer. + */ + case PRU_RCVD: + (void) tcp_output(tp); + break; + + /* + * Do a send by putting data in output queue and updating urgent + * marker if URG set. Possibly send more data. + */ + case PRU_SEND: + sbappend(&so->so_snd, m); + error = tcp_output(tp); + break; + + /* + * Abort the TCP. 
+ */ + case PRU_ABORT: + tp = tcp_drop(tp, ECONNABORTED); + break; + + case PRU_SENSE: + ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat; + (void) splx(s); + return (0); + + case PRU_RCVOOB: + if ((so->so_oobmark == 0 && + (so->so_state & SS_RCVATMARK) == 0) || + so->so_options & SO_OOBINLINE || + tp->t_oobflags & TCPOOB_HADDATA) { + error = EINVAL; + break; + } + if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { + error = EWOULDBLOCK; + break; + } + m->m_len = 1; + *mtod(m, caddr_t) = tp->t_iobc; + if (((int)nam & MSG_PEEK) == 0) + tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); + break; + + case PRU_SENDOOB: + if (sbspace(&so->so_snd) < -512) { + m_freem(m); + error = ENOBUFS; + break; + } + /* + * According to RFC961 (Assigned Protocols), + * the urgent pointer points to the last octet + * of urgent data. We continue, however, + * to consider it to indicate the first octet + * of data past the urgent section. + * Otherwise, snd_up should be one lower. + */ + sbappend(&so->so_snd, m); + tp->snd_up = tp->snd_una + so->so_snd.sb_cc; + tp->t_force = 1; + error = tcp_output(tp); + tp->t_force = 0; + break; + + case PRU_SOCKADDR: + in_setsockaddr(inp, nam); + break; + + case PRU_PEERADDR: + in_setpeeraddr(inp, nam); + break; + + /* + * TCP slow timer went off; going through this + * routine for tracing's sake. 
+ */ + case PRU_SLOWTIMO: + tp = tcp_timers(tp, (int)nam); + req |= (int)nam << 8; /* for debug's sake */ + break; + + default: + panic("tcp_usrreq"); + } + if (tp && (so->so_options & SO_DEBUG)) + tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, req); + splx(s); + return (error); +} + +int +tcp_ctloutput(op, so, level, optname, mp) + int op; + struct socket *so; + int level, optname; + struct mbuf **mp; +{ + int error = 0, s; + struct inpcb *inp; + register struct tcpcb *tp; + register struct mbuf *m; + register int i; + + s = splnet(); + inp = sotoinpcb(so); + if (inp == NULL) { + splx(s); + if (op == PRCO_SETOPT && *mp) + (void) m_free(*mp); + return (ECONNRESET); + } + if (level != IPPROTO_TCP) { + error = ip_ctloutput(op, so, level, optname, mp); + splx(s); + return (error); + } + tp = intotcpcb(inp); + + switch (op) { + + case PRCO_SETOPT: + m = *mp; + switch (optname) { + + case TCP_NODELAY: + if (m == NULL || m->m_len < sizeof (int)) + error = EINVAL; + else if (*mtod(m, int *)) + tp->t_flags |= TF_NODELAY; + else + tp->t_flags &= ~TF_NODELAY; + break; + + case TCP_MAXSEG: + if (m && (i = *mtod(m, int *)) > 0 && i <= tp->t_maxseg) + tp->t_maxseg = i; + else + error = EINVAL; + break; + + default: + error = ENOPROTOOPT; + break; + } + if (m) + (void) m_free(m); + break; + + case PRCO_GETOPT: + *mp = m = m_get(M_WAIT, MT_SOOPTS); + m->m_len = sizeof(int); + + switch (optname) { + case TCP_NODELAY: + *mtod(m, int *) = tp->t_flags & TF_NODELAY; + break; + case TCP_MAXSEG: + *mtod(m, int *) = tp->t_maxseg; + break; + default: + error = ENOPROTOOPT; + break; + } + break; + } + splx(s); + return (error); +} + +u_long tcp_sendspace = 1024*8; +u_long tcp_recvspace = 1024*8; + +/* + * Attach TCP protocol to socket, allocating + * internet protocol control block, tcp control block, + * bufer space, and entering LISTEN state if to accept connections. 
+ */ +int +tcp_attach(so) + struct socket *so; +{ + register struct tcpcb *tp; + struct inpcb *inp; + int error; + + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { + error = soreserve(so, tcp_sendspace, tcp_recvspace); + if (error) + return (error); + } + error = in_pcballoc(so, &tcb); + if (error) + return (error); + inp = sotoinpcb(so); + tp = tcp_newtcpcb(inp); + if (tp == 0) { + int nofd = so->so_state & SS_NOFDREF; /* XXX */ + + so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ + in_pcbdetach(inp); + so->so_state |= nofd; + return (ENOBUFS); + } + tp->t_state = TCPS_CLOSED; + return (0); +} + +/* + * Initiate (or continue) disconnect. + * If embryonic state, just send reset (once). + * If in ``let data drain'' option and linger null, just drop. + * Otherwise (hard), mark socket disconnecting and drop + * current input data; switch states based on user close, and + * send segment to peer (with FIN). + */ +struct tcpcb * +tcp_disconnect(tp) + register struct tcpcb *tp; +{ + struct socket *so = tp->t_inpcb->inp_socket; + + if (tp->t_state < TCPS_ESTABLISHED) + tp = tcp_close(tp); + else if ((so->so_options & SO_LINGER) && so->so_linger == 0) + tp = tcp_drop(tp, 0); + else { + soisdisconnecting(so); + sbflush(&so->so_rcv); + tp = tcp_usrclosed(tp); + if (tp) + (void) tcp_output(tp); + } + return (tp); +} + +/* + * User issued close, and wish to trail through shutdown states: + * if never received SYN, just forget it. If got a SYN from peer, + * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. + * If already got a FIN from peer, then almost done; go to LAST_ACK + * state. In all other cases, have already sent FIN to peer (e.g. + * after PRU_SHUTDOWN), and just have to play tedious game waiting + * for peer to send FIN or not respond to keep-alives, etc. + * We can let the user exit from the close as soon as the FIN is acked. 
+ */ +struct tcpcb * +tcp_usrclosed(tp) + register struct tcpcb *tp; +{ + + switch (tp->t_state) { + + case TCPS_CLOSED: + case TCPS_LISTEN: + case TCPS_SYN_SENT: + tp->t_state = TCPS_CLOSED; + tp = tcp_close(tp); + break; + + case TCPS_SYN_RECEIVED: + case TCPS_ESTABLISHED: + tp->t_state = TCPS_FIN_WAIT_1; + break; + + case TCPS_CLOSE_WAIT: + tp->t_state = TCPS_LAST_ACK; + break; + } + if (tp && tp->t_state >= TCPS_FIN_WAIT_2) + soisdisconnected(tp->t_inpcb->inp_socket); + return (tp); +} diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h new file mode 100644 index 00000000000..8a8e7512114 --- /dev/null +++ b/sys/netinet/tcp_var.h @@ -0,0 +1,278 @@ +/* + * Copyright (c) 1982, 1986, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_var.h	8.3 (Berkeley) 4/10/94
 */

/*
 * Kernel variables for tcp.
 */

/*
 * Tcp control block, one per tcp; fields:
 */
struct tcpcb {
	struct	tcpiphdr *seg_next;	/* sequencing queue */
	struct	tcpiphdr *seg_prev;
	short	t_state;		/* state of this connection */
	short	t_timer[TCPT_NTIMERS];	/* tcp timers */
	short	t_rxtshift;		/* log(2) of rexmt exp. backoff */
	short	t_rxtcur;		/* current retransmit value */
	short	t_dupacks;		/* consecutive dup acks recd */
	u_short	t_maxseg;		/* maximum segment size */
	char	t_force;		/* 1 if forcing out a byte */
	u_short	t_flags;		/* TF_* bits, defined below */
#define	TF_ACKNOW	0x0001		/* ack peer immediately */
#define	TF_DELACK	0x0002		/* ack, but try to delay it */
#define	TF_NODELAY	0x0004		/* don't delay packets to coalesce */
#define	TF_NOOPT	0x0008		/* don't use tcp options */
#define	TF_SENTFIN	0x0010		/* have sent FIN */
#define	TF_REQ_SCALE	0x0020		/* have/will request window scaling */
#define	TF_RCVD_SCALE	0x0040		/* other side has requested scaling */
#define	TF_REQ_TSTMP	0x0080		/* have/will request timestamps */
#define	TF_RCVD_TSTMP	0x0100		/* a timestamp was received in SYN */
#define	TF_SACK_PERMIT	0x0200		/* other side said I could SACK */

	struct	tcpiphdr *t_template;	/* skeletal packet for transmit */
	struct	inpcb *t_inpcb;		/* back pointer to internet pcb */
/*
 * The following fields are used as in the protocol specification.
 * See RFC 793 (Sept. 1981), section 3.2.
 */
/* send sequence variables */
	tcp_seq	snd_una;		/* send unacknowledged */
	tcp_seq	snd_nxt;		/* send next */
	tcp_seq	snd_up;			/* send urgent pointer */
	tcp_seq	snd_wl1;		/* window update seg seq number */
	tcp_seq	snd_wl2;		/* window update seg ack number */
	tcp_seq	iss;			/* initial send sequence number */
	u_long	snd_wnd;		/* send window */
/* receive sequence variables */
	u_long	rcv_wnd;		/* receive window */
	tcp_seq	rcv_nxt;		/* receive next */
	tcp_seq	rcv_up;			/* receive urgent pointer */
	tcp_seq	irs;			/* initial receive sequence number */
/*
 * Additional variables for this implementation.
 */
/* receive variables */
	tcp_seq	rcv_adv;		/* advertised window */
/* retransmit variables */
	tcp_seq	snd_max;		/* highest sequence number sent;
					 * used to recognize retransmits
					 */
/* congestion control (for slow start, source quench, retransmit after loss) */
	u_long	snd_cwnd;		/* congestion-controlled window */
	u_long	snd_ssthresh;		/* snd_cwnd size threshold for
					 * slow start exponential to
					 * linear switch
					 */
/*
 * transmit timing stuff.  See below for scale of srtt and rttvar.
 * "Variance" is actually smoothed difference.
 */
	short	t_idle;			/* inactivity time */
	short	t_rtt;			/* round trip time */
	tcp_seq	t_rtseq;		/* sequence number being timed */
	short	t_srtt;			/* smoothed round-trip time */
	short	t_rttvar;		/* variance in round-trip time */
	u_short	t_rttmin;		/* minimum rtt allowed */
	u_long	max_sndwnd;		/* largest window peer has offered */

/* out-of-band data */
	char	t_oobflags;		/* have some */
	char	t_iobc;			/* input character */
#define	TCPOOB_HAVEDATA	0x01
#define	TCPOOB_HADDATA	0x02
	short	t_softerror;		/* possible error not yet reported */

/* RFC 1323 variables */
	u_char	snd_scale;		/* window scaling for send window */
	u_char	rcv_scale;		/* window scaling for recv window */
	u_char	request_r_scale;	/* pending window scaling */
	u_char	requested_s_scale;
	u_long	ts_recent;		/* timestamp echo data */
	u_long	ts_recent_age;		/* when last updated */
	tcp_seq	last_ack_sent;

/* TUBA stuff */
	caddr_t	t_tuba_pcb;		/* next level down pcb for TCP over z */
};

/* Map an inpcb (via its inp_ppcb field) or a socket to its tcpcb. */
#define	intotcpcb(ip)	((struct tcpcb *)(ip)->inp_ppcb)
#define	sototcpcb(so)	(intotcpcb(sotoinpcb(so)))

/*
 * The smoothed round-trip time and estimated variance
 * are stored as fixed point numbers scaled by the values below.
 * For convenience, these scales are also used in smoothing the average
 * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed).
 * With these scales, srtt has 3 bits to the right of the binary point,
 * and thus an "ALPHA" of 0.875.  rttvar has 2 bits to the right of the
 * binary point, and is smoothed with an ALPHA of 0.75.
 */
#define	TCP_RTT_SCALE		8	/* multiplier for srtt; 3 bits frac. */
#define	TCP_RTT_SHIFT		3	/* shift for srtt; 3 bits frac. */
#define	TCP_RTTVAR_SCALE	4	/* multiplier for rttvar; 2 bits */
#define	TCP_RTTVAR_SHIFT	2	/* shift for rttvar; 2 bits */

/*
 * The initial retransmission should happen at rtt + 4 * rttvar.
 * Because of the way we do the smoothing, srtt and rttvar
 * will each average +1/2 tick of bias.
When we compute + * the retransmit timer, we want 1/2 tick of rounding and + * 1 extra tick because of +-1/2 tick uncertainty in the + * firing of the timer. The bias will give us exactly the + * 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below + * the minimum feasible timer (which is 2 ticks). + * This macro assumes that the value of TCP_RTTVAR_SCALE + * is the same as the multiplier for rttvar. + */ +#define TCP_REXMTVAL(tp) \ + (((tp)->t_srtt >> TCP_RTT_SHIFT) + (tp)->t_rttvar) + +/* XXX + * We want to avoid doing m_pullup on incoming packets but that + * means avoiding dtom on the tcp reassembly code. That in turn means + * keeping an mbuf pointer in the reassembly queue (since we might + * have a cluster). As a quick hack, the source & destination + * port numbers (which are no longer needed once we've located the + * tcpcb) are overlayed with an mbuf pointer. + */ +#define REASS_MBUF(ti) (*(struct mbuf **)&((ti)->ti_t)) + +/* + * TCP statistics. + * Many of these should be kept per connection, + * but that's inconvenient at the moment. + */ +struct tcpstat { + u_long tcps_connattempt; /* connections initiated */ + u_long tcps_accepts; /* connections accepted */ + u_long tcps_connects; /* connections established */ + u_long tcps_drops; /* connections dropped */ + u_long tcps_conndrops; /* embryonic connections dropped */ + u_long tcps_closed; /* conn. closed (includes drops) */ + u_long tcps_segstimed; /* segs where we tried to get rtt */ + u_long tcps_rttupdated; /* times we succeeded */ + u_long tcps_delack; /* delayed acks sent */ + u_long tcps_timeoutdrop; /* conn. 
dropped in rxmt timeout */ + u_long tcps_rexmttimeo; /* retransmit timeouts */ + u_long tcps_persisttimeo; /* persist timeouts */ + u_long tcps_keeptimeo; /* keepalive timeouts */ + u_long tcps_keepprobe; /* keepalive probes sent */ + u_long tcps_keepdrops; /* connections dropped in keepalive */ + + u_long tcps_sndtotal; /* total packets sent */ + u_long tcps_sndpack; /* data packets sent */ + u_long tcps_sndbyte; /* data bytes sent */ + u_long tcps_sndrexmitpack; /* data packets retransmitted */ + u_long tcps_sndrexmitbyte; /* data bytes retransmitted */ + u_long tcps_sndacks; /* ack-only packets sent */ + u_long tcps_sndprobe; /* window probes sent */ + u_long tcps_sndurg; /* packets sent with URG only */ + u_long tcps_sndwinup; /* window update-only packets sent */ + u_long tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ + + u_long tcps_rcvtotal; /* total packets received */ + u_long tcps_rcvpack; /* packets received in sequence */ + u_long tcps_rcvbyte; /* bytes received in sequence */ + u_long tcps_rcvbadsum; /* packets received with ccksum errs */ + u_long tcps_rcvbadoff; /* packets received with bad offset */ + u_long tcps_rcvshort; /* packets received too short */ + u_long tcps_rcvduppack; /* duplicate-only packets received */ + u_long tcps_rcvdupbyte; /* duplicate-only bytes received */ + u_long tcps_rcvpartduppack; /* packets with some duplicate data */ + u_long tcps_rcvpartdupbyte; /* dup. bytes in part-dup. 
packets */ + u_long tcps_rcvoopack; /* out-of-order packets received */ + u_long tcps_rcvoobyte; /* out-of-order bytes received */ + u_long tcps_rcvpackafterwin; /* packets with data after window */ + u_long tcps_rcvbyteafterwin; /* bytes rcvd after window */ + u_long tcps_rcvafterclose; /* packets rcvd after "close" */ + u_long tcps_rcvwinprobe; /* rcvd window probe packets */ + u_long tcps_rcvdupack; /* rcvd duplicate acks */ + u_long tcps_rcvacktoomuch; /* rcvd acks for unsent data */ + u_long tcps_rcvackpack; /* rcvd ack packets */ + u_long tcps_rcvackbyte; /* bytes acked by rcvd acks */ + u_long tcps_rcvwinupd; /* rcvd window update packets */ + u_long tcps_pawsdrop; /* segments dropped due to PAWS */ + u_long tcps_predack; /* times hdr predict ok for acks */ + u_long tcps_preddat; /* times hdr predict ok for data pkts */ + u_long tcps_pcbcachemiss; +}; + +#ifdef KERNEL +struct inpcb tcb; /* head of queue of active tcpcb's */ +struct tcpstat tcpstat; /* tcp statistics */ +u_long tcp_now; /* for RFC 1323 timestamps */ + +int tcp_attach __P((struct socket *)); +void tcp_canceltimers __P((struct tcpcb *)); +struct tcpcb * + tcp_close __P((struct tcpcb *)); +void tcp_ctlinput __P((int, struct sockaddr *, struct ip *)); +int tcp_ctloutput __P((int, struct socket *, int, int, struct mbuf **)); +struct tcpcb * + tcp_disconnect __P((struct tcpcb *)); +struct tcpcb * + tcp_drop __P((struct tcpcb *, int)); +void tcp_dooptions __P((struct tcpcb *, + u_char *, int, struct tcpiphdr *, int *, u_long *, u_long *)); +void tcp_drain __P((void)); +void tcp_fasttimo __P((void)); +void tcp_init __P((void)); +void tcp_input __P((struct mbuf *, int)); +int tcp_mss __P((struct tcpcb *, u_int)); +struct tcpcb * + tcp_newtcpcb __P((struct inpcb *)); +void tcp_notify __P((struct inpcb *, int)); +int tcp_output __P((struct tcpcb *)); +void tcp_pulloutofband __P((struct socket *, + struct tcpiphdr *, struct mbuf *)); +void tcp_quench __P((struct inpcb *, int)); +int tcp_reass __P((struct 
tcpcb *, struct tcpiphdr *, struct mbuf *)); +void tcp_respond __P((struct tcpcb *, + struct tcpiphdr *, struct mbuf *, u_long, u_long, int)); +void tcp_setpersist __P((struct tcpcb *)); +void tcp_slowtimo __P((void)); +struct tcpiphdr * + tcp_template __P((struct tcpcb *)); +struct tcpcb * + tcp_timers __P((struct tcpcb *, int)); +void tcp_trace __P((int, int, struct tcpcb *, struct tcpiphdr *, int)); +struct tcpcb * + tcp_usrclosed __P((struct tcpcb *)); +int tcp_usrreq __P((struct socket *, + int, struct mbuf *, struct mbuf *, struct mbuf *)); +void tcp_xmit_timer __P((struct tcpcb *, int)); +#endif diff --git a/sys/netinet/tcpip.h b/sys/netinet/tcpip.h new file mode 100644 index 00000000000..5000ae303ce --- /dev/null +++ b/sys/netinet/tcpip.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcpip.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Tcp+ip header, after ip options removed. + */ +struct tcpiphdr { + struct ipovly ti_i; /* overlaid ip structure */ + struct tcphdr ti_t; /* tcp header */ +}; +#define ti_next ti_i.ih_next +#define ti_prev ti_i.ih_prev +#define ti_x1 ti_i.ih_x1 +#define ti_pr ti_i.ih_pr +#define ti_len ti_i.ih_len +#define ti_src ti_i.ih_src +#define ti_dst ti_i.ih_dst +#define ti_sport ti_t.th_sport +#define ti_dport ti_t.th_dport +#define ti_seq ti_t.th_seq +#define ti_ack ti_t.th_ack +#define ti_x2 ti_t.th_x2 +#define ti_off ti_t.th_off +#define ti_flags ti_t.th_flags +#define ti_win ti_t.th_win +#define ti_sum ti_t.th_sum +#define ti_urp ti_t.th_urp diff --git a/sys/netinet/udp.h b/sys/netinet/udp.h new file mode 100644 index 00000000000..354a213cbc2 --- /dev/null +++ b/sys/netinet/udp.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)udp.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Udp protocol header. + * Per RFC 768, August, 1980.
+ */ +struct udphdr { + u_short uh_sport; /* source port */ + u_short uh_dport; /* destination port */ + short uh_ulen; /* udp length */ + u_short uh_sum; /* udp checksum */ +}; diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c new file mode 100644 index 00000000000..95b1895ac0a --- /dev/null +++ b/sys/netinet/udp_usrreq.c @@ -0,0 +1,640 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)udp_usrreq.c 8.4 (Berkeley) 1/21/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * UDP protocol implementation. + * Per RFC 768, August, 1980. + */ +#ifndef COMPAT_42 +int udpcksum = 1; +#else +int udpcksum = 0; /* XXX */ +#endif + +struct sockaddr_in udp_in = { sizeof(udp_in), AF_INET }; +struct inpcb *udp_last_inpcb = &udb; + +static void udp_detach __P((struct inpcb *)); +static void udp_notify __P((struct inpcb *, int)); +static struct mbuf *udp_saveopt __P((caddr_t, int, int)); + +void +udp_init() +{ + udb.inp_next = udb.inp_prev = &udb; +} + +void +udp_input(m, iphlen) + register struct mbuf *m; + int iphlen; +{ + register struct ip *ip; + register struct udphdr *uh; + register struct inpcb *inp; + struct mbuf *opts = 0; + int len; + struct ip save_ip; + + udpstat.udps_ipackets++; + + /* + * Strip IP options, if any; should skip this, + * make available to user, and use on returned packets, + * but we don't yet have a way to check the checksum + * with options still present. + */ + if (iphlen > sizeof (struct ip)) { + ip_stripoptions(m, (struct mbuf *)0); + iphlen = sizeof(struct ip); + } + + /* + * Get IP and UDP header together in first mbuf. 
+ */ + ip = mtod(m, struct ip *); + if (m->m_len < iphlen + sizeof(struct udphdr)) { + if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) { + udpstat.udps_hdrops++; + return; + } + ip = mtod(m, struct ip *); + } + uh = (struct udphdr *)((caddr_t)ip + iphlen); + + /* + * Make mbuf data length reflect UDP length. + * If not enough data to reflect UDP length, drop. + */ + len = ntohs((u_short)uh->uh_ulen); + if (ip->ip_len != len) { + if (len > ip->ip_len) { + udpstat.udps_badlen++; + goto bad; + } + m_adj(m, len - ip->ip_len); + /* ip->ip_len = len; */ + } + /* + * Save a copy of the IP header in case we want restore it + * for sending an ICMP error message in response. + */ + save_ip = *ip; + + /* + * Checksum extended UDP header and data. + */ + if (udpcksum && uh->uh_sum) { + ((struct ipovly *)ip)->ih_next = 0; + ((struct ipovly *)ip)->ih_prev = 0; + ((struct ipovly *)ip)->ih_x1 = 0; + ((struct ipovly *)ip)->ih_len = uh->uh_ulen; + if (uh->uh_sum = in_cksum(m, len + sizeof (struct ip))) { + udpstat.udps_badsum++; + m_freem(m); + return; + } + } + + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || + in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { + struct socket *last; + /* + * Deliver a multicast or broadcast datagram to *all* sockets + * for which the local and remote addresses and ports match + * those of the incoming datagram. This allows more than + * one process to receive multi/broadcasts on the same port. + * (This really ought to be done for unicast datagrams as + * well, but that would cause problems with existing + * applications that open both address-specific sockets and + * a wildcard socket listening to the same port -- they would + * end up receiving duplicates of every unicast datagram. + * Those applications open the multiple sockets to overcome an + * inadequacy of the UDP socket interface, but for backwards + * compatibility we avoid the problem here rather than + * fixing the interface. Maybe 4.5BSD will remedy this?) 
+ */ + + /* + * Construct sockaddr format source address. + */ + udp_in.sin_port = uh->uh_sport; + udp_in.sin_addr = ip->ip_src; + m->m_len -= sizeof (struct udpiphdr); + m->m_data += sizeof (struct udpiphdr); + /* + * Locate pcb(s) for datagram. + * (Algorithm copied from raw_intr().) + */ + last = NULL; + for (inp = udb.inp_next; inp != &udb; inp = inp->inp_next) { + if (inp->inp_lport != uh->uh_dport) + continue; + if (inp->inp_laddr.s_addr != INADDR_ANY) { + if (inp->inp_laddr.s_addr != + ip->ip_dst.s_addr) + continue; + } + if (inp->inp_faddr.s_addr != INADDR_ANY) { + if (inp->inp_faddr.s_addr != + ip->ip_src.s_addr || + inp->inp_fport != uh->uh_sport) + continue; + } + + if (last != NULL) { + struct mbuf *n; + + if ((n = m_copy(m, 0, M_COPYALL)) != NULL) { + if (sbappendaddr(&last->so_rcv, + (struct sockaddr *)&udp_in, + n, (struct mbuf *)0) == 0) { + m_freem(n); + udpstat.udps_fullsock++; + } else + sorwakeup(last); + } + } + last = inp->inp_socket; + /* + * Don't look for additional matches if this one does + * not have either the SO_REUSEPORT or SO_REUSEADDR + * socket options set. This heuristic avoids searching + * through all pcbs in the common case of a non-shared + * port. It * assumes that an application will never + * clear these options after setting them. + */ + if ((last->so_options&(SO_REUSEPORT|SO_REUSEADDR) == 0)) + break; + } + + if (last == NULL) { + /* + * No matching pcb found; discard datagram. + * (No need to send an ICMP Port Unreachable + * for a broadcast or multicast datgram.) + */ + udpstat.udps_noportbcast++; + goto bad; + } + if (sbappendaddr(&last->so_rcv, (struct sockaddr *)&udp_in, + m, (struct mbuf *)0) == 0) { + udpstat.udps_fullsock++; + goto bad; + } + sorwakeup(last); + return; + } + /* + * Locate pcb for datagram. 
+ */ + inp = udp_last_inpcb; + if (inp->inp_lport != uh->uh_dport || + inp->inp_fport != uh->uh_sport || + inp->inp_faddr.s_addr != ip->ip_src.s_addr || + inp->inp_laddr.s_addr != ip->ip_dst.s_addr) { + inp = in_pcblookup(&udb, ip->ip_src, uh->uh_sport, + ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD); + if (inp) + udp_last_inpcb = inp; + udpstat.udpps_pcbcachemiss++; + } + if (inp == 0) { + udpstat.udps_noport++; + if (m->m_flags & (M_BCAST | M_MCAST)) { + udpstat.udps_noportbcast++; + goto bad; + } + *ip = save_ip; + ip->ip_len += iphlen; + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); + return; + } + + /* + * Construct sockaddr format source address. + * Stuff source address and datagram in user buffer. + */ + udp_in.sin_port = uh->uh_sport; + udp_in.sin_addr = ip->ip_src; + if (inp->inp_flags & INP_CONTROLOPTS) { + struct mbuf **mp = &opts; + + if (inp->inp_flags & INP_RECVDSTADDR) { + *mp = udp_saveopt((caddr_t) &ip->ip_dst, + sizeof(struct in_addr), IP_RECVDSTADDR); + if (*mp) + mp = &(*mp)->m_next; + } +#ifdef notyet + /* options were tossed above */ + if (inp->inp_flags & INP_RECVOPTS) { + *mp = udp_saveopt((caddr_t) opts_deleted_above, + sizeof(struct in_addr), IP_RECVOPTS); + if (*mp) + mp = &(*mp)->m_next; + } + /* ip_srcroute doesn't do what we want here, need to fix */ + if (inp->inp_flags & INP_RECVRETOPTS) { + *mp = udp_saveopt((caddr_t) ip_srcroute(), + sizeof(struct in_addr), IP_RECVRETOPTS); + if (*mp) + mp = &(*mp)->m_next; + } +#endif + } + iphlen += sizeof(struct udphdr); + m->m_len -= iphlen; + m->m_pkthdr.len -= iphlen; + m->m_data += iphlen; + if (sbappendaddr(&inp->inp_socket->so_rcv, (struct sockaddr *)&udp_in, + m, opts) == 0) { + udpstat.udps_fullsock++; + goto bad; + } + sorwakeup(inp->inp_socket); + return; +bad: + m_freem(m); + if (opts) + m_freem(opts); +} + +/* + * Create a "control" mbuf containing the specified data + * with the specified type for presentation with a datagram. 
+ */ +struct mbuf * +udp_saveopt(p, size, type) + caddr_t p; + register int size; + int type; +{ + register struct cmsghdr *cp; + struct mbuf *m; + + if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) + return ((struct mbuf *) NULL); + cp = (struct cmsghdr *) mtod(m, struct cmsghdr *); + bcopy(p, CMSG_DATA(cp), size); + size += sizeof(*cp); + m->m_len = size; + cp->cmsg_len = size; + cp->cmsg_level = IPPROTO_IP; + cp->cmsg_type = type; + return (m); +} + +/* + * Notify a udp user of an asynchronous error; + * just wake up so that he can collect error status. + */ +static void +udp_notify(inp, errno) + register struct inpcb *inp; + int errno; +{ + inp->inp_socket->so_error = errno; + sorwakeup(inp->inp_socket); + sowwakeup(inp->inp_socket); +} + +void +udp_ctlinput(cmd, sa, ip) + int cmd; + struct sockaddr *sa; + register struct ip *ip; +{ + register struct udphdr *uh; + extern struct in_addr zeroin_addr; + extern u_char inetctlerrmap[]; + + if (!PRC_IS_REDIRECT(cmd) && + ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)) + return; + if (ip) { + uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); + in_pcbnotify(&udb, sa, uh->uh_dport, ip->ip_src, uh->uh_sport, + cmd, udp_notify); + } else + in_pcbnotify(&udb, sa, 0, zeroin_addr, 0, cmd, udp_notify); +} + +int +udp_output(inp, m, addr, control) + register struct inpcb *inp; + register struct mbuf *m; + struct mbuf *addr, *control; +{ + register struct udpiphdr *ui; + register int len = m->m_pkthdr.len; + struct in_addr laddr; + int s, error = 0; + + if (control) + m_freem(control); /* XXX */ + + if (addr) { + laddr = inp->inp_laddr; + if (inp->inp_faddr.s_addr != INADDR_ANY) { + error = EISCONN; + goto release; + } + /* + * Must block input while temporarily connected. 
+ */ + s = splnet(); + error = in_pcbconnect(inp, addr); + if (error) { + splx(s); + goto release; + } + } else { + if (inp->inp_faddr.s_addr == INADDR_ANY) { + error = ENOTCONN; + goto release; + } + } + /* + * Calculate data length and get a mbuf + * for UDP and IP headers. + */ + M_PREPEND(m, sizeof(struct udpiphdr), M_DONTWAIT); + if (m == 0) { + error = ENOBUFS; + goto release; + } + + /* + * Fill in mbuf with extended UDP header + * and addresses and length put into network format. + */ + ui = mtod(m, struct udpiphdr *); + ui->ui_next = ui->ui_prev = 0; + ui->ui_x1 = 0; + ui->ui_pr = IPPROTO_UDP; + ui->ui_len = htons((u_short)len + sizeof (struct udphdr)); + ui->ui_src = inp->inp_laddr; + ui->ui_dst = inp->inp_faddr; + ui->ui_sport = inp->inp_lport; + ui->ui_dport = inp->inp_fport; + ui->ui_ulen = ui->ui_len; + + /* + * Stuff checksum and output datagram. + */ + ui->ui_sum = 0; + if (udpcksum) { + if ((ui->ui_sum = in_cksum(m, sizeof (struct udpiphdr) + len)) == 0) + ui->ui_sum = 0xffff; + } + ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len; + ((struct ip *)ui)->ip_ttl = inp->inp_ip.ip_ttl; /* XXX */ + ((struct ip *)ui)->ip_tos = inp->inp_ip.ip_tos; /* XXX */ + udpstat.udps_opackets++; + error = ip_output(m, inp->inp_options, &inp->inp_route, + inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST), + inp->inp_moptions); + + if (addr) { + in_pcbdisconnect(inp); + inp->inp_laddr = laddr; + splx(s); + } + return (error); + +release: + m_freem(m); + return (error); +} + +u_long udp_sendspace = 9216; /* really max datagram size */ +u_long udp_recvspace = 40 * (1024 + sizeof(struct sockaddr_in)); + /* 40 1K datagrams */ + +/*ARGSUSED*/ +int +udp_usrreq(so, req, m, addr, control) + struct socket *so; + int req; + struct mbuf *m, *addr, *control; +{ + struct inpcb *inp = sotoinpcb(so); + int error = 0; + int s; + + if (req == PRU_CONTROL) + return (in_control(so, (int)m, (caddr_t)addr, + (struct ifnet *)control)); + if (inp == NULL && req != 
PRU_ATTACH) { + error = EINVAL; + goto release; + } + /* + * Note: need to block udp_input while changing + * the udp pcb queue and/or pcb addresses. + */ + switch (req) { + + case PRU_ATTACH: + if (inp != NULL) { + error = EINVAL; + break; + } + s = splnet(); + error = in_pcballoc(so, &udb); + splx(s); + if (error) + break; + error = soreserve(so, udp_sendspace, udp_recvspace); + if (error) + break; + ((struct inpcb *) so->so_pcb)->inp_ip.ip_ttl = ip_defttl; + break; + + case PRU_DETACH: + udp_detach(inp); + break; + + case PRU_BIND: + s = splnet(); + error = in_pcbbind(inp, addr); + splx(s); + break; + + case PRU_LISTEN: + error = EOPNOTSUPP; + break; + + case PRU_CONNECT: + if (inp->inp_faddr.s_addr != INADDR_ANY) { + error = EISCONN; + break; + } + s = splnet(); + error = in_pcbconnect(inp, addr); + splx(s); + if (error == 0) + soisconnected(so); + break; + + case PRU_CONNECT2: + error = EOPNOTSUPP; + break; + + case PRU_ACCEPT: + error = EOPNOTSUPP; + break; + + case PRU_DISCONNECT: + if (inp->inp_faddr.s_addr == INADDR_ANY) { + error = ENOTCONN; + break; + } + s = splnet(); + in_pcbdisconnect(inp); + inp->inp_laddr.s_addr = INADDR_ANY; + splx(s); + so->so_state &= ~SS_ISCONNECTED; /* XXX */ + break; + + case PRU_SHUTDOWN: + socantsendmore(so); + break; + + case PRU_SEND: + return (udp_output(inp, m, addr, control)); + + case PRU_ABORT: + soisdisconnected(so); + udp_detach(inp); + break; + + case PRU_SOCKADDR: + in_setsockaddr(inp, addr); + break; + + case PRU_PEERADDR: + in_setpeeraddr(inp, addr); + break; + + case PRU_SENSE: + /* + * stat: don't bother with a blocksize. 
+ */ + return (0); + + case PRU_SENDOOB: + case PRU_FASTTIMO: + case PRU_SLOWTIMO: + case PRU_PROTORCV: + case PRU_PROTOSEND: + error = EOPNOTSUPP; + break; + + case PRU_RCVD: + case PRU_RCVOOB: + return (EOPNOTSUPP); /* do not free mbuf's */ + + default: + panic("udp_usrreq"); + } + +release: + if (control) { + printf("udp control data unexpectedly retained\n"); + m_freem(control); + } + if (m) + m_freem(m); + return (error); +} + +static void +udp_detach(inp) + struct inpcb *inp; +{ + int s = splnet(); + + if (inp == udp_last_inpcb) + udp_last_inpcb = &udb; + in_pcbdetach(inp); + splx(s); +} + +/* + * Sysctl for udp variables. + */ +udp_sysctl(name, namelen, oldp, oldlenp, newp, newlen) + int *name; + u_int namelen; + void *oldp; + size_t *oldlenp; + void *newp; + size_t newlen; +{ + /* All sysctl names at this level are terminal. */ + if (namelen != 1) + return (ENOTDIR); + + switch (name[0]) { + case UDPCTL_CHECKSUM: + return (sysctl_int(oldp, oldlenp, newp, newlen, &udpcksum)); + default: + return (ENOPROTOOPT); + } + /* NOTREACHED */ +} diff --git a/sys/netinet/udp_var.h b/sys/netinet/udp_var.h new file mode 100644 index 00000000000..e8a21d261c5 --- /dev/null +++ b/sys/netinet/udp_var.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)udp_var.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * UDP kernel structures and variables. 
+ */ +struct udpiphdr { + struct ipovly ui_i; /* overlaid ip structure */ + struct udphdr ui_u; /* udp header */ +}; +#define ui_next ui_i.ih_next +#define ui_prev ui_i.ih_prev +#define ui_x1 ui_i.ih_x1 +#define ui_pr ui_i.ih_pr +#define ui_len ui_i.ih_len +#define ui_src ui_i.ih_src +#define ui_dst ui_i.ih_dst +#define ui_sport ui_u.uh_sport +#define ui_dport ui_u.uh_dport +#define ui_ulen ui_u.uh_ulen +#define ui_sum ui_u.uh_sum + +struct udpstat { + /* input statistics: */ + u_long udps_ipackets; /* total input packets */ + u_long udps_hdrops; /* packet shorter than header */ + u_long udps_badsum; /* checksum error */ + u_long udps_badlen; /* data length larger than packet */ + u_long udps_noport; /* no socket on port */ + u_long udps_noportbcast; /* of above, arrived as broadcast */ + u_long udps_fullsock; /* not delivered, input socket full */ + u_long udpps_pcbcachemiss; /* input packets missing pcb cache */ + /* output statistics: */ + u_long udps_opackets; /* total output packets */ +}; + +/* + * Names for UDP sysctl objects + */ +#define UDPCTL_CHECKSUM 1 /* checksum UDP packets */ +#define UDPCTL_MAXID 2 + +#define UDPCTL_NAMES { \ + { 0, 0 }, \ + { "checksum", CTLTYPE_INT }, \ +} + +#ifdef KERNEL +struct inpcb udb; +struct udpstat udpstat; + +void udp_ctlinput __P((int, struct sockaddr *, struct ip *)); +void udp_init __P((void)); +void udp_input __P((struct mbuf *, int)); +int udp_output __P((struct inpcb *, + struct mbuf *, struct mbuf *, struct mbuf *)); +int udp_sysctl __P((int *, u_int, void *, size_t *, void *, size_t)); +int udp_usrreq __P((struct socket *, + int, struct mbuf *, struct mbuf *, struct mbuf *)); +#endif diff --git a/sys/netiso/argo_debug.h b/sys/netiso/argo_debug.h new file mode 100644 index 00000000000..653982f005a --- /dev/null +++ b/sys/netiso/argo_debug.h @@ -0,0 +1,296 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)argo_debug.h 8.1 (Berkeley) 6/10/93 + */ + +/***************************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * $Header: argo_debug.h,v 4.6 88/07/19 15:53:40 hagens Exp $ + * $Source: /usr/argo/sys/netiso/RCS/argo_debug.h,v $ + */ + +#ifndef __ARGO_DEBUG__ +#define __ARGO_DEBUG__ + +#define dump_buf(a, b) Dump_buf((caddr_t)(a), (int)(b)) + +/*********************************************** + * Lint stuff + **********************************************/ +#if defined(lint) +/* + * lint can't handle the flaky vacuous definitions + * of IFDEBUG, ENDDEBUG, etc. 
+ */ +#endif /* defined(lint) */ + +/*********************************************** + * DEBUG ON: + **********************************************/ +#ifndef ARGO_DEBUG +#define ARGO_DEBUG +#endif /* ARGO_DEBUG */ + + +#ifdef ARGO_DEBUG +/* + #ifndef TPPT + #define TPPT + #endif TPPT + + #ifndef TP_PERF_MEAS + #define TP_PERF_MEAS + #endif TP_PERF_MEAS +*/ + +unsigned char argo_debug[128]; + +/* + * IFDEBUG(ascii) ... ENDDEBUG brackets statements that run only when + * argo_debug[ascii] is nonzero: IFDEBUG opens an if-block and ENDDEBUG + * closes it, so the two must always be used as a pair. + */ +#define IFDEBUG(ascii) \ + if(argo_debug[ascii]) { +#define ENDDEBUG ; } + +#else /* ARGO_DEBUG */ + +/*********************************************** + * DEBUG OFF: + **********************************************/ + +#ifndef STAR +#define STAR * +#endif /* STAR */ +#define IFDEBUG(ascii) //*beginning of comment*/STAR +#define ENDDEBUG STAR/*end of comment*// + +#endif /* ARGO_DEBUG */ + +/*********************************************** + * ASSERT + **********************************************/ +#ifdef ARGO_DEBUG + +#ifndef lint +/* + * Print a diagnostic when phrase is false. Wrapped in do/while so the + * macro is a single statement and cannot capture a following else. + */ +#define ASSERT(phrase) \ + do { \ + if( !(phrase) ) \ + printf("ASSERTION NOT VALID at line %d file %s\n",__LINE__,__FILE__); \ + } while (0) +#else /* lint */ +#define ASSERT(phrase) /* phrase */ +#endif /* lint */ + +#else /* ARGO_DEBUG */ + +#define ASSERT(phrase) /* phrase */ + +#endif /* ARGO_DEBUG */ + + +/*********************************************** + * CLNP DEBUG OPTIONS + **********************************************/ +#define D_INPUT '\1' +/* clnp input */ +#define D_OUTPUT '\2' +/* clnp output */ +#define D_ROUTE '\3' +/* clnp routing */ +#define D_CTLINPUT '\4' +/* clnp control input */ +#define D_CTLOUTPUT '\5' +/* clnp control output */ +#define D_OPTIONS '\6' +/* clnp options */ +#define D_IOCTL '\7' +/* iso ioctls */ +#define D_ETHER '\10' +/* clnp over ethernet */ +#define D_TOKEN '\11' +/* clnp over token ring */ +#define D_ADCOM '\12' +/* clnp over the adcom */ +#define D_ISO '\13' +/* iso address family */ +#define D_FORWARD '\14' +/* clnp forwarding */ +#define D_DUMPOUT '\15' +/* dump clnp outgoing packets */ +#define D_DUMPIN '\16' +/* dump clnp input
packets */ +#define D_DISCARD '\17' +/* debug clnp packet discard/er function */ +#define D_FRAG '\20' +/* clnp fragmentation */ +#define D_REASS '\21' +/* clnp reassembly */ + +char *clnp_iso_addrp(); + +/*********************************************** + * ESIS DEBUG OPTIONS + **********************************************/ +#define D_ESISOUTPUT '\30' +#define D_ESISINPUT '\31' +#define D_SNPA '\32' + +/*********************************************** + * ISIS DEBUG OPTIONS + **********************************************/ +#define D_ISISOUTPUT '\40' +#define D_ISISINPUT '\41' + +/*********************************************** + * EON DEBUG OPTION + **********************************************/ +#define D_EON '\57' + +/*********************************************** + * CONS DEBUG OPTIONS + **********************************************/ + +#define D_ECNWORK '\60' +#define D_ECNOUT '\61' +#define D_ECNFIN '\62' +#define D_ECNDWN '\63' +#define D_ECNUTIL '\64' + +#define D_INCOMING '\70' +#define D_CDATA '\71' +#define D_CFIND '\72' +#define D_CDUMP_REQ '\73' +#define D_CADDR '\74' +#define D_CCONS '\75' +#define D_CCONN '\76' + + +/*********************************************** + * TP DEBUG OPTIONS + **********************************************/ + +#define D_SETPARAMS '\137' +#define D_RTT '\140' + +#define D_ACKRECV '\141' +#define D_ACKSEND '\142' +#define D_CONN '\143' +#define D_CREDIT '\144' +#define D_DATA '\145' +#define D_DRIVER '\146' + +#define D_EMIT '\147' +#define D_ERROR_EMIT '\150' +#define D_TPINPUT '\151' +#define D_INDICATION '\152' +#define D_CHKSUM '\153' + +#define D_RENEG '\154' +#define D_PERF_MEAS '\155' +#define D_MBUF_MEAS '\156' +#define D_RTC '\157' +#define D_SB '\160' + +#define D_DISASTER_CHECK '\161' +#define D_REQUEST '\162' +#define D_STASH '\163' +#define D_NEWSOCK '\164' +#define D_TIMER '\165' + +#define D_TPIOCTL '\166' +#define D_SIZE_CHECK '\167' +#define D_2ER '\170' +#define D_DISASTER_CHECK_W '\171' + +#define D_XPD 
'\172' +#define D_SYSCALL '\173' +#define D_DROP '\174' +#define D_ZDREF '\175' +#define D_TPISO '\176' +#define D_QUENCH '\177' + +void dump_mbuf(); + +/*********************************************** + * New mbuf types for debugging w/ netstat -m + * This messes up 4.4 malloc for now. need bigger + * mbtypes array for now. + **********************************************/ +#ifdef notdef + +#define TPMT_DATA 0x21 +#define TPMT_RCVRTC 0x42 +#define TPMT_SNDRTC 0x41 +#define TPMT_TPHDR 0x22 +#define TPMT_IPHDR 0x32 +#define TPMT_SONAME 0x28 +#define TPMT_EOT 0x40 +#define TPMT_XPD 0x44 +#define TPMT_PCB 0x23 +#define TPMT_PERF 0x45 + +#else /* notdef */ + +#define TPMT_DATA MT_DATA +#define TPMT_RCVRTC MT_DATA +#define TPMT_SNDRTC MT_DATA +#define TPMT_IPHDR MT_HEADER +#define TPMT_TPHDR MT_HEADER +#define TPMT_SONAME MT_SONAME +/* MT_EOT and MT_XPD are defined in tp_param.h */ +#define TPMT_XPD MT_OOBDATA +#define TPMT_PCB MT_PCB +#define TPMT_PERF MT_PCB + +#endif /* notdef */ + +#endif /* __ARGO_DEBUG__ */ diff --git a/sys/netiso/clnl.h b/sys/netiso/clnl.h new file mode 100644 index 00000000000..87227dc273b --- /dev/null +++ b/sys/netiso/clnl.h @@ -0,0 +1,64 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)clnl.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. 
+ +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +struct clnl_protosw { + int (*clnl_input)(); /* input routine */ +}; diff --git a/sys/netiso/clnp.h b/sys/netiso/clnp.h new file mode 100644 index 00000000000..4c81ba37d9c --- /dev/null +++ b/sys/netiso/clnp.h @@ -0,0 +1,463 @@ +/*- + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)clnp.h 8.2 (Berkeley) 4/16/94 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+ +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* $Header: /big/BSD4.4/isis-usr/src/sys/netiso/RCS/clnp.h,v 1.1 1992/02/07 18:14:59 hagens Exp hagens $ */ +/* $Source: /big/BSD4.4/isis-usr/src/sys/netiso/RCS/clnp.h,v $ */ + +/* should be config option but cpp breaks with too many #defines */ +#define DECBIT + +/* + * Return true if the mbuf is a cluster mbuf + */ +#define IS_CLUSTER(m) ((m)->m_flags & M_EXT) + +/* + * Move the halfword into the two characters. + * Wrapped in do/while so both assignments stay together when the + * macro is the body of an unbraced if or loop. + */ +#define HTOC(msb, lsb, hword)\ + do {\ + (msb) = (u_char)((hword) >> 8);\ + (lsb) = (u_char)((hword) & 0xff);\ + } while (0) +/* + * Move the two characters into the halfword + */ +#define CTOH(msb, lsb, hword)\ + (hword) = ((msb) << 8) | (lsb) + +/* + * Return true if the checksum has been set - ie. the checksum is + * not zero + */ +#define CKSUM_REQUIRED(clnp)\ + (((clnp)->cnf_cksum_msb != 0) || ((clnp)->cnf_cksum_lsb != 0)) + +/* + * Fixed part of clnp header + */ +struct clnp_fixed { + u_char cnf_proto_id; /* network layer protocol identifier */ + u_char cnf_hdr_len; /* length indicator (octets) */ + u_char cnf_vers; /* version/protocol identifier extension */ + u_char cnf_ttl; /* lifetime (500 milliseconds) */ + u_char cnf_type; /* type code */ + /* Includes err_ok, more_segs, and seg_ok */ + u_char cnf_seglen_msb; /* pdu segment length (octets) high byte */ + u_char cnf_seglen_lsb; /* pdu segment length (octets) low byte */ + u_char cnf_cksum_msb; /* checksum high byte */ + u_char cnf_cksum_lsb; /* checksum low byte */ +}; +#define CNF_TYPE 0x1f +#define CNF_ERR_OK 0x20 +#define CNF_MORE_SEGS 0x40 +#define CNF_SEG_OK 0x80 + +#define CLNP_CKSUM_OFF 0x07 /* offset of checksum */ + +#define clnl_fixed clnp_fixed + +/* + * Segmentation part of clnp header + */ +struct clnp_segment { + u_short cng_id; /* data unit identifier */ + u_short cng_off; /* segment offset */ + u_short cng_tot_len; /* total length */ +}; +
+/* + * Clnp fragment reassembly structures: + * + * All packets undergoing reassembly are linked together in + * clnp_fragl structures. Each clnp_fragl structure contains a + * pointer to the original clnp packet header, as well as a + * list of packet fragments. Each packet fragment + * is headed by a clnp_frag structure. This structure contains the + * offset of the first and last byte of the fragment, as well as + * a pointer to the data (an mbuf chain) of the fragment. + */ + +/* + * NOTE: + * The clnp_frag structure is stored in an mbuf immediately preceding + * the fragment data. Since there are words in this struct, + * it must be word aligned. + * + * NOTE: + * All the fragment code assumes that the entire clnp header is + * contained in the first mbuf. + */ +struct clnp_frag { + u_int cfr_first; /* offset of first byte of this frag */ + u_int cfr_last; /* offset of last byte of this frag */ + u_int cfr_bytes; /* bytes to shave to get to data */ + struct mbuf *cfr_data; /* ptr to data for this frag */ + struct clnp_frag *cfr_next; /* next fragment in list */ +}; + +struct clnp_fragl { + struct iso_addr cfl_src; /* source of the pkt */ + struct iso_addr cfl_dst; /* destination of the pkt */ + u_short cfl_id; /* id of the pkt */ + u_char cfl_ttl; /* current ttl of pkt */ + u_short cfl_last; /* offset of last byte of packet */ + struct mbuf *cfl_orighdr; /* ptr to original header */ + struct clnp_frag *cfl_frags; /* linked list of fragments for pkt */ + struct clnp_fragl *cfl_next; /* next pkt being reassembled */ +}; + +/* + * The following structure is used to index into an options section + * of a clnp datagram. These values can be used without worry that + * offset or length fields are invalid or too big, etc. That is, + * the consistency of the options will be guaranteed before this + * structure is filled in. Any pointer (field ending in p) is + * actually the offset from the beginning of the mbuf the option + * is contained in.
A value of NULL for any pointer + * means that the option is not present. The length of any option + * does not include the option code or option length fields. + */ +struct clnp_optidx { + u_short cni_securep; /* ptr to beginning of security option */ + char cni_secure_len; /* length of entire security option */ + + u_short cni_srcrt_s; /* offset of start of src rt option */ + u_short cni_srcrt_len; /* length of entire src rt option */ + + u_short cni_recrtp; /* ptr to beginning of recrt option */ + char cni_recrt_len; /* length of entire recrt option */ + + char cni_priorp; /* ptr to priority option */ + + u_short cni_qos_formatp; /* ptr to format of qos option */ + char cni_qos_len; /* length of entire qos option */ + + u_char cni_er_reason; /* reason from ER pdu option */ + + /* ESIS options */ + + u_short cni_esct; /* value from ISH ESCT option */ + + u_short cni_netmaskp; /* ptr to beginning of netmask option */ + char cni_netmask_len; /* length of entire netmask option */ + + u_short cni_snpamaskp; /* ptr to beginning of snpamask option */ + char cni_snpamask_len; /* length of entire snpamask option */ + +}; + +#define ER_INVALREAS 0xff /* code for invalid ER pdu discard reason */ + +/* + * In the macros below every argument is parenthesized where it is used, + * so that an expression passed as an argument is evaluated as a unit. + */ + +/* given an mbuf and addr of option, return offset from data of mbuf */ +#define CLNP_OPTTOOFF(m, opt)\ + ((u_short) ((opt) - mtod(m, caddr_t))) + +/* given an mbuf and offset of option, return address of option */ +#define CLNP_OFFTOOPT(m, off)\ + ((caddr_t) (mtod(m, caddr_t) + (off))) + +/* return true iff src route is valid */ +#define CLNPSRCRT_VALID(oidx)\ + ((oidx) && ((oidx)->cni_srcrt_s)) + +/* return the offset field of the src rt */ +#define CLNPSRCRT_OFF(oidx, options)\ + (*((u_char *)(CLNP_OFFTOOPT(options, (oidx)->cni_srcrt_s) + 1))) + +/* return the type field of the src rt */ +#define CLNPSRCRT_TYPE(oidx, options)\ + ((u_char)(*(CLNP_OFFTOOPT(options, (oidx)->cni_srcrt_s)))) + +/* return the length of the current address */ +#define CLNPSRCRT_CLEN(oidx, options)\ +
((u_char)(*(CLNP_OFFTOOPT(options, oidx->cni_srcrt_s) + CLNPSRCRT_OFF(oidx, options) - 1))) + +/* return the address of the current address */ +#define CLNPSRCRT_CADDR(oidx, options)\ + ((caddr_t)(CLNP_OFFTOOPT(options, oidx->cni_srcrt_s) + CLNPSRCRT_OFF(oidx, options))) + +/* + * return true if the src route has run out of routes + * this is true if the offset of next route is greater than the end of the rt + */ +#define CLNPSRCRT_TERM(oidx, options)\ + (CLNPSRCRT_OFF(oidx, options) > oidx->cni_srcrt_len) + +/* + * Options a user can set/get + */ +#define CLNPOPT_FLAGS 0x01 /* flags: seg permitted, no er xmit, etc */ +#define CLNPOPT_OPTS 0x02 /* datagram options */ + +/* + * Values for particular datagram options + */ +#define CLNPOVAL_PAD 0xcc /* padding */ +#define CLNPOVAL_SECURE 0xc5 /* security */ +#define CLNPOVAL_SRCRT 0xc8 /* source routing */ +#define CLNPOVAL_RECRT 0xcb /* record route */ +#define CLNPOVAL_QOS 0xc3 /* quality of service */ +#define CLNPOVAL_PRIOR 0xcd /* priority */ +#define CLNPOVAL_ERREAS 0xc1 /* ER PDU ONLY: reason for discard */ + +#define CLNPOVAL_SRCSPEC 0x40 /* source address specific */ +#define CLNPOVAL_DSTSPEC 0x80 /* destination address specific */ +#define CLNPOVAL_GLOBAL 0xc0 /* globally unique */ + +/* Globally Unique QOS */ +#define CLNPOVAL_SEQUENCING 0x10 /* sequencing preferred */ +#define CLNPOVAL_CONGESTED 0x08 /* congestion experienced */ +#define CLNPOVAL_LOWDELAY 0x04 /* low transit delay */ + +#define CLNPOVAL_PARTRT 0x00 /* partial source routing */ +#define CLNPOVAL_COMPRT 0x01 /* complete source routing */ + +/* + * Clnp flags used in a control block flags field. 
+ * NOTE: these must be out of the range of bits defined in ../net/raw_cb.h + */ +#define CLNP_NO_SEG 0x010 /* segmentation not permitted */ +#define CLNP_NO_ER 0x020 /* do not generate ERs */ +#define CLNP_SEND_RAW 0x080 /* send pkt as RAW DT rather than TP DT */ +#define CLNP_NO_CKSUM 0x100 /* don't use clnp checksum */ +#define CLNP_ECHO 0x200 /* send echo request */ +#define CLNP_NOCACHE 0x400 /* don't store cache information */ +#define CLNP_ECHOR 0x800 /* send echo reply */ + +/* valid clnp flags */ +#define CLNP_VFLAGS (CLNP_SEND_RAW|CLNP_NO_SEG|CLNP_NO_ER|CLNP_NO_CKSUM\ + |CLNP_ECHO|CLNP_NOCACHE|CLNP_ECHOR) + +/* + * Constants used by clnp + */ +#define CLNP_HDR_MIN (sizeof (struct clnp_fixed)) +#define CLNP_HDR_MAX (254) +#define CLNP_TTL_UNITS 2 /* 500 milliseconds */ +/* parenthesized so the product stays one operand inside larger expressions */ +#define CLNP_TTL (15*CLNP_TTL_UNITS) /* time to live: 15 seconds, in 500 ms units */ +#define ISO8473_V1 0x01 + +/* + * Clnp packet types + * In order to test raw clnp and tp/clnp simultaneously, a third type of + * packet has been defined: CLNP_RAW. This is done so that the input + * routine can switch to the correct input routine (rclnp_input or + * tpclnp_input) based on the type field. If clnp had a higher level protocol + * field, this would not be necessary.
+ */ +#define CLNP_DT 0x1C /* normal data */ +#define CLNP_ER 0x01 /* error report */ +#define CLNP_RAW 0x1D /* debug only */ +#define CLNP_EC 0x1E /* echo packet */ +#define CLNP_ECR 0x1F /* echo reply */ + +/* + * ER pdu error codes + */ +#define GEN_NOREAS 0x00 /* reason not specified */ +#define GEN_PROTOERR 0x01 /* protocol procedure error */ +#define GEN_BADCSUM 0x02 /* incorrect checksum */ +#define GEN_CONGEST 0x03 /* pdu discarded due to congestion */ +#define GEN_HDRSYNTAX 0x04 /* header syntax error */ +#define GEN_SEGNEEDED 0x05 /* segmentation needed, but not permitted */ +#define GEN_INCOMPLETE 0x06 /* incomplete pdu received */ +#define GEN_DUPOPT 0x07 /* duplicate option */ + +/* address errors */ +#define ADDR_DESTUNREACH 0x80 /* destination address unreachable */ +#define ADDR_DESTUNKNOWN 0x81 /* destination address unknown */ + +/* source routing */ +#define SRCRT_UNSPECERR 0x90 /* unspecified src rt error */ +#define SRCRT_SYNTAX 0x91 /* syntax error in src rt field */ +#define SRCRT_UNKNOWNADDR 0x92 /* unknown addr in src rt field */ +#define SRCRT_BADPATH 0x93 /* path not acceptable */ + +/* lifetime */ +#define TTL_EXPTRANSIT 0xa0 /* lifetime expired during transit */ +#define TTL_EXPREASS 0xa1 /* lifetime expired during reassembly */ + +/* pdu discarded */ +#define DISC_UNSUPPOPT 0xb0 /* unsupported option not specified? 
*/ +#define DISC_UNSUPPVERS 0xb1 /* unsupported protocol version */ +#define DISC_UNSUPPSECURE 0xb2 /* unsupported security option */ +#define DISC_UNSUPPSRCRT 0xb3 /* unsupported src rt option */ +#define DISC_UNSUPPRECRT 0xb4 /* unsupported rec rt option */ + +/* reassembly */ +#define REASS_INTERFERE 0xc0 /* reassembly interference */ +#define CLNP_ERRORS 22 + + +#ifdef KERNEL +int clnp_er_index(); +#endif + +#ifdef CLNP_ER_CODES +u_char clnp_er_codes[CLNP_ERRORS] = { +GEN_NOREAS, GEN_PROTOERR, GEN_BADCSUM, GEN_CONGEST, +GEN_HDRSYNTAX, GEN_SEGNEEDED, GEN_INCOMPLETE, GEN_DUPOPT, +ADDR_DESTUNREACH, ADDR_DESTUNKNOWN, +SRCRT_UNSPECERR, SRCRT_SYNTAX, SRCRT_UNKNOWNADDR, SRCRT_BADPATH, +TTL_EXPTRANSIT, TTL_EXPREASS, +DISC_UNSUPPOPT, DISC_UNSUPPVERS, DISC_UNSUPPSECURE, +DISC_UNSUPPSRCRT, DISC_UNSUPPRECRT, REASS_INTERFERE }; +#endif + +#ifdef TROLL + +#define TR_DUPEND 0x01 /* duplicate end of fragment */ +#define TR_DUPPKT 0x02 /* duplicate entire packet */ +#define TR_DROPPKT 0x04 /* drop packet on output */ +#define TR_TRIM 0x08 /* trim bytes from packet */ +#define TR_CHANGE 0x10 /* change bytes in packet */ +#define TR_MTU 0x20 /* delta to change device mtu */ +#define TR_CHUCK 0x40 /* drop packet in rclnp_input */ +#define TR_BLAST 0x80 /* force rclnp_output to blast many packets */ +#define TR_RAWLOOP 0x100 /* make if_loop call clnpintr directly */ +struct troll { + int tr_ops; /* operations to perform */ + float tr_dup_size; /* % to duplicate */ + float tr_dup_freq; /* frequency to duplicate packets */ + float tr_drop_freq; /* frequency to drop packets */ + int tr_mtu_adj; /* delta to adjust if mtu */ + int tr_blast_cnt; /* # of pkts to blast out */ +}; + +#define SN_OUTPUT(clcp, m)\ + troll_output(clcp->clc_ifp, m, clcp->clc_firsthop, clcp->clc_rt) + +#define SN_MTU(ifp, rt) (((rt && rt->rt_rmx.rmx_mtu) ?\ + rt->rt_rmx.rmx_mtu : clnp_badmtu(ifp, rt, __LINE__, __FILE__))\ + - trollctl.tr_mtu_adj) + +#ifdef KERNEL +extern float troll_random; +#endif + +#else /* NO
TROLL */ + +#define SN_OUTPUT(clcp, m)\ + (*clcp->clc_ifp->if_output)(clcp->clc_ifp, m, clcp->clc_firsthop, clcp->clc_rt) + +#define SN_MTU(ifp, rt) (((rt && rt->rt_rmx.rmx_mtu) ?\ + rt->rt_rmx.rmx_mtu : clnp_badmtu(ifp, rt, __LINE__, __FILE__))) + +#endif /* TROLL */ + +/* + * Macro to remove an address from a clnp header + */ +#define CLNP_EXTRACT_ADDR(isoa, hoff, hend)\ + {\ + isoa.isoa_len = (u_char)*hoff;\ + if ((((++hoff) + isoa.isoa_len) > hend) ||\ + (isoa.isoa_len > 20) || (isoa.isoa_len == 0)) {\ + hoff = (caddr_t)0;\ + } else {\ + (void) bcopy(hoff, (caddr_t)isoa.isoa_genaddr, isoa.isoa_len);\ + hoff += isoa.isoa_len;\ + }\ + } + +/* + * Macro to insert an address into a clnp header + */ +#define CLNP_INSERT_ADDR(hoff, isoa)\ + *hoff++ = (isoa).isoa_len;\ + (void) bcopy((caddr_t)((isoa).isoa_genaddr), hoff, (isoa).isoa_len);\ + hoff += (isoa).isoa_len; + +/* + * Clnp hdr cache. Whenever a clnp packet is sent, a copy of the + * header is made and kept in this cache. In addition to a copy of + * the cached clnp hdr, the cache contains + * information necessary to determine whether the new packet + * to send requires a new header to be built. + */ +struct clnp_cache { + /* these fields are used to check the validity of the cache */ + struct iso_addr clc_dst; /* destination of packet */ + struct mbuf *clc_options; /* ptr to options mbuf */ + int clc_flags; /* flags passed to clnp_output */ + + /* these fields are state that clnp_output requires to finish the pkt */ + int clc_segoff; /* offset of seg part of header */ + struct rtentry *clc_rt; /* ptr to rtentry (points into + the route structure) */ + struct sockaddr *clc_firsthop; /* first hop of packet */ + struct ifnet *clc_ifp; /* ptr to interface structure */ + struct iso_ifaddr *clc_ifa; /* ptr to interface address */ + struct mbuf *clc_hdr; /* cached pkt hdr (finally)! 
*/ +}; + +#ifndef satosiso +#define satosiso(sa)\ + ((struct sockaddr_iso *)(sa)) +#endif + +#ifdef KERNEL +caddr_t clnp_insert_addr(); +struct iso_addr *clnp_srcaddr(); +struct mbuf *clnp_reass(); +#ifdef TROLL +struct troll trollctl; +#endif /* TROLL */ +#endif /* KERNEL */ diff --git a/sys/netiso/clnp_debug.c b/sys/netiso/clnp_debug.c new file mode 100644 index 00000000000..964638e244b --- /dev/null +++ b/sys/netiso/clnp_debug.c @@ -0,0 +1,260 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)clnp_debug.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+ +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* $Header: clnp_debug.c,v 4.2 88/06/29 14:58:34 hagens Exp $ */ +/* $Source: /usr/argo/sys/netargo/RCS/clnp_debug.c,v $ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#ifdef ARGO_DEBUG + +#ifdef TESTDEBUG +#ifdef notdef +struct addr_37 u_37 = { + {0x00, 0x02, 0x00, 0x10, 0x20, 0x30, 0x35}, + {0x01, 0x02, 0x03, 0x04, 0x50, 0x60, 0x70, 0x80, 0x90} +}; +struct addr_osinet u_osinet = { + {0x00, 0x04}, + {0x00, 0x02, 0x00, 0x01, 0x23, 0x42, 0x78, 0x20, 0x01, 0x05, 0x00} +}; +#endif /* notdef */ +struct addr_rfc986 u_rfc986 = { + {0x00, 0x06}, + {0x01, 0xc0, 0x0c, 0x0c, 0xab, 0x11} +}; +struct addr_rfc986 u_bad = { + {0x00, 0x01}, + {0x01, 0xc0, 0x0c, 0x0c, 0xab, 0x11} +}; +#include +main() +{ + struct iso_addr a; + + a.isoa_afi = AFI_37; + a.isoa_u.addr_37 = u_37; + a.isoa_len = 17; + printf("type 37: %s\n", clnp_iso_addrp(&a)); + + a.isoa_afi = AFI_OSINET; + a.isoa_u.addr_osinet = u_osinet; + a.isoa_len = 14; + printf("type osinet: %s\n", clnp_iso_addrp(&a)); + + a.isoa_afi = AFI_RFC986; + a.isoa_u.addr_rfc986 = u_rfc986; + a.isoa_len = 9; + printf("type rfc986: %s\n", clnp_iso_addrp(&a)); + + a.isoa_afi = 12; + a.isoa_u.addr_rfc986 = u_rfc986; + a.isoa_len = 9; + printf("type bad afi: %s\n", clnp_iso_addrp(&a)); + + a.isoa_afi = AFI_RFC986; + a.isoa_u.addr_rfc986 = u_bad; + a.isoa_len = 9; + printf("type bad idi: %s\n", clnp_iso_addrp(&a)); +} +#endif /* TESTDEBUG */ + +unsigned int clnp_debug; +static char letters[] = "0123456789abcdef"; + +/* + * Print buffer in hex, return addr of where we left off. + * Do not null terminate. 
+ */ +char * +clnp_hexp(src, len, where) +char *src; /* src of data to print */ +int len; /* length of src */ +char *where; /* where to put data */ +{ + int i; + + for (i=0; i < len; i++) { + register int j = ((u_char *)src)[i]; + *where++ = letters[j >> 4]; + *where++ = letters[j & 0x0f]; + } + return where; +} + +/* + * Return a ptr to a human readable form of an iso addr + */ +static char iso_addr_b[50]; +#define DELIM '.'; + +char * +clnp_iso_addrp(isoa) +struct iso_addr *isoa; +{ + char *cp; + + /* print length */ + sprintf(iso_addr_b, "[%d] ", isoa->isoa_len); + + /* set cp to end of what we have */ + cp = iso_addr_b; + while (*cp) + cp++; + + /* print afi */ + cp = clnp_hexp(isoa->isoa_genaddr, (int)isoa->isoa_len, cp); +#ifdef notdef + *cp++ = DELIM; + + /* print type specific part */ + switch(isoa->isoa_afi) { + case AFI_37: + cp = clnp_hexp(isoa->t37_idi, ADDR37_IDI_LEN, cp); + *cp++ = DELIM; + cp = clnp_hexp(isoa->t37_dsp, ADDR37_DSP_LEN, cp); + break; + +/* case AFI_OSINET:*/ + case AFI_RFC986: { + u_short idi; + + /* osinet and rfc986 have idi in the same place */ + /* print idi */ + cp = clnp_hexp(isoa->rfc986_idi, ADDROSINET_IDI_LEN, cp); + *cp++ = DELIM; + CTOH(isoa->rfc986_idi[0], isoa->rfc986_idi[1], idi); + + if (idi == IDI_OSINET) { + struct ovl_osinet *oosi = (struct ovl_osinet *)isoa; + cp = clnp_hexp(oosi->oosi_orgid, OVLOSINET_ORGID_LEN, cp); + *cp++ = DELIM; + cp = clnp_hexp(oosi->oosi_snetid, OVLOSINET_SNETID_LEN, cp); + *cp++ = DELIM; + cp = clnp_hexp(oosi->oosi_snpa, OVLOSINET_SNPA_LEN, cp); + *cp++ = DELIM; + cp = clnp_hexp(oosi->oosi_nsap, OVLOSINET_NSAP_LEN, cp); + } else if (idi == IDI_RFC986) { + struct ovl_rfc986 *o986 = (struct ovl_rfc986 *)isoa; + cp = clnp_hexp(&o986->o986_vers, 1, cp); + *cp++ = DELIM; +#ifdef vax + sprintf(cp, "%d.%d.%d.%d.%d", + o986->o986_inetaddr[0] & 0xff, + o986->o986_inetaddr[1] & 0xff, + o986->o986_inetaddr[2] & 0xff, + o986->o986_inetaddr[3] & 0xff, + o986->o986_upid & 0xff); + return(iso_addr_b); +#else + cp = clnp_hexp(&o986->o986_inetaddr[0], 1, cp); + *cp++ = DELIM; + cp =
clnp_hexp(&o986->o986_inetaddr[1], 1, cp); + *cp++ = DELIM; + cp = clnp_hexp(&o986->o986_inetaddr[2], 1, cp); + *cp++ = DELIM; + cp = clnp_hexp(&o986->o986_inetaddr[3], 1, cp); + *cp++ = DELIM; + cp = clnp_hexp(&o986->o986_upid, 1, cp); +#endif /* vax */ + } + + } break; + + default: + *cp++ = '?'; + break; + } +#endif /* notdef */ + *cp = (char)0; + + return(iso_addr_b); +} + +char * +clnp_saddr_isop(s) +register struct sockaddr_iso *s; +{ + register char *cp = clnp_iso_addrp(&s->siso_addr); + + while (*cp) cp++; + *cp++ = '('; + cp = clnp_hexp(TSEL(s), (int)s->siso_tlen, cp); + *cp++ = ')'; + *cp++ = 0; + return (iso_addr_b); +} + +#endif /* ARGO_DEBUG */ diff --git a/sys/netiso/clnp_er.c b/sys/netiso/clnp_er.c new file mode 100644 index 00000000000..8b7f45b77a4 --- /dev/null +++ b/sys/netiso/clnp_er.c @@ -0,0 +1,375 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)clnp_er.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+ +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* $Header: /var/src/sys/netiso/RCS/clnp_er.c,v 5.1 89/02/09 16:20:18 hagens Exp $ */ +/* $Source: /var/src/sys/netiso/RCS/clnp_er.c,v $ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#define CLNP_ER_CODES +#include +#include +#include + +static struct clnp_fixed er_template = { + ISO8473_CLNP, /* network identifier */ + 0, /* length */ + ISO8473_V1, /* version */ + CLNP_TTL, /* ttl */ + CLNP_ER, /* type */ + 0, /* segment length */ + 0 /* checksum */ +}; + +/* + * FUNCTION: clnp_er_input + * + * PURPOSE: Process an ER pdu. + * + * RETURNS: + * + * SIDE EFFECTS: + * + * NOTES: + */ +clnp_er_input(m, src, reason) +struct mbuf *m; /* ptr to packet itself */ +struct iso_addr *src; /* ptr to src of er */ +u_char reason; /* reason code of er */ +{ + int cmd = -1; + extern u_char clnp_protox[]; + + IFDEBUG(D_CTLINPUT) + printf("clnp_er_input: m x%x, src %s, reason x%x\n", m, + clnp_iso_addrp(src), reason); + ENDDEBUG + + INCSTAT(cns_er_inhist[clnp_er_index(reason)]); + switch (reason) { + case GEN_NOREAS: + case GEN_PROTOERR: + break; + case GEN_BADCSUM: + cmd = PRC_PARAMPROB; + break; + case GEN_CONGEST: + cmd = PRC_QUENCH; + break; + case GEN_HDRSYNTAX: + cmd = PRC_PARAMPROB; + break; + case GEN_SEGNEEDED: + cmd = PRC_MSGSIZE; + break; + case GEN_INCOMPLETE: + cmd = PRC_PARAMPROB; + break; + case GEN_DUPOPT: + cmd = PRC_PARAMPROB; + break; + case ADDR_DESTUNREACH: + cmd = PRC_UNREACH_HOST; + break; + case ADDR_DESTUNKNOWN: + cmd = PRC_UNREACH_PROTOCOL; + break; + case SRCRT_UNSPECERR: + case SRCRT_SYNTAX: + case SRCRT_UNKNOWNADDR: + case SRCRT_BADPATH: + cmd = PRC_UNREACH_SRCFAIL; + break; + case TTL_EXPTRANSIT: + cmd = PRC_TIMXCEED_INTRANS; + break; + case TTL_EXPREASS: + cmd = PRC_TIMXCEED_REASS; + break; + case DISC_UNSUPPOPT: 
+ case DISC_UNSUPPVERS: + case DISC_UNSUPPSECURE: + case DISC_UNSUPPSRCRT: + case DISC_UNSUPPRECRT: + cmd = PRC_PARAMPROB; + break; + case REASS_INTERFERE: + cmd = PRC_TIMXCEED_REASS; + break; + } + + /* + * tpclnp_ctlinput1 is called directly so that we don't + * have to build an iso_sockaddr out of src. + */ + if (cmd >= 0) + tpclnp_ctlinput1(cmd, src); + + m_freem(m); +} + +/* + * FUNCTION: clnp_discard + * + * PURPOSE: Discard a clnp datagram + * + * RETURNS: nothing + * + * SIDE EFFECTS: Will emit an ER pdu if possible + * + * NOTES: This code assumes that we have previously tried to pull + * up the header of the datagram into one mbuf. + */ +clnp_discard(m, reason) +struct mbuf *m; /* header of packet to discard */ +char reason; /* reason for discard */ +{ + IFDEBUG(D_DISCARD) + printf("clnp_discard: m x%x, reason x%x\n", m, reason); + ENDDEBUG + + if (m != NULL) { + if (m->m_len >= sizeof(struct clnp_fixed)) { + register struct clnp_fixed *clnp = mtod(m, struct clnp_fixed *); + + if (((clnp->cnf_type & CNF_TYPE) != CLNP_ER) && + (clnp->cnf_type & CNF_ERR_OK)) { + clnp_emit_er(m, reason); + return; + } + } + m_freem(m); + } +} + +/* + * FUNCTION: clnp_emit_er + * + * PURPOSE: Send an ER pdu. + * The src of the ER pdu is the host that is sending + * the ER (i.e. us), *not* the original destination of the + * packet. + * + * RETURNS: nothing + * + * SIDE EFFECTS: + * + * NOTES: Takes responsibility for freeing mbuf passed + * This function may be called with a packet that + * was created by us; in this case, do not send + * an ER.
+ */ +clnp_emit_er(m, reason) +struct mbuf *m; /* header of packet to discard */ +char reason; /* reason for discard */ +{ + register struct clnp_fixed *clnp = mtod(m, struct clnp_fixed *); + register struct clnp_fixed *er; + struct route_iso route; + struct ifnet *ifp; + struct sockaddr *first_hop; + struct iso_addr src, dst, *our_addr; + caddr_t hoff, hend; + int total_len; /* total len of dg */ + struct mbuf *m0; /* contains er pdu hdr */ + struct iso_ifaddr *ia = 0; + + IFDEBUG(D_DISCARD) + printf("clnp_emit_er: m x%x, hdr len %d\n", m, clnp->cnf_hdr_len); + ENDDEBUG + + bzero((caddr_t)&route, sizeof(route)); + + /* + * If header length is incorrect, or entire header is not contained + * in this mbuf, we punt + */ + if ((clnp->cnf_hdr_len < CLNP_HDR_MIN) || + (clnp->cnf_hdr_len > CLNP_HDR_MAX) || + (clnp->cnf_hdr_len > m->m_len)) + goto bad; + + /* extract src, dest address */ + hend = (caddr_t)clnp + clnp->cnf_hdr_len; + hoff = (caddr_t)clnp + sizeof(struct clnp_fixed); + CLNP_EXTRACT_ADDR(dst, hoff, hend); + if (hoff == (caddr_t)0) { + goto bad; + } + CLNP_EXTRACT_ADDR(src, hoff, hend); + if (hoff == (caddr_t)0) { + goto bad; + } + + /* + * Do not send ER if we generated the packet. + */ + if (clnp_ours(&src)) + goto bad; + + /* + * Trim mbuf to hold only the header. 
+ * This mbuf will be the 'data' of the er pdu + */ + if (m->m_next != NULL) { + m_freem(m->m_next); + m->m_next = NULL; + } + + if (m->m_len > clnp->cnf_hdr_len) + m_adj(m, (int)-(m->m_len - (int)clnp->cnf_hdr_len)); + + /* route er pdu: note we send pkt to src of original packet */ + if (clnp_route(&src, &route, /* flags */0, &first_hop, &ia) != 0) + goto bad; + + /* compute our address based upon firsthop/ifp */ + if (ia) + our_addr = &ia->ia_addr.siso_addr; + else + goto bad; + ifp = ia->ia_ifp; + + IFDEBUG(D_DISCARD) + printf("clnp_emit_er: to %s", clnp_iso_addrp(&src)); + printf(" from %s\n", clnp_iso_addrp(our_addr)); + ENDDEBUG + + IFDEBUG(D_DISCARD) + printf("clnp_emit_er: packet routed to %s\n", + clnp_iso_addrp(&((struct sockaddr_iso *)first_hop)->siso_addr)); + ENDDEBUG + + /* allocate mbuf for er pdu header: punt on no space */ + MGET(m0, M_DONTWAIT, MT_HEADER); + if (m0 == 0) + goto bad; + + m0->m_next = m; + er = mtod(m0, struct clnp_fixed *); + *er = er_template; + + /* setup src/dst on er pdu */ + /* NOTE REVERSAL OF SRC/DST */ + hoff = (caddr_t)er + sizeof(struct clnp_fixed); + CLNP_INSERT_ADDR(hoff, src); + CLNP_INSERT_ADDR(hoff, *our_addr); + + /* + * TODO: if complete src rt was specified, then reverse path, and + * copy into er as option. 
+ */ + + /* add er option */ + *hoff++ = CLNPOVAL_ERREAS; /* code */ + *hoff++ = 2; /* length */ + *hoff++ = reason; /* discard reason */ + *hoff++ = 0; /* error localization = not specified */ + + /* set length */ + er->cnf_hdr_len = m0->m_len = (u_char)(hoff - (caddr_t)er); + total_len = m0->m_len + m->m_len; + HTOC(er->cnf_seglen_msb, er->cnf_seglen_lsb, total_len); + + /* compute checksum (on header only) */ + iso_gen_csum(m0, CLNP_CKSUM_OFF, (int)er->cnf_hdr_len); + + /* trim packet if too large for interface */ + if (total_len > ifp->if_mtu) + m_adj(m0, -(total_len - ifp->if_mtu)); + + /* send packet */ + INCSTAT(cns_er_outhist[clnp_er_index(reason)]); + (void) (*ifp->if_output)(ifp, m0, first_hop, route.ro_rt); + goto done; + +bad: + m_freem(m); + +done: + /* free route if it is a temp */ + if (route.ro_rt != NULL) + RTFREE(route.ro_rt); +} + +clnp_er_index(p) +u_char p; +{ + register u_char *cp = clnp_er_codes + CLNP_ERRORS; + while (cp > clnp_er_codes) { + cp--; + if (*cp == p) + return (cp - clnp_er_codes); + } + return (CLNP_ERRORS + 1); +} diff --git a/sys/netiso/clnp_frag.c b/sys/netiso/clnp_frag.c new file mode 100644 index 00000000000..546a592ccf7 --- /dev/null +++ b/sys/netiso/clnp_frag.c @@ -0,0 +1,859 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)clnp_frag.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. 
+ +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* $Header: /var/src/sys/netiso/RCS/clnp_frag.c,v 5.1 89/02/09 16:20:26 hagens Exp $ */ +/* $Source: /var/src/sys/netiso/RCS/clnp_frag.c,v $ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +/* all fragments are hung off this list */ +struct clnp_fragl *clnp_frags = NULL; + +struct mbuf *clnp_comp_pdu(); + + +/* + * FUNCTION: clnp_fragment + * + * PURPOSE: Fragment a datagram, and send the itty bitty pieces + * out over an interface. + * + * RETURNS: success - 0 + * failure - unix error code + * + * SIDE EFFECTS: + * + * NOTES: If there is an error sending the packet, clnp_discard + * is called to discard the packet and send an ER. If + * clnp_fragment was called from clnp_output, then + * we generated the packet, and should not send an + * ER -- clnp_emit_er will check for this. Otherwise, + * the packet was fragmented during forwarding. In this + * case, we ought to send an ER back. 
+ */ +clnp_fragment(ifp, m, first_hop, total_len, segoff, flags, rt) +struct ifnet *ifp; /* ptr to outgoing interface */ +struct mbuf *m; /* ptr to packet */ +struct sockaddr *first_hop; /* ptr to first hop */ +int total_len; /* length of datagram */ +int segoff; /* offset of segpart in hdr */ +int flags; /* flags passed to clnp_output */ +struct rtentry *rt; /* route if direct ether */ +{ + struct clnp_fixed *clnp = mtod(m, struct clnp_fixed *); + int hdr_len = (int)clnp->cnf_hdr_len; + int frag_size = (SN_MTU(ifp, rt) - hdr_len) & ~7; + + total_len -= hdr_len; + if ((clnp->cnf_type & CNF_SEG_OK) && + (total_len >= 8) && + (frag_size > 8 || (frag_size == 8 && !(total_len & 7)))) { + + struct mbuf *hdr = NULL; /* save copy of clnp hdr */ + struct mbuf *frag_hdr = NULL; + struct mbuf *frag_data = NULL; + struct clnp_segment seg_part; /* segmentation header */ + int frag_base; + int error = 0; + + + INCSTAT(cns_fragmented); + (void) bcopy(segoff + mtod(m, caddr_t), (caddr_t)&seg_part, + sizeof(seg_part)); + frag_base = ntohs(seg_part.cng_off); + /* + * Duplicate header, and remove from packet + */ + if ((hdr = m_copy(m, 0, hdr_len)) == NULL) { + clnp_discard(m, GEN_CONGEST); + return(ENOBUFS); + } + m_adj(m, hdr_len); + + while (total_len > 0) { + int remaining, last_frag; + + IFDEBUG(D_FRAG) + struct mbuf *mdump = frag_hdr; + int tot_mlen = 0; + printf("clnp_fragment: total_len %d:\n", total_len); + while (mdump != NULL) { + printf("\tmbuf x%x, m_len %d\n", + mdump, mdump->m_len); + tot_mlen += mdump->m_len; + mdump = mdump->m_next; + } + printf("clnp_fragment: sum of mbuf chain %d:\n", tot_mlen); + ENDDEBUG + + frag_size = min(total_len, frag_size); + if ((remaining = total_len - frag_size) == 0) + last_frag = 1; + else { + /* + * If this fragment will cause the last one to + * be less than 8 bytes, shorten this fragment a bit. + * The obscure test on frag_size above ensures that + * frag_size will be positive. 
+ */ + last_frag = 0; + if (remaining < 8) + frag_size -= 8; + } + + + IFDEBUG(D_FRAG) + printf("clnp_fragment: seg off %d, size %d, remaining %d\n", + ntohs(seg_part.cng_off), frag_size, total_len-frag_size); + if (last_frag) + printf("clnp_fragment: last fragment\n"); + ENDDEBUG + + if (last_frag) { + /* + * this is the last fragment; we don't need to get any other + * mbufs. + */ + frag_hdr = hdr; + frag_data = m; + } else { + /* duplicate header and data mbufs */ + if ((frag_hdr = m_copy(hdr, 0, (int)M_COPYALL)) == NULL) { + clnp_discard(hdr, GEN_CONGEST); + m_freem(m); + return(ENOBUFS); + } + if ((frag_data = m_copy(m, 0, frag_size)) == NULL) { + clnp_discard(hdr, GEN_CONGEST); + m_freem(m); + m_freem(frag_hdr); + return(ENOBUFS); + } + INCSTAT(cns_fragments); + } + clnp = mtod(frag_hdr, struct clnp_fixed *); + + if (!last_frag) + clnp->cnf_type |= CNF_MORE_SEGS; + + /* link together */ + m_cat(frag_hdr, frag_data); + + /* insert segmentation part; updated below */ + bcopy((caddr_t)&seg_part, mtod(frag_hdr, caddr_t) + segoff, + sizeof(struct clnp_segment)); + + { + int derived_len = hdr_len + frag_size; + HTOC(clnp->cnf_seglen_msb, clnp->cnf_seglen_lsb, derived_len); + if ((frag_hdr->m_flags & M_PKTHDR) == 0) + panic("clnp_frag:lost header"); + frag_hdr->m_pkthdr.len = derived_len; + } + /* compute clnp checksum (on header only) */ + if (flags & CLNP_NO_CKSUM) { + HTOC(clnp->cnf_cksum_msb, clnp->cnf_cksum_lsb, 0); + } else { + iso_gen_csum(frag_hdr, CLNP_CKSUM_OFF, hdr_len); + } + + IFDEBUG(D_DUMPOUT) + struct mbuf *mdump = frag_hdr; + printf("clnp_fragment: sending dg:\n"); + while (mdump != NULL) { + printf("\tmbuf x%x, m_len %d\n", mdump, mdump->m_len); + mdump = mdump->m_next; + } + ENDDEBUG + +#ifdef TROLL + error = troll_output(ifp, frag_hdr, first_hop, rt); +#else + error = (*ifp->if_output)(ifp, frag_hdr, first_hop, rt); +#endif /* TROLL */ + + /* + * Tough situation: if the error occurred on the last + * fragment, we can not send an ER, as the
if_output + * routine consumed the packet. If the error occurred + * on any intermediate packets, we can send an ER + * because we still have the original header in (hdr). + */ + if (error) { + if (frag_hdr != hdr) { + /* + * The error was not on the last fragment. We must + * free hdr and m before returning + */ + clnp_discard(hdr, GEN_NOREAS); + m_freem(m); + } + return(error); + } + + /* bump segment offset, trim data mbuf, and decrement count left */ +#ifdef TROLL + /* + * Decrement frag_size by some fraction. This will cause the + * next fragment to start 'early', thus duplicating the end + * of the current fragment. trollctl.tr_dup_size controls + * the fraction. If positive, it specifies the fraction. If + * negative, a random fraction is used. + */ + if ((trollctl.tr_ops & TR_DUPEND) && (!last_frag)) { + int num_bytes = frag_size; + + if (trollctl.tr_dup_size > 0) + num_bytes *= trollctl.tr_dup_size; + else + num_bytes *= troll_random(); + frag_size -= num_bytes; + } +#endif /* TROLL */ + total_len -= frag_size; + if (!last_frag) { + frag_base += frag_size; + seg_part.cng_off = htons(frag_base); + m_adj(m, frag_size); + } + } + return(0); + } else { + cantfrag: + INCSTAT(cns_cantfrag); + clnp_discard(m, GEN_SEGNEEDED); + return(EMSGSIZE); + } +} + +/* + * FUNCTION: clnp_reass + * + * PURPOSE: Attempt to reassemble a clnp packet given the current + * fragment. If reassembly succeeds (all the fragments + * are present), then return a pointer to an mbuf chain + * containing the reassembled packet. This packet will + * appear in the mbufs as if it had just arrived in + * one piece. + * + * If reassembly fails, then save this fragment and + * return 0. + * + * RETURNS: Ptr to assembled packet, or 0 + * + * SIDE EFFECTS: + * + * NOTES: + * clnp_slowtimo can not affect this code because clnpintr, and thus + * this code, is called at a higher priority than clnp_slowtimo.
+ */ +struct mbuf * +clnp_reass(m, src, dst, seg) +struct mbuf *m; /* new fragment */ +struct iso_addr *src; /* src of new fragment */ +struct iso_addr *dst; /* dst of new fragment */ +struct clnp_segment *seg; /* segment part of fragment header */ +{ + register struct clnp_fragl *cfh; + + /* look for other fragments of this datagram */ + for (cfh = clnp_frags; cfh != NULL; cfh = cfh->cfl_next) { + if (seg->cng_id == cfh->cfl_id && + iso_addrmatch1(src, &cfh->cfl_src) && + iso_addrmatch1(dst, &cfh->cfl_dst)) { + IFDEBUG(D_REASS) + printf("clnp_reass: found packet\n"); + ENDDEBUG + /* + * There are other fragments here already. Lets see if + * this fragment is of any help + */ + clnp_insert_frag(cfh, m, seg); + if (m = clnp_comp_pdu(cfh)) { + register struct clnp_fixed *clnp = mtod(m, struct clnp_fixed *); + HTOC(clnp->cnf_seglen_msb, clnp->cnf_seglen_lsb, + seg->cng_tot_len); + } + return (m); + } + } + + IFDEBUG(D_REASS) + printf("clnp_reass: new packet!\n"); + ENDDEBUG + + /* + * This is the first fragment. If src is not consuming too many + * resources, then create a new fragment list and add + * this fragment to the list. + */ + /* TODO: don't let one src hog all the reassembly buffers */ + if (!clnp_newpkt(m, src, dst, seg) /* || this src is a hog */) { + INCSTAT(cns_fragdropped); + clnp_discard(m, GEN_CONGEST); + } + + return(NULL); +} + +/* + * FUNCTION: clnp_newpkt + * + * PURPOSE: Create the necessary structures to handle a new + * fragmented clnp packet. + * + * RETURNS: non-zero if it succeeds, zero if fails. + * + * SIDE EFFECTS: + * + * NOTES: Failure is only due to insufficient resources. 
+ */ +clnp_newpkt(m, src, dst, seg) +struct mbuf *m; /* new fragment */ +struct iso_addr *src; /* src of new fragment */ +struct iso_addr *dst; /* dst of new fragment */ +struct clnp_segment *seg; /* segment part of fragment header */ +{ + register struct clnp_fragl *cfh; + register struct clnp_fixed *clnp; + struct mbuf *m0; + + clnp = mtod(m, struct clnp_fixed *); + + /* + * Allocate new clnp fragl structure to act as header of all fragments + * for this datagram. + */ + MGET(m0, M_DONTWAIT, MT_FTABLE); + if (m0 == NULL) { + return (0); + } + cfh = mtod(m0, struct clnp_fragl *); + + /* + * Duplicate the header of this fragment, and save in cfh. + * Free m0 and return if m_copy does not succeed. + */ + if ((cfh->cfl_orighdr = m_copy(m, 0, (int)clnp->cnf_hdr_len)) == NULL) { + m_freem(m0); + return (0); + } + + /* Fill in rest of fragl structure */ + bcopy((caddr_t)src, (caddr_t)&cfh->cfl_src, sizeof(struct iso_addr)); + bcopy((caddr_t)dst, (caddr_t)&cfh->cfl_dst, sizeof(struct iso_addr)); + cfh->cfl_id = seg->cng_id; + cfh->cfl_ttl = clnp->cnf_ttl; + cfh->cfl_last = (seg->cng_tot_len - clnp->cnf_hdr_len) - 1; + cfh->cfl_frags = NULL; + cfh->cfl_next = NULL; + + /* Insert into list of packets */ + cfh->cfl_next = clnp_frags; + clnp_frags = cfh; + + /* Insert this fragment into list headed by cfh */ + clnp_insert_frag(cfh, m, seg); + return(1); +} + +/* + * FUNCTION: clnp_insert_frag + * + * PURPOSE: Insert fragment into list headed by 'cf'. + * + * RETURNS: nothing + * + * SIDE EFFECTS: + * + * NOTES: This is the 'guts' of the reassembly algorithm. + * Each fragment in this list contains a clnp_frag + * structure followed by the data of the fragment. + * The clnp_frag structure actually lies on top of + * part of the old clnp header. 
+ */ +clnp_insert_frag(cfh, m, seg) +struct clnp_fragl *cfh; /* header of list of packet fragments */ +struct mbuf *m; /* new fragment */ +struct clnp_segment *seg; /* segment part of fragment header */ +{ + register struct clnp_fixed *clnp; /* clnp hdr of fragment */ + register struct clnp_frag *cf; /* generic fragment ptr */ + register struct clnp_frag *cf_sub = NULL; /* frag subsequent to new one */ + register struct clnp_frag *cf_prev = NULL; /* frag previous to new one */ + u_short first; /* offset of first byte of initial pdu*/ + u_short last; /* offset of last byte of initial pdu */ + u_short fraglen;/* length of fragment */ + + clnp = mtod(m, struct clnp_fixed *); + first = seg->cng_off; + CTOH(clnp->cnf_seglen_msb, clnp->cnf_seglen_lsb, fraglen); + fraglen -= clnp->cnf_hdr_len; + last = (first + fraglen) - 1; + + IFDEBUG(D_REASS) + printf("clnp_insert_frag: New fragment: [%d ... %d], len %d\n", + first, last, fraglen); + printf("clnp_insert_frag: current fragments:\n"); + for (cf = cfh->cfl_frags; cf != NULL; cf = cf->cfr_next) { + printf("\tcf x%x: [%d ... %d]\n", cf, cf->cfr_first, cf->cfr_last); + } + ENDDEBUG + + if (cfh->cfl_frags != NULL) { + /* + * Find fragment which begins after the new one + */ + for (cf = cfh->cfl_frags; cf != NULL; cf_prev = cf, cf = cf->cfr_next) { + if (cf->cfr_first > first) { + cf_sub = cf; + break; + } + } + + IFDEBUG(D_REASS) + printf("clnp_insert_frag: Previous frag is "); + if (cf_prev == NULL) + printf("NULL\n"); + else + printf("[%d ... %d]\n", cf_prev->cfr_first, cf_prev->cfr_last); + printf("clnp_insert_frag: Subsequent frag is "); + if (cf_sub == NULL) + printf("NULL\n"); + else + printf("[%d ... %d]\n", cf_sub->cfr_first, cf_sub->cfr_last); + ENDDEBUG + + /* + * If there is a fragment before the new one, check if it + * overlaps the new one. If so, then trim the end of the + * previous one. 
+ */ + if (cf_prev != NULL) { + if (cf_prev->cfr_last > first) { + u_short overlap = cf_prev->cfr_last - first; + + IFDEBUG(D_REASS) + printf("clnp_insert_frag: previous overlaps by %d\n", + overlap); + ENDDEBUG + + if (overlap > fraglen) { + /* + * The new fragment is entirely contained in the + * preceeding one. We can punt on the new frag + * completely. + */ + m_freem(m); + return; + } else { + /* Trim data off of end of previous fragment */ + /* inc overlap to prevent duplication of last byte */ + overlap++; + m_adj(cf_prev->cfr_data, -(int)overlap); + cf_prev->cfr_last -= overlap; + } + } + } + + /* + * For all fragments past the new one, check if any data on + * the new one overlaps data on existing fragments. If so, + * then trim the extra data off the end of the new one. + */ + for (cf = cf_sub; cf != NULL; cf = cf->cfr_next) { + if (cf->cfr_first < last) { + u_short overlap = last - cf->cfr_first; + + IFDEBUG(D_REASS) + printf("clnp_insert_frag: subsequent overlaps by %d\n", + overlap); + ENDDEBUG + + if (overlap > fraglen) { + /* + * The new fragment is entirely contained in the + * succeeding one. This should not happen, because + * early on in this code we scanned for the fragment + * which started after the new one! + */ + m_freem(m); + printf("clnp_insert_frag: internal error!\n"); + return; + } else { + /* Trim data off of end of new fragment */ + /* inc overlap to prevent duplication of last byte */ + overlap++; + m_adj(m, -(int)overlap); + last -= overlap; + } + } + } + } + + /* + * Insert the new fragment beween cf_prev and cf_sub + * + * Note: the clnp hdr is still in the mbuf. + * If the data of the mbuf is not word aligned, shave off enough + * so that it is. Then, cast the clnp_frag structure on top + * of the clnp header. + * The clnp_hdr will not be used again (as we already have + * saved a copy of it). + * + * Save in cfr_bytes the number of bytes to shave off to get to + * the data of the packet. 
This is used when we coalesce fragments; + * the clnp_frag structure must be removed before joining mbufs. + */ + { + int pad; + u_int bytes; + + /* determine if header is not word aligned */ + pad = (int)clnp % 4; + if (pad < 0) + pad = -pad; + + /* bytes is number of bytes left in front of data */ + bytes = clnp->cnf_hdr_len - pad; + + IFDEBUG(D_REASS) + printf("clnp_insert_frag: clnp x%x requires %d alignment\n", + clnp, pad); + ENDDEBUG + + /* make it word aligned if necessary */ + if (pad) + m_adj(m, pad); + + cf = mtod(m, struct clnp_frag *); + cf->cfr_bytes = bytes; + + IFDEBUG(D_REASS) + printf("clnp_insert_frag: cf now x%x, cfr_bytes %d\n", cf, + cf->cfr_bytes); + ENDDEBUG + } + cf->cfr_first = first; + cf->cfr_last = last; + + + /* + * The data is the mbuf itself, although we must remember that the + * first few bytes are actually a clnp_frag structure + */ + cf->cfr_data = m; + + /* link into place */ + cf->cfr_next = cf_sub; + if (cf_prev == NULL) + cfh->cfl_frags = cf; + else + cf_prev->cfr_next = cf; +} + +/* + * FUNCTION: clnp_comp_pdu + * + * PURPOSE: Scan the list of fragments headed by cfh. Merge + * any contigious fragments into one. If, after + * traversing all the fragments, it is determined that + * the packet is complete, then return a pointer to + * the packet (with header prepended). Otherwise, + * return NULL. + * + * RETURNS: NULL, or a pointer to the assembled pdu in an mbuf chain. + * + * SIDE EFFECTS: Will colapse contigious fragments into one. + * + * NOTES: This code assumes that there are no overlaps of + * fragment pdus. + */ +struct mbuf * +clnp_comp_pdu(cfh) +struct clnp_fragl *cfh; /* fragment header */ +{ + register struct clnp_frag *cf = cfh->cfl_frags; + + while (cf->cfr_next != NULL) { + register struct clnp_frag *cf_next = cf->cfr_next; + + IFDEBUG(D_REASS) + printf("clnp_comp_pdu: comparing: [%d ... %d] to [%d ... 
%d]\n", + cf->cfr_first, cf->cfr_last, cf_next->cfr_first, + cf_next->cfr_last); + ENDDEBUG + + if (cf->cfr_last == (cf_next->cfr_first - 1)) { + /* + * Merge fragment cf and cf_next + * + * - update cf header + * - trim clnp_frag structure off of cf_next + * - append cf_next to cf + */ + struct clnp_frag cf_next_hdr; + struct clnp_frag *next_frag; + + cf_next_hdr = *cf_next; + next_frag = cf_next->cfr_next; + + IFDEBUG(D_REASS) + struct mbuf *mdump; + int l; + printf("clnp_comp_pdu: merging fragments\n"); + printf("clnp_comp_pdu: 1st: [%d ... %d] (bytes %d)\n", + cf->cfr_first, cf->cfr_last, cf->cfr_bytes); + mdump = cf->cfr_data; + l = 0; + while (mdump != NULL) { + printf("\tmbuf x%x, m_len %d\n", mdump, mdump->m_len); + l += mdump->m_len; + mdump = mdump->m_next; + } + printf("\ttotal len: %d\n", l); + printf("clnp_comp_pdu: 2nd: [%d ... %d] (bytes %d)\n", + cf_next->cfr_first, cf_next->cfr_last, cf_next->cfr_bytes); + mdump = cf_next->cfr_data; + l = 0; + while (mdump != NULL) { + printf("\tmbuf x%x, m_len %d\n", mdump, mdump->m_len); + l += mdump->m_len; + mdump = mdump->m_next; + } + printf("\ttotal len: %d\n", l); + ENDDEBUG + + cf->cfr_last = cf_next->cfr_last; + /* + * After this m_adj, the cf_next ptr is useless because we + * have adjusted the clnp_frag structure away... + */ + IFDEBUG(D_REASS) + printf("clnp_comp_pdu: shaving off %d bytes\n", + cf_next_hdr.cfr_bytes); + ENDDEBUG + m_adj(cf_next_hdr.cfr_data, (int)cf_next_hdr.cfr_bytes); + m_cat(cf->cfr_data, cf_next_hdr.cfr_data); + cf->cfr_next = next_frag; + } else { + cf = cf->cfr_next; + } + } + + cf = cfh->cfl_frags; + + IFDEBUG(D_REASS) + struct mbuf *mdump = cf->cfr_data; + printf("clnp_comp_pdu: first frag now: [%d ... 
%d]\n", cf->cfr_first, + cf->cfr_last); + printf("clnp_comp_pdu: data for frag:\n"); + while (mdump != NULL) { + printf("mbuf x%x, m_len %d\n", mdump, mdump->m_len); +/* dump_buf(mtod(mdump, caddr_t), mdump->m_len);*/ + mdump = mdump->m_next; + } + ENDDEBUG + + /* Check if datagram is complete */ + if ((cf->cfr_first == 0) && (cf->cfr_last == cfh->cfl_last)) { + /* + * We have a complete pdu! + * - Remove the frag header from (only) remaining fragment + * (which is not really a fragment anymore, as the datagram is + * complete). + * - Prepend a clnp header + */ + struct mbuf *data = cf->cfr_data; + struct mbuf *hdr = cfh->cfl_orighdr; + struct clnp_fragl *scan; + + IFDEBUG(D_REASS) + printf("clnp_comp_pdu: complete pdu!\n"); + ENDDEBUG + + m_adj(data, (int)cf->cfr_bytes); + m_cat(hdr, data); + + IFDEBUG(D_DUMPIN) + struct mbuf *mdump = hdr; + printf("clnp_comp_pdu: pdu is:\n"); + while (mdump != NULL) { + printf("mbuf x%x, m_len %d\n", mdump, mdump->m_len); +/* dump_buf(mtod(mdump, caddr_t), mdump->m_len);*/ + mdump = mdump->m_next; + } + ENDDEBUG + + /* + * Remove cfh from the list of fragmented pdus + */ + if (clnp_frags == cfh) { + clnp_frags = cfh->cfl_next; + } else { + for (scan = clnp_frags; scan != NULL; scan = scan->cfl_next) { + if (scan->cfl_next == cfh) { + scan->cfl_next = cfh->cfl_next; + break; + } + } + } + + /* free cfh */ + m_freem(dtom(cfh)); + + return(hdr); + } + + return(NULL); +} +#ifdef TROLL +static int troll_cnt; +#include +/* + * FUNCTION: troll_random + * + * PURPOSE: generate a pseudo-random number between 0 and 1 + * + * RETURNS: the random number + * + * SIDE EFFECTS: + * + * NOTES: This is based on the clock. + */ +float troll_random() +{ + extern struct timeval time; + long t = time.tv_usec % 100; + + return((float)t / (float) 100); +} + +/* + * FUNCTION: troll_output + * + * PURPOSE: Do something sneaky with the datagram passed. 
Possible + * operations are: + * Duplicate the packet + * Drop the packet + * Trim some number of bytes from the packet + * Munge some byte in the packet + * + * RETURNS: 0, or unix error code + * + * SIDE EFFECTS: + * + * NOTES: The operation of this procedure is regulated by the + * troll control structure (Troll). + */ +troll_output(ifp, m, dst, rt) +struct ifnet *ifp; +struct mbuf *m; +struct sockaddr *dst; +struct rtentry *rt; +{ + int err = 0; + troll_cnt++; + + if (trollctl.tr_ops & TR_DUPPKT) { + /* + * Duplicate every Nth packet + * TODO: random? + */ + float f_freq = troll_cnt * trollctl.tr_dup_freq; + int i_freq = troll_cnt * trollctl.tr_dup_freq; + if (i_freq == f_freq) { + struct mbuf *dup = m_copy(m, 0, (int)M_COPYALL); + if (dup != NULL) + err = (*ifp->if_output)(ifp, dup, dst, rt); + } + if (!err) + err = (*ifp->if_output)(ifp, m, dst, rt); + return(err); + } else if (trollctl.tr_ops & TR_DROPPKT) { + } else if (trollctl.tr_ops & TR_CHANGE) { + struct clnp_fixed *clnp = mtod(m, struct clnp_fixed *); + clnp->cnf_cksum_msb = 0; + err = (*ifp->if_output)(ifp, m, dst, rt); + return(err); + } else { + err = (*ifp->if_output)(ifp, m, dst, rt); + return(err); + } +} + +#endif /* TROLL */ diff --git a/sys/netiso/clnp_input.c b/sys/netiso/clnp_input.c new file mode 100644 index 00000000000..c49de95e5fa --- /dev/null +++ b/sys/netiso/clnp_input.c @@ -0,0 +1,551 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)clnp_input.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* $Header: /var/src/sys/netiso/RCS/clnp_input.c,v 5.1 89/02/09 16:20:32 hagens Exp $ */ +/* $Source: /var/src/sys/netiso/RCS/clnp_input.c,v $ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef ISO +u_char clnp_protox[ISOPROTO_MAX]; +struct clnl_protosw clnl_protox[256]; +int clnpqmaxlen = IFQ_MAXLEN; /* RAH? why is this a variable */ +struct mbuf *clnp_data_ck(); + +int clnp_input(); + +int esis_input(); + +#ifdef ISO_X25ESIS +int x25esis_input(); +#endif /* ISO_X25ESIS */ + +/* + * FUNCTION: clnp_init + * + * PURPOSE: clnp initialization. Fill in clnp switch tables. 
+ *
+ * RETURNS:		none
+ *
+ * SIDE EFFECTS:	fills in clnp_protox table with correct offsets into
+ *					the isosw table.
+ *					Also registers clnp_input in clnl_protox and sets
+ *					the maximum length of the clnl input queue.
+ *
+ * NOTES:			A missing raw or TP protocol entry is only reported
+ *					with printf; its clnp_protox slot is left unchanged.
+ */
+clnp_init()
+{
+	register struct protosw *pr;
+
+	/*
+	 *	CLNP protox initialization
+	 */
+	if ((pr = pffindproto(PF_ISO, ISOPROTO_RAW, SOCK_RAW)) == 0)
+		printf("clnl_init: no raw CLNP\n");
+	else
+		clnp_protox[ISOPROTO_RAW] = pr - isosw;	/* index into isosw */
+
+	if ((pr = pffindproto(PF_ISO, ISOPROTO_TP, SOCK_SEQPACKET)) == 0)
+		printf("clnl_init: no tp/clnp\n");
+	else
+		clnp_protox[ISOPROTO_TP] = pr - isosw;	/* index into isosw */
+
+	/*
+	 *	CLNL protox initialization
+	 */
+	clnl_protox[ISO8473_CLNP].clnl_input = clnp_input;
+
+	clnlintrq.ifq_maxlen = clnpqmaxlen;
+}
+
+/*
+ * FUNCTION:		clnlintr
+ *
+ * PURPOSE:			Process a packet on the clnl input queue
+ *
+ * RETURNS:			nothing.
+ *
+ * SIDE EFFECTS:	Every dequeued packet is either freed here or passed
+ *					to the input routine selected by its protocol id.
+ *
+ * NOTES:			Loops (via the label 'next') until the queue is empty.
+ *					Packets lacking a packet header, a receive interface,
+ *					or an AF_ISO address on that interface are dropped.
+ */
+clnlintr()
+{
+	register struct mbuf		*m;		/* ptr to first mbuf of pkt */
+	register struct clnl_fixed	*clnl;	/* ptr to fixed part of clnl hdr */
+	int							s;		/* save and restore priority */
+	struct clnl_protosw			*clnlsw;/* ptr to protocol switch */
+	struct snpa_hdr				sh;		/* subnetwork hdr */
+
+	/*
+	 *	Get next datagram off clnl input queue
+	 */
+next:
+	s = splimp();
+	/* IF_DEQUEUESNPAHDR(&clnlintrq, m, sh);*/
+	IF_DEQUEUE(&clnlintrq, m);
+	splx(s);
+
+
+	if (m == 0)		/* nothing to do */
+		return;
+	if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.rcvif == 0) {
+		m_freem(m);
+		goto next;
+	} else {
+		register struct ifaddr *ifa;
+		for (ifa = m->m_pkthdr.rcvif->if_addrlist; ifa; ifa = ifa->ifa_next)
+			if (ifa->ifa_addr->sa_family == AF_ISO)
+				break;
+		if (ifa == 0) {	/* receiving interface has no ISO address */
+			m_freem(m);
+			goto next;
+		}
+	}
+	bzero((caddr_t)&sh, sizeof(sh));
+	sh.snh_flags = m->m_flags & (M_MCAST|M_BCAST);
+	switch((sh.snh_ifp = m->m_pkthdr.rcvif)->if_type) {
+		extern int ether_output();
+	case IFT_EON:
+		bcopy(mtod(m, caddr_t), (caddr_t)sh.snh_dhost, sizeof(u_long));
+		bcopy(sizeof(u_long) + mtod(m, caddr_t),
+					(caddr_t)sh.snh_shost, sizeof(u_long));
+		sh.snh_dhost[4] = mtod(m, u_char *)[sizeof(struct ip) +
+								_offsetof(struct eon_hdr, eonh_class)];
+		m->m_data += EONIPLEN;	/* step over the leading EONIPLEN bytes */
+		m->m_len -= EONIPLEN;
+		m->m_pkthdr.len -= EONIPLEN;
+		break;
+
+	default:
+		if (sh.snh_ifp->if_output == ether_output) {
+			bcopy((caddr_t)(mtod(m, struct ether_header *)->ether_dhost),
+				(caddr_t)sh.snh_dhost, 2*sizeof(sh.snh_dhost));	/* NOTE(review): twice the dhost size -- presumably fills snh_shost as well; confirm struct layout */
+			m->m_data += sizeof (struct ether_header);
+			m->m_len -= sizeof (struct ether_header);
+			m->m_pkthdr.len -= sizeof (struct ether_header);
+		}
+	}
+	IFDEBUG(D_INPUT)
+		int i;
+		printf("clnlintr: src:");
+		for (i=0; i<6; i++)
+			printf("%x%c", sh.snh_shost[i] & 0xff, (i<5) ? ':' : ' ');
+		printf(" dst:");
+		for (i=0; i<6; i++)
+			printf("%x%c", sh.snh_dhost[i] & 0xff, (i<5) ? ':' : ' ');
+		printf("\n");
+	ENDDEBUG
+
+	/*
+	 *	Get the fixed part of the clnl header into the first mbuf.
+	 *	Drop the packet if this fails.
+	 *	Do not call m_pullup if we have a cluster mbuf or the
+	 *	data is not there.
+	 *	NOTE(review): the test below calls m_pullup exactly when the
+	 *	mbuf IS a cluster or is too short, which reads as the opposite
+	 *	of the sentence above -- confirm which is intended.
+	 */
+	if ((IS_CLUSTER(m) || (m->m_len < sizeof(struct clnl_fixed))) &&
+		((m = m_pullup(m, sizeof(struct clnl_fixed))) == 0)) {
+		INCSTAT(cns_toosmall);	/* TODO: use clnl stats */
+		goto next;				/* m_pullup discards mbuf */
+	}
+
+	clnl = mtod(m, struct clnl_fixed *);
+
+	/*
+	 *	Drop packet if the length of the header is not reasonable.
+	 */
+	if ((clnl->cnf_hdr_len < CLNP_HDR_MIN) ||
+		(clnl->cnf_hdr_len > CLNP_HDR_MAX)) {
+		INCSTAT(cns_badhlen);	/* TODO: use clnl stats */
+		m_freem(m);
+		goto next;
+	}
+
+	/*
+	 *	If the header is not contained in this mbuf, make it so.
+	 *	Drop packet if this fails.
+	 *	Note: m_pullup will allocate a cluster mbuf if necessary
+	 */
+	if (clnl->cnf_hdr_len > m->m_len) {
+		if ((m = m_pullup(m, (int)clnl->cnf_hdr_len)) == 0) {
+			INCSTAT(cns_badhlen);	/* TODO: use clnl stats */
+			goto next;	/* m_pullup discards mbuf */
+		}
+		clnl = mtod(m, struct clnl_fixed *);	/* m may have changed */
+	}
+
+	clnlsw = &clnl_protox[clnl->cnf_proto_id];
+
+
+	if (clnlsw->clnl_input)
+		(*clnlsw->clnl_input) (m, &sh);
+	else
+		m_freem(m);	/* no handler registered for this protocol id */
+
+	goto next;
+}
+
+/*
+ * FUNCTION:		clnp_input
+ *
+ * PURPOSE:			process an incoming clnp packet
+ *
+ * RETURNS:			nothing
+ *
+ * SIDE EFFECTS:	increments fields of clnp_stat structure.
+ *					The packet (m) is always disposed of: discarded,
+ *					forwarded, given to the reassembly code, or handed
+ *					to a higher layer input routine.
+ *
+ * NOTES:
+ *	TODO: I would like to make seg_part a pointer into the mbuf, but
+ *	will it be correctly aligned?
+ */
+clnp_input(m, shp)
+struct mbuf		*m;		/* ptr to first mbuf of pkt */
+struct snpa_hdr	*shp;	/* subnetwork header */
+{
+	register struct clnp_fixed	*clnp;	/* ptr to fixed part of header */
+	struct sockaddr_iso			source; /* source address of pkt */
+	struct sockaddr_iso			target; /* destination address of pkt */
+#define src source.siso_addr
+#define dst target.siso_addr
+	caddr_t						hoff;	/* current offset in packet */
+	caddr_t						hend;	/* address of end of header info */
+	struct clnp_segment			seg_part; /* segment part of hdr */
+	int							seg_off=0; /* offset of segment part of hdr */
+	int							seg_len;/* length of packet data&hdr in bytes */
+	struct clnp_optidx			oidx, *oidxp = NULL;	/* option index */
+	extern int 					iso_systype;	/* used by ESIS config resp */
+	extern struct sockaddr_iso	blank_siso;		/* used for initializing */
+	int							need_afrin = 0;
+										/* true if congestion experienced */
+										/* which means you need afrin nose */
+										/* spray. How clever! */
+
+	IFDEBUG(D_INPUT)
+		printf(
+		   "clnp_input: proccessing dg; First mbuf m_len %d, m_type x%x, %s\n",
+			m->m_len, m->m_type, IS_CLUSTER(m) ? "cluster" : "normal");
+	ENDDEBUG
+	need_afrin = 0;
+
+	/*
+	 *	If no iso addresses have been set, there is nothing
+	 *	to do with the packet.
+	 */
+	if (iso_ifaddr == NULL) {
+		clnp_discard(m, ADDR_DESTUNREACH);
+		return;
+	}
+
+	INCSTAT(cns_total);
+	clnp = mtod(m, struct clnp_fixed *);
+
+	IFDEBUG(D_DUMPIN)
+		struct mbuf *mhead;
+		int			total_len = 0;
+		printf("clnp_input: clnp header:\n");
+		dump_buf(mtod(m, caddr_t), clnp->cnf_hdr_len);
+		printf("clnp_input: mbuf chain:\n");
+		for (mhead = m; mhead != NULL; mhead=mhead->m_next) {
+			printf("m x%x, len %d\n", mhead, mhead->m_len);
+			total_len += mhead->m_len;
+		}
+		printf("clnp_input: total length of mbuf chain %d:\n", total_len);
+	ENDDEBUG
+
+	/*
+	 *	Compute checksum (if necessary) and drop packet if
+	 *	checksum does not match
+	 */
+	if (CKSUM_REQUIRED(clnp) && iso_check_csum(m, (int)clnp->cnf_hdr_len)) {
+		INCSTAT(cns_badcsum);
+		clnp_discard(m, GEN_BADCSUM);
+		return;
+	}
+
+	if (clnp->cnf_vers != ISO8473_V1) {
+		INCSTAT(cns_badvers);
+		clnp_discard(m, DISC_UNSUPPVERS);
+		return;
+	}
+
+
+ 	/* check mbuf data length: clnp_data_ck will free mbuf upon error */
+	CTOH(clnp->cnf_seglen_msb, clnp->cnf_seglen_lsb, seg_len);
+	if ((m = clnp_data_ck(m, seg_len)) == 0)
+		return;
+
+	clnp = mtod(m, struct clnp_fixed *);	/* reload: clnp_data_ck returned m anew */
+	hend = (caddr_t)clnp + clnp->cnf_hdr_len;
+
+	/*
+	 *	extract the source and destination address
+	 *	drop packet on failure
+	 *	(the destination address is extracted first)
+	 */
+	source = target = blank_siso;
+
+	hoff = (caddr_t)clnp + sizeof(struct clnp_fixed);
+	CLNP_EXTRACT_ADDR(dst, hoff, hend);
+	if (hoff == (caddr_t)0) {
+		INCSTAT(cns_badaddr);
+		clnp_discard(m, GEN_INCOMPLETE);
+		return;
+	}
+	CLNP_EXTRACT_ADDR(src, hoff, hend);
+	if (hoff == (caddr_t)0) {
+		INCSTAT(cns_badaddr);
+		clnp_discard(m, GEN_INCOMPLETE);
+		return;
+	}
+
+	IFDEBUG(D_INPUT)
+		printf("clnp_input: from %s", clnp_iso_addrp(&src));
+		printf(" to %s\n", clnp_iso_addrp(&dst));
+	ENDDEBUG
+
+	/*
+	 *	extract the segmentation information, if it is present.
+	 *	drop packet on failure
+	 *	(seg_part is filled in only for non-ER pdus with CNF_SEG_OK set)
+	 */
+	if (((clnp->cnf_type & CNF_TYPE) != CLNP_ER) &&
+		(clnp->cnf_type & CNF_SEG_OK)) {
+		if (hoff + sizeof(struct clnp_segment) > hend) {
+			INCSTAT(cns_noseg);
+			clnp_discard(m, GEN_INCOMPLETE);
+			return;
+		} else {
+			(void) bcopy(hoff, (caddr_t)&seg_part, sizeof(struct clnp_segment));
+			/* make sure segmentation fields are in host order */
+			seg_part.cng_id = ntohs(seg_part.cng_id);
+			seg_part.cng_off = ntohs(seg_part.cng_off);
+			seg_part.cng_tot_len = ntohs(seg_part.cng_tot_len);
+			seg_off = hoff - (caddr_t)clnp;
+			hoff += sizeof(struct clnp_segment);
+		}
+	}
+
+	/*
+	 *	process options if present. If clnp_opt_sanity returns
+	 *	false (indicating an error was found in the options) or
+	 *	an unsupported option was found
+	 *	then drop packet and emit an ER.
+	 */
+	if (hoff < hend) {
+		int		errcode;
+
+		oidxp = &oidx;
+		errcode = clnp_opt_sanity(m, hoff, hend-hoff, oidxp);
+
+		/* we do not support security */
+		if ((errcode == 0) && (oidxp->cni_securep))
+			errcode = DISC_UNSUPPSECURE;
+
+		/* the er option is valid with ER pdus only */
+		if ((errcode == 0) && (oidxp->cni_er_reason != ER_INVALREAS) &&
+			((clnp->cnf_type & CNF_TYPE) != CLNP_ER))
+			errcode = DISC_UNSUPPOPT;
+
+#ifdef	DECBIT
+		/* check if the congestion experienced bit is set */
+		if (oidxp->cni_qos_formatp) {
+			caddr_t	qosp = CLNP_OFFTOOPT(m, oidxp->cni_qos_formatp);
+			u_char	qos = *qosp;
+
+			need_afrin = ((qos & (CLNPOVAL_GLOBAL|CLNPOVAL_CONGESTED)) ==
+				(CLNPOVAL_GLOBAL|CLNPOVAL_CONGESTED));
+			if (need_afrin)
+				INCSTAT(cns_congest_rcvd);
+		}
+#endif	/* DECBIT */
+
+		if (errcode != 0) {
+			clnp_discard(m, (char)errcode);
+			IFDEBUG(D_INPUT)
+				printf("clnp_input: dropped (err x%x) due to bad options\n",
+					errcode);
+			ENDDEBUG
+			return;
+		}
+	}
+
+	/*
+	 *	check if this packet is for us. if not, then forward
+	 */
+	if (clnp_ours(&dst) == 0) {
+		IFDEBUG(D_INPUT)
+			printf("clnp_input: forwarding packet not for us\n");
+		ENDDEBUG
+ 		clnp_forward(m, seg_len, &dst, oidxp, seg_off, shp);
+		return;
+	}
+
+	/*
+	 *	ESIS Configuration Response Function
+	 *
+	 *	If the packet received was sent to the multicast address
+	 *	all end systems, then send an esh to the source
+	 */
+	if ((shp->snh_flags & M_MCAST) && (iso_systype == SNPA_ES)) {
+		extern short esis_holding_time;
+
+		esis_shoutput(shp->snh_ifp, ESIS_ESH, esis_holding_time,
+			shp->snh_shost, 6, &dst);
+	}
+
+	/*
+	 *	If this is a fragment, then try to reassemble it. If clnp_reass
+	 *	returns non NULL, the packet has been reassembled, and should
+	 *	be given to TP. Otherwise the fragment has been dealt with
+	 *	by the reassembly code (either stored or deleted). In either case
+	 *	we should have nothing more to do with it.
+	 *	A pdu counts as a fragment when its own segment length differs
+	 *	from the total length carried in the segmentation part.
+	 */
+	if (((clnp->cnf_type & CNF_TYPE) != CLNP_ER) &&
+		(clnp->cnf_type & CNF_SEG_OK) &&
+		(seg_len != seg_part.cng_tot_len)) {
+		struct mbuf	*m0;
+
+		if ((m0 = clnp_reass(m, &src, &dst, &seg_part)) != NULL) {
+			m = m0;
+			clnp = mtod(m, struct clnp_fixed *);
+			INCSTAT(cns_reassembled);
+		} else {
+			return;
+		}
+	}
+
+	/*
+	 *	give the packet to the higher layer
+	 *
+	 *	Note: the total length of packet
+	 *	is the total length field of the segmentation part,
+	 *	or, if absent, the segment length field of the
+	 *	header.
+	 */
+	INCSTAT(cns_delivered);
+	switch (clnp->cnf_type & CNF_TYPE) {
+	case CLNP_ER:
+		/*
+		 *	This ER must have the er option.
+		 *	If the option is not present, discard datagram.
+		 */
+		if (oidxp == NULL || oidxp->cni_er_reason == ER_INVALREAS) {
+			clnp_discard(m, GEN_HDRSYNTAX);
+		} else {
+			clnp_er_input(m, &src, oidxp->cni_er_reason);
+		}
+		break;
+
+	case CLNP_DT:
+		(*isosw[clnp_protox[ISOPROTO_TP]].pr_input)(m, &source, &target,
+			clnp->cnf_hdr_len, need_afrin);
+		break;
+
+ 	case CLNP_RAW:
+	case CLNP_ECR:
+		IFDEBUG(D_INPUT)
+			printf("clnp_input: raw input of %d bytes\n",
+				clnp->cnf_type & CNF_SEG_OK ? seg_part.cng_tot_len : seg_len);
+		ENDDEBUG
+		(*isosw[clnp_protox[ISOPROTO_RAW]].pr_input)(m, &source, &target,
+					clnp->cnf_hdr_len);
+		break;
+
+	case CLNP_EC:
+		IFDEBUG(D_INPUT)
+			printf("clnp_input: echoing packet\n");
+		ENDDEBUG
+		(void)clnp_echoreply(m,
+			(clnp->cnf_type & CNF_SEG_OK ? (int)seg_part.cng_tot_len : seg_len),
+			&source, &target, oidxp);
+		break;
+
+	default:
+ 		printf("clnp_input: unknown clnp pkt type %d\n",
+			clnp->cnf_type & CNF_TYPE);
+		clnp_stat.cns_delivered--;	/* undo the INCSTAT above */
+		clnp_stat.cns_noproto++;
+		clnp_discard(m, GEN_HDRSYNTAX);
+ 		break;
+	}
+}
+#endif /* ISO */
diff --git a/sys/netiso/clnp_options.c b/sys/netiso/clnp_options.c
new file mode 100644
index 00000000000..250b438664f
--- /dev/null
+++ b/sys/netiso/clnp_options.c
@@ -0,0 +1,532 @@
+/*-
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)clnp_options.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. 
+ +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* $Header: /var/src/sys/netiso/RCS/clnp_options.c,v 5.1 89/02/09 16:20:37 hagens Exp $ */ +/* $Source: /var/src/sys/netiso/RCS/clnp_options.c,v $ */ + +#ifdef ISO + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +/* + * FUNCTION: clnp_update_srcrt + * + * PURPOSE: Process src rt option accompanying a clnp datagram. + * - bump src route ptr if src routing and + * we appear current in src route list. + * + * RETURNS: none + * + * SIDE EFFECTS: + * + * NOTES: If source routing has been terminated, do nothing. 
+ */ +clnp_update_srcrt(options, oidx) +struct mbuf *options; /* ptr to options mbuf */ +struct clnp_optidx *oidx; /* ptr to option index */ +{ + u_char len; /* length of current address */ + struct iso_addr isoa; /* copy current address into here */ + + if (CLNPSRCRT_TERM(oidx, options)) { + IFDEBUG(D_OPTIONS) + printf("clnp_update_srcrt: src rt terminated\n"); + ENDDEBUG + return; + } + + len = CLNPSRCRT_CLEN(oidx, options); + bcopy(CLNPSRCRT_CADDR(oidx, options), (caddr_t)&isoa, len); + isoa.isoa_len = len; + + IFDEBUG(D_OPTIONS) + printf("clnp_update_srcrt: current src rt: %s\n", + clnp_iso_addrp(&isoa)); + ENDDEBUG + + if (clnp_ours(&isoa)) { + IFDEBUG(D_OPTIONS) + printf("clnp_update_srcrt: updating src rt\n"); + ENDDEBUG + + /* update pointer to next src route */ + len++; /* count length byte too! */ + CLNPSRCRT_OFF(oidx, options) += len; + } +} + +/* + * FUNCTION: clnp_dooptions + * + * PURPOSE: Process options accompanying a clnp datagram. + * Processing includes + * - log our address if recording route + * + * RETURNS: none + * + * SIDE EFFECTS: + * + * NOTES: + */ +clnp_dooptions(options, oidx, ifp, isoa) +struct mbuf *options; /* ptr to options mbuf */ +struct clnp_optidx *oidx; /* ptr to option index */ +struct ifnet *ifp; /* ptr to interface pkt is leaving on */ +struct iso_addr *isoa; /* ptr to our address for this ifp */ +{ + /* + * If record route is specified, move all + * existing records over, and insert the address of + * interface passed + */ + if (oidx->cni_recrtp) { + char *opt; /* ptr to beginning of recrt option */ + u_char off; /* offset from opt of first free byte */ + char *rec_start; /* beginning of new rt recorded */ + + opt = CLNP_OFFTOOPT(options, oidx->cni_recrtp); + off = *(opt + 1); + rec_start = opt + off - 1; + + IFDEBUG(D_OPTIONS) + printf("clnp_dooptions: record route: option x%x for %d bytes\n", + opt, oidx->cni_recrt_len); + printf("\tfree slot offset x%x\n", off); + printf("clnp_dooptions: recording %s\n", 
clnp_iso_addrp(isoa)); + printf("clnp_dooptions: option dump:\n"); + dump_buf(opt, oidx->cni_recrt_len); + ENDDEBUG + + /* proceed only if recording has not been terminated */ + if (off != 0xff) { + int new_addrlen = isoa->isoa_len + 1; + /* + * if there is insufficient room to store the next address, + * then terminate recording. Plus 1 on isoa_len is for the + * length byte itself + */ + if (oidx->cni_recrt_len - (off - 1) < new_addrlen) { + *(opt + 1) = 0xff; /* terminate recording */ + } else { + IFDEBUG(D_OPTIONS) + printf("clnp_dooptions: new addr at x%x for %d\n", + rec_start, new_addrlen); + ENDDEBUG + + bcopy((caddr_t)isoa, rec_start, new_addrlen); + + /* update offset field */ + *(opt + 1) += new_addrlen; + + IFDEBUG(D_OPTIONS) + printf("clnp_dooptions: new option dump:\n"); + dump_buf(opt, oidx->cni_recrt_len); + ENDDEBUG + } + } + } +} + +/* + * FUNCTION: clnp_set_opts + * + * PURPOSE: Check the data mbuf passed for option sanity. If it is + * ok, then set the options ptr to address the data mbuf. + * If an options mbuf exists, free it. This implies that + * any old options will be lost. If data is NULL, simply + * free any old options. + * + * RETURNS: unix error code + * + * SIDE EFFECTS: + * + * NOTES: + */ +clnp_set_opts(options, data) +struct mbuf **options; /* target for option information */ +struct mbuf **data; /* source of option information */ +{ + int error = 0; /* error return value */ + struct clnp_optidx dummy; /* dummy index - not used */ + + /* + * remove any existing options + */ + if (*options != NULL) { + m_freem(*options); + *options = NULL; + } + + if (*data != NULL) { + /* + * Insure that the options are reasonable. + * + * Also, we do not support security, priority, + * nor do we allow one to send an ER option + * + * The QOS parameter is checked for the DECBIT. 
+ */ + if ((clnp_opt_sanity(*data, mtod(*data, caddr_t), (*data)->m_len, + &dummy) != 0) || + (dummy.cni_securep) || + (dummy.cni_priorp) || + (dummy.cni_er_reason != ER_INVALREAS)) { + error = EINVAL; + } else { + *options = *data; + *data = NULL; /* so caller won't free mbuf @ *data */ + } + } + return error; +} + +/* + * FUNCTION: clnp_opt_sanity + * + * PURPOSE: Check the options (beginning at opts for len bytes) for + * sanity. In addition, fill in the option index structure + * in with information about each option discovered. + * + * RETURNS: success (options check out) - 0 + * failure - an ER pdu error code describing failure + * + * SIDE EFFECTS: + * + * NOTES: Each pointer field of the option index is filled in with + * the offset from the beginning of the mbuf data, not the + * actual address. + */ +clnp_opt_sanity(m, opts, len, oidx) +struct mbuf *m; /* mbuf options reside in */ +caddr_t opts; /* ptr to buffer containing options */ +int len; /* length of buffer */ +struct clnp_optidx *oidx; /* RETURN: filled in with option idx info */ +{ + u_char opcode; /* code of particular option */ + u_char oplen; /* length of a particular option */ + caddr_t opts_end; /* ptr to end of options */ + u_char pad = 0, secure = 0, srcrt = 0, recrt = 0, qos = 0, prior = 0; + /* flags for catching duplicate options */ + + IFDEBUG(D_OPTIONS) + printf("clnp_opt_sanity: checking %d bytes of data:\n", len); + dump_buf(opts, len); + ENDDEBUG + + /* clear option index field if passed */ + bzero((caddr_t)oidx, sizeof(struct clnp_optidx)); + + /* + * We need to indicate whether the ER option is present. This is done + * by overloading the er_reason field to also indicate presense of + * the option along with the option value. I would like ER_INVALREAS + * to have value 0, but alas, 0 is a valid er reason... 
+ */ + oidx->cni_er_reason = ER_INVALREAS; + + opts_end = opts + len; + while (opts < opts_end) { + /* must have at least 2 bytes per option (opcode and len) */ + if (opts + 2 > opts_end) + return(GEN_INCOMPLETE); + + opcode = *opts++; + oplen = *opts++; + IFDEBUG(D_OPTIONS) + printf("clnp_opt_sanity: opcode is %x and oplen %d\n", + opcode, oplen); + printf("clnp_opt_sanity: clnpoval_SRCRT is %x\n", CLNPOVAL_SRCRT); + + switch (opcode) { + case CLNPOVAL_PAD: { + printf("CLNPOVAL_PAD\n"); + } break; + case CLNPOVAL_SECURE: { + printf("CLNPOVAL_SECURE\n"); + } break; + case CLNPOVAL_SRCRT: { + printf("CLNPOVAL_SRCRT\n"); + } break; + case CLNPOVAL_RECRT: { + printf("CLNPOVAL_RECRT\n"); + } break; + case CLNPOVAL_QOS: { + printf("CLNPOVAL_QOS\n"); + } break; + case CLNPOVAL_PRIOR: { + printf("CLNPOVAL_PRIOR\n"); + } break; + case CLNPOVAL_ERREAS: { + printf("CLNPOVAL_ERREAS\n"); + } break; + default: + printf("UKNOWN option %x\n", opcode); + } + ENDDEBUG + + /* don't allow crazy length values */ + if (opts + oplen > opts_end) + return(GEN_INCOMPLETE); + + switch (opcode) { + case CLNPOVAL_PAD: + /* + * Padding: increment pointer by length of padding + */ + if (pad++) /* duplicate ? */ + return(GEN_DUPOPT); + opts += oplen; + break; + + case CLNPOVAL_SECURE: { + u_char format = *opts; + + if (secure++) /* duplicate ? */ + return(GEN_DUPOPT); + /* + * Security: high 2 bits of first octet indicate format + * (00 in high bits is reserved). + * Remaining bits must be 0. Remaining octets indicate + * actual security + */ + if (((format & 0x3f) > 0) || /* low 6 bits set ? */ + ((format & 0xc0) == 0)) /* high 2 bits zero ? 
*/ + return(GEN_HDRSYNTAX); + + oidx->cni_securep = CLNP_OPTTOOFF(m, opts); + oidx->cni_secure_len = oplen; + opts += oplen; + } break; + + case CLNPOVAL_SRCRT: { + u_char type, offset; /* type of rt, offset of start */ + caddr_t route_end; /* address of end of route option */ + + IFDEBUG(D_OPTIONS) + printf("clnp_opt_sanity: SRC RT\n"); + ENDDEBUG + + if (srcrt++) /* duplicate ? */ + return(GEN_DUPOPT); + /* + * source route: There must be 2 bytes following the length + * field: type and offset. The type must be either + * partial route or complete route. The offset field must + * be within the option. A single exception is made, however. + * The offset may be 1 greater than the length. This case + * occurs when the last source route record is consumed. + * In this case, we ignore the source route option. + * RAH? You should be able to set offset to 'ff' like in record + * route! + * Following this is a series of address fields. + * Each address field is composed of a (length, address) pair. 
+ * Insure that the offset and each address length is reasonable + */ + route_end = opts + oplen; + + if (opts + 2 > route_end) + return(SRCRT_SYNTAX); + + type = *opts; + offset = *(opts+1); + + + /* type must be partial or complete */ + if (!((type == CLNPOVAL_PARTRT) || (type == CLNPOVAL_COMPRT))) + return(SRCRT_SYNTAX); + + oidx->cni_srcrt_s = CLNP_OPTTOOFF(m, opts); + oidx->cni_srcrt_len = oplen; + + opts += offset-1; /*set opts to first addr in rt */ + + /* + * Offset must be reasonable: + * less than end of options, or equal to end of options + */ + if (opts >= route_end) { + if (opts == route_end) { + IFDEBUG(D_OPTIONS) + printf("clnp_opt_sanity: end of src route info\n"); + ENDDEBUG + break; + } else + return(SRCRT_SYNTAX); + } + + while (opts < route_end) { + u_char addrlen = *opts++; + if (opts + addrlen > route_end) + return(SRCRT_SYNTAX); + opts += addrlen; + } + } break; + case CLNPOVAL_RECRT: { + u_char type, offset; /* type of rt, offset of start */ + caddr_t record_end; /* address of end of record option */ + + if (recrt++) /* duplicate ? */ + return(GEN_DUPOPT); + /* + * record route: after the length field, expect a + * type and offset. Type must be partial or complete. + * Offset indicates where to start recording. Insure it + * is within the option. All ones for offset means + * recording is terminated. + */ + record_end = opts + oplen; + + oidx->cni_recrtp = CLNP_OPTTOOFF(m, opts); + oidx->cni_recrt_len = oplen; + + if (opts + 2 > record_end) + return(GEN_INCOMPLETE); + + type = *opts; + offset = *(opts+1); + + /* type must be partial or complete */ + if (!((type == CLNPOVAL_PARTRT) || (type == CLNPOVAL_COMPRT))) + return(GEN_HDRSYNTAX); + + /* offset must be reasonable */ + if ((offset < 0xff) && (opts + offset > record_end)) + return(GEN_HDRSYNTAX); + opts += oplen; + } break; + case CLNPOVAL_QOS: { + u_char format = *opts; + + if (qos++) /* duplicate ? 
*/ + return(GEN_DUPOPT); + /* + * qos: high 2 bits of first octet indicate format + * (00 in high bits is reserved). + * Remaining bits must be 0 (unless format indicates + * globally unique qos, in which case remaining bits indicate + * qos (except bit 6 which is reserved)). Otherwise, + * remaining octets indicate actual qos. + */ + if (((format & 0xc0) == 0) || /* high 2 bits zero ? */ + (((format & 0xc0) != CLNPOVAL_GLOBAL) && + ((format & 0x3f) > 0))) /* not global,low bits used ? */ + return(GEN_HDRSYNTAX); + + oidx->cni_qos_formatp = CLNP_OPTTOOFF(m, opts); + oidx->cni_qos_len = oplen; + + opts += oplen; + } break; + + case CLNPOVAL_PRIOR: { + if (prior++) /* duplicate ? */ + return(GEN_DUPOPT); + /* + * priority: value must be one byte long + */ + if (oplen != 1) + return(GEN_HDRSYNTAX); + + oidx->cni_priorp = CLNP_OPTTOOFF(m, opts); + + opts += oplen; + } break; + + case CLNPOVAL_ERREAS: { + /* + * er reason: value must be two bytes long + */ + if (oplen != 2) + return(GEN_HDRSYNTAX); + + oidx->cni_er_reason = *opts; + + opts += oplen; + } break; + + default: { + IFDEBUG(D_OPTIONS) + printf("clnp_opt_sanity: UNKNOWN OPTION 0x%x\n", opcode); + ENDDEBUG + return(DISC_UNSUPPOPT); + } + } + } + IFDEBUG(D_OPTIONS) + printf("clnp_opt_sanity: return(0)\n", opcode); + ENDDEBUG + return(0); +} +#endif /* ISO */ diff --git a/sys/netiso/clnp_output.c b/sys/netiso/clnp_output.c new file mode 100644 index 00000000000..aba9f6e00bd --- /dev/null +++ b/sys/netiso/clnp_output.c @@ -0,0 +1,561 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)clnp_output.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+ +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* $Header: /var/src/sys/netiso/RCS/clnp_output.c,v 5.0 89/02/08 12:00:15 hagens Exp $ */ +/* $Source: /var/src/sys/netiso/RCS/clnp_output.c,v $ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +static struct clnp_fixed dt_template = { + ISO8473_CLNP, /* network identifier */ + 0, /* length */ + ISO8473_V1, /* version */ + CLNP_TTL, /* ttl */ + CLNP_DT|CNF_SEG_OK|CNF_ERR_OK, /* type */ + 0, /* segment length */ + 0 /* checksum */ +}; + +static struct clnp_fixed raw_template = { + ISO8473_CLNP, /* network identifier */ + 0, /* length */ + ISO8473_V1, /* version */ + CLNP_TTL, /* ttl */ + CLNP_RAW|CNF_SEG_OK|CNF_ERR_OK, /* type */ + 0, /* segment length */ + 0 /* checksum */ +}; + +static struct clnp_fixed echo_template = { + ISO8473_CLNP, /* network identifier */ + 0, /* length */ + ISO8473_V1, /* version */ + CLNP_TTL, /* ttl */ + CLNP_EC|CNF_SEG_OK|CNF_ERR_OK, /* type */ + 0, /* segment length */ + 0 /* checksum */ +}; + +static struct clnp_fixed echor_template = { + ISO8473_CLNP, /* network identifier */ + 0, /* length */ + ISO8473_V1, /* version */ + CLNP_TTL, /* ttl */ + CLNP_ECR|CNF_SEG_OK|CNF_ERR_OK, /* type */ + 0, /* segment length */ + 0 /* checksum */ +}; + +#ifdef DECBIT +u_char qos_option[] = {CLNPOVAL_QOS, 1, + CLNPOVAL_GLOBAL|CLNPOVAL_SEQUENCING|CLNPOVAL_LOWDELAY}; +#endif /* DECBIT */ + +int clnp_id = 0; /* id for segmented dgrams */ + +/* + * FUNCTION: clnp_output + * + * PURPOSE: output the data in the mbuf as a clnp datagram + * + * The data specified by m0 is sent as a clnp datagram. + * The mbuf chain m0 will be freed when this routine has + * returned. + * + * If options is non-null, it points to an mbuf which contains + * options to be sent with the datagram. 
The options must + * be formatted in the mbuf according to clnp rules. Options + * will not be freed. + * + * Datalen specifies the length of the data in m0. + * + * Src and dst are the addresses for the packet. + * + * If route is non-null, it is used as the route for + * the packet. + * + * By default, a DT is sent. However, if flags & CNLP_SEND_ER + * then an ER will be sent. If flags & CLNP_SEND_RAW, then + * the packet will be send as raw clnp. + * + * RETURNS: 0 success + * appropriate error code + * + * SIDE EFFECTS: none + * + * NOTES: + * Flags are interpretated as follows: + * CLNP_NO_SEG - do not allow this pkt to be segmented. + * CLNP_NO_ER - have pkt request ER suppression. + * CLNP_SEND_RAW - send pkt as RAW DT rather than TP DT + * CLNP_NO_CKSUM - don't compute clnp checksum + * CLNP_ECHO - send as ECHO packet + * + * When checking for a cached packet, clnp checks + * that the route taken is still up. It does not + * check that the route is still to the same destination. + * This means that any entity that alters an existing + * route for an isopcb (such as when a redirect arrives) + * must invalidate the clnp cache. It might be perferable + * to have clnp check that the route has the same dest, but + * by avoiding this check, we save a call to iso_addrmatch1. 
+ */ +clnp_output(m0, isop, datalen, flags) +struct mbuf *m0; /* data for the packet */ +struct isopcb *isop; /* iso pcb */ +int datalen; /* number of bytes of data in m0 */ +int flags; /* flags */ +{ + int error = 0; /* return value of function */ + register struct mbuf *m = m0; /* mbuf for clnp header chain */ + register struct clnp_fixed *clnp; /* ptr to fixed part of hdr */ + register caddr_t hoff; /* offset into header */ + int total_len; /* total length of packet */ + struct iso_addr *src; /* ptr to source address */ + struct iso_addr *dst; /* ptr to destination address */ + struct clnp_cache clc; /* storage for cache information */ + struct clnp_cache *clcp = NULL; /* ptr to clc */ + int hdrlen = 0; + + dst = &isop->isop_faddr->siso_addr; + if (isop->isop_laddr == 0) { + struct iso_ifaddr *ia = 0; + clnp_route(dst, &isop->isop_route, flags, 0, &ia); + if (ia == 0 || ia->ia_ifa.ifa_addr->sa_family != AF_ISO) + return (ENETUNREACH); + src = &ia->ia_addr.siso_addr; + } else + src = &isop->isop_laddr->siso_addr; + + IFDEBUG(D_OUTPUT) + printf("clnp_output: to %s", clnp_iso_addrp(dst)); + printf(" from %s of %d bytes\n", clnp_iso_addrp(src), datalen); + printf("\toptions x%x, flags x%x, isop_clnpcache x%x\n", + isop->isop_options, flags, isop->isop_clnpcache); + ENDDEBUG + + if (isop->isop_clnpcache != NULL) { + clcp = mtod(isop->isop_clnpcache, struct clnp_cache *); + } + + /* + * Check if cache is valid ... 
+ */ + IFDEBUG(D_OUTPUT) + printf("clnp_output: ck cache: clcp %x\n", clcp); + if (clcp != NULL) { + printf("\tclc_dst %s\n", clnp_iso_addrp(&clcp->clc_dst)); + printf("\tisop_opts x%x, clc_opts x%x\n", isop->isop_options, + clcp->clc_options); + if (isop->isop_route.ro_rt) + printf("\tro_rt x%x, rt_flags x%x\n", + isop->isop_route.ro_rt, isop->isop_route.ro_rt->rt_flags); + printf("\tflags x%x, clc_flags x%x\n", flags, clcp->clc_flags); + printf("\tclc_hdr x%x\n", clcp->clc_hdr); + } + ENDDEBUG + if ((clcp != NULL) && /* cache exists */ + (isop->isop_options == clcp->clc_options) && /* same options */ + (iso_addrmatch1(dst, &clcp->clc_dst)) && /* dst still same */ + (isop->isop_route.ro_rt != NULL) && /* route exists */ + (isop->isop_route.ro_rt == clcp->clc_rt) && /* and is cached */ + (isop->isop_route.ro_rt->rt_flags & RTF_UP) && /* route still up */ + (flags == clcp->clc_flags) && /* same flags */ + (clcp->clc_hdr != NULL)) { /* hdr mbuf exists */ + /* + * The cache is valid + */ + + IFDEBUG(D_OUTPUT) + printf("clnp_output: using cache\n"); + ENDDEBUG + + m = m_copy(clcp->clc_hdr, 0, (int)M_COPYALL); + if (m == NULL) { + /* + * No buffers left to copy cached packet header. Use + * the cached packet header this time, and + * mark the hdr as vacant + */ + m = clcp->clc_hdr; + clcp->clc_hdr = NULL; + } + m->m_next = m0; /* ASSUMES pkt hdr is 1 mbuf long */ + clnp = mtod(m, struct clnp_fixed *); + } else { + struct clnp_optidx *oidx = NULL; /* index to clnp options */ + + /* + * The cache is not valid. Allocate an mbuf (if necessary) + * to hold cached info. If one is not available, then + * don't bother with the cache + */ + INCSTAT(cns_cachemiss); + if (flags & CLNP_NOCACHE) { + clcp = &clc; + } else { + if (isop->isop_clnpcache == NULL) { + /* + * There is no clnpcache. Allocate an mbuf to hold one + */ + if ((isop->isop_clnpcache = m_get(M_DONTWAIT, MT_HEADER)) + == NULL) { + /* + * No mbufs available. Pretend that we don't want + * caching this time. 
+ */ + IFDEBUG(D_OUTPUT) + printf("clnp_output: no mbufs to allocate to cache\n"); + ENDDEBUG + flags |= CLNP_NOCACHE; + clcp = &clc; + } else { + clcp = mtod(isop->isop_clnpcache, struct clnp_cache *); + } + } else { + /* + * A clnpcache mbuf exists. If the clc_hdr is not null, + * we must free it, as a new one is about to be created. + */ + clcp = mtod(isop->isop_clnpcache, struct clnp_cache *); + if (clcp->clc_hdr != NULL) { + /* + * The clc_hdr is not null but a clnpcache mbuf exists. + * This means that there was a cache, but the existing + * copy of the hdr is no longer valid. Free it now + * before we lose the pointer to it. + */ + IFDEBUG(D_OUTPUT) + printf("clnp_output: freeing old clc_hdr 0x%x\n", + clcp->clc_hdr); + ENDDEBUG + m_free(clcp->clc_hdr); + IFDEBUG(D_OUTPUT) + printf("clnp_output: freed old clc_hdr (done)\n"); + ENDDEBUG + } + } + } + IFDEBUG(D_OUTPUT) + printf("clnp_output: NEW clcp x%x\n",clcp); + ENDDEBUG + bzero((caddr_t)clcp, sizeof(struct clnp_cache)); + + if (isop->isop_optindex) + oidx = mtod(isop->isop_optindex, struct clnp_optidx *); + + /* + * Don't allow packets with security, quality of service, + * priority, or error report options to be sent. 
+ */ + if ((isop->isop_options) && (oidx)) { + if ((oidx->cni_securep) || + (oidx->cni_priorp) || + (oidx->cni_qos_formatp) || + (oidx->cni_er_reason != ER_INVALREAS)) { + IFDEBUG(D_OUTPUT) + printf("clnp_output: pkt dropped - option unsupported\n"); + ENDDEBUG + m_freem(m0); + return(EINVAL); + } + } + + /* + * Don't allow any invalid flags to be set + */ + if ((flags & (CLNP_VFLAGS)) != flags) { + IFDEBUG(D_OUTPUT) + printf("clnp_output: packet dropped - flags unsupported\n"); + ENDDEBUG + INCSTAT(cns_odropped); + m_freem(m0); + return(EINVAL); + } + + /* + * Don't allow funny lengths on dst; src may be zero in which + * case we insert the source address based upon the interface + */ + if ((src->isoa_len > sizeof(struct iso_addr)) || + (dst->isoa_len == 0) || + (dst->isoa_len > sizeof(struct iso_addr))) { + m_freem(m0); + INCSTAT(cns_odropped); + return(ENAMETOOLONG); + } + + /* + * Grab mbuf to contain header + */ + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == 0) { + m_freem(m0); + INCSTAT(cns_odropped); + return(ENOBUFS); + } + INCSTAT(cns_sent); + m->m_next = m0; + clnp = mtod(m, struct clnp_fixed *); + clcp->clc_segoff = 0; + + /* + * Fill in all of fixed hdr except lengths and checksum + */ + if (flags & CLNP_SEND_RAW) { + *clnp = raw_template; + } else if (flags & CLNP_ECHO) { + *clnp = echo_template; + } else if (flags & CLNP_ECHOR) { + *clnp = echor_template; + } else { + *clnp = dt_template; + } + if (flags & CLNP_NO_SEG) + clnp->cnf_type &= ~CNF_SEG_OK; + if (flags & CLNP_NO_ER) + clnp->cnf_type &= ~CNF_ERR_OK; + + /* + * Route packet; special case for source rt + */ + if ((isop->isop_options) && CLNPSRCRT_VALID(oidx)) { + IFDEBUG(D_OUTPUT) + printf("clnp_output: calling clnp_srcroute\n"); + ENDDEBUG + error = clnp_srcroute(isop->isop_options, oidx, &isop->isop_route, + &clcp->clc_firsthop, &clcp->clc_ifa, dst); + } else { + IFDEBUG(D_OUTPUT) + ENDDEBUG + error = clnp_route(dst, &isop->isop_route, flags, + &clcp->clc_firsthop, &clcp->clc_ifa); + } + if 
(error || (clcp->clc_ifa == 0)) { + IFDEBUG(D_OUTPUT) + printf("clnp_output: route failed, errno %d\n", error); + printf("@clcp:\n"); + dump_buf(clcp, sizeof (struct clnp_cache)); + ENDDEBUG + goto bad; + } + clcp->clc_rt = isop->isop_route.ro_rt; /* XXX */ + clcp->clc_ifp = clcp->clc_ifa->ia_ifp; /* XXX */ + + IFDEBUG(D_OUTPUT) + printf("clnp_output: packet routed to %s\n", + clnp_iso_addrp( + &((struct sockaddr_iso *)clcp->clc_firsthop)->siso_addr)); + ENDDEBUG + + /* + * If src address is not yet specified, use address of + * interface. NOTE: this will now update the laddr field in + * the isopcb. Is this desirable? RAH? + */ + if (src->isoa_len == 0) { + src = &(clcp->clc_ifa->ia_addr.siso_addr); + IFDEBUG(D_OUTPUT) + printf("clnp_output: new src %s\n", clnp_iso_addrp(src)); + ENDDEBUG + } + + /* + * Insert the source and destination address, + */ + hoff = (caddr_t)clnp + sizeof(struct clnp_fixed); + CLNP_INSERT_ADDR(hoff, *dst); + CLNP_INSERT_ADDR(hoff, *src); + + /* + * Leave room for the segment part, if segmenting is selected + */ + if (clnp->cnf_type & CNF_SEG_OK) { + clcp->clc_segoff = hoff - (caddr_t)clnp; + hoff += sizeof(struct clnp_segment); + } + + clnp->cnf_hdr_len = m->m_len = (u_char)(hoff - (caddr_t)clnp); + hdrlen = clnp->cnf_hdr_len; + +#ifdef DECBIT + /* + * Add the globally unique QOS (with room for congestion experienced + * bit). I can safely assume that this option is not in the options + * mbuf below because I checked that the option was not specified + * previously + */ + if ((m->m_len + sizeof(qos_option)) < MLEN) { + bcopy((caddr_t)qos_option, hoff, sizeof(qos_option)); + clnp->cnf_hdr_len += sizeof(qos_option); + hdrlen += sizeof(qos_option); + m->m_len += sizeof(qos_option); + } +#endif /* DECBIT */ + + /* + * If an options mbuf is present, concatenate a copy to the hdr mbuf. 
+ */ + if (isop->isop_options) { + struct mbuf *opt_copy = m_copy(isop->isop_options, 0, (int)M_COPYALL); + if (opt_copy == NULL) { + error = ENOBUFS; + goto bad; + } + /* Link in place */ + opt_copy->m_next = m->m_next; + m->m_next = opt_copy; + + /* update size of header */ + clnp->cnf_hdr_len += opt_copy->m_len; + hdrlen += opt_copy->m_len; + } + + if (hdrlen > CLNP_HDR_MAX) { + error = EMSGSIZE; + goto bad; + } + + /* + * Now set up the cache entry in the pcb + */ + if ((flags & CLNP_NOCACHE) == 0) { + if (clcp->clc_hdr = m_copy(m, 0, (int)clnp->cnf_hdr_len)) { + clcp->clc_dst = *dst; + clcp->clc_flags = flags; + clcp->clc_options = isop->isop_options; + } + } + } + /* + * If small enough for interface, send directly + * Fill in segmentation part of hdr if using the full protocol + */ + total_len = clnp->cnf_hdr_len + datalen; + if (clnp->cnf_type & CNF_SEG_OK) { + struct clnp_segment seg_part; /* segment part of hdr */ + seg_part.cng_id = htons(clnp_id++); + seg_part.cng_off = htons(0); + seg_part.cng_tot_len = htons(total_len); + (void) bcopy((caddr_t)&seg_part, (caddr_t) clnp + clcp->clc_segoff, + sizeof(seg_part)); + } + if (total_len <= SN_MTU(clcp->clc_ifp, clcp->clc_rt)) { + HTOC(clnp->cnf_seglen_msb, clnp->cnf_seglen_lsb, total_len); + m->m_pkthdr.len = total_len; + /* + * Compute clnp checksum (on header only) + */ + if (flags & CLNP_NO_CKSUM) { + HTOC(clnp->cnf_cksum_msb, clnp->cnf_cksum_lsb, 0); + } else { + iso_gen_csum(m, CLNP_CKSUM_OFF, (int)clnp->cnf_hdr_len); + } + + IFDEBUG(D_DUMPOUT) + struct mbuf *mdump = m; + printf("clnp_output: sending dg:\n"); + while (mdump != NULL) { + dump_buf(mtod(mdump, caddr_t), mdump->m_len); + mdump = mdump->m_next; + } + ENDDEBUG + + error = SN_OUTPUT(clcp, m); + goto done; + } else { + /* + * Too large for interface; fragment if possible. 
+ */ + error = clnp_fragment(clcp->clc_ifp, m, clcp->clc_firsthop, + total_len, clcp->clc_segoff, flags, clcp->clc_rt); + goto done; + } +bad: + m_freem(m); +done: + if (error) { + clnp_stat.cns_sent--; + clnp_stat.cns_odropped++; + } + return (error); +} + +int clnp_ctloutput() +{ +} diff --git a/sys/netiso/clnp_raw.c b/sys/netiso/clnp_raw.c new file mode 100644 index 00000000000..0bc3dbac4b1 --- /dev/null +++ b/sys/netiso/clnp_raw.c @@ -0,0 +1,352 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)clnp_raw.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+ +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* $Header: clnp_raw.c,v 4.2 88/06/29 14:58:56 hagens Exp $ */ +/* $Source: /usr/argo/sys/netiso/RCS/clnp_raw.c,v $ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include /* XXX -- defines SOL_NETWORK */ + +struct sockproto rclnp_proto = { PF_ISO, 0 }; +/* + * FUNCTION: rclnp_input + * + * PURPOSE: Setup generic address an protocol structures for + * raw input routine, then pass them along with the + * mbuf chain. + * + * RETURNS: none + * + * SIDE EFFECTS: + * + * NOTES: The protocol field of rclnp_proto is set to zero indicating + * no protocol. + */ +rclnp_input(m, src, dst, hdrlen) +struct mbuf *m; /* ptr to packet */ +struct sockaddr_iso *src; /* ptr to src address */ +struct sockaddr_iso *dst; /* ptr to dest address */ +int hdrlen; /* length (in bytes) of clnp header */ +{ +#ifdef TROLL + if (trollctl.tr_ops & TR_CHUCK) { + m_freem(m); + return; + } +#endif /* TROLL */ + + raw_input(m, &rclnp_proto, (struct sockaddr *)src, (struct sockaddr *)dst); +} + +/* + * FUNCTION: rclnp_output + * + * PURPOSE: Prepare to send a raw clnp packet. Setup src and dest + * addresses, count the number of bytes to send, and + * call clnp_output. + * + * RETURNS: success - 0 + * failure - an appropriate error code + * + * SIDE EFFECTS: + * + * NOTES: + */ +rclnp_output(m0, so) +struct mbuf *m0; /* packet to send */ +struct socket *so; /* socket to send from */ +{ + register struct mbuf *m; /* used to scan a chain */ + int len = 0; /* store length of chain here */ + struct rawisopcb *rp = sotorawisopcb(so); /* ptr to raw cb */ + int error; /* return value of function */ + int flags; /* flags for clnp_output */ + + if (0 == (m0->m_flags & M_PKTHDR)) + return (EINVAL); + /* + * Set up src address. 
If user has bound socket to an address, use it. + * Otherwise, do not specify src (clnp_output will fill it in). + */ + if (rp->risop_rcb.rcb_laddr) { + if (rp->risop_isop.isop_sladdr.siso_family != AF_ISO) { +bad: + m_freem(m0); + return(EAFNOSUPPORT); + } + } + /* set up dest address */ + if (rp->risop_rcb.rcb_faddr == 0) + goto bad; + rp->risop_isop.isop_sfaddr = + *(struct sockaddr_iso *)rp->risop_rcb.rcb_faddr; + rp->risop_isop.isop_faddr = &rp->risop_isop.isop_sfaddr; + + /* get flags and ship it off */ + flags = rp->risop_flags & CLNP_VFLAGS; + + error = clnp_output(m0, &rp->risop_isop, m0->m_pkthdr.len, + flags|CLNP_NOCACHE); + + return (error); +} + +/* + * FUNCTION: rclnp_ctloutput + * + * PURPOSE: Raw clnp socket option processing + * All options are stored inside an mbuf. + * + * RETURNS: success - 0 + * failure - unix error code + * + * SIDE EFFECTS: If the options mbuf does not exist, it the mbuf passed + * is used. + * + * NOTES: + */ +rclnp_ctloutput(op, so, level, optname, m) +int op; /* type of operation */ +struct socket *so; /* ptr to socket */ +int level; /* level of option */ +int optname; /* name of option */ +struct mbuf **m; /* ptr to ptr to option data */ +{ + int error = 0; + register struct rawisopcb *rp = sotorawisopcb(so);/* raw cb ptr */ + + IFDEBUG(D_CTLOUTPUT) + printf("rclnp_ctloutput: op = x%x, level = x%x, name = x%x\n", + op, level, optname); + if (*m != NULL) { + printf("rclnp_ctloutput: %d bytes of mbuf data\n", (*m)->m_len); + dump_buf(mtod((*m), caddr_t), (*m)->m_len); + } + ENDDEBUG + +#ifdef SOL_NETWORK + if (level != SOL_NETWORK) + error = EINVAL; + else switch (op) { +#else + switch (op) { +#endif /* SOL_NETWORK */ + case PRCO_SETOPT: + switch (optname) { + case CLNPOPT_FLAGS: { + u_short usr_flags; + /* + * Insure that the data passed has exactly one short in it + */ + if ((*m == NULL) || ((*m)->m_len != sizeof(short))) { + error = EINVAL; + break; + } + + /* + * Don't allow invalid flags to be set + */ + usr_flags = 
(*mtod((*m), short *)); + + if ((usr_flags & (CLNP_VFLAGS)) != usr_flags) { + error = EINVAL; + } else + rp->risop_flags |= usr_flags; + + } break; + + case CLNPOPT_OPTS: + if (error = clnp_set_opts(&rp->risop_isop.isop_options, m)) + break; + rp->risop_isop.isop_optindex = m_get(M_WAIT, MT_SOOPTS); + (void) clnp_opt_sanity(rp->risop_isop.isop_options, + mtod(rp->risop_isop.isop_options, caddr_t), + rp->risop_isop.isop_options->m_len, + mtod(rp->risop_isop.isop_optindex, + struct clnp_optidx *)); + break; + } + break; + + case PRCO_GETOPT: +#ifdef notdef + /* commented out to keep hi C quiet */ + switch (optname) { + default: + error = EINVAL; + break; + } +#endif /* notdef */ + break; + default: + error = EINVAL; + break; + } + if (op == PRCO_SETOPT) { + /* note: m_freem does not barf is *m is NULL */ + m_freem(*m); + *m = NULL; + } + + return error; +} + +/*ARGSUSED*/ +clnp_usrreq(so, req, m, nam, control) + register struct socket *so; + int req; + struct mbuf *m, *nam, *control; +{ + register int error = 0; + register struct rawisopcb *rp = sotorawisopcb(so); + + rp = sotorawisopcb(so); + switch (req) { + + case PRU_ATTACH: + if (rp) + panic("rip_attach"); + MALLOC(rp, struct rawisopcb *, sizeof *rp, M_PCB, M_WAITOK); + if (rp == 0) + return (ENOBUFS); + bzero((caddr_t)rp, sizeof *rp); + so->so_pcb = (caddr_t)rp; + break; + + case PRU_DETACH: + if (rp == 0) + panic("rip_detach"); + if (rp->risop_isop.isop_options) + m_freem(rp->risop_isop.isop_options); + if (rp->risop_isop.isop_route.ro_rt) + RTFREE(rp->risop_isop.isop_route.ro_rt); + if (rp->risop_rcb.rcb_laddr) + rp->risop_rcb.rcb_laddr = 0; + /* free clnp cached hdr if necessary */ + if (rp->risop_isop.isop_clnpcache != NULL) { + struct clnp_cache *clcp = + mtod(rp->risop_isop.isop_clnpcache, struct clnp_cache *); + if (clcp->clc_hdr != NULL) { + m_free(clcp->clc_hdr); + } + m_free(rp->risop_isop.isop_clnpcache); + } + if (rp->risop_isop.isop_optindex != NULL) + m_free(rp->risop_isop.isop_optindex); + + 
break; + + case PRU_BIND: + { + struct sockaddr_iso *addr = mtod(nam, struct sockaddr_iso *); + + if (nam->m_len != sizeof(*addr)) + return (EINVAL); + if ((ifnet == 0) || + (addr->siso_family != AF_ISO) || + (addr->siso_addr.isoa_len && + ifa_ifwithaddr((struct sockaddr *)addr) == 0)) + return (EADDRNOTAVAIL); + rp->risop_isop.isop_sladdr = *addr; + rp->risop_rcb.rcb_laddr = (struct sockaddr *) + (rp->risop_isop.isop_laddr = &rp->risop_isop.isop_sladdr); + return (0); + } + case PRU_CONNECT: + { + struct sockaddr_iso *addr = mtod(nam, struct sockaddr_iso *); + + if ((nam->m_len > sizeof(*addr)) || (addr->siso_len > sizeof(*addr))) + return (EINVAL); + if (ifnet == 0) + return (EADDRNOTAVAIL); + if (addr->siso_family != AF_ISO) + rp->risop_isop.isop_sfaddr = *addr; + rp->risop_rcb.rcb_faddr = (struct sockaddr *) + (rp->risop_isop.isop_faddr = &rp->risop_isop.isop_sfaddr); + soisconnected(so); + return (0); + } + } + error = raw_usrreq(so, req, m, nam, control); + + if (error && req == PRU_ATTACH && so->so_pcb) + free((caddr_t)rp, M_PCB); + return (error); +} diff --git a/sys/netiso/clnp_stat.h b/sys/netiso/clnp_stat.h new file mode 100644 index 00000000000..07cd72c63e4 --- /dev/null +++ b/sys/netiso/clnp_stat.h @@ -0,0 +1,103 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)clnp_stat.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. 
+ +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* $Header: /var/src/sys/netiso/RCS/clnp_stat.h,v 5.1 89/02/09 16:20:42 hagens Exp $ */ +/* $Source: /var/src/sys/netiso/RCS/clnp_stat.h,v $ */ + + +#ifndef __CLNP_STAT__ +#define __CLNP_STAT__ + +struct clnp_stat { + int cns_total; /* total pkts received */ + int cns_toosmall; /* fixed part of header too small */ + int cns_badhlen; /* header length is not reasonable */ + int cns_badcsum; /* checksum on packet failed */ + int cns_badaddr; /* address fields were not reasonable */ + int cns_badvers; /* incorrect version */ + int cns_noseg; /* segment information forgotten */ + int cns_noproto; /* incorrect protocol id */ + int cns_delivered; /* packets consumed by protocol */ + int cns_ttlexpired; /* ttl has expired */ + int cns_forward; /* forwarded packets */ + int cns_sent; /* total packets sent */ + int cns_odropped; /* o.k. packets discarded, e.g. 
ENOBUFS */ + int cns_cantforward; /* non-forwarded packets */ + int cns_fragmented; /* packets fragmented */ + int cns_fragments; /* fragments received */ + int cns_fragdropped; /* fragments discarded */ + int cns_fragtimeout; /* fragments timed out */ + int cns_ofragments; /* fragments generated */ + int cns_cantfrag; /* fragmentation prohibited */ + int cns_reassembled; /* packets reconstructed */ + int cns_cachemiss; /* cache misses */ + int cns_congest_set; /* congestion experienced bit set */ + int cns_congest_rcvd; /* congestion experienced bit received */ + int cns_er_inhist[CLNP_ERRORS + 1]; + int cns_er_outhist[CLNP_ERRORS + 1]; +} clnp_stat ; + +#ifdef INCSTAT +#undef INCSTAT +#endif /* INCSTAT */ +#define INCSTAT(x) clnp_stat./**/x/**/++ + +#endif /* __CLNP_STAT__ */ diff --git a/sys/netiso/clnp_subr.c b/sys/netiso/clnp_subr.c new file mode 100644 index 00000000000..c877811be16 --- /dev/null +++ b/sys/netiso/clnp_subr.c @@ -0,0 +1,658 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)clnp_subr.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. 
+ +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* $Header: /var/src/sys/netiso/RCS/clnp_subr.c,v 5.1 89/02/09 16:20:46 hagens Exp $ */ +/* $Source: /var/src/sys/netiso/RCS/clnp_subr.c,v $ */ + +#ifdef ISO + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/* + * FUNCTION: clnp_data_ck + * + * PURPOSE: Check that the amount of data in the mbuf chain is + * at least as much as the clnp header would have us + * expect. Trim mbufs if longer than expected, drop + * packet if shorter than expected. 
+ * + * RETURNS: success - ptr to mbuf chain + * failure - 0 + * + * SIDE EFFECTS: + * + * NOTES: + */ +struct mbuf * +clnp_data_ck(m, length) +register struct mbuf *m; /* ptr to mbuf chain containing hdr & data */ +int length; /* length (in bytes) of packet */ + { + register int len; /* length of data */ + register struct mbuf *mhead; /* ptr to head of chain */ + + len = -length; + mhead = m; + for (;;) { + len += m->m_len; + if (m->m_next == 0) + break; + m = m->m_next; + } + if (len != 0) { + if (len < 0) { + INCSTAT(cns_toosmall); + clnp_discard(mhead, GEN_INCOMPLETE); + return 0; + } + if (len <= m->m_len) + m->m_len -= len; + else + m_adj(mhead, -len); + } + return mhead; +} + +#ifdef notdef +/* + * FUNCTION: clnp_extract_addr + * + * PURPOSE: Extract the source and destination address from the + * supplied buffer. Place them in the supplied address buffers. + * If insufficient data is supplied, then fail. + * + * RETURNS: success - Address of first byte in the packet past + * the address part. + * failure - 0 + * + * SIDE EFFECTS: + * + * NOTES: + */ +caddr_t +clnp_extract_addr(bufp, buflen, srcp, destp) +caddr_t bufp; /* ptr to buffer containing addresses */ +int buflen; /* length of buffer */ +register struct iso_addr *srcp; /* ptr to source address buffer */ +register struct iso_addr *destp; /* ptr to destination address buffer */ + { + int len; /* argument to bcopy */ + + /* + * check that we have enough data. Plus1 is for length octet + */ + if ((u_char)*bufp + 1 > buflen) { + return((caddr_t)0); + } + len = destp->isoa_len = (u_char)*bufp++; + (void) bcopy(bufp, (caddr_t)destp, len); + buflen -= len; + bufp += len; + + /* + * check that we have enough data. 
Plus1 is for length octet + */ + if ((u_char)*bufp + 1 > buflen) { + return((caddr_t)0); + } + len = srcp->isoa_len = (u_char)* bufp++; + (void) bcopy(bufp, (caddr_t)srcp, len); + bufp += len; + + /* + * Insure that the addresses make sense + */ + if (iso_ck_addr(srcp) && iso_ck_addr(destp)) + return bufp; + else + return (caddr_t) 0; +} +#endif /* notdef */ + +/* + * FUNCTION: clnp_ours + * + * PURPOSE: Decide whether the supplied packet is destined for + * us, or that it should be forwarded on. + * + * RETURNS: packet is for us - 1 + * packet is not for us - 0 + * + * SIDE EFFECTS: + * + * NOTES: + */ +clnp_ours(dst) +register struct iso_addr *dst; /* ptr to destination address */ +{ + register struct iso_ifaddr *ia; /* scan through interface addresses */ + + for (ia = iso_ifaddr; ia; ia = ia->ia_next) { + IFDEBUG(D_ROUTE) + printf("clnp_ours: ia_sis x%x, dst x%x\n", &ia->ia_addr, + dst); + ENDDEBUG + /* + * XXX Warning: + * We are overloading siso_tlen in the if's address, as an nsel length. + */ + if (dst->isoa_len == ia->ia_addr.siso_nlen && + bcmp((caddr_t)ia->ia_addr.siso_addr.isoa_genaddr, + (caddr_t)dst->isoa_genaddr, + ia->ia_addr.siso_nlen - ia->ia_addr.siso_tlen) == 0) + return 1; + } + return 0; +} + +/* Dec bit set if ifp qlen is greater than congest_threshold */ +int congest_threshold = 0; + +/* + * FUNCTION: clnp_forward + * + * PURPOSE: Forward the datagram passed + * clnpintr guarantees that the header will be + * contigious (a cluster mbuf will be used if necessary). + * + * If oidx is NULL, no options are present. 
+ * + * RETURNS: nothing + * + * SIDE EFFECTS: + * + * NOTES: + */ +clnp_forward(m, len, dst, oidx, seg_off, inbound_shp) +struct mbuf *m; /* pkt to forward */ +int len; /* length of pkt */ +struct iso_addr *dst; /* destination address */ +struct clnp_optidx *oidx; /* option index */ +int seg_off;/* offset of segmentation part */ +struct snpa_hdr *inbound_shp; /* subnetwork header of inbound packet */ +{ + struct clnp_fixed *clnp; /* ptr to fixed part of header */ + int error; /* return value of route function */ + struct sockaddr *next_hop; /* next hop for dgram */ + struct ifnet *ifp; /* ptr to outgoing interface */ + struct iso_ifaddr *ia = 0;/* ptr to iso name for ifp */ + struct route_iso route; /* filled in by clnp_route */ + extern int iso_systype; + + clnp = mtod(m, struct clnp_fixed *); + bzero((caddr_t)&route, sizeof(route)); /* MUST be done before "bad:" */ + + /* + * Don't forward multicast or broadcast packets + */ + if ((inbound_shp) && (IS_MULTICAST(inbound_shp->snh_dhost))) { + IFDEBUG(D_FORWARD) + printf("clnp_forward: dropping multicast packet\n"); + ENDDEBUG + clnp->cnf_type &= ~CNF_ERR_OK; /* so we don't generate an ER */ + clnp_discard(m, 0); + INCSTAT(cns_cantforward); + goto done; + } + + IFDEBUG(D_FORWARD) + printf("clnp_forward: %d bytes, to %s, options x%x\n", len, + clnp_iso_addrp(dst), oidx); + ENDDEBUG + + /* + * Decrement ttl, and if zero drop datagram + * Can't compare ttl as less than zero 'cause its a unsigned + */ + if ((clnp->cnf_ttl == 0) || (--clnp->cnf_ttl == 0)) { + IFDEBUG(D_FORWARD) + printf("clnp_forward: discarding datagram because ttl is zero\n"); + ENDDEBUG + INCSTAT(cns_ttlexpired); + clnp_discard(m, TTL_EXPTRANSIT); + goto done; + } + /* + * Route packet; special case for source rt + */ + if CLNPSRCRT_VALID(oidx) { + /* + * Update src route first + */ + clnp_update_srcrt(m, oidx); + error = clnp_srcroute(m, oidx, &route, &next_hop, &ia, dst); + } else { + error = clnp_route(dst, &route, 0, &next_hop, &ia); + } + if 
(error || ia == 0) { + IFDEBUG(D_FORWARD) + printf("clnp_forward: can't route packet (errno %d)\n", error); + ENDDEBUG + clnp_discard(m, ADDR_DESTUNREACH); + INCSTAT(cns_cantforward); + goto done; + } + ifp = ia->ia_ifp; + + IFDEBUG(D_FORWARD) + printf("clnp_forward: packet routed to %s\n", + clnp_iso_addrp(&((struct sockaddr_iso *)next_hop)->siso_addr)); + ENDDEBUG + + INCSTAT(cns_forward); + + /* + * If we are an intermediate system and + * we are routing outbound on the same ifp that the packet + * arrived upon, and we know the next hop snpa, + * then generate a redirect request + */ + if ((iso_systype & SNPA_IS) && (inbound_shp) && + (ifp == inbound_shp->snh_ifp)) + esis_rdoutput(inbound_shp, m, oidx, dst, route.ro_rt); + /* + * If options are present, update them + */ + if (oidx) { + struct iso_addr *mysrc = &ia->ia_addr.siso_addr; + if (mysrc == NULL) { + clnp_discard(m, ADDR_DESTUNREACH); + INCSTAT(cns_cantforward); + clnp_stat.cns_forward--; + goto done; + } else { + (void) clnp_dooptions(m, oidx, ifp, mysrc); + } + } + +#ifdef DECBIT + if (ifp->if_snd.ifq_len > congest_threshold) { + /* + * Congestion! 
Set the Dec Bit and thank Dave Oran + */ + IFDEBUG(D_FORWARD) + printf("clnp_forward: congestion experienced\n"); + ENDDEBUG + if ((oidx) && (oidx->cni_qos_formatp)) { + caddr_t qosp = CLNP_OFFTOOPT(m, oidx->cni_qos_formatp); + u_char qos = *qosp; + IFDEBUG(D_FORWARD) + printf("clnp_forward: setting congestion bit (qos x%x)\n", qos); + ENDDEBUG + if ((qos & CLNPOVAL_GLOBAL) == CLNPOVAL_GLOBAL) { + qos |= CLNPOVAL_CONGESTED; + INCSTAT(cns_congest_set); + *qosp = qos; + } + } + } +#endif /* DECBIT */ + + /* + * Dispatch the datagram if it is small enough, otherwise fragment + */ + if (len <= SN_MTU(ifp, route.ro_rt)) { + iso_gen_csum(m, CLNP_CKSUM_OFF, (int)clnp->cnf_hdr_len); + (void) (*ifp->if_output)(ifp, m, next_hop, route.ro_rt); + } else { + (void) clnp_fragment(ifp, m, next_hop, len, seg_off, /* flags */0, route.ro_rt); + } + +done: + /* + * Free route + */ + if (route.ro_rt != NULL) { + RTFREE(route.ro_rt); + } +} + +#ifdef notdef +/* + * FUNCTION: clnp_insert_addr + * + * PURPOSE: Insert the address part into a clnp datagram. + * + * RETURNS: Address of first byte after address part in datagram. + * + * SIDE EFFECTS: + * + * NOTES: Assume that there is enough space for the address part. + */ +caddr_t +clnp_insert_addr(bufp, srcp, dstp) +caddr_t bufp; /* address of where addr part goes */ +register struct iso_addr *srcp; /* ptr to src addr */ +register struct iso_addr *dstp; /* ptr to dst addr */ +{ + *bufp++ = dstp->isoa_len; + (void) bcopy((caddr_t)dstp, bufp, dstp->isoa_len); + bufp += dstp->isoa_len; + + *bufp++ = srcp->isoa_len; + (void) bcopy((caddr_t)srcp, bufp, srcp->isoa_len); + bufp += srcp->isoa_len; + + return bufp; +} + +#endif /* notdef */ + +/* + * FUNCTION: clnp_route + * + * PURPOSE: Route a clnp datagram to the first hop toward its + * destination. In many cases, the first hop will be + * the destination. The address of a route + * is specified. 
If a routing entry is present in + * that route, and it is still up to the same destination, + * then no further action is necessary. Otherwise, a + * new routing entry will be allocated. + * + * RETURNS: route found - 0 + * unix error code + * + * SIDE EFFECTS: + * + * NOTES: It is up to the caller to free the routing entry + * allocated in route. + */ +clnp_route(dst, ro, flags, first_hop, ifa) + struct iso_addr *dst; /* ptr to datagram destination */ + register struct route_iso *ro; /* existing route structure */ + int flags; /* flags for routing */ + struct sockaddr **first_hop; /* result: fill in with ptr to firsthop */ + struct iso_ifaddr **ifa; /* result: fill in with ptr to interface */ +{ + if (flags & SO_DONTROUTE) { + struct iso_ifaddr *ia; + + if (ro->ro_rt) { + RTFREE(ro->ro_rt); + ro->ro_rt = 0; + } + bzero((caddr_t)&ro->ro_dst, sizeof(ro->ro_dst)); + bcopy((caddr_t)dst, (caddr_t)&ro->ro_dst.siso_addr, + 1 + (unsigned)dst->isoa_len); + ro->ro_dst.siso_family = AF_ISO; + ro->ro_dst.siso_len = sizeof(ro->ro_dst); + ia = iso_localifa(&ro->ro_dst); + if (ia == 0) + return EADDRNOTAVAIL; + if (ifa) + *ifa = ia; + if (first_hop) + *first_hop = (struct sockaddr *)&ro->ro_dst; + return 0; + } + /* + * If there is a cached route, check that it is still up and to + * the same destination. If not, free it and try again. 
+ */ + if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || + (Bcmp(ro->ro_dst.siso_data, dst->isoa_genaddr, dst->isoa_len)))) { + IFDEBUG(D_ROUTE) + printf("clnp_route: freeing old route: ro->ro_rt 0x%x\n", + ro->ro_rt); + printf("clnp_route: old route refcnt: 0x%x\n", + ro->ro_rt->rt_refcnt); + ENDDEBUG + + /* free old route entry */ + RTFREE(ro->ro_rt); + ro->ro_rt = (struct rtentry *)0; + } else { + IFDEBUG(D_ROUTE) + printf("clnp_route: OK route exists\n"); + ENDDEBUG + } + + if (ro->ro_rt == 0) { + /* set up new route structure */ + bzero((caddr_t)&ro->ro_dst, sizeof(ro->ro_dst)); + ro->ro_dst.siso_len = sizeof(ro->ro_dst); + ro->ro_dst.siso_family = AF_ISO; + Bcopy(dst, &ro->ro_dst.siso_addr, 1 + dst->isoa_len); + /* allocate new route */ + IFDEBUG(D_ROUTE) + printf("clnp_route: allocating new route to %s\n", + clnp_iso_addrp(dst)); + ENDDEBUG + rtalloc((struct route *)ro); + } + if (ro->ro_rt == 0) + return(ENETUNREACH); /* rtalloc failed */ + ro->ro_rt->rt_use++; + if (ifa) + if ((*ifa = (struct iso_ifaddr *)ro->ro_rt->rt_ifa) == 0) + panic("clnp_route"); + if (first_hop) { + if (ro->ro_rt->rt_flags & RTF_GATEWAY) + *first_hop = ro->ro_rt->rt_gateway; + else + *first_hop = (struct sockaddr *)&ro->ro_dst; + } + return(0); +} + +/* + * FUNCTION: clnp_srcroute + * + * PURPOSE: Source route the datagram. If complete source + * routing is specified but not possible, then + * return an error. If src routing is terminated, then + * try routing on destination. + * Usage of first_hop, + * ifp, and error return is identical to clnp_route. + * + * RETURNS: 0 or unix error code + * + * SIDE EFFECTS: + * + * NOTES: Remember that option index pointers are really + * offsets from the beginning of the mbuf. 
+ */ +clnp_srcroute(options, oidx, ro, first_hop, ifa, final_dst) +struct mbuf *options; /* ptr to options */ +struct clnp_optidx *oidx; /* index to options */ +struct route_iso *ro; /* route structure */ +struct sockaddr **first_hop; /* RETURN: fill in with ptr to firsthop */ +struct iso_ifaddr **ifa; /* RETURN: fill in with ptr to interface */ +struct iso_addr *final_dst; /* final destination */ +{ + struct iso_addr dst; /* first hop specified by src rt */ + int error = 0; /* return code */ + + /* + * Check if we have run out of routes + * If so, then try to route on destination. + */ + if CLNPSRCRT_TERM(oidx, options) { + dst.isoa_len = final_dst->isoa_len; + bcopy(final_dst->isoa_genaddr, dst.isoa_genaddr, dst.isoa_len); + } else { + /* + * setup dst based on src rt specified + */ + dst.isoa_len = CLNPSRCRT_CLEN(oidx, options); + bcopy(CLNPSRCRT_CADDR(oidx, options), dst.isoa_genaddr, dst.isoa_len); + } + + /* + * try to route it + */ + error = clnp_route(&dst, ro, 0, first_hop, ifa); + if (error != 0) + return error; + + /* + * If complete src rt, first hop must be equal to dst + */ + if ((CLNPSRCRT_TYPE(oidx, options) == CLNPOVAL_COMPRT) && + (!iso_addrmatch1(&(*(struct sockaddr_iso **)first_hop)->siso_addr,&dst))){ + IFDEBUG(D_OPTIONS) + printf("clnp_srcroute: complete src route failed\n"); + ENDDEBUG + return EHOSTUNREACH; /* RAH? 
would like ESRCRTFAILED */ + } + + return error; +} + +/* + * FUNCTION: clnp_echoreply + * + * PURPOSE: generate an echo reply packet and transmit + * + * RETURNS: result of clnp_output + * + * SIDE EFFECTS: + */ +clnp_echoreply(ec_m, ec_len, ec_src, ec_dst, ec_oidxp) +struct mbuf *ec_m; /* echo request */ +int ec_len; /* length of ec */ +struct sockaddr_iso *ec_src; /* src of ec */ +struct sockaddr_iso *ec_dst; /* destination of ec (i.e., us) */ +struct clnp_optidx *ec_oidxp; /* options index to ec packet */ +{ + struct isopcb isopcb; + int flags = CLNP_NOCACHE|CLNP_ECHOR; + int ret; + + /* fill in fake isopcb to pass to output function */ + bzero(&isopcb, sizeof(isopcb)); + isopcb.isop_laddr = ec_dst; + isopcb.isop_faddr = ec_src; + + /* forget copying the options for now. If implemented, need only + * copy record route option, but it must be reset to zero length */ + + ret = clnp_output(ec_m, &isopcb, ec_len, flags); + + IFDEBUG(D_OUTPUT) + printf("clnp_echoreply: output returns %d\n", ret); + ENDDEBUG + return ret; +} + +/* + * FUNCTION: clnp_badmtu + * + * PURPOSE: print notice of route with mtu not initialized. + * + * RETURNS: mtu of ifp. + * + * SIDE EFFECTS: prints notice, slows down system. + */ +clnp_badmtu(ifp, rt, line, file) +struct ifnet *ifp; /* outgoing interface */ +struct rtentry *rt; /* dst route */ +int line; /* where the dirty deed occured */ +char *file; /* where the dirty deed occured */ +{ + printf("sending on route 0x%x with no mtu, line %d of file %s\n", + rt, line, file); +#ifdef ARGO_DEBUG + printf("route dst is "); + dump_isoaddr(rt_key(rt)); +#endif + return ifp->if_mtu; +} + +/* + * FUNCTION: clnp_ypocb - backwards bcopy + * + * PURPOSE: bcopy starting at end of src rather than beginning. 
+ * + * RETURNS: none + * + * SIDE EFFECTS: + * + * NOTES: No attempt has been made to make this efficient + */ +clnp_ypocb(from, to, len) +caddr_t from; /* src buffer */ +caddr_t to; /* dst buffer */ +u_int len; /* number of bytes */ +{ + while (len--) + *(to + len) = *(from + len); +} +#endif /* ISO */ diff --git a/sys/netiso/clnp_timer.c b/sys/netiso/clnp_timer.c new file mode 100644 index 00000000000..718d5302f77 --- /dev/null +++ b/sys/netiso/clnp_timer.c @@ -0,0 +1,180 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)clnp_timer.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+ +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* $Header: clnp_timer.c,v 4.2 88/06/29 14:59:05 hagens Exp $ */ +/* $Source: /usr/argo/sys/netiso/RCS/clnp_timer.c,v $ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +extern struct clnp_fragl *clnp_frags; + +/* + * FUNCTION: clnp_freefrags + * + * PURPOSE: Free the resources associated with a fragment + * + * RETURNS: pointer to next fragment in list of fragments + * + * SIDE EFFECTS: + * + * NOTES: + * TODO: send ER back to source + */ +struct clnp_fragl * +clnp_freefrags(cfh) +register struct clnp_fragl *cfh; /* fragment header to delete */ +{ + struct clnp_fragl *next = cfh->cfl_next; + struct clnp_frag *cf; + + /* free any frags hanging around */ + cf = cfh->cfl_frags; + while (cf != NULL) { + struct clnp_frag *cf_next = cf->cfr_next; + INCSTAT(cns_fragdropped); + m_freem(cf->cfr_data); + cf = cf_next; + } + + /* free the copy of the header */ + INCSTAT(cns_fragdropped); + m_freem(cfh->cfl_orighdr); + + if (clnp_frags == cfh) { + clnp_frags = cfh->cfl_next; + } else { + struct clnp_fragl *scan; + + for (scan = clnp_frags; scan != NULL; scan = scan->cfl_next) { + if (scan->cfl_next == cfh) { + scan->cfl_next = cfh->cfl_next; + break; + } + } + } + + /* free the fragment header */ + m_freem(dtom(cfh)); + + return(next); +} + +/* + * FUNCTION: clnp_slowtimo + * + * PURPOSE: clnp timer processing; if the ttl expires on a + * packet on the reassembly queue, discard it. 
+ * + * RETURNS: none + * + * SIDE EFFECTS: + * + * NOTES: + */ +clnp_slowtimo() +{ + register struct clnp_fragl *cfh = clnp_frags; + int s = splnet(); + + while (cfh != NULL) { + if (--cfh->cfl_ttl == 0) { + cfh = clnp_freefrags(cfh); + INCSTAT(cns_fragtimeout); + } else { + cfh = cfh->cfl_next; + } + } + splx(s); +} + +/* + * FUNCTION: clnp_drain + * + * PURPOSE: drain off all datagram fragments + * + * RETURNS: none + * + * SIDE EFFECTS: + * + * NOTES: + * TODO: should send back ER + */ +clnp_drain() +{ + register struct clnp_fragl *cfh = clnp_frags; + + while (cfh != NULL) + cfh = clnp_freefrags(cfh); +} diff --git a/sys/netiso/cltp_usrreq.c b/sys/netiso/cltp_usrreq.c new file mode 100644 index 00000000000..93f8d1c398e --- /dev/null +++ b/sys/netiso/cltp_usrreq.c @@ -0,0 +1,405 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)cltp_usrreq.c 8.1 (Berkeley) 6/10/93 + */ + +#ifndef CLTPOVAL_SRC /* XXX -- till files gets changed */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#endif + +/* + * CLTP protocol implementation. + * Per ISO 8602, December, 1987. 
+ */ +cltp_init() +{ + + cltb.isop_next = cltb.isop_prev = &cltb; +} + +int cltp_cksum = 1; + + +/* ARGUSED */ +cltp_input(m0, srcsa, dstsa, cons_channel, output) + struct mbuf *m0; + struct sockaddr *srcsa, *dstsa; + u_int cons_channel; + int (*output)(); +{ + register struct isopcb *isop; + register struct mbuf *m = m0; + register u_char *up = mtod(m, u_char *); + register struct sockaddr_iso *src = (struct sockaddr_iso *)srcsa; + int len, hdrlen = *up + 1, dlen = 0; + u_char *uplim = up + hdrlen; + caddr_t dtsap; + + for (len = 0; m; m = m->m_next) + len += m->m_len; + up += 2; /* skip header */ + while (up < uplim) switch (*up) { /* process options */ + case CLTPOVAL_SRC: + src->siso_tlen = up[1]; + src->siso_len = up[1] + TSEL(src) - (caddr_t)src; + if (src->siso_len < sizeof(*src)) + src->siso_len = sizeof(*src); + else if (src->siso_len > sizeof(*src)) { + MGET(m, M_DONTWAIT, MT_SONAME); + if (m == 0) + goto bad; + m->m_len = src->siso_len; + src = mtod(m, struct sockaddr_iso *); + bcopy((caddr_t)srcsa, (caddr_t)src, srcsa->sa_len); + } + bcopy((caddr_t)up + 2, TSEL(src), up[1]); + up += 2 + src->siso_tlen; + continue; + + case CLTPOVAL_DST: + dtsap = 2 + (caddr_t)up; + dlen = up[1]; + up += 2 + dlen; + continue; + + case CLTPOVAL_CSM: + if (iso_check_csum(m0, len)) { + cltpstat.cltps_badsum++; + goto bad; + } + up += 4; + continue; + + default: + printf("clts: unknown option (%x)\n", up[0]); + cltpstat.cltps_hdrops++; + goto bad; + } + if (dlen == 0 || src->siso_tlen == 0) + goto bad; + for (isop = cltb.isop_next;; isop = isop->isop_next) { + if (isop == &cltb) { + cltpstat.cltps_noport++; + goto bad; + } + if (isop->isop_laddr && + bcmp(TSEL(isop->isop_laddr), dtsap, dlen) == 0) + break; + } + m = m0; + m->m_len -= hdrlen; + m->m_data += hdrlen; + if (sbappendaddr(&isop->isop_socket->so_rcv, (struct sockaddr *)src, + m, (struct mbuf *)0) == 0) + goto bad; + cltpstat.cltps_ipackets++; + sorwakeup(isop->isop_socket); + m0 = 0; +bad: + if (src != (struct 
sockaddr_iso *)srcsa) + m_freem(dtom(src)); + if (m0) + m_freem(m0); + return 0; +} + +/* + * Notify a cltp user of an asynchronous error; + * just wake up so that he can collect error status. + */ +cltp_notify(isop) + register struct isopcb *isop; +{ + + sorwakeup(isop->isop_socket); + sowwakeup(isop->isop_socket); +} + +cltp_ctlinput(cmd, sa) + int cmd; + struct sockaddr *sa; +{ + extern u_char inetctlerrmap[]; + struct sockaddr_iso *siso; + int iso_rtchange(); + + if ((unsigned)cmd > PRC_NCMDS) + return; + if (sa->sa_family != AF_ISO && sa->sa_family != AF_CCITT) + return; + siso = (struct sockaddr_iso *)sa; + if (siso == 0 || siso->siso_nlen == 0) + return; + + switch (cmd) { + case PRC_ROUTEDEAD: + case PRC_REDIRECT_NET: + case PRC_REDIRECT_HOST: + case PRC_REDIRECT_TOSNET: + case PRC_REDIRECT_TOSHOST: + iso_pcbnotify(&cltb, siso, + (int)inetctlerrmap[cmd], iso_rtchange); + break; + + default: + if (inetctlerrmap[cmd] == 0) + return; /* XXX */ + iso_pcbnotify(&cltb, siso, (int)inetctlerrmap[cmd], + cltp_notify); + } +} + +cltp_output(isop, m) + register struct isopcb *isop; + register struct mbuf *m; +{ + register int len; + register struct sockaddr_iso *siso; + int hdrlen, error = 0, docsum; + register u_char *up; + + if (isop->isop_laddr == 0 || isop->isop_faddr == 0) { + error = ENOTCONN; + goto bad; + } + /* + * Calculate data length and get a mbuf for CLTP header. 
+ */ + hdrlen = 2 + 2 + isop->isop_laddr->siso_tlen + + 2 + isop->isop_faddr->siso_tlen; + if (docsum = /*isop->isop_flags & CLNP_NO_CKSUM*/ cltp_cksum) + hdrlen += 4; + M_PREPEND(m, hdrlen, M_WAIT); + len = m->m_pkthdr.len; + /* + * Fill in mbuf with extended CLTP header + */ + up = mtod(m, u_char *); + up[0] = hdrlen - 1; + up[1] = UD_TPDU_type; + up[2] = CLTPOVAL_SRC; + up[3] = (siso = isop->isop_laddr)->siso_tlen; + up += 4; + bcopy(TSEL(siso), (caddr_t)up, siso->siso_tlen); + up += siso->siso_tlen; + up[0] = CLTPOVAL_DST; + up[1] = (siso = isop->isop_faddr)->siso_tlen; + up += 2; + bcopy(TSEL(siso), (caddr_t)up, siso->siso_tlen); + /* + * Stuff checksum and output datagram. + */ + if (docsum) { + up += siso->siso_tlen; + up[0] = CLTPOVAL_CSM; + up[1] = 2; + iso_gen_csum(m, 2 + up - mtod(m, u_char *), len); + } + cltpstat.cltps_opackets++; + return (tpclnp_output(isop, m, len, !docsum)); +bad: + m_freem(m); + return (error); +} + +u_long cltp_sendspace = 9216; /* really max datagram size */ +u_long cltp_recvspace = 40 * (1024 + sizeof(struct sockaddr_iso)); + /* 40 1K datagrams */ + + +/*ARGSUSED*/ +cltp_usrreq(so, req, m, nam, control) + struct socket *so; + int req; + struct mbuf *m, *nam, *control; +{ + register struct isopcb *isop = sotoisopcb(so); + int s, error = 0; + + if (req == PRU_CONTROL) + return (iso_control(so, (int)m, (caddr_t)nam, + (struct ifnet *)control)); + if ((isop == NULL && req != PRU_ATTACH) || + (control && control->m_len)) { + error = EINVAL; + goto release; + } + switch (req) { + + case PRU_ATTACH: + if (isop != NULL) { + error = EINVAL; + break; + } + error = iso_pcballoc(so, &cltb); + if (error) + break; + error = soreserve(so, cltp_sendspace, cltp_recvspace); + if (error) + break; + break; + + case PRU_DETACH: + iso_pcbdetach(isop); + break; + + case PRU_BIND: + error = iso_pcbbind(isop, nam); + break; + + case PRU_LISTEN: + error = EOPNOTSUPP; + break; + + case PRU_CONNECT: + if (isop->isop_faddr) { + error = EISCONN; + break; + 
} + error = iso_pcbconnect(isop, nam); + if (error == 0) + soisconnected(so); + break; + + case PRU_CONNECT2: + error = EOPNOTSUPP; + break; + + case PRU_ACCEPT: + error = EOPNOTSUPP; + break; + + case PRU_DISCONNECT: + if (isop->isop_faddr == 0) { + error = ENOTCONN; + break; + } + iso_pcbdisconnect(isop); + so->so_state &= ~SS_ISCONNECTED; /* XXX */ + break; + + case PRU_SHUTDOWN: + socantsendmore(so); + break; + + case PRU_SEND: + if (nam) { + if (isop->isop_faddr) { + error = EISCONN; + break; + } + /* + * Must block input while temporarily connected. + */ + s = splnet(); + error = iso_pcbconnect(isop, nam); + if (error) { + splx(s); + break; + } + } else { + if (isop->isop_faddr == 0) { + error = ENOTCONN; + break; + } + } + error = cltp_output(isop, m); + m = 0; + if (nam) { + iso_pcbdisconnect(isop); + splx(s); + } + break; + + case PRU_ABORT: + soisdisconnected(so); + iso_pcbdetach(isop); + break; + + case PRU_SOCKADDR: + if (isop->isop_laddr) + bcopy((caddr_t)isop->isop_laddr, mtod(m, caddr_t), + nam->m_len = isop->isop_laddr->siso_len); + break; + + case PRU_PEERADDR: + if (isop->isop_faddr) + bcopy((caddr_t)isop->isop_faddr, mtod(m, caddr_t), + nam->m_len = isop->isop_faddr->siso_len); + break; + + case PRU_SENSE: + /* + * stat: don't bother with a blocksize. + */ + return (0); + + case PRU_SENDOOB: + case PRU_FASTTIMO: + case PRU_SLOWTIMO: + case PRU_PROTORCV: + case PRU_PROTOSEND: + error = EOPNOTSUPP; + break; + + case PRU_RCVD: + case PRU_RCVOOB: + return (EOPNOTSUPP); /* do not free mbuf's */ + + default: + panic("cltp_usrreq"); + } +release: + if (control != NULL) + m_freem(control); + if (m != NULL) + m_freem(m); + return (error); +} diff --git a/sys/netiso/cltp_var.h b/sys/netiso/cltp_var.h new file mode 100644 index 00000000000..b4e08f2c99b --- /dev/null +++ b/sys/netiso/cltp_var.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * @(#)cltp_var.h 8.1 (Berkeley) 6/10/93
+ */
+
+#define UD_TPDU_type	0x40	/* packet type */
+
+#define CLTPOVAL_SRC	0xc1	/* Source TSAP -- required */
+#define CLTPOVAL_DST	0xc2	/* Destination TSAP -- required */
+#define CLTPOVAL_CSM	0xc3	/* Checksum parameter -- optional */
+
+struct	cltpstat {
+	int	cltps_hdrops;	/* header dropped, e.g. unknown parameter code */
+	int	cltps_badsum;	/* checksum verification failed */
+	int	cltps_badlen;	/* presumably bad-length datagrams -- confirm against users */
+	int	cltps_noport;	/* no pcb matched the destination TSEL */
+	int	cltps_ipackets;	/* datagrams delivered to a socket */
+	int	cltps_opackets;	/* datagrams handed to the output routine */
+};
+
+#ifdef KERNEL
+struct	isopcb cltb;		/* head of the circular CLTP pcb list */
+struct	cltpstat cltpstat;	/* the counters above */
+#endif
diff --git a/sys/netiso/cons.h b/sys/netiso/cons.h
new file mode 100644
index 00000000000..b0739de1eac
--- /dev/null
+++ b/sys/netiso/cons.h
@@ -0,0 +1,92 @@
+/*-
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)cons.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+
+******************************************************************/
+
+/*
+ * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison
+ */
+/*
+ * $Header: cons.h,v 4.4 88/09/09 19:01:28 nhall Exp $
+ * $Source: /usr/argo/sys/netiso/RCS/cons.h,v $
+ *
+ * interface between TP and CONS
+ */
+
+#define	CONSOPT_X25CRUD	0x01		/* set x.25 call request user data */
+
+struct dte_addr {
+	u_char 	dtea_addr[7];	/* presumably the DTE digits, packed -- confirm */
+	u_char	dtea_niblen;	/* presumably the number of nibbles in use -- confirm */
+};
+
+#ifdef	KERNEL
+
+#define	CONN_OPEN		0x33
+#define	CONN_CONFIRM	0x30
+#define	CONN_REFUSE		0x31
+#define	CONN_CLOSE		0x32
+
+#define	CONS_IS_DGM		0x1
+#define	CONS_NOT_DGM	0x0
+
+#ifndef	PRC_NCMDS
+#include
+#endif	/* PRC_NCMDS */
+
+#define	PRC_CONS_SEND_DONE 2 /* something unused in protosw.h */
+
+#endif	/* KERNEL */
diff --git a/sys/netiso/cons_pcb.h b/sys/netiso/cons_pcb.h
new file mode 100644
index 00000000000..b8adc373947
--- /dev/null
+++ b/sys/netiso/cons_pcb.h
@@ -0,0 +1,193 @@
+/*-
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)cons_pcb.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+
+******************************************************************/
+
+/*
+ * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison
+ */
+/* $Header: cons_pcb.h,v 4.2 88/06/29 14:59:08 hagens Exp $ */
+/* $Source: /usr/argo/sys/netiso/RCS/cons_pcb.h,v $ */
+
+/*
+ * protocol control block for the connection oriented network service
+ */
+
+/*
+ * legit port #s for cons "transport" are 0..23 for su users only, and
+ * 1024..1099 for public users
+ */
+#define X25_SBSIZE 	512
+#define X25_PORT_RESERVED 24
+#define X25_PORT_USERMAX 1099
+#define X25_FACIL_LEN_MAX  109
+#define X25_PARTIAL_PKT_LEN_MAX (MLEN - sizeof(struct cons_pcb))
+
+#ifndef ARGO_DEBUG
+#define X25_TTL 600 /* 5 min */
+#else /* ARGO_DEBUG */
+#define X25_TTL 120 /* 1 min */
+#endif /* ARGO_DEBUG */
+
+struct cons_pcb {
+	struct isopcb 	_co_isopcb;	/* embedded generic pcb; the co_* macros below alias its fields */
+#define co_next	_co_isopcb.isop_next
+/* prev used for netstat only */
+#define co_prev	_co_isopcb.isop_prev
+#define co_head	_co_isopcb.isop_head
+#define co_laddr _co_isopcb.isop_laddr
+#define co_faddr _co_isopcb.isop_faddr
+#define co_lport _co_isopcb.isop_laddr.siso_tsuffix /* NOTE(review): isop_laddr is assigned a struct sockaddr_iso pointer elsewhere in this diff, so "." looks wrong if this macro is ever expanded -- confirm */
+#define co_fport _co_isopcb.isop_faddr.siso_tsuffix /* NOTE(review): same concern as co_lport */
+#define co_route _co_isopcb.isop_route
+#define co_socket _co_isopcb.isop_socket
+#define co_chanmask _co_isopcb.isop_chanmask
+#define co_negchanmask _co_isopcb.isop_negchanmask
+#define co_x25crud _co_isopcb.isop_x25crud
+#define co_x25crud_len _co_isopcb.isop_x25crud_len
+	u_short 		co_state;	/* one of the pcb states defined below */
+	u_char 			co_flags;	/* CONSF_* bits defined below */
+	u_short			co_ttl; /* time to live timer */
+	u_short			co_init_ttl; /* initial value of ttl  */
+	int 			co_channel; /* logical channel */
+	struct ifnet * 	co_ifp; /* interface */
+	struct protosw *co_proto;
+
+	struct ifqueue 	co_pending; /* queue data to send when connection
+						completes*/
+#define MAX_DTE_LEN 0x7 /* 17 bcd digits */
+	struct dte_addr	co_peer_dte;
+	struct	cons_pcb *co_myself; /* DEBUGGING AID */
+};
+
+/*
+ * X.25 Packet types
+ */
+#define XPKT_DATA		1
+#define XPKT_INTERRUPT	2
+#define XPKT_FLOWCONTROL 3 /* not delivered? */
+
+/*
+ * pcb states
+ */
+
+#define	CLOSED		0x0
+#define	LISTENING	0x1
+#define	CLOSING		0x2
+/* USABLE STATES MUST BE LAST */
+#define	CONNECTING	0x3
+#define	ACKWAIT		0x4
+#define	OPEN		0x5
+#define MIN_USABLE_STATE CONNECTING
+
+#define	cons_NSTATES		0x6
+
+
+/* type */
+#define CONSF_OCRE		0x40 /* created on OUTPUT */
+#define CONSF_ICRE		0x20 /* created on INPUT */
+#define CONSF_unused	0x10 /* not used */
+#define CONSF_unused2	0x08 /* not used */
+#define CONSF_DGM		0x04 /* for dgm use only */
+#define CONSF_XTS		0x02 /* for cons-as-transport-service */
+#define CONSF_LOOPBACK	0x01 /* loopback was on when connection commenced */
+
+#define X_NOCHANNEL 0x80
+
+
+struct cons_stat {
+	u_int co_intr;	/* input from eicon board */
+	u_int co_restart; /* ecn_restart() request issued to board */
+	u_int co_slowtimo; /* times slowtimo called */
+	u_int co_timedout; /* connections closed by slowtimo */
+	u_int co_ack; /* ECN_ACK indication came from eicon board */
+	u_int co_receive; /* ECN_RECEIVE indication came from eicon board */
+	u_int co_send; /* ECN_SEND request issued to board */
+	u_int co_reset_in; /* ECN_RESET indication came from eicon board */
+	u_int co_reset_out; /* ECN_RESET issued to the eicon board */
+	u_int co_clear_in; /* ECN_CLEAR indication came from eicon board */
+	u_int co_clear_out; /* ECN_CLEAR request issued to board */
+	u_int co_refuse; /* ECN_REFUSE indication came from eicon board */
+	u_int co_accept; /* ECN_ACCEPT indication came from eicon board */
+	u_int co_connect; /* ECN_CONNECT indication came from eicon board */
+	u_int co_call; /* ECN_CALL request issued to board */
+	u_int co_Rdrops; /* bad pkt came from ll */
+	u_int co_Xdrops; /* can't keep up */
+
+	u_int co_intrpt_pkts_in; /* interrupt packets in */
+	u_int co_avg_qlen;
+	u_int co_avg_qdrop;
+	u_int co_active;
+
+	u_int co_noresources;
+	u_int co_parse_facil_err;
+	u_int co_addr_proto_consist_err;
+	u_int co_no_copcb;
+} cons_stat;
+
+u_char x25_error_stats[CONL_ERROR_MAX + 1];
+
+struct ifqueue consintrq;
+
+/* reasons for clear are in a data mbuf chained to a clear ecn_request */
+struct e_clear_data 				{
+	u_char ecd_cause;
+	u_char ecd_diagnostic;
+};
+
+#ifdef KERNEL
+#define IncStat(XYZ) cons_stat.XYZ++ /* bump one counter of cons_stat */
+#endif /* KERNEL */
diff --git a/sys/netiso/eonvar.h b/sys/netiso/eonvar.h
new file mode 100644
index 00000000000..93f99172f74
--- /dev/null
+++ b/sys/netiso/eonvar.h
@@ -0,0 +1,170 @@
+/*-
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)eonvar.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+
+******************************************************************/
+
+/*
+ * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison
+ */
+
+#define EON_986_VERSION 0x3
+#define EON_VERSION 0x1
+
+#define EON_CACHESIZE 30
+
+#define E_FREE	1
+#define E_LINK	2
+#define E_ES 	3
+#define E_IS 	4
+
+
+/*
+ * this overlays a sockaddr_iso
+ */
+
+struct sockaddr_eon {
+	u_char 			seon_len;	/* Length */
+	u_char 			seon_family;	/* AF_ISO */
+	u_char			seon_status;	/* overlays session suffixlen */
+#define EON_ESLINK_UP		0x1
+#define EON_ESLINK_DOWN		0x2
+#define EON_ISLINK_UP		0x10
+#define EON_ISLINK_DOWN		0x20
+/* no change is neither up nor down */
+	u_char			seon_pad1;	/* 0, overlays tsfxlen */
+	u_char			seon_adrlen;
+	u_char			seon_afi;		/* 47 */
+	u_char			seon_idi[2];	/* 0006 */
+	u_char			seon_vers;		/* 03 */
+	u_char			seon_glbnum[2];	/* see RFC 1069 */
+	u_char			seon_RDN[2];	/* see RFC 1070 */
+	u_char			seon_pad2[3];	/* see RFC 1070 */
+	u_char			seon_LAREA[2];	/* see RFC 1070 */
+	u_char			seon_pad3[2];	/* see RFC 1070 */
+		/* right now ip addr is  aligned  -- be careful --
+		 * future revisions may have it u_char[4]
+		 */
+	u_int			seon_ipaddr;	/* a.b.c.d */
+	u_char			seon_protoid;	/* NSEL */
+};
+
+#ifdef EON_TEMPLATE
+/* initializer covers the fields up to and including seon_glbnum[0]; later fields are implicitly zero */
+struct sockaddr_eon eon_template = {
+	sizeof (eon_template), AF_ISO, 0, 0, 0x14,
+	0x47, 0x0, 0x6, 0x3, 0
+};
+#endif
+
+#define DOWNBITS ( EON_ESLINK_DOWN | EON_ISLINK_DOWN )
+#define UPBITS ( EON_ESLINK_UP | EON_ISLINK_UP )
+
+#define	SIOCSEONCORE _IOWR('i',10, struct iso_ifreq) /* EON core member */
+#define	SIOCGEONCORE _IOWR('i',11, struct iso_ifreq) /* EON core member */
+
+struct eon_hdr {
+	u_char 	eonh_vers; /* value 1 */
+	u_char 	eonh_class;  /* address multicast class, below */
+#define		EON_NORMAL_ADDR		0x0
+#define		EON_MULTICAST_ES	0x1
+#define		EON_MULTICAST_IS	0x2
+#define		EON_BROADCAST		0x3
+	u_short eonh_csum;  /* osi checksum (choke)*/
+};
+struct eon_iphdr {
+	struct	ip	ei_ip;	/* IP header, immediately followed by ... */
+	struct	eon_hdr	ei_eh;	/* ... the EON header */
+};
+#define EONIPLEN (sizeof(struct eon_hdr) + sizeof(struct ip))
+
+/* stole these 2 fields of the flags for I-am-ES and I-am-IS */
+#define	IFF_ES	0x400
+#define	IFF_IS	0x800
+
+struct eon_stat {
+	int	es_in_multi_es;
+	int	es_in_multi_is;
+	int	es_in_broad;
+	int	es_in_normal;
+	int	es_out_multi_es;
+	int	es_out_multi_is;
+	int	es_out_broad;
+	int	es_out_normal;
+	int	es_ipout;
+
+	int	es_icmp[PRC_NCMDS];	/* one slot per PRC_* command code */
+	/* errors */
+	int	es_badcsum;
+	int	es_badhdr;
+} eonstat;
+
+#undef IncStat
+#define IncStat(xxx) eonstat.xxx++ /* replaces any earlier IncStat: counts in eonstat */
+
+typedef struct qhdr {
+	struct qhdr *link, *rlink;
+} *queue_t;
+
+struct eon_llinfo {
+	struct	qhdr el_qhdr;		/* keep all in a list */
+	int	el_flags;		/* cache valid ? */
+	int	el_snpaoffset;		/* IP address contained in dst nsap */
+	struct	rtentry *el_rt;		/* back pointer to parent route */
+	struct	eon_iphdr el_ei;	/* precomputed portion of hdr */
+	struct	route el_iproute;	/* if direct route cache IP info */
+					/* if gateway, cache secondary route */
+};
+#define el_iphdr el_ei.ei_ip
+#define el_eonhdr el_ei.ei_eh
diff --git a/sys/netiso/esis.c b/sys/netiso/esis.c
new file mode 100644
index 00000000000..f4ade0f4fc9
--- /dev/null
+++ b/sys/netiso/esis.c
@@ -0,0 +1,1063 @@
+/*-
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)esis.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. 
 + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ + +#ifdef ISO + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/domain.h> +#include <sys/protosw.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/errno.h> +#include <sys/kernel.h> + +#include <net/if.h> +#include <net/if_dl.h> +#include <net/route.h> +#include <net/raw_cb.h> + +#include <netiso/iso.h> +#include <netiso/iso_pcb.h> +#include <netiso/iso_var.h> +#include <netiso/iso_snpac.h> +#include <netiso/clnl.h> +#include <netiso/clnp.h> +#include <netiso/clnp_stat.h> +#include <netiso/esis.h> +#include <netiso/argo_debug.h> + +/* + * Global variables to esis implementation + * + * esis_holding_time - the holding time (sec) parameter for outgoing pdus + * esis_config_time - the frequency (sec) that hellos are generated + * esis_esconfig_time - suggested es configuration time placed in the + * ish. 
+ * + */ +struct rawcb esis_pcb; +void esis_config(), snpac_age(); +int esis_sendspace = 2048; +int esis_recvspace = 2048; +short esis_holding_time = ESIS_HT; +short esis_config_time = ESIS_CONFIG; +short esis_esconfig_time = ESIS_CONFIG; +extern int iso_systype; +struct sockaddr_dl esis_dl = { sizeof(esis_dl), AF_LINK }; +extern char all_es_snpa[], all_is_snpa[]; + +#define EXTEND_PACKET(m, mhdr, cp)\ + if (((m)->m_next = m_getclr(M_DONTWAIT, MT_HEADER)) == NULL) {\ + esis_stat.es_nomem++;\ + m_freem(mhdr);\ + return;\ + } else {\ + (m) = (m)->m_next;\ + (cp) = mtod((m), caddr_t);\ + } +/* + * FUNCTION: esis_init + * + * PURPOSE: Initialize the kernel portion of esis protocol + * + * RETURNS: nothing + * + * SIDE EFFECTS: + * + * NOTES: + */ +esis_init() +{ + extern struct clnl_protosw clnl_protox[256]; + int esis_input(), isis_input(); +#ifdef ISO_X25ESIS + int x25esis_input(); +#endif /* ISO_X25ESIS */ + + esis_pcb.rcb_next = esis_pcb.rcb_prev = &esis_pcb; + llinfo_llc.lc_next = llinfo_llc.lc_prev = &llinfo_llc; + + timeout(snpac_age, (caddr_t)0, hz); + timeout(esis_config, (caddr_t)0, hz); + + clnl_protox[ISO9542_ESIS].clnl_input = esis_input; + clnl_protox[ISO10589_ISIS].clnl_input = isis_input; +#ifdef ISO_X25ESIS + clnl_protox[ISO9542X25_ESIS].clnl_input = x25esis_input; +#endif /* ISO_X25ESIS */ +} + +/* + * FUNCTION: esis_usrreq + * + * PURPOSE: Handle user level esis requests + * + * RETURNS: 0 or appropriate errno + * + * SIDE EFFECTS: + * + */ +/*ARGSUSED*/ +esis_usrreq(so, req, m, nam, control) +struct socket *so; /* socket: used only to get to this code */ +int req; /* request */ +struct mbuf *m; /* data for request */ +struct mbuf *nam; /* optional name */ +struct mbuf *control; /* optional control */ +{ + struct rawcb *rp = sotorawcb(so); + int error = 0; + + if ((so->so_state & SS_PRIV) == 0) { + error = EACCES; + goto release; + } + if (rp == NULL && req != PRU_ATTACH) { + error = EINVAL; + goto release; + } + + switch (req) { + case PRU_ATTACH: + 
if (rp != NULL) { + error = EINVAL; + break; + } + MALLOC(rp, struct rawcb *, sizeof(*rp), M_PCB, M_WAITOK); + if (so->so_pcb = (caddr_t)rp) { + bzero(so->so_pcb, sizeof(*rp)); + insque(rp, &esis_pcb); + rp->rcb_socket = so; + error = soreserve(so, esis_sendspace, esis_recvspace); + } else + error = ENOBUFS; + break; + + case PRU_SEND: + if (nam == NULL) { + error = EINVAL; + break; + } + /* error checking here */ + error = isis_output(mtod(nam,struct sockaddr_dl *), m); + m = NULL; + break; + + case PRU_DETACH: + raw_detach(rp); + break; + + case PRU_SHUTDOWN: + socantsendmore(so); + break; + + case PRU_ABORT: + soisdisconnected(so); + raw_detach(rp); + break; + + case PRU_SENSE: + return (0); + + default: + return (EOPNOTSUPP); + } +release: + if (m != NULL) + m_freem(m); + + return (error); +} + +/* + * FUNCTION: esis_input + * + * PURPOSE: Process an incoming esis packet + * + * RETURNS: nothing + * + * SIDE EFFECTS: + * + * NOTES: + */ +esis_input(m0, shp) +struct mbuf *m0; /* ptr to first mbuf of pkt */ +struct snpa_hdr *shp; /* subnetwork header */ +{ + register struct esis_fixed *pdu = mtod(m0, struct esis_fixed *); + register int type; + + /* + * check checksum if necessary + */ + if (ESIS_CKSUM_REQUIRED(pdu) && iso_check_csum(m0, (int)pdu->esis_hdr_len)) { + esis_stat.es_badcsum++; + goto bad; + } + + /* check version */ + if (pdu->esis_vers != ESIS_VERSION) { + esis_stat.es_badvers++; + goto bad; + } + type = pdu->esis_type & 0x1f; + switch (type) { + case ESIS_ESH: + esis_eshinput(m0, shp); + break; + + case ESIS_ISH: + esis_ishinput(m0, shp); + break; + + case ESIS_RD: + esis_rdinput(m0, shp); + break; + + default: + esis_stat.es_badtype++; + } + +bad: + if (esis_pcb.rcb_next != &esis_pcb) + isis_input(m0, shp); + else + m_freem(m0); +} + +/* + * FUNCTION: esis_rdoutput + * + * PURPOSE: Transmit a redirect pdu + * + * RETURNS: nothing + * + * SIDE EFFECTS: + * + * NOTES: Assumes there is enough space for fixed part of header, + * DA, BSNPA and NET in 
first mbuf. + */ +esis_rdoutput(inbound_shp, inbound_m, inbound_oidx, rd_dstnsap, rt) +struct snpa_hdr *inbound_shp; /* snpa hdr from incoming packet */ +struct mbuf *inbound_m; /* incoming pkt itself */ +struct clnp_optidx *inbound_oidx; /* clnp options assoc with incoming pkt */ +struct iso_addr *rd_dstnsap; /* ultimate destination of pkt */ +struct rtentry *rt; /* snpa cache info regarding next hop of + pkt */ +{ + struct mbuf *m, *m0; + caddr_t cp; + struct esis_fixed *pdu; + int len, total_len = 0; + struct sockaddr_iso siso; + struct ifnet *ifp = inbound_shp->snh_ifp; + struct sockaddr_dl *sdl; + struct iso_addr *rd_gwnsap; + + if (rt->rt_flags & RTF_GATEWAY) { + rd_gwnsap = &((struct sockaddr_iso *)rt->rt_gateway)->siso_addr; + rt = rtalloc1(rt->rt_gateway, 0); + } else + rd_gwnsap = &((struct sockaddr_iso *)rt_key(rt))->siso_addr; + if (rt == 0 || (sdl = (struct sockaddr_dl *)rt->rt_gateway) == 0 || + sdl->sdl_family != AF_LINK) { + /* maybe we should have a function that you + could put in the iso_ifaddr structure + which could translate iso_addrs into snpa's + where there is a known mapping for that address type */ + esis_stat.es_badtype++; + return; + } + esis_stat.es_rdsent++; + IFDEBUG(D_ESISOUTPUT) + printf("esis_rdoutput: ifp x%x (%s%d), ht %d, m x%x, oidx x%x\n", + ifp, ifp->if_name, ifp->if_unit, esis_holding_time, inbound_m, + inbound_oidx); + printf("\tdestination: %s\n", clnp_iso_addrp(rd_dstnsap)); + printf("\tredirected toward:%s\n", clnp_iso_addrp(rd_gwnsap)); + ENDDEBUG + + if ((m0 = m = m_gethdr(M_DONTWAIT, MT_HEADER)) == NULL) { + esis_stat.es_nomem++; + return; + } + bzero(mtod(m, caddr_t), MHLEN); + + pdu = mtod(m, struct esis_fixed *); + cp = (caddr_t)(pdu + 1); /*pointer arith.; 1st byte after header */ + len = sizeof(struct esis_fixed); + + /* + * Build fixed part of header + */ + pdu->esis_proto_id = ISO9542_ESIS; + pdu->esis_vers = ESIS_VERSION; + pdu->esis_type = ESIS_RD; + HTOC(pdu->esis_ht_msb, pdu->esis_ht_lsb, 
esis_holding_time); + + /* Insert destination address */ + (void) esis_insert_addr(&cp, &len, rd_dstnsap, m, 0); + + /* Insert the snpa of better next hop */ + *cp++ = sdl->sdl_alen; + bcopy(LLADDR(sdl), cp, sdl->sdl_alen); + cp += sdl->sdl_alen; + len += (sdl->sdl_alen + 1); + + /* + * If the next hop is not the destination, then it ought to be + * an IS and it should be inserted next. Else, set the + * NETL to 0 + */ + /* PHASE2 use mask from ifp of outgoing interface */ + if (!iso_addrmatch1(rd_dstnsap, rd_gwnsap)) { + /* this should not happen: + if ((nhop_sc->sc_flags & SNPA_IS) == 0) { + printf("esis_rdoutput: next hop is not dst and not an IS\n"); + m_freem(m0); + return; + } */ + (void) esis_insert_addr(&cp, &len, rd_gwnsap, m, 0); + } else { + *cp++ = 0; /* NETL */ + len++; + } + m->m_len = len; + + /* + * PHASE2 + * If redirect is to an IS, add an address mask. The mask to be + * used should be the mask present in the routing entry used to + * forward the original data packet. 
+ */ + + /* + * Copy Qos, priority, or security options present in original npdu + */ + if (inbound_oidx) { + /* THIS CODE IS CURRENTLY (mostly) UNTESTED */ + int optlen = 0; + if (inbound_oidx->cni_qos_formatp) + optlen += (inbound_oidx->cni_qos_len + 2); + if (inbound_oidx->cni_priorp) /* priority option is 1 byte long */ + optlen += 3; + if (inbound_oidx->cni_securep) + optlen += (inbound_oidx->cni_secure_len + 2); + if (M_TRAILINGSPACE(m) < optlen) { + EXTEND_PACKET(m, m0, cp); + m->m_len = 0; + /* assumes MLEN > optlen */ + } + /* assume MLEN-len > optlen */ + /* + * When copying options, copy from ptr - 2 in order to grab + * the option code and length + */ + if (inbound_oidx->cni_qos_formatp) { + bcopy(mtod(inbound_m, caddr_t) + inbound_oidx->cni_qos_formatp - 2, + cp, (unsigned)(inbound_oidx->cni_qos_len + 2)); + cp += inbound_oidx->cni_qos_len + 2; + } + if (inbound_oidx->cni_priorp) { + bcopy(mtod(inbound_m, caddr_t) + inbound_oidx->cni_priorp - 2, + cp, 3); + cp += 3; + } + if (inbound_oidx->cni_securep) { + bcopy(mtod(inbound_m, caddr_t) + inbound_oidx->cni_securep - 2, cp, + (unsigned)(inbound_oidx->cni_secure_len + 2)); + cp += inbound_oidx->cni_secure_len + 2; + } + m->m_len += optlen; + len += optlen; + } + + pdu->esis_hdr_len = m0->m_pkthdr.len = len; + iso_gen_csum(m0, ESIS_CKSUM_OFF, (int)pdu->esis_hdr_len); + + bzero((caddr_t)&siso, sizeof(siso)); + siso.siso_family = AF_ISO; + siso.siso_data[0] = AFI_SNA; + siso.siso_nlen = 6 + 1; /* should be taken from snpa_hdr */ + /* +1 is for AFI */ + bcopy(inbound_shp->snh_shost, siso.siso_data + 1, 6); + (ifp->if_output)(ifp, m0, (struct sockaddr *)&siso, 0); +} + +/* + * FUNCTION: esis_insert_addr + * + * PURPOSE: Insert an iso_addr into a buffer + * + * RETURNS: true if buffer was big enough, else false + * + * SIDE EFFECTS: Increment buf & len according to size of iso_addr + * + * NOTES: Plus 1 here is for length byte + */ +esis_insert_addr(buf, len, isoa, m, nsellen) +register caddr_t *buf; /* ptr to 
buffer to put address into */ +int *len; /* ptr to length of buffer so far */ +register struct iso_addr *isoa; /* ptr to address */ +register struct mbuf *m; /* determine if there remains space */ +int nsellen; +{ + register int newlen, result = 0; + + isoa->isoa_len -= nsellen; + newlen = isoa->isoa_len + 1; + if (newlen <= M_TRAILINGSPACE(m)) { + bcopy((caddr_t)isoa, *buf, newlen); + *len += newlen; + *buf += newlen; + m->m_len += newlen; + result = 1; + } + isoa->isoa_len += nsellen; + return (result); +} + +#define ESIS_EXTRACT_ADDR(d, b) { d = (struct iso_addr *)(b); b += (1 + *b); \ + if (b > buflim) {esis_stat.es_toosmall++; goto bad;}} +#define ESIS_NEXT_OPTION(b) { b += (2 + b[1]); \ + if (b > buflim) {esis_stat.es_toosmall++; goto bad;}} +int ESHonly = 0; +/* + +/* + * FUNCTION: esis_eshinput + * + * PURPOSE: Process an incoming ESH pdu + * + * RETURNS: nothing + * + * SIDE EFFECTS: + * + * NOTES: + */ +esis_eshinput(m, shp) +struct mbuf *m; /* esh pdu */ +struct snpa_hdr *shp; /* subnetwork header */ +{ + struct esis_fixed *pdu = mtod(m, struct esis_fixed *); + u_short ht; /* holding time */ + struct iso_addr *nsap; + int naddr; + u_char *buf = (u_char *)(pdu + 1); + u_char *buflim = pdu->esis_hdr_len + (u_char *)pdu; + int new_entry = 0; + + esis_stat.es_eshrcvd++; + + CTOH(pdu->esis_ht_msb, pdu->esis_ht_lsb, ht); + + naddr = *buf++; + if (buf >= buflim) + goto bad; + if (naddr == 1) { + ESIS_EXTRACT_ADDR(nsap, buf); + new_entry = snpac_add(shp->snh_ifp, + nsap, shp->snh_shost, SNPA_ES, ht, 0); + } else { + int nsellength = 0, nlen = 0; + { + /* See if we want to compress out multiple nsaps differing + only by nsel */ + register struct ifaddr *ifa = shp->snh_ifp->if_addrlist; + for (; ifa; ifa = ifa->ifa_next) + if (ifa->ifa_addr->sa_family == AF_ISO) { + nsellength = ((struct iso_ifaddr *)ifa)->ia_addr.siso_tlen; + break; + } + } + IFDEBUG(D_ESISINPUT) + printf("esis_eshinput: esh: ht %d, naddr %d nsellength %d\n", + ht, naddr, nsellength); + ENDDEBUG 
+ while (naddr-- > 0) { + struct iso_addr *nsap2; u_char *buf2; + ESIS_EXTRACT_ADDR(nsap, buf); + /* see if there is at least one more nsap in ESH differing + only by nsel */ + if (nsellength != 0) for (buf2 = buf; buf2 < buflim;) { + ESIS_EXTRACT_ADDR(nsap2, buf2); + IFDEBUG(D_ESISINPUT) + printf("esis_eshinput: comparing %s ", + clnp_iso_addrp(nsap)); + printf("and %s\n", clnp_iso_addrp(nsap2)); + ENDDEBUG + if (Bcmp(nsap->isoa_genaddr, nsap2->isoa_genaddr, + nsap->isoa_len - nsellength) == 0) { + nlen = nsellength; + break; + } + } + new_entry |= snpac_add(shp->snh_ifp, + nsap, shp->snh_shost, SNPA_ES, ht, nlen); + nlen = 0; + } + } + IFDEBUG(D_ESISINPUT) + printf("esis_eshinput: nsap %s is %s\n", + clnp_iso_addrp(nsap), new_entry ? "new" : "old"); + ENDDEBUG + if (new_entry && (iso_systype & SNPA_IS)) + esis_shoutput(shp->snh_ifp, ESIS_ISH, esis_holding_time, + shp->snh_shost, 6, (struct iso_addr *)0); +bad: + return; +} + +/* + * FUNCTION: esis_ishinput + * + * PURPOSE: process an incoming ISH pdu + * + * RETURNS: + * + * SIDE EFFECTS: + * + * NOTES: + */ +esis_ishinput(m, shp) +struct mbuf *m; /* esh pdu */ +struct snpa_hdr *shp; /* subnetwork header */ +{ + struct esis_fixed *pdu = mtod(m, struct esis_fixed *); + u_short ht, newct; /* holding time */ + struct iso_addr *nsap; /* Network Entity Title */ + register u_char *buf = (u_char *) (pdu + 1); + register u_char *buflim = pdu->esis_hdr_len + (u_char *)pdu; + int new_entry; + + esis_stat.es_ishrcvd++; + CTOH(pdu->esis_ht_msb, pdu->esis_ht_lsb, ht); + + IFDEBUG(D_ESISINPUT) + printf("esis_ishinput: ish: ht %d\n", ht); + ENDDEBUG + if (ESHonly) + goto bad; + + ESIS_EXTRACT_ADDR(nsap, buf); + + while (buf < buflim) { + switch (*buf) { + case ESISOVAL_ESCT: + if (iso_systype & SNPA_IS) + break; + if (buf[1] != 2) + goto bad; + CTOH(buf[2], buf[3], newct); + if (esis_config_time != newct) { + untimeout(esis_config,0); + esis_config_time = newct; + esis_config(); + } + break; + + default: + printf("Unknown ISH 
option: %x\n", *buf); + } + ESIS_NEXT_OPTION(buf); + } + new_entry = snpac_add(shp->snh_ifp, nsap, shp->snh_shost, SNPA_IS, ht, 0); + IFDEBUG(D_ESISINPUT) + printf("esis_ishinput: nsap %s is %s\n", + clnp_iso_addrp(nsap), new_entry ? "new" : "old"); + ENDDEBUG + + if (new_entry) + esis_shoutput(shp->snh_ifp, + iso_systype & SNPA_ES ? ESIS_ESH : ESIS_ISH, + esis_holding_time, shp->snh_shost, 6, (struct iso_addr *)0); +bad: + return; +} + +/* + * FUNCTION: esis_rdinput + * + * PURPOSE: Process an incoming RD pdu + * + * RETURNS: + * + * SIDE EFFECTS: + * + * NOTES: + */ +esis_rdinput(m0, shp) +struct mbuf *m0; /* esh pdu */ +struct snpa_hdr *shp; /* subnetwork header */ +{ + struct esis_fixed *pdu = mtod(m0, struct esis_fixed *); + u_short ht; /* holding time */ + struct iso_addr *da, *net = 0, *netmask = 0, *snpamask = 0; + register struct iso_addr *bsnpa; + register u_char *buf = (u_char *)(pdu + 1); + register u_char *buflim = pdu->esis_hdr_len + (u_char *)pdu; + + esis_stat.es_rdrcvd++; + + /* intermediate systems ignore redirects */ + if (iso_systype & SNPA_IS) + return; + if (ESHonly) + return; + + CTOH(pdu->esis_ht_msb, pdu->esis_ht_lsb, ht); + if (buf >= buflim) + return; + + /* Extract DA */ + ESIS_EXTRACT_ADDR(da, buf); + + /* Extract better snpa */ + ESIS_EXTRACT_ADDR(bsnpa, buf); + + /* Extract NET if present */ + if (buf < buflim) { + if (*buf == 0) + buf++; /* no NET present, skip NETL anyway */ + else + ESIS_EXTRACT_ADDR(net, buf); + } + + /* process options */ + while (buf < buflim) { + switch (*buf) { + case ESISOVAL_SNPAMASK: + if (snpamask) /* duplicate */ + return; + snpamask = (struct iso_addr *)(buf + 1); + break; + + case ESISOVAL_NETMASK: + if (netmask) /* duplicate */ + return; + netmask = (struct iso_addr *)(buf + 1); + break; + + default: + printf("Unknown option in ESIS RD (0x%x)\n", buf[-1]); + } + ESIS_NEXT_OPTION(buf); + } + + IFDEBUG(D_ESISINPUT) + printf("esis_rdinput: rd: ht %d, da %s\n", ht, clnp_iso_addrp(da)); + if (net) + 
printf("\t: net %s\n", clnp_iso_addrp(net)); + ENDDEBUG + /* + * If netl is zero, then redirect is to an ES. We need to add an entry + * to the snpa cache for (destination, better snpa). + * If netl is not zero, then the redirect is to an IS. In this + * case, add an snpa cache entry for (net, better snpa). + * + * If the redirect is to an IS, add a route entry towards that + * IS. + */ + if (net == 0 || net->isoa_len == 0 || snpamask) { + /* redirect to an ES */ + snpac_add(shp->snh_ifp, da, + bsnpa->isoa_genaddr, SNPA_ES, ht, 0); + } else { + snpac_add(shp->snh_ifp, net, + bsnpa->isoa_genaddr, SNPA_IS, ht, 0); + snpac_addrt(shp->snh_ifp, da, net, netmask); + } +bad: ; /* Needed by ESIS_NEXT_OPTION */ +} + +/* + * FUNCTION: esis_config + * + * PURPOSE: Report configuration + * + * RETURNS: + * + * SIDE EFFECTS: + * + * NOTES: Called every esis_config_time seconds + */ +void +esis_config() +{ + register struct ifnet *ifp; + + timeout(esis_config, (caddr_t)0, hz * esis_config_time); + + /* + * Report configuration for each interface that + * - is UP + * - has BROADCAST capability + * - has an ISO address + */ + /* Todo: a better way would be to construct the esh or ish + * once and copy it out for all devices, possibly calling + * a method in the iso_ifaddr structure to encapsulate and + * transmit it. This could work to advantage for non-broadcast media + */ + + for (ifp = ifnet; ifp; ifp = ifp->if_next) { + if ((ifp->if_flags & IFF_UP) && + (ifp->if_flags & IFF_BROADCAST)) { + /* search for an ISO address family */ + struct ifaddr *ia; + + for (ia = ifp->if_addrlist; ia; ia = ia->ifa_next) { + if (ia->ifa_addr->sa_family == AF_ISO) { + esis_shoutput(ifp, + iso_systype & SNPA_ES ? ESIS_ESH : ESIS_ISH, + esis_holding_time, + (caddr_t)(iso_systype & SNPA_ES ? 
all_is_snpa : + all_es_snpa), 6, (struct iso_addr *)0); + break; + } + } + } + } +} + +/* + * FUNCTION: esis_shoutput + * + * PURPOSE: Transmit an esh or ish pdu + * + * RETURNS: nothing + * + * SIDE EFFECTS: + * + * NOTES: + */ +esis_shoutput(ifp, type, ht, sn_addr, sn_len, isoa) +struct ifnet *ifp; +int type; +short ht; +caddr_t sn_addr; +int sn_len; +struct iso_addr *isoa; +{ + struct mbuf *m, *m0; + caddr_t cp, naddrp; + int naddr = 0; + struct esis_fixed *pdu; + struct iso_ifaddr *ia; + int len; + struct sockaddr_iso siso; + + if (type == ESIS_ESH) + esis_stat.es_eshsent++; + else if (type == ESIS_ISH) + esis_stat.es_ishsent++; + else { + printf("esis_shoutput: bad pdu type\n"); + return; + } + + IFDEBUG(D_ESISOUTPUT) + int i; + printf("esis_shoutput: ifp x%x (%s%d), %s, ht %d, to: [%d] ", + ifp, ifp->if_name, ifp->if_unit, type == ESIS_ESH ? "esh" : "ish", + ht, sn_len); + for (i=0; iesis_proto_id = ISO9542_ESIS; + pdu->esis_vers = ESIS_VERSION; + pdu->esis_type = type; + HTOC(pdu->esis_ht_msb, pdu->esis_ht_lsb, ht); + + if (type == ESIS_ESH) { + cp++; + len++; + } + + m->m_len = len; + if (isoa) { + /* + * Here we are responding to a clnp packet sent to an NSAP + * that is ours which was sent to the MAC addr all_es's. + * It is possible that we did not specifically advertise this + * NSAP, even though it is ours, so we will respond + * directly to the sender that we are here. If we do have + * multiple NSEL's we'll tack them on so he can compress them out. + */ + (void) esis_insert_addr(&cp, &len, isoa, m, 0); + naddr = 1; + } + for (ia = iso_ifaddr; ia; ia = ia->ia_next) { + int nsellen = (type == ESIS_ISH ? 
ia->ia_addr.siso_tlen : 0); + int n = ia->ia_addr.siso_nlen; + register struct iso_ifaddr *ia2; + + if (type == ESIS_ISH && naddr > 0) + break; + for (ia2 = iso_ifaddr; ia2 != ia; ia2 = ia2->ia_next) + if (Bcmp(ia->ia_addr.siso_data, ia2->ia_addr.siso_data, n) == 0) + break; + if (ia2 != ia) + continue; /* Means we have previously copied this nsap */ + if (isoa && Bcmp(ia->ia_addr.siso_data, isoa->isoa_genaddr, n) == 0) { + isoa = 0; + continue; /* Ditto */ + } + IFDEBUG(D_ESISOUTPUT) + printf("esis_shoutput: adding NSAP %s\n", + clnp_iso_addrp(&ia->ia_addr.siso_addr)); + ENDDEBUG + if (!esis_insert_addr(&cp, &len, + &ia->ia_addr.siso_addr, m, nsellen)) { + EXTEND_PACKET(m, m0, cp); + (void) esis_insert_addr(&cp, &len, &ia->ia_addr.siso_addr, m, + nsellen); + } + naddr++; + } + + if (type == ESIS_ESH) + *naddrp = naddr; + else { + /* add suggested es config timer option to ISH */ + if (M_TRAILINGSPACE(m) < 4) { + printf("esis_shoutput: extending packet\n"); + EXTEND_PACKET(m, m0, cp); + } + *cp++ = ESISOVAL_ESCT; + *cp++ = 2; + HTOC(*cp, *(cp+1), esis_esconfig_time); + len += 4; + m->m_len += 4; + IFDEBUG(D_ESISOUTPUT) + printf("m0 0x%x, m 0x%x, data 0x%x, len %d, cp 0x%x\n", + m0, m, m->m_data, m->m_len, cp); + ENDDEBUG + } + + m0->m_pkthdr.len = len; + pdu->esis_hdr_len = len; + iso_gen_csum(m0, ESIS_CKSUM_OFF, (int)pdu->esis_hdr_len); + + bzero((caddr_t)&siso, sizeof(siso)); + siso.siso_family = AF_ISO; + siso.siso_data[0] = AFI_SNA; + siso.siso_nlen = sn_len + 1; + bcopy(sn_addr, siso.siso_data + 1, (unsigned)sn_len); + (ifp->if_output)(ifp, m0, (struct sockaddr *)&siso, 0); +} + +/* + * FUNCTION: isis_input + * + * PURPOSE: Process an incoming isis packet + * + * RETURNS: nothing + * + * SIDE EFFECTS: + * + * NOTES: + */ +isis_input(m0, shp) +struct mbuf *m0; /* ptr to first mbuf of pkt */ +struct snpa_hdr *shp; /* subnetwork header */ +{ + register int type; + register struct rawcb *rp, *first_rp = 0; + struct ifnet *ifp = shp->snh_ifp; + char workbuf[16]; + 
struct mbuf *mm; + + IFDEBUG(D_ISISINPUT) + int i; + + printf("isis_input: pkt on ifp x%x (%s%d): from:", ifp, + ifp->if_name, ifp->if_unit); + for (i=0; i<6; i++) + printf("%x%c", shp->snh_shost[i]&0xff, (i<5) ? ':' : ' '); + printf(" to:"); + for (i=0; i<6; i++) + printf("%x%c", shp->snh_dhost[i]&0xff, (i<5) ? ':' : ' '); + printf("\n"); + ENDDEBUG + esis_dl.sdl_alen = ifp->if_addrlen; + esis_dl.sdl_index = ifp->if_index; + bcopy(shp->snh_shost, (caddr_t)esis_dl.sdl_data, esis_dl.sdl_alen); + for (rp = esis_pcb.rcb_next; rp != &esis_pcb; rp = rp->rcb_next) { + if (first_rp == 0) { + first_rp = rp; + continue; + } + if (mm = m_copy(m0, 0, M_COPYALL)) { /*can't block at interrupt level */ + if (sbappendaddr(&rp->rcb_socket->so_rcv, + &esis_dl, mm, (struct mbuf *)0) != 0) { + sorwakeup(rp->rcb_socket); + } else { + IFDEBUG(D_ISISINPUT) + printf("Error in sbappenaddr, mm = 0x%x\n", mm); + ENDDEBUG + m_freem(mm); + } + } + } + if (first_rp && sbappendaddr(&first_rp->rcb_socket->so_rcv, + &esis_dl, m0, (struct mbuf *)0) != 0) { + sorwakeup(first_rp->rcb_socket); + return; + } + m_freem(m0); +} + +isis_output(sdl, m) +register struct sockaddr_dl *sdl; +struct mbuf *m; +{ + register struct ifnet *ifp; + struct ifaddr *ifa, *ifa_ifwithnet(); + struct sockaddr_iso siso; + int error = 0; + unsigned sn_len; + + ifa = ifa_ifwithnet((struct sockaddr *)sdl); /* get ifp from sdl */ + if (ifa == 0) { + IFDEBUG(D_ISISOUTPUT) + printf("isis_output: interface not found\n"); + ENDDEBUG + error = EINVAL; + goto release; + } + ifp = ifa->ifa_ifp; + sn_len = sdl->sdl_alen; + IFDEBUG(D_ISISOUTPUT) + u_char *cp = (u_char *)LLADDR(sdl), *cplim = cp + sn_len; + printf("isis_output: ifp 0x%x (%s%d), to: ", + ifp, ifp->if_name, ifp->if_unit); + while (cp < cplim) { + printf("%x", *cp++); + printf("%c", (cp < cplim) ? 
':' : ' '); + } + printf("\n"); + ENDDEBUG + bzero((caddr_t)&siso, sizeof(siso)); + siso.siso_family = AF_ISO; /* This convention may be useful for X.25 */ + siso.siso_data[0] = AFI_SNA; + siso.siso_nlen = sn_len + 1; + bcopy(LLADDR(sdl), siso.siso_data + 1, sn_len); + error = (ifp->if_output)(ifp, m, (struct sockaddr *)&siso, 0); + if (error) { + IFDEBUG(D_ISISOUTPUT) + printf("isis_output: error from ether_output is %d\n", error); + ENDDEBUG + } + return (error); + +release: + if (m != NULL) + m_freem(m); + return(error); +} + + +/* + * FUNCTION: esis_ctlinput + * + * PURPOSE: Handle the PRC_IFDOWN transition + * + * RETURNS: nothing + * + * SIDE EFFECTS: + * + * NOTES: Calls snpac_flush for interface specified. + * The loop through iso_ifaddr is stupid because + * back in if_down, we knew the ifp... + */ +esis_ctlinput(req, siso) +int req; /* request: we handle only PRC_IFDOWN */ +struct sockaddr_iso *siso; /* address of ifp */ +{ + register struct iso_ifaddr *ia; /* scan through interface addresses */ + + if (req == PRC_IFDOWN) + for (ia = iso_ifaddr; ia; ia = ia->ia_next) { + if (iso_addrmatch(IA_SIS(ia), siso)) + snpac_flushifp(ia->ia_ifp); + } +} + +#endif /* ISO */ diff --git a/sys/netiso/esis.h b/sys/netiso/esis.h new file mode 100644 index 00000000000..81dd74ac310 --- /dev/null +++ b/sys/netiso/esis.h @@ -0,0 +1,135 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)esis.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. 
+ +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * $Header: esis.h,v 4.7 88/09/15 11:24:18 hagens Exp $ + * $Source: /usr/argo/sys/netiso/RCS/esis.h,v $ + */ + +#ifndef BYTE_ORDER +/* + * Definitions for byte order, + * according to byte significance from low address to high. + */ +#define LITTLE_ENDIAN 1234 /* least-significant byte first (vax) */ +#define BIG_ENDIAN 4321 /* most-significant byte first (IBM, net) */ +#define PDP_ENDIAN 3412 /* LSB first in word, MSW first in long (pdp) */ + +#ifdef vax +#define BYTE_ORDER LITTLE_ENDIAN +#else +#define BYTE_ORDER BIG_ENDIAN /* mc68000, tahoe, most others */ +#endif +#endif /* BYTE_ORDER */ + +#define SNPAC_AGE 60 /* seconds */ +#define ESIS_CONFIG 60 /* seconds */ +#define ESIS_HT (ESIS_CONFIG * 2) + +/* + * Fixed part of an ESIS header + */ +struct esis_fixed { + u_char esis_proto_id; /* network layer protocol identifier */ + u_char esis_hdr_len; /* length indicator (octets) */ + u_char esis_vers; /* version/protocol identifier extension */ + u_char esis_res1; /* reserved */ + u_char esis_type; /* type code */ +/* technically, type should be &='d 0x1f */ +#define ESIS_ESH 0x02 /* End System Hello */ +#define ESIS_ISH 0x04 /* Intermediate System Hello */ +#define ESIS_RD 0x06 /* Redirect */ + u_char esis_ht_msb; /* holding time (seconds) high byte */ + u_char esis_ht_lsb; /* holding time (seconds) low byte */ + u_char esis_cksum_msb; /* checksum high byte */ + u_char 
esis_cksum_lsb; /* checksum low byte */ +}; +/* + * Values for ESIS datagram options + */ +#define ESISOVAL_NETMASK 0xe1 /* address mask option, RD PDU only */ +#define ESISOVAL_SNPAMASK 0xe2 /* snpa mask option, RD PDU only */ +#define ESISOVAL_ESCT 0xc6 /* end system conf. timer, ISH PDU only */ + + +#define ESIS_CKSUM_OFF 0x07 +#define ESIS_CKSUM_REQUIRED(pdu)\ + ((pdu->esis_cksum_msb != 0) || (pdu->esis_cksum_lsb != 0)) + +#define ESIS_VERSION 1 + +struct esis_stat { + u_short es_nomem; /* insufficient memory to send hello */ + u_short es_badcsum; /* incorrect checksum */ + u_short es_badvers; /* incorrect version number */ + u_short es_badtype; /* unknown pdu type field */ + u_short es_toosmall; /* packet too small */ + u_short es_eshsent; /* ESH sent */ + u_short es_eshrcvd; /* ESH rcvd */ + u_short es_ishsent; /* ISH sent */ + u_short es_ishrcvd; /* ISH rcvd */ + u_short es_rdsent; /* RD sent */ + u_short es_rdrcvd; /* RD rcvd */ +}; + +#ifdef KERNEL +struct esis_stat esis_stat; +#endif /* KERNEL */ diff --git a/sys/netiso/idrp_usrreq.c b/sys/netiso/idrp_usrreq.c new file mode 100644 index 00000000000..3109936b415 --- /dev/null +++ b/sys/netiso/idrp_usrreq.c @@ -0,0 +1,177 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)idrp_usrreq.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +int idrp_input(); +struct isopcb idrp_isop; +static struct sockaddr_iso idrp_addrs[2] = +{ { sizeof(idrp_addrs), AF_ISO, }, { sizeof(idrp_addrs[1]), AF_ISO, } }; +/* + * IDRP initialization + */ +idrp_init() +{ + extern struct clnl_protosw clnl_protox[256]; + + idrp_isop.isop_next = idrp_isop.isop_prev = &idrp_isop; + idrp_isop.isop_faddr = &idrp_isop.isop_sfaddr; + idrp_isop.isop_laddr = &idrp_isop.isop_sladdr; + idrp_isop.isop_sladdr = idrp_addrs[1]; + idrp_isop.isop_sfaddr = idrp_addrs[1]; + clnl_protox[ISO10747_IDRP].clnl_input = idrp_input; +} + +/* + * CALLED FROM: + * tpclnp_input(). + * FUNCTION and ARGUMENTS: + * Take a packet (m) from clnp, strip off the clnp header + * and mke suitable for the idrp socket. + * No return value. + */ +idrp_input(m, src, dst) + register struct mbuf *m; + struct sockaddr_iso *src, *dst; +{ + if (idrp_isop.isop_socket == 0) { + bad: m_freem(m); + return 0; + } + bzero(idrp_addrs[0].siso_data, sizeof(idrp_addrs[0].siso_data)); + bcopy((caddr_t)&(src->siso_addr), (caddr_t)&idrp_addrs[0].siso_addr, + 1 + src->siso_nlen); + bzero(idrp_addrs[1].siso_data, sizeof(idrp_addrs[1].siso_data)); + bcopy((caddr_t)&(dst->siso_addr), (caddr_t)&idrp_addrs[1].siso_addr, + 1 + dst->siso_nlen); + if (sbappendaddr(&idrp_isop.isop_socket->so_rcv, + (struct sockaddr *)idrp_addrs, m, (struct mbuf *)0) == 0) + goto bad; + sorwakeup(idrp_isop.isop_socket); + return 0; +} + +idrp_output(m, addr) + struct mbuf *m, *addr; +{ + register struct sockaddr_iso *siso = mtod(addr, struct sockaddr_iso *); + int s = splnet(), i; + + bcopy((caddr_t)&(siso->siso_addr), + (caddr_t)&idrp_isop.isop_sfaddr.siso_addr, 1 + siso->siso_nlen); + siso++; + bcopy((caddr_t)&(siso->siso_addr), + (caddr_t)&idrp_isop.isop_sladdr.siso_addr, 1 + 
siso->siso_nlen); + i = clnp_output(m, idrp_isop, m->m_pkthdr.len, 0); + splx(s); + return (i); +} + +u_long idrp_sendspace = 3072; /* really max datagram size */ +u_long idrp_recvspace = 40 * 1024; /* 40 1K datagrams */ + +/*ARGSUSED*/ +idrp_usrreq(so, req, m, addr, control) + struct socket *so; + int req; + struct mbuf *m, *addr, *control; +{ + int error = 0; + + /* Note: need to block idrp_input while changing + * the udp pcb queue and/or pcb addresses. + */ + switch (req) { + + case PRU_ATTACH: + if (idrp_isop.isop_socket != NULL) { + error = ENXIO; + break; + } + idrp_isop.isop_socket = so; + error = soreserve(so, idrp_sendspace, idrp_recvspace); + break; + + case PRU_SHUTDOWN: + socantsendmore(so); + break; + + case PRU_SEND: + return (idrp_output(m, addr)); + + case PRU_ABORT: + soisdisconnected(so); + case PRU_DETACH: + idrp_isop.isop_socket = 0; + break; + + + case PRU_SENSE: + /* + * stat: don't bother with a blocksize. + */ + return (0); + + default: + return (EOPNOTSUPP); /* do not free mbuf's */ + } + +release: + if (control) { + printf("idrp control data unexpectedly retained\n"); + m_freem(control); + } + if (m) + m_freem(m); + return (error); +} diff --git a/sys/netiso/if_cons.c b/sys/netiso/if_cons.c new file mode 100644 index 00000000000..7724b048be5 --- /dev/null +++ b/sys/netiso/if_cons.c @@ -0,0 +1,960 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_cons.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. 
+ +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * $Header: if_cons.c,v 4.7 88/08/11 15:52:55 nhall Exp $ + * $Source: /usr/argo/sys/netiso/RCS/if_cons.c,v $ + * + * cons.c - Connection Oriented Network Service: + * including support for a) user transport-level service, + * b) COSNS below CLNP, and c) CONS below TP. + */ + +#ifdef TPCONS +#ifdef KERNEL +#ifdef ARGO_DEBUG +#define Static +unsigned LAST_CALL_PCB; +#else /* ARGO_DEBUG */ +#define Static static +#endif /* ARGO_DEBUG */ + +#ifndef SOCK_STREAM +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#endif + +#ifdef ARGO_DEBUG +#define MT_XCONN 0x50 +#define MT_XCLOSE 0x51 +#define MT_XCONFIRM 0x52 +#define MT_XDATA 0x53 +#define MT_XHEADER 0x54 +#else +#define MT_XCONN MT_DATA +#define MT_XCLOSE MT_DATA +#define MT_XCONFIRM MT_DATA +#define MT_XDATA MT_DATA +#define MT_XHEADER MT_HEADER +#endif /* ARGO_DEBUG */ + +#define DONTCLEAR -1 + +/********************************************************************* + * cons.c - CONS interface to the x.25 layer + * + * TODO: figure out what resources we might run out of besides mbufs. + * If we run out of any of them (including mbufs) close and recycle + * lru x% of the connections, for some parameter x. 
+ * + * There are 2 interfaces from above: + * 1) from TP0: + * cons CO network service + * TP associates a transport connection with a network connection. + * cons_output( isop, m, len, isdgm==0 ) + * co_flags == 0 + * 2) from TP4: + * It's a datagram service, like clnp is. - even though it calls + * cons_output( isop, m, len, isdgm==1 ) + * it eventually goes through + * cosns_output(ifp, m, dst). + * TP4 permits multiplexing (reuse, possibly simultaneously) of the + * network connections. + * This means that many sockets (many tpcbs) may be associated with + * this pklcd, hence cannot have a back ptr from pklcd to a tpcb. + * co_flags & CONSF_DGM + * co_socket is null since there may be many sockets that use this pklcd. + * +NOTE: + streams would really be nice. sigh. +NOTE: + PVCs could be handled by config-ing a cons with an address and with the + IFF_POINTTOPOINT flag on. This code would then have to skip the + connection setup stuff for pt-to-pt links. + + + *********************************************************************/ + + +#define CONS_IFQMAXLEN 5 + + +/* protosw pointers for getting to higher layer */ +Static struct protosw *CLNP_proto; +Static struct protosw *TP_proto; +Static struct protosw *X25_proto; +Static int issue_clear_req(); + +#ifndef PHASEONE +extern struct ifaddr *ifa_ifwithnet(); +#endif /* PHASEONE */ + +extern struct ifaddr *ifa_ifwithaddr(); + +extern struct isopcb tp_isopcb; /* chain of all TP pcbs */ + + +Static int parse_facil(), NSAPtoDTE(), make_partial_x25_packet(); +Static int FACILtoNSAP(), DTEtoNSAP(); +Static struct pklcd *cons_chan_to_pcb(); + +#define HIGH_NIBBLE 1 +#define LOW_NIBBLE 0 + +/* + * NAME: nibble_copy() + * FUNCTION and ARGUMENTS: + * copies (len) nibbles from (src_octet), high or low nibble + * to (dst_octet), high or low nibble, + * src_nibble & dst_nibble should be: + * HIGH_NIBBLE (1) if leftmost 4 bits/ most significant nibble + * LOW_NIBBLE (0) if rightmost 4 bits/ least significant nibble + * 
RETURNS: VOID + */ +void +nibble_copy(src_octet, src_nibble, dst_octet, dst_nibble, len) + register char *src_octet; + register char *dst_octet; + register unsigned src_nibble; + register unsigned dst_nibble; + int len; +{ + + register i; + register unsigned dshift, sshift; + + IFDEBUG(D_CADDR) + printf("nibble_copy ( 0x%x, 0x%x, 0x%x, 0x%x 0x%x)\n", + src_octet, src_nibble, dst_octet, dst_nibble, len); + ENDDEBUG +#define SHIFT 0x4 + + dshift = dst_nibble << 2; + sshift = src_nibble << 2; + + for (i=0; i> sshift))<< dshift; + + dshift ^= SHIFT; + sshift ^= SHIFT; + src_nibble = 1-src_nibble; + dst_nibble = 1-dst_nibble; + src_octet += src_nibble; + dst_octet += dst_nibble; + } + IFDEBUG(D_CADDR) + printf("nibble_copy DONE\n"); + ENDDEBUG +} + +/* + * NAME: nibble_match() + * FUNCTION and ARGUMENTS: + * compares src_octet/src_nibble and dst_octet/dst_nibble for len nibbles. + * RETURNS: 0 if they differ, 1 if they are the same. + */ +int +nibble_match( src_octet, src_nibble, dst_octet, dst_nibble, len) + register char *src_octet; + register char *dst_octet; + register unsigned src_nibble; + register unsigned dst_nibble; + int len; +{ + + register i; + register unsigned dshift, sshift; + u_char nibble_a, nibble_b; + + IFDEBUG(D_CADDR) + printf("nibble_match ( 0x%x, 0x%x, 0x%x, 0x%x 0x%x)\n", + src_octet, src_nibble, dst_octet, dst_nibble, len); + ENDDEBUG +#define SHIFT 0x4 + + dshift = dst_nibble << 2; + sshift = src_nibble << 2; + + for (i=0; i>dshift) & 0xf; + nibble_a = ( 0xf & (*src_octet >> sshift)); + if (nibble_b != nibble_a) + return 0; + + dshift ^= SHIFT; + sshift ^= SHIFT; + src_nibble = 1-src_nibble; + dst_nibble = 1-dst_nibble; + src_octet += src_nibble; + dst_octet += dst_nibble; + } + IFDEBUG(D_CADDR) + printf("nibble_match DONE\n"); + ENDDEBUG + return 1; +} + +/* + **************************** NET PROTOCOL cons *************************** + */ +/* + * NAME: cons_init() + * CALLED FROM: + * autoconf + * FUNCTION: + * initialize the protocol + */ 
+cons_init() +{ + int tp_incoming(), clnp_incoming(); + + + CLNP_proto = pffindproto(AF_ISO, ISOPROTO_CLNP, SOCK_DGRAM); + X25_proto = pffindproto(AF_ISO, ISOPROTO_X25, SOCK_STREAM); + TP_proto = pffindproto(AF_ISO, ISOPROTO_TP0, SOCK_SEQPACKET); + IFDEBUG(D_CCONS) + printf("cons_init end : cnlp_proto 0x%x cons proto 0x%x tp proto 0x%x\n", + CLNP_proto, X25_proto, TP_proto); + ENDDEBUG +#ifdef notdef + pk_protolisten(0x81, 0, clnp_incoming); + pk_protolisten(0x82, 0, esis_incoming); + pk_protolisten(0x84, 0, tp8878_A_incoming); + pk_protolisten(0, 0, tp_incoming); +#endif +} + +tp_incoming(lcp, m) +struct pklcd *lcp; +register struct mbuf *m; +{ + register struct isopcb *isop; + int cons_tpinput(); + + if (iso_pcballoc((struct socket *)0, &tp_isopcb)) { + pk_close(lcp); + return; + } + isop = tp_isopcb.isop_next; + lcp->lcd_upper = cons_tpinput; + lcp->lcd_upnext = (caddr_t)isop; + lcp->lcd_send(lcp); /* Confirms call */ + isop->isop_chan = (caddr_t)lcp; + isop->isop_laddr = &isop->isop_sladdr; + isop->isop_faddr = &isop->isop_sfaddr; + DTEtoNSAP(isop->isop_laddr, &lcp->lcd_laddr); + DTEtoNSAP(isop->isop_faddr, &lcp->lcd_faddr); + parse_facil(lcp, isop, &(mtod(m, struct x25_packet *)->packet_data), + m->m_pkthdr.len - PKHEADERLN); +} + +cons_tpinput(lcp, m0) +struct mbuf *m0; +struct pklcd *lcp; +{ + register struct isopcb *isop = (struct isopcb *)lcp->lcd_upnext; + register struct x25_packet *xp; + int cmd, ptype = CLEAR; + + if (isop == 0) + return; + if (m0 == 0) + goto dead; + switch(m0->m_type) { + case MT_DATA: + case MT_OOBDATA: + tpcons_input(m0, isop->isop_faddr, isop->isop_laddr, (caddr_t)lcp); + return; + + case MT_CONTROL: + switch (ptype = pk_decode(mtod(m0, struct x25_packet *))) { + + case RR: + cmd = PRC_CONS_SEND_DONE; + break; + + case CALL_ACCEPTED: + if (lcp->lcd_sb.sb_mb) + lcp->lcd_send(lcp); /* XXX - fix this */ + /*FALLTHROUGH*/ + default: + return; + + dead: + case CLEAR: + case CLEAR_CONF: + lcp->lcd_upper = 0; + lcp->lcd_upnext = 0; + 
isop->isop_chan = 0; + case RESET: + cmd = PRC_ROUTEDEAD; + } + tpcons_ctlinput(cmd, isop->isop_faddr, isop); + if (cmd = PRC_ROUTEDEAD && isop->isop_refcnt == 0) + iso_pcbdetach(isop); + } +} + +/* + * NAME: cons_connect() + * CALLED FROM: + * tpcons_pcbconnect() when opening a new connection. + * FUNCTION anD ARGUMENTS: + * Figures out which device to use, finding a route if one doesn't + * already exist. + * RETURN VALUE: + * returns E* + */ +cons_connect(isop) + register struct isopcb *isop; +{ + register struct pklcd *lcp = (struct pklcd *)isop->isop_chan; + register struct mbuf *m; + struct ifaddr *ifa; + int error; + + IFDEBUG(D_CCONN) + printf("cons_connect(0x%x): ", isop); + dump_isoaddr(isop->isop_faddr); + printf("myaddr: "); + dump_isoaddr(isop->isop_laddr); + printf("\n" ); + ENDDEBUG + NSAPtoDTE(isop->isop_faddr, &lcp->lcd_faddr); + lcp->lcd_upper = cons_tpinput; + lcp->lcd_upnext = (caddr_t)isop; + IFDEBUG(D_CCONN) + printf( + "calling make_partial_x25_packet( 0x%x, 0x%x, 0x%x)\n", + &lcp->lcd_faddr, &lcp->lcd_laddr, + isop->isop_socket->so_proto->pr_protocol); + ENDDEBUG + if ((error = make_partial_x25_packet(isop, lcp, m)) == 0) + error = pk_connect(lcp, &lcp->lcd_faddr); + return error; +} + +/* + **************************** DEVICE cons *************************** + */ + + +/* + * NAME: cons_ctlinput() + * CALLED FROM: + * lower layer when ECN_CLEAR occurs : this routine is here + * for consistency - cons subnet service calls its higher layer + * through the protosw entry. + * FUNCTION & ARGUMENTS: + * cmd is a PRC_* command, list found in ../sys/protosw.h + * copcb is the obvious. + * This serves the higher-layer cons service. + * NOTE: this takes 3rd arg. because cons uses it to inform itself + * of things (timeouts, etc) but has a pcb instead of an address. 
+ */ +cons_ctlinput(cmd, sa, copcb) + int cmd; + struct sockaddr *sa; + register struct pklcd *copcb; +{ +} + + +find_error_reason( xp ) + register struct x25_packet *xp; +{ + extern u_char x25_error_stats[]; + int error, cause; + + if (xp) { + cause = 4[(char *)xp]; + switch (cause) { + case 0x00: + case 0x80: + /* DTE originated; look at the diagnostic */ + error = (CONL_ERROR_MASK | cause); + goto done; + + case 0x01: /* number busy */ + case 0x81: + case 0x09: /* Out of order */ + case 0x89: + case 0x11: /* Remot Procedure Error */ + case 0x91: + case 0x19: /* reverse charging accept not subscribed */ + case 0x99: + case 0x21: /* Incampat destination */ + case 0xa1: + case 0x29: /* fast select accept not subscribed */ + case 0xa9: + case 0x39: /* ship absent */ + case 0xb9: + case 0x03: /* invalid facil request */ + case 0x83: + case 0x0b: /* access barred */ + case 0x8b: + case 0x13: /* local procedure error */ + case 0x93: + case 0x05: /* network congestion */ + case 0x85: + case 0x8d: /* not obtainable */ + case 0x0d: + case 0x95: /* RPOA out of order */ + case 0x15: + /* take out bit 8 + * so we don't have to have so many perror entries + */ + error = (CONL_ERROR_MASK | 0x100 | (cause & ~0x80)); + goto done; + + case 0xc1: /* gateway-detected proc error */ + case 0xc3: /* gateway congestion */ + + error = (CONL_ERROR_MASK | 0x100 | cause); + goto done; + } + } + /* otherwise, a *hopefully* valid perror exists in the e_reason field */ + error = xp->packet_data; + if (error = 0) { + printf("Incoming PKT TYPE 0x%x with reason 0x%x\n", + pk_decode(xp), + cause); + error = E_CO_HLI_DISCA; + } + +done: + return error; +} + + + +#endif /* KERNEL */ + +/* + * NAME: make_partial_x25_packet() + * + * FUNCTION and ARGUMENTS: + * Makes part of an X.25 call packet, for use by x25. + * (src) and (dst) are the NSAP-addresses of source and destination. + * (buf) is a ptr to a buffer into which to write this partial header. 
+ * + * 0 Facility length (in octets) + * 1 Facility field, which is a set of: + * m facil code + * m+1 facil param len (for >2-byte facilities) in octets + * m+2..p facil param field + * q user data (protocol identification octet) + * + * + * RETURNS: + * 0 if OK + * E* if failed. + * + * SIDE EFFECTS: + * Stores facilites mbuf in X.25 control block, where the connect + * routine knows where to look for it. + */ + +#ifdef X25_1984 +int cons_use_facils = 1; +#else /* X25_1984 */ +int cons_use_facils = 0; +#endif /* X25_1984 */ + +int cons_use_udata = 1; /* KLUDGE FOR DEBUGGING */ + +Static int +make_partial_x25_packet(isop, lcp) + struct isopcb *isop; + struct pklcd *lcp; +{ + u_int proto; + int flag; + caddr_t buf; + register caddr_t ptr; + register int len = 0; + int buflen =0; + caddr_t facil_len; + int oddness = 0; + struct mbuf *m; + + + IFDEBUG(D_CCONN) + printf("make_partial_x25_packet(0x%x, 0x%x, 0x%x, 0x%x, 0x%x)\n", + isop->isop_laddr, isop->isop_faddr, proto, m, flag); + ENDDEBUG + if (cons_use_udata) { + if (isop->isop_x25crud_len > 0) { + /* + * The user specified something. 
Stick it in + */ + bcopy(isop->isop_x25crud, lcp->lcd_faddr.x25_udata, + isop->isop_x25crud_len); + lcp->lcd_faddr.x25_udlen = isop->isop_x25crud_len; + } + } + + if (cons_use_facils == 0) { + lcp->lcd_facilities = 0; + return 0; + } + MGETHDR(m, MT_DATA, M_WAITOK); + if (m == 0) + return ENOBUFS; + buf = mtod(m, caddr_t); + ptr = buf; + + /* ptr now points to facil length (len of whole facil field in OCTETS */ + facil_len = ptr ++; + m->m_len = 0; + pk_build_facilities(m, &lcp->lcd_faddr, 0); + + IFDEBUG(D_CADDR) + printf("make_partial calling: ptr 0x%x, len 0x%x\n", ptr, + isop->isop_laddr->siso_addr.isoa_len); + ENDDEBUG + if (cons_use_facils) { + *ptr++ = 0; /* Marker to separate X.25 facitilies from CCITT ones */ + *ptr++ = 0x0f; + *ptr = 0xcb; /* calling facility code */ + ptr ++; + ptr ++; /* leave room for facil param len (in OCTETS + 1) */ + ptr ++; /* leave room for the facil param len (in nibbles), + * high two bits of which indicate full/partial NSAP + */ + len = isop->isop_laddr->siso_addr.isoa_len; + bcopy( isop->isop_laddr->siso_data, ptr, len); + *(ptr-2) = len+1; /* facil param len in octets */ + *(ptr-1) = len<<1; /* facil param len in nibbles */ + ptr += len; + + IFDEBUG(D_CADDR) + printf("make_partial called: ptr 0x%x, len 0x%x\n", ptr, + isop->isop_faddr->siso_addr.isoa_len); + ENDDEBUG + *ptr = 0xc9; /* called facility code */ + ptr ++; + ptr ++; /* leave room for facil param len (in OCTETS + 1) */ + ptr ++; /* leave room for the facil param len (in nibbles), + * high two bits of which indicate full/partial NSAP + */ + len = isop->isop_faddr->siso_nlen; + bcopy(isop->isop_faddr->siso_data, ptr, len); + *(ptr-2) = len+1; /* facil param len = addr len + 1 for each of these + * two length fields, in octets */ + *(ptr-1) = len<<1; /* facil param len in nibbles */ + ptr += len; + + } + *facil_len = ptr - facil_len - 1; + if (*facil_len > MAX_FACILITIES) + return E_CO_PNA_LONG; + + buflen = (int)(ptr - buf); + + IFDEBUG(D_CDUMP_REQ) + register int 
i; + + printf("ECN_CONNECT DATA buf 0x%x len %d (0x%x)\n", + buf, buflen, buflen); + for( i=0; i < buflen; ) { + printf("+%d: %x %x %x %x %x %x %x %x\n", + i, + *(buf+i), *(buf+i+1), *(buf+i+2), *(buf+i+3), + *(buf+i+4), *(buf+i+5), *(buf+i+6), *(buf+i+7)); + i+=8; + } + ENDDEBUG + IFDEBUG(D_CADDR) + printf("make_partial returns buf 0x%x size 0x%x bytes\n", + mtod(m, caddr_t), buflen); + ENDDEBUG + + if (buflen > MHLEN) + return E_CO_PNA_LONG; + + m->m_pkthdr.len = m->m_len = buflen; + lcp->lcd_facilities = m; + return 0; +} + +/* + * NAME: NSAPtoDTE() + * CALLED FROM: + * make_partial_x25_packet() + * FUNCTION and ARGUMENTS: + * get a DTE address from an NSAP-address (struct sockaddr_iso) + * (dst_octet) is the octet into which to begin stashing the DTE addr + * (dst_nibble) takes 0 or 1. 1 means begin filling in the DTE addr + * in the high-order nibble of dst_octet. 0 means low-order nibble. + * (addr) is the NSAP-address + * (flag) is true if the transport suffix is to become the + * last two digits of the DTE address + * A DTE address is a series of ASCII digits + * + * A DTE address may have leading zeros. The are significant. + * 1 digit per nibble, may be an odd number of nibbles. + * + * An NSAP-address has the DTE address in the IDI. Leading zeros are + * significant. Trailing hex f indicates the end of the DTE address. + * The IDI is a series of BCD digits, one per nibble. + * + * RETURNS + * # significant digits in the DTE address, -1 if error. 
+ */ + +Static int +NSAPtoDTE(siso, sx25) + register struct sockaddr_iso *siso; + register struct sockaddr_x25 *sx25; +{ + int dtelen = -1; + + IFDEBUG(D_CADDR) + printf("NSAPtoDTE: nsap: %s\n", clnp_iso_addrp(&siso->siso_addr)); + ENDDEBUG + + if (siso->siso_data[0] == AFI_37) { + register char *out = sx25->x25_addr; + register char *in = siso->siso_data + 1; + register int nibble; + char *lim = siso->siso_data + siso->siso_nlen; + char *olim = out+15; + int lowNibble = 0; + + while (in < lim) { + nibble = ((lowNibble ? *in++ : (*in >> 4)) & 0xf) | 0x30; + lowNibble ^= 1; + if (nibble != 0x3f && out < olim) + *out++ = nibble; + } + dtelen = out - sx25->x25_addr; + *out++ = 0; + } else { + /* error = iso_8208snparesolve(addr, x121string, &x121strlen);*/ + register struct rtentry *rt; + extern struct sockaddr_iso blank_siso; + struct sockaddr_iso nsiso; + + nsiso = blank_siso; + bcopy(nsiso.siso_data, siso->siso_data, + nsiso.siso_nlen = siso->siso_nlen); + if (rt = rtalloc1(&nsiso, 1)) { + register struct sockaddr_x25 *sxx = + (struct sockaddr_x25 *)rt->rt_gateway; + register char *in = sxx->x25_addr; + + rt->rt_use--; + if (sxx && sxx->x25_family == AF_CCITT) { + bcopy(sx25->x25_addr, sxx->x25_addr, sizeof(sx25->x25_addr)); + while (*in++) {} + dtelen = in - sxx->x25_addr; + } + } + } + return dtelen; +} + +/* + * NAME: FACILtoNSAP() + * CALLED FROM: + * parse_facil() + * FUNCTION and ARGUMENTS: + * Creates and NSAP in the sockaddr_iso (addr) from the + * x.25 facility found at buf - 1. + * RETURNS: + * 0 if ok, -1 if error. 
+ */ + +Static int +FACILtoNSAP(addr, buf) + register u_char *buf; + register struct sockaddr_iso *addr; +{ + int len_in_nibbles = *++buf & 0x3f; + u_char buf_len = (len_in_nibbles + 1) >> 1;; /* in bytes */ + + IFDEBUG(D_CADDR) + printf("FACILtoNSAP( 0x%x, 0x%x, 0x%x )\n", + buf, buf_len, addr ); + ENDDEBUG + + len_in_nibbles = *buf & 0x3f; + /* despite the fact that X.25 makes us put a length in nibbles + * here, the NSAP-addrs are always in full octets + */ + switch (*buf++ & 0xc0) { + case 0: + /* Entire OSI NSAP address */ + bcopy((caddr_t)buf, addr->siso_data, addr->siso_nlen = buf_len); + break; + + case 40: + /* Partial OSI NSAP address, assume trailing */ + if (buf_len + addr->siso_nlen > sizeof(addr->siso_addr)) + return -1; + bcopy((caddr_t)buf, TSEL(addr), buf_len); + addr->siso_nlen += buf_len; + break; + + default: + /* Rather than blow away the connection, just ignore and use + NSAP from DTE */; + } + return 0; +} + +Static +init_siso(siso) +register struct sockaddr_iso *siso; +{ + siso->siso_len = sizeof (*siso); + siso->siso_family = AF_ISO; + siso->siso_data[0] = AFI_37; + siso->siso_nlen = 8; +} + +/* + * NAME: DTEtoNSAP() + * CALLED FROM: + * parse_facil() + * FUNCTION and ARGUMENTS: + * Creates a type 37 NSAP in the sockaddr_iso (addr) + * from a DTE address found in a sockaddr_x25. + * + * RETURNS: + * 0 if ok; E* otherwise. 
+ */
+
+Static  int
+DTEtoNSAP(addr, sx)
+	struct sockaddr_iso *addr;
+	struct sockaddr_x25 *sx;
+{
+	register char		*in, *out;
+	register int		first;
+	int					pad_tail = 0;
+	int 				src_len;
+
+
+	init_siso(addr);
+	in = sx->x25_addr;
+	src_len = strlen(in);
+	addr->siso_nlen = (src_len + 3) / 2;	/* 1 AFI octet + two digits per octet, rounded up */
+	out = addr->siso_data;
+	*out++ = 0x37;
+	if (src_len & 1) {
+		pad_tail = 0xf;	/* odd digit count: last low nibble becomes 0xf */
+		src_len++;
+	}
+	/* pack the low 4 bits of each character, first digit in the high nibble */
+	for (first = 0; src_len > 0; src_len--) {
+		first |= 0xf & *in++;
+		if (src_len & 1) {
+			*out++ = first;
+			first = 0;
+		}
+		else first <<= 4;
+	}
+	if (pad_tail)
+		out[-1] |= 0xf;
+	return 0; /* ok */
+}
+
+/*
+ * FUNCTION and ARGUMENTS:
+ *	parses (buf_len) bytes beginning at (buf) and finds
+ *  a called nsap, a calling nsap, and protocol identifier.
+ *  (buf) starts at the octet holding the two DTE address lengths (one
+ *  per nibble).  The calling NSAP facility (0xcb) is stored in
+ *  isop->isop_faddr and the called NSAP facility (0xc9) in
+ *  isop->isop_laddr; all other facilities are skipped.
+ * RETURNS:
+ *  0 if ok, E* otherwise.
+ *
+ * NOTE(review): (buf_len) is only used for debug output; neither the
+ *  facility length read from the packet nor the per-facility lengths are
+ *  checked against it, and ptr_lim is never set -- confirm whether a
+ *  bounds check was intended.
+ * NOTE(review): (lcp) is only used for debug output.
+ */
+
+Static int
+parse_facil(lcp, isop, buf, buf_len)
+	caddr_t 		buf;
+	u_char			buf_len; /* in bytes */
+	struct			isopcb *isop;
+	struct			pklcd *lcp;
+{
+	register int 	i;
+	register u_char 	*ptr = (u_char *)buf;
+	u_char			*ptr_lim, *facil_lim;
+	int 			facil_param_len, facil_len;
+
+	IFDEBUG(D_CADDR)
+		printf("parse_facil(0x%x, 0x%x, 0x%x, 0x%x)\n", 
+			lcp, isop, buf, buf_len);
+		dump_buf(buf, buf_len);
+	ENDDEBUG
+
+	/* find the beginnings of the facility fields in buf 
+	 * by skipping over the called & calling DTE addresses
+	 * i <- # nibbles in called + # nibbles in calling
+	 * i += 1 so that an odd nibble gets rounded up to even  
+	 * before dividing by 2, then divide by two to get # octets
+	 */
+	i = (int)(*ptr >> 4) + (int)(*ptr&0xf);
+	i++;
+	ptr += i >> 1;
+	ptr ++; /* plus one for the DTE lengths byte */
+
+	/* ptr now is at facil_length field */
+	facil_len = *ptr++;
+	facil_lim = ptr + facil_len;
+	IFDEBUG(D_CADDR)
+		printf("parse_facils: facil length is  0x%x\n", (int) facil_len);
+	ENDDEBUG
+
+	while (ptr < facil_lim) {
+		/* get NSAP addresses from facilities */
+		switch (*ptr++) {
+			case 0xcb:
+				/* calling NSAP */
+				facil_param_len = FACILtoNSAP(isop->isop_faddr, ptr);
+				break;
+			case 0xc9:
+				/* called NSAP */
+				facil_param_len = FACILtoNSAP(isop->isop_laddr, ptr);
+				break;
+
+				/* from here to default are legit cases that I ignore */
+				/* variable length */
+			case 0xca:  /* end-to-end transit delay negot */
+			case 0xc6:  /* network user id */
+			case 0xc5: 	/* charging info : indicating monetary unit */
+			case 0xc2: 	/* charging info : indicating segment count */
+			case 0xc1: 	/* charging info : indicating call duration */
+			case 0xc4: 	/* RPOA extended format */
+			case 0xc3: 	/* call redirection notification */
+				facil_param_len = 0;
+				break;
+
+				/* 1 octet */
+			case 0x0a:  /* min. throughput class negot */
+			case 0x02:  /* throughput class */
+			case 0x03:  case 0x47:  /* CUG */
+			case 0x0b:  /* expedited data negot */
+			case 0x01:  /* Fast select or reverse charging 
+						(example of intelligent protocol design) */
+			case 0x04: 	/* charging info : requesting service */
+			case 0x08: 	/* called line addr modified notification */
+			case 0x00:  /* marker to indicate beginning of CCITT facils */
+				facil_param_len = 1;
+				break;
+
+				/* any 2 octets */
+			case 0x42:  /* pkt size */
+			case 0x43:  /* win size */
+			case 0x44:  /* RPOA basic format */
+			case 0x41:  /* bilateral CUG */
+			case 0x49: 	/* transit delay selection and indication */
+				facil_param_len = 2;
+				break;
+
+			default:
+				printf(
+"BOGUS FACILITY CODE facil_lim 0x%x facil_len %d, ptr 0x%x *ptr 0x%x\n",
+					facil_lim, facil_len, ptr - 1, ptr[-1]);
+				/* facil that we don't handle
+				return E_CO_HLI_REJI; */
+				/* guess the parameter size from the top two bits of the code */
+				switch (ptr[-1] & 0xc0) {
+				case 0x00:	facil_param_len = 1; break;
+				case 0x40:	facil_param_len = 2; break;
+				case 0x80:	facil_param_len = 3; break;
+				case 0xc0:	facil_param_len = 0; break;
+				}
+		}
+		if (facil_param_len == -1)	/* FACILtoNSAP() rejected the address */
+			return E_CO_REG_ICDA;
+		if (facil_param_len == 0) /* variable length */ 
+			facil_param_len = (int)*ptr++; /* 1 + the real facil param */
+		ptr += facil_param_len;
+	}
+	return 0;
+}
+
+#endif /* TPCONS */
diff --git a/sys/netiso/if_eon.c b/sys/netiso/if_eon.c
new file mode 100644
index
00000000000..3c05133040a --- /dev/null +++ b/sys/netiso/if_eon.c @@ -0,0 +1,609 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)if_eon.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+ +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * $Header: if_eon.c,v 1.4 88/07/19 15:53:59 hagens Exp $ + * $Source: /usr/argo/sys/netiso/RCS/if_eon.c,v $ + * + * EON rfc + * Layer between IP and CLNL + * + * TODO: + * Put together a current rfc986 address format and get the right offset + * for the nsel + */ + +#ifdef EON +#define NEON 1 + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +extern struct timeval time; +extern struct ifnet loif; + +#define EOK 0 + +int eoninput(); +int eonoutput(); +int eonioctl(); +int eonattach(); +int eoninit(); +void eonrtrequest(); +struct ifnet eonif[1]; + +eonprotoinit() { + (void) eonattach(); +} + +struct eon_llinfo eon_llinfo; +#define PROBE_OK 0; + + +/* + * FUNCTION: eonattach + * + * PURPOSE: autoconf attach routine + * + * RETURNS: void + */ + +eonattach() +{ + register struct ifnet *ifp = eonif; + + IFDEBUG(D_EON) + printf("eonattach()\n"); + ENDDEBUG + ifp->if_unit = 0; + ifp->if_name = "eon"; + ifp->if_mtu = ETHERMTU; + /* since everything will go out over ether or token ring */ + + ifp->if_init = eoninit; + ifp->if_ioctl = eonioctl; + ifp->if_output = eonoutput; + ifp->if_type = IFT_EON; + ifp->if_addrlen = 5; + ifp->if_hdrlen = EONIPLEN; + ifp->if_flags = IFF_BROADCAST; + if_attach(ifp); + eonioctl(ifp, SIOCSIFADDR, (caddr_t)ifp->if_addrlist); + eon_llinfo.el_qhdr.link = + eon_llinfo.el_qhdr.rlink = &(eon_llinfo.el_qhdr); + + IFDEBUG(D_EON) + printf("eonattach()\n"); + ENDDEBUG +} + + +/* + * FUNCTION: eonioctl + * + * PURPOSE: io controls - ifconfig + * need commands to + * link-UP (core addr) (flags: ES, IS) + * link-DOWN (core addr) (flags: ES, IS) + * must be callable 
from kernel or user + * + * RETURNS: nothing + */ +eonioctl(ifp, cmd, data) + register struct ifnet *ifp; + int cmd; + register caddr_t data; +{ + int s = splimp(); + register int error = 0; + + IFDEBUG(D_EON) + printf("eonioctl (cmd 0x%x) \n", cmd); + ENDDEBUG + + switch (cmd) { + register struct ifaddr *ifa; + + case SIOCSIFADDR: + if (ifa = (struct ifaddr *)data) { + ifp->if_flags |= IFF_UP; + if (ifa->ifa_addr->sa_family != AF_LINK) + ifa->ifa_rtrequest = eonrtrequest; + } + break; + } + splx(s); + return(error); +} + + +eoniphdr(hdr, loc, ro, class, zero) +struct route *ro; +register struct eon_iphdr *hdr; +caddr_t loc; +{ + struct mbuf mhead; + register struct sockaddr_in *sin = (struct sockaddr_in *)&ro->ro_dst; + if (zero) { + bzero((caddr_t)hdr, sizeof (*hdr)); + bzero((caddr_t)ro, sizeof (*ro)); + } + sin->sin_family = AF_INET; + sin->sin_len = sizeof (*sin); + bcopy(loc, (caddr_t)&sin->sin_addr, sizeof(struct in_addr)); + /* + * If there is a cached route, + * check that it is to the same destination + * and is still up. If not, free it and try again. 
+ */ + if (ro->ro_rt) { + struct sockaddr_in *dst = + (struct sockaddr_in *)rt_key(ro->ro_rt); + if ((ro->ro_rt->rt_flags & RTF_UP) == 0 || + sin->sin_addr.s_addr != dst->sin_addr.s_addr) { + RTFREE(ro->ro_rt); + ro->ro_rt = (struct rtentry *)0; + } + } + rtalloc(ro); + if (ro->ro_rt) + ro->ro_rt->rt_use++; + hdr->ei_ip.ip_dst = sin->sin_addr; + hdr->ei_ip.ip_p = IPPROTO_EON; + hdr->ei_ip.ip_ttl = MAXTTL; + hdr->ei_eh.eonh_class = class; + hdr->ei_eh.eonh_vers = EON_VERSION; + hdr->ei_eh.eonh_csum = 0; + mhead.m_data = (caddr_t) &hdr->ei_eh; + mhead.m_len = sizeof(struct eon_hdr); + mhead.m_next = 0; + IFDEBUG(D_EON) + printf("eonoutput : gen csum (0x%x, offset %d, datalen %d)\n", + &mhead, + _offsetof(struct eon_hdr, eonh_csum), sizeof(struct eon_hdr)); + ENDDEBUG + iso_gen_csum(&mhead, + _offsetof(struct eon_hdr, eonh_csum), sizeof(struct eon_hdr)); +} +/* + * FUNCTION: eonrtrequest + * + * PURPOSE: maintains list of direct eon recipients. + * sets up IP route for rest. + * + * RETURNS: nothing + */ +void +eonrtrequest(cmd, rt, gate) +register struct rtentry *rt; +register struct sockaddr *gate; +{ + unsigned long zerodst = 0; + caddr_t ipaddrloc = (caddr_t) &zerodst; + register struct eon_llinfo *el = (struct eon_llinfo *)rt->rt_llinfo; + + /* + * Common Housekeeping + */ + switch (cmd) { + case RTM_DELETE: + if (el) { + remque(&(el->el_qhdr)); + if (el->el_iproute.ro_rt) + RTFREE(el->el_iproute.ro_rt); + Free(el); + rt->rt_llinfo = 0; + } + return; + + case RTM_ADD: + case RTM_RESOLVE: + rt->rt_rmx.rmx_mtu = loif.if_mtu; /* unless better below */ + R_Malloc(el, struct eon_llinfo *, sizeof(*el)); + rt->rt_llinfo = (caddr_t)el; + if (el == 0) + return; + Bzero(el, sizeof(*el)); + insque(&(el->el_qhdr), &eon_llinfo.el_qhdr); + el->el_rt = rt; + break; + } + if (gate || (gate = rt->rt_gateway)) switch (gate->sa_family) { + case AF_LINK: +#define SDL(x) ((struct sockaddr_dl *)x) + if (SDL(gate)->sdl_alen == 1) + el->el_snpaoffset = *(u_char *)LLADDR(SDL(gate)); + 
else
+			ipaddrloc = LLADDR(SDL(gate));
+		break;
+	case AF_INET:
+#define SIN(x) ((struct sockaddr_in *)x)
+		ipaddrloc = (caddr_t) &SIN(gate)->sin_addr;
+		break;
+	default:
+		return;
+	}
+	el->el_flags |= RTF_UP;
+	eoniphdr(&el->el_ei, ipaddrloc, &el->el_iproute, EON_NORMAL_ADDR, 0);
+	if (el->el_iproute.ro_rt)
+		rt->rt_rmx.rmx_mtu = el->el_iproute.ro_rt->rt_rmx.rmx_mtu
+							- sizeof(el->el_ei);
+}
+
+/*
+ * FUNCTION:		eoninit
+ *
+ * PURPOSE:			initialization
+ *
+ * RETURNS:			nothing
+ */
+
+eoninit(unit)
+	int unit;
+{
+	printf("eon driver-init eon%d\n", unit);
+}
+
+
+/*
+ * FUNCTION:		eonoutput
+ *
+ * PURPOSE:			prepend an eon header and hand to IP
+ * ARGUMENTS:		(ifp) points to the ifnet structure for this unit/device
+ *					(m)  is an mbuf *, *m is a CLNL packet
+ *					(dst) is a destination address - have to interp. as
+ *					multicast or broadcast or real address.
+ *					(rt) is the route for dst; when it is null or carries
+ *					no eon_llinfo the header is built from dst itself.
+ *
+ * RETURNS:			unix error code: 0, EINVAL (unusable address or packet
+ *					without M_PKTHDR), EHOSTUNREACH (llinfo still not up),
+ *					ENOBUFS (no mbuf for the header) or whatever
+ *					ip_output() returns.
+ *
+ * NOTES:			(m) is always consumed: it is either passed on to
+ *					ip_output() or freed at the flush label.
+ *
+ */
+eonoutput(ifp, m, dst, rt)
+	struct ifnet 	*ifp;
+	register struct mbuf	*m;		/* packet */
+	struct sockaddr_iso		*dst;		/* destination addr */
+	struct rtentry *rt;
+{
+	register struct eon_llinfo *el;
+	register struct eon_iphdr *ei;
+	struct route *ro;
+	int	datalen;
+	struct mbuf *mh;
+	int	error = 0, class = 0, alen = 0;
+	caddr_t ipaddrloc;
+	/* scratch header and route for destinations that have no llinfo */
+	static struct eon_iphdr eon_iphdr;
+	static struct route route;
+
+	IFDEBUG(D_EON)
+		printf("eonoutput \n" );
+	ENDDEBUG
+
+	ifp->if_lastchange = time;
+	ifp->if_opackets++;
+	if (rt == 0 || (el = (struct eon_llinfo *)rt->rt_llinfo) == 0) {
+		if (dst->siso_family == AF_LINK) {
+			register struct sockaddr_dl *sdl = (struct sockaddr_dl *)dst;
+
+			ipaddrloc = LLADDR(sdl);
+			alen = sdl->sdl_alen;
+		} else if (dst->siso_family == AF_ISO && dst->siso_data[0] == AFI_SNA) {
+			alen = dst->siso_nlen - 1;
+			ipaddrloc = (caddr_t) dst->siso_data + 1;
+		}
+		switch (alen) {
+		case 5:
+			/* fifth octet carries the eon class */
+			class =  4[(u_char *)ipaddrloc];
+			/* FALLTHROUGH */
+		case 4:
+			ro = &route;
+			ei = &eon_iphdr;
+			eoniphdr(ei, ipaddrloc, ro, class, 1);
+			goto send;
+		}
+einval:
+		error = EINVAL;
+		goto flush;
+	}
+	if ((el->el_flags & RTF_UP) == 0) {
+		eonrtrequest(RTM_CHANGE, rt, (struct sockaddr *)0);
+		if ((el->el_flags & RTF_UP) == 0) {
+			error = EHOSTUNREACH;
+			goto flush;
+		}
+	}
+	if ((m->m_flags & M_PKTHDR) == 0) {
+		printf("eon: got non headered packet\n");
+		goto einval;
+	}
+	ei = &el->el_ei;
+	ro = &el->el_iproute;
+	if (el->el_snpaoffset) {
+		if (dst->siso_family == AF_ISO) {
+			bcopy((caddr_t) &dst->siso_data[el->el_snpaoffset],
+					(caddr_t) &ei->ei_ip.ip_dst, sizeof(ei->ei_ip.ip_dst));
+		} else
+			goto einval;
+	}
+send:
+	/* put an eon_hdr in the buffer, prepended by an ip header */
+	datalen = m->m_pkthdr.len + EONIPLEN;
+	MGETHDR(mh, M_DONTWAIT, MT_HEADER);
+	if (mh == (struct mbuf *)0) {
+		/* the packet is dropped at flush; do not report success */
+		error = ENOBUFS;
+		goto flush;
+	}
+	mh->m_next = m;
+	m = mh;
+	MH_ALIGN(m, sizeof(struct eon_iphdr));
+	m->m_len = sizeof(struct eon_iphdr);
+	ifp->if_obytes +=
+		(ei->ei_ip.ip_len = (u_short)(m->m_pkthdr.len = datalen));
+	*mtod(m, struct eon_iphdr *) = *ei;
+
+	IFDEBUG(D_EON)
+		printf("eonoutput dst ip addr : %x\n",  ei->ei_ip.ip_dst.s_addr);
+		printf("eonoutput ip_output : eonip header:\n");
+		dump_buf(ei, sizeof(struct eon_iphdr));
+	ENDDEBUG
+
+	error = ip_output(m, (struct mbuf *)0, ro, 0, NULL);
+	m = 0;	/* handed to ip_output; nothing left to free at flush */
+	if (error) {
+		ifp->if_oerrors++;
+		ifp->if_opackets--;
+		ifp->if_obytes -= datalen;
+	}
+flush:
+	if (m)
+		m_freem(m);
+	return error;
+}
+
+eoninput(m, iphlen)
+	register struct mbuf	*m;
+	int iphlen;
+{
+	register struct eon_hdr	*eonhdr;
+	register struct ip		*iphdr;
+	struct ifnet 			*eonifp;
+	int						s;
+
+	eonifp = &eonif[0]; /* kludge - really want to give CLNP
+						* the ifp for eon, not for the real device
+						*/
+
+	IFDEBUG(D_EON)
+		printf("eoninput() 0x%x m_data 0x%x m_len 0x%x dequeued\n",
+			m, m?m->m_data:0, m?m->m_len:0);
+	ENDDEBUG
+
+	if (m == 0)
+		return;
+	if (iphlen > sizeof (struct ip))
+		ip_stripoptions(m, (struct mbuf *)0);
+	if (m->m_len < EONIPLEN) {
+		if ((m = m_pullup(m, EONIPLEN)) == 0) {
+			IncStat(es_badhdr);
+drop:
+			IFDEBUG(D_EON)
+
printf("eoninput: DROP \n" );
+			ENDDEBUG
+			eonifp->if_ierrors ++;
+			m_freem(m);
+			return;
+		}
+	}
+	eonif->if_ibytes += m->m_pkthdr.len;
+	eonif->if_lastchange = time;
+	iphdr = mtod(m, struct ip *);
+	/* do a few checks for debugging */
+	if( iphdr->ip_p != IPPROTO_EON ) {
+		IncStat(es_badhdr);
+		goto drop;
+	}
+	/* temporarily drop ip header from the mbuf */
+	m->m_data += sizeof(struct ip);
+	eonhdr = mtod(m, struct eon_hdr *);
+	if( iso_check_csum( m, sizeof(struct eon_hdr) )   != EOK ) {
+		IncStat(es_badcsum);
+		goto drop;
+	}
+	m->m_data -= sizeof(struct ip);
+
+	IFDEBUG(D_EON)
+		printf("eoninput csum ok class 0x%x\n", eonhdr->eonh_class );
+		printf("eoninput: eon header:\n");
+		dump_buf(eonhdr, sizeof(struct eon_hdr));
+	ENDDEBUG
+
+	/* checks for debugging */
+	if( eonhdr->eonh_vers != EON_VERSION) {
+		IncStat(es_badhdr);
+		goto drop;
+	}
+	m->m_flags &= ~(M_BCAST|M_MCAST);
+	switch( eonhdr->eonh_class) {
+		case EON_BROADCAST:
+			IncStat(es_in_broad);
+			m->m_flags |= M_BCAST;
+			break;
+		case EON_NORMAL_ADDR:
+			IncStat(es_in_normal);
+			break;
+		case EON_MULTICAST_ES:
+			IncStat(es_in_multi_es);
+			m->m_flags |= M_MCAST;
+			break;
+		case EON_MULTICAST_IS:
+			IncStat(es_in_multi_is);
+			m->m_flags |= M_MCAST;
+			break;
+	}
+	eonifp->if_ipackets++;
+
+	{
+		/* put it on the CLNP queue and set soft interrupt */
+		struct ifqueue 			*ifq;
+		extern struct ifqueue 	clnlintrq;
+
+		m->m_pkthdr.rcvif = eonifp; /* KLUDGE */
+		IFDEBUG(D_EON)
+			printf("eoninput to clnl IFQ\n");
+		ENDDEBUG
+		ifq = &clnlintrq;
+		s = splimp();
+		if (IF_QFULL(ifq)) {
+			IF_DROP(ifq);
+			m_freem(m);
+			eonifp->if_iqdrops++;
+			eonifp->if_ipackets--;
+			splx(s);
+			return;
+		}
+		IF_ENQUEUE(ifq, m);
+		IFDEBUG(D_EON)
+			printf(
+	"0x%x enqueued on clnp Q: m_len 0x%x m_type 0x%x m_data 0x%x\n",
+				m, m->m_len, m->m_type, m->m_data);
+			dump_buf(mtod(m, caddr_t), m->m_len);
+		ENDDEBUG
+		schednetisr(NETISR_ISO);
+		splx(s);
+	}
+}
+
+/*
+ * FUNCTION:		eonctlinput
+ *
+ * PURPOSE:			count a control-input (PRC_*) notification in
+ *					es_icmp[]; no other action is taken yet (see the
+ *					TODOs below).
+ * ARGUMENTS:		(cmd) is the PRC_* command, (sin) the address it
+ *					refers to (only printed when debugging).
+ *
+ * RETURNS:			always 0; commands outside 0..PRC_NCMDS-1 are ignored.
+ */
+int
+eonctlinput(cmd, sin)
+	int cmd;
+	struct sockaddr_in *sin;
+{
+	extern u_char inetctlerrmap[];
+
+	IFDEBUG(D_EON)
+		printf("eonctlinput: cmd 0x%x addr: ", cmd);
+		/* NOTE(review): sin is a sockaddr_in handed to an iso dump routine */
+		dump_isoaddr(sin);
+		printf("\n");
+	ENDDEBUG
+
+	/*
+	 * cmd indexes es_icmp[] below, so PRC_NCMDS itself must be rejected
+	 * as well (assumes es_icmp has PRC_NCMDS entries -- confirm in eon.h).
+	 */
+	if (cmd < 0 || cmd >= PRC_NCMDS)
+		return 0;
+
+	IncStat(es_icmp[cmd]);
+	switch (cmd) {
+
+		case	PRC_QUENCH:
+		case	PRC_QUENCH2:
+			/* TODO: set the dec bit */
+			break;
+		case	PRC_TIMXCEED_REASS:
+		case	PRC_ROUTEDEAD:
+		case	PRC_HOSTUNREACH:
+		case	PRC_UNREACH_NET:
+		case	PRC_IFDOWN:
+		case	PRC_UNREACH_HOST:
+		case	PRC_HOSTDEAD:
+		case	PRC_TIMXCEED_INTRANS:
+			/* TODO: mark the link down */
+			break;
+
+		case	PRC_UNREACH_PROTOCOL:
+		case	PRC_UNREACH_PORT:
+		case	PRC_UNREACH_SRCFAIL:
+		case	PRC_REDIRECT_NET:
+		case	PRC_REDIRECT_HOST:
+		case	PRC_REDIRECT_TOSNET:
+		case	PRC_REDIRECT_TOSHOST:
+		case	PRC_MSGSIZE:
+		case	PRC_PARAMPROB:
+			/* printf("eonctlinput: ICMP cmd 0x%x\n", cmd );*/
+		break;
+	}
+	return 0;
+}
+
+#endif
diff --git a/sys/netiso/iso.c b/sys/netiso/iso.c
new file mode 100644
index 00000000000..cd64e687152
--- /dev/null
+++ b/sys/netiso/iso.c
@@ -0,0 +1,919 @@
+/*-
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4.
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)iso.c 8.2 (Berkeley) 11/15/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. 
+ +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * $Header: iso.c,v 4.11 88/09/19 14:58:35 root Exp $ + * $Source: /usr/argo/sys/netiso/RCS/iso.c,v $ + * + * iso.c: miscellaneous routines to support the iso address family + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#ifdef TUBA +#include +#endif + +#ifdef ISO + +int iso_interfaces = 0; /* number of external interfaces */ +extern struct ifnet loif; /* loopback interface */ +int ether_output(); +void llc_rtrequest(); + +/* + * FUNCTION: iso_addrmatch1 + * + * PURPOSE: decide if the two iso_addrs passed are equal + * + * RETURNS: true if the addrs match, false if they do not + * + * SIDE EFFECTS: + * + * NOTES: + */ +iso_addrmatch1(isoaa, isoab) +register struct iso_addr *isoaa, *isoab; /* addresses to check */ +{ + u_int compare_len; + + IFDEBUG(D_ROUTE) + printf("iso_addrmatch1: comparing lengths: %d to %d\n", isoaa->isoa_len, + isoab->isoa_len); + printf("a:\n"); + dump_buf(isoaa->isoa_genaddr, isoaa->isoa_len); + printf("b:\n"); + dump_buf(isoab->isoa_genaddr, isoab->isoa_len); + ENDDEBUG + + if ((compare_len = isoaa->isoa_len) != isoab->isoa_len) { + IFDEBUG(D_ROUTE) + printf("iso_addrmatch1: returning false because of lengths\n"); + ENDDEBUG + return 0; + } + +#ifdef notdef + /* TODO : generalize this to all afis with masks */ + if( 
isoaa->isoa_afi == AFI_37 ) { + /* must not compare 2 least significant digits, or for + * that matter, the DSP + */ + compare_len = ADDR37_IDI_LEN - 1; + } +#endif + + IFDEBUG(D_ROUTE) + int i; + char *a, *b; + + a = isoaa->isoa_genaddr; + b = isoab->isoa_genaddr; + + for (i=0; i", a[i]&0xff, b[i]&0xff); + if (a[i] != b[i]) { + printf("\naddrs are not equal at byte %d\n", i); + return(0); + } + } + printf("\n"); + printf("addrs are equal\n"); + return (1); + ENDDEBUG + return (!bcmp(isoaa->isoa_genaddr, isoab->isoa_genaddr, compare_len)); +} + +/* + * FUNCTION: iso_addrmatch + * + * PURPOSE: decide if the two sockadrr_isos passed are equal + * + * RETURNS: true if the addrs match, false if they do not + * + * SIDE EFFECTS: + * + * NOTES: + */ +iso_addrmatch(sisoa, sisob) +struct sockaddr_iso *sisoa, *sisob; /* addresses to check */ +{ + return(iso_addrmatch1(&sisoa->siso_addr, &sisob->siso_addr)); +} +#ifdef notdef +/* + * FUNCTION: iso_netmatch + * + * PURPOSE: similar to iso_addrmatch but takes sockaddr_iso + * as argument. + * + * RETURNS: true if same net, false if not + * + * SIDE EFFECTS: + * + * NOTES: + */ +iso_netmatch(sisoa, sisob) +struct sockaddr_iso *sisoa, *sisob; +{ + u_char bufa[sizeof(struct sockaddr_iso)]; + u_char bufb[sizeof(struct sockaddr_iso)]; + register int lena, lenb; + + lena = iso_netof(&sisoa->siso_addr, bufa); + lenb = iso_netof(&sisob->siso_addr, bufb); + + IFDEBUG(D_ROUTE) + printf("iso_netmatch: comparing lengths: %d to %d\n", lena, lenb); + printf("a:\n"); + dump_buf(bufa, lena); + printf("b:\n"); + dump_buf(bufb, lenb); + ENDDEBUG + + return ((lena == lenb) && (!bcmp(bufa, bufb, lena))); +} +#endif /* notdef */ + +/* + * FUNCTION: iso_hashchar + * + * PURPOSE: Hash all character in the buffer specified into + * a long. Return the long. + * + * RETURNS: The hash value. + * + * SIDE EFFECTS: + * + * NOTES: The hash is achieved by exclusive ORing 4 byte + * quantities. 
+ */ +u_long +iso_hashchar(buf, len) +register caddr_t buf; /* buffer to pack from */ +register int len; /* length of buffer */ +{ + register u_long h = 0; + register int i; + + for (i=0; isiso_addr, buf); + hp->afh_nethash = iso_hashchar((caddr_t)buf, bufsize); + + IFDEBUG(D_ROUTE) + printf("iso_hash: iso_netof: bufsize = %d\n", bufsize); + ENDDEBUG + + hp->afh_hosthash = iso_hashchar((caddr_t)&siso->siso_addr, + siso->siso_addr.isoa_len); + + IFDEBUG(D_ROUTE) + printf("iso_hash: %s: nethash = x%x, hosthash = x%x\n", + clnp_iso_addrp(&siso->siso_addr), hp->afh_nethash, + hp->afh_hosthash); + ENDDEBUG +} +/* + * FUNCTION: iso_netof + * + * PURPOSE: Extract the network portion of the iso address. + * The network portion of the iso address varies depending + * on the type of address. The network portion of the + * address will include the IDP. The network portion is: + * + * TYPE DESC + * t37 The AFI and x.121 (IDI) + * osinet The AFI, orgid, snetid + * rfc986 The AFI, vers and network part of + * internet address. + * + * RETURNS: number of bytes placed into buf. + * + * SIDE EFFECTS: + * + * NOTES: Buf is assumed to be big enough + */ +iso_netof(isoa, buf) +struct iso_addr *isoa; /* address */ +caddr_t buf; /* RESULT: network portion of address here */ +{ + u_int len = 1; /* length of afi */ + + switch (isoa->isoa_afi) { + case AFI_37: + /* + * Due to classic x.25 tunnel vision, there is no + * net portion of an x.121 address. For our purposes + * the AFI will do, so that all x.25 -type addresses + * map to the single x.25 SNPA. (Cannot have more than + * one, obviously). + */ + + break; + +/* case AFI_OSINET:*/ + case AFI_RFC986: { + u_short idi; /* value of idi */ + + /* osinet and rfc986 have idi in the same place */ + CTOH(isoa->rfc986_idi[0], isoa->rfc986_idi[1], idi); + + if (idi == IDI_OSINET) +/* + * Network portion of OSINET address can only be the IDI. Clearly, + * with one x25 interface, one could get to several orgids, and + * several snetids. 
+ len += (ADDROSINET_IDI_LEN + OVLOSINET_ORGID_LEN + + OVLOSINET_SNETID_LEN); + */ + len += ADDROSINET_IDI_LEN; + else if (idi == IDI_RFC986) { + u_long inetaddr; + struct ovl_rfc986 *o986 = (struct ovl_rfc986 *)isoa; + + /* bump len to include idi and version (1 byte) */ + len += ADDRRFC986_IDI_LEN + 1; + + /* get inet addr long aligned */ + bcopy(o986->o986_inetaddr, &inetaddr, sizeof(inetaddr)); + inetaddr = ntohl(inetaddr); /* convert to host byte order */ + + IFDEBUG(D_ROUTE) + printf("iso_netof: isoa "); + dump_buf(isoa, sizeof(*isoa)); + printf("iso_netof: inetaddr 0x%x ", inetaddr); + ENDDEBUG + + /* bump len by size of network portion of inet address */ + if (IN_CLASSA(inetaddr)) { + len += 4-IN_CLASSA_NSHIFT/8; + IFDEBUG(D_ROUTE) + printf("iso_netof: class A net len is now %d\n", len); + ENDDEBUG + } else if (IN_CLASSB(inetaddr)) { + len += 4-IN_CLASSB_NSHIFT/8; + IFDEBUG(D_ROUTE) + printf("iso_netof: class B net len is now %d\n", len); + ENDDEBUG + } else { + len += 4-IN_CLASSC_NSHIFT/8; + IFDEBUG(D_ROUTE) + printf("iso_netof: class C net len is now %d\n", len); + ENDDEBUG + } + } else + len = 0; + } break; + + default: + len = 0; + } + + bcopy((caddr_t)isoa, buf, len); + IFDEBUG(D_ROUTE) + printf("iso_netof: isoa "); + dump_buf(isoa, len); + printf("iso_netof: net "); + dump_buf(buf, len); + ENDDEBUG + return len; +} +#endif /* notdef */ +/* + * Generic iso control operations (ioctl's). + * Ifp is 0 if not an interface-specific ioctl. + */ +/* ARGSUSED */ +iso_control(so, cmd, data, ifp) + struct socket *so; + int cmd; + caddr_t data; + register struct ifnet *ifp; +{ + register struct iso_ifreq *ifr = (struct iso_ifreq *)data; + register struct iso_ifaddr *ia = 0; + register struct ifaddr *ifa; + struct iso_ifaddr *oia; + struct iso_aliasreq *ifra = (struct iso_aliasreq *)data; + int error, hostIsNew, maskIsNew; + + /* + * Find address for this interface, if it exists. 
+ */ + if (ifp) + for (ia = iso_ifaddr; ia; ia = ia->ia_next) + if (ia->ia_ifp == ifp) + break; + + switch (cmd) { + + case SIOCAIFADDR_ISO: + case SIOCDIFADDR_ISO: + if (ifra->ifra_addr.siso_family == AF_ISO) + for (oia = ia; ia; ia = ia->ia_next) { + if (ia->ia_ifp == ifp && + SAME_ISOADDR(&ia->ia_addr, &ifra->ifra_addr)) + break; + } + if ((so->so_state & SS_PRIV) == 0) + return (EPERM); + if (ifp == 0) + panic("iso_control"); + if (ia == (struct iso_ifaddr *)0) { + struct iso_ifaddr *nia; + if (cmd == SIOCDIFADDR_ISO) + return (EADDRNOTAVAIL); +#ifdef TUBA + /* XXXXXX can't be done in the proto init routines */ + if (tuba_tree == 0) + tuba_table_init(); +#endif + MALLOC(nia, struct iso_ifaddr *, sizeof(*nia), + M_IFADDR, M_WAITOK); + if (nia == (struct iso_ifaddr *)0) + return (ENOBUFS); + bzero((caddr_t)nia, sizeof(*nia)); + if (ia = iso_ifaddr) { + for ( ; ia->ia_next; ia = ia->ia_next) + ; + ia->ia_next = nia; + } else + iso_ifaddr = nia; + ia = nia; + if (ifa = ifp->if_addrlist) { + for ( ; ifa->ifa_next; ifa = ifa->ifa_next) + ; + ifa->ifa_next = (struct ifaddr *) ia; + } else + ifp->if_addrlist = (struct ifaddr *) ia; + ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; + ia->ia_ifa.ifa_dstaddr + = (struct sockaddr *)&ia->ia_dstaddr; + ia->ia_ifa.ifa_netmask + = (struct sockaddr *)&ia->ia_sockmask; + ia->ia_ifp = ifp; + if (ifp != &loif) + iso_interfaces++; + } + break; + +#define cmdbyte(x) (((x) >> 8) & 0xff) + default: + if (cmdbyte(cmd) == 'a') + return (snpac_ioctl(so, cmd, data)); + if (ia == (struct iso_ifaddr *)0) + return (EADDRNOTAVAIL); + break; + } + switch (cmd) { + + case SIOCGIFADDR_ISO: + ifr->ifr_Addr = ia->ia_addr; + break; + + case SIOCGIFDSTADDR_ISO: + if ((ifp->if_flags & IFF_POINTOPOINT) == 0) + return (EINVAL); + ifr->ifr_Addr = ia->ia_dstaddr; + break; + + case SIOCGIFNETMASK_ISO: + ifr->ifr_Addr = ia->ia_sockmask; + break; + + case SIOCAIFADDR_ISO: + maskIsNew = 0; hostIsNew = 1; error = 0; + if (ia->ia_addr.siso_family == 
AF_ISO) { + if (ifra->ifra_addr.siso_len == 0) { + ifra->ifra_addr = ia->ia_addr; + hostIsNew = 0; + } else if (SAME_ISOADDR(&ia->ia_addr, &ifra->ifra_addr)) + hostIsNew = 0; + } + if (ifra->ifra_mask.siso_len) { + iso_ifscrub(ifp, ia); + ia->ia_sockmask = ifra->ifra_mask; + maskIsNew = 1; + } + if ((ifp->if_flags & IFF_POINTOPOINT) && + (ifra->ifra_dstaddr.siso_family == AF_ISO)) { + iso_ifscrub(ifp, ia); + ia->ia_dstaddr = ifra->ifra_dstaddr; + maskIsNew = 1; /* We lie; but the effect's the same */ + } + if (ifra->ifra_addr.siso_family == AF_ISO && + (hostIsNew || maskIsNew)) { + error = iso_ifinit(ifp, ia, &ifra->ifra_addr, 0); + } + if (ifra->ifra_snpaoffset) + ia->ia_snpaoffset = ifra->ifra_snpaoffset; + return (error); + + case SIOCDIFADDR_ISO: + iso_ifscrub(ifp, ia); + if ((ifa = ifp->if_addrlist) == (struct ifaddr *)ia) + ifp->if_addrlist = ifa->ifa_next; + else { + while (ifa->ifa_next && + (ifa->ifa_next != (struct ifaddr *)ia)) + ifa = ifa->ifa_next; + if (ifa->ifa_next) + ifa->ifa_next = ((struct ifaddr *)ia)->ifa_next; + else + printf("Couldn't unlink isoifaddr from ifp\n"); + } + oia = ia; + if (oia == (ia = iso_ifaddr)) { + iso_ifaddr = ia->ia_next; + } else { + while (ia->ia_next && (ia->ia_next != oia)) { + ia = ia->ia_next; + } + if (ia->ia_next) + ia->ia_next = oia->ia_next; + else + printf("Didn't unlink isoifadr from list\n"); + } + IFAFREE((&oia->ia_ifa)); + break; + + default: + if (ifp == 0 || ifp->if_ioctl == 0) + return (EOPNOTSUPP); + return ((*ifp->if_ioctl)(ifp, cmd, data)); + } + return (0); +} + +/* + * Delete any existing route for an interface. 
+ */
+iso_ifscrub(ifp, ia)
+	register struct ifnet *ifp;
+	register struct iso_ifaddr *ia;
+{
+	int nsellength = ia->ia_addr.siso_tlen;
+	int flags;
+
+	/* IFA_ROUTE clear: iso_ifinit installed no route, nothing to remove */
+	if ((ia->ia_flags & IFA_ROUTE) == 0)
+		return;
+	/* the route was added with a zero selector length; delete it the same way */
+	ia->ia_addr.siso_tlen = 0;
+	/* loopback and point-to-point interfaces carry host routes */
+	flags = (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) ? RTF_HOST : 0;
+	rtinit(&(ia->ia_ifa), (int)RTM_DELETE, flags);
+	ia->ia_addr.siso_tlen = nsellength;
+	ia->ia_flags &= ~IFA_ROUTE;
+}
+
+/*
+ * Initialize an interface's iso address
+ * and routing table entry.
+ *
+ * (ifp) is the interface, (ia) its address record, (siso) the new
+ * address; a non-zero (scrub) first removes the route installed for
+ * the previous address.
+ * Returns 0, or the error from the driver's SIOCSIFADDR ioctl (the old
+ * address is then put back) or from rtinit().  IFA_ROUTE is set only
+ * when rtinit() succeeded, so that iso_ifscrub never deletes a route
+ * this address did not install.
+ */
+iso_ifinit(ifp, ia, siso, scrub)
+	register struct ifnet *ifp;
+	register struct iso_ifaddr *ia;
+	struct sockaddr_iso *siso;
+	int scrub;
+{
+	struct sockaddr_iso oldaddr;
+	int s = splimp(), error, nsellength;
+
+	oldaddr = ia->ia_addr;
+	ia->ia_addr = *siso;
+	/*
+	 * Give the interface a chance to initialize
+	 * if this is its first address,
+	 * and to validate the address if necessary.
+	 */
+	if (ifp->if_ioctl &&
+				(error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia))) {
+		/* undo the assignment before the priority level is lowered */
+		ia->ia_addr = oldaddr;
+		splx(s);
+		return (error);
+	}
+	if (scrub) {
+		/* point ifa_addr at the old address while its route is removed */
+		ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr;
+		iso_ifscrub(ifp, ia);
+		ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr;
+	}
+	/* XXX -- The following is here temporarily out of laziness
+	   in not changing every ethernet driver's if_ioctl routine */
+	if (ifp->if_output == ether_output) {
+		ia->ia_ifa.ifa_rtrequest = llc_rtrequest;
+		ia->ia_ifa.ifa_flags |= RTF_CLONING;
+	}
+	/*
+	 * Add route for the network.
+	 */
+	nsellength = ia->ia_addr.siso_tlen;
+	ia->ia_addr.siso_tlen = 0;
+	if (ifp->if_flags & IFF_LOOPBACK) {
+		ia->ia_ifa.ifa_dstaddr = ia->ia_ifa.ifa_addr;
+		error = rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_HOST|RTF_UP);
+	} else if (ifp->if_flags & IFF_POINTOPOINT &&
+		 ia->ia_dstaddr.siso_family == AF_ISO)
+		error = rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_HOST|RTF_UP);
+	else {
+		rt_maskedcopy(ia->ia_ifa.ifa_addr, ia->ia_ifa.ifa_dstaddr,
+			ia->ia_ifa.ifa_netmask);
+		ia->ia_dstaddr.siso_nlen =
+			min(ia->ia_addr.siso_nlen, (ia->ia_sockmask.siso_len - 6));
+		error = rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_UP);
+	}
+	ia->ia_addr.siso_tlen = nsellength;
+	/* only claim a route when rtinit actually installed one */
+	if (error == 0)
+		ia->ia_flags |= IFA_ROUTE;
+	splx(s);
+	return (error);
+}
+#ifdef notdef
+
+struct ifaddr *
+iso_ifwithidi(addr)
+	register struct sockaddr *addr;
+{
+	register struct ifnet *ifp;
+	register struct ifaddr *ifa;
+	register u_int af = addr->sa_family;
+
+	if (af != AF_ISO)
+		return (0);
+	IFDEBUG(D_ROUTE)
+		printf(">>> iso_ifwithidi addr\n");
+		dump_isoaddr( (struct sockaddr_iso *)(addr));
+		printf("\n");
+	ENDDEBUG
+	for (ifp = ifnet; ifp; ifp = ifp->if_next) {
+		IFDEBUG(D_ROUTE)
+			printf("iso_ifwithidi ifnet %s\n", ifp->if_name);
+		ENDDEBUG
+		for (ifa = ifp->if_addrlist; ifa; ifa = ifa->ifa_next) {
+			IFDEBUG(D_ROUTE)
+				printf("iso_ifwithidi address ");
+				dump_isoaddr( (struct sockaddr_iso *)(ifa->ifa_addr));
+			ENDDEBUG
+			if (ifa->ifa_addr->sa_family != addr->sa_family)
+				continue;
+
+#define	IFA_SIS(ifa)\
+	((struct sockaddr_iso *)((ifa)->ifa_addr))
+
+			IFDEBUG(D_ROUTE)
+				printf(" af same, args to iso_eqtype:\n");
+				printf("0x%x ", IFA_SIS(ifa)->siso_addr);
+				printf(" 0x%x\n",
+				&(((struct sockaddr_iso *)addr)->siso_addr));
+			ENDDEBUG
+
+			if (iso_eqtype(&(IFA_SIS(ifa)->siso_addr),
+				&(((struct sockaddr_iso *)addr)->siso_addr))) {
+				IFDEBUG(D_ROUTE)
+					printf("ifa_ifwithidi: ifa found\n");
+				ENDDEBUG
+				return (ifa);
+			}
+			IFDEBUG(D_ROUTE)
+				printf(" iso_eqtype failed\n");
+			ENDDEBUG
+		}
+	}
+	return ((struct ifaddr
*)0); +} + +#endif /* notdef */ +/* + * FUNCTION: iso_ck_addr + * + * PURPOSE: return true if the iso_addr passed is + * within the legal size limit for an iso address. + * + * RETURNS: true or false + * + * SIDE EFFECTS: + * + */ +iso_ck_addr(isoa) +struct iso_addr *isoa; /* address to check */ +{ + return (isoa->isoa_len <= 20); + +} + +#ifdef notdef +/* + * FUNCTION: iso_eqtype + * + * PURPOSE: Determine if two iso addresses are of the same type. + * This is flaky. Really we should consider all type 47 addrs to be the + * same - but there do exist different structures for 47 addrs. + * Gosip adds a 3rd. + * + * RETURNS: true if the addresses are the same type + * + * SIDE EFFECTS: + * + * NOTES: By type, I mean rfc986, t37, or osinet + * + * This will first compare afis. If they match, then + * if the addr is not t37, the idis must be compared. + */ +iso_eqtype(isoaa, isoab) +struct iso_addr *isoaa; /* first addr to check */ +struct iso_addr *isoab; /* other addr to check */ +{ + if (isoaa->isoa_afi == isoab->isoa_afi) { + if (isoaa->isoa_afi == AFI_37) + return(1); + else + return (!bcmp(&isoaa->isoa_u, &isoab->isoa_u, 2)); + } + return(0); +} +#endif /* notdef */ +/* + * FUNCTION: iso_localifa() + * + * PURPOSE: Find an interface addresss having a given destination + * or at least matching the net. + * + * RETURNS: ptr to an interface address + * + * SIDE EFFECTS: + * + * NOTES: + */ +struct iso_ifaddr * +iso_localifa(siso) + register struct sockaddr_iso *siso; +{ + register struct iso_ifaddr *ia; + register char *cp1, *cp2, *cp3; + register struct ifnet *ifp; + struct iso_ifaddr *ia_maybe = 0; + /* + * We make one pass looking for both net matches and an exact + * dst addr. 
+ */ + for (ia = iso_ifaddr; ia; ia = ia->ia_next) { + if ((ifp = ia->ia_ifp) == 0 || ((ifp->if_flags & IFF_UP) == 0)) + continue; + if (ifp->if_flags & IFF_POINTOPOINT) { + if ((ia->ia_dstaddr.siso_family == AF_ISO) && + SAME_ISOADDR(&ia->ia_dstaddr, siso)) + return (ia); + else + if (SAME_ISOADDR(&ia->ia_addr, siso)) + ia_maybe = ia; + continue; + } + if (ia->ia_sockmask.siso_len) { + char *cplim = ia->ia_sockmask.siso_len + (char *)&ia->ia_sockmask; + cp1 = ia->ia_sockmask.siso_data; + cp2 = siso->siso_data; + cp3 = ia->ia_addr.siso_data; + while (cp1 < cplim) + if (*cp1++ & (*cp2++ ^ *cp3++)) + goto next; + ia_maybe = ia; + } + if (SAME_ISOADDR(&ia->ia_addr, siso)) + return ia; + next:; + } + return ia_maybe; +} + +#ifdef TPCONS +#include +#endif /* TPCONS */ +/* + * FUNCTION: iso_nlctloutput + * + * PURPOSE: Set options at the network level + * + * RETURNS: E* + * + * SIDE EFFECTS: + * + * NOTES: This could embody some of the functions of + * rclnp_ctloutput and cons_ctloutput. + */ +iso_nlctloutput(cmd, optname, pcb, m) +int cmd; /* command:set or get */ +int optname; /* option of interest */ +caddr_t pcb; /* nl pcb */ +struct mbuf *m; /* data for set, buffer for get */ +{ + struct isopcb *isop = (struct isopcb *)pcb; + int error = 0; /* return value */ + caddr_t data; /* data for option */ + int data_len; /* data's length */ + + IFDEBUG(D_ISO) + printf("iso_nlctloutput: cmd %x, opt %x, pcb %x, m %x\n", + cmd, optname, pcb, m); + ENDDEBUG + + if ((cmd != PRCO_GETOPT) && (cmd != PRCO_SETOPT)) + return(EOPNOTSUPP); + + data = mtod(m, caddr_t); + data_len = (m)->m_len; + + IFDEBUG(D_ISO) + printf("iso_nlctloutput: data is:\n"); + dump_buf(data, data_len); + ENDDEBUG + + switch (optname) { + +#ifdef TPCONS + case CONSOPT_X25CRUD: + if (cmd == PRCO_GETOPT) { + error = EOPNOTSUPP; + break; + } + + if (data_len > MAXX25CRUDLEN) { + error = EINVAL; + break; + } + + IFDEBUG(D_ISO) + printf("iso_nlctloutput: setting x25 crud\n"); + ENDDEBUG + + bcopy(data, 
(caddr_t)isop->isop_x25crud, (unsigned)data_len); + isop->isop_x25crud_len = data_len; + break; +#endif /* TPCONS */ + + default: + error = EOPNOTSUPP; + } + if (cmd == PRCO_SETOPT) + m_freem(m); + return error; +} +#endif /* ISO */ + +#ifdef ARGO_DEBUG + +/* + * FUNCTION: dump_isoaddr + * + * PURPOSE: debugging + * + * RETURNS: nada + * + */ +dump_isoaddr(s) + struct sockaddr_iso *s; +{ + char *clnp_saddr_isop(); + register int i; + + if( s->siso_family == AF_ISO) { + printf("ISO address: suffixlen %d, %s\n", + s->siso_tlen, clnp_saddr_isop(s)); + } else if( s->siso_family == AF_INET) { + /* hack */ + struct sockaddr_in *sin = (struct sockaddr_in *)s; + + printf("%d.%d.%d.%d: %d", + (sin->sin_addr.s_addr>>24)&0xff, + (sin->sin_addr.s_addr>>16)&0xff, + (sin->sin_addr.s_addr>>8)&0xff, + (sin->sin_addr.s_addr)&0xff, + sin->sin_port); + } +} + +#endif /* ARGO_DEBUG */ diff --git a/sys/netiso/iso.h b/sys/netiso/iso.h new file mode 100644 index 00000000000..9237e6aaa73 --- /dev/null +++ b/sys/netiso/iso.h @@ -0,0 +1,195 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)iso.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. 
+ +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* $Header: iso.h,v 4.9 88/09/11 18:06:38 hagens Exp $ */ +/* $Source: /usr/argo/sys/netiso/RCS/iso.h,v $ */ + +#ifndef __ISO__ +#define __ISO__ + +/* + * Return true if this is a multicast address + * This assumes that the bit transmission is lsb first. This + * assumption is valid for 802.3 but not 802.5. There is a + * kludge to get around this for 802.5 -- see if_lan.c + * where subnetwork header is setup. + */ +#define IS_MULTICAST(snpa)\ + ((snpa)[0] & 0x01) + +/* + * Protocols + */ +#define ISOPROTO_TCP 6 /* IETF experiment */ +#define ISOPROTO_UDP 17 /* IETF experiment */ +#define ISOPROTO_TP0 25 /* connection oriented transport protocol */ +#define ISOPROTO_TP1 26 /* not implemented */ +#define ISOPROTO_TP2 27 /* not implemented */ +#define ISOPROTO_TP3 28 /* not implemented */ +#define ISOPROTO_TP4 29 /* connection oriented transport protocol */ +#define ISOPROTO_TP ISOPROTO_TP4 /* tp-4 with negotiation */ +#define ISOPROTO_CLTP 30 /* connectionless transport (not yet impl.) */ +#define ISOPROTO_CLNP 31 /* connectionless internetworking protocol */ +#define ISOPROTO_X25 32 /* cons */ +#define ISOPROTO_INACT_NL 33 /* inactive network layer! 
*/ +#define ISOPROTO_ESIS 34 /* ES-IS protocol */ +#define ISOPROTO_INTRAISIS 35 /* IS-IS protocol */ +#define ISOPROTO_IDRP 36 /* Interdomain Routing Protocol */ + +#define ISOPROTO_RAW 255 /* raw clnp */ +#define ISOPROTO_MAX 256 + +#define ISO_PORT_RESERVED 1024 +#define ISO_PORT_USERRESERVED 5000 +/* + * Port/socket numbers: standard network functions + * NOT PRESENTLY USED + */ +#define ISO_PORT_MAINT 501 +#define ISO_PORT_ECHO 507 +#define ISO_PORT_DISCARD 509 +#define ISO_PORT_SYSTAT 511 +#define ISO_PORT_NETSTAT 515 +/* + * Port/socket numbers: non-standard application functions + */ +#define ISO_PORT_LOGIN 513 +/* + * Port/socket numbers: public use + */ +#define ISO_PORT_PUBLIC 1024 /* high bit set --> public */ + +/* + * Network layer protocol identifiers + */ +#define ISO8473_CLNP 0x81 +#define ISO9542_ESIS 0x82 +#define ISO9542X25_ESIS 0x8a +#define ISO10589_ISIS 0x83 +#define ISO8878A_CONS 0x84 +#define ISO10747_IDRP 0x85 + + +#ifndef IN_CLASSA_NET +#include +#endif /* IN_CLASSA_NET */ + + + +/* The following looks like a sockaddr + * to facilitate using tree lookup routines */ +struct iso_addr { + u_char isoa_len; /* length (in bytes) */ + char isoa_genaddr[20]; /* general opaque address */ +}; + +struct sockaddr_iso { + u_char siso_len; /* length */ + u_char siso_family; /* family */ + u_char siso_plen; /* presentation selector length */ + u_char siso_slen; /* session selector length */ + u_char siso_tlen; /* transport selector length */ + struct iso_addr siso_addr; /* network address */ + u_char siso_pad[6]; /* space for gosip v2 sels */ + /* makes struct 32 bytes long */ +}; +#define siso_nlen siso_addr.isoa_len +#define siso_data siso_addr.isoa_genaddr + +#define TSEL(s) ((caddr_t)((s)->siso_data + (s)->siso_nlen)) + +#define SAME_ISOADDR(a, b) \ + (bcmp((a)->siso_data, (b)->siso_data, (unsigned)(a)->siso_nlen)==0) +/* + * The following are specific values for siso->siso_data[0], + * otherwise known as the AFI: + */ +#define AFI_37 0x37 /* bcd of 
"37" */ +#define AFI_OSINET 0x47 /* bcd of "47" */ +#define AFI_RFC986 0x47 /* bcd of "47" */ +#define AFI_SNA 0x00 /* SubNetwork Address; invalid really...*/ + +#ifdef KERNEL + +extern int iso_netmatch(); +extern int iso_hash(); +extern int iso_addrmatch(); +extern struct iso_ifaddr *iso_iaonnetof(); +extern struct domain isodomain; +extern struct protosw isosw[]; + +#else +/* user utilities definitions from the iso library */ + +#include + +__BEGIN_DECLS +struct iso_addr *iso_addr __P((const char *)); +char *iso_ntoa __P((const struct iso_addr *)); + +/* THESE DON'T EXIST YET */ +struct hostent *iso_gethostbyname(), *iso_gethostbyaddr(); +__END_DECLS + +#endif /* KERNEL */ + +#define _offsetof(t, m) ((int)((caddr_t)&((t *)0)->m)) +#endif /* __ISO__ */ diff --git a/sys/netiso/iso_chksum.c b/sys/netiso/iso_chksum.c new file mode 100644 index 00000000000..5b1aae59e16 --- /dev/null +++ b/sys/netiso/iso_chksum.c @@ -0,0 +1,360 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)iso_chksum.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+ +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * $Header: iso_chksum.c,v 4.7 88/07/29 15:31:26 nhall Exp $ + * $Source: /usr/argo/sys/netiso/RCS/iso_chksum.c,v $ + * + * ISO CHECKSUM + * + * The checksum generation and check routines are here. + * The checksum is 2 bytes such that the sum of all the bytes b(i) == 0 + * and the sum of i * b(i) == 0. + * The whole thing is complicated by the fact that the data are in mbuf + * chains. + * Furthermore, there is the possibility of wraparound in the running + * sums after adding up 4102 octets. In order to avoid doing a mod + * operation after EACH add, we have restricted this implementation to + * negotiating a maximum of 4096-octets per TPDU (for the transport layer). + * The routine iso_check_csum doesn't need to know where the checksum + * octets are. + * The routine iso_gen_csum takes a pointer to an mbuf chain (logically + * a chunk of data), an offset into the chunk at which the 2 octets are to + * be stuffed, and the length of the chunk. The 2 octets have to be + * logically adjacent, but may be physically located in separate mbufs. + */ + +#ifdef ISO +#include +#include +#include +#include +#endif /* ISO */ + +#ifndef MNULL +#define MNULL (struct mbuf *)0 +#endif /* MNULL */ + +/* + * FUNCTION: iso_check_csum + * + * PURPOSE: To check the checksum of the packet in the mbuf chain (m). + * The total length of the packet is (len). + * Called from tp_input() and clnp_intr() + * + * RETURNS: TRUE (something non-zero) if there is a checksum error, + * FALSE if there was NO checksum error. + * + * SIDE EFFECTS: none + * + * NOTES: It might be possible to gain something by optimizing + * this routine (unrolling loops, etc). But it is such + * a horrible thing to fiddle with anyway, it probably + * isn't worth it. 
+ */ +int +iso_check_csum(m, len) + struct mbuf *m; + int len; +{ + register u_char *p = mtod(m, u_char *); + register u_long c0=0, c1=0; + register int i=0; + int cum = 0; /* cumulative length */ + int l; + + l = len; + len = min(m->m_len, len); + i = 0; + + IFDEBUG(D_CHKSUM) + printf("iso_check_csum: m x%x, l x%x, m->m_len x%x\n", m, l, m->m_len); + ENDDEBUG + + while( im_next; + IFDEBUG(D_CHKSUM) + printf("iso_check_csum: new mbuf\n"); + if(l-i < m->m_len) + printf( + "bad mbuf chain in check csum l 0x%x i 0x%x m_data 0x%x", + l,i,m->m_data); + ENDDEBUG + ASSERT( m != MNULL); + len = min( m->m_len, l-i); + p = mtod(m, u_char *); + } + } + if ( ((int)c0 % 255) || ((int)c1 % 255) ) { + IFDEBUG(D_CHKSUM) + printf("BAD iso_check_csum l 0x%x cum 0x%x len 0x%x, i 0x%x", + l, cum, len, i); + ENDDEBUG + return ((int)c0 % 255)<<8 | ((int)c1 % 255); + } + return 0; +} + +/* + * FUNCTION: iso_gen_csum + * + * PURPOSE: To generate the checksum of the packet in the mbuf chain (m). + * The first of the 2 (logically) adjacent checksum bytes + * (x and y) go at offset (n). + * (n) is an offset relative to the beginning of the data, + * not the beginning of the mbuf. + * (l) is the length of the total mbuf chain's data. + * Called from tp_emit(), tp_error_emit() + * clnp_emit_er(), clnp_forward(), clnp_output(). + * + * RETURNS: Rien + * + * SIDE EFFECTS: Puts the 2 checksum bytes into the packet. + * + * NOTES: Ditto the note for iso_check_csum(). 
+ */ + +void +iso_gen_csum(m,n,l) + struct mbuf *m; + int n; /* offset of 2 checksum bytes */ + int l; +{ + register u_char *p = mtod(m, u_char *); + register int c0=0, c1=0; + register int i=0; + int loc = n++, len=0; /* n is position, loc is offset */ + u_char *xloc; + u_char *yloc; + int cum=0; /* cum == cumulative length */ + + IFDEBUG(D_CHKSUM) + printf("enter gen csum m 0x%x n 0x%x l 0x%x\n",m, n-1 ,l ); + ENDDEBUG + + while(i < l) { + len = min(m->m_len, CLBYTES); + /* RAH: don't cksum more than l bytes */ + len = min(len, l - i); + + cum +=len; + p = mtod(m, u_char *); + + if(loc>=0) { + if (loc < len) { + xloc = loc + mtod(m, u_char *); + IFDEBUG(D_CHKSUM) + printf("1: zeroing xloc 0x%x loc 0x%x\n",xloc, loc ); + ENDDEBUG + *xloc = (u_char)0; + if (loc+1 < len) { + /* both xloc and yloc are in same mbuf */ + yloc = 1 + xloc; + IFDEBUG(D_CHKSUM) + printf("2: zeroing yloc 0x%x loc 0x%x\n",yloc, loc ); + ENDDEBUG + *yloc = (u_char)0; + } else { + /* crosses boundary of mbufs */ + yloc = mtod(m->m_next, u_char *); + IFDEBUG(D_CHKSUM) + printf("3: zeroing yloc 0x%x \n",yloc ); + ENDDEBUG + *yloc = (u_char)0; + } + } + loc -= len; + } + + while(i < cum) { + c0 = (c0 + *p); + c1 += c0 ; + i++; + p++; + } + m = m->m_next; + } + IFDEBUG(D_CHKSUM) + printf("gen csum final xloc 0x%x yloc 0x%x\n",xloc, yloc ); + ENDDEBUG + + c1 = (((c0 * (l-n))-c1)%255) ; + *xloc = (u_char) ((c1 < 0)? c1+255 : c1); + + c1 = (-(int)(c1+c0))%255; + *yloc = (u_char) (c1 < 0? c1 + 255 : c1); + + IFDEBUG(D_CHKSUM) + printf("gen csum end \n"); + ENDDEBUG +} + +/* + * FUNCTION: m_datalen + * + * PURPOSE: returns length of the mbuf chain. + * used all over the iso code. 
+ *
+ * RETURNS:	integer
+ *
+ * SIDE EFFECTS:	none
+ *
+ * NOTES:	The result is the sum of m_len over every mbuf reached
+ *		through m_next; a null chain yields 0.
+ */
+
+int
+m_datalen (m)
+	register struct mbuf *m;
+{
+	register int datalen;
+
+	for (datalen = 0; m; m = m->m_next)
+		datalen += m->m_len;
+	return datalen;
+}
+/*
+ * FUNCTION:	m_compress
+ *
+ * PURPOSE:	Copy the data of the chain (in) into newly allocated mbufs
+ *		handed back through (*out).  Runs between splimp() and
+ *		splx().
+ *
+ * RETURNS:	in->m_len, with *out set to in, when in is a single mbuf;
+ *		-1, with *out set to in, when an mbuf cannot be allocated;
+ *		otherwise the number of bytes copied.
+ *
+ * NOTES:	NOTE(review): several paths below look wrong and should be
+ *		checked against callers before this routine is relied on;
+ *		see the comments on the individual lines.
+ */
+int
+m_compress(in, out)
+	register struct mbuf *in, **out;
+{
+	register int datalen = 0;
+	int s = splimp();
+
+	if( in->m_next == MNULL ) {
+		*out = in;		/* single mbuf: handed back as is, nothing copied */
+		IFDEBUG(D_REQUEST)
+			printf("m_compress returning 0x%x: A\n", in->m_len);
+		ENDDEBUG
+		splx(s);
+		return in->m_len;
+	}
+	MGET((*out), M_DONTWAIT, MT_DATA);
+	if((*out) == MNULL) {
+		*out = in;
+		IFDEBUG(D_REQUEST)
+			printf("m_compress returning -1: B\n");
+		ENDDEBUG
+		splx(s);
+		return -1;
+	}
+	(*out)->m_len = 0;
+	(*out)->m_act = MNULL;
+
+	while (in) {
+		IFDEBUG(D_REQUEST)
+			printf("m_compress in 0x%x *out 0x%x\n", in, *out);
+			printf("m_compress in: len 0x%x, off 0x%x\n", in->m_len, in->m_data);
+			printf("m_compress *out: len 0x%x, off 0x%x\n", (*out)->m_len,
+				(*out)->m_data);
+		ENDDEBUG
+		if (in->m_flags & M_EXT) {
+			ASSERT(in->m_len == 0);
+		}
+		if ( in->m_len == 0) {
+			in = in->m_next;	/* skipped mbufs are not freed here */
+			continue;
+		}
+		if (((*out)->m_flags & M_EXT) == 0) {	/* NOTE(review): tests M_EXT, not remaining space; if len comes out 0 nothing changes before `continue' -- confirm this cannot loop forever */
+			int len;
+
+			len = M_TRAILINGSPACE(*out);
+			len = min(len, in->m_len);
+			datalen += len;
+
+			IFDEBUG(D_REQUEST)
+				printf("m_compress copying len %d\n", len);
+			ENDDEBUG
+			bcopy(mtod(in, caddr_t), mtod((*out), caddr_t) + (*out)->m_len,
+				(unsigned)len);
+
+			(*out)->m_len += len;
+			in->m_len -= len;	/* NOTE(review): in->m_data is not advanced, so a later pass would copy from the same start -- confirm */
+			continue;
+		} else {
+			/* (*out) is full */
+			if(( (*out)->m_next = m_get(M_DONTWAIT, MT_DATA) ) == MNULL) {
+				m_freem(*out);
+				*out = in;
+				IFDEBUG(D_REQUEST)
+					printf("m_compress returning -1: B\n");
+				ENDDEBUG
+				splx(s);
+				return -1;
+			}
+			(*out)->m_len = 0;	/* NOTE(review): this zeroes the current (old) mbuf, not the new one -- confirm */
+			(*out)->m_act = MNULL;
+			*out = (*out)->m_next;	/* the caller's *out now points past the head of the new chain */
+		}
+	}
+	m_freem(in);	/* the loop above only ends when in is 0 */
+	IFDEBUG(D_REQUEST)
+		printf("m_compress returning 0x%x: A\n", datalen);
+	ENDDEBUG
+	splx(s);
+	return datalen;
+}
diff --git a/sys/netiso/iso_errno.h b/sys/netiso/iso_errno.h
new file mode 100644
index
00000000000..0d75589ca0e --- /dev/null +++ b/sys/netiso/iso_errno.h @@ -0,0 +1,274 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)iso_errno.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+ +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ + +#ifndef __ISO_ERRNO__ +#define __ISO_ERRNO__ + +#define ISO_ERROR_MASK 0x8000 +#define BSD_ERROR_MASK 0x0000 +#define TP_ERROR_MASK 0x8800 /* transport layer */ +#define CONL_ERROR_MASK 0x8400 /* co network layer */ +#define CLNL_ERROR_MASK 0x8200 /* cl network layer */ +#define TP_ERROR_SNDC 0x10000 /* kludge to force DC's on certain errors */ + +#define E_CO_NOERROR (CONL_ERROR_MASK | 0x0) /* no add'l info */ + +/******************************************************************************/ +/* */ +/* */ +/* Transport Layer */ +/* */ +/* */ +/******************************************************************************/ + +#define E_TP_DR_NO_REAS (TP_ERROR_MASK | 0x0) /* dr reason not specified*/ +#define E_TP_CONGEST (TP_ERROR_MASK | 0x1) /* dr reason congestion */ +#define E_TP_NO_SESSION (TP_ERROR_MASK | 0x2) /* dr reason no sess ent */ +#define E_TP_ADDR_UNK (TP_ERROR_MASK | 0x3) /* dr reason addr unknown */ + +#define E_TP_ER_NO_REAS (TP_ERROR_MASK | 0x40) /* er reas not specified */ +#define E_TP_INV_PCODE (TP_ERROR_MASK | 0x41) /* er reas invalid parm code */ +#define E_TP_INV_TPDU (TP_ERROR_MASK | 0x42) /* er reas invalid tpdu type */ +#define E_TP_INV_PVAL (TP_ERROR_MASK | 0x43) /* er reas invalid parm value*/ + +#define E_TP_NORMAL_DISC (TP_ERROR_MASK | 0x80) /* dr reas normal disc */ +#define E_TP_CONGEST_2 (TP_ERROR_MASK | 0x81) /* dr reason congestion */ +#define E_TP_NEGOT_FAILED (TP_ERROR_MASK | 0x82) /* dr negotiation failed */ +#define E_TP_DUPL_SRCREF (TP_ERROR_MASK | 0x83) /* dr duplicate src ref */ +#define E_TP_MISM_REFS (TP_ERROR_MASK | 0x84) /* dr mismatched references*/ +#define E_TP_PROTO_ERR (TP_ERROR_MASK | 0x85) /* dr protocol error*/ +/* 0x86 not used */ +#define E_TP_REF_OVERFLOW (TP_ERROR_MASK | 0x87) /* dr reference overflow */ +#define E_TP_NO_CR_ON_NC (TP_ERROR_MASK | 
0x88) /* dr cr refused on this nc */ +/* 0x89 not used */ +#define E_TP_LENGTH_INVAL (TP_ERROR_MASK | 0x8a) /* dr inval length in hdr*/ + +/******************************************************************************/ +/* */ +/* */ +/* Connection Less Network Layer */ +/* */ +/* */ +/******************************************************************************/ + +#define E_CLNL_??? (CLNL_ERROR_MASK | 0x1) /* explanation */ + +/******************************************************************************/ +/* */ +/* */ +/* Connection Oriented Network Layer */ +/* */ +/* */ +/******************************************************************************/ + /* see p. 149 of ISO 8208 */ +#define E_CO_NOERROR (CONL_ERROR_MASK | 0x0) /* no add'l info */ +#define E_CO_INV_PS (CONL_ERROR_MASK | 0x1) /* invalid p(s) */ +#define E_CO_INV_PR (CONL_ERROR_MASK | 0x2) /* invalid p(r) */ + /* dot dot dot */ +#define E_CO_INV_PKT_TYPE (CONL_ERROR_MASK | 0x10) /* packet type invalid*/ +#define E_CO_INV_PKT_R1 (CONL_ERROR_MASK | 0x11) /* for state r1 */ +#define E_CO_INV_PKT_R2 (CONL_ERROR_MASK | 0x12) /* for state r2 */ +#define E_CO_INV_PKT_R3 (CONL_ERROR_MASK | 0x13) /* for state r3 */ +#define E_CO_INV_PKT_P1 (CONL_ERROR_MASK | 0x14) /* for state p1 */ +#define E_CO_INV_PKT_P2 (CONL_ERROR_MASK | 0x15) /* for state p2 */ +#define E_CO_INV_PKT_P3 (CONL_ERROR_MASK | 0x16) /* for state p3 */ +#define E_CO_INV_PKT_P4 (CONL_ERROR_MASK | 0x17) /* for state p4 */ +#define E_CO_INV_PKT_P5 (CONL_ERROR_MASK | 0x18) /* for state p5 */ +#define E_CO_INV_PKT_P6 (CONL_ERROR_MASK | 0x19) /* for state p6 */ +#define E_CO_INV_PKT_P7 (CONL_ERROR_MASK | 0x1a) /* for state p7 */ +#define E_CO_INV_PKT_D1 (CONL_ERROR_MASK | 0x1b) /* for state d1 */ +#define E_CO_INV_PKT_D2 (CONL_ERROR_MASK | 0x1c) /* for state d2 */ +#define E_CO_INV_PKT_D3 (CONL_ERROR_MASK | 0x1d) /* for state d3 */ + /* dot dot dot */ +#define E_CO_PKT_NOT_ALWD (CONL_ERROR_MASK | 0x20) /* packet not allowed */ +#define 
E_CO_PNA_UNIDENT (CONL_ERROR_MASK | 0x21) /* unidentifiable pkt */ +#define E_CO_PNA_ONEWAY (CONL_ERROR_MASK | 0x22) /* call on 1-way lc */ +#define E_CO_PNA_PVC (CONL_ERROR_MASK | 0x23) /* inv pkt type on a pvc */ +#define E_CO_PNA_UNASSLC (CONL_ERROR_MASK | 0x24) /* pkt on unassigned lc */ +#define E_CO_PNA_REJECT (CONL_ERROR_MASK | 0x25) /* REJ not subscribed to*/ +#define E_CO_PNA_SHORT (CONL_ERROR_MASK | 0x26) /* pkt too short */ +#define E_CO_PNA_LONG (CONL_ERROR_MASK | 0x27) /* pkt too long */ +#define E_CO_PNA_INVGFI (CONL_ERROR_MASK | 0x28) /* inv gen format id */ +#define E_CO_PNA_NZLCI (CONL_ERROR_MASK | 0x29) \ + /* restart or reg pkt with nonzero logical channel identifier */ +#define E_CO_PNA_FACIL (CONL_ERROR_MASK | 0x2a) \ + /* pkt type not compat with facility */ +#define E_CO_PNA_UINTCON (CONL_ERROR_MASK | 0x2b) /* unauthor intrpt conf */ +#define E_CO_PNA_UINTRPT (CONL_ERROR_MASK | 0x2c) /* unauthorized intrpt */ +#define E_CO_PNA_UREJECT (CONL_ERROR_MASK | 0x2d) /* unauthorized reject */ + +#define E_CO_TMR_EXP (CONL_ERROR_MASK | 0x30) /* timer expired */ +#define E_CO_TMR_CALR (CONL_ERROR_MASK | 0x31) /* inc. 
call or call req */ +#define E_CO_TMR_CLRI (CONL_ERROR_MASK | 0x32) /* clear indication */ +#define E_CO_TMR_RSTI (CONL_ERROR_MASK | 0x33) /* reset indication */ +#define E_CO_TMR_RRTI (CONL_ERROR_MASK | 0x34) /* restart indication */ + +#define E_CO_REG_PROB (CONL_ERROR_MASK | 0x40)\ + /* call setup, clear, or registration problem */ +#define E_CO_REG_CODE (CONL_ERROR_MASK | 0x41) /* code not allowed */ +#define E_CO_REG_PARM (CONL_ERROR_MASK | 0x42) /* parameter not allowed */ +#define E_CO_REG_ICDA (CONL_ERROR_MASK | 0x43) /* invalid called addr */ +#define E_CO_REG_ICGA (CONL_ERROR_MASK | 0x44) /* invalid calling addr */ +#define E_CO_REG_ILEN (CONL_ERROR_MASK | 0x45) /* invalid facil length */ +#define E_CO_REG_IBAR (CONL_ERROR_MASK | 0x46) /* incoming call barred */ +#define E_CO_REG_NOLC (CONL_ERROR_MASK | 0x47) /* no logical chan avail*/ +#define E_CO_REG_COLL (CONL_ERROR_MASK | 0x48) /* call collision */ +#define E_CO_REG_DUPF (CONL_ERROR_MASK | 0x49) /* dupl facil requested */ +#define E_CO_REG_NZAL (CONL_ERROR_MASK | 0x4a) /* non-zero addr length */ +#define E_CO_REG_NZFL (CONL_ERROR_MASK | 0x4b) /* non-zero facil length */ +#define E_CO_REG_EFNP (CONL_ERROR_MASK | 0x4c) \ + /* expected facil not provided */ +#define E_CO_REG_ICCITT (CONL_ERROR_MASK | 0x4d) \ + /* invalid CCITT-specified DTE facil */ + +#define E_CO_MISC (CONL_ERROR_MASK | 0x50) /* miscellaneous */ +#define E_CO_MISC_CAUSE (CONL_ERROR_MASK | 0x51) /* improper cause code */ +#define E_CO_MISC_ALIGN (CONL_ERROR_MASK | 0x52) /* not octet-aligned */ +#define E_CO_MISC_IQBS (CONL_ERROR_MASK | 0x53) \ + /* inconsistent Q bit settings */ + +#define E_CO_INTL (CONL_ERROR_MASK | 0x70) /* international problem */ +#define E_CO_IREMNWK (CONL_ERROR_MASK | 0x71) /* remote network problem */ +#define E_CO_INPROTO (CONL_ERROR_MASK | 0x72) /* int'l protocol problem */ +#define E_CO_ILINKDWN (CONL_ERROR_MASK | 0x73) /* int'l link down */ +#define E_CO_ILINKBSY (CONL_ERROR_MASK | 0x74) /* int'l link busy 
*/ +#define E_CO_IXNETFAC (CONL_ERROR_MASK | 0x75) /* transit netwk facil */ +#define E_CO_IRNETFAC (CONL_ERROR_MASK | 0x76) /* remote netwk facil */ +#define E_CO_IROUTING (CONL_ERROR_MASK | 0x77) /* int'l routing prob */ +#define E_CO_ITMPRTG (CONL_ERROR_MASK | 0x78) /* temporary routing prob */ +#define E_CO_IUNKDNIC (CONL_ERROR_MASK | 0x79) /* unknown called DNIC */ +#define E_CO_IMAINT (CONL_ERROR_MASK | 0x7a) /* maintenance action */ + +#define E_CO_TIMO (CONL_ERROR_MASK | 0x90) \ + /* timer expired or retransmission count surpassed */ +#define E_CO_TIM_INTRP (CONL_ERROR_MASK | 0x91) /* for interrupt */ +#define E_CO_TIM_DATA (CONL_ERROR_MASK | 0x92) /* for data */ +#define E_CO_TIM_REJ (CONL_ERROR_MASK | 0x93) /* for reject */ + +#define E_CO_DTE_SPEC (CONL_ERROR_MASK | 0xa0) /* DTE-specific */ +#define E_CO_DTE_OK (CONL_ERROR_MASK | 0xa1) /* DTE operational */ +#define E_CO_DTE_NOK (CONL_ERROR_MASK | 0xa2) /* DTE not operational */ +#define E_CO_DTE_RSRC (CONL_ERROR_MASK | 0xa3) /* DTE resource constraint*/ +#define E_CO_DTE_FSLCT (CONL_ERROR_MASK | 0xa4) /* fast select not subsc */ +#define E_CO_DTE_PFPKT (CONL_ERROR_MASK | 0xa5) /* partially full pkt */ +#define E_CO_DTE_DBIT (CONL_ERROR_MASK | 0xa6) /* D-bit proc not supp */ +#define E_CO_DTE_RCCON (CONL_ERROR_MASK | 0xa7) /* reg/canell confirmed */ + +#define E_CO_OSI_NSP (CONL_ERROR_MASK | 0xe0) /* OSI net svc problem */ +#define E_CO_OSI_DISCT (CONL_ERROR_MASK | 0xe1) /* disconnect transient */ +#define E_CO_OSI_DISCP (CONL_ERROR_MASK | 0xe2) /* disconnect permanent */ +#define E_CO_OSI_REJT (CONL_ERROR_MASK | 0xe3) /* reject transient */ +#define E_CO_OSI_REJP (CONL_ERROR_MASK | 0xe4) /* reject permanent */ +#define E_CO_OSI_QOST (CONL_ERROR_MASK | 0xe5) /* reject QOS transient */ +#define E_CO_OSI_QOSP (CONL_ERROR_MASK | 0xe6) /* reject QOS permanent */ +#define E_CO_OSI_NSAPT (CONL_ERROR_MASK | 0xe7) /* NSAP unreach transient */ +#define E_CO_OSI_NSAPP (CONL_ERROR_MASK | 0xe8) /* NSAP unreach 
permanent */ +#define E_CO_OSI_RESET (CONL_ERROR_MASK | 0xe9) /* reset no reason */ +#define E_CO_OSI_CONGEST (CONL_ERROR_MASK | 0xea) /* reset congestion */ +#define E_CO_OSI_UNSAP (CONL_ERROR_MASK | 0xeb) /* unknown NSAP permanent */ + +#define E_CO_HLI_INIT (CONL_ERROR_MASK | 0xf0) /* higher level initiated*/ +#define E_CO_HLI_DISCN (CONL_ERROR_MASK | 0xf1) /* disconnect normal */ +#define E_CO_HLI_DISCA (CONL_ERROR_MASK | 0xf2) /* disconnect abnormal */ +#define E_CO_HLI_DISCI (CONL_ERROR_MASK | 0xf3) /* disconnect incompatible*/ +#define E_CO_HLI_REJT (CONL_ERROR_MASK | 0xf4) /* reject transient */ +#define E_CO_HLI_REJP (CONL_ERROR_MASK | 0xf5) /* reject permanent */ +#define E_CO_HLI_QOST (CONL_ERROR_MASK | 0xf6) /* reject QOS transient */ +#define E_CO_HLI_QOSP (CONL_ERROR_MASK | 0xf7) /* reject QOS permanent */ +#define E_CO_HLI_REJI (CONL_ERROR_MASK | 0xf8) /* reject incompatible */ +#define E_CO_HLI_PROTOID (CONL_ERROR_MASK | 0xf9) /* unrecog proto id */ +#define E_CO_HLI_RESYNC (CONL_ERROR_MASK | 0xfa) /* reset - user resync */ + +/* Cause on 8208 CLEAR field */ +#define E_CO_NUMBERBUSY (CONL_ERROR_MASK | 0x101) /* Number busy */ +#define E_CO_INVFACREQ (CONL_ERROR_MASK | 0x103) /* invalid facil req */ +#define E_CO_NETCONGEST (CONL_ERROR_MASK | 0x105) /* Network congestion */ +#define E_CO_OUTOFORDER (CONL_ERROR_MASK | 0x109) /* Out of order */ +#define E_CO_ACCESSBAR (CONL_ERROR_MASK | 0x10b) /* access barred */ +#define E_CO_NOTOBTAIN (CONL_ERROR_MASK | 0x10d) /* not obtainable */ +#define E_CO_REMPROCERR (CONL_ERROR_MASK | 0x111) /* Remote procedure err */ +#define E_CO_LOCPROCERR (CONL_ERROR_MASK | 0x113) /* Local procedure err */ +#define E_CO_RPOAOOO (CONL_ERROR_MASK | 0x115) /* RPOA out of order */ +#define E_CO_NOREVCHG (CONL_ERROR_MASK | 0x119) /* Revs chg not accepted*/ +#define E_CO_INCOMPAT (CONL_ERROR_MASK | 0x121) /* Incompatible dest */ +#define E_CO_NOFASTSEL (CONL_ERROR_MASK | 0x129) + /* Fast select accpt not subscribed */ +#define 
E_CO_NOSHIP (CONL_ERROR_MASK | 0x139) /* ship absent */ +#define E_CO_GWPROCERR (CONL_ERROR_MASK | 0x1c1) /* Gateway-detected err*/ +#define E_CO_GWCONGEST (CONL_ERROR_MASK | 0x1c3) /* Gateway congestion*/ + +/* ARGO only */ +#define E_CO_QFULL (CONL_ERROR_MASK | 0x100) /* dropped packet - queue full*/ +#define E_CO_AIWP (CONL_ERROR_MASK | 0x102) /* addr incompat w/proto */ +#define E_CO_CHAN (CONL_ERROR_MASK | 0x104) /* bad channel number */ + +/* ARGO only; driver specific */ +#define E_CO_NORESOURCES (CONL_ERROR_MASK | 0x1b0) /* eicon clogged */ +#define E_CO_PDNDOWN (CONL_ERROR_MASK | 0x1b1) /* physical net down */ +#define E_CO_DRVRCLRESET (CONL_ERROR_MASK | 0x1b2) /* driver clear/reset */ +#define E_CO_PDNCLRESET (CONL_ERROR_MASK | 0x1b3) /* PDN clear/reset */ +#define E_CO_DTECLRESET (CONL_ERROR_MASK | 0x1b4) /* board clear/reset */ +#define E_CO_UNKCLRESET (CONL_ERROR_MASK | 0x1b5) /* unexpected clr/rst */ + +#define CONL_ERROR_MAX 0x1c3 + +#endif /* __ISO_ERRNO__ */ diff --git a/sys/netiso/iso_pcb.c b/sys/netiso/iso_pcb.c new file mode 100644 index 00000000000..0b50c603422 --- /dev/null +++ b/sys/netiso/iso_pcb.c @@ -0,0 +1,617 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)iso_pcb.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. 
+ +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * $Header: iso_pcb.c,v 4.5 88/06/29 14:59:56 hagens Exp $ + * $Source: /usr/argo/sys/netiso/RCS/iso_pcb.c,v $ + * + * Iso address family net-layer(s) pcb stuff. NEH 1/29/87 + */ + +#ifdef ISO + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef TPCONS +#include +#include +#include +#endif + +#define PCBNULL (struct isopcb *)0 +struct iso_addr zeroiso_addr = { + 0 +}; + + +/* + * FUNCTION: iso_pcballoc + * + * PURPOSE: creates an isopcb structure in an mbuf, + * with socket (so), and + * puts it in the queue with head (head) + * + * RETURNS: 0 if OK, ENOBUFS if can't alloc the necessary mbuf + */ +int +iso_pcballoc(so, head) + struct socket *so; + struct isopcb *head; +{ + register struct isopcb *isop; + + IFDEBUG(D_ISO) + printf("iso_pcballoc(so 0x%x)\n", so); + ENDDEBUG + MALLOC(isop, struct isopcb *, sizeof(*isop), M_PCB, M_NOWAIT); + if (isop == NULL) + return ENOBUFS; + bzero((caddr_t)isop, sizeof(*isop)); + isop->isop_head = head; + isop->isop_socket = so; + insque(isop, head); + if (so) + so->so_pcb = (caddr_t)isop; + return 0; +} + +/* + * FUNCTION: iso_pcbbind + * + * PURPOSE: binds the address given in *(nam) to the socket + * specified by the isopcb in *(isop) + * If the given address is zero, it makes sure the + * address isn't already in use and if it's 
got a network + * portion, we look for an interface with that network + * address. If the address given is zero, we allocate + * a port and stuff it in the (nam) structure. + * + * RETURNS: errno E* or 0 if ok. + * + * SIDE EFFECTS: increments head->isop_lport if it allocates a port # + * + * NOTES: + */ +#define satosiso(sa) ((struct sockaddr_iso *)(sa)) +int +iso_pcbbind(isop, nam) + register struct isopcb *isop; + struct mbuf *nam; +{ + register struct isopcb *head = isop->isop_head; + register struct sockaddr_iso *siso; + struct iso_ifaddr *ia; + union { + char data[2]; + u_short s; + } suf; + + IFDEBUG(D_ISO) + printf("iso_pcbbind(isop 0x%x, nam 0x%x)\n", isop, nam); + ENDDEBUG + suf.s = 0; + if (iso_ifaddr == 0) /* any interfaces attached? */ + return EADDRNOTAVAIL; + if (isop->isop_laddr) /* already bound */ + return EADDRINUSE; + if(nam == (struct mbuf *)0) { + isop->isop_laddr = &isop->isop_sladdr; + isop->isop_sladdr.siso_len = sizeof(struct sockaddr_iso); + isop->isop_sladdr.siso_family = AF_ISO; + isop->isop_sladdr.siso_tlen = 2; + isop->isop_sladdr.siso_nlen = 0; + isop->isop_sladdr.siso_slen = 0; + isop->isop_sladdr.siso_plen = 0; + goto noname; + } + siso = mtod(nam, struct sockaddr_iso *); + IFDEBUG(D_ISO) + printf("iso_pcbbind(name len 0x%x)\n", nam->m_len); + printf("The address is %s\n", clnp_iso_addrp(&siso->siso_addr)); + ENDDEBUG + /* + * We would like sort of length check but since some OSI addrs + * do not have fixed length, we can't really do much. + * The ONLY thing we can say is that an osi addr has to have + * at LEAST an afi and one more byte and had better fit into + * a struct iso_addr. + * However, in fact the size of the whole thing is a struct + * sockaddr_iso, so probably this is what we should check for. 
+ */ + if( (nam->m_len < 2) || (nam->m_len < siso->siso_len)) { + return ENAMETOOLONG; + } + if (siso->siso_nlen) { + /* non-zero net addr- better match one of our interfaces */ + IFDEBUG(D_ISO) + printf("iso_pcbbind: bind to NOT zeroisoaddr\n"); + ENDDEBUG + for (ia = iso_ifaddr; ia; ia = ia->ia_next) + if (SAME_ISOADDR(siso, &ia->ia_addr)) + break; + if (ia == 0) + return EADDRNOTAVAIL; + } + if (siso->siso_len <= sizeof (isop->isop_sladdr)) { + isop->isop_laddr = &isop->isop_sladdr; + } else { + if ((nam = m_copy(nam, 0, (int)M_COPYALL)) == 0) + return ENOBUFS; + isop->isop_laddr = mtod(nam, struct sockaddr_iso *); + } + bcopy((caddr_t)siso, (caddr_t)isop->isop_laddr, siso->siso_len); + if (siso->siso_tlen == 0) + goto noname; + if ((isop->isop_socket->so_options & SO_REUSEADDR) == 0 && + iso_pcblookup(head, 0, (caddr_t)0, isop->isop_laddr)) + return EADDRINUSE; + if (siso->siso_tlen <= 2) { + bcopy(TSEL(siso), suf.data, sizeof(suf.data)); + suf.s = ntohs(suf.s); + if((suf.s < ISO_PORT_RESERVED) && + (isop->isop_socket->so_state && SS_PRIV) == 0) + return EACCES; + } else { + register char *cp; +noname: + cp = TSEL(isop->isop_laddr); + IFDEBUG(D_ISO) + printf("iso_pcbbind noname\n"); + ENDDEBUG + do { + if (head->isop_lport++ < ISO_PORT_RESERVED || + head->isop_lport > ISO_PORT_USERRESERVED) + head->isop_lport = ISO_PORT_RESERVED; + suf.s = htons(head->isop_lport); + cp[0] = suf.data[0]; + cp[1] = suf.data[1]; + } while (iso_pcblookup(head, 0, (caddr_t)0, isop->isop_laddr)); + } + IFDEBUG(D_ISO) + printf("iso_pcbbind returns 0, suf 0x%x\n", suf); + ENDDEBUG + return 0; +} +/* + * FUNCTION: iso_pcbconnect + * + * PURPOSE: Make the isopcb (isop) look like it's connected. + * In other words, give it the peer address given in + * the mbuf * (nam). Make sure such a combination + * of local, peer addresses doesn't already exist + * for this protocol. Internet mentality prevails here, + * wherein a src,dst pair uniquely identifies a connection. 
+ * Both net address and port must be specified in argument + * (nam). + * If we don't have a local address for this socket yet, + * we pick one by calling iso_pcbbind(). + * + * RETURNS: errno E* or 0 if ok. + * + * SIDE EFFECTS: Looks up a route, which may cause one to be left + * in the isopcb. + * + * NOTES: + */ +int +iso_pcbconnect(isop, nam) + register struct isopcb *isop; + struct mbuf *nam; +{ + register struct sockaddr_iso *siso = mtod(nam, struct sockaddr_iso *); + int local_zero, error = 0; + struct iso_ifaddr *ia; + + IFDEBUG(D_ISO) + printf("iso_pcbconnect(isop 0x%x sock 0x%x nam 0x%x", + isop, isop->isop_socket, nam); + printf("nam->m_len 0x%x), addr:\n", nam->m_len); + dump_isoaddr(siso); + ENDDEBUG + if (nam->m_len < siso->siso_len) + return EINVAL; + if (siso->siso_family != AF_ISO) + return EAFNOSUPPORT; + if (siso->siso_nlen == 0) { + if (ia = iso_ifaddr) { + int nlen = ia->ia_addr.siso_nlen; + ovbcopy(TSEL(siso), nlen + TSEL(siso), + siso->siso_plen + siso->siso_tlen + siso->siso_slen); + bcopy((caddr_t)&ia->ia_addr.siso_addr, + (caddr_t)&siso->siso_addr, nlen + 1); + /* includes siso->siso_nlen = nlen; */ + } else + return EADDRNOTAVAIL; + } + /* + * Local zero means either not bound, or bound to a TSEL, but no + * particular local interface. So, if we want to send somebody + * we need to choose a return address. + */ + local_zero = + ((isop->isop_laddr == 0) || (isop->isop_laddr->siso_nlen == 0)); + if (local_zero) { + int flags; + + IFDEBUG(D_ISO) + printf("iso_pcbconnect localzero 1\n"); + ENDDEBUG + /* + * If route is known or can be allocated now, + * our src addr is taken from the i/f, else punt. 
+ */ + flags = isop->isop_socket->so_options & SO_DONTROUTE; + if (error = clnp_route(&siso->siso_addr, &isop->isop_route, flags, + (struct sockaddr **)0, &ia)) + return error; + IFDEBUG(D_ISO) + printf("iso_pcbconnect localzero 2, ro->ro_rt 0x%x", + isop->isop_route.ro_rt); + printf(" ia 0x%x\n", ia); + ENDDEBUG + } + IFDEBUG(D_ISO) + printf("in iso_pcbconnect before lookup isop 0x%x isop->sock 0x%x\n", + isop, isop->isop_socket); + ENDDEBUG + if (local_zero) { + int nlen, tlen, totlen; caddr_t oldtsel, newtsel; + siso = isop->isop_laddr; + if (siso == 0 || siso->siso_tlen == 0) + (void)iso_pcbbind(isop, (struct mbuf *)0); + /* + * Here we have problem of squezeing in a definite network address + * into an existing sockaddr_iso, which in fact may not have room + * for it. This gets messy. + */ + siso = isop->isop_laddr; + oldtsel = TSEL(siso); + tlen = siso->siso_tlen; + nlen = ia->ia_addr.siso_nlen; + totlen = tlen + nlen + _offsetof(struct sockaddr_iso, siso_data[0]); + if ((siso == &isop->isop_sladdr) && + (totlen > sizeof(isop->isop_sladdr))) { + struct mbuf *m = m_get(MT_SONAME, M_DONTWAIT); + if (m == 0) + return ENOBUFS; + m->m_len = totlen; + isop->isop_laddr = siso = mtod(m, struct sockaddr_iso *); + } + siso->siso_nlen = ia->ia_addr.siso_nlen; + newtsel = TSEL(siso); + ovbcopy(oldtsel, newtsel, tlen); + bcopy(ia->ia_addr.siso_data, siso->siso_data, nlen); + siso->siso_tlen = tlen; + siso->siso_family = AF_ISO; + siso->siso_len = totlen; + siso = mtod(nam, struct sockaddr_iso *); + } + IFDEBUG(D_ISO) + printf("in iso_pcbconnect before bcopy isop 0x%x isop->sock 0x%x\n", + isop, isop->isop_socket); + ENDDEBUG + /* + * If we had to allocate space to a previous big foreign address, + * and for some reason we didn't free it, we reuse it knowing + * that is going to be big enough, as sockaddrs are delivered in + * 128 byte mbufs. + * If the foreign address is small enough, we use default space; + * otherwise, we grab an mbuf to copy into. 
+ */ + if (isop->isop_faddr == 0 || isop->isop_faddr == &isop->isop_sfaddr) { + if (siso->siso_len <= sizeof(isop->isop_sfaddr)) + isop->isop_faddr = &isop->isop_sfaddr; + else { + struct mbuf *m = m_get(MT_SONAME, M_DONTWAIT); + if (m == 0) + return ENOBUFS; + isop->isop_faddr = mtod(m, struct sockaddr_iso *); + } + } + bcopy((caddr_t)siso, (caddr_t)isop->isop_faddr, siso->siso_len); + IFDEBUG(D_ISO) + printf("in iso_pcbconnect after bcopy isop 0x%x isop->sock 0x%x\n", + isop, isop->isop_socket); + printf("iso_pcbconnect connected to addr:\n"); + dump_isoaddr(isop->isop_faddr); + printf("iso_pcbconnect end: src addr:\n"); + dump_isoaddr(isop->isop_laddr); + ENDDEBUG + return 0; +} + +/* + * FUNCTION: iso_pcbdisconnect() + * + * PURPOSE: washes away the peer address info so the socket + * appears to be disconnected. + * If there's no file descriptor associated with the socket + * it detaches the pcb. + * + * RETURNS: Nada. + * + * SIDE EFFECTS: May detach the pcb. + * + * NOTES: + */ +void +iso_pcbdisconnect(isop) + struct isopcb *isop; +{ + void iso_pcbdetach(); + register struct sockaddr_iso *siso; + + IFDEBUG(D_ISO) + printf("iso_pcbdisconnect(isop 0x%x)\n", isop); + ENDDEBUG + /* + * Preserver binding infnormation if already bound. + */ + if ((siso = isop->isop_laddr) && siso->siso_nlen && siso->siso_tlen) { + caddr_t otsel = TSEL(siso); + siso->siso_nlen = 0; + ovbcopy(otsel, TSEL(siso), siso->siso_tlen); + } + if (isop->isop_faddr && isop->isop_faddr != &isop->isop_sfaddr) + m_freem(dtom(isop->isop_faddr)); + isop->isop_faddr = 0; + if (isop->isop_socket->so_state & SS_NOFDREF) + iso_pcbdetach(isop); +} + +/* + * FUNCTION: iso_pcbdetach + * + * PURPOSE: detach the pcb at *(isop) from it's socket and free + * the mbufs associated with the pcb.. + * Dequeues (isop) from its head. + * + * RETURNS: Nada. 
+ * + * SIDE EFFECTS: + * + * NOTES: + */ +void +iso_pcbdetach(isop) + struct isopcb *isop; +{ + struct socket *so = isop->isop_socket; + + IFDEBUG(D_ISO) + printf("iso_pcbdetach(isop 0x%x socket 0x%x so 0x%x)\n", + isop, isop->isop_socket, so); + ENDDEBUG +#ifdef TPCONS + if (isop->isop_chan) { + register struct pklcd *lcp = (struct pklcd *)isop->isop_chan; + if (--isop->isop_refcnt > 0) + return; + if (lcp && lcp->lcd_state == DATA_TRANSFER) { + lcp->lcd_upper = 0; + lcp->lcd_upnext = 0; + pk_disconnect(lcp); + } + isop->isop_chan = 0; + } +#endif + if (so) { /* in the x.25 domain, we sometimes have no socket */ + so->so_pcb = 0; + sofree(so); + } + IFDEBUG(D_ISO) + printf("iso_pcbdetach 2 \n"); + ENDDEBUG + if (isop->isop_options) + (void)m_free(isop->isop_options); + IFDEBUG(D_ISO) + printf("iso_pcbdetach 3 \n"); + ENDDEBUG + if (isop->isop_route.ro_rt) + rtfree(isop->isop_route.ro_rt); + IFDEBUG(D_ISO) + printf("iso_pcbdetach 3.1\n"); + ENDDEBUG + if (isop->isop_clnpcache != NULL) { + struct clnp_cache *clcp = + mtod(isop->isop_clnpcache, struct clnp_cache *); + IFDEBUG(D_ISO) + printf("iso_pcbdetach 3.2: clcp 0x%x freeing clc_hdr x%x\n", + clcp, clcp->clc_hdr); + ENDDEBUG + if (clcp->clc_hdr != NULL) + m_free(clcp->clc_hdr); + IFDEBUG(D_ISO) + printf("iso_pcbdetach 3.3: freeing cache x%x\n", + isop->isop_clnpcache); + ENDDEBUG + m_free(isop->isop_clnpcache); + } + IFDEBUG(D_ISO) + printf("iso_pcbdetach 4 \n"); + ENDDEBUG + remque(isop); + IFDEBUG(D_ISO) + printf("iso_pcbdetach 5 \n"); + ENDDEBUG + if (isop->isop_laddr && (isop->isop_laddr != &isop->isop_sladdr)) + m_freem(dtom(isop->isop_laddr)); + free((caddr_t)isop, M_PCB); +} + + +/* + * FUNCTION: iso_pcbnotify + * + * PURPOSE: notify all connections in this protocol's queue (head) + * that have peer address (dst) of the problem (errno) + * by calling (notify) on the connections' isopcbs. + * + * RETURNS: Rien. + * + * SIDE EFFECTS: + * + * NOTES: (notify) is called at splimp! 
+ */ +void +iso_pcbnotify(head, siso, errno, notify) + struct isopcb *head; + register struct sockaddr_iso *siso; + int errno, (*notify)(); +{ + register struct isopcb *isop; + int s = splimp(); + + IFDEBUG(D_ISO) + printf("iso_pcbnotify(head 0x%x, notify 0x%x) dst:\n", head, notify); + ENDDEBUG + for (isop = head->isop_next; isop != head; isop = isop->isop_next) { + if (isop->isop_socket == 0 || isop->isop_faddr == 0 || + !SAME_ISOADDR(siso, isop->isop_faddr)) { + IFDEBUG(D_ISO) + printf("iso_pcbnotify: CONTINUE isop 0x%x, sock 0x%x\n" , + isop, isop->isop_socket); + printf("addrmatch cmp'd with (0x%x):\n", isop->isop_faddr); + dump_isoaddr(isop->isop_faddr); + ENDDEBUG + continue; + } + if (errno) + isop->isop_socket->so_error = errno; + if (notify) + (*notify)(isop); + } + splx(s); + IFDEBUG(D_ISO) + printf("END OF iso_pcbnotify\n" ); + ENDDEBUG +} + + +/* + * FUNCTION: iso_pcblookup + * + * PURPOSE: looks for a given combination of (faddr), (fport), + * (lport), (laddr) in the queue named by (head). + * Argument (flags) is ignored. + * + * RETURNS: ptr to the isopcb if it finds a connection matching + * these arguments, o.w. returns zero. 
+ * + * SIDE EFFECTS: + * + * NOTES: + */ +struct isopcb * +iso_pcblookup(head, fportlen, fport, laddr) + struct isopcb *head; + register struct sockaddr_iso *laddr; + caddr_t fport; + int fportlen; +{ + register struct isopcb *isop; + register caddr_t lp = TSEL(laddr); + unsigned int llen = laddr->siso_tlen; + + IFDEBUG(D_ISO) + printf("iso_pcblookup(head 0x%x laddr 0x%x fport 0x%x)\n", + head, laddr, fport); + ENDDEBUG + for (isop = head->isop_next; isop != head; isop = isop->isop_next) { + if (isop->isop_laddr == 0 || isop->isop_laddr == laddr) + continue; + if (isop->isop_laddr->siso_tlen != llen) + continue; + if (bcmp(lp, TSEL(isop->isop_laddr), llen)) + continue; + if (fportlen && isop->isop_faddr && + bcmp(fport, TSEL(isop->isop_faddr), (unsigned)fportlen)) + continue; + /* PHASE2 + * addrmatch1 should be iso_addrmatch(a, b, mask) + * where mask is taken from isop->isop_laddrmask (new field) + * isop_lnetmask will also be available in isop + if (laddr != &zeroiso_addr && + !iso_addrmatch1(laddr, &(isop->isop_laddr.siso_addr))) + continue; + */ + if (laddr->siso_nlen && (!SAME_ISOADDR(laddr, isop->isop_laddr))) + continue; + return (isop); + } + return (struct isopcb *)0; +} +#endif /* ISO */ diff --git a/sys/netiso/iso_pcb.h b/sys/netiso/iso_pcb.h new file mode 100644 index 00000000000..aad76bcc065 --- /dev/null +++ b/sys/netiso/iso_pcb.h @@ -0,0 +1,113 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)iso_pcb.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* $Header: iso_pcb.h,v 4.3 88/06/29 15:00:01 hagens Exp $ */ +/* $Source: /usr/argo/sys/netiso/RCS/iso_pcb.h,v $ */ + +#define MAXX25CRUDLEN 16 /* 16 bytes of call request user data */ + +/* + * Common structure pcb for argo protocol implementation. 
+ */ +struct isopcb { + struct isopcb *isop_next,*isop_prev; /* pointers to other pcb's */ + struct isopcb *isop_head; /* pointer back to chain of pcbs for + this protocol */ + struct socket *isop_socket; /* back pointer to socket */ + struct sockaddr_iso *isop_laddr; + struct sockaddr_iso *isop_faddr; + struct route_iso { + struct rtentry *ro_rt; + struct sockaddr_iso ro_dst; + } isop_route; /* CLNP routing entry */ + struct mbuf *isop_options; /* CLNP options */ + struct mbuf *isop_optindex; /* CLNP options index */ + struct mbuf *isop_clnpcache; /* CLNP cached hdr */ + caddr_t isop_chan; /* actually struct pklcb * */ + u_short isop_refcnt; /* mult TP4 tpcb's -> here */ + u_short isop_lport; /* MISLEADLING work var */ + u_short isop_tuba_cached; /* for tuba address ref cnts */ + int isop_x25crud_len; /* x25 call request ud */ + char isop_x25crud[MAXX25CRUDLEN]; + struct ifaddr *isop_ifa; /* ESIS interface assoc w/sock */ + struct sockaddr_iso isop_sladdr, /* preallocated laddr */ + isop_sfaddr; /* preallocated faddr */ +}; + +#ifdef sotorawcb +/* + * Common structure pcb for raw clnp protocol access. + * Here are clnp specific extensions to the raw control block, + * and space is allocated to the necessary sockaddrs. + */ +struct rawisopcb { + struct rawcb risop_rcb; /* common control block prefix */ + int risop_flags; /* flags, e.g. raw sockopts */ + struct isopcb risop_isop; /* space for bound addresses, routes etc.*/ +}; +#endif + +#define sotoisopcb(so) ((struct isopcb *)(so)->so_pcb) +#define sotorawisopcb(so) ((struct rawisopcb *)(so)->so_pcb) + +#ifdef KERNEL +struct isopcb *iso_pcblookup(); +#endif diff --git a/sys/netiso/iso_proto.c b/sys/netiso/iso_proto.c new file mode 100644 index 00000000000..59575c7513b --- /dev/null +++ b/sys/netiso/iso_proto.c @@ -0,0 +1,197 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)iso_proto.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* $Header: iso_proto.c,v 4.4 88/09/08 08:38:42 hagens Exp $ + * $Source: /usr/argo/sys/netiso/RCS/iso_proto.c,v $ + * + * iso_proto.c : protocol switch tables in the ISO domain + * + * ISO protocol family includes TP, CLTP, CLNP, 8208 + * TP and CLNP are implemented here. 
+ */ + +#ifdef ISO +#include +#include +#include +#include +#include + +#include + +#include + +int clnp_output(), clnp_init(),clnp_slowtimo(),clnp_drain(); +int rclnp_input(), rclnp_output(), rclnp_ctloutput(), raw_usrreq(); +int clnp_usrreq(); + +int tp_ctloutput(), tpclnp_ctlinput(), tpclnp_input(), tp_usrreq(); +int tp_init(), tp_fasttimo(), tp_slowtimo(), tp_drain(); +int cons_init(), tpcons_input(); + +int isis_input(); +int esis_input(), esis_ctlinput(), esis_init(), esis_usrreq(); +int idrp_input(), idrp_init(), idrp_usrreq(); +int cltp_input(), cltp_ctlinput(), cltp_init(), cltp_usrreq(), cltp_output(); + +#ifdef TUBA +int tuba_usrreq(), tuba_ctloutput(), tuba_init(), tuba_tcpinput(); +int tuba_slowtimo(), tuba_fasttimo(); +#endif + +struct protosw isosw[] = { +/* + * We need a datagram entry through which net mgmt programs can get + * to the iso_control procedure (iso ioctls). Thus, a minimal + * SOCK_DGRAM interface is provided here. + * THIS ONE MUST BE FIRST: Kludge city : socket() says if(!proto) call + * pffindtype, which gets the first entry that matches the type. + * sigh. + */ +{ SOCK_DGRAM, &isodomain, ISOPROTO_CLTP, PR_ATOMIC|PR_ADDR, + 0, cltp_output, 0, 0, + cltp_usrreq, + cltp_init, 0, 0, 0 +}, + +/* + * A datagram interface for clnp cannot co-exist with TP/CLNP + * because CLNP has no way to discriminate incoming TP packets from + * packets coming in for any other higher layer protocol. + * Old way: set it up so that pffindproto(... dgm, clnp) fails. + * New way: let pffindproto work (for x.25, thank you) but create + * a clnp_usrreq() that returns error on PRU_ATTACH. 
+ */ +{SOCK_DGRAM, &isodomain, ISOPROTO_CLNP, 0, + 0, clnp_output, 0, 0, + clnp_usrreq, + clnp_init, 0, clnp_slowtimo, clnp_drain, +}, + +/* raw clnp */ +{ SOCK_RAW, &isodomain, ISOPROTO_RAW, PR_ATOMIC|PR_ADDR, + rclnp_input, rclnp_output, 0, rclnp_ctloutput, + clnp_usrreq, + 0, 0, 0, 0 +}, + +/* ES-IS protocol */ +{ SOCK_DGRAM, &isodomain, ISOPROTO_ESIS, PR_ATOMIC|PR_ADDR, + esis_input, 0, esis_ctlinput, 0, + esis_usrreq, + esis_init, 0, 0, 0 +}, + +/* ISOPROTO_INTRAISIS */ +{ SOCK_DGRAM, &isodomain, ISOPROTO_INTRAISIS, PR_ATOMIC|PR_ADDR, + isis_input, 0, 0, 0, + esis_usrreq, + 0, 0, 0, 0 +}, + +/* ISOPROTO_IDRP */ +{ SOCK_DGRAM, &isodomain, ISOPROTO_IDRP, PR_ATOMIC|PR_ADDR, + idrp_input, 0, 0, 0, + idrp_usrreq, + idrp_init, 0, 0, 0 +}, + +/* ISOPROTO_TP */ +{ SOCK_SEQPACKET, &isodomain, ISOPROTO_TP, PR_CONNREQUIRED|PR_WANTRCVD, + tpclnp_input, 0, tpclnp_ctlinput, tp_ctloutput, + tp_usrreq, + tp_init, tp_fasttimo, tp_slowtimo, tp_drain, +}, + +#ifdef TUBA +{ SOCK_STREAM, &isodomain, ISOPROTO_TCP, PR_CONNREQUIRED|PR_WANTRCVD, + tuba_tcpinput, 0, 0, tuba_ctloutput, + tuba_usrreq, + tuba_init, tuba_fasttimo, tuba_fasttimo, 0 +}, +#endif + +#ifdef TPCONS +/* ISOPROTO_TP */ +{ SOCK_SEQPACKET, &isodomain, ISOPROTO_TP0, PR_CONNREQUIRED|PR_WANTRCVD, + tpcons_input, 0, 0, tp_ctloutput, + tp_usrreq, + cons_init, 0, 0, 0, +}, +#endif + +}; + + +struct domain isodomain = { + AF_ISO, /* family */ + "iso-domain", /* name */ + 0, /* initialize routine */ + 0, /* externalize access rights */ + 0, /* dispose of internalized rights */ + isosw, /* protosw */ + &isosw[sizeof(isosw)/sizeof(isosw[0])], /* NPROTOSW */ + 0, /* next */ + rn_inithead, /* rtattach */ + 48, /* rtoffset */ + sizeof(struct sockaddr_iso) /* maxkeylen */ +}; +#endif /* ISO */ diff --git a/sys/netiso/iso_snpac.c b/sys/netiso/iso_snpac.c new file mode 100644 index 00000000000..2473ae7a12d --- /dev/null +++ b/sys/netiso/iso_snpac.c @@ -0,0 +1,736 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the 
University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)iso_snpac.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+
+******************************************************************/
+
+/*
+ * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison
+ */
+/* $Header: iso_snpac.c,v 1.8 88/09/19 13:51:36 hagens Exp $ */
+/* $Source: /usr/argo/sys/netiso/RCS/iso_snpac.c,v $ */
+
+#ifdef ISO
+
+/*
+ * NOTE(review): the header names on the #include lines below are missing
+ * (apparently stripped together with their angle brackets); restore them
+ * from the original source before building.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+int iso_systype = SNPA_ES; /* default to be an ES */
+extern short esis_holding_time, esis_config_time, esis_esconfig_time;
+extern struct timeval time;
+extern void esis_config();
+extern int hz;
+static void snpac_fixdstandmask();
+
+/* Template sockaddr_iso with only the length and family filled in. */
+struct sockaddr_iso blank_siso = {sizeof(blank_siso), AF_ISO};
+extern u_long iso_hashchar();
+/*
+ * File-scope scratch sockaddrs shared by the routines below (filled by
+ * zap_isoaddr and then handed to rtalloc1/rtrequest/rtredirect).
+ */
+static struct sockaddr_iso
+	dst = {sizeof(dst), AF_ISO},
+	gte = {sizeof(dst), AF_ISO},
+	src = {sizeof(dst), AF_ISO},
+	msk = {sizeof(dst), AF_ISO},
+	zmk = {0};
+#define zsi blank_siso
+#define zero_isoa zsi.siso_addr
+/*
+ * zap_isoaddr(a, b): clear a.siso_addr, then copy the iso_addr at b
+ * (length byte plus isoa_len address bytes) into it.  The macro uses a
+ * local "struct iso_addr *r" that every caller must declare.
+ */
+#define zap_isoaddr(a, b) {Bzero(&a.siso_addr, sizeof(*r)); r = b; \
+	Bcopy(r, &a.siso_addr, 1 + (r)->isoa_len);}
+#define S(x) ((struct sockaddr *)&(x))
+
+static struct sockaddr_dl blank_dl = {sizeof(blank_dl), AF_LINK};
+static struct sockaddr_dl gte_dl;
+/*
+ * zap_linkaddr(a, b, c, i): reset *a to blank_dl, copy c bytes of link
+ * address from b into a->sdl_data (recording c in sdl_alen) and set the
+ * interface index to i.
+ */
+#define zap_linkaddr(a, b, c, i) \
+	(*a = blank_dl, bcopy(b, a->sdl_data, a->sdl_alen = c), a->sdl_index = i)
+
+/*
+ * We only keep track of a single IS at a time.
+ */
+struct rtentry *known_is;
+
+/*
+ * Addresses taken from NBS agreements, December 1987.
+ *
+ * These addresses assume on-the-wire transmission of least significant
+ * bit first. This is the method used by 802.3. When these
+ * addresses are passed to the token ring driver, (802.5), they
+ * must be bit-swapped because 802.5 transmission order is MSb first.
+ *
+ * Furthermore, according to IBM Austin, these addresses are not
+ * true token ring multicast addresses. More work is necessary
+ * to get multicast to work right on token ring.
+ *
+ * Currently, the token ring driver does not handle multicast, so
+ * these addresses are converted into the broadcast address in
+ * lan_output(). That means that if these multicast addresses change
+ * the token ring driver must be altered.
+ */
+char all_es_snpa[] = { 0x09, 0x00, 0x2b, 0x00, 0x00, 0x04 };
+char all_is_snpa[] = { 0x09, 0x00, 0x2b, 0x00, 0x00, 0x05 };
+char all_l1is_snpa[] = {0x01, 0x80, 0xc2, 0x00, 0x00, 0x14};
+char all_l2is_snpa[] = {0x01, 0x80, 0xc2, 0x00, 0x00, 0x15};
+
+/* Overlay used to look at a route gateway as any of these sockaddr flavours. */
+union sockunion {
+	struct sockaddr_iso siso;
+	struct sockaddr_dl sdl;
+	struct sockaddr sa;
+};
+
+/*
+ * FUNCTION: llc_rtrequest
+ *
+ * PURPOSE: Manage routing table entries specific to LLC for ISO.
+ *
+ * NOTES: This does a lot of obscure magic;
+ *        Gateway routes are ignored.  For the others:
+ *        RTM_ADD on a cloning route joins the ISO multicast groups and
+ *        installs an empty link-level gateway; RTM_ADD/RTM_RESOLVE on a
+ *        link-level route allocates and links a struct llinfo_llc;
+ *        RTM_DELETE unlinks and frees it.
+ */
+llc_rtrequest(req, rt, sa)
+int req;
+register struct rtentry *rt;
+struct sockaddr *sa;
+{
+	register union sockunion *gate = (union sockunion *)rt->rt_gateway;
+	register struct llinfo_llc *lc = (struct llinfo_llc *)rt->rt_llinfo, *lc2;
+	struct rtentry *rt2;
+	struct ifnet *ifp = rt->rt_ifp;
+	int addrlen = ifp->if_addrlen;
+#define LLC_SIZE 3 /* XXXXXX do this right later */
+
+	IFDEBUG (D_SNPA)
+		printf("llc_rtrequest(%d, %x, %x)\n", req, rt, sa);
+	ENDDEBUG
+	if (rt->rt_flags & RTF_GATEWAY)
+		return;
+	else switch (req) {
+	case RTM_ADD:
+		/*
+		 * Case 1: This route may come from a route to iface with mask
+		 * or from a default route.
+		 */
+		if (rt->rt_flags & RTF_CLONING) {
+			iso_setmcasts(ifp, req);
+			rt_setgate(rt, rt_key(rt), &blank_dl);
+			return;
+		}
+		if (lc != 0)
+			return; /* happens on a route change */
+		/* FALLTHROUGH */
+	case RTM_RESOLVE:
+		/*
+		 * Case 2: This route may come from cloning, or a manual route
+		 * add with a LL address.
+		 */
+		if (gate->sdl.sdl_family != AF_LINK) {
+			log(LOG_DEBUG, "llc_rtrequest: got non-link non-gateway route\n");
+			break;
+		}
+		R_Malloc(lc, struct llinfo_llc *, sizeof (*lc));
+		/* stored before the NULL check, so rt_llinfo is 0 on failure */
+		rt->rt_llinfo = (caddr_t)lc;
+		if (lc == 0) {
+			log(LOG_DEBUG, "llc_rtrequest: malloc failed\n");
+			break;
+		}
+		Bzero(lc, sizeof(*lc));
+		lc->lc_rt = rt;
+		rt->rt_flags |= RTF_LLINFO;
+		insque(lc, &llinfo_llc);
+		/*
+		 * A gateway whose address length is "link address + esis_req"
+		 * carries a trailing struct esis_req: strip it from sdl_alen
+		 * and copy it into the new entry.  A plain link address marks
+		 * the entry as a valid, permanent ES.
+		 */
+		if (gate->sdl.sdl_alen == sizeof(struct esis_req) + addrlen) {
+			gate->sdl.sdl_alen -= sizeof(struct esis_req);
+			bcopy(addrlen + LLADDR(&gate->sdl),
+				(caddr_t)&lc->lc_er, sizeof(lc->lc_er));
+		} else if (gate->sdl.sdl_alen == addrlen)
+			lc->lc_flags = (SNPA_ES | SNPA_VALID | SNPA_PERM);
+		break;
+	case RTM_DELETE:
+		if (rt->rt_flags & RTF_CLONING)
+			iso_setmcasts(ifp, req);
+		if (lc == 0)
+			return;
+		remque(lc);
+		Free(lc);
+		rt->rt_llinfo = 0;
+		rt->rt_flags &= ~RTF_LLINFO;
+		break;
+	}
+	/* Default the route MTU to the interface MTU less the LLC header. */
+	if (rt->rt_rmx.rmx_mtu == 0) {
+		rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu - LLC_SIZE;
+	}
+}
+/*
+ * FUNCTION: iso_setmcasts
+ *
+ * PURPOSE: Enable/Disable ESIS/ISIS multicast reception on interfaces.
+ * + * NOTES: This also does a lot of obscure magic; + */ +iso_setmcasts(ifp, req) + struct ifnet *ifp; + int req; +{ + static char *addrlist[] = + { all_es_snpa, all_is_snpa, all_l1is_snpa, all_l2is_snpa, 0}; + struct ifreq ifr; + register caddr_t *cpp; + int doreset = 0; + + bzero((caddr_t)&ifr, sizeof(ifr)); + for (cpp = (caddr_t *)addrlist; *cpp; cpp++) { + bcopy(*cpp, (caddr_t)ifr.ifr_addr.sa_data, 6); + if (req == RTM_ADD) + if (ether_addmulti(&ifr, (struct arpcom *)ifp) == ENETRESET) + doreset++; + else + if (ether_delmulti(&ifr, (struct arpcom *)ifp) == ENETRESET) + doreset++; + } + if (doreset) { + if (ifp->if_reset) + (*ifp->if_reset)(ifp->if_unit); + else + printf("iso_setmcasts: %s%d needs reseting to receive iso mcasts\n", + ifp->if_name, ifp->if_unit); + } +} +/* + * FUNCTION: iso_snparesolve + * + * PURPOSE: Resolve an iso address into snpa address + * + * RETURNS: 0 if addr is resolved + * errno if addr is unknown + * + * SIDE EFFECTS: + * + * NOTES: Now that we have folded the snpa cache into the routing + * table, we know there is no snpa address known for this + * destination. If we know of a default IS, then the address + * of the IS is returned. If no IS is known, then return the + * multi-cast address for "all ES" for this interface. + * + * NB: the last case described above constitutes the + * query configuration function 9542, sec 6.5 + * A mechanism is needed to prevent this function from + * being invoked if the system is an IS. 
+ */
+iso_snparesolve(ifp, dest, snpa, snpa_len)
+struct ifnet *ifp; /* outgoing interface */
+struct sockaddr_iso *dest; /* destination */
+caddr_t snpa; /* RESULT: snpa to be used */
+int *snpa_len; /* RESULT: length of snpa */
+{
+	struct llinfo_llc *sc; /* ptr to snpa table entry */
+	caddr_t found_snpa;
+	int addrlen;
+
+	/*
+	 * This hack allows us to send esis packets that have the destination snpa
+	 * address embedded in the destination nsap address
+	 */
+	if (dest->siso_data[0] == AFI_SNA) {
+		/*
+		 * This is a subnetwork address. Return it immediately
+		 */
+		IFDEBUG(D_SNPA)
+			printf("iso_snparesolve: return SN address\n");
+		ENDDEBUG
+		addrlen = dest->siso_nlen - 1; /* subtract size of AFI */
+		found_snpa = (caddr_t) dest->siso_data + 1;
+	/*
+	 * If we are an IS, we can't do much with the packet;
+	 * Check if we know about an IS.
+	 */
+	} else if (iso_systype != SNPA_IS && known_is != 0 &&
+				(sc = (struct llinfo_llc *)known_is->rt_llinfo) &&
+				(sc->lc_flags & SNPA_VALID)) {
+		/* use the link-level address stored as the known IS's gateway */
+		register struct sockaddr_dl *sdl =
+			(struct sockaddr_dl *)(known_is->rt_gateway);
+		found_snpa = LLADDR(sdl);
+		addrlen = sdl->sdl_alen;
+	} else if (ifp->if_flags & IFF_BROADCAST) {
+		/*
+		 * no IS, no match. Return "all es" multicast address for this
+		 * interface, as per Query Configuration Function (9542 sec 6.5)
+		 *
+		 * Note: there is a potential problem here. If the destination
+		 * is on the subnet and it does not respond with a ESH, but
+		 * does send back a TP CC, a connection could be established
+		 * where we always transmit the CLNP packet to "all es"
+		 */
+		addrlen = ifp->if_addrlen;
+		found_snpa = (caddr_t)all_es_snpa;
+	} else
+		return (ENETUNREACH);
+	/*
+	 * NOTE(review): addrlen is not checked against the size of the
+	 * caller's snpa buffer (the interface carries no capacity) -- confirm
+	 * that callers provide enough room.
+	 */
+	bcopy(found_snpa, snpa, *snpa_len = addrlen);
+	return (0);
+}
+
+
+/*
+ * FUNCTION: snpac_free
+ *
+ * PURPOSE: free an entry in the iso address map table
+ *
+ * RETURNS: nothing
+ *
+ * SIDE EFFECTS:
+ *
+ * NOTES: If there is a route entry associated with cache
+ *        entry, then delete that as well
+ */
+snpac_free(lc)
+register struct llinfo_llc *lc; /* entry to free */
+{
+	register struct rtentry *rt = lc->lc_rt;
+	register struct iso_addr *r;
+
+	/* forget the default IS if it is the route being dropped */
+	if (known_is == rt)
+		known_is = 0;
+	/* only routes that are up and dynamic/modified are deleted here */
+	if (rt && (rt->rt_flags & RTF_UP) &&
+		(rt->rt_flags & (RTF_DYNAMIC | RTF_MODIFIED))) {
+		/*
+		 * NOTE(review): RTFREE is applied both before and after the
+		 * RTM_DELETE request -- confirm the reference counting is
+		 * intended.
+		 */
+		RTFREE(rt);
+		rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt),
+					rt->rt_flags, (struct rtentry **)0);
+		RTFREE(rt);
+	}
+}
+
+/*
+ * FUNCTION: snpac_add
+ *
+ * PURPOSE: Add an entry to the snpa cache
+ *
+ * RETURNS:
+ *
+ * SIDE EFFECTS:
+ *
+ * NOTES: If entry already exists, then update holding time.
+ */ +snpac_add(ifp, nsap, snpa, type, ht, nsellength) +struct ifnet *ifp; /* interface info is related to */ +struct iso_addr *nsap; /* nsap to add */ +caddr_t snpa; /* translation */ +char type; /* SNPA_IS or SNPA_ES */ +u_short ht; /* holding time (in seconds) */ +int nsellength; /* nsaps may differ only in trailing bytes */ +{ + register struct llinfo_llc *lc; + register struct rtentry *rt; + struct rtentry *mrt = 0; + register struct iso_addr *r; /* for zap_isoaddr macro */ + int snpalen = min(ifp->if_addrlen, MAX_SNPALEN); + int new_entry = 0, index = ifp->if_index, iftype = ifp->if_type; + + IFDEBUG(D_SNPA) + printf("snpac_add(%x, %x, %x, %x, %x, %x)\n", + ifp, nsap, snpa, type, ht, nsellength); + ENDDEBUG + zap_isoaddr(dst, nsap); + rt = rtalloc1(S(dst), 0); + IFDEBUG(D_SNPA) + printf("snpac_add: rtalloc1 returns %x\n", rt); + ENDDEBUG + if (rt == 0) { + struct sockaddr *netmask; + int flags; + add: + if (nsellength) { + netmask = S(msk); flags = RTF_UP; + snpac_fixdstandmask(nsellength); + } else { + netmask = 0; flags = RTF_UP | RTF_HOST; + } + new_entry = 1; + zap_linkaddr((>e_dl), snpa, snpalen, index); + gte_dl.sdl_type = iftype; + if (rtrequest(RTM_ADD, S(dst), S(gte_dl), netmask, flags, &mrt) || + mrt == 0) + return (0); + rt = mrt; + rt->rt_refcnt--; + } else { + register struct sockaddr_dl *sdl = (struct sockaddr_dl *)rt->rt_gateway; + rt->rt_refcnt--; + if ((rt->rt_flags & RTF_LLINFO) == 0) + goto add; + if (nsellength && (rt->rt_flags & RTF_HOST)) { + if (rt->rt_refcnt == 0) { + rtrequest(RTM_DELETE, S(dst), (struct sockaddr *)0, + (struct sockaddr *)0, 0, (struct rtentry *)0); + rt = 0; + goto add; + } else { + static struct iso_addr nsap2; register char *cp; + nsap2 = *nsap; + cp = nsap2.isoa_genaddr + nsap->isoa_len - nsellength; + while (cp < (char *)(1 + &nsap2)) + *cp++ = 0; + (void) snpac_add(ifp, &nsap2, snpa, type, ht, nsellength); + } + } + if (sdl->sdl_family != AF_LINK || sdl->sdl_alen == 0) { + int old_sdl_len = sdl->sdl_len; + if 
(old_sdl_len < sizeof(*sdl)) { + log(LOG_DEBUG, "snpac_add: cant make room for lladdr\n"); + return (0); + } + zap_linkaddr(sdl, snpa, snpalen, index); + sdl->sdl_len = old_sdl_len; + sdl->sdl_type = iftype; + new_entry = 1; + } + } + if ((lc = (struct llinfo_llc *)rt->rt_llinfo) == 0) + panic("snpac_rtrequest"); + rt->rt_rmx.rmx_expire = ht + time.tv_sec; + lc->lc_flags = SNPA_VALID | type; + if ((type & SNPA_IS) && !(iso_systype & SNPA_IS)) + snpac_logdefis(rt); + return (new_entry); +} + +static void +snpac_fixdstandmask(nsellength) +{ + register char *cp = msk.siso_data, *cplim; + + cplim = cp + (dst.siso_nlen -= nsellength); + msk.siso_len = cplim - (char *)&msk; + msk.siso_nlen = 0; + while (cp < cplim) + *cp++ = -1; + while (cp < (char *)msk.siso_pad) + *cp++ = 0; + for (cp = dst.siso_data + dst.siso_nlen; cp < (char *)dst.siso_pad; ) + *cp++ = 0; +} + +/* + * FUNCTION: snpac_ioctl + * + * PURPOSE: Set/Get the system type and esis parameters + * + * RETURNS: 0 on success, or unix error code + * + * SIDE EFFECTS: + * + * NOTES: + */ +snpac_ioctl (so, cmd, data) +struct socket *so; +int cmd; /* ioctl to process */ +caddr_t data; /* data for the cmd */ +{ + register struct systype_req *rq = (struct systype_req *)data; + + IFDEBUG(D_IOCTL) + if (cmd == SIOCSSTYPE) + printf("snpac_ioctl: cmd set, type x%x, ht %d, ct %d\n", + rq->sr_type, rq->sr_holdt, rq->sr_configt); + else + printf("snpac_ioctl: cmd get\n"); + ENDDEBUG + + if (cmd == SIOCSSTYPE) { + if ((so->so_state & SS_PRIV) == 0) + return (EPERM); + if ((rq->sr_type & (SNPA_ES|SNPA_IS)) == (SNPA_ES|SNPA_IS)) + return(EINVAL); + if (rq->sr_type & SNPA_ES) { + iso_systype = SNPA_ES; + } else if (rq->sr_type & SNPA_IS) { + iso_systype = SNPA_IS; + } else { + return(EINVAL); + } + esis_holding_time = rq->sr_holdt; + esis_config_time = rq->sr_configt; + if (esis_esconfig_time != rq->sr_esconfigt) { + untimeout(esis_config, (caddr_t)0); + esis_esconfig_time = rq->sr_esconfigt; + esis_config(); + } + } else if 
(cmd == SIOCGSTYPE) { + rq->sr_type = iso_systype; + rq->sr_holdt = esis_holding_time; + rq->sr_configt = esis_config_time; + rq->sr_esconfigt = esis_esconfig_time; + } else { + return (EINVAL); + } + return (0); +} + +/* + * FUNCTION: snpac_logdefis + * + * PURPOSE: Mark the IS passed as the default IS + * + * RETURNS: nothing + * + * SIDE EFFECTS: + * + * NOTES: + */ +snpac_logdefis(sc) +register struct rtentry *sc; +{ + register struct iso_addr *r; + register struct sockaddr_dl *sdl = (struct sockaddr_dl *)sc->rt_gateway; + register struct rtentry *rt; + + if (known_is == sc || !(sc->rt_flags & RTF_HOST)) + return; + if (known_is) { + RTFREE(known_is); + } + known_is = sc; + sc->rt_refcnt++; + rt = rtalloc1((struct sockaddr *)&zsi, 0); + if (rt == 0) + rtrequest(RTM_ADD, S(zsi), rt_key(sc), S(zmk), + RTF_DYNAMIC|RTF_GATEWAY, 0); + else { + if ((rt->rt_flags & RTF_DYNAMIC) && + (rt->rt_flags & RTF_GATEWAY) && rt_mask(rt)->sa_len == 0) + rt_setgate(rt, rt_key(rt), rt_key(sc)); + } +} + +/* + * FUNCTION: snpac_age + * + * PURPOSE: Time out snpac entries + * + * RETURNS: + * + * SIDE EFFECTS: + * + * NOTES: When encountering an entry for the first time, snpac_age + * may delete up to SNPAC_AGE too many seconds. Ie. + * if the entry is added a moment before snpac_age is + * called, the entry will immediately have SNPAC_AGE + * seconds taken off the holding time, even though + * it has only been held a brief moment. + * + * The proper way to do this is set an expiry timeval + * equal to current time + holding time. Then snpac_age + * would time out entries where expiry date is older + * than the current time. 
+ */ +void +snpac_age() +{ + register struct llinfo_llc *lc, *nlc; + register struct rtentry *rt; + + timeout(snpac_age, (caddr_t)0, SNPAC_AGE * hz); + + for (lc = llinfo_llc.lc_next; lc != & llinfo_llc; lc = nlc) { + nlc = lc->lc_next; + if (lc->lc_flags & SNPA_VALID) { + rt = lc->lc_rt; + if (rt->rt_rmx.rmx_expire && rt->rt_rmx.rmx_expire < time.tv_sec) + snpac_free(lc); + } + } +} + +/* + * FUNCTION: snpac_ownmulti + * + * PURPOSE: Determine if the snpa address is a multicast address + * of the same type as the system. + * + * RETURNS: true or false + * + * SIDE EFFECTS: + * + * NOTES: Used by interface drivers when not in eavesdrop mode + * as interm kludge until + * real multicast addresses can be configured + */ +snpac_ownmulti(snpa, len) +caddr_t snpa; +u_int len; +{ + return (((iso_systype & SNPA_ES) && + (!bcmp(snpa, (caddr_t)all_es_snpa, len))) || + ((iso_systype & SNPA_IS) && + (!bcmp(snpa, (caddr_t)all_is_snpa, len)))); +} + +/* + * FUNCTION: snpac_flushifp + * + * PURPOSE: Flush entries associated with specific ifp + * + * RETURNS: nothing + * + * SIDE EFFECTS: + * + * NOTES: + */ +snpac_flushifp(ifp) +struct ifnet *ifp; +{ + register struct llinfo_llc *lc; + + for (lc = llinfo_llc.lc_next; lc != & llinfo_llc; lc = lc->lc_next) { + if (lc->lc_rt->rt_ifp == ifp && (lc->lc_flags & SNPA_VALID)) + snpac_free(lc); + } +} + +/* + * FUNCTION: snpac_rtrequest + * + * PURPOSE: Make a routing request + * + * RETURNS: nothing + * + * SIDE EFFECTS: + * + * NOTES: In the future, this should make a request of a user + * level routing daemon. 
+ */
+snpac_rtrequest(req, host, gateway, netmask, flags, ret_nrt)
+int req;
+struct iso_addr *host;
+struct iso_addr *gateway;
+struct iso_addr *netmask;
+short flags;
+struct rtentry **ret_nrt;
+{
+	register struct iso_addr *r; /* used by the zap_isoaddr macro */
+
+	IFDEBUG(D_SNPA)
+		printf("snpac_rtrequest: ");
+		if (req == RTM_ADD)
+			printf("add");
+		else if (req == RTM_DELETE)
+			printf("delete");
+		else
+			printf("unknown command");
+		printf(" dst: %s\n", clnp_iso_addrp(host));
+		printf("\tgateway: %s\n", clnp_iso_addrp(gateway));
+	ENDDEBUG
+
+
+	/* wrap the bare iso addresses in the file-scope scratch sockaddrs */
+	zap_isoaddr(dst, host);
+	zap_isoaddr(gte, gateway);
+	if (netmask) {
+		zap_isoaddr(msk, netmask);
+		/* masks carry no address length; sockaddr length runs up to siso_pad */
+		msk.siso_nlen = 0;
+		msk.siso_len = msk.siso_pad - (u_char *)&msk;
+	}
+
+	/* the rtrequest() result is discarded; this routine reports nothing */
+	rtrequest(req, S(dst), S(gte), (netmask ? S(msk) : (struct sockaddr *)0),
+		flags, ret_nrt);
+}
+
+/*
+ * FUNCTION: snpac_addrt
+ *
+ * PURPOSE: Associate a routing entry with an snpac entry
+ *
+ * RETURNS: nothing
+ *
+ * SIDE EFFECTS:
+ *
+ * NOTES: If a cache entry exists for gateway, then
+ *        make a routing entry (host, gateway) and associate
+ *        with gateway.
+ *
+ *        If a route already exists and is different, first delete
+ *        it.
+ *
+ *        This could be made more efficient by checking
+ *        the existing route before adding a new one.
+ *
+ *        NOTE(review): the body below only issues an rtredirect();
+ *        the cache lookup and delete described above are not visible
+ *        here -- confirm whether rtredirect() covers them.  The ifp
+ *        argument is not used.
+ */
+snpac_addrt(ifp, host, gateway, netmask)
+struct ifnet *ifp;
+struct iso_addr *host, *gateway, *netmask;
+{
+	register struct iso_addr *r; /* used by the zap_isoaddr macro */
+
+	zap_isoaddr(dst, host);
+	zap_isoaddr(gte, gateway);
+	if (netmask) {
+		zap_isoaddr(msk, netmask);
+		msk.siso_nlen = 0;
+		msk.siso_len = msk.siso_pad - (u_char *)&msk;
+		rtredirect(S(dst), S(gte), S(msk), RTF_DONE, S(gte), 0);
+	} else
+		/* no mask: redirect as a host route */
+		rtredirect(S(dst), S(gte), (struct sockaddr *)0,
+						RTF_DONE | RTF_HOST, S(gte), 0);
+}
+#endif /* ISO */
diff --git a/sys/netiso/iso_snpac.h b/sys/netiso/iso_snpac.h
new file mode 100644
index 00000000000..105e8dd11d6
--- /dev/null
+++ b/sys/netiso/iso_snpac.h
@@ -0,0 +1,112 @@
+/*-
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California.
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)iso_snpac.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+
+******************************************************************/
+
+/*
+ * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison
+ */
+
+#define MAX_SNPALEN 8 /* curiously equal to sizeof x.121 (
+	plus 1 for nibble len) addr */
+/* One NSAP-to-SNPA translation, as exchanged with user level. */
+struct snpa_req {
+	struct iso_addr sr_isoa; /* nsap address */
+	u_char sr_len; /* length of snpa */
+	u_char sr_snpa[MAX_SNPALEN]; /* snpa associated
+		with nsap address */
+	u_char sr_flags; /* true if entry is valid */
+	u_short sr_ht; /* holding time */
+};
+
+/* Flag bits kept in lc_flags (er_flags) and used for the system type. */
+#define SNPA_VALID 0x01
+#define SNPA_ES 0x02
+#define SNPA_IS 0x04
+#define SNPA_PERM 0x10
+
+/* Argument of the SIOCSSTYPE / SIOCGSTYPE ioctls defined below. */
+struct systype_req {
+	short sr_holdt; /* holding timer */
+	short sr_configt; /* configuration timer */
+	short sr_esconfigt; /* suggested ES configuration timer */
+	char sr_type; /* SNPA_ES or SNPA_IS */
+};
+
+struct esis_req {
+	short er_ht; /* holding time */
+	u_char er_flags; /* type and validity */
+};
+/*
+ * Space for this structure gets added onto the end of a route
+ * going to an ethernet or other 802.[45x] device.
+ */
+
+struct llinfo_llc {
+	struct llinfo_llc *lc_next; /* keep all llc routes linked */
+	struct llinfo_llc *lc_prev; /* keep all llc routes linked */
+	struct rtentry *lc_rt; /* backpointer to route */
+	struct esis_req lc_er; /* holding time, etc */
+#define lc_ht lc_er.er_ht
+#define lc_flags lc_er.er_flags
+};
+
+
+/* ISO arp IOCTL data structures */
+
+#define SIOCSSTYPE _IOW('a', 39, struct systype_req) /* set system type */
+#define SIOCGSTYPE _IOR('a', 40, struct systype_req) /* get system type */
+
+#ifdef KERNEL
+/*
+ * NOTE(review): this is an object definition in a header, so every
+ * kernel file that includes it gets a tentative definition -- confirm
+ * the toolchain merges them.
+ */
+struct llinfo_llc llinfo_llc; /* head for linked lists */
+#endif /* KERNEL */
diff --git a/sys/netiso/iso_var.h b/sys/netiso/iso_var.h
new file mode 100644
index 00000000000..946aeea93fe
--- /dev/null
+++ b/sys/netiso/iso_var.h
@@ -0,0 +1,137 @@
+/*-
+ * Copyright (c) 1988, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)iso_var.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* $Header: iso_var.h,v 4.2 88/06/29 15:00:08 hagens Exp $ + * $Source: /usr/argo/sys/netiso/RCS/iso_var.h,v $ + */ + +/* + * Interface address, iso version. One of these structures is + * allocated for each interface with an osi address. The ifaddr + * structure conatins the protocol-independent part + * of the structure, and is assumed to be first. 
+ */
+struct iso_ifaddr {
+	struct ifaddr ia_ifa; /* protocol-independent info */
+#define ia_ifp ia_ifa.ifa_ifp
+#define ia_flags ia_ifa.ifa_flags
+	int ia_snpaoffset;
+	struct iso_ifaddr *ia_next; /* next in list of iso addresses */
+	struct sockaddr_iso ia_addr; /* reserve space for interface name */
+	struct sockaddr_iso ia_dstaddr; /* reserve space for broadcast addr */
+#define ia_broadaddr ia_dstaddr
+	struct sockaddr_iso ia_sockmask; /* reserve space for general netmask */
+};
+
+/* Argument of SIOCAIFADDR_ISO (add/change an interface alias). */
+struct iso_aliasreq {
+	char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */
+	struct sockaddr_iso ifra_addr;
+	struct sockaddr_iso ifra_dstaddr;
+	struct sockaddr_iso ifra_mask;
+	int ifra_snpaoffset;
+};
+
+/* Argument of the remaining *_ISO interface ioctls below. */
+struct iso_ifreq {
+	char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */
+	struct sockaddr_iso ifr_Addr;
+};
+
+/*
+ * Given a pointer to an iso_ifaddr (ifaddr),
+ * return a pointer to the addr as a sockaddr_iso
+ */
+/*
+#define IA_SIS(ia) ((struct sockaddr_iso *)(ia.ia_ifa->ifa_addr))
+ * works if sockaddr_iso becomes variable sized.
+ */
+#define IA_SIS(ia) (&(((struct iso_ifaddr *)ia)->ia_addr))
+
+#define SIOCDIFADDR_ISO _IOW('i',25, struct iso_ifreq) /* delete IF addr */
+#define SIOCAIFADDR_ISO _IOW('i',26, struct iso_aliasreq)/* add/chg IFalias */
+#define SIOCGIFADDR_ISO _IOWR('i',33, struct iso_ifreq) /* get ifnet address */
+#define SIOCGIFDSTADDR_ISO _IOWR('i',34, struct iso_ifreq) /* get dst address */
+#define SIOCGIFNETMASK_ISO _IOWR('i',37, struct iso_ifreq) /* get netmask */
+
+/*
+ * This stuff should go in if.h or if_llc.h or someplace else,
+ * but for now . . .
+ */
+
+/* Ethernet header immediately followed by the three LLC bytes. */
+struct llc_etherhdr {
+	char dst[6];
+	char src[6];
+	char len[2];
+	char llc_dsap;
+	char llc_ssap;
+	char llc_ui_byte;
+};
+
+struct snpa_hdr {
+	struct ifnet *snh_ifp;
+	char snh_dhost[6];
+	char snh_shost[6];
+	short snh_flags;
+};
+#ifdef KERNEL
+struct iso_ifaddr *iso_ifaddr; /* linked list of iso address ifaces */
+struct iso_ifaddr *iso_localifa(); /* function returning an iso_ifaddr; defined elsewhere */
+struct ifqueue clnlintrq; /* clnl packet input queue */
+#endif /* KERNEL */
diff --git a/sys/netiso/tp.trans b/sys/netiso/tp.trans
new file mode 100644
index 00000000000..edefc769b81
--- /dev/null
+++ b/sys/netiso/tp.trans
@@ -0,0 +1,1342 @@
+/* NEW */
+/*-
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tp.trans 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* $Header: tp.trans,v 5.1 88/10/12 12:22:07 root Exp $ + * + * Transition file for TP. + * + * DO NOT: + * - change the order of any of the events or states. to do so will + * make tppt, netstat, etc. cease working. 
+ * + * NOTE: + * some hooks exist for data on (dis)connect, but it's ***NOT***SUPPORTED*** + * (read: may not work!) + * + * I tried to put everything that causes a change of state in here, hence + * there are some seemingly trivial events like T_DETACH and T_LISTEN_req. + * + * Almost everything having to do w/ setting & cancelling timers is here + * but once it was debugged, I moved the setting of the + * keepalive (sendack) timer to tp_emit(), where an AK_TPDU is sent. + * This is so the code wouldn't be duplicated all over creation in here. + * + */ +*PROTOCOL tp + +*INCLUDE +{ +/* @(#)tp.trans 8.1 (Berkeley) 6/10/93 */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRIVERTRACE TPPTdriver +#define sbwakeup(sb) sowakeup(p->tp_sock, sb) /* no trailing ';' here: every caller supplies its own */ +#define MCPY(d, w) ((d) ? m_copym((d), 0, (int)M_COPYALL, (w)) : 0) /* copy of chain d, or 0 when d is null */ + +static int trick_hc = 1; /* explicit int; always nonzero, tested before the early returns in the actions below */ + +int tp_emit(), + tp_goodack(), tp_goodXack(), + tp_stash() +; +void tp_indicate(), tp_getoptions(), + tp_soisdisconnecting(), tp_soisdisconnected(), + tp_recycle_tsuffix(), +#ifdef TP_DEBUG_TIMERS + tp_etimeout(), tp_euntimeout(), + tp_ctimeout(), tp_cuntimeout(), + tp_ctimeout_MIN(), +#endif + tp_freeref(), tp_detach(), + tp0_stash(), tp0_send(), + tp_netcmd(), tp_send() +; + +typedef struct tp_pcb tpcb_struct; + + +} + +*PCB tpcb_struct SYNONYM P + +*STATES + +TP_CLOSED +TP_CRSENT +TP_AKWAIT +TP_OPEN +TP_CLOSING +TP_REFWAIT +TP_LISTENING /* Local to this implementation */ +TP_CONFIRMING /* Local to this implementation */ + +*EVENTS { struct timeval e_time; } SYNONYM E + + /* + * C (typically cancelled) timers - + * + * let these be the first ones so for the sake of convenience + * their values are 0--> n-1 + * DO NOT CHANGE THE ORDER OF THESE TIMER EVENTS!! 
+ */ + TM_inact + TM_retrans + /* TM_retrans is used for all + * simple retransmissions - CR,CC,XPD,DR + */ + + TM_sendack + /* TM_sendack does dual duty - keepalive AND closed-window + * Probes. + * It's set w/ keepalive-ticks every time an ack is sent. + * (this is done in (void) tp_emit() ). + * Whenever a DT arrives which doesn't require immediate acking, + * a separate fast-timeout flag is set ensuring 200ms response. + */ + TM_notused + + /* + * E (typically expired) timers - these may be in any order. + * These cause procedures to be executed directly; may not + * cause an 'event' as we know them here. + */ + TM_reference { SeqNum e_low; SeqNum e_high; int e_retrans; } + TM_data_retrans { SeqNum e_low; SeqNum e_high; int e_retrans; } + +/* NOTE: in tp_input is a minor optimization that assumes that + * for all tpdu types that can take e_data and e_datalen, these + * fields fall in the same place in the event structure, that is, + * e_data is the first field and e_datalen is the 2nd field. 
+ */ + + ER_TPDU { + u_char e_reason; + } + CR_TPDU { struct mbuf *e_data; /* first field */ + int e_datalen; /* 2nd field */ + u_int e_cdt; + } + DR_TPDU { struct mbuf *e_data; /* first field */ + int e_datalen; /* 2nd field */ + u_short e_sref; + u_char e_reason; + } + DC_TPDU + CC_TPDU { struct mbuf *e_data; /* first field */ + int e_datalen; /* 2nd field */ + u_short e_sref; + u_int e_cdt; + } + AK_TPDU { u_int e_cdt; + SeqNum e_seq; + SeqNum e_subseq; + u_char e_fcc_present; + } + DT_TPDU { struct mbuf *e_data; /* first field */ + int e_datalen; /* 2nd field */ + u_int e_eot; + SeqNum e_seq; + } + XPD_TPDU { struct mbuf *e_data; /* first field */ + int e_datalen; /* 2nd field */ + SeqNum e_seq; + } + XAK_TPDU { SeqNum e_seq; } + + T_CONN_req + T_DISC_req { u_char e_reason; } + T_LISTEN_req + T_DATA_req + T_XPD_req + T_USR_rcvd + T_USR_Xrcvd + T_DETACH + T_NETRESET + T_ACPT_req + + +*TRANSITIONS + + +/* TP_AKWAIT doesn't exist in TP 0 */ +SAME <== TP_AKWAIT [ CC_TPDU, DC_TPDU, XAK_TPDU ] + DEFAULT + NULLACTION +; + + +/* applicable in TP4, TP0 */ +SAME <== TP_REFWAIT DR_TPDU + ( $$.e_sref != 0 ) + { + (void) tp_emit(DC_TPDU_type, $P, 0, 0, MNULL); + } +; + +/* applicable in TP4, TP0 */ +SAME <== TP_REFWAIT [ CR_TPDU, CC_TPDU, DT_TPDU, + DR_TPDU, XPD_TPDU, AK_TPDU, XAK_TPDU, DC_TPDU, ER_TPDU ] + DEFAULT + { +# ifdef TP_DEBUG + if( $E.ev_number != AK_TPDU ) + printf("TPDU 0x%x in REFWAIT!!!!\n", $E.ev_number); +# endif /* TP_DEBUG */ + } +; + +/* applicable in TP4, TP0 */ +SAME <== TP_REFWAIT [ T_DETACH, T_DISC_req ] + DEFAULT + NULLACTION +; + +/* applicable in TP4, TP0 */ +SAME <== TP_CRSENT AK_TPDU + ($P.tp_class == TP_CLASS_0) + { + /* oh, man is this grotesque or what? 
*/ + (void) tp_goodack($P, $$.e_cdt, $$.e_seq, $$.e_subseq); + /* but it's necessary because this pseudo-ack may happen + * before the CC arrives, but we HAVE to adjust the + * snduna as a result of the ack, WHENEVER it arrives + */ + } +; + +/* applicable in TP4, TP0 */ +SAME <== TP_CRSENT + [ CR_TPDU, DC_TPDU, DT_TPDU, XPD_TPDU, XAK_TPDU ] + DEFAULT + NULLACTION +; + +/* applicable in TP4, TP0 */ +SAME <== TP_CLOSED [ DT_TPDU, XPD_TPDU, + ER_TPDU, DC_TPDU, AK_TPDU, XAK_TPDU ] + DEFAULT + NULLACTION +; + +/* TP_CLOSING doesn't exist in TP 0 */ +SAME <== TP_CLOSING + [ CC_TPDU, CR_TPDU, DT_TPDU, XPD_TPDU, AK_TPDU, XAK_TPDU ] + DEFAULT + NULLACTION +; + + +/* DC_TPDU doesn't exist in TP 0 */ +SAME <== TP_OPEN DC_TPDU + DEFAULT + NULLACTION +; + +/* applicable in TP4, TP0 */ +SAME <== TP_LISTENING [DR_TPDU, CC_TPDU, DT_TPDU, XPD_TPDU, + ER_TPDU, DC_TPDU, AK_TPDU, XAK_TPDU ] + DEFAULT + NULLACTION +; + +/* applicable in TP4, TP0 */ +TP_LISTENING <== TP_CLOSED T_LISTEN_req + DEFAULT + NULLACTION +; + +/* applicable in TP4, TP0 */ +TP_CLOSED <== [ TP_LISTENING, TP_CLOSED ] T_DETACH + DEFAULT + { + tp_detach($P); + } +; + +TP_CONFIRMING <== TP_LISTENING CR_TPDU + ( $P.tp_class == TP_CLASS_0) + { + $P.tp_refstate = REF_OPEN; /* has timers ??? 
*/ + } +; + +TP_CONFIRMING <== TP_LISTENING CR_TPDU + DEFAULT + { + IFTRACE(D_CONN) + tptrace(TPPTmisc, "CR datalen data", $$.e_datalen, $$.e_data,0,0); + ENDTRACE + IFDEBUG(D_CONN) + printf("CR datalen 0x%x data 0x%x", $$.e_datalen, $$.e_data); + ENDDEBUG + $P.tp_refstate = REF_OPEN; /* has timers */ + $P.tp_fcredit = $$.e_cdt; + + if ($$.e_datalen > 0) { + /* n/a for class 0 */ + ASSERT($P.tp_Xrcv.sb_cc == 0); + sbappendrecord(&$P.tp_Xrcv, $$.e_data); + $$.e_data = MNULL; + } + } +; + +TP_OPEN <== TP_CONFIRMING T_ACPT_req + ( $P.tp_class == TP_CLASS_0 ) + { + IncStat(ts_tp0_conn); + IFTRACE(D_CONN) + tptrace(TPPTmisc, "Confiming", $P, 0,0,0); + ENDTRACE + IFDEBUG(D_CONN) + printf("Confirming connection: $P" ); + ENDDEBUG + soisconnected($P.tp_sock); + (void) tp_emit(CC_TPDU_type, $P, 0,0, MNULL) ; + $P.tp_fcredit = 1; + } +; + +TP_AKWAIT <== TP_CONFIRMING T_ACPT_req + (tp_emit(CC_TPDU_type, $P, 0,0, MCPY($P.tp_ucddata, M_NOWAIT)) == 0) + { + IncStat(ts_tp4_conn); /* even though not quite open */ + IFTRACE(D_CONN) + tptrace(TPPTmisc, "Confiming", $P, 0,0,0); + ENDTRACE + IFDEBUG(D_CONN) + printf("Confirming connection: $P" ); + ENDDEBUG + tp_getoptions($P); + soisconnecting($P.tp_sock); + if (($P.tp_rx_strat & TPRX_FASTSTART) && ($P.tp_fcredit > 0)) + $P.tp_cong_win = $P.tp_fcredit * $P.tp_l_tpdusize; + $P.tp_retrans = $P.tp_Nretrans; + tp_ctimeout($P, TM_retrans, (int)$P.tp_cc_ticks); + } +; + +/* TP4 only */ +TP_CLOSED <== TP_CONFIRMING T_ACPT_req + DEFAULT /* emit failed */ + { + IFDEBUG(D_CONN) + printf("event: CR_TPDU emit CC failed done " ); + ENDDEBUG + soisdisconnected($P.tp_sock); + tp_recycle_tsuffix($P); + tp_freeref($P.tp_lref); + tp_detach($P); + } +; + +/* applicable in TP4, TP0 */ +TP_CRSENT <== TP_CLOSED T_CONN_req + DEFAULT + { + int error; + struct mbuf *data = MNULL; + + IFTRACE(D_CONN) + tptrace(TPPTmisc, "T_CONN_req flags ucddata", (int)$P.tp_flags, + $P.tp_ucddata, 0, 0); + ENDTRACE + data = MCPY($P.tp_ucddata, M_WAIT); + if (data) { + 
IFDEBUG(D_CONN) + printf("T_CONN_req.trans m_copy cc 0x%x\n", + $P.tp_ucddata); + dump_mbuf(data, "sosnd @ T_CONN_req"); + ENDDEBUG + } + + if (error = tp_emit(CR_TPDU_type, $P, 0, 0, data) ) + return error; /* driver WON'T change state; will return error */ + + $P.tp_refstate = REF_OPEN; /* has timers */ + if($P.tp_class != TP_CLASS_0) { + $P.tp_retrans = $P.tp_Nretrans; + tp_ctimeout($P, TM_retrans, (int)$P.tp_cr_ticks); + } + } +; + +/* applicable in TP4, TP0, but state TP_AKWAIT doesn't apply to TP0 */ +TP_REFWAIT <== [ TP_CRSENT, TP_AKWAIT, TP_OPEN ] DR_TPDU + DEFAULT + { + sbflush(&$P.tp_Xrcv); /* purge non-delivered data data */ + if ($$.e_datalen > 0) { + sbappendrecord(&$P.tp_Xrcv, $$.e_data); + $$.e_data = MNULL; + } + if ($P.tp_state == TP_OPEN) + tp_indicate(T_DISCONNECT, $P, 0); + else { + int so_error = ECONNREFUSED; + if ($$.e_reason != (E_TP_NO_SESSION ^ TP_ERROR_MASK) && + $$.e_reason != (E_TP_NO_CR_ON_NC ^ TP_ERROR_MASK) && + $$.e_reason != (E_TP_REF_OVERFLOW ^ TP_ERROR_MASK)) + so_error = ECONNABORTED; + tp_indicate(T_DISCONNECT, $P, so_error); + } + tp_soisdisconnected($P); + if ($P.tp_class != TP_CLASS_0) { + if ($P.tp_state == TP_OPEN ) { + tp_euntimeout($P, TM_data_retrans); /* all */ + tp_cuntimeout($P, TM_retrans); + tp_cuntimeout($P, TM_inact); + tp_cuntimeout($P, TM_sendack); + $P.tp_flags &= ~TPF_DELACK; + } + tp_cuntimeout($P, TM_retrans); + if( $$.e_sref != 0 ) + (void) tp_emit(DC_TPDU_type, $P, 0, 0, MNULL); + } + } +; + +SAME <== TP_CLOSED DR_TPDU + DEFAULT + { + if( $$.e_sref != 0 ) + (void) tp_emit(DC_TPDU_type, $P, 0, 0, MNULL); + /* reference timer already set - reset it to be safe (???) 
*/ + tp_euntimeout($P, TM_reference); /* all */ + tp_etimeout($P, TM_reference, (int)$P.tp_refer_ticks); + } +; + +/* NBS(34) */ +TP_REFWAIT <== TP_CRSENT ER_TPDU + DEFAULT + { + tp_cuntimeout($P, TM_retrans); + tp_indicate(ER_TPDU, $P, $$.e_reason); + tp_soisdisconnected($P); + } +; + +/* NBS(27) */ +TP_REFWAIT <== TP_CLOSING DR_TPDU + DEFAULT + { + tp_cuntimeout($P, TM_retrans); + tp_soisdisconnected($P); + } +; +/* these two transitions are the same but can't be combined because xebec + * can't handle the use of $$.e_reason if they're combined + */ +/* NBS(27) */ +TP_REFWAIT <== TP_CLOSING ER_TPDU + DEFAULT + { + tp_indicate(ER_TPDU, $P, $$.e_reason); + tp_cuntimeout($P, TM_retrans); + tp_soisdisconnected($P); + } +; +/* NBS(27) */ +TP_REFWAIT <== TP_CLOSING DC_TPDU + DEFAULT + { + tp_cuntimeout($P, TM_retrans); + tp_soisdisconnected($P); + } +; + +/* NBS(21) */ +SAME <== TP_CLOSED [ CC_TPDU, CR_TPDU ] + DEFAULT + { /* don't ask me why we have to do this - spec says so */ + (void) tp_emit(DR_TPDU_type, $P, 0, E_TP_NO_SESSION, MNULL); + /* don't bother with retransmissions of the DR */ + } +; + +/* NBS(34) */ +TP_REFWAIT <== TP_OPEN ER_TPDU + ($P.tp_class == TP_CLASS_0) + { + tp_soisdisconnecting($P.tp_sock); + tp_indicate(ER_TPDU, $P, $$.e_reason); + tp_soisdisconnected($P); + tp_netcmd( $P, CONN_CLOSE ); + } +; + +TP_CLOSING <== [ TP_AKWAIT, TP_OPEN ] ER_TPDU + DEFAULT + { + if ($P.tp_state == TP_OPEN) { + tp_euntimeout($P, TM_data_retrans); /* all */ + tp_cuntimeout($P, TM_inact); + tp_cuntimeout($P, TM_sendack); + } + tp_soisdisconnecting($P.tp_sock); + tp_indicate(ER_TPDU, $P, $$.e_reason); + $P.tp_retrans = $P.tp_Nretrans; + tp_ctimeout($P, TM_retrans, (int)$P.tp_dr_ticks); + (void) tp_emit(DR_TPDU_type, $P, 0, E_TP_PROTO_ERR, MNULL); + } +; +/* NBS(6) */ +TP_OPEN <== TP_CRSENT CC_TPDU + ($P.tp_class == TP_CLASS_0) + { + tp_cuntimeout($P, TM_retrans); + IncStat(ts_tp0_conn); + $P.tp_fcredit = 1; + soisconnected($P.tp_sock); + } +; + +TP_OPEN <== TP_CRSENT 
CC_TPDU + DEFAULT + { + IFDEBUG(D_CONN) + printf("trans: CC_TPDU in CRSENT state flags 0x%x\n", + (int)$P.tp_flags); + ENDDEBUG + IncStat(ts_tp4_conn); + $P.tp_fref = $$.e_sref; + $P.tp_fcredit = $$.e_cdt; + if (($P.tp_rx_strat & TPRX_FASTSTART) && ($$.e_cdt > 0)) + $P.tp_cong_win = $$.e_cdt * $P.tp_l_tpdusize; + tp_getoptions($P); + tp_cuntimeout($P, TM_retrans); + if ($P.tp_ucddata) { + IFDEBUG(D_CONN) + printf("dropping user connect data cc 0x%x\n", + $P.tp_ucddata->m_len); + ENDDEBUG + m_freem($P.tp_ucddata); + $P.tp_ucddata = 0; + } + soisconnected($P.tp_sock); + if ($$.e_datalen > 0) { + ASSERT($P.tp_Xrcv.sb_cc == 0); /* should be empty */ + sbappendrecord(&$P.tp_Xrcv, $$.e_data); + $$.e_data = MNULL; + } + + (void) tp_emit(AK_TPDU_type, $P, $P.tp_rcvnxt, 0, MNULL); + tp_ctimeout($P, TM_inact, (int)$P.tp_inact_ticks); + } +; + +/* TP4 only */ +SAME <== TP_CRSENT TM_retrans + ( $P.tp_retrans > 0 ) + { + struct mbuf *data = MNULL; + int error; + + IncStat(ts_retrans_cr); + $P.tp_cong_win = 1 * $P.tp_l_tpdusize; + data = MCPY($P.tp_ucddata, M_NOWAIT); + if($P.tp_ucddata) { + IFDEBUG(D_CONN) + printf("TM_retrans.trans m_copy cc 0x%x\n", data); + dump_mbuf($P.tp_ucddata, "sosnd @ TM_retrans"); + ENDDEBUG + if( data == MNULL ) + return ENOBUFS; + } + + $P.tp_retrans --; + if( error = tp_emit(CR_TPDU_type, $P, 0, 0, data) ) { + $P.tp_sock->so_error = error; + } + tp_ctimeout($P, TM_retrans, (int)$P.tp_cr_ticks); + } +; + +/* TP4 only */ +TP_REFWAIT <== TP_CRSENT TM_retrans + DEFAULT /* no more CR retransmissions */ + { + IncStat(ts_conn_gaveup); + $P.tp_sock->so_error = ETIMEDOUT; + tp_indicate(T_DISCONNECT, $P, ETIMEDOUT); + tp_soisdisconnected($P); + } +; + +/* TP4 only */ +SAME <== TP_AKWAIT CR_TPDU + DEFAULT + /* duplicate CR (which doesn't really exist in the context of + * a connectionless network layer) + * Doesn't occur in class 0. 
+ */ + { + int error; + struct mbuf *data = MCPY($P.tp_ucddata, M_WAIT); + + if( error = tp_emit(CC_TPDU_type, $P, 0, 0, data) ) { + $P.tp_sock->so_error = error; + } + $P.tp_retrans = $P.tp_Nretrans; + tp_ctimeout($P, TM_retrans, (int)$P.tp_cc_ticks); + } +; + +/* TP4 only */ +TP_OPEN <== TP_AKWAIT DT_TPDU + ( IN_RWINDOW( $P, $$.e_seq, + $P.tp_rcvnxt, SEQ($P, $P.tp_rcvnxt + $P.tp_lcredit)) ) + { + int doack; + + /* + * Get rid of any confirm or connect data, so that if we + * crash or close, it isn't thought of as disconnect data. + */ + if ($P.tp_ucddata) { + m_freem($P.tp_ucddata); + $P.tp_ucddata = 0; + } + tp_ctimeout($P, TM_inact, (int)$P.tp_inact_ticks); + tp_cuntimeout($P, TM_retrans); + soisconnected($P.tp_sock); + tp_ctimeout($P, TM_inact, (int)$P.tp_inact_ticks); + + /* see also next 2 transitions, if you make any changes */ + + doack = tp_stash($P, $E); + IFDEBUG(D_DATA) + printf("tp_stash returns %d\n",doack); + ENDDEBUG + + if (doack) { + (void) tp_emit(AK_TPDU_type, $P, $P.tp_rcvnxt, 0, MNULL ); + tp_ctimeout($P, TM_sendack, (int)$P.tp_keepalive_ticks); + } else + tp_ctimeout( $P, TM_sendack, (int)$P.tp_sendack_ticks); + + IFDEBUG(D_DATA) + printf("after stash calling sbwakeup\n"); + ENDDEBUG + } +; + +SAME <== TP_OPEN DT_TPDU + ( $P.tp_class == TP_CLASS_0 ) + { + tp0_stash($P, $E); + sbwakeup( &$P.tp_sock->so_rcv ); + + IFDEBUG(D_DATA) + printf("after stash calling sbwakeup\n"); + ENDDEBUG + } +; + +/* TP4 only */ +SAME <== TP_OPEN DT_TPDU + ( IN_RWINDOW( $P, $$.e_seq, + $P.tp_rcvnxt, SEQ($P, $P.tp_rcvnxt + $P.tp_lcredit)) ) + { + int doack; /* tells if we must ack immediately */ + + tp_ctimeout($P, TM_inact, (int)$P.tp_inact_ticks); + sbwakeup( &$P.tp_sock->so_rcv ); + + doack = tp_stash($P, $E); + IFDEBUG(D_DATA) + printf("tp_stash returns %d\n",doack); + ENDDEBUG + + if(doack) + (void) tp_emit(AK_TPDU_type, $P, $P.tp_rcvnxt, 0, MNULL ); + else + tp_ctimeout_MIN( $P, TM_sendack, (int)$P.tp_sendack_ticks); + + IFDEBUG(D_DATA) + printf("after stash 
calling sbwakeup\n"); + ENDDEBUG + } +; + +/* Not in window - we must ack under certain circumstances, namely + * a) if the seq number is below lwe but > lwe - (max credit ever given) + * (to handle lost acks) Can use max-possible-credit for this ^^^. + * and + * b) seq number is > uwe but < uwe + previously sent & withdrawn credit + * + * (see 12.2.3.8.1 of ISO spec, p. 73) + * We just always ack. + */ +/* TP4 only */ +SAME <== [ TP_OPEN, TP_AKWAIT ] DT_TPDU + DEFAULT /* Not in window */ + { + IFTRACE(D_DATA) + tptrace(TPPTmisc, "NIW seq rcvnxt lcredit ", + $$.e_seq, $P.tp_rcvnxt, $P.tp_lcredit, 0); + ENDTRACE + IncStat(ts_dt_niw); + m_freem($$.e_data); + tp_ctimeout($P, TM_inact, (int)$P.tp_inact_ticks); + (void) tp_emit(AK_TPDU_type, $P, $P.tp_rcvnxt, 0, MNULL ); + } +; + +/* TP4 only */ +TP_OPEN <== TP_AKWAIT AK_TPDU + DEFAULT + { + if ($P.tp_ucddata) { + m_freem($P.tp_ucddata); + $P.tp_ucddata = 0; + } + (void) tp_goodack($P, $$.e_cdt, $$.e_seq, $$.e_subseq); + tp_cuntimeout($P, TM_retrans); + + soisconnected($P.tp_sock); + IFTRACE(D_CONN) + struct socket *so = $P.tp_sock; + tptrace(TPPTmisc, + "called sosiconn: so so_state rcv.sb_sel rcv.sb_flags", + so, so->so_state, so->so_rcv.sb_sel, so->so_rcv.sb_flags); + tptrace(TPPTmisc, + "called sosiconn 2: so_qlen so_error so_rcv.sb_cc so_head", + so->so_qlen, so->so_error, so->so_rcv.sb_cc, so->so_head); + ENDTRACE + + tp_ctimeout($P, TM_sendack, (int)$P.tp_keepalive_ticks); + tp_ctimeout($P, TM_inact, (int)$P.tp_inact_ticks); + } +; + +/* TP4 only */ +TP_OPEN <== [ TP_OPEN, TP_AKWAIT ] XPD_TPDU + ($P.tp_Xrcvnxt == $$.e_seq) + { + if( $P.tp_state == TP_AKWAIT ) { + if ($P.tp_ucddata) { + m_freem($P.tp_ucddata); + $P.tp_ucddata = 0; + } + tp_cuntimeout($P, TM_retrans); + soisconnected($P.tp_sock); + tp_ctimeout($P, TM_sendack, (int)$P.tp_keepalive_ticks); + tp_ctimeout($P, TM_inact, (int)$P.tp_inact_ticks); + } + IFTRACE(D_XPD) + tptrace(TPPTmisc, "XPD tpdu accepted Xrcvnxt, e_seq datalen m_len\n", + 
$P.tp_Xrcvnxt,$$.e_seq, $$.e_datalen, $$.e_data->m_len); + ENDTRACE + + $P.tp_sock->so_state |= SS_RCVATMARK; + $$.e_data->m_flags |= M_EOR; + sbinsertoob(&$P.tp_Xrcv, $$.e_data); + IFDEBUG(D_XPD) + dump_mbuf($$.e_data, "XPD TPDU: tp_Xrcv"); + ENDDEBUG + tp_indicate(T_XDATA, $P, 0); + sbwakeup( &$P.tp_Xrcv ); + + (void) tp_emit(XAK_TPDU_type, $P, $P.tp_Xrcvnxt, 0, MNULL); + SEQ_INC($P, $P.tp_Xrcvnxt); + } +; + +/* TP4 only */ +SAME <== TP_OPEN T_USR_Xrcvd + DEFAULT + { + if( $P.tp_Xrcv.sb_cc == 0 ) { + /* kludge for select(): */ + /* $P.tp_sock->so_state &= ~SS_OOBAVAIL; */ + } + } + /* OLD WAY: + * Ack only after the user receives the XPD. This is better for + * users that use one XPD right after another. + * Acking right away (the NEW WAY, see the prev. transition) is + * better for occasional XPD, when the receiving user doesn't + * want to read the XPD immediately (which is session's behavior). + * + int error = tp_emit(XAK_TPDU_type, $P, $P.tp_Xrcvnxt, 0, MNULL); + SEQ_INC($P, $P.tp_Xrcvnxt); + return error; + */ +; + +/* NOTE: presently if the user doesn't read the connection data + * before an expedited data PDU comes in, the connection data will + * be dropped. This is a bug. To avoid it, we need somewhere else + * to put the connection data. + * On the other hand, we don't want it sitting around forever. + * This is a problem with the idea of trying to accommodate + * data on connect w/ a passive-open user interface. 
+ */ +/* TP4 only */ + +SAME <== [ TP_AKWAIT, TP_OPEN ] XPD_TPDU + DEFAULT /* not in window or cdt==0 */ + { + IFTRACE(D_XPD) + tptrace(TPPTmisc, "XPD tpdu niw (Xrcvnxt, e_seq) or not cdt (cc)\n", + $P.tp_Xrcvnxt, $$.e_seq, $P.tp_Xrcv.sb_cc , 0); + ENDTRACE + if( $P.tp_Xrcvnxt != $$.e_seq ) + IncStat(ts_xpd_niw); + if( $P.tp_Xrcv.sb_cc ) { + /* might as well kick 'em again */ + tp_indicate(T_XDATA, $P, 0); + IncStat(ts_xpd_dup); + } + m_freem($$.e_data); + tp_ctimeout($P, TM_inact, (int)$P.tp_inact_ticks); + /* don't send an xack because the xak gives "last one received", not + * "next one i expect" (dumb) + */ + } +; + +/* Occurs (AKWAIT, OPEN) when parent (listening) socket gets aborted, and tries + * to detach all its "children" + * Also (CRSENT) when user kills a job that's doing a connect() + */ +TP_REFWAIT <== TP_CRSENT T_DETACH + ($P.tp_class == TP_CLASS_0) + { + struct socket *so = $P.tp_sock; + + /* detach from parent socket so it can finish closing */ + if (so->so_head) { + if (!soqremque(so, 0) && !soqremque(so, 1)) + panic("tp: T_DETACH"); + so->so_head = 0; + } + tp_soisdisconnecting($P.tp_sock); + tp_netcmd( $P, CONN_CLOSE); + tp_soisdisconnected($P); + } +; + +/* TP4 only */ +TP_CLOSING <== [ TP_CLOSING, TP_AKWAIT, TP_CRSENT, TP_CONFIRMING ] T_DETACH + DEFAULT + { + struct socket *so = $P.tp_sock; + struct mbuf *data = MNULL; + + /* detach from parent socket so it can finish closing */ + if (so->so_head) { + if (!soqremque(so, 0) && !soqremque(so, 1)) + panic("tp: T_DETACH"); + so->so_head = 0; + } + if ($P.tp_state != TP_CLOSING) { + tp_soisdisconnecting($P.tp_sock); + data = MCPY($P.tp_ucddata, M_NOWAIT); + (void) tp_emit(DR_TPDU_type, $P, 0, E_TP_NORMAL_DISC, data); + $P.tp_retrans = $P.tp_Nretrans; + tp_ctimeout($P, TM_retrans, (int)$P.tp_dr_ticks); + } + } +; + +TP_REFWAIT <== [ TP_OPEN, TP_CRSENT ] T_DISC_req + ( $P.tp_class == TP_CLASS_0 ) + { + tp_soisdisconnecting($P.tp_sock); + tp_netcmd( $P, CONN_CLOSE); + tp_soisdisconnected($P); + } +; + 
+/* TP4 only */ +TP_CLOSING <== [ TP_AKWAIT, TP_OPEN, TP_CRSENT, TP_CONFIRMING ] T_DISC_req + DEFAULT + { + struct mbuf *data = MCPY($P.tp_ucddata, M_WAIT); + + if($P.tp_state == TP_OPEN) { + tp_euntimeout($P, TM_data_retrans); /* all */ + tp_cuntimeout($P, TM_inact); + tp_cuntimeout($P, TM_sendack); + $P.tp_flags &= ~TPF_DELACK; + } + if (data) { + IFDEBUG(D_CONN) + printf("T_DISC_req.trans tp_ucddata 0x%x\n", + $P.tp_ucddata); + dump_mbuf(data, "ucddata @ T_DISC_req"); + ENDDEBUG + } + tp_soisdisconnecting($P.tp_sock); + $P.tp_retrans = $P.tp_Nretrans; + tp_ctimeout($P, TM_retrans, (int)$P.tp_dr_ticks); + + if( trick_hc ) + return tp_emit(DR_TPDU_type, $P, 0, $$.e_reason, data); + } +; + +/* TP4 only */ +SAME <== TP_AKWAIT TM_retrans + ( $P.tp_retrans > 0 ) + { + int error; + struct mbuf *data = MCPY($P.tp_ucddata, M_WAIT); + + IncStat(ts_retrans_cc); + $P.tp_retrans --; + $P.tp_cong_win = 1 * $P.tp_l_tpdusize; + + if( error = tp_emit(CC_TPDU_type, $P, 0, 0, data) ) + $P.tp_sock->so_error = error; + tp_ctimeout($P, TM_retrans, (int)$P.tp_cc_ticks); + } +; + +/* TP4 only */ +TP_CLOSING <== TP_AKWAIT TM_retrans + DEFAULT /* out of time */ + { + IncStat(ts_conn_gaveup); + tp_soisdisconnecting($P.tp_sock); + $P.tp_sock->so_error = ETIMEDOUT; + tp_indicate(T_DISCONNECT, $P, ETIMEDOUT); + (void) tp_emit(DR_TPDU_type, $P, 0, E_TP_CONGEST, MNULL); + $P.tp_retrans = $P.tp_Nretrans; + tp_ctimeout($P, TM_retrans, (int)$P.tp_dr_ticks); + } +; + +/* the retrans timers had better go off BEFORE the inactivity timer does, + * if transmissions are going on. 
+ * (i.e., TM_inact should be greater than timer for all retrans plus ack + * turnaround) + */ +/* TP4 only */ +TP_CLOSING <== TP_OPEN [ TM_inact, TM_retrans, TM_data_retrans ] + DEFAULT + { + tp_euntimeout($P, TM_data_retrans); /* all */ + tp_cuntimeout($P, TM_inact); + tp_cuntimeout($P, TM_sendack); + + IncStat(ts_conn_gaveup); + tp_soisdisconnecting($P.tp_sock); + $P.tp_sock->so_error = ETIMEDOUT; + tp_indicate(T_DISCONNECT, $P, ETIMEDOUT); + (void) tp_emit(DR_TPDU_type, $P, 0, E_TP_CONGEST_2, MNULL); + $P.tp_retrans = $P.tp_Nretrans; + tp_ctimeout($P, TM_retrans, (int)$P.tp_dr_ticks); + } +; + +/* TP4 only */ +SAME <== TP_OPEN TM_retrans + ( $P.tp_retrans > 0 ) + { + $P.tp_cong_win = 1 * $P.tp_l_tpdusize; + /* resume XPD */ + if ( $P.tp_Xsnd.sb_mb ) { + struct mbuf *m = m_copy($P.tp_Xsnd.sb_mb, 0, (int)$P.tp_Xsnd.sb_cc); + int shift; + + IFTRACE(D_XPD) + tptrace(TPPTmisc, "XPD retrans: Xuna Xsndnxt sndnxt snduna", + $P.tp_Xuna, $P.tp_Xsndnxt, $P.tp_sndnxt, + $P.tp_snduna); + ENDTRACE + IFDEBUG(D_XPD) + dump_mbuf(m, "XPD retrans emitting M"); + ENDDEBUG + IncStat(ts_retrans_xpd); + $P.tp_retrans --; + shift = min($P.tp_Nretrans - $P.tp_retrans, 6); /* one extra doubling per retransmission so far, capped at <<6 (was max(), which forced at least <<6 with no upper bound) */ + (void) tp_emit(XPD_TPDU_type, $P, $P.tp_Xuna, 1, m); + tp_ctimeout($P, TM_retrans, ((int)$P.tp_dt_ticks) << shift); + } + } +; + +/* TP4 only */ +SAME <== TP_OPEN TM_data_retrans + ($P.tp_rxtshift < TP_NRETRANS) + { + $P.tp_rxtshift++; + (void) tp_data_retrans($P); + } +; + +/* TP4 only */ +SAME <== TP_CLOSING TM_retrans + ( $P.tp_retrans > 0 ) + { + $P.tp_retrans --; + (void) tp_emit(DR_TPDU_type, $P, 0, E_TP_DR_NO_REAS, MNULL); + IncStat(ts_retrans_dr); + tp_ctimeout($P, TM_retrans, (int)$P.tp_dr_ticks); + } +; + +/* TP4 only */ +TP_REFWAIT <== TP_CLOSING TM_retrans + DEFAULT /* no more retrans - gave up */ + { + $P.tp_sock->so_error = ETIMEDOUT; + $P.tp_refstate = REF_FROZEN; + tp_recycle_tsuffix( $P ); + tp_etimeout($P, TM_reference, (int)$P.tp_refer_ticks); + } +; + +/* + * The resources are kept around until 
the ref timer goes off. + * The suffices are wiped out sooner so they can be reused right away. + */ +/* applicable in TP4, TP0 */ +TP_CLOSED <== TP_REFWAIT TM_reference + DEFAULT + { + tp_freeref($P.tp_lref); + tp_detach($P); + } +; + +/* applicable in TP4, TP0 */ +/* A duplicate CR from connectionless network layer can't happen */ +SAME <== TP_OPEN [ CR_TPDU, CC_TPDU ] + DEFAULT + { + if( $P.tp_class != TP_CLASS_0) { + tp_ctimeout($P, TM_inact, (int)$P.tp_inact_ticks); + if ( $E.ev_number == CC_TPDU ) + (void) tp_emit(AK_TPDU_type, $P, $P.tp_rcvnxt, 0, MNULL); + } + /* ignore it if class 0 - state tables are blank for this */ + } +; + +/* applicable in TP4, TP0 */ +SAME <== TP_OPEN T_DATA_req + DEFAULT + { + IFTRACE(D_DATA) + tptrace(TPPTmisc, "T_DATA_req sndnxt snduna fcredit, tpcb", + $P.tp_sndnxt, $P.tp_snduna, $P.tp_fcredit, $P); + ENDTRACE + + tp_send($P); + } +; + +/* TP4 only */ +SAME <== TP_OPEN T_XPD_req + DEFAULT + /* T_XPD_req was issued by sosend iff xpd socket buf was empty + * at time of sosend(), + * AND (which means) there were no unacknowledged XPD tpdus outstanding! + */ + { + int error = 0; + + /* resume XPD */ + if ( $P.tp_Xsnd.sb_mb ) { + struct mbuf *m = m_copy($P.tp_Xsnd.sb_mb, 0, (int)$P.tp_Xsnd.sb_cc); + /* m_copy doesn't preserve the m_xlink field, but at this pt. 
+ * that doesn't matter + */ + + IFTRACE(D_XPD) + tptrace(TPPTmisc, "XPD req: Xuna Xsndnxt sndnxt snduna", + $P.tp_Xuna, $P.tp_Xsndnxt, $P.tp_sndnxt, + $P.tp_snduna); + ENDTRACE + IFDEBUG(D_XPD) + printf("T_XPD_req: sb_cc 0x%x\n", $P.tp_Xsnd.sb_cc); + dump_mbuf(m, "XPD req emitting M"); + ENDDEBUG + error = + tp_emit(XPD_TPDU_type, $P, $P.tp_Xuna, 1, m); + $P.tp_retrans = $P.tp_Nretrans; + + tp_ctimeout($P, TM_retrans, (int)$P.tp_rxtcur); + SEQ_INC($P, $P.tp_Xsndnxt); + } + if(trick_hc) + return error; + } +; + +/* TP4, faked ack in TP0 when cons send completes */ +SAME <== TP_OPEN AK_TPDU + ( tp_goodack($P, $$.e_cdt, $$.e_seq, $$.e_subseq) ) + + /* tp_goodack == true means + * EITHER it actually acked something heretofore unacknowledged + * OR no news but the credit should be processed. + */ + { + struct sockbuf *sb = &$P.tp_sock->so_snd; + + IFDEBUG(D_ACKRECV) + printf("GOOD ACK seq 0x%x cdt 0x%x\n", $$.e_seq, $$.e_cdt); + ENDDEBUG + if( $P.tp_class != TP_CLASS_0) { + tp_ctimeout($P, TM_inact, (int)$P.tp_inact_ticks); + } + sbwakeup(sb); + IFDEBUG(D_ACKRECV) + printf("GOOD ACK new sndnxt 0x%x\n", $P.tp_sndnxt); + ENDDEBUG + } +; + +/* TP4, and TP0 after sending a CC or possibly a CR */ +SAME <== TP_OPEN AK_TPDU + DEFAULT + { + IFTRACE(D_ACKRECV) + tptrace(TPPTmisc, "BOGUS ACK fcc_present, tp_r_subseq e_subseq", + $$.e_fcc_present, $P.tp_r_subseq, $$.e_subseq, 0); + ENDTRACE + if( $P.tp_class != TP_CLASS_0 ) { + + if ( !$$.e_fcc_present ) { + /* send ACK with FCC */ + IncStat( ts_ackreason[_ACK_FCC_] ); + (void) tp_emit(AK_TPDU_type, $P, $P.tp_rcvnxt, 1, MNULL); + } + tp_ctimeout($P, TM_inact, (int)$P.tp_inact_ticks); + } + } +; + +/* NBS(47) */ + /* goes in at *** */ + /* just so happens that this is never true now, because we allow + * only 1 packet in the queue at once (this could be changed) + if ( $P.tp_Xsnd.sb_mb ) { + struct mbuf *m = m_copy($P.tp_Xsnd.sb_mb, 0, ??); + + (void) tp_emit(XPD_TPDU_type, $P, $P.tp_Xuna, 1, m); + $P.tp_retrans = $P.tp_Nretrans; 
+ tp_ctimeout($P, TM_retrans, (int)$P.tp_xpd_ticks); + SEQ_INC($P, $P.tp_Xsndnxt); + } + */ + /* end of the above hack */ + +/* TP4 only */ +SAME <== TP_OPEN XAK_TPDU + ( tp_goodXack($P, $$.e_seq) ) + /* tp_goodXack checks for good ack, removes the correct + * tpdu from the queue and returns 1 if ack was legit, 0 if not. + * also updates tp_Xuna + */ + { + tp_ctimeout($P, TM_inact, (int)$P.tp_inact_ticks); + tp_cuntimeout($P, TM_retrans); + + sbwakeup( &$P.tp_sock->so_snd ); + + /* resume normal data */ + tp_send($P); + } +; + +/* TP4, and TP0 after sending a CC or possibly a CR */ +SAME <== TP_OPEN XAK_TPDU + DEFAULT + { + IFTRACE(D_ACKRECV) + tptrace(TPPTmisc, "BOGUS XACK eventtype ", $E.ev_number, 0, 0,0); + ENDTRACE + if( $P.tp_class != TP_CLASS_0 ) { + tp_ctimeout($P, TM_inact, (int)$P.tp_inact_ticks); + } + } +; + +/* TP4 only */ +SAME <== TP_OPEN TM_sendack + DEFAULT + { + int timo; + IFTRACE(D_TIMER) + tptrace(TPPTsendack, -1, $P.tp_lcredit, $P.tp_sent_uwe, + $P.tp_sent_lcdt, 0); + ENDTRACE + IncPStat($P, tps_n_TMsendack); + (void) tp_emit(AK_TPDU_type, $P, $P.tp_rcvnxt, 0, MNULL); + if ($P.tp_fcredit == 0) { + if ($P.tp_rxtshift < TP_MAXRXTSHIFT) + $P.tp_rxtshift++; + timo = ($P.tp_dt_ticks) << $P.tp_rxtshift; + } else + timo = $P.tp_sendack_ticks; + tp_ctimeout($P, TM_sendack, timo); + } +; + +/* TP0 only */ +SAME <== TP_OPEN T_USR_rcvd + ($P.tp_class == TP_CLASS_0) + { + if (sbspace(&$P.tp_sock->so_rcv) > 0) + tp0_openflow($P); + } +; + +/* TP4 only */ + /* If old credit was zero, + * we'd better inform other side that we now have space + * But this is not enough. Sender might not yet have + * seen an ack with cdt 0 but it might still think the + * window is closed, so it's going to wait. + * Best to send an ack each time. + * Strictly speaking, this ought to be a function of the + * general ack strategy. 
+ */ +SAME <== TP_OPEN T_USR_rcvd + DEFAULT + { + if( trick_hc ) { + SeqNum ack_thresh; + /* + * If the upper window edge has advanced a reasonable + * amount beyond what was known, send an ACK. + * A reasonable amount is 2 packets, unless the max window + * is only 1 or 2 packets, in which case we + * should send an ack for any advance in the upper window edge. + */ + LOCAL_CREDIT($P); + ack_thresh = SEQ_SUB($P, $P.tp_lcredit + $P.tp_rcvnxt, + ($P.tp_maxlcredit > 2 ? 2 : 1)); + if (SEQ_GT($P, ack_thresh, $P.tp_sent_uwe)) { + IncStat(ts_ackreason[_ACK_USRRCV_]); + $P.tp_flags &= ~TPF_DELACK; + return tp_emit(AK_TPDU_type, $P, $P.tp_rcvnxt, 0, MNULL); + } + } + } +; + +/* applicable in TP4, TP0 */ +SAME <== TP_REFWAIT [ T_USR_rcvd, T_USR_Xrcvd ] + DEFAULT + /* This happens if other end sent a DR when the user was waiting + * on a receive. + * Processing the DR includes putting us in REFWAIT state. + */ + { + if(trick_hc) + return ECONNABORTED; + } +; + +/* TP0 only */ +TP_REFWAIT <== [ TP_OPEN, TP_CRSENT, TP_LISTENING ] T_NETRESET + ( $P.tp_class != TP_CLASS_4 ) + /* 0 or (4 and 0) */ + /* in OPEN class will be 0 or 4 but not both */ + /* in CRSENT or LISTENING it could be in negotiation, hence both */ + /* Actually, this shouldn't ever happen in LISTENING */ + { + ASSERT( $P.tp_state != TP_LISTENING ); + tp_indicate(T_DISCONNECT, $P, ECONNRESET); + tp_soisdisconnected($P); + } +; + +/* TP4: ignore resets */ +SAME <== [ TP_OPEN, TP_CRSENT, TP_AKWAIT, + TP_CLOSING, TP_LISTENING ] T_NETRESET + DEFAULT + NULLACTION +; + +/* applicable in TP4, TP0 */ +SAME <== [ TP_CLOSED, TP_REFWAIT ] T_NETRESET + DEFAULT + NULLACTION +; + +/* C'EST TOUT */ diff --git a/sys/netiso/tp_astring.c b/sys/netiso/tp_astring.c new file mode 100644 index 00000000000..af08cebbc86 --- /dev/null +++ b/sys/netiso/tp_astring.c @@ -0,0 +1,74 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ *	@(#)tp_astring.c	8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * Printable names of the TP states.  Each string carries its own
+ * numeric value in parentheses, and that value equals the string's
+ * index in the array (0x0 .. 0x8).
+ * NOTE(review): presumably indexed by tp_state for trace/debug output;
+ * confirm against the users of this table.
+ */
+char *tp_sstring[] = {
+"ST_ERROR(0x0)",
+"TP_CLOSED(0x1)",
+"TP_CRSENT(0x2)",
+"TP_AKWAIT(0x3)",
+"TP_OPEN(0x4)",
+"TP_CLOSING(0x5)",
+"TP_REFWAIT(0x6)",
+"TP_LISTENING(0x7)",
+"TP_CONFIRMING(0x8)",
+};
+
+/*
+ * Printable names of the TP events (timers, incoming TPDUs and user
+ * requests).  As above, the value in parentheses equals the array
+ * index (0x0 .. 0x18).
+ * NOTE(review): presumably indexed by an event's ev_number; confirm.
+ */
+char *tp_estring[] = {
+"TM_inact(0x0)",
+"TM_retrans(0x1)",
+"TM_sendack(0x2)",
+"TM_notused(0x3)",
+"TM_reference(0x4)",
+"TM_data_retrans(0x5)",
+"ER_TPDU(0x6)",
+"CR_TPDU(0x7)",
+"DR_TPDU(0x8)",
+"DC_TPDU(0x9)",
+"CC_TPDU(0xa)",
+"AK_TPDU(0xb)",
+"DT_TPDU(0xc)",
+"XPD_TPDU(0xd)",
+"XAK_TPDU(0xe)",
+"T_CONN_req(0xf)",
+"T_DISC_req(0x10)",
+"T_LISTEN_req(0x11)",
+"T_DATA_req(0x12)",
+"T_XPD_req(0x13)",
+"T_USR_rcvd(0x14)",
+"T_USR_Xrcvd(0x15)",
+"T_DETACH(0x16)",
+"T_NETRESET(0x17)",
+"T_ACPT_req(0x18)",
+};
diff --git a/sys/netiso/tp_clnp.h b/sys/netiso/tp_clnp.h
new file mode 100644
index 00000000000..81a7cffc13e
--- /dev/null
+++ b/sys/netiso/tp_clnp.h
@@ -0,0 +1,94 @@
+/*-
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tp_clnp.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+
+******************************************************************/
+
+/*
+ * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison
+ */
+/*
+ * ARGO TP
+ *
+ * $Header: tp_clnp.h,v 5.1 88/10/12 12:16:36 root Exp $
+ * $Source: /usr/argo/sys/netiso/RCS/tp_clnp.h,v $
+ *
+ * AF_ISO net-dependent structures and include files
+ *
+ */
+
+
+#ifndef __TP_CLNP__
+#define __TP_CLNP__
+
+/*
+ * NOTE(review): every #include below has lost its header name (it
+ * looks as if the <...> part was stripped when this text was
+ * extracted), so the header cannot be preprocessed as shown.  Restore
+ * the names from the original tp_clnp.h.  Each guarded include is
+ * skipped when the named macro is already defined, i.e. presumably
+ * when the header that defines it has already been included.
+ */
+#ifndef SOCK_STREAM
+#include
+#endif /* SOCK_STREAM */
+
+#ifndef RTFREE
+#include
+#endif
+#include
+#include
+#include
+#ifndef IF_DEQUEUE
+#include
+#endif
+#include
+
+/*
+ * NOTE(review): this is a definition without "extern", so every file
+ * that includes this header gets its own tentative definition of
+ * tp_isopcb; that only links where such definitions are merged as
+ * common symbols -- confirm before building with a stricter linker.
+ * The trailing comment speaks of inpcbs and "dod ip" although the
+ * object is a struct isopcb; it may have been carried over from the
+ * IP variant of this header -- confirm.
+ */
+struct isopcb tp_isopcb;
+	/* queue of active inpcbs for tp ; for tp with dod ip */
+
+#endif /* __TP_CLNP__ */
diff --git a/sys/netiso/tp_cons.c b/sys/netiso/tp_cons.c
new file mode 100644
index 00000000000..797ee9ef829
--- /dev/null
+++ b/sys/netiso/tp_cons.c
@@ -0,0 +1,308 @@
+/*-
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tp_cons.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+
+******************************************************************/
+
+/*
+ * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison
+ */
+/*
+ * ARGO TP
+ * $Header: tp_cons.c,v 5.6 88/11/18 17:27:13 nhall Exp $
+ * $Source: /usr/argo/sys/netiso/RCS/tp_cons.c,v $
+ *
+ * Here is where you find the iso- and cons-dependent code.  We've tried
+ * to keep all net-level and (primarily) address-family-dependent stuff
+ * out of the tp source, and everything here is reached indirectly
+ * through a switch table (struct nl_protosw *) tpcb->tp_nlproto
+ * (see tp_pcb.c).
+ * The routines here are:
+ *	tpcons_input: pullup and call tp_input w/ correct arguments
+ *	tpcons_output: package a pkt for cons given an isopcb & some data
+ *	cons_chan_to_tpcb: find a tpcb based on the channel #
+ */
+
+#ifdef ISO
+#ifdef TPCONS
+
+/*
+ * NOTE(review): the header names on the #include lines below are
+ * missing (they appear to have been stripped when this text was
+ * extracted); restore them from the original tp_cons.c.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#undef FALSE
+#undef TRUE
+#include
+#include
+#include
+
+#include
+int tpcons_output();
+
+/*
+ * CALLED FROM:
+ *	tp_route_to() for PRU_CONNECT
+ * FUNCTION, ARGUMENTS, SIDE EFFECTS and RETURN VALUE:
+ *	version of the previous procedure for X.25
+ *
+ *	isop is the isopcb to connect and nam the mbuf holding the peer
+ *	address; both are handed to iso_pcbconnect().  A channel is then
+ *	obtained from pk_attach() and stored in isop->isop_chan, and
+ *	cons_connect() is called on the pcb.
+ *	On success isop->isop_refcnt is set to 1 and 0 is returned.
+ *	Returns the iso_pcbconnect() error if that step fails, ENOBUFS if
+ *	no channel could be attached, or the cons_connect() error, in
+ *	which case the channel is disconnected and isop_chan cleared.
+ */
+
+int
+tpcons_pcbconnect(isop, nam)
+struct isopcb *isop;
+register struct mbuf *nam;
+{
+	int error;
+
+	if ((error = iso_pcbconnect(isop, nam)) != 0)
+		return error;
+	if ((isop->isop_chan = (caddr_t) pk_attach((struct socket *)0)) == 0) {
+		/*
+		 * Set the error before tracing it: the message used to
+		 * print the still-zero iso_pcbconnect() result while the
+		 * function actually returned ENOBUFS.
+		 */
+		error = ENOBUFS;
+		IFDEBUG(D_CCONS)
+			printf("tpcons_pcbconnect: no pklcd; returns 0x%x\n", error);
+		ENDDEBUG
+		return error;
+	}
+	if ((error = cons_connect(isop)) != 0) { /* if it doesn't work */
+		/* oh, dear, throw packet away */
+		pk_disconnect((struct pklcd *)isop->isop_chan);
+		isop->isop_chan = 0;
+	} else
+		isop->isop_refcnt = 1;
+	return error;
+}
+
+
+/*
+ * CALLED FROM:
+ *	cons
+ * FUNCTION and ARGUMENTS:
+ *	THIS MAYBE BELONGS IN SOME OTHER PLACE???
but i think not -
+ */
+/*
+ * tpcons_ctlinput: handle a control indication (cmd) for isop.
+ *	If isop has a socket, the tp pcb is taken from its so_pcb.
+ *	PRC_CONS_SEND_DONE: for a pcb whose class is exactly TP_CLASS_0, an
+ *	AK_TPDU event (credit 1, sequence tp_snduna + 1) is built and run
+ *	through DoEvent(); a non-zero result is stored in so_error.
+ *	PRC_ROUTEDEAD: for a class 0 pcb tpiso_reset() is called.
+ *	Anything else, including PRC_ROUTEDEAD when the pcb is missing or
+ *	not class 0, is passed to tpclnp_ctlinput(cmd, siso).
+ *	Always returns 0.
+ *	NOTE(review): isop is dereferenced unconditionally; callers are
+ *	assumed never to pass a null isop -- confirm.
+ */
+ProtoHook
+tpcons_ctlinput(cmd, siso, isop)
+	int cmd;
+	struct sockaddr_iso *siso;
+	struct isopcb *isop;
+{
+	register struct tp_pcb *tpcb = 0;
+
+	if (isop->isop_socket)
+		tpcb = (struct tp_pcb *)isop->isop_socket->so_pcb;
+	switch (cmd) {
+
+	case PRC_CONS_SEND_DONE:
+		if (tpcb) {
+			/*
+			 * NOTE(review): DoEvent() and E.ATTR() are macros
+			 * defined elsewhere; DoEvent presumably refers to the
+			 * locals named E and tpcb, so do not rename them
+			 * without checking the macro.
+			 */
+			struct tp_event E;
+			int error = 0;
+
+			if (tpcb->tp_class == TP_CLASS_0) {
+				/* only if class is exactly class zero, not
+				 * still in class negotiation
+				 */
+				/* fake an ack */
+				register SeqNum seq = SEQ_ADD(tpcb, tpcb->tp_snduna, 1);
+
+				IFTRACE(D_DATA)
+					tptrace(TPPTmisc, "FAKE ACK seq cdt 1",
+						seq, 0,0,0);
+				ENDTRACE
+				IFDEBUG(D_DATA)
+					printf("FAKE ACK seq 0x%x cdt 1\n", seq );
+				ENDDEBUG
+				E.ATTR(AK_TPDU).e_cdt = 1;
+				E.ATTR(AK_TPDU).e_seq = seq;
+				E.ATTR(AK_TPDU).e_subseq = 0;
+				E.ATTR(AK_TPDU).e_fcc_present = 0;
+				error = DoEvent(AK_TPDU);
+				if( error ) {
+					tpcb->tp_sock->so_error = error;
+				}
+			} /* else ignore it */
+		}
+		break;
+	case PRC_ROUTEDEAD:
+		if (tpcb && tpcb->tp_class == TP_CLASS_0) {
+			tpiso_reset(isop);
+			break;
+		} /* else drop through */
+	default:
+		(void) tpclnp_ctlinput(cmd, siso);
+		break;
+	}
+	return 0;
+}
+
+/*
+ * CALLED FROM:
+ *	cons's intr routine
+ * FUNCTION and ARGUMENTS:
+ *	Take a packet (m) from cons, pullup m as required by tp,
+ *	ignore the socket argument, and call tp_input.
+ *	No return value.
+ *
+ *	As written the function does return a value: it is always 0, both
+ *	when m is MNULL (nothing is done) and after tp_input() was called.
+ *	tpcons_output is handed to tp_input() as the output routine and
+ *	the last argument is 0.
+ *	NOTE(review): the result of tp_inputprep() is used without a null
+ *	check; whether it can return MNULL is not visible here -- confirm.
+ */
+ProtoHook
+tpcons_input(m, faddr, laddr, channel)
+	struct mbuf *m;
+	struct sockaddr_iso *faddr, *laddr;
+	caddr_t channel;
+{
+	if( m == MNULL)
+		return 0;
+
+	m = (struct mbuf *)tp_inputprep(m);
+
+	IFDEBUG(D_TPINPUT)
+		printf("tpcons_input before tp_input(m 0x%x)\n", m);
+		dump_buf( m, 12+ m->m_len);
+	ENDDEBUG
+	tp_input(m, faddr, laddr, channel, tpcons_output, 0);
+	return 0;
+}
+
+
+/*
+ * CALLED FROM:
+ *	tp_emit()
+ * FUNCTION and ARGUMENTS:
+ *	Take a packet(m0) from tp and package it so that cons will accept it.
+ *	This means filling in a few of the fields.
+ *	inp is the isopcb structure; datalen is the length of the data in the
+ *	mbuf string m0.
+ * RETURN VALUE:
+ *	whatever (E*) is returned from the net layer output routine.
+ *	0 if m0 is MNULL (nothing to send).
+ *	ENOBUFS if a packet-header mbuf or a channel cannot be obtained.
+ *	If the pcb has no channel, one is attached and cons_connect() is
+ *	called; the result of that call is returned and the packet is not
+ *	passed to pk_send() on this call.
+ *	nochksum is accepted for interface compatibility and not used.
+ */
+
+int
+tpcons_output(isop, m0, datalen, nochksum)
+	struct isopcb *isop;
+	struct mbuf *m0;
+	int datalen;
+	int nochksum;
+{
+	register struct mbuf *m = m0;
+	int error;
+
+	IFDEBUG(D_EMIT)
+		printf(
+		"tpcons_output(isop 0x%x, m 0x%x, len 0x%x socket 0x%x\n",
+			isop, m0, datalen, isop->isop_socket);
+	ENDDEBUG
+	if (m == MNULL)
+		return 0;
+	if ((m->m_flags & M_PKTHDR) == 0) {
+		/* prepend an empty packet-header mbuf to the chain */
+		MGETHDR(m, M_DONTWAIT, MT_DATA);
+		/*
+		 * NOTE(review): m0 is neither freed nor sent on this path;
+		 * who owns the chain on error is not visible here -- confirm
+		 * against the callers before adding an m_freem().
+		 */
+		if (m == 0)
+			return ENOBUFS;
+		/* the new header carries no data of its own */
+		m->m_len = 0;
+		m->m_next = m0;
+	}
+	m->m_pkthdr.len = datalen;
+	if (isop->isop_chan == 0) {
+		/* got a restart maybe? */
+		if ((isop->isop_chan = (caddr_t) pk_attach((struct socket *)0)) == 0) {
+			IFDEBUG(D_CCONS)
+				printf("tpcons_output: no pklcd\n");
+			ENDDEBUG
+			/*
+			 * Stop here: falling through to cons_connect() would
+			 * work on a null channel and overwrite this error.
+			 */
+			error = ENOBUFS;
+		} else if ((error = cons_connect(isop)) != 0) {
+			pk_disconnect((struct pklcd *)isop->isop_chan);
+			isop->isop_chan = 0;
+			IFDEBUG(D_CCONS)
+				printf("tpcons_output: can't reconnect\n");
+			ENDDEBUG
+		}
+		/*
+		 * NOTE(review): m is neither sent nor freed on any branch
+		 * above, including a successful reconnect -- confirm whether
+		 * the caller retransmits and who releases the chain.
+		 */
+	} else {
+		error = pk_send((struct pklcd *)isop->isop_chan, m);
+		IncStat(ts_tpdu_sent);
+	}
+	return error;
+}
+/*
+ * CALLED FROM:
+ *	tp_error_emit()
+ * FUNCTION and ARGUMENTS:
+ *	Take a packet(m0) from tp and package it so that cons will accept it.
+ *	chan is the cons channel to use; datalen is the length of the data in the
+ *	mbuf string m0.
+ * RETURN VALUE:
+ *	whatever (E*) is returned from the net layer output routine.
+ */
+
+/*
+ * Thin wrapper: treats chan as a struct pklcd, takes its lcd_upnext
+ * field as the isopcb argument and calls tpcons_output() with
+ * nochksum set to 0.
+ * NOTE(review): chan is dereferenced without a null check -- confirm
+ * that callers always pass a valid channel.
+ */
+int
+tpcons_dg_output(chan, m0, datalen)
+	caddr_t chan;
+	struct mbuf *m0;
+	int datalen;
+{
+	return tpcons_output(((struct pklcd *)chan)->lcd_upnext, m0, datalen, 0);
+}
+#endif /* TPCONS */
+#endif /* ISO */
diff --git a/sys/netiso/tp_driver.c b/sys/netiso/tp_driver.c
new file mode 100644
index 00000000000..586ef4e2ade
--- /dev/null
+++ b/sys/netiso/tp_driver.c
@@ -0,0 +1,999 @@
+/* $Header$ */
+/* $Source$ */
+/*
+ * NOTE(review): the _XEBEC_PG / _Xebec_action names and the tp.trans
+ * SCCS line below suggest this file is generated from tp.trans by the
+ * xebec tool; if so, fixes belong in tp.trans -- confirm.
+ */
+#ifndef lint
+static char *rcsid = "$Header/**/$";
+#endif lint
+#define _XEBEC_PG static
+
+#include "tp_states.h"
+
+/*
+ * State table: each entry pairs a new state with an action number.
+ * Entry 0 is the {0,0} placeholder; the remaining entries come from
+ * tp_states.init.
+ */
+static struct act_ent {
+	int a_newstate;
+	int a_action;
+} statetable[] = { {0,0},
+#include "tp_states.init"
+};
+
+/* @(#)tp.trans	8.1 (Berkeley) 6/10/93 */
+/* NOTE(review): the header names on the #include lines below are missing */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define DRIVERTRACE TPPTdriver
+/* expands to sowakeup() on p->tp_sock: needs a variable named p in
+ * scope, and the expansion already ends in a semicolon
+ */
+#define sbwakeup(sb) sowakeup(p->tp_sock, sb);
+/* m_copym() of the whole chain d with wait flag w, or 0 when d is null */
+#define MCPY(d, w) (d ? m_copym(d, 0, (int)M_COPYALL, w): 0)
+
+/* implicit-int flag, always 1; used as an "if (trick_hc)" guard in actions */
+static trick_hc = 1;
+
+int tp_emit(),
+	tp_goodack(), tp_goodXack(),
+	tp_stash()
+;
+void tp_indicate(), tp_getoptions(),
+	tp_soisdisconnecting(), tp_soisdisconnected(),
+	tp_recycle_tsuffix(),
+#ifdef TP_DEBUG_TIMERS
+	tp_etimeout(), tp_euntimeout(),
+	tp_ctimeout(), tp_cuntimeout(),
+	tp_ctimeout_MIN(),
+#endif
+	tp_freeref(), tp_detach(),
+	tp0_stash(), tp0_send(),
+	tp_netcmd(), tp_send()
+;
+
+typedef struct tp_pcb tpcb_struct;
+
+
+
+typedef tpcb_struct tp_PCB_;
+
+#include "tp_events.h"
+
+_XEBEC_PG int _Xebec_action(a,e,p)
+int a;
+struct tp_event *e;
+tp_PCB_ *p;
+{
+switch(a) {
+case -1:  return tp_protocol_error(e,p);
+case 0x1:
+		{
+		(void) tp_emit(DC_TPDU_type, p, 0, 0, MNULL);
+	}
+		 break;
+case 0x2:
+		{
+# ifdef TP_DEBUG
+		if( e->ev_number != AK_TPDU )
+			printf("TPDU 0x%x in REFWAIT!!!!\n", e->ev_number);
+# endif TP_DEBUG
+	}
+		 break;
+case 0x3:
+		{
+		/* oh, man is this grotesque or what?
*/ + (void) tp_goodack(p, e->ev_union.EV_AK_TPDU.e_cdt, e->ev_union.EV_AK_TPDU.e_seq, e->ev_union.EV_AK_TPDU.e_subseq); + /* but it's necessary because this pseudo-ack may happen + * before the CC arrives, but we HAVE to adjust the + * snduna as a result of the ack, WHENEVER it arrives + */ + } + break; +case 0x4: + { + tp_detach(p); + } + break; +case 0x5: + { + p->tp_refstate = REF_OPEN; /* has timers ??? */ + } + break; +case 0x6: + { + IFTRACE(D_CONN) + tptrace(TPPTmisc, "CR datalen data", e->ev_union.EV_CR_TPDU.e_datalen, e->ev_union.EV_CR_TPDU.e_data,0,0); + ENDTRACE + IFDEBUG(D_CONN) + printf("CR datalen 0x%x data 0x%x", e->ev_union.EV_CR_TPDU.e_datalen, e->ev_union.EV_CR_TPDU.e_data); + ENDDEBUG + p->tp_refstate = REF_OPEN; /* has timers */ + p->tp_fcredit = e->ev_union.EV_CR_TPDU.e_cdt; + + if (e->ev_union.EV_CR_TPDU.e_datalen > 0) { + /* n/a for class 0 */ + ASSERT(p->tp_Xrcv.sb_cc == 0); + sbappendrecord(&p->tp_Xrcv, e->ev_union.EV_CR_TPDU.e_data); + e->ev_union.EV_CR_TPDU.e_data = MNULL; + } + } + break; +case 0x7: + { + IncStat(ts_tp0_conn); + IFTRACE(D_CONN) + tptrace(TPPTmisc, "Confiming", p, 0,0,0); + ENDTRACE + IFDEBUG(D_CONN) + printf("Confirming connection: p" ); + ENDDEBUG + soisconnected(p->tp_sock); + (void) tp_emit(CC_TPDU_type, p, 0,0, MNULL) ; + p->tp_fcredit = 1; + } + break; +case 0x8: + { + IncStat(ts_tp4_conn); /* even though not quite open */ + IFTRACE(D_CONN) + tptrace(TPPTmisc, "Confiming", p, 0,0,0); + ENDTRACE + IFDEBUG(D_CONN) + printf("Confirming connection: p" ); + ENDDEBUG + tp_getoptions(p); + soisconnecting(p->tp_sock); + if ((p->tp_rx_strat & TPRX_FASTSTART) && (p->tp_fcredit > 0)) + p->tp_cong_win = p->tp_fcredit * p->tp_l_tpdusize; + p->tp_retrans = p->tp_Nretrans; + tp_ctimeout(p, TM_retrans, (int)p->tp_cc_ticks); + } + break; +case 0x9: + { + IFDEBUG(D_CONN) + printf("event: CR_TPDU emit CC failed done " ); + ENDDEBUG + soisdisconnected(p->tp_sock); + tp_recycle_tsuffix(p); + tp_freeref(p->tp_lref); + tp_detach(p); + } + 
break; +case 0xa: + { + int error; + struct mbuf *data = MNULL; + + IFTRACE(D_CONN) + tptrace(TPPTmisc, "T_CONN_req flags ucddata", (int)p->tp_flags, + p->tp_ucddata, 0, 0); + ENDTRACE + data = MCPY(p->tp_ucddata, M_WAIT); + if (data) { + IFDEBUG(D_CONN) + printf("T_CONN_req.trans m_copy cc 0x%x\n", + p->tp_ucddata); + dump_mbuf(data, "sosnd @ T_CONN_req"); + ENDDEBUG + } + + if (error = tp_emit(CR_TPDU_type, p, 0, 0, data) ) + return error; /* driver WON'T change state; will return error */ + + p->tp_refstate = REF_OPEN; /* has timers */ + if(p->tp_class != TP_CLASS_0) { + p->tp_retrans = p->tp_Nretrans; + tp_ctimeout(p, TM_retrans, (int)p->tp_cr_ticks); + } + } + break; +case 0xb: + { + sbflush(&p->tp_Xrcv); /* purge non-delivered data data */ + if (e->ev_union.EV_DR_TPDU.e_datalen > 0) { + sbappendrecord(&p->tp_Xrcv, e->ev_union.EV_DR_TPDU.e_data); + e->ev_union.EV_DR_TPDU.e_data = MNULL; + } + if (p->tp_state == TP_OPEN) + tp_indicate(T_DISCONNECT, p, 0); + else { + int so_error = ECONNREFUSED; + if (e->ev_union.EV_DR_TPDU.e_reason != (E_TP_NO_SESSION ^ TP_ERROR_MASK) && + e->ev_union.EV_DR_TPDU.e_reason != (E_TP_NO_CR_ON_NC ^ TP_ERROR_MASK) && + e->ev_union.EV_DR_TPDU.e_reason != (E_TP_REF_OVERFLOW ^ TP_ERROR_MASK)) + so_error = ECONNABORTED; + tp_indicate(T_DISCONNECT, p, so_error); + } + tp_soisdisconnected(p); + if (p->tp_class != TP_CLASS_0) { + if (p->tp_state == TP_OPEN ) { + tp_euntimeout(p, TM_data_retrans); /* all */ + tp_cuntimeout(p, TM_retrans); + tp_cuntimeout(p, TM_inact); + tp_cuntimeout(p, TM_sendack); + p->tp_flags &= ~TPF_DELACK; + } + tp_cuntimeout(p, TM_retrans); + if( e->ev_union.EV_DR_TPDU.e_sref != 0 ) + (void) tp_emit(DC_TPDU_type, p, 0, 0, MNULL); + } + } + break; +case 0xc: + { + if( e->ev_union.EV_DR_TPDU.e_sref != 0 ) + (void) tp_emit(DC_TPDU_type, p, 0, 0, MNULL); + /* reference timer already set - reset it to be safe (???) 
*/ + tp_euntimeout(p, TM_reference); /* all */ + tp_etimeout(p, TM_reference, (int)p->tp_refer_ticks); + } + break; +case 0xd: + { + tp_cuntimeout(p, TM_retrans); + tp_indicate(ER_TPDU, p, e->ev_union.EV_ER_TPDU.e_reason); + tp_soisdisconnected(p); + } + break; +case 0xe: + { + tp_cuntimeout(p, TM_retrans); + tp_soisdisconnected(p); + } + break; +case 0xf: + { + tp_indicate(ER_TPDU, p, e->ev_union.EV_ER_TPDU.e_reason); + tp_cuntimeout(p, TM_retrans); + tp_soisdisconnected(p); + } + break; +case 0x10: + { + tp_cuntimeout(p, TM_retrans); + tp_soisdisconnected(p); + } + break; +case 0x11: + { /* don't ask me why we have to do this - spec says so */ + (void) tp_emit(DR_TPDU_type, p, 0, E_TP_NO_SESSION, MNULL); + /* don't bother with retransmissions of the DR */ + } + break; +case 0x12: + { + tp_soisdisconnecting(p->tp_sock); + tp_indicate(ER_TPDU, p, e->ev_union.EV_ER_TPDU.e_reason); + tp_soisdisconnected(p); + tp_netcmd( p, CONN_CLOSE ); + } + break; +case 0x13: + { + if (p->tp_state == TP_OPEN) { + tp_euntimeout(p, TM_data_retrans); /* all */ + tp_cuntimeout(p, TM_inact); + tp_cuntimeout(p, TM_sendack); + } + tp_soisdisconnecting(p->tp_sock); + tp_indicate(ER_TPDU, p, e->ev_union.EV_ER_TPDU.e_reason); + p->tp_retrans = p->tp_Nretrans; + tp_ctimeout(p, TM_retrans, (int)p->tp_dr_ticks); + (void) tp_emit(DR_TPDU_type, p, 0, E_TP_PROTO_ERR, MNULL); + } + break; +case 0x14: + { + tp_cuntimeout(p, TM_retrans); + IncStat(ts_tp0_conn); + p->tp_fcredit = 1; + soisconnected(p->tp_sock); + } + break; +case 0x15: + { + IFDEBUG(D_CONN) + printf("trans: CC_TPDU in CRSENT state flags 0x%x\n", + (int)p->tp_flags); + ENDDEBUG + IncStat(ts_tp4_conn); + p->tp_fref = e->ev_union.EV_CC_TPDU.e_sref; + p->tp_fcredit = e->ev_union.EV_CC_TPDU.e_cdt; + if ((p->tp_rx_strat & TPRX_FASTSTART) && (e->ev_union.EV_CC_TPDU.e_cdt > 0)) + p->tp_cong_win = e->ev_union.EV_CC_TPDU.e_cdt * p->tp_l_tpdusize; + tp_getoptions(p); + tp_cuntimeout(p, TM_retrans); + if (p->tp_ucddata) { + IFDEBUG(D_CONN) + 
printf("dropping user connect data cc 0x%x\n", + p->tp_ucddata->m_len); + ENDDEBUG + m_freem(p->tp_ucddata); + p->tp_ucddata = 0; + } + soisconnected(p->tp_sock); + if (e->ev_union.EV_CC_TPDU.e_datalen > 0) { + ASSERT(p->tp_Xrcv.sb_cc == 0); /* should be empty */ + sbappendrecord(&p->tp_Xrcv, e->ev_union.EV_CC_TPDU.e_data); + e->ev_union.EV_CC_TPDU.e_data = MNULL; + } + + (void) tp_emit(AK_TPDU_type, p, p->tp_rcvnxt, 0, MNULL); + tp_ctimeout(p, TM_inact, (int)p->tp_inact_ticks); + } + break; +case 0x16: + { + struct mbuf *data = MNULL; + int error; + + IncStat(ts_retrans_cr); + p->tp_cong_win = 1 * p->tp_l_tpdusize; + data = MCPY(p->tp_ucddata, M_NOWAIT); + if(p->tp_ucddata) { + IFDEBUG(D_CONN) + printf("TM_retrans.trans m_copy cc 0x%x\n", data); + dump_mbuf(p->tp_ucddata, "sosnd @ TM_retrans"); + ENDDEBUG + if( data == MNULL ) + return ENOBUFS; + } + + p->tp_retrans --; + if( error = tp_emit(CR_TPDU_type, p, 0, 0, data) ) { + p->tp_sock->so_error = error; + } + tp_ctimeout(p, TM_retrans, (int)p->tp_cr_ticks); + } + break; +case 0x17: + { + IncStat(ts_conn_gaveup); + p->tp_sock->so_error = ETIMEDOUT; + tp_indicate(T_DISCONNECT, p, ETIMEDOUT); + tp_soisdisconnected(p); + } + break; +case 0x18: + { + int error; + struct mbuf *data = MCPY(p->tp_ucddata, M_WAIT); + + if( error = tp_emit(CC_TPDU_type, p, 0, 0, data) ) { + p->tp_sock->so_error = error; + } + p->tp_retrans = p->tp_Nretrans; + tp_ctimeout(p, TM_retrans, (int)p->tp_cc_ticks); + } + break; +case 0x19: + { + int doack; + + /* + * Get rid of any confirm or connect data, so that if we + * crash or close, it isn't thought of as disconnect data. 
+ */ + if (p->tp_ucddata) { + m_freem(p->tp_ucddata); + p->tp_ucddata = 0; + } + tp_ctimeout(p, TM_inact, (int)p->tp_inact_ticks); + tp_cuntimeout(p, TM_retrans); + soisconnected(p->tp_sock); + tp_ctimeout(p, TM_inact, (int)p->tp_inact_ticks); + + /* see also next 2 transitions, if you make any changes */ + + doack = tp_stash(p, e); + IFDEBUG(D_DATA) + printf("tp_stash returns %d\n",doack); + ENDDEBUG + + if (doack) { + (void) tp_emit(AK_TPDU_type, p, p->tp_rcvnxt, 0, MNULL ); + tp_ctimeout(p, TM_sendack, (int)p->tp_keepalive_ticks); + } else + tp_ctimeout( p, TM_sendack, (int)p->tp_sendack_ticks); + + IFDEBUG(D_DATA) + printf("after stash calling sbwakeup\n"); + ENDDEBUG + } + break; +case 0x1a: + { + tp0_stash(p, e); + sbwakeup( &p->tp_sock->so_rcv ); + + IFDEBUG(D_DATA) + printf("after stash calling sbwakeup\n"); + ENDDEBUG + } + break; +case 0x1b: + { + int doack; /* tells if we must ack immediately */ + + tp_ctimeout(p, TM_inact, (int)p->tp_inact_ticks); + sbwakeup( &p->tp_sock->so_rcv ); + + doack = tp_stash(p, e); + IFDEBUG(D_DATA) + printf("tp_stash returns %d\n",doack); + ENDDEBUG + + if(doack) + (void) tp_emit(AK_TPDU_type, p, p->tp_rcvnxt, 0, MNULL ); + else + tp_ctimeout_MIN( p, TM_sendack, (int)p->tp_sendack_ticks); + + IFDEBUG(D_DATA) + printf("after stash calling sbwakeup\n"); + ENDDEBUG + } + break; +case 0x1c: + { + IFTRACE(D_DATA) + tptrace(TPPTmisc, "NIW seq rcvnxt lcredit ", + e->ev_union.EV_DT_TPDU.e_seq, p->tp_rcvnxt, p->tp_lcredit, 0); + ENDTRACE + IncStat(ts_dt_niw); + m_freem(e->ev_union.EV_DT_TPDU.e_data); + tp_ctimeout(p, TM_inact, (int)p->tp_inact_ticks); + (void) tp_emit(AK_TPDU_type, p, p->tp_rcvnxt, 0, MNULL ); + } + break; +case 0x1d: + { + if (p->tp_ucddata) { + m_freem(p->tp_ucddata); + p->tp_ucddata = 0; + } + (void) tp_goodack(p, e->ev_union.EV_AK_TPDU.e_cdt, e->ev_union.EV_AK_TPDU.e_seq, e->ev_union.EV_AK_TPDU.e_subseq); + tp_cuntimeout(p, TM_retrans); + + soisconnected(p->tp_sock); + IFTRACE(D_CONN) + struct socket *so = 
p->tp_sock; + tptrace(TPPTmisc, + "called sosiconn: so so_state rcv.sb_sel rcv.sb_flags", + so, so->so_state, so->so_rcv.sb_sel, so->so_rcv.sb_flags); + tptrace(TPPTmisc, + "called sosiconn 2: so_qlen so_error so_rcv.sb_cc so_head", + so->so_qlen, so->so_error, so->so_rcv.sb_cc, so->so_head); + ENDTRACE + + tp_ctimeout(p, TM_sendack, (int)p->tp_keepalive_ticks); + tp_ctimeout(p, TM_inact, (int)p->tp_inact_ticks); + } + break; +case 0x1e: + { + if( p->tp_state == TP_AKWAIT ) { + if (p->tp_ucddata) { + m_freem(p->tp_ucddata); + p->tp_ucddata = 0; + } + tp_cuntimeout(p, TM_retrans); + soisconnected(p->tp_sock); + tp_ctimeout(p, TM_sendack, (int)p->tp_keepalive_ticks); + tp_ctimeout(p, TM_inact, (int)p->tp_inact_ticks); + } + IFTRACE(D_XPD) + tptrace(TPPTmisc, "XPD tpdu accepted Xrcvnxt, e_seq datalen m_len\n", + p->tp_Xrcvnxt,e->ev_union.EV_XPD_TPDU.e_seq, e->ev_union.EV_XPD_TPDU.e_datalen, e->ev_union.EV_XPD_TPDU.e_data->m_len); + ENDTRACE + + p->tp_sock->so_state |= SS_RCVATMARK; + e->ev_union.EV_XPD_TPDU.e_data->m_flags |= M_EOR; + sbinsertoob(&p->tp_Xrcv, e->ev_union.EV_XPD_TPDU.e_data); + IFDEBUG(D_XPD) + dump_mbuf(e->ev_union.EV_XPD_TPDU.e_data, "XPD TPDU: tp_Xrcv"); + ENDDEBUG + tp_indicate(T_XDATA, p, 0); + sbwakeup( &p->tp_Xrcv ); + + (void) tp_emit(XAK_TPDU_type, p, p->tp_Xrcvnxt, 0, MNULL); + SEQ_INC(p, p->tp_Xrcvnxt); + } + break; +case 0x1f: + { + if( p->tp_Xrcv.sb_cc == 0 ) { + /* kludge for select(): */ + /* p->tp_sock->so_state &= ~SS_OOBAVAIL; */ + } + } + break; +case 0x20: + { + IFTRACE(D_XPD) + tptrace(TPPTmisc, "XPD tpdu niw (Xrcvnxt, e_seq) or not cdt (cc)\n", + p->tp_Xrcvnxt, e->ev_union.EV_XPD_TPDU.e_seq, p->tp_Xrcv.sb_cc , 0); + ENDTRACE + if( p->tp_Xrcvnxt != e->ev_union.EV_XPD_TPDU.e_seq ) + IncStat(ts_xpd_niw); + if( p->tp_Xrcv.sb_cc ) { + /* might as well kick 'em again */ + tp_indicate(T_XDATA, p, 0); + IncStat(ts_xpd_dup); + } + m_freem(e->ev_union.EV_XPD_TPDU.e_data); + tp_ctimeout(p, TM_inact, (int)p->tp_inact_ticks); + /* don't send 
an xack because the xak gives "last one received", not + * "next one i expect" (dumb) + */ + } + break; +case 0x21: + { + struct socket *so = p->tp_sock; + + /* detach from parent socket so it can finish closing */ + if (so->so_head) { + if (!soqremque(so, 0) && !soqremque(so, 1)) + panic("tp: T_DETACH"); + so->so_head = 0; + } + tp_soisdisconnecting(p->tp_sock); + tp_netcmd( p, CONN_CLOSE); + tp_soisdisconnected(p); + } + break; +case 0x22: + { + struct socket *so = p->tp_sock; + struct mbuf *data = MNULL; + + /* detach from parent socket so it can finish closing */ + if (so->so_head) { + if (!soqremque(so, 0) && !soqremque(so, 1)) + panic("tp: T_DETACH"); + so->so_head = 0; + } + if (p->tp_state != TP_CLOSING) { + tp_soisdisconnecting(p->tp_sock); + data = MCPY(p->tp_ucddata, M_NOWAIT); + (void) tp_emit(DR_TPDU_type, p, 0, E_TP_NORMAL_DISC, data); + p->tp_retrans = p->tp_Nretrans; + tp_ctimeout(p, TM_retrans, (int)p->tp_dr_ticks); + } + } + break; +case 0x23: + { + tp_soisdisconnecting(p->tp_sock); + tp_netcmd( p, CONN_CLOSE); + tp_soisdisconnected(p); + } + break; +case 0x24: + { + struct mbuf *data = MCPY(p->tp_ucddata, M_WAIT); + + if(p->tp_state == TP_OPEN) { + tp_euntimeout(p, TM_data_retrans); /* all */ + tp_cuntimeout(p, TM_inact); + tp_cuntimeout(p, TM_sendack); + p->tp_flags &= ~TPF_DELACK; + } + if (data) { + IFDEBUG(D_CONN) + printf("T_DISC_req.trans tp_ucddata 0x%x\n", + p->tp_ucddata); + dump_mbuf(data, "ucddata @ T_DISC_req"); + ENDDEBUG + } + tp_soisdisconnecting(p->tp_sock); + p->tp_retrans = p->tp_Nretrans; + tp_ctimeout(p, TM_retrans, (int)p->tp_dr_ticks); + + if( trick_hc ) + return tp_emit(DR_TPDU_type, p, 0, e->ev_union.EV_T_DISC_req.e_reason, data); + } + break; +case 0x25: + { + int error; + struct mbuf *data = MCPY(p->tp_ucddata, M_WAIT); + + IncStat(ts_retrans_cc); + p->tp_retrans --; + p->tp_cong_win = 1 * p->tp_l_tpdusize; + + if( error = tp_emit(CC_TPDU_type, p, 0, 0, data) ) + p->tp_sock->so_error = error; + tp_ctimeout(p, TM_retrans, 
(int)p->tp_cc_ticks); + } + break; +case 0x26: + { + IncStat(ts_conn_gaveup); + tp_soisdisconnecting(p->tp_sock); + p->tp_sock->so_error = ETIMEDOUT; + tp_indicate(T_DISCONNECT, p, ETIMEDOUT); + (void) tp_emit(DR_TPDU_type, p, 0, E_TP_CONGEST, MNULL); + p->tp_retrans = p->tp_Nretrans; + tp_ctimeout(p, TM_retrans, (int)p->tp_dr_ticks); + } + break; +case 0x27: + { + tp_euntimeout(p, TM_data_retrans); /* all */ + tp_cuntimeout(p, TM_inact); + tp_cuntimeout(p, TM_sendack); + + IncStat(ts_conn_gaveup); + tp_soisdisconnecting(p->tp_sock); + p->tp_sock->so_error = ETIMEDOUT; + tp_indicate(T_DISCONNECT, p, ETIMEDOUT); + (void) tp_emit(DR_TPDU_type, p, 0, E_TP_CONGEST_2, MNULL); + p->tp_retrans = p->tp_Nretrans; + tp_ctimeout(p, TM_retrans, (int)p->tp_dr_ticks); + } + break; +case 0x28: + { + p->tp_cong_win = 1 * p->tp_l_tpdusize; + /* resume XPD */ + if ( p->tp_Xsnd.sb_mb ) { + struct mbuf *m = m_copy(p->tp_Xsnd.sb_mb, 0, (int)p->tp_Xsnd.sb_cc); + int shift; + + IFTRACE(D_XPD) + tptrace(TPPTmisc, "XPD retrans: Xuna Xsndnxt sndnxt snduna", + p->tp_Xuna, p->tp_Xsndnxt, p->tp_sndnxt, + p->tp_snduna); + ENDTRACE + IFDEBUG(D_XPD) + dump_mbuf(m, "XPD retrans emitting M"); + ENDDEBUG + IncStat(ts_retrans_xpd); + p->tp_retrans --; + shift = max(p->tp_Nretrans - p->tp_retrans, 6); + (void) tp_emit(XPD_TPDU_type, p, p->tp_Xuna, 1, m); + tp_ctimeout(p, TM_retrans, ((int)p->tp_dt_ticks) << shift); + } + } + break; +case 0x29: + { + p->tp_rxtshift++; + (void) tp_data_retrans(p); + } + break; +case 0x2a: + { + p->tp_retrans --; + (void) tp_emit(DR_TPDU_type, p, 0, E_TP_DR_NO_REAS, MNULL); + IncStat(ts_retrans_dr); + tp_ctimeout(p, TM_retrans, (int)p->tp_dr_ticks); + } + break; +case 0x2b: + { + p->tp_sock->so_error = ETIMEDOUT; + p->tp_refstate = REF_FROZEN; + tp_recycle_tsuffix( p ); + tp_etimeout(p, TM_reference, (int)p->tp_refer_ticks); + } + break; +case 0x2c: + { + tp_freeref(p->tp_lref); + tp_detach(p); + } + break; +case 0x2d: + { + if( p->tp_class != TP_CLASS_0) { + 
tp_ctimeout(p, TM_inact, (int)p->tp_inact_ticks); + if ( e->ev_number == CC_TPDU ) + (void) tp_emit(AK_TPDU_type, p, p->tp_rcvnxt, 0, MNULL); + } + /* ignore it if class 0 - state tables are blank for this */ + } + break; +case 0x2e: + { + IFTRACE(D_DATA) + tptrace(TPPTmisc, "T_DATA_req sndnxt snduna fcredit, tpcb", + p->tp_sndnxt, p->tp_snduna, p->tp_fcredit, p); + ENDTRACE + + tp_send(p); + } + break; +case 0x2f: + { + int error = 0; + + /* resume XPD */ + if ( p->tp_Xsnd.sb_mb ) { + struct mbuf *m = m_copy(p->tp_Xsnd.sb_mb, 0, (int)p->tp_Xsnd.sb_cc); + /* m_copy doesn't preserve the m_xlink field, but at this pt. + * that doesn't matter + */ + + IFTRACE(D_XPD) + tptrace(TPPTmisc, "XPD req: Xuna Xsndnxt sndnxt snduna", + p->tp_Xuna, p->tp_Xsndnxt, p->tp_sndnxt, + p->tp_snduna); + ENDTRACE + IFDEBUG(D_XPD) + printf("T_XPD_req: sb_cc 0x%x\n", p->tp_Xsnd.sb_cc); + dump_mbuf(m, "XPD req emitting M"); + ENDDEBUG + error = + tp_emit(XPD_TPDU_type, p, p->tp_Xuna, 1, m); + p->tp_retrans = p->tp_Nretrans; + + tp_ctimeout(p, TM_retrans, (int)p->tp_rxtcur); + SEQ_INC(p, p->tp_Xsndnxt); + } + if(trick_hc) + return error; + } + break; +case 0x30: + { + struct sockbuf *sb = &p->tp_sock->so_snd; + + IFDEBUG(D_ACKRECV) + printf("GOOD ACK seq 0x%x cdt 0x%x\n", e->ev_union.EV_AK_TPDU.e_seq, e->ev_union.EV_AK_TPDU.e_cdt); + ENDDEBUG + if( p->tp_class != TP_CLASS_0) { + tp_ctimeout(p, TM_inact, (int)p->tp_inact_ticks); + } + sbwakeup(sb); + IFDEBUG(D_ACKRECV) + printf("GOOD ACK new sndnxt 0x%x\n", p->tp_sndnxt); + ENDDEBUG + } + break; +case 0x31: + { + IFTRACE(D_ACKRECV) + tptrace(TPPTmisc, "BOGUS ACK fcc_present, tp_r_subseq e_subseq", + e->ev_union.EV_AK_TPDU.e_fcc_present, p->tp_r_subseq, e->ev_union.EV_AK_TPDU.e_subseq, 0); + ENDTRACE + if( p->tp_class != TP_CLASS_0 ) { + + if ( !e->ev_union.EV_AK_TPDU.e_fcc_present ) { + /* send ACK with FCC */ + IncStat( ts_ackreason[_ACK_FCC_] ); + (void) tp_emit(AK_TPDU_type, p, p->tp_rcvnxt, 1, MNULL); + } + tp_ctimeout(p, TM_inact, 
(int)p->tp_inact_ticks); + } + } + break; +case 0x32: + { + tp_ctimeout(p, TM_inact, (int)p->tp_inact_ticks); + tp_cuntimeout(p, TM_retrans); + + sbwakeup( &p->tp_sock->so_snd ); + + /* resume normal data */ + tp_send(p); + } + break; +case 0x33: + { + IFTRACE(D_ACKRECV) + tptrace(TPPTmisc, "BOGUS XACK eventtype ", e->ev_number, 0, 0,0); + ENDTRACE + if( p->tp_class != TP_CLASS_0 ) { + tp_ctimeout(p, TM_inact, (int)p->tp_inact_ticks); + } + } + break; +case 0x34: + { + int timo; + IFTRACE(D_TIMER) + tptrace(TPPTsendack, -1, p->tp_lcredit, p->tp_sent_uwe, + p->tp_sent_lcdt, 0); + ENDTRACE + IncPStat(p, tps_n_TMsendack); + (void) tp_emit(AK_TPDU_type, p, p->tp_rcvnxt, 0, MNULL); + if (p->tp_fcredit == 0) { + if (p->tp_rxtshift < TP_MAXRXTSHIFT) + p->tp_rxtshift++; + timo = (p->tp_dt_ticks) << p->tp_rxtshift; + } else + timo = p->tp_sendack_ticks; + tp_ctimeout(p, TM_sendack, timo); + } + break; +case 0x35: + { + if (sbspace(&p->tp_sock->so_rcv) > 0) + tp0_openflow(p); + } + break; +case 0x36: + { + if( trick_hc ) { + SeqNum ack_thresh; + /* + * If the upper window edge has advanced a reasonable + * amount beyond what was known, send an ACK. + * A reasonable amount is 2 packets, unless the max window + * is only 1 or 2 packets, in which case we + * should send an ack for any advance in the upper window edge. + */ + LOCAL_CREDIT(p); + ack_thresh = SEQ_SUB(p, p->tp_lcredit + p->tp_rcvnxt, + (p->tp_maxlcredit > 2 ? 
/*
 * _Xebec_index - pick a state-table index for the (event, state) pairs whose
 * transition depends on a run-time predicate.
 *
 * The switch key is the event number shifted left four bits plus the current
 * connection state: (e->ev_number << 4) + p->tp_state.  Each case evaluates
 * its predicate(s) against the pcb and/or the event and returns an index;
 * tp_driver() calls this routine when its inx[][] lookup yields a negative
 * entry and uses the result to index statetable[].  A return of 0 means no
 * case applied (tp_driver() treats index 0 as the error action).
 *
 * NOTE(review): this looks like output of the xebec state-machine compiler
 * (see the _XEBEC_PG marker); presumably it should be regenerated from the
 * transition file rather than edited by hand -- confirm.
 */
_XEBEC_PG int
_Xebec_index( e,p )
	struct tp_event *e;
	tp_PCB_ *p;
{
switch( (e->ev_number<<4)+(p->tp_state) ) {
/* cases 0x12-0x15: choose on whether any retransmissions remain */
case 0x12:
	if ( p->tp_retrans > 0 ) return 0x1e;
	else return 0x1f;
case 0x13:
	if ( p->tp_retrans > 0 ) return 0x2f;
	else return 0x30;
case 0x14:
	if ( p->tp_retrans > 0 ) return 0x32;
	else return 0x31;
case 0x15:
	if ( p->tp_retrans > 0 ) return 0x34;
	else return 0x35;
case 0x54:
	if (p->tp_rxtshift < TP_NRETRANS) return 0x33;
	else return 0x31;
case 0x64:
	if (p->tp_class == TP_CLASS_0) return 0x1a;
	else return 0x1b;
case 0x77:
	if ( p->tp_class == TP_CLASS_0) return 0xd;
	else return 0xe;
case 0x86:
	if ( e->ev_union.EV_DR_TPDU.e_sref != 0 ) return 0x2;
	else return 0x3;
case 0xa2:
	if (p->tp_class == TP_CLASS_0) return 0x1c;
	else return 0x1d;
case 0xb2:
	if (p->tp_class == TP_CLASS_0) return 0x5;
	else return 0x0;
case 0xb4:
	/* tp_goodack() is called as the predicate; any work it does happens here */
	if ( tp_goodack(p, e->ev_union.EV_AK_TPDU.e_cdt, e->ev_union.EV_AK_TPDU.e_seq, e->ev_union.EV_AK_TPDU.e_subseq) ) return 0x3a;
	else return 0x3b;
case 0xc3:
	/* is the DT sequence number inside [rcvnxt, rcvnxt + lcredit]? */
	if ( IN_RWINDOW( p, e->ev_union.EV_DT_TPDU.e_seq,
					p->tp_rcvnxt, SEQ(p, p->tp_rcvnxt + p->tp_lcredit)) ) return 0x21;
	else return 0x24;
case 0xc4:
	if ( p->tp_class == TP_CLASS_0 ) return 0x22;
	else if ( IN_RWINDOW( p, e->ev_union.EV_DT_TPDU.e_seq,
					p->tp_rcvnxt, SEQ(p, p->tp_rcvnxt + p->tp_lcredit)) ) return 0x23;
	else return 0x25;
case 0xd3:
	/* expedited data: is this the next expected XPD sequence number? */
	if (p->tp_Xrcvnxt == e->ev_union.EV_XPD_TPDU.e_seq) return 0x27;
	else return 0x2a;
case 0xd4:
	if (p->tp_Xrcvnxt == e->ev_union.EV_XPD_TPDU.e_seq) return 0x27;
	else return 0x29;
case 0xe4:
	if ( tp_goodXack(p, e->ev_union.EV_XAK_TPDU.e_seq) ) return 0x3c;
	else return 0x3d;
case 0x102:
	if ( p->tp_class == TP_CLASS_0 ) return 0x2d;
	else return 0x2e;
case 0x104:
	if ( p->tp_class == TP_CLASS_0 ) return 0x2d;
	else return 0x2e;
case 0x144:
	if (p->tp_class == TP_CLASS_0) return 0x3f;
	else return 0x40;
case 0x162:
	if (p->tp_class == TP_CLASS_0) return 0x2b;
	else return 0x2c;
case 0x172:
	if ( p->tp_class != TP_CLASS_4 ) return 0x42;
	else return 0x46;
case 0x174:
	if ( p->tp_class != TP_CLASS_4 ) return 0x42;
	else return 0x47;
case 0x177:
	if ( p->tp_class != TP_CLASS_4 ) return 0x42;
	else return 0x43;
case 0x188:
	/*
	 * For classes other than 0 the predicate itself emits the CC TPDU
	 * (with a copy of tp_ucddata); the index returned depends on whether
	 * tp_emit() reported success (0) or an error.
	 */
	if ( p->tp_class == TP_CLASS_0 ) return 0xf;
	else if (tp_emit(CC_TPDU_type, p, 0,0, MCPY(p->tp_ucddata, M_NOWAIT)) == 0) return 0x10;
	else return 0x11;
default: return 0;
} /* end switch */
} /* _Xebec_index() */
{0x0,0xc,-1,0x2c,0x0,0x2c,0x4,0xc,0x2c, }, + {0x0,0x49,-1,0x45,-1,0x44,0x48,-1,0x0, }, + {0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,-1, }, +}; +tp_driver(p, e) +register tp_PCB_ *p; +register struct tp_event *e; +{ + register int index, error=0; + struct act_ent *a; + static struct act_ent erroraction = {0,-1}; + + index = inx[1 + e->ev_number][p->tp_state]; + if(index<0) index=_Xebec_index(e, p); + if (index==0) { + a = &erroraction; + } else + a = &statetable[index]; + + if(a->a_action) + error = _Xebec_action( a->a_action, e, p ); + IFTRACE(D_DRIVER) + tptrace(DRIVERTRACE, a->a_newstate, p->tp_state, e->ev_number, a->a_action, 0); + ENDTRACE + if(error==0) + p->tp_state = a->a_newstate; + return error; +} diff --git a/sys/netiso/tp_emit.c b/sys/netiso/tp_emit.c new file mode 100644 index 00000000000..16ed5bc7b7b --- /dev/null +++ b/sys/netiso/tp_emit.c @@ -0,0 +1,996 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tp_emit.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+ +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * ARGO TP + * + * $Header: tp_emit.c,v 5.5 88/11/18 17:27:20 nhall Exp $ + * $Source: /usr/argo/sys/netiso/RCS/tp_emit.c,v $ + * + * This file contains tp_emit() and tp_error_emit(), which + * form TPDUs and hand them to ip. + * They take data in the form of mbuf chain, allocate mbufs as + * necessary for headers, and set the fields as appropriate from + * information found in the tpcb and net-level pcb. + * + * The worst thing about this code is adding the variable-length + * options on a machine that requires alignment for any memory access + * that isn't of size 1. See the macro ADDOPTION() below. + * + * We don't do any concatenation. (There's a kludge to test the + * basic mechanism of separation under the 'w' tpdebug option, that's all.) + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#ifdef TRUE +#undef FALSE +#undef TRUE +#endif +#include +#include +#include + +void iso_gen_csum(); + + +/* Here is a mighty kludge. The token ring misorders packets if you + * fire them at it too fast, and TP sans checksum is "too fast", so + * we have introduced a delay when checksumming isn't used. + */ +char tp_delay = 0x00; /* delay to keep token ring from blowing it */ + +/* + * NAME: tp_emit() + * + * CALLED FROM: tp.trans and from tp_sbsend() + * + * FUNCTION and ARGUMENTS: + * Emits one tpdu of the type (dutype), of the format appropriate + * to the connection described by the pcb (tpcb), with sequence + * number (seq) (where appropriate), end-of-tsdu bit (eot) where + * appropriate, and with the data in the mbuf chain (data). 
+ * For DR and ER tpdus, the argument (eot) is + * the reason for issuing the tpdu rather than an end-of-tsdu indicator. + * + * RETURNS: + * 0 OK + * ENOBUFS + * E* returned from net layer output rtn + * + * SIDE EFFECTS: + * + * NOTES: + * + * WE ASSUME that the tp header + all options will fit in ONE mbuf. + * If mbufs are 256 this will most likely be true, but if they are 128 it's + * possible that they won't. + * If you used every option on the CR + max. user data you'd overrun + * 112 but unless you used > 115 bytes for the security + * parameter, it would fit in a 256-byte mbuf (240 bytes for the header) + * We don't support the security parameter, so this isn't a problem. + * If security is added, we ought to remove this assumption. + * + * We do not implement the flow control confirmation "element of procedure". + * A) it should not affect interoperability, + * B) it should not be necessary - the protocol will eventually + * straighten things out w/o FCC, as long as we don't have severely + * mismatched keepalive and inactivity timers, and + * C) it appears not to be REQUIRED, and + * D) it's incredibly grotesque, and no doubt will lengthen a few + * critical paths. + * HOWEVER, we're thinking about putting it in anyway, for + * completeness, just like we did with ack subsequencing. + */ + +int +tp_emit(dutype, tpcb, seq, eot, data) + int dutype; + struct tp_pcb *tpcb; + SeqNum seq; + u_int eot; + struct mbuf *data; +{ + register struct tpdu *hdr; + register struct mbuf *m; + int csum_offset=0; + int datalen = 0; + int error = 0; + SeqNum olduwe; + int acking_ooo; + + /* NOTE: + * here we treat tpdu_li as if it DID include the li field, up until + * the end, at which time we subtract 1 + * THis is because if we subtract 1 right away, we end up adding + * one every time we add an option. 
+ */ + IFDEBUG(D_EMIT) + printf( + "tp_emit dutype 0x%x, tpcb 0x%x, eot 0x%x, seq 0x%x, data 0x%x", + dutype, tpcb, eot, seq, data); + ENDDEBUG + + if (dutype == CR_TPDU || dutype == CC_TPDU) { + m = (struct mbuf *) malloc((u_long)256, M_MBUF, M_DONTWAIT); + if (m) { + m->m_type = TPMT_TPHDR; + mbstat.m_mtypes[TPMT_TPHDR]++; + m->m_next = MNULL; + m->m_nextpkt = MNULL; + m->m_data = m->m_pktdat; + m->m_flags = M_PKTHDR; + } + } else { + MGETHDR(m, M_DONTWAIT, TPMT_TPHDR); + } + m->m_data += max_hdr; + if (m == NULL) { + if(data != (struct mbuf *)0) + m_freem(data); + error = ENOBUFS; + goto done; + } + m->m_len = sizeof(struct tpdu); + m->m_act = MNULL; + + hdr = mtod(m, struct tpdu *); + bzero((caddr_t)hdr, sizeof(struct tpdu)); + + { + int tp_headersize(); + + hdr->tpdu_type = dutype; + hdr->tpdu_li = tp_headersize(dutype, tpcb); + /* + * class 0 doesn't use this for DT + * it'll just get overwritten below + */ + hdr->tpdu_dref = htons(tpcb->tp_fref); + if( tpcb->tp_use_checksum || + (dutype == CR_TPDU_type && (tpcb->tp_class & TP_CLASS_4) )) { + csum_offset = hdr->tpdu_li + 2; /* DOESN'T include csum */ + ADDOPTION(TPP_checksum, hdr, 2, eot /* dummy arg */); + IFDEBUG(D_CHKSUM) + printf( + "tp_emit: csum_offset 0x%x, hdr->tpdu_li 0x%x\n", + csum_offset, hdr->tpdu_li); + ENDDEBUG + } + /* + * VARIABLE PARTS... 
+ */ + switch( dutype ) { + + case CR_TPDU_type: + hdr->tpdu_CRdref_0 = 0; /* must be zero */ + case CC_TPDU_type: + if (!tpcb->tp_cebit_off) { + tpcb->tp_win_recv = tp_start_win << 8; + LOCAL_CREDIT(tpcb); + CONG_INIT_SAMPLE(tpcb); + } else + LOCAL_CREDIT(tpcb); + +/* Case CC_TPDU_type used to be here */ + { + u_char x; + + hdr->tpdu_CCsref = htons(tpcb->tp_lref); /* same as CRsref */ + + if( tpcb->tp_class > TP_CLASS_1 ) { + tpcb->tp_sent_uwe = tpcb->tp_lcredit -1; + tpcb->tp_sent_rcvnxt = 1; + tpcb->tp_sent_lcdt = tpcb->tp_lcredit; + hdr->tpdu_cdt = tpcb->tp_lcredit; + } else { +#ifdef TPCONS + if (tpcb->tp_netservice == ISO_CONS) { + struct isopcb *isop = (struct isopcb *)tpcb->tp_npcb; + struct pklcd *lcp = (struct pklcd *)(isop->isop_chan); + lcp->lcd_flags &= ~X25_DG_CIRCUIT; + } +#endif + hdr->tpdu_cdt = 0; + } + hdr->tpdu_CCclass = tp_mask_to_num(tpcb->tp_class); + hdr->tpdu_CCoptions = + (tpcb->tp_xtd_format? TPO_XTD_FMT:0) | + (tpcb->tp_use_efc? TPO_USE_EFC:0); + + IFPERF(tpcb) + u_char perf_meas = tpcb->tp_perf_on; + ADDOPTION(TPP_perf_meas, hdr, sizeof(perf_meas), perf_meas); + ENDPERF + + if( dutype == CR_TPDU_type ) { + IncStat(ts_CR_sent); + + ASSERT( tpcb->tp_lsuffixlen > 0 ); + ASSERT( tpcb->tp_fsuffixlen > 0 ); + + ADDOPTION(TPP_calling_sufx, hdr, + tpcb->tp_lsuffixlen, tpcb->tp_lsuffix[0]); + ADDOPTION(TPP_called_sufx, hdr, + tpcb->tp_fsuffixlen, tpcb->tp_fsuffix[0]); + } else { + IncStat(ts_CC_sent); + } + + ADDOPTION(TPP_tpdu_size, hdr, + sizeof(tpcb->tp_tpdusize), tpcb->tp_tpdusize); + + if (tpcb->tp_class != TP_CLASS_0) { + short millisec = 500*(tpcb->tp_sendack_ticks); + + millisec = htons(millisec); + ADDOPTION(TPP_acktime, hdr, sizeof(short), millisec); + + x = (tpcb->tp_use_nxpd? TPAO_USE_NXPD: 0) + | (tpcb->tp_use_rcc? TPAO_USE_RCC : 0) + | (tpcb->tp_use_checksum?0: TPAO_NO_CSUM) + | (tpcb->tp_xpd_service? 
TPAO_USE_TXPD: 0); + ADDOPTION(TPP_addl_opt, hdr, 1, x); + + if ((tpcb->tp_l_tpdusize ^ (1 << tpcb->tp_tpdusize)) != 0) { + u_short size_s = tpcb->tp_l_tpdusize >> 7; + u_char size_c = size_s; + ASSERT(tpcb->tp_l_tpdusize < 65536 * 128); + if (dutype == CR_TPDU_type) + tpcb->tp_ptpdusize = size_s; + if (size_s < 256) { + ADDOPTION(TPP_ptpdu_size, hdr, 1, size_c); + } else { + size_s = htons(size_s); + ADDOPTION(TPP_ptpdu_size, hdr, 2, size_s); + } + } + } + + if( (dutype == CR_TPDU_type) && (tpcb->tp_class != TP_CLASS_0)){ + + ASSERT( 1 == sizeof(tpcb->tp_vers) ); + ADDOPTION(TPP_vers, hdr, 1, tpcb->tp_vers); + + /* for each alt protocol class x, + * x = x<<4; + * option = concat(option, x); + * Well, for now we only have TP0 for an + * alternative so... this is easy. + * + * HOWEVER... There should be NO alt protocol + * class over CLNS. Need to see if the route suggests + * CONS, and iff so add alt class. + */ + x = 0; + ADDOPTION(TPP_alt_class, hdr, 1, x); + } + + if( hdr->tpdu_li > MLEN) + panic("tp_emit CR/CC"); + } + break; + + case DR_TPDU_type: + if( hdr->tpdu_DRdref == 0 ) { + /* don't issue the DR */ + goto done; + } + hdr->tpdu_cdt = 0; + hdr->tpdu_DRsref = htons(tpcb->tp_lref); + hdr->tpdu_DRreason = (u_char)eot; /* WHICH BYTE OF THIS??? 
*/ + + /* forget the add'l information variable part */ + IncStat(ts_DR_sent); + break; + + case DC_TPDU_type: /* not used in class 0 */ + ASSERT( tpcb->tp_class != TP_CLASS_0); + hdr->tpdu_DCsref = htons(tpcb->tp_lref); + hdr->tpdu_cdt = 0; + data = (struct mbuf *)0; + IncStat(ts_DC_sent); + break; + + case XAK_TPDU_type: /* xak not used in class 0 */ + ASSERT( tpcb->tp_class != TP_CLASS_0); /* fall through */ + hdr->tpdu_cdt = 0; + + IFTRACE(D_XPD) + tptraceTPCB(TPPTXack, seq, 0, 0, 0, 0); + ENDTRACE + data = (struct mbuf *)0; + if (tpcb->tp_xtd_format) { +#ifdef BYTE_ORDER + union seq_type seqeotX; + + seqeotX.s_seq = seq; + seqeotX.s_eot = 1; + hdr->tpdu_seqeotX = htonl(seqeotX.s_seqeot); +#else + hdr->tpdu_XAKseqX = seq; +#endif /* BYTE_ORDER */ + } else { + hdr->tpdu_XAKseq = seq; + } + IncStat(ts_XAK_sent); + IncPStat(tpcb, tps_XAK_sent); + break; + + case XPD_TPDU_type: /* xpd not used in class 0 */ + ASSERT( tpcb->tp_class != TP_CLASS_0); /* fall through */ + hdr->tpdu_cdt = 0; + if (tpcb->tp_xtd_format) { +#ifdef BYTE_ORDER + union seq_type seqeotX; + + seqeotX.s_seq = seq; + seqeotX.s_eot = 1; + hdr->tpdu_seqeotX = htonl(seqeotX.s_seqeot); +#else + hdr->tpdu_XPDseqX = seq; + hdr->tpdu_XPDeotX = 1; /* always 1 for XPD tpdu */ +#endif /* BYTE_ORDER */ + } else { + hdr->tpdu_XPDseq = seq; + hdr->tpdu_XPDeot = 1; /* always 1 for XPD tpdu */ + } + IncStat(ts_XPD_sent); + IncPStat(tpcb, tps_XPD_sent); + + /* kludge to test the input size checking */ + IFDEBUG(D_SIZE_CHECK) + /*if(data->m_len <= 16 && data->m_off < (MLEN-18) ) { + printf("Sending too much data on XPD: 18 bytes\n"); + data->m_len = 18; + }*/ + ENDDEBUG + break; + + case DT_TPDU_type: + hdr->tpdu_cdt = 0; + IFTRACE(D_DATA) + tptraceTPCB(TPPTmisc, "emit DT: eot seq tpdu_li", eot, seq, + hdr->tpdu_li, 0); + ENDTRACE + if (tpcb->tp_xtd_format) { +#ifdef BYTE_ORDER + union seq_type seqeotX; + + seqeotX.s_seq = seq; + seqeotX.s_eot = eot; + hdr->tpdu_seqeotX = htonl(seqeotX.s_seqeot); +#else + 
hdr->tpdu_DTseqX = seq; + hdr->tpdu_DTeotX = eot; +#endif /* BYTE_ORDER */ + } else if (tpcb->tp_class == TP_CLASS_0) { + IFDEBUG(D_EMIT) + printf("DT tpdu: class 0 m 0x%x hdr 0x%x\n", m, hdr); + dump_buf( hdr, hdr->tpdu_li + 1 ); + ENDDEBUG + ((struct tp0du *)hdr)->tp0du_eot = eot; + ((struct tp0du *)hdr)->tp0du_mbz = 0; + IFDEBUG(D_EMIT) + printf("DT 2 tpdu: class 0 m 0x%x hdr 0x%x\n", m, hdr); + dump_buf( hdr, hdr->tpdu_li + 1 ); + ENDDEBUG + } else { + hdr->tpdu_DTseq = seq; + hdr->tpdu_DTeot = eot; + } + if(eot) { + IncStat(ts_EOT_sent); + } + IncStat(ts_DT_sent); + IncPStat(tpcb, tps_DT_sent); + break; + + case AK_TPDU_type:/* ak not used in class 0 */ + ASSERT( tpcb->tp_class != TP_CLASS_0); + data = (struct mbuf *)0; + olduwe = tpcb->tp_sent_uwe; + + if (seq != tpcb->tp_sent_rcvnxt || tpcb->tp_rsycnt == 0) { + LOCAL_CREDIT( tpcb ); + tpcb->tp_sent_uwe = + SEQ(tpcb,tpcb->tp_rcvnxt + tpcb->tp_lcredit -1); + tpcb->tp_sent_lcdt = tpcb->tp_lcredit; + acking_ooo = 0; + } else + acking_ooo = 1; + + IFDEBUG(D_RENEG) + /* occasionally fake a reneging so + you can test subsequencing */ + if( olduwe & 0x1 ) { + tpcb->tp_reneged = 1; + IncStat(ts_ldebug); + } + ENDDEBUG + /* Are we about to reneg on credit? + * When might we do so? + * a) when using optimistic credit (which we no longer do). + * b) when drain() gets implemented (not in the plans). + * c) when D_RENEG is on. + * d) when DEC BIT response is implemented. 
+ * (not- when we do this, we'll need to implement flow control + * confirmation) + */ + if( SEQ_LT(tpcb, tpcb->tp_sent_uwe, olduwe) ) { + tpcb->tp_reneged = 1; + IncStat(ts_lcdt_reduced); + IFTRACE(D_CREDIT) + tptraceTPCB(TPPTmisc, + "RENEG: olduwe newuwe lcredit rcvnxt", + olduwe, + tpcb->tp_sent_uwe, tpcb->tp_lcredit, + tpcb->tp_rcvnxt); + ENDTRACE + } + IFPERF(tpcb) + /* new lwe is less than old uwe means we're + * acking before we received a whole window full + */ + if( SEQ_LT( tpcb, tpcb->tp_rcvnxt, olduwe) ) { + /* tmp1 = number of pkts fewer than the full window */ + register int tmp1 = + (int) SEQ_SUB( tpcb, olduwe, tpcb->tp_rcvnxt); + + if(tmp1 > TP_PM_MAX) + tmp1 = TP_PM_MAX; + IncPStat( tpcb, tps_ack_early[tmp1] ); + + /* tmp1 = amt of new cdt we're advertising */ + tmp1 = SEQ_SUB( tpcb, seq, tpcb->tp_sent_rcvnxt); + if(tmp1 > TP_PM_MAX ) + tmp1 = TP_PM_MAX; + + IncPStat( tpcb, + tps_cdt_acked [ tmp1 ] + [ ((tpcb->tp_lcredit > TP_PM_MAX)? + TP_PM_MAX:tpcb->tp_lcredit) ] ); + + } + ENDPERF + + IFTRACE(D_ACKSEND) + tptraceTPCB(TPPTack, seq, tpcb->tp_lcredit, tpcb->tp_sent_uwe, + tpcb->tp_r_subseq, 0); + ENDTRACE + if (tpcb->tp_xtd_format) { +#ifdef BYTE_ORDER + union seq_type seqeotX; + + seqeotX.s_seq = seq; + seqeotX.s_eot = 0; + hdr->tpdu_seqeotX = htonl(seqeotX.s_seqeot); + hdr->tpdu_AKcdtX = htons(tpcb->tp_lcredit); +#else + hdr->tpdu_cdt = 0; + hdr->tpdu_AKseqX = seq; + hdr->tpdu_AKcdtX = tpcb->tp_lcredit; +#endif /* BYTE_ORDER */ + } else { + hdr->tpdu_AKseq = seq; + hdr->tpdu_AKcdt = tpcb->tp_lcredit; + } + if ((tpcb->tp_class == TP_CLASS_4) && + (tpcb->tp_reneged || acking_ooo)) { + /* + * Ack subsequence parameter req'd if WE reneged on + * credit offered. (ISO 8073, 12.2.3.8.2, p. 74) + */ + IFDEBUG(D_RENEG) + printf("Adding subseq 0x%x\n", tpcb->tp_s_subseq); + ENDDEBUG + tpcb->tp_s_subseq++; + /* + * add tmp subseq and do a htons on it. 
+ */ + ADDOPTION(TPP_subseq, hdr, + sizeof(tpcb->tp_s_subseq), tpcb->tp_s_subseq); + } else + tpcb->tp_s_subseq = 0; + + if ( tpcb->tp_sendfcc || eot ) /* overloaded to mean SEND FCC */ { + /* + * Rules for sending FCC ("should" send when) : + * %a) received an ack from peer with NO NEWS whatsoever, + * and it did not contain an FCC + * b) received an ack from peer that opens its closed window. + * c) received an ack from peer after it reneged on its + * offered credit, AND this ack raises UWE but LWE is same + * and below UWE at time of reneging (reduction) + * Now, ISO 8073 12.2.3.8.3 says + * that a retransmitted AK shall not contain the FCC + * parameter. Now, how the hell you tell the difference + * between a retransmitted ack and an ack that's sent in + * response to a received ack, I don't know, because without + * any local activity, and w/o any received DTs, they + * will contain exactly the same credit/seq# information. + * Anyway, given that the "retransmission of acks" + * procedure (ISO 8073 12.2.3.8.3) is optional, and we + * don't do it (although the peer can't tell that), we + * ignore this last rule. + * + * We send FCC for reasons a) and b) only. + * To add reason c) would require a ridiculous amount of state. 
+ * + */ + u_short bogus[4]; /* lwe(32), subseq(16), cdt(16) */ + SeqNum lwe; + u_short subseq, fcredit; + + tpcb->tp_sendfcc = 0; + + lwe = (SeqNum) htonl(tpcb->tp_snduna); + subseq = htons(tpcb->tp_r_subseq); + fcredit = htons(tpcb->tp_fcredit); + + bcopy((caddr_t) &lwe, (caddr_t)&bogus[0], sizeof(SeqNum)); + bcopy((caddr_t) &subseq, (caddr_t)&bogus[2], sizeof(u_short)); + bcopy((caddr_t) &fcredit, (caddr_t)&bogus[3], sizeof(u_short)); + + IFTRACE(D_ACKSEND) + tptraceTPCB(TPPTmisc, + "emit w/FCC: snduna r_subseq fcredit", + tpcb->tp_snduna, tpcb->tp_r_subseq, + tpcb->tp_fcredit, 0); + ENDTRACE + + IFDEBUG(D_ACKSEND) + printf("Calling ADDOPTION 0x%x, 0x%x, 0x%x,0x%x\n", + TPP_flow_cntl_conf, + hdr, sizeof(bogus), bogus[0]); + ENDDEBUG + ADDOPTION(TPP_flow_cntl_conf, hdr, sizeof(bogus), bogus[0]); + IFDEBUG(D_ACKSEND) + printf("after ADDOPTION hdr 0x%x hdr->tpdu_li 0x%x\n", + hdr, hdr->tpdu_li); + printf( + "after ADDOPTION csum_offset 0x%x, hdr->tpdu_li 0x%x\n", + csum_offset, hdr->tpdu_li); + ENDDEBUG + + } + tpcb->tp_reneged = 0; + tpcb->tp_sent_rcvnxt = seq; + if (tpcb->tp_fcredit == 0) { + int timo = tpcb->tp_keepalive_ticks; + if (tpcb->tp_rxtshift < TP_MAXRXTSHIFT) + tpcb->tp_rxtshift++; + timo = min(timo, ((int)tpcb->tp_dt_ticks) << tpcb->tp_rxtshift); + tp_ctimeout(tpcb, TM_sendack, timo); + } else + tp_ctimeout(tpcb, TM_sendack, tpcb->tp_keepalive_ticks); + IncStat(ts_AK_sent); + IncPStat(tpcb, tps_AK_sent); + IFDEBUG(D_ACKSEND) + printf( + "2 after rADDOPTION csum_offset 0x%x, hdr->tpdu_li 0x%x\n", + csum_offset, hdr->tpdu_li); + ENDDEBUG + break; + + case ER_TPDU_type: + hdr->tpdu_ERreason = eot; + hdr->tpdu_cdt = 0; + /* no user data */ + data = (struct mbuf *)0; + IncStat(ts_ER_sent); + break; + } + + } + ASSERT( ((int)hdr->tpdu_li > 0) && ((int)hdr->tpdu_li < MLEN) ); + + m->m_next = data; + + ASSERT( hdr->tpdu_li < MLEN ); /* leave this in */ + ASSERT( hdr->tpdu_li != 0 ); /* leave this in */ + + m->m_len = hdr->tpdu_li ; + hdr->tpdu_li --; /* 
doesn't include the li field */ + + datalen = m_datalen( m ); /* total len */ + + ASSERT( datalen <= tpcb->tp_l_tpdusize ); /* may become a problem + when CLNP is used; leave in here for the time being */ + IFDEBUG(D_ACKSEND) + printf( + "4 after rADDOPTION csum_offset 0x%x, hdr->tpdu_li 0x%x\n", + csum_offset, hdr->tpdu_li); + ENDDEBUG + if( datalen > tpcb->tp_l_tpdusize ) { + printf("data len 0x%x tpcb->tp_l_tpdusize 0x%x\n", + datalen, tpcb->tp_l_tpdusize); + } + IFDEBUG(D_EMIT) + printf( + "tp_emit before gen_csum m_len 0x%x, csum_offset 0x%x, datalen 0x%x\n", + m->m_len, csum_offset, datalen); + ENDDEBUG + if( tpcb->tp_use_checksum || + (dutype == CR_TPDU_type && (tpcb->tp_class & TP_CLASS_4)) ) { + iso_gen_csum(m, csum_offset, datalen); + } + + IFDEBUG(D_EMIT) + printf("tp_emit before tpxxx_output tpcb 0x%x, dutype 0x%x, datalen 0x%x\n", + tpcb, dutype, datalen); + dump_buf(mtod(m, caddr_t), datalen); + ENDDEBUG + + IFPERF(tpcb) + if( dutype == DT_TPDU_type ) { + PStat(tpcb, Nb_to_ll) += (datalen - m->m_len); + tpmeas( tpcb->tp_lref, TPtime_to_ll, (struct timeval *)0, + seq, PStat(tpcb, Nb_to_ll), (datalen - m->m_len)); + } + ENDPERF + + IFTRACE(D_EMIT) + tptraceTPCB(TPPTtpduout, dutype, hdr, hdr->tpdu_li+1, datalen, 0); + ENDTRACE + IFDEBUG(D_EMIT) + printf("OUTPUT: tpcb 0x%x, isop 0x%x, so 0x%x\n", + tpcb, tpcb->tp_npcb, tpcb->tp_sock); + ENDDEBUG + + { extern char tp_delay; + + if( tp_delay ) + if( tpcb->tp_use_checksum == 0 ) { + register u_int i = tp_delay; + for (; i!= 0; i--) + (void) iso_check_csum(m, datalen); + } + } + ASSERT( m->m_len > 0 ); + error = (tpcb->tp_nlproto->nlp_output)(tpcb->tp_npcb, m, datalen, + !tpcb->tp_use_checksum); + IFDEBUG(D_EMIT) + printf("OUTPUT: returned 0x%x\n", error); + ENDDEBUG + IFTRACE(D_EMIT) + tptraceTPCB(TPPTmisc, + "tp_emit nlproto->output netservice returns datalen", + tpcb->tp_nlproto->nlp_output, tpcb->tp_netservice, error, datalen); + ENDTRACE +done: + if (error) { + if (dutype == AK_TPDU_type) + 
tp_ctimeout(tpcb, TM_sendack, 1); + if (error == E_CO_QFULL) { + tp_quench(tpcb, PRC_QUENCH); + return 0; + } + } + return error; +} +/* + * NAME: tp_error_emit() + * CALLED FROM: tp_input() when a DR or ER is to be issued in + * response to an input error. + * FUNCTION and ARGUMENTS: + * The error type is the first argument. + * The argument (sref) is the source reference on the bad incoming tpdu, + * and is used for a destination reference on the outgoing packet. + * (faddr) and (laddr) are the foreign and local addresses for this + * connection. + * (erdata) is a ptr to the errant incoming tpdu, and is copied into the + * outgoing ER, if an ER is to be issued. + * (erlen) is the number of octets of the errant tpdu that we should + * try to copy. + * (tpcb) is the pcb that describes the connection for which the bad tpdu + * arrived. + * RETURN VALUES: + * 0 OK + * ENOBUFS + * E* from net layer datagram output routine + * SIDE EFFECTS: + * + * NOTES: + */ + +int +tp_error_emit(error, sref, faddr, laddr, erdata, erlen, tpcb, cons_channel, + dgout_routine) + int error; + u_long sref; + struct sockaddr_iso *faddr, *laddr; + struct mbuf *erdata; + int erlen; + struct tp_pcb *tpcb; + caddr_t cons_channel; + int (*dgout_routine)(); +{ + int dutype; + int datalen = 0; + register struct tpdu *hdr; + register struct mbuf *m; + int csum_offset; + + IFTRACE(D_ERROR_EMIT) + tptrace(TPPTmisc, "tp_error_emit error sref tpcb erlen", + error, sref, tpcb, erlen); + ENDTRACE + IFDEBUG(D_ERROR_EMIT) + printf( + "tp_error_emit error 0x%x sref 0x%x tpcb 0x%x erlen 0x%x chan 0x%x\n", + error, sref, tpcb, erlen, cons_channel); + ENDDEBUG + + MGET(m, M_DONTWAIT, TPMT_TPHDR); + if (m == NULL) { + return ENOBUFS; + } + m->m_len = sizeof(struct tpdu); + m->m_act = MNULL; + + hdr = mtod(m, struct tpdu *); + + IFDEBUG(D_ERROR_EMIT) + printf("[error 0x%x] [error&0xff 0x%x] [(char)error 0x%x]\n", + error, error&0xff, (char)error); + ENDDEBUG + + + if (error & TP_ERROR_SNDC) + dutype = 
DC_TPDU_type; + else if (error & 0x40) { + error &= ~0x40; + dutype = ER_TPDU_type; + } else + dutype = DR_TPDU_type; + error &= 0xff; + + hdr->tpdu_type = dutype; + hdr->tpdu_cdt = 0; + + switch( dutype ) { + + case DC_TPDU_type: + IncStat(ts_DC_sent); + hdr->tpdu_li = 6; + hdr->tpdu_DCdref = htons(sref); + hdr->tpdu_DCsref = tpcb ? htons(tpcb->tp_lref) : 0; + IFDEBUG(D_ERROR_EMIT) + printf("DC case:\n"); + dump_buf( hdr, 6); + ENDDEBUG + /* forget the add'l information variable part */ + break; + + case DR_TPDU_type: + IncStat(ts_DR_sent); + hdr->tpdu_li = 7; + hdr->tpdu_DRdref = htons(sref); + hdr->tpdu_DRsref = 0; + hdr->tpdu_DRreason = (char)error; + IFDEBUG(D_ERROR_EMIT) + printf("DR case:\n"); + dump_buf( hdr, 7); + ENDDEBUG + /* forget the add'l information variable part */ + break; + + case ER_TPDU_type: + IncStat(ts_ER_sent); + hdr->tpdu_li = 5; + hdr->tpdu_ERreason = (char)error; + hdr->tpdu_ERdref = htons(sref); + break; + + default: + ASSERT(0); + printf("TP PANIC: bad dutype 0x%x\n", dutype); + } + + if(tpcb) + if( tpcb->tp_use_checksum ) { + ADDOPTION(TPP_checksum, hdr, 2, csum_offset /* dummy argument */); + csum_offset = hdr->tpdu_li - 2; + } + + ASSERT( hdr->tpdu_li < MLEN ); + + if (dutype == ER_TPDU_type) { + /* copy the errant tpdu into another 'variable part' */ + register caddr_t P; + + IFTRACE(D_ERROR_EMIT) + tptrace(TPPTmisc, "error_emit ER len tpduli", erlen, hdr->tpdu_li, + 0,0); + ENDTRACE + IFDEBUG(D_ERROR_EMIT) + printf("error_emit ER len 0x%x tpduli 0x%x\n", erlen, hdr->tpdu_li); + ENDDEBUG + + /* copy at most as many octets for which you have room */ + if (erlen + hdr->tpdu_li + 2 > TP_MAX_HEADER_LEN) + erlen = TP_MAX_HEADER_LEN - hdr->tpdu_li - 2; + + /* add the "invalid tpdu" parameter : required in class 0 */ + P = (caddr_t)hdr + (int)(hdr->tpdu_li); + vbptr(P)->tpv_code = TPP_invalid_tpdu; /* parameter code */ + vbptr(P)->tpv_len = erlen; /* parameter length */ + m->m_len = hdr->tpdu_li + 2; /* 1 for code, 1 for length */ + + /* 
tp_input very likely handed us an mbuf chain w/ nothing in + * the first mbuf and the data following the empty mbuf + */ + if(erdata->m_len == 0) { + erdata = m_free(erdata); /* returns the next mbuf on the chain */ + } + /* + * copy only up to the bad octet + * (or max that will fit in a header + */ + m->m_next = m_copy(erdata, 0, erlen); + hdr->tpdu_li += erlen + 2; + m_freem(erdata); + } else { + IFDEBUG(D_ERROR_EMIT) + printf("error_emit DR error tpduli 0x%x\n", error, hdr->tpdu_li); + dump_buf( (char *)hdr, hdr->tpdu_li ); + ENDDEBUG + m->m_len = hdr->tpdu_li ; + m_freem(erdata); + } + + hdr->tpdu_li --; + IFTRACE(D_ERROR_EMIT) + tptrace(TPPTtpduout, 2, hdr, hdr->tpdu_li+1, 0, 0); + ENDTRACE + + datalen = m_datalen( m); + if (tpcb) { + if( tpcb->tp_use_checksum ) { + IFTRACE(D_ERROR_EMIT) + tptrace(TPPTmisc, "before gen csum datalen", datalen,0,0,0); + ENDTRACE + IFDEBUG(D_ERROR_EMIT) + printf("before gen csum datalen 0x%x, csum_offset 0x%x\n", + datalen, csum_offset); + ENDDEBUG + + iso_gen_csum(m, csum_offset, datalen); + } + + IFDEBUG(D_ERROR_EMIT) + printf("OUTPUT: tpcb 0x%x, isop 0x%x, so 0x%x\n", + tpcb, tpcb->tp_npcb, tpcb->tp_sock); + ENDDEBUG + } + if (cons_channel) { +#ifdef TPCONS + struct pklcd *lcp = (struct pklcd *)cons_channel; + struct isopcb *isop = (struct isopcb *)lcp->lcd_upnext; + + tpcons_dg_output(cons_channel, m, datalen); + /* was if (tpcb == 0) iso_pcbdetach(isop); */ + /* but other side may want to try again over same VC, + so, we'll depend on him closing it, but in case it gets forgotten + we'll mark it for garbage collection */ + lcp->lcd_flags |= X25_DG_CIRCUIT; + IFDEBUG(D_ERROR_EMIT) + printf("OUTPUT: dutype 0x%x channel 0x%x\n", + dutype, cons_channel); + ENDDEBUG +#else + printf("TP panic! 
cons channel 0x%x but not cons configured\n", + cons_channel); +#endif + } else if (tpcb) { + + IFDEBUG(D_ERROR_EMIT) + printf("tp_error_emit 1 sending DG: Laddr\n"); + dump_addr((struct sockaddr *)laddr); + printf("Faddr\n"); + dump_addr((struct sockaddr *)faddr); + ENDDEBUG + return (tpcb->tp_nlproto->nlp_dgoutput)( + &laddr->siso_addr, + &faddr->siso_addr, + m, datalen, + /* no route */ (caddr_t)0, !tpcb->tp_use_checksum); + } else if (dgout_routine) { + IFDEBUG(D_ERROR_EMIT) + printf("tp_error_emit sending DG: Laddr\n"); + dump_addr((struct sockaddr *)laddr); + printf("Faddr\n"); + dump_addr((struct sockaddr *)faddr); + ENDDEBUG + return (*dgout_routine)( &laddr->siso_addr, &faddr->siso_addr, + m, datalen, /* no route */ + (caddr_t)0, /* nochecksum==false */0); + } else { + IFDEBUG(D_ERROR_EMIT) + printf("tp_error_emit DROPPING \n", m); + ENDDEBUG + IncStat(ts_send_drop); + m_freem(m); + return 0; + } +} diff --git a/sys/netiso/tp_events.h b/sys/netiso/tp_events.h new file mode 100644 index 00000000000..48222830a0d --- /dev/null +++ b/sys/netiso/tp_events.h @@ -0,0 +1,84 @@ +/* $Header$ */ +/* $Source$ */ +struct tp_event { + int ev_number; + struct timeval e_time; +#define TM_inact 0x0 +#define TM_retrans 0x1 +#define TM_sendack 0x2 +#define TM_notused 0x3 + + union{ +struct { SeqNum e_low; SeqNum e_high; int e_retrans; } EV_TM_reference; + +#define TM_reference 0x4 +struct { SeqNum e_low; SeqNum e_high; int e_retrans; } EV_TM_data_retrans; + +#define TM_data_retrans 0x5 +struct { + u_char e_reason; + } EV_ER_TPDU; + +#define ER_TPDU 0x6 +struct { struct mbuf *e_data; /* first field */ + int e_datalen; /* 2nd field */ + u_int e_cdt; + } EV_CR_TPDU; + +#define CR_TPDU 0x7 +struct { struct mbuf *e_data; /* first field */ + int e_datalen; /* 2nd field */ + u_short e_sref; + u_char e_reason; + } EV_DR_TPDU; + +#define DR_TPDU 0x8 +#define DC_TPDU 0x9 +struct { struct mbuf *e_data; /* first field */ + int e_datalen; /* 2nd field */ + u_short e_sref; + u_int e_cdt; 
+ } EV_CC_TPDU; + +#define CC_TPDU 0xa +struct { u_int e_cdt; + SeqNum e_seq; + SeqNum e_subseq; + u_char e_fcc_present; + } EV_AK_TPDU; + +#define AK_TPDU 0xb +struct { struct mbuf *e_data; /* first field */ + int e_datalen; /* 2nd field */ + u_int e_eot; + SeqNum e_seq; + } EV_DT_TPDU; + +#define DT_TPDU 0xc +struct { struct mbuf *e_data; /* first field */ + int e_datalen; /* 2nd field */ + SeqNum e_seq; + } EV_XPD_TPDU; + +#define XPD_TPDU 0xd +struct { SeqNum e_seq; } EV_XAK_TPDU; + +#define XAK_TPDU 0xe +#define T_CONN_req 0xf +struct { u_char e_reason; } EV_T_DISC_req; + +#define T_DISC_req 0x10 +#define T_LISTEN_req 0x11 +#define T_DATA_req 0x12 +#define T_XPD_req 0x13 +#define T_USR_rcvd 0x14 +#define T_USR_Xrcvd 0x15 +#define T_DETACH 0x16 +#define T_NETRESET 0x17 +#define T_ACPT_req 0x18 + }ev_union; +};/* end struct event */ + +#define tp_NEVENTS 0x19 + +#define ATTR(X)ev_union.EV_/**/X/**/ diff --git a/sys/netiso/tp_inet.c b/sys/netiso/tp_inet.c new file mode 100644 index 00000000000..fb013718ba2 --- /dev/null +++ b/sys/netiso/tp_inet.c @@ -0,0 +1,688 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tp_inet.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. 
+ +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * ARGO TP + * $Header: tp_inet.c,v 5.3 88/11/18 17:27:29 nhall Exp $ + * $Source: /usr/argo/sys/netiso/RCS/tp_inet.c,v $ + * + * Here is where you find the inet-dependent code. We've tried + * keep all net-level and (primarily) address-family-dependent stuff + * out of the tp source, and everthing here is reached indirectly + * through a switch table (struct nl_protosw *) tpcb->tp_nlproto + * (see tp_pcb.c). + * The routines here are: + * in_getsufx: gets transport suffix out of an inpcb structure. + * in_putsufx: put transport suffix into an inpcb structure. + * in_putnetaddr: put a whole net addr into an inpcb. + * in_getnetaddr: get a whole net addr from an inpcb. + * in_cmpnetaddr: compare a whole net addr from an isopcb. 
+ * in_recycle_suffix: clear suffix for reuse in inpcb + * tpip_mtu: figure out what size tpdu to use + * tpip_input: take a pkt from ip, strip off its ip header, give to tp + * tpip_output_dg: package a pkt for ip given 2 addresses & some data + * tpip_output: package a pkt for ip given an inpcb & some data + */ + +#ifdef INET + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef ISO +#include +#endif + +/* + * NAME: in_getsufx() + + * CALLED FROM: pr_usrreq() on PRU_BIND, + * PRU_CONNECT, PRU_ACCEPT, and PRU_PEERADDR + * + * FUNCTION, ARGUMENTS, and RETURN VALUE: + * Get a transport suffix from an inpcb structure (inp). + * The argument (which) takes the value TP_LOCAL or TP_FOREIGN + * and selects inp_lport or inp_fport respectively. + * The suffix is stored through (data_out) as a u_short and its + * length, sizeof(u_short), is stored through (lenp). + * + * RETURNS: no meaningful value; the result is delivered through + * (lenp) and (data_out) only. + * + * SIDE EFFECTS: + * + * NOTES: For any other value of (which) only *lenp is written. + */ +in_getsufx(inp, lenp, data_out, which) + struct inpcb *inp; + u_short *lenp; + caddr_t data_out; + int which; +{ + *lenp = sizeof(u_short); + switch (which) { + case TP_LOCAL: + *(u_short *)data_out = inp->inp_lport; + return; + + case TP_FOREIGN: + *(u_short *)data_out = inp->inp_fport; + } + +} + +/* + * NAME: in_putsufx() + * + * CALLED FROM: tp_newsocket(); i.e., when a connection + * is being established by an incoming CR_TPDU. + * + * FUNCTION, ARGUMENTS: + * Put a transport suffix (found at sufxloc) into an inpcb structure (inp). + * The argument (which) takes the value TP_LOCAL or TP_FOREIGN; + * only TP_FOREIGN is acted on (inp_fport is overwritten), + * any other value leaves (inp) untouched. + * (sufxlen) is not used; sizeof(inp->inp_fport) bytes are copied. + * + * RETURNS: Nada + * + * SIDE EFFECTS: + * + * NOTES: + */ +/*ARGSUSED*/ +void +in_putsufx(inp, sufxloc, sufxlen, which) + struct inpcb *inp; + caddr_t sufxloc; + int which; +{ + if (which == TP_FOREIGN) { + bcopy(sufxloc, (caddr_t)&inp->inp_fport, sizeof(inp->inp_fport)); + } +} + +/* + * NAME: in_recycle_tsuffix() + * + * CALLED FROM: tp.trans whenever we go into REFWAIT state. 
+ * + * FUNCTION and ARGUMENT: + * Called when a ref is frozen, to allow the suffix to be reused. + * (inp) is the net level pcb. + * + * RETURNS: Nada + * + * SIDE EFFECTS: + * + * NOTES: This really shouldn't have to be done in a NET level pcb + * but... for the internet world that just the way it is done in BSD... + * The alternative is to have the port unusable until the reference + * timer goes off. + */ +void +in_recycle_tsuffix(inp) + struct inpcb *inp; +{ + inp->inp_fport = inp->inp_lport = 0; +} + +/* + * NAME: in_putnetaddr() + * + * CALLED FROM: + * tp_newsocket(); i.e., when a connection is being established by an + * incoming CR_TPDU. + * + * FUNCTION and ARGUMENTS: + * Copy a whole net addr from a struct sockaddr (name) + * into an inpcb (inp). + * The argument (which) takes values TP_LOCAL or TP_FOREIGN + * and selects inp_laddr or inp_faddr as the destination. + * A null (name) is tolerated only for TP_FOREIGN. + * + * RETURNS: Nada + * + * SIDE EFFECTS: + * + * NOTES: + */ +void +in_putnetaddr(inp, name, which) + register struct inpcb *inp; + struct sockaddr_in *name; + int which; +{ + switch (which) { + case TP_LOCAL: + bcopy((caddr_t)&name->sin_addr, + (caddr_t)&inp->inp_laddr, sizeof(struct in_addr)); + /* won't work if the dst address (name) is INADDR_ANY */ + + break; + case TP_FOREIGN: + if( name != (struct sockaddr_in *)0 ) { + bcopy((caddr_t)&name->sin_addr, + (caddr_t)&inp->inp_faddr, sizeof(struct in_addr)); + } + } +} + +/* + * NAME: in_cmpnetaddr() + * + * CALLED FROM: + * tp_input() when a connection is being established by an + * incoming CR_TPDU, and considered for interception. + * + * FUNCTION and ARGUMENTS: + * Compare a whole net addr from a struct sockaddr (name), + * with that implicitly stored in an inpcb (inp). 
+ * The argument (which) takes values TP_LOCAL or TP_FOREIGN; + * TP_LOCAL selects the local address/port of (inp), + * anything else the foreign ones. + * A zero port in (name) matches any port. + * + * RETURNS: nonzero if (name) matches, 0 otherwise + * + * SIDE EFFECTS: + * + * NOTES: + */ +in_cmpnetaddr(inp, name, which) + register struct inpcb *inp; + register struct sockaddr_in *name; + int which; +{ + if (which == TP_LOCAL) { + if (name->sin_port && name->sin_port != inp->inp_lport) + return 0; + return (name->sin_addr.s_addr == inp->inp_laddr.s_addr); + } + if (name->sin_port && name->sin_port != inp->inp_fport) + return 0; + return (name->sin_addr.s_addr == inp->inp_faddr.s_addr); +} + +/* + * NAME: in_getnetaddr() + * + * CALLED FROM: + * pr_usrreq() PRU_SOCKADDR, PRU_ACCEPT, PRU_PEERADDR + * FUNCTION and ARGUMENTS: + * Copy a whole net addr from an inpcb (inp) into + * an mbuf (name); + * The argument (which) takes values TP_LOCAL or TP_FOREIGN. + * + * RETURNS: Nada + * + * SIDE EFFECTS: + * Zeroes the sockaddr_in at the start of (name); for a recognized + * (which) it also fills in the address, port, family and length + * and sets name->m_len to sizeof(struct sockaddr_in). + * + * NOTES: + * For an unrecognized (which) the sockaddr is left zeroed and + * name->m_len is not touched. + */ + +void +in_getnetaddr( inp, name, which) + register struct mbuf *name; + struct inpcb *inp; + int which; +{ + register struct sockaddr_in *sin = mtod(name, struct sockaddr_in *); + bzero((caddr_t)sin, sizeof(*sin)); + switch (which) { + case TP_LOCAL: + sin->sin_addr = inp->inp_laddr; + sin->sin_port = inp->inp_lport; + break; + case TP_FOREIGN: + sin->sin_addr = inp->inp_faddr; + sin->sin_port = inp->inp_fport; + break; + default: + return; + } + name->m_len = sin->sin_len = sizeof (*sin); + sin->sin_family = AF_INET; +} + +/* + * NAME: tpip_mtu() + * + * CALLED FROM: + * tp_route_to() on incoming CR, CC, and pr_usrreq() for PRU_CONNECT + * + * FUNCTION, ARGUMENTS, and RETURN VALUE: + * + * Perform subnetwork dependent part of determining MTU information. + * It appears that setting a double pointer to the rtentry associated with + * the destination, and returning the header size for the network protocol + * suffices. + * + * SIDE EFFECTS: + * Sets tp_routep pointer in pcb. 
+ * + * NOTES: + */ + +tpip_mtu(tpcb) +register struct tp_pcb *tpcb; +{ + struct inpcb *inp = (struct inpcb *)tpcb->tp_npcb; + + IFDEBUG(D_CONN) + /* the tpcb argument needs a conversion or it is never printed */ + printf("tpip_mtu(tpcb 0x%x)\n", tpcb); + printf("tpip_mtu routing to addr 0x%x\n", inp->inp_faddr.s_addr); + ENDDEBUG + /* hand tp a pointer to the route cached in the inpcb */ + tpcb->tp_routep = &(inp->inp_route.ro_rt); + /* the only per-packet overhead reported is the fixed ip header */ + return (sizeof (struct ip)); + +} + +/* + * NAME: tpip_output() + * + * CALLED FROM: tp_emit() + * + * FUNCTION and ARGUMENTS: + * Take a packet(m0) from tp and package it so that ip will accept it. + * This means prepending space for the ip header and filling in a few + * of the fields. + * inp is the inpcb structure; datalen is the length of the data in the + * mbuf string m0. + * The work is done by tpip_output_dg(), called with the addresses + * and cached route taken from (inp). + * RETURNS: + * whatever (E*) is returned from tpip_output_dg(). + * + * SIDE EFFECTS: + * + * NOTES: + */ + +int +tpip_output(inp, m0, datalen, nochksum) + struct inpcb *inp; + struct mbuf *m0; + int datalen; + int nochksum; +{ + return tpip_output_dg( &inp->inp_laddr, &inp->inp_faddr, m0, datalen, + &inp->inp_route, nochksum); +} + +/* + * NAME: tpip_output_dg() + * + * CALLED FROM: tp_error_emit() + * + * FUNCTION and ARGUMENTS: + * This is a copy of tpip_output that takes the addresses + * instead of a pcb. It's used by the tp_error_emit, when we + * don't have an in_pcb with which to call the normal output rtn. + * + * RETURNS: ENOBUFS or whatever (E*) is + * returned from the net layer output routine. 
+ * + * SIDE EFFECTS: + * + * NOTES: + */ + +/*ARGSUSED*/ +int +tpip_output_dg(laddr, faddr, m0, datalen, ro, nochksum) + struct in_addr *laddr, *faddr; + struct mbuf *m0; + int datalen; + struct route *ro; + int nochksum; +{ + register struct mbuf *m; + register struct ip *ip; + int error; + + IFDEBUG(D_EMIT) + printf("tpip_output_dg datalen 0x%x m0 0x%x\n", datalen, m0); + ENDDEBUG + + + /* a fresh packet-header mbuf carries the ip header in front of m0 */ + MGETHDR(m, M_DONTWAIT, TPMT_IPHDR); + if (m == 0) { + error = ENOBUFS; + goto bad; + } + m->m_next = m0; + MH_ALIGN(m, sizeof(struct ip)); + m->m_len = sizeof(struct ip); + + ip = mtod(m, struct ip *); + bzero((caddr_t)ip, sizeof *ip); + + ip->ip_p = IPPROTO_TP; + m->m_pkthdr.len = ip->ip_len = sizeof(struct ip) + datalen; + ip->ip_ttl = MAXTTL; + /* don't know why you need to set ttl; + * overlay doesn't even make this available + */ + + ip->ip_src = *laddr; + ip->ip_dst = *faddr; + + IncStat(ts_tpdu_sent); + IFDEBUG(D_EMIT) + dump_mbuf(m, "tpip_output_dg before ip_output\n"); + ENDDEBUG + + error = ip_output(m, (struct mbuf *)0, ro, IP_ALLOWBROADCAST, NULL); + + IFDEBUG(D_EMIT) + printf("tpip_output_dg after ip_output\n"); + ENDDEBUG + + return error; + +bad: + /* + * Only reached when no header mbuf could be had, so m is null + * and ip_output() was never given the chain: release the + * caller's data (m0) here or it is lost. + */ + m_freem(m0); + IncStat(ts_send_drop); + return error; +} + +/* + * NAME: tpip_input() + * + * CALLED FROM: + * ip's input routine, indirectly through the protosw. + * + * FUNCTION and ARGUMENTS: + * Take a packet (m) from ip, strip off the ip header and give it to tp + * (iplen) is the length of the ip header at the front of (m). + * + * RETURNS: always 0 + * + * SIDE EFFECTS: + * Runs at splnet; the previous level is restored on every return. + * + * NOTES: + */ +ProtoHook +tpip_input(m, iplen) + struct mbuf *m; + int iplen; +{ + struct sockaddr_in src, dst; + register struct ip *ip; + int s = splnet(), hdrlen; + + IncStat(ts_pkt_rcvd); + + /* + * IP layer has already pulled up the IP header, + * but the first byte after the IP header may not be there, + * e.g. if you came in via loopback, so you have to do an + * m_pullup to before you can even look to see how much you + * really need. 
The good news is that m_pullup will round + * up to almost the next mbuf's worth. + */ + + + if((m = m_pullup(m, iplen + 1)) == MNULL) + goto discard; + CHANGE_MTYPE(m, TPMT_DATA); + + /* + * Now pull up the whole tp header: + * Unfortunately, there may be IP options to skip past so we + * just fetch it as an unsigned char. + */ + hdrlen = iplen + 1 + mtod(m, u_char *)[iplen]; + + if( m->m_len < hdrlen ) { + if((m = m_pullup(m, hdrlen)) == MNULL){ + IFDEBUG(D_TPINPUT) + printf("tp_input, pullup 2!\n"); + ENDDEBUG + goto discard; + } + } + /* + * cannot use tp_inputprep() here 'cause you don't + * have quite the same situation + */ + + IFDEBUG(D_TPINPUT) + dump_mbuf(m, "after tpip_input both pullups"); + ENDDEBUG + /* + * m_pullup may have returned a different mbuf + */ + ip = mtod(m, struct ip *); + + /* + * drop the ip header from the front of the mbuf + * this is necessary for the tp checksum + */ + m->m_len -= iplen; + m->m_data += iplen; + + /* + * start from all-zero sockaddrs: in_cmpnetaddr() looks at + * sin_port, which must not be left as stack garbage + */ + bzero((caddr_t)&src, sizeof(src)); + bzero((caddr_t)&dst, sizeof(dst)); + src.sin_addr = *(struct in_addr *)&(ip->ip_src); + src.sin_family = AF_INET; + src.sin_len = sizeof(src); + dst.sin_addr = *(struct in_addr *)&(ip->ip_dst); + dst.sin_family = AF_INET; + dst.sin_len = sizeof(dst); + + (void) tp_input(m, (struct sockaddr *)&src, (struct sockaddr *)&dst, + 0, tpip_output_dg, 0); + /* undo the splnet() taken on entry, as the discard path does */ + splx(s); + return 0; + +discard: + IFDEBUG(D_TPINPUT) + printf("tpip_input DISCARD\n"); + ENDDEBUG + IFTRACE(D_TPINPUT) + tptrace(TPPTmisc, "tpip_input DISCARD m", m,0,0,0); + ENDTRACE + m_freem(m); + IncStat(ts_recv_drop); + splx(s); + return 0; +} + + +#include +#include + +extern void tp_quench(); +/* + * NAME: tpin_quench() + * + * CALLED FROM: tpip_ctlinput() + * + * FUNCTION and ARGUMENTS: find the tpcb pointer and pass it to tp_quench + * + * RETURNS: Nada + * + * SIDE EFFECTS: + * + * NOTES: + */ + +void +tpin_quench(inp) + struct inpcb *inp; +{ + tp_quench((struct tp_pcb *)inp->inp_socket->so_pcb, PRC_QUENCH); +} + +/* + * NAME: tpip_ctlinput() + * + * CALLED FROM: + * The network layer through the protosw table. 
+ * + * FUNCTION and ARGUMENTS: + * When ip gets an ICMP msg this gets called. + * It either returns an error status to the user or + * causes all connections on this address to be aborted + * by calling the appropriate xx_notify() routine. + * (cmd) is the type of ICMP error. + * (sin) the address of the sender + * + * RETURNS: always 0 + * + * SIDE EFFECTS: + * + * NOTES: + * Requests for a family other than AF_INET/AF_IMPLINK, for + * INADDR_ANY, or with (cmd) out of range are ignored. + */ +ProtoHook +tpip_ctlinput(cmd, sin) + int cmd; + struct sockaddr_in *sin; +{ + extern u_char inetctlerrmap[]; + extern struct in_addr zeroin_addr; + void tpin_abort __P((struct inpcb *,int)); + + if (sin->sin_family != AF_INET && sin->sin_family != AF_IMPLINK) + return 0; + if (sin->sin_addr.s_addr == INADDR_ANY) + return 0; + /* PRC_NCMDS is the number of commands, so it is itself out of range */ + if (cmd < 0 || cmd >= PRC_NCMDS) + return 0; + switch (cmd) { + + case PRC_QUENCH: + /* + * in_pcbnotify() calls back with an inpcb; tpin_quench() + * turns that into the tp_pcb that tp_quench() expects. + */ + in_pcbnotify(&tp_inpcb, (struct sockaddr *)sin, 0, + zeroin_addr, 0, cmd, tpin_quench); + break; + + case PRC_ROUTEDEAD: + case PRC_HOSTUNREACH: + case PRC_UNREACH_NET: + case PRC_IFDOWN: + case PRC_HOSTDEAD: + in_pcbnotify(&tp_inpcb, (struct sockaddr *)sin, 0, + zeroin_addr, 0, cmd, in_rtchange); + break; + + default: + /* + case PRC_MSGSIZE: + case PRC_UNREACH_HOST: + case PRC_UNREACH_PROTOCOL: + case PRC_UNREACH_PORT: + case PRC_UNREACH_NEEDFRAG: + case PRC_UNREACH_SRCFAIL: + case PRC_REDIRECT_NET: + case PRC_REDIRECT_HOST: + case PRC_REDIRECT_TOSNET: + case PRC_REDIRECT_TOSHOST: + case PRC_TIMXCEED_INTRANS: + case PRC_TIMXCEED_REASS: + case PRC_PARAMPROB: + */ + in_pcbnotify(&tp_inpcb, (struct sockaddr *)sin, 0, + zeroin_addr, 0, cmd, tpin_abort); + } + return 0; +} + +/* + * NAME: tpin_abort() + * + * CALLED FROM: + * xxx_notify() from tp_ctlinput() when + * net level gets some ICMP-equiv. type event. + * + * FUNCTION and ARGUMENTS: + * Cause the connection to be aborted with some sort of error + * reason indicating that the network layer caused the abort. + * Fakes an ER TPDU so we can go through the driver. 
+ * + * RETURNS: Nothing + * + * SIDE EFFECTS: + * + * NOTES: + */ + +ProtoHook +tpin_abort(inp) + struct inpcb *inp; +{ + struct tp_event e; + + e.ev_number = ER_TPDU; + e.ATTR(ER_TPDU).e_reason = ENETRESET; + (void) tp_driver((struct tp_pcb *)inp->inp_ppcb, &e); + return 0; +} + +#ifdef ARGO_DEBUG +dump_inaddr(addr) + register struct sockaddr_in *addr; +{ + printf("INET: port 0x%x; addr 0x%x\n", addr->sin_port, addr->sin_addr); +} +#endif /* ARGO_DEBUG */ +#endif /* INET */ diff --git a/sys/netiso/tp_input.c b/sys/netiso/tp_input.c new file mode 100644 index 00000000000..a071a5d4add --- /dev/null +++ b/sys/netiso/tp_input.c @@ -0,0 +1,1624 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tp_input.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * ARGO TP + * + * $Header: tp_input.c,v 5.6 88/11/18 17:27:38 nhall Exp $ + * $Source: /usr/argo/sys/netiso/RCS/tp_input.c,v $ + * + * tp_input() gets an mbuf chain from ip. 
Actually, not directly + * from ip, because ip calls a net-level routine that strips off + * the net header and then calls tp_input(), passing the proper type + * of addresses for the address family in use (how it figures out + * which AF is not yet determined.) + * + * Decomposing the tpdu is some of the most laughable code. The variable-length + * parameters and the problem of non-aligned memory references + * necessitates such abominations as the macros WHILE_OPTIONS (q.v. below) + * to loop through the header and decompose it. + * + * The routine tp_newsocket() is called when a CR comes in for a listening + * socket. tp_input calls sonewconn() and tp_newsocket() to set up the + * "child" socket. Most tpcb values are copied from the parent tpcb into + * the child. + * + * Also in here is tp_headersize() (grot) which tells the expected size + * of a tp header, to be used by other layers. It's in here because it + * uses the static structure tpdu_info. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#ifdef TRUE +#undef FALSE +#undef TRUE +#endif +#include +#include +#include + +int iso_check_csum(), tp_driver(), tp_headersize(), tp_error_emit(); + +/* + #ifdef lint + #undef ATTR + #define ATTR(X)ev_number + #endif lint +*/ + +struct mbuf * +tp_inputprep(m) + register struct mbuf *m; +{ + int hdrlen; + + IFDEBUG(D_TPINPUT) + printf("tp_inputprep: m 0x%x\n", m) ; + ENDDEBUG + + while( m->m_len < 1 ) { + /* The "m_free" logic + * if( (m = m_free(m)) == MNULL ) + * return (struct mbuf *)0; + * would cause a system crash if ever executed. + * This logic will be executed if the first mbuf + * in the chain only contains a CLNP header. The m_free routine + * will release the mbuf containing the CLNP header from the + * chain and the new head of the chain will not have the + * M_PKTHDR bit set. 
This routine, tp_inputprep, will + * eventually call the "sbappendaddr" routine. "sbappendaddr" + * calls "panic" if M_PKTHDR is not set. m_pullup is a cheap + * way of keeping the head of the chain from being freed. + */ + if((m = m_pullup(m, 1)) == MNULL) + return (MNULL); + } + if(((int)m->m_data) & 0x3) { + /* If we are not 4-byte aligned, we have to be + * above the beginning of the mbuf, and it is ok just + * to slide it back. + */ + caddr_t ocp = m->m_data; + + m->m_data = (caddr_t)(((int)m->m_data) & ~0x3); + bcopy(ocp, m->m_data, (unsigned)m->m_len); + } + CHANGE_MTYPE(m, TPMT_DATA); + + /* we KNOW that there is at least 1 byte in this mbuf + and that it is hdr->tpdu_li XXXXXXX! */ + + hdrlen = 1 + *mtod( m, u_char *); + + /* + * now pull up the whole tp header + */ + if ( m->m_len < hdrlen) { + if ((m = m_pullup(m, hdrlen)) == MNULL ) { + IncStat(ts_recv_drop); + return (struct mbuf *)0; + } + } + IFDEBUG(D_INPUT) + printf( + " at end: m 0x%x hdr->tpdu_li 0x%x m_len 0x%x\n",m, + hdrlen, m->m_len); + ENDDEBUG + return m; +} + +/* begin groan + * -- this array and the following macros allow you to step through the + * parameters of the variable part of a header + * note that if for any reason the values of the **_TPDU macros (in tp_events.h) + * should change, this array has to be rearranged + */ + +#define TP_LEN_CLASS_0_INDEX 2 +#define TP_MAX_DATA_INDEX 3 + +static u_char tpdu_info[][4] = +{ +/* length max data len */ +/* reg fmt xtd fmt class 0 */ + /* UNUSED 0x0 */ 0x0 , 0x0, 0x0, 0x0, + /* XPD_TPDU_type 0x1 */ 0x5, 0x8, 0x0, TP_MAX_XPD_DATA, + /* XAK_TPDU_type 0x2 */ 0x5 , 0x8, 0x0, 0x0, + /* GR_TPDU_type 0x3 */ 0x0 , 0x0, 0x0, 0x0, + /* UNUSED 0x4 */ 0x0 , 0x0, 0x0, 0x0, + /* UNUSED 0x5 */ 0x0 , 0x0, 0x0, 0x0, + /* AK_TPDU_type 0x6 */ 0x5, 0xa, 0x0, 0x0, + /* ER_TPDU_type 0x7 */ 0x5, 0x5, 0x0, 0x0, + /* DR_TPDU_type 0x8 */ 0x7, 0x7, 0x7, TP_MAX_DR_DATA, + /* UNUSED 0x9 */ 0x0 , 0x0, 0x0, 0x0, + /* UNUSED 0xa */ 0x0 , 0x0, 0x0, 0x0, + /* UNUSED 0xb */ 
0x0 , 0x0, 0x0, 0x0, + /* DC_TPDU_type 0xc */ 0x6, 0x6, 0x0, 0x0, + /* CC_TPDU_type 0xd */ 0x7, 0x7, 0x7, TP_MAX_CC_DATA, + /* CR_TPDU_type 0xe */ 0x7, 0x7, 0x7, TP_MAX_CR_DATA, + /* DT_TPDU_type 0xf */ 0x5, 0x8, 0x3, 0x0, +}; +/* CHECK: when (Phrase) is true, store Erval in the local "error" and Loc in the local "errlen", bump the Stat counter with IncStat(), and jump to the label Whattodo. It expands to a bare if statement, so the caller must have error and errlen in scope. */ +#define CHECK(Phrase, Erval, Stat, Whattodo, Loc)\ + if (Phrase) {error = (Erval); errlen = (int)(Loc); IncStat(Stat);\ + goto Whattodo; } + +/* + * WHENEVER YOU USE THE FOLLOWING MACRO, + * BE SURE THE TPDUTYPE IS A LEGIT VALUE FIRST! (it indexes tpdu_info[] with no range check) + */ +/* WHILE_OPTIONS opens a block and a for loop that END_WHILE_OPTIONS closes. P starts at hdr + tpdu_info[type][format] and advances by 2 + tpv_len per parameter; PLIM is hdr + tpdu_li + 1. The loop leaves through CHECK (goto respond) if P passes PLIM and breaks when P == PLIM. */ +#define WHILE_OPTIONS(P, hdr, format)\ +{ register caddr_t P = tpdu_info[(hdr)->tpdu_type][(format)] + (caddr_t)hdr;\ + caddr_t PLIM = 1 + hdr->tpdu_li + (caddr_t)hdr;\ + for (;; P += 2 + ((struct tp_vbp *)P)->tpv_len) {\ + CHECK((P > PLIM), E_TP_LENGTH_INVAL, ts_inv_length,\ + respond, P - (caddr_t)hdr);\ + if (P == PLIM) break; + +#define END_WHILE_OPTIONS(P) } } + +/* end groan */ + +/* + * NAME: tp_newsocket() + * + * CALLED FROM: + * tp_input() on incoming CR, when a socket w/ the called suffix + * is awaiting a connection request + * + * FUNCTION and ARGUMENTS: + * Create a new socket structure, attach to it a new transport pcb, + * using a copy of the net level pcb for the parent socket. + * (so) is the parent socket. + * (fname) is the foreign address (all that's used is the nsap portion) + * (cons_channel) is handed to tp_route_to(); (class_to_use) and (netservice) are stored in the new tpcb as tp_class and tp_netservice. + * RETURN VALUE: + * a new socket structure, being this end of the newly formed connection. Returns 0 if sonewconn() fails, or if no mbuf can be had or tp_route_to() fails (the new tpcb is then passed to tp_detach()).
+ * + * SIDE EFFECTS: + * Sets a few things in the tpcb and net level pcb + * + * NOTES: The new tpcb inherits _tp_param, tp_flags, tp_lcredit, tp_l_tpdusize and the local suffix from the parent tpcb, gets a copy of any tp_ucddata, and is left in state TP_LISTENING. + */ +static struct socket * +tp_newsocket(so, fname, cons_channel, class_to_use, netservice) + struct socket *so; + struct sockaddr *fname; + caddr_t cons_channel; + u_char class_to_use; + u_int netservice; +{ + register struct tp_pcb *tpcb = sototpcb(so); /* old tpcb, needed below */ + register struct tp_pcb *newtpcb; + + /* + * sonewconn() gets a new socket structure, + * a new lower layer pcb and a new tpcb, + * but the pcbs are unnamed (not bound) + */ + IFTRACE(D_NEWSOCK) + tptraceTPCB(TPPTmisc, "newsock: listg_so, _tpcb, so_head", + so, tpcb, so->so_head, 0); + ENDTRACE + + if ((so = sonewconn(so, SS_ISCONFIRMING)) == (struct socket *)0) + return so; /* so is 0 here: sonewconn() produced no socket */ + IFTRACE(D_NEWSOCK) + tptraceTPCB(TPPTmisc, "newsock: after newconn so, so_head", + so, so->so_head, 0, 0); + ENDTRACE + /* from here on, so is the NEW socket; tpcb still points at the parent's pcb */ + IFDEBUG(D_NEWSOCK) + printf("tp_newsocket(channel 0x%x) after sonewconn so 0x%x \n", + cons_channel, so); + dump_addr(fname); + { + struct socket *t, *head ; + + head = so->so_head; + t = so; + printf("so 0x%x so_head 0x%x so_q0 0x%x, q0len %d\n", + t, t->so_head, t->so_q0, t->so_q0len); + while( (t=t->so_q0) && t!= so && t!= head) + printf("so 0x%x so_head 0x%x so_q0 0x%x, q0len %d\n", + t, t->so_head, t->so_q0, t->so_q0len); + } + ENDDEBUG + + /* + * before we clobber the old tpcb ptr, get these items from the parent pcb + */ + newtpcb = sototpcb(so); + newtpcb->_tp_param = tpcb->_tp_param; + newtpcb->tp_flags = tpcb->tp_flags; + newtpcb->tp_lcredit = tpcb->tp_lcredit; + newtpcb->tp_l_tpdusize = tpcb->tp_l_tpdusize; + newtpcb->tp_lsuffixlen = tpcb->tp_lsuffixlen; + bcopy( tpcb->tp_lsuffix, newtpcb->tp_lsuffix, newtpcb->tp_lsuffixlen); + + if( /* old */ tpcb->tp_ucddata) { + /* + * These data are the connect- , confirm- or disconnect- data.
+ */ + struct mbuf *conndata; + + conndata = m_copy(tpcb->tp_ucddata, 0, (int)M_COPYALL); + IFDEBUG(D_CONN) + dump_mbuf(conndata, "conndata after mcopy"); + ENDDEBUG + newtpcb->tp_ucddata = conndata; + } + + tpcb = newtpcb; /* tpcb now names the new connection's pcb */ + tpcb->tp_state = TP_LISTENING; + tpcb->tp_class = class_to_use; + tpcb->tp_netservice = netservice; + + + ASSERT( fname != 0 ) ; /* just checking */ + if ( fname ) { + /* + * tp_route_to takes its address argument in the form of an mbuf. + */ + struct mbuf *m; + int err; + + MGET(m, M_DONTWAIT, MT_SONAME); /* mbuf type used is confusing */ + if (m) { + /* + * this seems a bit grotesque, but tp_route_to expects + * an mbuf * instead of simply a sockaddr; it calls the ll + * pcb_connect, which expects the name/addr in an mbuf as well. + * sigh. + */ + bcopy((caddr_t)fname, mtod(m, caddr_t), fname->sa_len); + m->m_len = fname->sa_len; + + /* grot : have to say the kernel can override params in + * the passive open case + */ + tpcb->tp_dont_change_params = 0; + err = tp_route_to( m, tpcb, cons_channel); + m_free(m); + + if (!err) + goto ok; + } /* reached when MGET gave no mbuf or tp_route_to returned nonzero */ + IFDEBUG(D_CONN) + printf("tp_route_to FAILED! detaching tpcb 0x%x, so 0x%x\n", + tpcb, so); + ENDDEBUG + (void) tp_detach(tpcb); + return 0; + } +ok: + IFDEBUG(D_TPINPUT) + printf("tp_newsocket returning so 0x%x, sototpcb(so) 0x%x\n", + so, sototpcb(so)); + ENDDEBUG + return so; +} +/* Placeholder tpcons_output() compiled only when TPCONS is not defined; it always returns 0. */ +#ifndef TPCONS +tpcons_output() +{ + return(0); +} +#endif /* !TPCONS */ + +/* + * NAME: tp_input() + * + * CALLED FROM: + * net layer input routine + * + * FUNCTION and ARGUMENTS: + * Process an incoming TPDU (m), finding the associated tpcb if there + * is one. Create the appropriate type of event and call the driver. + * (faddr) and (laddr) are the foreign and local addresses. + * + * When tp_input() is called we KNOW that the ENTIRE TP HEADER + * has been m_pullup-ed.
+ * + * RETURN VALUE: Nada + * + * SIDE EFFECTS: + * When using COSNS it may affect the state of the net-level pcb + * + * NOTE: + * The initial value of acktime is 2 so that we will never + * have a 0 value for tp_peer_acktime. It gets used in the + * computation of the retransmission timer value, and so it + * mustn't be zero. + * 2 seems like a reasonable minimum. + */ +ProtoHook +tp_input(m, faddr, laddr, cons_channel, dgout_routine, ce_bit) + register struct mbuf *m; + struct sockaddr *faddr, *laddr; /* NSAP addresses */ + caddr_t cons_channel; + int (*dgout_routine)(); + int ce_bit; + +{ + register struct tp_pcb *tpcb; + register struct tpdu *hdr; + struct socket *so; + struct tp_event e; + int error; + unsigned dutype; + u_short dref, sref, acktime, subseq; + u_char preferred_class, class_to_use, pdusize; + u_char opt, dusize, addlopt, version; +#ifdef TP_PERF_MEAS + u_char perf_meas; +#endif /* TP_PERF_MEAS */ + u_char fsufxlen, lsufxlen; + caddr_t fsufxloc, lsufxloc; + int tpdu_len; + u_int takes_data; + u_int fcc_present; + int errlen; + struct tp_conn_param tpp; + int tpcons_output(); + +again: + hdr = mtod(m, struct tpdu *); + tpcb = 0; + error = errlen = tpdu_len = 0; + takes_data = fcc_present = FALSE; + acktime = 2; sref = subseq = 0; + fsufxloc = lsufxloc = NULL; + fsufxlen = lsufxlen = + preferred_class = class_to_use = pdusize = addlopt = 0; + dusize = TP_DFL_TPDUSIZE; +#ifdef TP_PERF_MEAS + GET_CUR_TIME( &e.e_time ); perf_meas = 0; +#endif /* TP_PERF_MEAS */ + + IFDEBUG(D_TPINPUT) + printf("tp_input(0x%x, ... 0x%x)\n", m, cons_channel); + ENDDEBUG + + + /* + * get the actual tpdu length - necessary for monitoring + * and for checksumming + * + * Also, maybe measure the mbuf chain lengths and sizes. 
+ */ + + { register struct mbuf *n=m; +# ifdef ARGO_DEBUG + int chain_length = 0; +# endif ARGO_DEBUG + + for(;;) { + tpdu_len += n->m_len; + IFDEBUG(D_MBUF_MEAS) + if( n->m_flags & M_EXT) { + IncStat(ts_mb_cluster); + } else { + IncStat(ts_mb_small); + } + chain_length ++; + ENDDEBUG + if (n->m_next == MNULL ) { + break; + } + n = n->m_next; + } + IFDEBUG(D_MBUF_MEAS) + if(chain_length > 16) + chain_length = 0; /* zero used for anything > 16 */ + tp_stat.ts_mb_len_distr[chain_length] ++; + ENDDEBUG + } + IFTRACE(D_TPINPUT) + tptraceTPCB(TPPTtpduin, hdr->tpdu_type, hdr, hdr->tpdu_li+1, tpdu_len, + 0); + ENDTRACE + + dref = ntohs((short)hdr->tpdu_dref); + sref = ntohs((short)hdr->tpdu_sref); + dutype = (int)hdr->tpdu_type; + + IFDEBUG(D_TPINPUT) + printf("input: dutype 0x%x cons_channel 0x%x dref 0x%x\n", dutype, + cons_channel, dref); + printf("input: dref 0x%x sref 0x%x\n", dref, sref); + ENDDEBUG + IFTRACE(D_TPINPUT) + tptrace(TPPTmisc, "channel dutype dref ", + cons_channel, dutype, dref, 0); + ENDTRACE + + +#ifdef ARGO_DEBUG + if( (dutype < TP_MIN_TPDUTYPE) || (dutype > TP_MAX_TPDUTYPE)) { + printf("BAD dutype! 
0x%x, channel 0x%x dref 0x%x\n", + dutype, cons_channel, dref); + dump_buf (m, sizeof( struct mbuf )); + + IncStat(ts_inv_dutype); + goto discard; + } +#endif /* ARGO_DEBUG */ + + CHECK( (dutype < TP_MIN_TPDUTYPE || dutype > TP_MAX_TPDUTYPE), + E_TP_INV_TPDU, ts_inv_dutype, respond, + 2 ); + /* unfortunately we can't take the address of the tpdu_type field, + * since it's a bit field - so we just use the constant offset 2 + */ + + /* Now this isn't very neat but since you locate a pcb one way + * at the beginning of connection establishment, and by + * the dref for each tpdu after that, we have to treat CRs differently + */ + if ( dutype == CR_TPDU_type ) { + u_char alt_classes = 0; + + preferred_class = 1 << hdr->tpdu_CRclass; + opt = hdr->tpdu_CRoptions; + + WHILE_OPTIONS(P, hdr, 1 ) /* { */ + + switch( vbptr(P)->tpv_code ) { + + case TPP_tpdu_size: + vb_getval(P, u_char, dusize); + IFDEBUG(D_TPINPUT) + printf("CR dusize 0x%x\n", dusize); + ENDDEBUG + /* COS tests: NBS IA (Dec. 1987) Sec. 
4.5.2.1 */ + if (dusize < TP_MIN_TPDUSIZE || dusize > TP_MAX_TPDUSIZE) + dusize = TP_DFL_TPDUSIZE; + break; + case TPP_ptpdu_size: + switch (vbptr(P)->tpv_len) { + case 1: pdusize = vbval(P, u_char); break; + case 2: pdusize = ntohs(vbval(P, u_short)); break; + default: ; + IFDEBUG(D_TPINPUT) + printf("malformed prefered TPDU option\n"); + ENDDEBUG + } + break; + case TPP_addl_opt: + vb_getval(P, u_char, addlopt); + break; + case TPP_calling_sufx: + /* could use vb_getval, but we want to save the loc & len + * for later use + */ + fsufxloc = (caddr_t) &vbptr(P)->tpv_val; + fsufxlen = vbptr(P)->tpv_len; + IFDEBUG(D_TPINPUT) + printf("CR fsufx:"); + { register int j; + for(j=0; jtpv_val; + lsufxlen = vbptr(P)->tpv_len; + IFDEBUG(D_TPINPUT) + printf("CR lsufx:"); + { register int j; + for(j=0; jtpv_val - (caddr_t)hdr) ); + setversion: + version = vbval(P, u_char); + break; + case TPP_acktime: + vb_getval(P, u_short, acktime); + acktime = ntohs(acktime); + acktime = acktime/500; /* convert to slowtimo ticks */ + if((short)acktime <=0 ) + acktime = 2; /* don't allow a bad peer to screw us up */ + IFDEBUG(D_TPINPUT) + printf("CR acktime 0x%x\n", acktime); + ENDDEBUG + break; + + case TPP_alt_class: + { + u_char *aclass = 0; + register int i; + static u_char bad_alt_classes[5] = + { ~0, ~3, ~5, ~0xf, ~0x1f}; + + aclass = + (u_char *) &(((struct tp_vbp *)P)->tpv_val); + for (i = ((struct tp_vbp *)P)->tpv_len; i>0; i--) { + alt_classes |= (1<<((*aclass++)>>4)); + } + CHECK( (bad_alt_classes[hdr->tpdu_CRclass] & alt_classes), + E_TP_INV_PVAL, ts_inv_aclass, respond, + ((caddr_t)aclass) - (caddr_t)hdr); + IFDEBUG(D_TPINPUT) + printf("alt_classes 0x%x\n", alt_classes); + ENDDEBUG + } + break; + + case TPP_security: + case TPP_residER: + case TPP_priority: + case TPP_transdelay: + case TPP_throughput: + case TPP_addl_info: + case TPP_subseq: + default: + IFDEBUG(D_TPINPUT) + printf("param ignored CR_TPDU code= 0x%x\n", + vbptr(P)->tpv_code); + ENDDEBUG + 
IncStat(ts_param_ignored); + break; + + case TPP_checksum: + IFDEBUG(D_TPINPUT) + printf("CR before cksum\n"); + ENDDEBUG + + CHECK( iso_check_csum(m, tpdu_len), + E_TP_INV_PVAL, ts_bad_csum, discard, 0) + + IFDEBUG(D_TPINPUT) + printf("CR before cksum\n"); + ENDDEBUG + break; + } + + /* } */ END_WHILE_OPTIONS(P) + + if (lsufxlen == 0) { + /* can't look for a tpcb w/o any called sufx */ + error = E_TP_LENGTH_INVAL; + IncStat(ts_inv_sufx); + goto respond; + } else { + register struct tp_pcb *t; + /* + * The intention here is to trap all CR requests + * to a given nsap, for constructing transport + * service bridges at user level; so these + * intercepts should precede the normal listens. + * Phrasing the logic in this way also allows for + * mop-up listeners, which we don't currently implement. + * We also wish to have a single socket be able to + * listen over any network service provider, + * (cons or clns or ip). + */ + for (t = tp_listeners; t ; t = t->tp_nextlisten) + if ((t->tp_lsuffixlen == 0 || + (lsufxlen == t->tp_lsuffixlen && + bcmp(lsufxloc, t->tp_lsuffix, lsufxlen) == 0)) && + ((t->tp_flags & TPF_GENERAL_ADDR) || + (laddr->sa_family == t->tp_domain && + (*t->tp_nlproto->nlp_cmpnetaddr) + (t->tp_npcb, laddr, TP_LOCAL)))) + break; + + CHECK(t == 0, E_TP_NO_SESSION, ts_inv_sufx, respond, + (1 + 2 + (caddr_t)&hdr->_tpduf - (caddr_t)hdr)) + /* _tpduf is the fixed part; add 2 to get the dref bits of + * the fixed part (can't take the address of a bit field) + */ + IFDEBUG(D_TPINPUT) + printf("checking if dup CR\n"); + ENDDEBUG + tpcb = t; + for (t = tpcb->tp_next; t != tpcb; t = t->tp_next) { + if (sref != t->tp_fref) + continue; + if ((*tpcb->tp_nlproto->nlp_cmpnetaddr)( + t->tp_npcb, faddr, TP_FOREIGN)) { + IFDEBUG(D_TPINPUT) + printf("duplicate CR discarded\n"); + ENDDEBUG + goto discard; + } + } + IFTRACE(D_TPINPUT) + tptrace(TPPTmisc, "tp_input: tpcb *lsufxloc tpstate", + tpcb, *lsufxloc, tpcb->tp_state, 0); + ENDTRACE + } + + /* + * WE HAVE A TPCB + * 
already know that the classes in the CR match at least + * one class implemented, but we don't know yet if they + * include any classes permitted by this server. + */ + + IFDEBUG(D_TPINPUT) + printf("HAVE A TPCB 1: 0x%x\n", tpcb); + ENDDEBUG + IFDEBUG(D_CONN) + printf( +"CR: bef CHKS: flags 0x%x class_to_use 0x%x alt 0x%x opt 0x%x tp_class 0x%x\n", + tpcb->tp_flags, class_to_use, alt_classes, opt, tpcb->tp_class); + ENDDEBUG + /* tpcb->tp_class doesn't include any classes not implemented */ + class_to_use = (preferred_class & tpcb->tp_class); + if( (class_to_use = preferred_class & tpcb->tp_class) == 0 ) + class_to_use = alt_classes & tpcb->tp_class; + + class_to_use = 1 << tp_mask_to_num(class_to_use); + + { + tpp = tpcb->_tp_param; + tpp.p_class = class_to_use; + tpp.p_tpdusize = dusize; + tpp.p_ptpdusize = pdusize; + tpp.p_xtd_format = (opt & TPO_XTD_FMT) == TPO_XTD_FMT; + tpp.p_xpd_service = (addlopt & TPAO_USE_TXPD) == TPAO_USE_TXPD; + tpp.p_use_checksum = (tpp.p_class == TP_CLASS_0)?0: + (addlopt & TPAO_NO_CSUM) == 0; + tpp.p_version = version; +#ifdef notdef + tpp.p_use_efc = (opt & TPO_USE_EFC) == TPO_USE_EFC; + tpp.p_use_nxpd = (addlopt & TPAO_USE_NXPD) == TPAO_USE_NXPD; + tpp.p_use_rcc = (addlopt & TPAO_USE_RCC) == TPAO_USE_RCC; +#endif /* notdef */ + + CHECK( + tp_consistency(tpcb, 0 /* not force or strict */, &tpp) != 0, + E_TP_NEGOT_FAILED, ts_negotfailed, clear_parent_tcb, + (1 + 2 + (caddr_t)&hdr->_tpdufr.CRCC - (caddr_t)hdr) + /* ^ more or less the location of class */ + ) + } + IFTRACE(D_CONN) + tptrace(TPPTmisc, + "after 1 consist class_to_use class, out, tpconsout", + class_to_use, + tpcb->tp_class, dgout_routine, tpcons_output + ); + ENDTRACE + CHECK( + ((class_to_use == TP_CLASS_0)&&(dgout_routine != tpcons_output)), + E_TP_NEGOT_FAILED, ts_negotfailed, clear_parent_tcb, + (1 + 2 + (caddr_t)&hdr->_tpdufr.CRCC - (caddr_t)hdr) + /* ^ more or less the location of class */ + ) + IFDEBUG(D_CONN) + printf("CR: after CRCCCHECKS: tpcb 0x%x, flags 
0x%x\n", + tpcb, tpcb->tp_flags); + ENDDEBUG + takes_data = TRUE; + e.ATTR(CR_TPDU).e_cdt = hdr->tpdu_CRcdt; + e.ev_number = CR_TPDU; + + so = tpcb->tp_sock; + if (so->so_options & SO_ACCEPTCONN) { + struct tp_pcb *parent_tpcb = tpcb; + /* + * Create a socket, tpcb, ll pcb, etc. + * for this newborn connection, and fill in all the values. + */ + IFDEBUG(D_CONN) + printf("abt to call tp_newsocket(0x%x, 0x%x, 0x%x, 0x%x)\n", + so, laddr, faddr, cons_channel); + ENDDEBUG + if( (so = + tp_newsocket(so, faddr, cons_channel, + class_to_use, + ((tpcb->tp_netservice == IN_CLNS) ? IN_CLNS : + (dgout_routine == tpcons_output)?ISO_CONS:ISO_CLNS)) + ) == (struct socket *)0 ) { + /* note - even if netservice is IN_CLNS, as far as + * the tp entity is concerned, the only differences + * are CO vs CL + */ + IFDEBUG(D_CONN) + printf("tp_newsocket returns 0\n"); + ENDDEBUG + goto discard; + clear_parent_tcb: + tpcb = 0; + goto respond; + } + tpcb = sototpcb(so); + insque(tpcb, parent_tpcb); + + /* + * Stash the addresses in the net level pcb + * kind of like a pcbconnect() but don't need + * or want all those checks. 
+ */ + (tpcb->tp_nlproto->nlp_putnetaddr)(tpcb->tp_npcb, faddr, TP_FOREIGN); + (tpcb->tp_nlproto->nlp_putnetaddr)(tpcb->tp_npcb, laddr, TP_LOCAL); + + /* stash the f suffix in the new tpcb */ + if (tpcb->tp_fsuffixlen = fsufxlen) { + bcopy(fsufxloc, tpcb->tp_fsuffix, fsufxlen); + (tpcb->tp_nlproto->nlp_putsufx) + (tpcb->tp_npcb, fsufxloc, fsufxlen, TP_FOREIGN); + } + /* stash the l suffix in the new tpcb */ + tpcb->tp_lsuffixlen = lsufxlen; + bcopy(lsufxloc, tpcb->tp_lsuffix, lsufxlen); + (tpcb->tp_nlproto->nlp_putsufx) + (tpcb->tp_npcb, lsufxloc, lsufxlen, TP_LOCAL); +#ifdef TP_PERF_MEAS + if( tpcb->tp_perf_on = perf_meas ) { /* assignment */ + /* ok, let's create an mbuf for stashing the + * statistics if one doesn't already exist + */ + (void) tp_setup_perf(tpcb); + } +#endif /* TP_PERF_MEAS */ + tpcb->tp_fref = sref; + + /* We've already checked for consistency with the options + * set in tpp, but we couldn't set them earlier because + * we didn't want to change options in the LISTENING tpcb. + * Now we set the options in the new socket's tpcb. + */ + (void) tp_consistency( tpcb, TP_FORCE, &tpp); + + if(!tpcb->tp_use_checksum) + IncStat(ts_csum_off); + if(tpcb->tp_xpd_service) + IncStat(ts_use_txpd); + if(tpcb->tp_xtd_format) + IncStat(ts_xtd_fmt); + + tpcb->tp_peer_acktime = acktime; + + /* + * The following kludge is used to test retransmissions and + * timeout during connection establishment. + */ + IFDEBUG(D_ZDREF) + IncStat(ts_zdebug); + /*tpcb->tp_fref = 0;*/ + ENDDEBUG + } + LOCAL_CREDIT(tpcb); + IncStat(ts_CR_rcvd); + if (!tpcb->tp_cebit_off) { + tpcb->tp_win_recv = tp_start_win << 8; + tpcb->tp_cong_sample.cs_size = 0; + CONG_INIT_SAMPLE(tpcb); + CONG_UPDATE_SAMPLE(tpcb, ce_bit); + } + } else if ( dutype == ER_TPDU_type ) { + /* + * ER TPDUs have to be recognized separately + * because they don't necessarily have a tpcb + * with them and we don't want err out looking for such + * a beast. 
+ * We could put a bunch of little kludges in the + * next section of code so it would avoid references to tpcb + * if dutype == ER_TPDU_type but we don't want code for ERs to + * mess up code for data transfer. + */ + IncStat(ts_ER_rcvd); + e.ev_number = ER_TPDU; + e.ATTR(ER_TPDU).e_reason = (u_char)hdr->tpdu_ERreason; + CHECK (((int)dref <= 0 || dref >= tp_refinfo.tpr_size || + (tpcb = tp_ref[dref].tpr_pcb ) == (struct tp_pcb *) 0 || + tpcb->tp_refstate == REF_FREE || + tpcb->tp_refstate == REF_FROZEN), + E_TP_MISM_REFS, ts_inv_dref, discard, 0) + + } else { + /* tpdu type is CC, XPD, XAK, GR, AK, DR, DC, or DT */ + + /* In the next 4 checks, + * _tpduf is the fixed part; add 2 to get the dref bits of + * the fixed part (can't take the address of a bit field) + */ +#ifdef TPCONS + if (cons_channel && dutype == DT_TPDU_type) { + struct isopcb *isop = ((struct isopcb *) + ((struct pklcd *)cons_channel)->lcd_upnext); + if (isop && isop->isop_refcnt == 1 && isop->isop_socket && + (tpcb = sototpcb(isop->isop_socket)) && + (tpcb->tp_class == TP_CLASS_0/* || == CLASS_1 */)) { + IFDEBUG(D_TPINPUT) + printf("tpinput_dt: class 0 short circuit\n"); + ENDDEBUG + dref = tpcb->tp_lref; + sref = tpcb->tp_fref; + CHECK( (tpcb->tp_refstate == REF_FREE), + E_TP_MISM_REFS,ts_inv_dref, nonx_dref, + (1 + 2 + (caddr_t)&hdr->_tpduf - (caddr_t)hdr)) + goto tp0_data; + } + + } +#endif + { + + CHECK( ((int)dref <= 0 || dref >= tp_refinfo.tpr_size) , + E_TP_MISM_REFS,ts_inv_dref, nonx_dref, + (1 + 2 + (caddr_t)&hdr->_tpduf - (caddr_t)hdr)) + CHECK( ((tpcb = tp_ref[dref].tpr_pcb ) == (struct tp_pcb *) 0 ), + E_TP_MISM_REFS,ts_inv_dref, nonx_dref, + (1 + 2 + (caddr_t)&hdr->_tpduf - (caddr_t)hdr)) + CHECK( (tpcb->tp_refstate == REF_FREE), + E_TP_MISM_REFS,ts_inv_dref, nonx_dref, + (1 + 2 + (caddr_t)&hdr->_tpduf - (caddr_t)hdr)) + } + + IFDEBUG(D_TPINPUT) + printf("HAVE A TPCB 2: 0x%x\n", tpcb); + ENDDEBUG + + /* causes a DR to be sent for CC; ER for all else */ + CHECK( (tpcb->tp_refstate == 
REF_FROZEN), + (dutype == CC_TPDU_type?E_TP_NO_SESSION:E_TP_MISM_REFS), + ts_inv_dref, respond, + (1 + 2 + (caddr_t)&hdr->_tpduf - (caddr_t)hdr)) + + IFDEBUG(D_TPINPUT) + printf("state of dref %d ok, tpcb 0x%x\n", dref,tpcb); + ENDDEBUG + /* + * At this point the state of the dref could be + * FROZEN: tpr_pcb == NULL, has ( reference only) timers + * for example, DC may arrive after the close() has detached + * the tpcb (e.g., if user turned off SO_LISTEN option) + * OPENING : a tpcb exists but no timers yet + * OPEN : tpcb exists & timers are outstanding + */ + + if (!tpcb->tp_cebit_off) + CONG_UPDATE_SAMPLE(tpcb, ce_bit); + + dusize = tpcb->tp_tpdusize; + pdusize = tpcb->tp_ptpdusize; + + dutype = hdr->tpdu_type << 8; /* for the switch below */ + + WHILE_OPTIONS(P, hdr, tpcb->tp_xtd_format) /* { */ + +#define caseof(x,y) case (((x)<<8)+(y)) + switch( dutype | vbptr(P)->tpv_code ) { + + caseof( CC_TPDU_type, TPP_addl_opt ): + /* not in class 0; 1 octet */ + vb_getval(P, u_char, addlopt); + break; + caseof( CC_TPDU_type, TPP_tpdu_size ): + { + u_char odusize = dusize; + vb_getval(P, u_char, dusize); + CHECK( (dusize < TP_MIN_TPDUSIZE || + dusize > TP_MAX_TPDUSIZE || dusize > odusize), + E_TP_INV_PVAL, ts_inv_pval, respond, + (1 + (caddr_t)&vbptr(P)->tpv_val - (caddr_t)hdr) ) + IFDEBUG(D_TPINPUT) + printf("CC dusize 0x%x\n", dusize); + ENDDEBUG + } + break; + caseof( CC_TPDU_type, TPP_ptpdu_size ): + { + u_short opdusize = pdusize; + switch (vbptr(P)->tpv_len) { + case 1: pdusize = vbval(P, u_char); break; + case 2: pdusize = ntohs(vbval(P, u_short)); break; + default: ; + IFDEBUG(D_TPINPUT) + printf("malformed prefered TPDU option\n"); + ENDDEBUG + } + CHECK( (pdusize == 0 || + (opdusize && (pdusize > opdusize))), + E_TP_INV_PVAL, ts_inv_pval, respond, + (1 + (caddr_t)&vbptr(P)->tpv_val - (caddr_t)hdr) ) + } + break; + caseof( CC_TPDU_type, TPP_calling_sufx): + IFDEBUG(D_TPINPUT) + printf("CC calling (local) sufxlen 0x%x\n", lsufxlen); + ENDDEBUG + lsufxloc = 
(caddr_t) &vbptr(P)->tpv_val; + lsufxlen = vbptr(P)->tpv_len; + break; + caseof( CC_TPDU_type, TPP_acktime ): + /* class 4 only, 2 octets */ + vb_getval(P, u_short, acktime); + acktime = ntohs(acktime); + acktime = acktime/500; /* convert to slowtimo ticks */ + if( (short)acktime <=0 ) + acktime = 2; + break; + caseof( CC_TPDU_type, TPP_called_sufx): + fsufxloc = (caddr_t) &vbptr(P)->tpv_val; + fsufxlen = vbptr(P)->tpv_len; + IFDEBUG(D_TPINPUT) + printf("CC called (foreign) sufx len %d\n", fsufxlen); + ENDDEBUG + break; + + caseof( CC_TPDU_type, TPP_checksum): + caseof( DR_TPDU_type, TPP_checksum): + caseof( DT_TPDU_type, TPP_checksum): + caseof( XPD_TPDU_type, TPP_checksum): + if( tpcb->tp_use_checksum ) { + CHECK( iso_check_csum(m, tpdu_len), + E_TP_INV_PVAL, ts_bad_csum, discard, 0) + } + break; + + /* this is different from the above because in the context + * of concat/ sep tpdu_len might not be the same as hdr len + */ + caseof( AK_TPDU_type, TPP_checksum): + caseof( XAK_TPDU_type, TPP_checksum): + caseof( DC_TPDU_type, TPP_checksum): + if( tpcb->tp_use_checksum ) { + CHECK( iso_check_csum(m, (int)hdr->tpdu_li + 1), + E_TP_INV_PVAL, ts_bad_csum, discard, 0) + } + break; +#ifdef notdef + caseof( DR_TPDU_type, TPP_addl_info ): + /* ignore - its length and meaning are + * user defined and there's no way + * to pass this info to the user anyway + */ + break; +#endif /* notdef */ + + caseof( AK_TPDU_type, TPP_subseq ): + /* used after reduction of window */ + vb_getval(P, u_short, subseq); + subseq = ntohs(subseq); + IFDEBUG(D_ACKRECV) + printf("AK dref 0x%x Subseq 0x%x\n", dref, subseq); + ENDDEBUG + break; + + caseof( AK_TPDU_type, TPP_flow_cntl_conf ): + { + u_int ylwe; + u_short ysubseq, ycredit; + + fcc_present = TRUE; + vb_getval(P, u_int, ylwe); + vb_getval(P, u_short, ysubseq); + vb_getval(P, u_short, ycredit); + ylwe = ntohl(ylwe); + ysubseq = ntohs(ysubseq); + ycredit = ntohs(ycredit); + IFDEBUG(D_ACKRECV) + printf("%s%x, subseq 0x%x, cdt 0x%x dref 
0x%x\n", + "AK FCC lwe 0x", ylwe, ysubseq, ycredit, dref); + ENDDEBUG + } + break; + + default: + IFDEBUG(D_TPINPUT) + printf("param ignored dutype 0x%x, code 0x%x\n", + dutype, vbptr(P)->tpv_code); + ENDDEBUG + IFTRACE(D_TPINPUT) + tptrace(TPPTmisc, "param ignored dutype code ", + dutype, vbptr(P)->tpv_code ,0,0); + ENDTRACE + IncStat(ts_param_ignored); + break; +#undef caseof + } + /* } */ END_WHILE_OPTIONS(P) + + /* NOTE: the variable dutype has been shifted left! */ + + switch( hdr->tpdu_type ) { + case CC_TPDU_type: + /* If CC comes back with an unacceptable class + * respond with a DR or ER + */ + + opt = hdr->tpdu_CCoptions; /* 1 byte */ + + { + tpp = tpcb->_tp_param; + tpp.p_class = (1<tpdu_CCclass); + tpp.p_tpdusize = dusize; + tpp.p_ptpdusize = pdusize; + tpp.p_dont_change_params = 0; + tpp.p_xtd_format = (opt & TPO_XTD_FMT) == TPO_XTD_FMT; + tpp.p_xpd_service = (addlopt & TPAO_USE_TXPD) == TPAO_USE_TXPD; + tpp.p_use_checksum = (addlopt & TPAO_NO_CSUM) == 0; +#ifdef notdef + tpp.p_use_efc = (opt & TPO_USE_EFC) == TPO_USE_EFC; + tpp.p_use_nxpd = (addlopt & TPAO_USE_NXPD) == TPAO_USE_NXPD; + tpp.p_use_rcc = (addlopt & TPAO_USE_RCC) == TPAO_USE_RCC; +#endif /* notdef */ + + CHECK( + tp_consistency(tpcb, TP_FORCE, &tpp) != 0, + E_TP_NEGOT_FAILED, ts_negotfailed, respond, + (1 + 2 + (caddr_t)&hdr->_tpdufr.CRCC - (caddr_t)hdr) + /* ^ more or less the location of class */ + ) + IFTRACE(D_CONN) + tptrace(TPPTmisc, + "after 1 consist class, out, tpconsout", + tpcb->tp_class, dgout_routine, tpcons_output, 0 + ); + ENDTRACE + CHECK( + ((class_to_use == TP_CLASS_0)&& + (dgout_routine != tpcons_output)), + E_TP_NEGOT_FAILED, ts_negotfailed, respond, + (1 + 2 + (caddr_t)&hdr->_tpdufr.CRCC - (caddr_t)hdr) + /* ^ more or less the location of class */ + ) +#ifdef TPCONS + if (tpcb->tp_netservice == ISO_CONS && + class_to_use == TP_CLASS_0) { + struct isopcb *isop = (struct isopcb *)tpcb->tp_npcb; + struct pklcd *lcp = (struct pklcd *)isop->isop_chan; + lcp->lcd_flags &= 
~X25_DG_CIRCUIT; + } +#endif + } + if( ! tpcb->tp_use_checksum) + IncStat(ts_csum_off); + if(tpcb->tp_xpd_service) + IncStat(ts_use_txpd); + if(tpcb->tp_xtd_format) + IncStat(ts_xtd_fmt); + + IFTRACE(D_CONN) + tptrace(TPPTmisc, "after CC class flags dusize CCclass", + tpcb->tp_class, tpcb->tp_flags, tpcb->tp_tpdusize, + hdr->tpdu_CCclass); + ENDTRACE + + /* if called or calling suffices appeared on the CC, + * they'd better jive with what's in the pcb + */ + if( fsufxlen ) { + CHECK( ((tpcb->tp_fsuffixlen != fsufxlen) || + bcmp(fsufxloc, tpcb->tp_fsuffix, fsufxlen)), + E_TP_INV_PVAL,ts_inv_sufx, respond, + (1+fsufxloc - (caddr_t)hdr)) + } + if( lsufxlen ) { + CHECK( ((tpcb->tp_lsuffixlen != lsufxlen) || + bcmp(lsufxloc, tpcb->tp_lsuffix, lsufxlen)), + E_TP_INV_PVAL,ts_inv_sufx, respond, + (1+lsufxloc - (caddr_t)hdr)) + } + + e.ATTR(CC_TPDU).e_sref = sref; + e.ATTR(CC_TPDU).e_cdt = hdr->tpdu_CCcdt; + takes_data = TRUE; + e.ev_number = CC_TPDU; + IncStat(ts_CC_rcvd); + break; + + case DC_TPDU_type: + if (sref != tpcb->tp_fref) + printf("INPUT: inv sufx DCsref 0x%x, tp_fref 0x%x\n", + sref, tpcb->tp_fref); + + CHECK( (sref != tpcb->tp_fref), + E_TP_MISM_REFS, ts_inv_sufx, discard, + (1 + (caddr_t)&hdr->tpdu_DCsref - (caddr_t)hdr)) + + e.ev_number = DC_TPDU; + IncStat(ts_DC_rcvd); + break; + + case DR_TPDU_type: + IFTRACE(D_TPINPUT) + tptrace(TPPTmisc, "DR recvd", hdr->tpdu_DRreason, 0, 0, 0); + ENDTRACE + if (sref != tpcb->tp_fref) { + printf("INPUT: inv sufx DRsref 0x%x tp_fref 0x%x\n", + sref, tpcb->tp_fref); + } + + CHECK( (sref != 0 && sref != tpcb->tp_fref && + tpcb->tp_state != TP_CRSENT), + (TP_ERROR_SNDC | E_TP_MISM_REFS),ts_inv_sufx, respond, + (1 + (caddr_t)&hdr->tpdu_DRsref - (caddr_t)hdr)) + + e.ATTR(DR_TPDU).e_reason = hdr->tpdu_DRreason; + e.ATTR(DR_TPDU).e_sref = (u_short)sref; + takes_data = TRUE; + e.ev_number = DR_TPDU; + IncStat(ts_DR_rcvd); + break; + + case ER_TPDU_type: + IFTRACE(D_TPINPUT) + tptrace(TPPTmisc, "ER recvd", 
hdr->tpdu_ERreason,0,0,0); + ENDTRACE + e.ev_number = ER_TPDU; + e.ATTR(ER_TPDU).e_reason = hdr->tpdu_ERreason; + IncStat(ts_ER_rcvd); + break; + + case AK_TPDU_type: + + e.ATTR(AK_TPDU).e_subseq = subseq; + e.ATTR(AK_TPDU).e_fcc_present = fcc_present; + + if (tpcb->tp_xtd_format) { +#ifdef BYTE_ORDER + union seq_type seqeotX; + + seqeotX.s_seqeot = ntohl(hdr->tpdu_seqeotX); + e.ATTR(AK_TPDU).e_seq = seqeotX.s_seq; + e.ATTR(AK_TPDU).e_cdt = ntohs(hdr->tpdu_AKcdtX); +#else + e.ATTR(AK_TPDU).e_cdt = hdr->tpdu_AKcdtX; + e.ATTR(AK_TPDU).e_seq = hdr->tpdu_AKseqX; +#endif /* BYTE_ORDER */ + } else { + e.ATTR(AK_TPDU).e_cdt = hdr->tpdu_AKcdt; + e.ATTR(AK_TPDU).e_seq = hdr->tpdu_AKseq; + } + IFTRACE(D_TPINPUT) + tptrace(TPPTmisc, "AK recvd seq cdt subseq fcc_pres", + e.ATTR(AK_TPDU).e_seq, e.ATTR(AK_TPDU).e_cdt, + subseq, fcc_present); + ENDTRACE + + e.ev_number = AK_TPDU; + IncStat(ts_AK_rcvd); + IncPStat(tpcb, tps_AK_rcvd); + break; + + case XAK_TPDU_type: + if (tpcb->tp_xtd_format) { +#ifdef BYTE_ORDER + union seq_type seqeotX; + + seqeotX.s_seqeot = ntohl(hdr->tpdu_seqeotX); + e.ATTR(XAK_TPDU).e_seq = seqeotX.s_seq; +#else + e.ATTR(XAK_TPDU).e_seq = hdr->tpdu_XAKseqX; +#endif /* BYTE_ORDER */ + } else { + e.ATTR(XAK_TPDU).e_seq = hdr->tpdu_XAKseq; + } + e.ev_number = XAK_TPDU; + IncStat(ts_XAK_rcvd); + IncPStat(tpcb, tps_XAK_rcvd); + break; + + case XPD_TPDU_type: + if (tpcb->tp_xtd_format) { +#ifdef BYTE_ORDER + union seq_type seqeotX; + + seqeotX.s_seqeot = ntohl(hdr->tpdu_seqeotX); + e.ATTR(XPD_TPDU).e_seq = seqeotX.s_seq; +#else + e.ATTR(XPD_TPDU).e_seq = hdr->tpdu_XPDseqX; +#endif /* BYTE_ORDER */ + } else { + e.ATTR(XPD_TPDU).e_seq = hdr->tpdu_XPDseq; + } + takes_data = TRUE; + e.ev_number = XPD_TPDU; + IncStat(ts_XPD_rcvd); + IncPStat(tpcb, tps_XPD_rcvd); + break; + + case DT_TPDU_type: + { /* the y option will cause occasional packets to be dropped. + * A little crude but it works. 
+ */ + + IFDEBUG(D_DROP) + if(time.tv_usec & 0x4 && hdr->tpdu_DTseq & 0x1) { + IncStat(ts_ydebug); + goto discard; + } + ENDDEBUG + } + if (tpcb->tp_class == TP_CLASS_0) { + tp0_data: + e.ATTR(DT_TPDU).e_seq = 0; /* actually don't care */ + e.ATTR(DT_TPDU).e_eot = (((struct tp0du *)hdr)->tp0du_eot); + } else if (tpcb->tp_xtd_format) { +#ifdef BYTE_ORDER + union seq_type seqeotX; + + seqeotX.s_seqeot = ntohl(hdr->tpdu_seqeotX); + e.ATTR(DT_TPDU).e_seq = seqeotX.s_seq; + e.ATTR(DT_TPDU).e_eot = seqeotX.s_eot; +#else + e.ATTR(DT_TPDU).e_seq = hdr->tpdu_DTseqX; + e.ATTR(DT_TPDU).e_eot = hdr->tpdu_DTeotX; +#endif /* BYTE_ORDER */ + } else { + e.ATTR(DT_TPDU).e_seq = hdr->tpdu_DTseq; + e.ATTR(DT_TPDU).e_eot = hdr->tpdu_DTeot; + } + if(e.ATTR(DT_TPDU).e_eot) + IncStat(ts_eot_input); + takes_data = TRUE; + e.ev_number = DT_TPDU; + IncStat(ts_DT_rcvd); + IncPStat(tpcb, tps_DT_rcvd); + break; + + case GR_TPDU_type: + tp_indicate(T_DISCONNECT, tpcb, ECONNABORTED); + /* drop through */ + default: + /* this should NEVER happen because there is a + * check for dutype well above here + */ + error = E_TP_INV_TPDU; /* causes an ER */ + IFDEBUG(D_TPINPUT) + printf("INVALID dutype 0x%x\n", hdr->tpdu_type); + ENDDEBUG + IncStat(ts_inv_dutype); + goto respond; + } + } + /* peel off the tp header; + * remember that the du_li doesn't count itself. + * This may leave us w/ an empty mbuf at the front of a chain. 
+ * We can't just throw away the empty mbuf because hdr still points + * into the mbuf's data area and we're still using hdr (the tpdu header) + */ + m->m_len -= ((int)hdr->tpdu_li + 1); + m->m_data += ((int)hdr->tpdu_li + 1); + + if (takes_data) { + int max = tpdu_info[ hdr->tpdu_type ] [TP_MAX_DATA_INDEX]; + int datalen = tpdu_len - hdr->tpdu_li - 1, mbtype = MT_DATA; + struct { + struct tp_disc_reason dr; + struct cmsghdr x_hdr; + } x; +#define c_hdr x.x_hdr + register struct mbuf *n; + + CHECK( (max && datalen > max), E_TP_LENGTH_INVAL, + ts_inv_length, respond, (max + hdr->tpdu_li + 1) ); + switch( hdr->tpdu_type ) { + + case CR_TPDU_type: + c_hdr.cmsg_type = TPOPT_CONN_DATA; + goto make_control_msg; + + case CC_TPDU_type: + c_hdr.cmsg_type = TPOPT_CFRM_DATA; + goto make_control_msg; + + case DR_TPDU_type: + x.dr.dr_hdr.cmsg_len = sizeof(x) - sizeof(c_hdr); + x.dr.dr_hdr.cmsg_type = TPOPT_DISC_REASON; + x.dr.dr_hdr.cmsg_level = SOL_TRANSPORT; + x.dr.dr_reason = hdr->tpdu_DRreason; + c_hdr.cmsg_type = TPOPT_DISC_DATA; + make_control_msg: + datalen += sizeof(c_hdr); + c_hdr.cmsg_len = datalen; + c_hdr.cmsg_level = SOL_TRANSPORT; + mbtype = MT_CONTROL; + MGET(n, M_DONTWAIT, MT_DATA); + if (n == 0) + {m_freem(m); m = 0; datalen = 0; goto invoke; } + if (hdr->tpdu_type == DR_TPDU_type) { + datalen += sizeof(x) - sizeof(c_hdr); + bcopy((caddr_t)&x, mtod(n, caddr_t), n->m_len = sizeof(x)); + } else + bcopy((caddr_t)&c_hdr, mtod(n, caddr_t), + n->m_len = sizeof(c_hdr)); + n->m_next = m; + m = n; + /* FALLTHROUGH */ + + case XPD_TPDU_type: + if (mbtype != MT_CONTROL) + mbtype = MT_OOBDATA; + m->m_flags |= M_EOR; + /* FALLTHROUGH */ + + case DT_TPDU_type: + for (n = m; n; n = n->m_next) { + MCHTYPE(n, mbtype); + } + invoke: + e.ATTR(DT_TPDU).e_datalen = datalen; + e.ATTR(DT_TPDU).e_data = m; + break; + + default: + printf( + "ERROR in tp_input! 
hdr->tpdu_type 0x%x takes_data 0x%x m 0x%x\n", + hdr->tpdu_type, takes_data, m); + break; + } + /* prevent m_freem() after tp_driver() from throwing it all away */ + m = MNULL; + } + + IncStat(ts_tpdu_rcvd); + + IFDEBUG(D_TPINPUT) + printf( "tp_input: before driver, state 0x%x event 0x%x m 0x%x", + tpcb->tp_state, e.ev_number, m ); + printf(" e.e_data 0x%x\n", e.ATTR(DT_TPDU).e_data); + printf("takes_data 0x%x m_len 0x%x, tpdu_len 0x%x\n", + takes_data, (m==MNULL)?0:m->m_len, tpdu_len); + ENDDEBUG + + error = tp_driver(tpcb, &e); + + ASSERT(tpcb != (struct tp_pcb *)0); + ASSERT(tpcb->tp_sock != (struct socket *)0); + if( tpcb->tp_sock->so_error == 0 ) + tpcb->tp_sock->so_error = error; + + /* Kludge to keep the state tables under control (adding + * data on connect & disconnect & freeing the mbuf containing + * the data would have exploded the tables and made a big mess ). + */ + switch(e.ev_number) { + case CC_TPDU: + case DR_TPDU: + case CR_TPDU: + m = e.ATTR(CC_TPDU).e_data; /* same field for all three dutypes */ + IFDEBUG(D_TPINPUT) + printf("after driver, restoring m to 0x%x, takes_data 0x%x\n", + m, takes_data); + ENDDEBUG + break; + default: + break; + } + /* Concatenated sequences are terminated by any tpdu that + * carries data: CR, CC, DT, XPD, DR. + * All other tpdu types may be concatenated: AK, XAK, DC, ER. + */ + +separate: + if ( takes_data == 0 ) { + ASSERT( m != MNULL ); + /* + * we already peeled off the prev. 
tp header so + * we can just pull up some more and repeat + */ + + if( m = tp_inputprep(m) ) { + IFDEBUG(D_TPINPUT) + hdr = mtod(m, struct tpdu *); + printf("tp_input @ separate: hdr 0x%x size %d m 0x%x\n", + hdr, (int) hdr->tpdu_li + 1, m); + dump_mbuf(m, "tp_input after driver, at separate"); + ENDDEBUG + + IncStat(ts_concat_rcvd); + goto again; + } + } + if ( m != MNULL ) { + IFDEBUG(D_TPINPUT) + printf("tp_input : m_freem(0x%x)\n", m); + ENDDEBUG + m_freem(m); + IFDEBUG(D_TPINPUT) + printf("tp_input : after m_freem 0x%x\n", m); + ENDDEBUG + } + return (ProtoHook) tpcb; + +discard: + /* class 4: drop the tpdu */ + /* class 2,0: Should drop the net connection, if you can figure out + * to which connection it applies + */ + IFDEBUG(D_TPINPUT) + printf("tp_input DISCARD\n"); + ENDDEBUG + IFTRACE(D_TPINPUT) + tptrace(TPPTmisc, "tp_input DISCARD m", m,0,0,0); + ENDTRACE + m_freem(m); + IncStat(ts_recv_drop); + return (ProtoHook)0; + +nonx_dref: + switch (dutype) { + default: + goto discard; + case CC_TPDU_type: + /* error = E_TP_MISM_REFS; */ + break; + case DR_TPDU_type: + error |= TP_ERROR_SNDC; + } +respond: + IFDEBUG(D_TPINPUT) + printf("RESPOND: error 0x%x, errlen 0x%x\n", error, errlen); + ENDDEBUG + IFTRACE(D_TPINPUT) + tptrace(TPPTmisc, "tp_input RESPOND m error sref", m, error, sref, 0); + ENDTRACE + if (sref == 0) + goto discard; + (void) tp_error_emit(error, (u_long)sref, (struct sockaddr_iso *)faddr, + (struct sockaddr_iso *)laddr, m, errlen, tpcb, + cons_channel, dgout_routine); + IFDEBUG(D_ERROR_EMIT) + printf("tp_input after error_emit\n"); + ENDDEBUG + +#ifdef lint + printf("",sref,opt); +#endif /* lint */ + IncStat(ts_recv_drop); + return (ProtoHook)0; +} + + +/* + * NAME: tp_headersize() + * + * CALLED FROM: + * tp_emit() and tp_sbsend() + * TP needs to know the header size so it can figure out how + * much data to put in each tpdu. 
+ * + * FUNCTION, ARGUMENTS, and RETURN VALUE: + * For a given connection, represented by (tpcb), and + * tpdu type (dutype), return the size of a tp header. + * + * RETURNS: the expected size of the header in bytes + * + * SIDE EFFECTS: + * + * NOTES: It would be nice if it got the network header size as well. + */ +int +tp_headersize(dutype, tpcb) + int dutype; + struct tp_pcb *tpcb; +{ + register int size = 0; + + IFTRACE(D_CONN) + tptrace(TPPTmisc, "tp_headersize dutype class xtd_format", + dutype, tpcb->tp_class, tpcb->tp_xtd_format, 0); + ENDTRACE + if( !( (tpcb->tp_class == TP_CLASS_0) || + (tpcb->tp_class == TP_CLASS_4) || + (dutype == DR_TPDU_type) || + (dutype == CR_TPDU_type) )) { + printf("tp_headersize:dutype 0x%x, class 0x%x", + dutype, tpcb->tp_class); + /* TODO: identify this and GET RID OF IT */ + } + ASSERT( (tpcb->tp_class == TP_CLASS_0) || + (tpcb->tp_class == TP_CLASS_4) || + (dutype == DR_TPDU_type) || + (dutype == CR_TPDU_type) ); + + if( tpcb->tp_class == TP_CLASS_0 ) { + size = tpdu_info[ dutype ] [TP_LEN_CLASS_0_INDEX]; + } else { + size = tpdu_info[ dutype ] [tpcb->tp_xtd_format]; + } + return size; + /* caller must get network level header size separately */ +} diff --git a/sys/netiso/tp_ip.h b/sys/netiso/tp_ip.h new file mode 100644 index 00000000000..f2777676e13 --- /dev/null +++ b/sys/netiso/tp_ip.h @@ -0,0 +1,91 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tp_ip.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. 
+ +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * ARGO TP + * + * $Header: tp_ip.h,v 5.1 88/10/12 12:19:47 root Exp $ + * $Source: /usr/argo/sys/netiso/RCS/tp_ip.h,v $ + * + * internet IP-dependent structures and include files + * + */ + + +#ifndef __TP_IP__ +#define __TP_IP__ + +#ifndef SOCK_STREAM +#include +#endif + +#include +#include +#include +#include +#include +#include + + +struct inpcb tp_inpcb; + /* queue of active inpcbs for tp ; for tp with dod ip */ + +#endif /* __TP_IP__ */ diff --git a/sys/netiso/tp_iso.c b/sys/netiso/tp_iso.c new file mode 100644 index 00000000000..1cf67f86648 --- /dev/null +++ b/sys/netiso/tp_iso.c @@ -0,0 +1,693 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tp_iso.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. 
+ +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * ARGO TP + * $Header: /var/src/sys/netiso/RCS/tp_iso.c,v 5.1 89/02/09 16:20:51 hagens Exp $ + * $Source: /var/src/sys/netiso/RCS/tp_iso.c,v $ + * + * Here is where you find the iso-dependent code. We've tried to + * keep all net-level and (primarily) address-family-dependent stuff + * out of the tp source, and everything here is reached indirectly + * through a switch table (struct nl_protosw *) tpcb->tp_nlproto + * (see tp_pcb.c). + * The routines here are: + * iso_getsufx: gets transport suffix out of an isopcb structure. + * iso_putsufx: put transport suffix into an isopcb structure. + * iso_putnetaddr: put a whole net addr into an isopcb. + * iso_getnetaddr: get a whole net addr from an isopcb. + * iso_cmpnetaddr: compare a whole net addr from an isopcb.
+ * iso_recycle_tsuffix: clear suffix for reuse in isopcb + * tpclnp_ctlinput: handle ER CLNPDU : icmp-like stuff + * tpclnp_mtu: figure out what size tpdu to use + * tpclnp_input: take a pkt from clnp, strip off its clnp header, + * give to tp + * tpclnp_output_dg: package a pkt for clnp given 2 addresses & some data + * tpclnp_output: package a pkt for clnp given an isopcb & some data + */ + +#ifdef ISO + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * CALLED FROM: + * pr_usrreq() on PRU_BIND, PRU_CONNECT, PRU_ACCEPT, and PRU_PEERADDR + * FUNCTION, ARGUMENTS: + * The argument (which) takes the value TP_LOCAL or TP_FOREIGN. + */ + +iso_getsufx(isop, lenp, data_out, which) + struct isopcb *isop; + u_short *lenp; + caddr_t data_out; + int which; +{ + register struct sockaddr_iso *addr = 0; + + switch (which) { + case TP_LOCAL: + addr = isop->isop_laddr; + break; + + case TP_FOREIGN: + addr = isop->isop_faddr; + } + if (addr) + bcopy(TSEL(addr), data_out, (*lenp = addr->siso_tlen)); +} + +/* CALLED FROM: + * tp_newsocket(); i.e., when a connection is being established by an + * incoming CR_TPDU. + * + * FUNCTION, ARGUMENTS: + * Put a transport suffix (found in name) into an isopcb structure (isop). + * The argument (which) takes the value TP_LOCAL or TP_FOREIGN.
+ */ +void +iso_putsufx(isop, sufxloc, sufxlen, which) + struct isopcb *isop; + caddr_t sufxloc; + int sufxlen, which; +{ + struct sockaddr_iso **dst, *backup; + register struct sockaddr_iso *addr; + struct mbuf *m; + int len; + + switch (which) { + default: + return; + + case TP_LOCAL: + dst = &isop->isop_laddr; + backup = &isop->isop_sladdr; + break; + + case TP_FOREIGN: + dst = &isop->isop_faddr; + backup = &isop->isop_sfaddr; + } + /* no address attached yet: fall back to the sockaddr embedded in the pcb */ + if ((addr = *dst) == 0) { + addr = *dst = backup; + addr->siso_nlen = 0; + addr->siso_slen = 0; + addr->siso_plen = 0; + printf("iso_putsufx on un-initialized isopcb\n"); + } + len = sufxlen + addr->siso_nlen + + (sizeof(*addr) - sizeof(addr->siso_data)); + if (addr == backup) { + /* suffix does not fit in the embedded sockaddr: move the address to an mbuf */ + if (len > sizeof(*addr)) { + m = m_getclr(M_DONTWAIT, MT_SONAME); + if (m == 0) + return; + addr = *dst = mtod(m, struct sockaddr_iso *); + *addr = *backup; + m->m_len = len; + } + } + bcopy(sufxloc, TSEL(addr), sufxlen); + addr->siso_tlen = sufxlen; + addr->siso_len = len; +} + +/* + * CALLED FROM: + * tp.trans whenever we go into REFWAIT state. + * FUNCTION and ARGUMENT: + * Called when a ref is frozen, to allow the suffix to be reused. + * (isop) is the net level pcb. This really shouldn't have to be + * done in a NET level pcb but... for the internet world that's just + * the way it is done in BSD... + * The alternative is to have the port unusable until the reference + * timer goes off. + * Either address pointer may still be unset (the put/cmp routines in + * this file allow for that), so each one is checked before its + * suffix length is cleared. + */ +void +iso_recycle_tsuffix(isop) + struct isopcb *isop; +{ + if (isop->isop_laddr) + isop->isop_laddr->siso_tlen = 0; + if (isop->isop_faddr) + isop->isop_faddr->siso_tlen = 0; +} + +/* + * CALLED FROM: + * tp_newsocket(); i.e., when a connection is being established by an + * incoming CR_TPDU. + * + * FUNCTION and ARGUMENTS: + * Copy a whole net addr from a struct sockaddr (name) + * into an isopcb (isop).
+ * The argument (which) takes values TP_LOCAL or TP_FOREIGN + */ +void +iso_putnetaddr(isop, name, which) + register struct isopcb *isop; + struct sockaddr_iso *name; + int which; +{ + struct sockaddr_iso **sisop, *backup; + register struct sockaddr_iso *siso; + + switch (which) { + default: + printf("iso_putnetaddr: should panic\n"); + return; + case TP_LOCAL: + sisop = &isop->isop_laddr; + backup = &isop->isop_sladdr; + break; + case TP_FOREIGN: + sisop = &isop->isop_faddr; + backup = &isop->isop_sfaddr; + } + /* use the sockaddr embedded in the pcb when none is attached yet */ + siso = ((*sisop == 0) ? (*sisop = backup) : *sisop); + IFDEBUG(D_TPISO) + printf("ISO_PUTNETADDR\n"); + /* dump the slot selected by (which); isop_faddr may be unset for TP_LOCAL */ + dump_isoaddr(siso); + ENDDEBUG + siso->siso_addr = name->siso_addr; +} + +/* + * CALLED FROM: + * tp_input() when a connection is being established by an + * incoming CR_TPDU, and considered for interception. + * + * FUNCTION and ARGUMENTS: + * compare a whole net addr from a struct sockaddr (name), + * with that implicitly stored in an isopcb (isop). + * The argument (which) takes values TP_LOCAL or TP_FOREIGN. + */ +iso_cmpnetaddr(isop, name, which) + register struct isopcb *isop; + register struct sockaddr_iso *name; + int which; +{ + struct sockaddr_iso **sisop, *backup; + register struct sockaddr_iso *siso; + + switch (which) { + default: + printf("iso_cmpnetaddr: should panic\n"); + return 0; + case TP_LOCAL: + sisop = &isop->isop_laddr; + backup = &isop->isop_sladdr; + break; + case TP_FOREIGN: + sisop = &isop->isop_faddr; + backup = &isop->isop_sfaddr; + } + siso = ((*sisop == 0) ?
(*sisop = backup) : *sisop); + IFDEBUG(D_TPISO) + printf("ISO_CMPNETADDR\n"); + dump_isoaddr(siso); + ENDDEBUG + if (name->siso_tlen && bcmp(TSEL(name), TSEL(siso), name->siso_tlen)) + return (0); + return (bcmp((caddr_t)name->siso_data, + (caddr_t)siso->siso_data, name->siso_nlen) == 0); +} + +/* + * CALLED FROM: + * pr_usrreq() PRU_SOCKADDR, PRU_ACCEPT, PRU_PEERADDR + * FUNCTION and ARGUMENTS: + * Copy a whole net addr from an isopcb (isop) into + * a struct sockaddr (name). + * The argument (which) takes values TP_LOCAL or TP_FOREIGN. + * When the selected address is unset, (name) is returned with length 0. + */ + +void +iso_getnetaddr( isop, name, which) + struct isopcb *isop; + struct mbuf *name; + int which; +{ + struct sockaddr_iso *siso = + (which == TP_LOCAL ? isop->isop_laddr : isop->isop_faddr); + if (siso) + bcopy((caddr_t)siso, mtod(name, caddr_t), + (unsigned)(name->m_len = siso->siso_len)); + else + name->m_len = 0; +} +/* + * NAME: tpclnp_mtu() + * + * CALLED FROM: + * tp_route_to() on incoming CR, CC, and pr_usrreq() for PRU_CONNECT + * + * FUNCTION, ARGUMENTS, and RETURN VALUE: + * + * Perform subnetwork dependent part of determining MTU information. + * It appears that setting a double pointer to the rtentry associated with + * the destination, and returning the header size for the network protocol + * suffices. + * + * RETURNS: 0 when the network service is ISO_CONS, otherwise the size of + * the fixed and segmentation parts of a clnp header plus two iso addresses. + * + * SIDE EFFECTS: + * Sets tp_routep pointer in pcb. + * + * NOTES: + */ +tpclnp_mtu(tpcb) +register struct tp_pcb *tpcb; +{ + struct isopcb *isop = (struct isopcb *)tpcb->tp_npcb; + + IFDEBUG(D_CONN) + /* the format needs a conversion for the tpcb argument it is given */ + printf("tpclnp_mtu(tpcb 0x%x)\n", tpcb); + ENDDEBUG + tpcb->tp_routep = &(isop->isop_route.ro_rt); + if (tpcb->tp_netservice == ISO_CONS) + return 0; + else + return (sizeof(struct clnp_fixed) + sizeof(struct clnp_segment) + + 2 * sizeof(struct iso_addr)); + +} + +/* + * CALLED FROM: + * tp_emit() + * FUNCTION and ARGUMENTS: + * Take a packet(m0) from tp and package it so that clnp will accept it. + * This means prepending space for the clnp header and filling in a few + * of the fields.
+ * isop is the isopcb structure; datalen is the length of the data in the + * mbuf string m0. + * RETURN VALUE: + * whatever (E*) is returned from the net layer output routine. + */ + +int +tpclnp_output(isop, m0, datalen, nochksum) + struct isopcb *isop; + struct mbuf *m0; + int datalen; + int nochksum; +{ + IncStat(ts_tpdu_sent); + + IFDEBUG(D_TPISO) + struct tpdu *hdr = mtod(m0, struct tpdu *); + + printf( +"abt to call clnp_output: datalen 0x%x, hdr.li 0x%x, hdr.dutype 0x%x nocsum x%x dst addr:\n", + datalen, + (int)hdr->tpdu_li, (int)hdr->tpdu_type, nochksum); + dump_isoaddr(isop->isop_faddr); + printf("\nsrc addr:\n"); + dump_isoaddr(isop->isop_laddr); + dump_mbuf(m0, "at tpclnp_output"); + ENDDEBUG + + return + clnp_output(m0, isop, datalen, /* flags */nochksum ? CLNP_NO_CKSUM : 0); +} + +/* + * CALLED FROM: + * tp_error_emit() + * FUNCTION and ARGUMENTS: + * This is a copy of tpclnp_output that takes the addresses + * instead of a pcb. It's used by the tp_error_emit, when we + * don't have an iso_pcb with which to call the normal output rtn. + * (ro) is accepted for interface compatibility and is not used here. + * RETURN VALUE: + * ENOBUFS or + * whatever (E*) is returned from the net layer output routine. + */ + +int +tpclnp_output_dg(laddr, faddr, m0, datalen, ro, nochksum) + struct iso_addr *laddr, *faddr; + struct mbuf *m0; + int datalen; + struct route *ro; + int nochksum; +{ + struct isopcb tmppcb; + int err; + int flags; + + IFDEBUG(D_TPISO) + printf("tpclnp_output_dg datalen 0x%x m0 0x%x\n", datalen, m0); + ENDDEBUG + + /* + * Fill in minimal portion of isopcb so that clnp can send the + * packet.
+ */ + bzero((caddr_t)&tmppcb, sizeof(tmppcb)); + tmppcb.isop_laddr = &tmppcb.isop_sladdr; + tmppcb.isop_laddr->siso_addr = *laddr; + tmppcb.isop_faddr = &tmppcb.isop_sfaddr; + tmppcb.isop_faddr->siso_addr = *faddr; + + IFDEBUG(D_TPISO) + printf("tpclnp_output_dg faddr: \n"); + dump_isoaddr(&tmppcb.isop_sfaddr); + printf("\ntpclnp_output_dg laddr: \n"); + dump_isoaddr(&tmppcb.isop_sladdr); + printf("\n"); + ENDDEBUG + + /* + * Do not use packet cache since this is a one shot error packet + */ + flags = (CLNP_NOCACHE|(nochksum?CLNP_NO_CKSUM:0)); + + IncStat(ts_tpdu_sent); + + err = clnp_output(m0, &tmppcb, datalen, flags); + + /* + * Free route allocated by clnp (if the route was indeed allocated) + */ + if (tmppcb.isop_route.ro_rt) + RTFREE(tmppcb.isop_route.ro_rt); + + return(err); +} +/* + * CALLED FROM: + * clnp's input routine, indirectly through the protosw. + * FUNCTION and ARGUMENTS: + * Take a packet (m) from clnp, strip off the clnp header and give it to tp + * (tp_input, or cltp_input for a UD tpdu). + * (clnp_len) is the length of the clnp header to strip; (ce_bit) is + * passed through to the tp input routine. + * RETURN VALUE: + * 0, except that a packet handed to tuba_tcpinput() or idrp_input() + * returns whatever that routine returns. + */ +ProtoHook +tpclnp_input(m, src, dst, clnp_len, ce_bit) + register struct mbuf *m; + struct sockaddr_iso *src, *dst; + int clnp_len, ce_bit; +{ + struct mbuf *tp_inputprep(); + int tp_input(), cltp_input(), (*input)() = tp_input; + + IncStat(ts_pkt_rcvd); + + IFDEBUG(D_TPINPUT) + printf("tpclnp_input: m 0x%x clnp_len 0x%x\n", m, clnp_len); + dump_mbuf(m, "at tpclnp_input"); + ENDDEBUG + /* + * CLNP gives us an mbuf chain WITH the clnp header pulled up, + * and the length of the clnp header. + * First, strip off the Clnp header. leave the mbuf there for the + * pullup that follows.
+ */ + m->m_len -= clnp_len; + m->m_data += clnp_len; + m->m_pkthdr.len -= clnp_len; + /* XXXX: should probably be in clnp_input */ + switch (dst->siso_data[dst->siso_nlen - 1]) { +#ifdef TUBA + case ISOPROTO_TCP: + return (tuba_tcpinput(m, src, dst)); +#endif + case 0: + if (m->m_len == 0 && (m = m_pullup(m, 1)) == 0) + return 0; + if (*(mtod(m, u_char *)) == ISO10747_IDRP) + return (idrp_input(m, src, dst)); + } + m = tp_inputprep(m); + if (m == 0) + return 0; + /* the second octet of the tpdu selects the connectionless input routine */ + if (mtod(m, u_char *)[1] == UD_TPDU_type) + input = cltp_input; + + IFDEBUG(D_TPINPUT) + dump_mbuf(m, "after tpclnp_input both pullups"); + ENDDEBUG + + IFDEBUG(D_TPISO) + printf("calling %sinput : src 0x%x, dst 0x%x, src addr:\n", + (input == tp_input ? "tp_" : "cltp_"), src, dst); + dump_isoaddr(src); + printf(" dst addr:\n"); + dump_isoaddr(dst); + ENDDEBUG + + (void) (*input)(m, (struct sockaddr *)src, (struct sockaddr *)dst, + 0, tpclnp_output_dg, ce_bit); + + IFDEBUG(D_QUENCH) + { + if(time.tv_usec & 0x4 && time.tv_usec & 0x40) { + printf("tpclnp_input: FAKING %s\n", + tp_stat.ts_pkt_rcvd & 0x1?"QUENCH":"QUENCH2"); + /* src is already a (struct sockaddr_iso *), which is what tpclnp_ctlinput takes */ + if(tp_stat.ts_pkt_rcvd & 0x1) { + tpclnp_ctlinput(PRC_QUENCH, src); + } else { + tpclnp_ctlinput(PRC_QUENCH2, src); + } + } + } + ENDDEBUG + + return 0; +} + +/* + * Notification routine that does nothing; tpclnp_ctlinput() hands it to + * iso_pcbnotify() for the unreachable/down events. + */ +ProtoHook +iso_rtchange() +{ + return 0; +} + +/* + * CALLED FROM: + * tpclnp_ctlinput() + * FUNCTION and ARGUMENTS: + * find the tpcb pointer and pass it to tp_quench + */ +void +tpiso_decbit(isop) + struct isopcb *isop; +{ + tp_quench((struct tp_pcb *)isop->isop_socket->so_pcb, PRC_QUENCH2); +} +/* + * CALLED FROM: + * tpclnp_ctlinput() + * FUNCTION and ARGUMENTS: + * find the tpcb pointer and pass it to tp_quench + */ +void +tpiso_quench(isop) + struct isopcb *isop; +{ + tp_quench((struct tp_pcb *)isop->isop_socket->so_pcb, PRC_QUENCH); +} + +/* + * CALLED FROM: + * The network layer through the protosw table. + * FUNCTION and ARGUMENTS: + * When clnp receives an ICMP-like msg this gets called.
+ * It either returns an error status to the user or + * it causes all connections on this address to be aborted + * by calling the appropriate xx_notify() routine. + * (cmd) is the type of ICMP error. + * (siso) is the address of the guy who sent the ER CLNPDU + */ +ProtoHook +tpclnp_ctlinput(cmd, siso) + int cmd; + struct sockaddr_iso *siso; +{ + extern u_char inetctlerrmap[]; + extern ProtoHook tpiso_abort(); + extern ProtoHook iso_rtchange(); + extern ProtoHook tpiso_reset(); + void iso_pcbnotify(); + + IFDEBUG(D_TPINPUT) + printf("tpclnp_ctlinput1: cmd 0x%x addr: \n", cmd); + dump_isoaddr(siso); + ENDDEBUG + + if (cmd < 0 || cmd > PRC_NCMDS) + return 0; + if (siso->siso_family != AF_ISO) + return 0; + switch (cmd) { + + case PRC_QUENCH2: + iso_pcbnotify(&tp_isopcb, siso, 0, (int (*)())tpiso_decbit); + break; + + case PRC_QUENCH: + iso_pcbnotify(&tp_isopcb, siso, 0, (int (*)())tpiso_quench); + break; + + case PRC_TIMXCEED_REASS: + case PRC_ROUTEDEAD: + iso_pcbnotify(&tp_isopcb, siso, 0, tpiso_reset); + break; + + case PRC_HOSTUNREACH: + case PRC_UNREACH_NET: + case PRC_IFDOWN: + case PRC_HOSTDEAD: + iso_pcbnotify(&tp_isopcb, siso, + (int)inetctlerrmap[cmd], iso_rtchange); + break; + + default: + /* + case PRC_MSGSIZE: + case PRC_UNREACH_HOST: + case PRC_UNREACH_PROTOCOL: + case PRC_UNREACH_PORT: + case PRC_UNREACH_NEEDFRAG: + case PRC_UNREACH_SRCFAIL: + case PRC_REDIRECT_NET: + case PRC_REDIRECT_HOST: + case PRC_REDIRECT_TOSNET: + case PRC_REDIRECT_TOSHOST: + case PRC_TIMXCEED_INTRANS: + case PRC_PARAMPROB: + */ + iso_pcbnotify(&tp_isopcb, siso, (int)inetctlerrmap[cmd], tpiso_abort); + break; + } + return 0; +} +/* + * XXX - Variant which is called by clnp_er.c with an isoaddr rather + * than a sockaddr_iso. 
+ */ + +static struct sockaddr_iso siso = {sizeof(siso), AF_ISO}; +tpclnp_ctlinput1(cmd, isoa) + int cmd; + struct iso_addr *isoa; +{ + bzero((caddr_t)&siso.siso_addr, sizeof(siso.siso_addr)); + bcopy((caddr_t)isoa, (caddr_t)&siso.siso_addr, isoa->isoa_len); + tpclnp_ctlinput(cmd, &siso); +} + +/* + * These next 2 routines are + * CALLED FROM: + * xxx_notify() from tp_ctlinput() when + * net level gets some ICMP-equiv. type event. + * FUNCTION and ARGUMENTS: + * Cause the connection to be aborted with some sort of error + * reason indicating that the network layer caused the abort. + * Fakes an ER TPDU so we can go through the driver. + * abort always aborts the TP connection. + * reset may or may not, depending on the TP class that's in use. + */ +ProtoHook +tpiso_abort(isop) + struct isopcb *isop; +{ + struct tp_event e; + + IFDEBUG(D_CONN) + printf("tpiso_abort 0x%x\n", isop); + ENDDEBUG + e.ev_number = ER_TPDU; + e.ATTR(ER_TPDU).e_reason = ECONNABORTED; + return tp_driver((struct tp_pcb *)isop->isop_socket->so_pcb, &e); +} + +ProtoHook +tpiso_reset(isop) + struct isopcb *isop; +{ + struct tp_event e; + + e.ev_number = T_NETRESET; + return tp_driver((struct tp_pcb *)isop->isop_socket->so_pcb, &e); + +} + +#endif /* ISO */ diff --git a/sys/netiso/tp_meas.c b/sys/netiso/tp_meas.c new file mode 100644 index 00000000000..f8bbbe6dceb --- /dev/null +++ b/sys/netiso/tp_meas.c @@ -0,0 +1,127 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tp_meas.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+ +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * $Header: tp_meas.c,v 5.2 88/11/18 17:28:04 nhall Exp $ + * $Source: /usr/argo/sys/netiso/RCS/tp_meas.c,v $ + * + * tp_meas.c : create a performance measurement event + * in the circular buffer tp_Meas[] + */ + +#include +#include + +#include +#include + +extern struct timeval time; + +#ifdef TP_PERF_MEAS +int tp_Measn = 0; +struct tp_Meas tp_Meas[TPMEASN]; + +/* + * NAME: Tpmeas() (invoked through the tpmeas() macro) + * + * CALLED FROM: tp_emit(), tp_soisdisconnecting(), tp_soisdisconnected() + * tp0_stash(), tp_stash(), tp_send(), tp_goodack(), tp_usrreq() + * + * FUNCTION and ARGUMENTS: + * stashes a performance-measurement event for the given reference (ref) + * (kind) tells which kind of event. (timev) is the time stored with + * the event only when (kind) is TPtime_from_ll; every other kind is + * stamped with the global (time) instead. + * (seq), (win), and (size) are integers that usually + * refer to the sequence number, window number (on send) and + * size of tpdu or window. + * + * RETURNS: Nada + * + * SIDE EFFECTS: + * Advances tp_Measn, wrapping at TPMEASN, so the oldest slot is + * overwritten; also bumps a private running event counter. + * + * NOTES: + */ +void +Tpmeas(ref, kind, timev, seq, win, size) + u_int ref; + u_int kind; + struct timeval *timev; + u_int seq, win, size; +{ + register struct tp_Meas *tpm; + static int mseq; /* running event number, recorded in tpm_tseq */ + + /* claim the next slot of the circular buffer */ + tpm = &tp_Meas[tp_Measn++]; + tp_Measn %= TPMEASN; + + tpm->tpm_kind = kind; + tpm->tpm_tseq = mseq++; + tpm->tpm_ref = ref; + if(kind == TPtime_from_ll) + bcopy((caddr_t)timev, (caddr_t)&tpm->tpm_time, sizeof(struct timeval)); + else + bcopy( (caddr_t)&time, + (caddr_t)&tpm->tpm_time, sizeof(struct timeval) ); + tpm->tpm_seq = seq; + tpm->tpm_window = win; + tpm->tpm_size = size; +} + +#endif /* TP_PERF_MEAS */ diff --git a/sys/netiso/tp_meas.h b/sys/netiso/tp_meas.h new file mode 100644 index 00000000000..10ef93d350b --- /dev/null +++ b/sys/netiso/tp_meas.h @@ -0,0 +1,94 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tp_meas.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+ +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +#ifdef TP_PERF_MEAS +#define tpmeas(a, b, t, c, d, e) \ + Tpmeas((u_int)(a), (u_int)(b), t, (u_int)(c), (u_int)(d), (u_int)(e)) + +struct tp_Meas { + int tpm_tseq; + u_char tpm_kind; + u_short tpm_ref; + u_short tpm_size; + u_short tpm_window; + u_int tpm_seq; + struct timeval tpm_time; +}; + +#define TPMEASN 4000 +extern int tp_Measn; +extern struct tp_Meas tp_Meas[]; + +/* + * the kinds of events for packet tracing are: + */ +#define TPtime_from_session 0x01 +#define TPtime_to_session 0x02 +#define TPtime_ack_rcvd 0x03 +#define TPtime_ack_sent 0x04 +#define TPtime_from_ll 0x05 +#define TPtime_to_ll 0x06 +#define TPsbsend 0x07 +#define TPtime_open 0x08 +#define TPtime_open_X 0x28 /* xtd format */ +#define TPtime_close 0x09 + +#endif /* TP_PERF_MEAS */ diff --git a/sys/netiso/tp_output.c b/sys/netiso/tp_output.c new file mode 100644 index 00000000000..cdd7c4fe76b --- /dev/null +++ b/sys/netiso/tp_output.c @@ -0,0 +1,712 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tp_output.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. 
+ +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * ARGO TP + * + * $Header: tp_output.c,v 5.4 88/11/18 17:28:08 nhall Exp $ + * $Source: /usr/argo/sys/netiso/RCS/tp_output.c,v $ + * + * In here is tp_ctloutput(), the guy called by [sg]etsockopt(), + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TPDUSIZESHIFT 24 +#define CLASSHIFT 16 + +/* + * NAME: tp_consistency() + * + * CALLED FROM: + * tp_ctloutput(), tp_input() + * + * FUNCTION and ARGUMENTS: + * Checks the consistency of options and tpdusize with class, + * using the parameters passed in via (param). + * (cmd) may be TP_STRICT or TP_FORCE or both. + * Force means it will set all the values in (tpcb) to those in + * the input arguements iff no errors were encountered. + * Strict means that no inconsistency will be tolerated. If it's + * not used, checksum and tpdusize inconsistencies will be tolerated. + * The reason for this is that in some cases, when we're negotiating down + * from class 4, these options should be changed but should not + * cause negotiation to fail. 
+ * + * RETURNS + * E* or EOK + * E* if the various parms aren't ok for a given class + * EOK if they are ok for a given class + */ + +int +tp_consistency( tpcb, cmd, param ) + u_int cmd; + struct tp_conn_param *param; + struct tp_pcb *tpcb; +{ + register int error = EOK; + int class_to_use = tp_mask_to_num(param->p_class); + + IFTRACE(D_SETPARAMS) + tptrace(TPPTmisc, + "tp_consist enter class_to_use dontchange param.class cmd", + class_to_use, param->p_dont_change_params, param->p_class, cmd); + ENDTRACE + IFDEBUG(D_SETPARAMS) + printf("tp_consistency %s %s\n", + cmd& TP_FORCE? "TP_FORCE": "", + cmd& TP_STRICT? "TP_STRICT":""); + ENDDEBUG + if ((cmd & TP_FORCE) && (param->p_dont_change_params)) { + cmd &= ~TP_FORCE; + } + /* can switch net services within a domain, but + * cannot switch domains + */ + switch( param->p_netservice) { + case ISO_CONS: + case ISO_CLNS: + case ISO_COSNS: + /* param->p_netservice in ISO DOMAIN */ + if(tpcb->tp_domain != AF_ISO ) { + error = EINVAL; goto done; + } + break; + case IN_CLNS: + /* param->p_netservice in INET DOMAIN */ + if( tpcb->tp_domain != AF_INET ) { + error = EINVAL; goto done; + } + break; + /* no others not possible-> netservice is a 2-bit field! 
*/ + } + + IFDEBUG(D_SETPARAMS) + printf("p_class 0x%x, class_to_use 0x%x\n", param->p_class, + class_to_use); + ENDDEBUG + if((param->p_netservice < 0) || (param->p_netservice > TP_MAX_NETSERVICES)){ + error = EINVAL; goto done; + } + if( (param->p_class & TP_CLASSES_IMPLEMENTED) == 0 ) { + error = EINVAL; goto done; + } + IFDEBUG(D_SETPARAMS) + printf("Nretrans 0x%x\n", param->p_Nretrans ); + ENDDEBUG + if( ( param->p_Nretrans < 1 ) || + (param->p_cr_ticks < 1) || (param->p_cc_ticks < 1) ) { + /* bad for any class because negot has to be done a la class 4 */ + error = EINVAL; goto done; + } + IFDEBUG(D_SETPARAMS) + printf("use_csum 0x%x\n", param->p_use_checksum ); + printf("xtd_format 0x%x\n", param->p_xtd_format ); + printf("xpd_service 0x%x\n", param->p_xpd_service ); + printf("tpdusize 0x%x\n", param->p_tpdusize ); + printf("tpcb->flags 0x%x\n", tpcb->tp_flags ); + ENDDEBUG + switch( class_to_use ) { + + case 0: + /* do not use checksums, xtd format, or XPD */ + + if( param->p_use_checksum | param->p_xtd_format | param->p_xpd_service ) { + if(cmd & TP_STRICT) { + error = EINVAL; + } else { + param->p_use_checksum = 0; + param->p_xtd_format = 0; + param->p_xpd_service = 0; + } + break; + } + + if (param->p_tpdusize < TP_MIN_TPDUSIZE) { + if(cmd & TP_STRICT) { + error = EINVAL; + } else { + param->p_tpdusize = TP_MIN_TPDUSIZE; + } + break; + } + if (param->p_tpdusize > TP0_TPDUSIZE) { + if (cmd & TP_STRICT) { + error = EINVAL; + } else { + param->p_tpdusize = TP0_TPDUSIZE; + } + break; + } + + /* connect/disc data not allowed for class 0 */ + if (tpcb->tp_ucddata) { + if(cmd & TP_STRICT) { + error = EINVAL; + } else if(cmd & TP_FORCE) { + m_freem(tpcb->tp_ucddata); + tpcb->tp_ucddata = 0; + } + } + break; + + case 4: + IFDEBUG(D_SETPARAMS) + printf("dt_ticks 0x%x\n", param->p_dt_ticks ); + printf("x_ticks 0x%x\n", param->p_x_ticks ); + printf("dr_ticks 0x%x\n", param->p_dr_ticks ); + printf("keepalive 0x%x\n", param->p_keepalive_ticks ); + printf("sendack 
0x%x\n", param->p_sendack_ticks ); + printf("inact 0x%x\n", param->p_inact_ticks ); + printf("ref 0x%x\n", param->p_ref_ticks ); + ENDDEBUG + if( (param->p_class & TP_CLASS_4 ) && ( + (param->p_dt_ticks < 1) || (param->p_dr_ticks < 1) || + (param->p_x_ticks < 1) || (param->p_keepalive_ticks < 1) || + (param->p_sendack_ticks < 1) || (param->p_ref_ticks < 1) || + (param->p_inact_ticks < 1) ) ) { + error = EINVAL; + break; + } + IFDEBUG(D_SETPARAMS) + printf("rx_strat 0x%x\n", param->p_rx_strat ); + ENDDEBUG + if(param->p_rx_strat > + ( TPRX_USE_CW | TPRX_EACH | TPRX_FASTSTART) ) { + if(cmd & TP_STRICT) { + error = EINVAL; + } else { + param->p_rx_strat = TPRX_USE_CW; + } + break; + } + IFDEBUG(D_SETPARAMS) + printf("ack_strat 0x%x\n", param->p_ack_strat ); + ENDDEBUG + if((param->p_ack_strat != 0) && (param->p_ack_strat != 1)) { + if(cmd & TP_STRICT) { + error = EINVAL; + } else { + param->p_ack_strat = TPACK_WINDOW; + } + break; + } + if (param->p_tpdusize < TP_MIN_TPDUSIZE) { + if(cmd & TP_STRICT) { + error = EINVAL; + } else { + param->p_tpdusize = TP_MIN_TPDUSIZE; + } + break; + } + if (param->p_tpdusize > TP_TPDUSIZE) { + if(cmd & TP_STRICT) { + error = EINVAL; + } else { + param->p_tpdusize = TP_TPDUSIZE; + } + break; + } + break; + } + + if ((error==0) && (cmd & TP_FORCE)) { + long dusize = ((long)param->p_ptpdusize) << 7; + /* Enforce Negotation rules below */ + tpcb->tp_class = param->p_class; + if (tpcb->tp_use_checksum || param->p_use_checksum) + tpcb->tp_use_checksum = 1; + if (!tpcb->tp_xpd_service || !param->p_xpd_service) + tpcb->tp_xpd_service = 0; + if (!tpcb->tp_xtd_format || !param->p_xtd_format) + tpcb->tp_xtd_format = 0; + if (dusize) { + if (tpcb->tp_l_tpdusize > dusize) + tpcb->tp_l_tpdusize = dusize; + if (tpcb->tp_ptpdusize == 0 || + tpcb->tp_ptpdusize > param->p_ptpdusize) + tpcb->tp_ptpdusize = param->p_ptpdusize; + } else { + if (param->p_tpdusize != 0 && + tpcb->tp_tpdusize > param->p_tpdusize) + tpcb->tp_tpdusize = param->p_tpdusize; + 
tpcb->tp_l_tpdusize = 1 << tpcb->tp_tpdusize; + } + } +done: + + IFTRACE(D_CONN) + tptrace(TPPTmisc, "tp_consist returns class xtdfmt cmd", + error, tpcb->tp_class, tpcb->tp_xtd_format, cmd); + ENDTRACE + IFDEBUG(D_CONN) + printf( + "tp_consist rtns 0x%x class 0x%x xtd_fmt 0x%x cmd 0x%x\n", + error, tpcb->tp_class, tpcb->tp_xtd_format, cmd); + ENDDEBUG + return error; +} + +/* + * NAME: tp_ctloutput() + * + * CALLED FROM: + * [sg]etsockopt(), via so[sg]etopt(). + * + * FUNCTION and ARGUMENTS: + * Implements the socket options at transport level. + * (cmd) is either PRCO_SETOPT or PRCO_GETOPT (see ../sys/protosw.h). + * (so) is the socket. + * (level) is SOL_TRANSPORT (see ../sys/socket.h) + * (optname) is the particular command or option to be set. + * (**mp) is an mbuf structure. + * + * RETURN VALUE: + * ENOTSOCK if the socket hasn't got an associated tpcb + * EINVAL if + * trying to set window too big + * trying to set illegal max tpdu size + * trying to set illegal credit fraction + * trying to use unknown or unimplemented class of TP + * structure passed to set timer values is wrong size + * illegal combination of command/GET-SET option, + * e.g., GET w/ TPOPT_CDDATA_CLEAR: + * EOPNOTSUPP if the level isn't transport, or command is neither GET nor SET + * or if the transport-specific command is not implemented + * EISCONN if trying a command that isn't allowed after a connection + * is established + * ENOTCONN if trying a command that is allowed only if a connection is + * established + * EMSGSIZE if trying to give too much data on connect/disconnect + * ENOBUFS if no mbuf can be allocated to hold the option value + * + * SIDE EFFECTS: + * Runs at splnet(); the saved priority level is restored with splx() + * on every return path. + * + * NOTES: + * (level) SOL_NETWORK is handed to the network layer's nlp_ctloutput, + * and a SOL_SOCKET/SO_RCVBUF set is also handled here. + */ +ProtoHook +tp_ctloutput(cmd, so, level, optname, mp) + int cmd, level, optname; + struct socket *so; + struct mbuf **mp; +{ + struct tp_pcb *tpcb = sototpcb(so); + int s = splnet(); + caddr_t value; + unsigned val_len; + int error = 0; + + IFTRACE(D_REQUEST) + tptrace(TPPTmisc, "tp_ctloutput cmd so optname mp", + cmd, so, optname, mp); + ENDTRACE + IFDEBUG(D_REQUEST) +
printf( + "tp_ctloutput so 0x%x cmd 0x%x optname 0x%x, mp 0x%x *mp 0x%x tpcb 0x%x\n", + so, cmd, optname, mp, mp?*mp:0, tpcb); + ENDDEBUG + if( tpcb == (struct tp_pcb *)0 ) { + error = ENOTSOCK; goto done; + } + if(*mp == MNULL) { + register struct mbuf *m; + + MGET(m, M_DONTWAIT, TPMT_SONAME); /* does off, type, next */ + if (m == NULL) { + splx(s); + return ENOBUFS; + } + m->m_len = 0; + m->m_act = 0; + *mp = m; + } + + /* + * Hook so one can set network options via a tp socket. + */ + if ( level == SOL_NETWORK ) { + if ((tpcb->tp_nlproto == NULL) || (tpcb->tp_npcb == NULL)) + error = ENOTSOCK; + else if (tpcb->tp_nlproto->nlp_ctloutput == NULL) + error = EOPNOTSUPP; + else { + /* + * As before, this path bypasses done: so *mp is neither + * freed nor compressed here; the priority level raised by + * splnet() must still be dropped before returning. + */ + error = (tpcb->tp_nlproto->nlp_ctloutput)(cmd, optname, + tpcb->tp_npcb, *mp); + splx(s); + return error; + } + goto done; + } else if ( level == SOL_SOCKET) { + if (optname == SO_RCVBUF && cmd == PRCO_SETOPT) { + u_long old_credit = tpcb->tp_maxlcredit; + tp_rsyset(tpcb); + if (tpcb->tp_rhiwat != so->so_rcv.sb_hiwat && + tpcb->tp_state == TP_OPEN && + (old_credit < tpcb->tp_maxlcredit)) + tp_emit(AK_TPDU_type, tpcb, + tpcb->tp_rcvnxt, 0, MNULL); + tpcb->tp_rhiwat = so->so_rcv.sb_hiwat; + } + goto done; + } else if ( level != SOL_TRANSPORT ) { + error = EOPNOTSUPP; goto done; + } + if (cmd != PRCO_GETOPT && cmd != PRCO_SETOPT) { + error = EOPNOTSUPP; goto done; + } + if ( so->so_error ) { + error = so->so_error; goto done; + } + + /* The only options allowed after connection is established + * are GET (anything) and SET DISC DATA and SET PERF MEAS + */ + if ( ((so->so_state & SS_ISCONNECTING)||(so->so_state & SS_ISCONNECTED)) + && + (cmd == PRCO_SETOPT && + optname != TPOPT_DISC_DATA && + optname != TPOPT_CFRM_DATA && + optname != TPOPT_PERF_MEAS && + optname != TPOPT_CDDATA_CLEAR ) ) { + error = EISCONN; goto done; + } + /* The only options allowed after disconnection are GET DISC DATA, + * and TPOPT_PSTATISTICS + * and they're not allowed if the ref timer has gone off, because + * the tpcb is gone + */ + if
((so->so_state & (SS_ISCONNECTED | SS_ISCONFIRMING)) == 0) { + if ( so->so_pcb == (caddr_t)0 ) { + error = ENOTCONN; goto done; + } + if ( (tpcb->tp_state == TP_REFWAIT || tpcb->tp_state == TP_CLOSING) && + (optname != TPOPT_DISC_DATA && optname != TPOPT_PSTATISTICS)) { + error = ENOTCONN; goto done; + } + } + + value = mtod(*mp, caddr_t); /* it's aligned, don't worry, + * but lint complains about it + */ + val_len = (*mp)->m_len; + + switch (optname) { + + case TPOPT_INTERCEPT: +#define INA(t) (((struct inpcb *)(t->tp_npcb))->inp_laddr.s_addr) +#define ISOA(t) (((struct isopcb *)(t->tp_npcb))->isop_laddr->siso_addr) + + if ((so->so_state & SS_PRIV) == 0) { + error = EPERM; + } else if (cmd != PRCO_SETOPT || tpcb->tp_state != TP_CLOSED || + (tpcb->tp_flags & TPF_GENERAL_ADDR) || + tpcb->tp_next == 0) + error = EINVAL; + else { + register struct tp_pcb *t; + error = EADDRINUSE; + for (t = tp_listeners; t; t = t->tp_nextlisten) + if ((t->tp_flags & TPF_GENERAL_ADDR) == 0 && + t->tp_domain == tpcb->tp_domain) + switch (tpcb->tp_domain) { + default: + goto done; +#ifdef INET + case AF_INET: + if (INA(t) == INA(tpcb)) + goto done; + continue; +#endif +#ifdef ISO + case AF_ISO: + if (bcmp(ISOA(t).isoa_genaddr, ISOA(tpcb).isoa_genaddr, + ISOA(t).isoa_len) == 0) + goto done; + continue; +#endif + } + tpcb->tp_lsuffixlen = 0; + tpcb->tp_state = TP_LISTENING; + error = 0; + remque(tpcb); + tpcb->tp_next = tpcb->tp_prev = tpcb; + tpcb->tp_nextlisten = tp_listeners; + tp_listeners = tpcb; + } + break; + + case TPOPT_MY_TSEL: + if ( cmd == PRCO_GETOPT ) { + ASSERT( tpcb->tp_lsuffixlen <= MAX_TSAP_SEL_LEN ); + bcopy((caddr_t)tpcb->tp_lsuffix, value, tpcb->tp_lsuffixlen); + (*mp)->m_len = tpcb->tp_lsuffixlen; + } else /* cmd == PRCO_SETOPT */ { + if( (val_len > MAX_TSAP_SEL_LEN) || (val_len <= 0 )) { + /* print the length the format names, not the mbuf pointer */ + printf("val_len 0x%x (*mp)->m_len 0x%x\n", val_len, (*mp)->m_len); + error = EINVAL; + } else { + bcopy(value, (caddr_t)tpcb->tp_lsuffix, val_len); + tpcb->tp_lsuffixlen = val_len; + } +
} + break; + + case TPOPT_PEER_TSEL: + if ( cmd == PRCO_GETOPT ) { + ASSERT( tpcb->tp_fsuffixlen <= MAX_TSAP_SEL_LEN ); + bcopy((caddr_t)tpcb->tp_fsuffix, value, tpcb->tp_fsuffixlen); + (*mp)->m_len = tpcb->tp_fsuffixlen; + } else /* cmd == PRCO_SETOPT */ { + if( (val_len > MAX_TSAP_SEL_LEN) || (val_len <= 0 )) { + /* print the length the format names, not the mbuf pointer */ + printf("val_len 0x%x (*mp)->m_len 0x%x\n", val_len, (*mp)->m_len); + error = EINVAL; + } else { + bcopy(value, (caddr_t)tpcb->tp_fsuffix, val_len); + tpcb->tp_fsuffixlen = val_len; + } + } + break; + + case TPOPT_FLAGS: + IFDEBUG(D_REQUEST) + printf("%s TPOPT_FLAGS value 0x%x *value 0x%x, flags 0x%x \n", + cmd==PRCO_GETOPT?"GET":"SET", + value, + *value, + tpcb->tp_flags); + ENDDEBUG + + if ( cmd == PRCO_GETOPT ) { + *(int *)value = (int)tpcb->tp_flags; + (*mp)->m_len = sizeof(u_int); + } else /* cmd == PRCO_SETOPT */ { + error = EINVAL; goto done; + } + break; + + case TPOPT_PARAMS: + /* This handles: + * timer values, + * class, use of transport expedited data, + * max tpdu size, checksum, xtd format and + * disconnect indications, and may get rid of connect/disc data + */ + IFDEBUG(D_SETPARAMS) + printf("TPOPT_PARAMS value 0x%x, cmd %s \n", value, + cmd==PRCO_GETOPT?"GET":"SET"); + ENDDEBUG + IFDEBUG(D_REQUEST) + printf("TPOPT_PARAMS value 0x%x, cmd %s \n", value, + cmd==PRCO_GETOPT?"GET":"SET"); + ENDDEBUG + + if ( cmd == PRCO_GETOPT ) { + *(struct tp_conn_param *)value = tpcb->_tp_param; + (*mp)->m_len = sizeof(tpcb->_tp_param); + } else /* cmd == PRCO_SETOPT */ { + /* + * The option data is about to be read as a whole + * struct tp_conn_param; refuse anything shorter. + */ + if (val_len < sizeof(struct tp_conn_param)) { + error = EINVAL; goto done; + } + if( (error = + tp_consistency(tpcb, TP_STRICT | TP_FORCE, + (struct tp_conn_param *)value))==0) { + /* + * tp_consistency doesn't copy the whole set of params + */ + tpcb->_tp_param = *(struct tp_conn_param *)value; + (*mp)->m_len = sizeof(tpcb->_tp_param); + } + } + break; + + case TPOPT_PSTATISTICS: +#ifdef TP_PERF_MEAS + if (cmd == PRCO_SETOPT) { + error = EINVAL; goto done; + } + IFPERF(tpcb) + if (*mp) { + struct mbuf * n; + do { + MFREE(*mp, n); + *mp = n; + } while (n); + } + *mp =
m_copym(tpcb->tp_p_mbuf, (int)M_COPYALL, M_WAITOK); + ENDPERF + else { + error = EINVAL; goto done; + } + break; +#else + error = EOPNOTSUPP; + goto done; +#endif /* TP_PERF_MEAS */ + + case TPOPT_CDDATA_CLEAR: + if (cmd == PRCO_GETOPT) { + error = EINVAL; + } else { + if (tpcb->tp_ucddata) { + m_freem(tpcb->tp_ucddata); + tpcb->tp_ucddata = 0; + } + } + break; + + case TPOPT_CFRM_DATA: + case TPOPT_DISC_DATA: + case TPOPT_CONN_DATA: + if( tpcb->tp_class == TP_CLASS_0 ) { + error = EOPNOTSUPP; + break; + } + IFDEBUG(D_REQUEST) + printf("%s\n", optname==TPOPT_DISC_DATA?"DISC data":"CONN data"); + printf("m_len 0x%x, vallen 0x%x so_snd.cc 0x%x\n", + (*mp)->m_len, val_len, so->so_snd.sb_cc); + dump_mbuf(so->so_snd.sb_mb, "tp_ctloutput: sosnd "); + ENDDEBUG + if (cmd == PRCO_SETOPT) { + int len = tpcb->tp_ucddata ? tpcb->tp_ucddata->m_len : 0; + /* can append connect data in several calls */ + if (len + val_len > + (optname==TPOPT_CONN_DATA?TP_MAX_CR_DATA:TP_MAX_DR_DATA) ) { + error = EMSGSIZE; goto done; + } + (*mp)->m_next = MNULL; + (*mp)->m_act = 0; + if (tpcb->tp_ucddata) + m_cat(tpcb->tp_ucddata, *mp); + else + tpcb->tp_ucddata = *mp; + IFDEBUG(D_REQUEST) + dump_mbuf(tpcb->tp_ucddata, "tp_ctloutput after CONN_DATA"); + ENDDEBUG + IFTRACE(D_REQUEST) + tptrace(TPPTmisc,"C/D DATA: flags snd.sbcc val_len", + tpcb->tp_flags, so->so_snd.sb_cc,val_len,0); + ENDTRACE + *mp = MNULL; + if (optname == TPOPT_CFRM_DATA && (so->so_state & SS_ISCONFIRMING)) + (void) tp_confirm(tpcb); + } + break; + + case TPOPT_PERF_MEAS: +#ifdef TP_PERF_MEAS + if (cmd == PRCO_GETOPT) { + /* store a whole u_int, matching the m_len reported below */ + *(u_int *)value = (u_int)tpcb->tp_perf_on; + (*mp)->m_len = sizeof(u_int); + } else if (cmd == PRCO_SETOPT) { + (*mp)->m_len = 0; + if ((*value) != 0 && (*value) != 1 ) + error = EINVAL; + else tpcb->tp_perf_on = (*value); + } + if( tpcb->tp_perf_on ) + error = tp_setup_perf(tpcb); +#else /* TP_PERF_MEAS */ + error = EOPNOTSUPP; +#endif /* TP_PERF_MEAS */ + break; + + default: + error = EOPNOTSUPP; + } + +done: +
IFDEBUG(D_REQUEST) + dump_mbuf(so->so_snd.sb_mb, "tp_ctloutput sosnd at end"); + dump_mbuf(*mp, "tp_ctloutput *mp"); + ENDDEBUG + /* + * sigh: getsockopt looks only at m_len : all output data must + * reside in the first mbuf + */ + if (*mp) { + if (cmd == PRCO_SETOPT) { + m_freem(*mp); + *mp = MNULL; + } else { + ASSERT ( m_compress(*mp, mp) <= MLEN ); + if (error) + (*mp)->m_len = 0; + IFDEBUG(D_REQUEST) + dump_mbuf(*mp, "tp_ctloutput *mp after compress"); + ENDDEBUG + } + } + splx(s); + return error; +} diff --git a/sys/netiso/tp_param.h b/sys/netiso/tp_param.h new file mode 100644 index 00000000000..f1862a24392 --- /dev/null +++ b/sys/netiso/tp_param.h @@ -0,0 +1,367 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tp_param.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+ +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * ARGO TP + * + * $Header: tp_param.h,v 5.3 88/11/18 17:28:18 nhall Exp $ + * $Source: /usr/argo/sys/netiso/RCS/tp_param.h,v $ + * + */ + +#ifndef __TP_PARAM__ +#define __TP_PARAM__ + + +/****************************************************** + * compile time parameters that can be changed + *****************************************************/ + +#define TP_CLASSES_IMPLEMENTED 0x11 /* zero and 4 */ + +#define TP_DECBIT_CLEAR_COUNT 3 + +/*#define N_TPREF 100 */ +#ifdef KERNEL +extern int N_TPREF; +#endif + +#define TP_SOCKBUFSIZE ((u_long)4096) +#define TP0_SOCKBUFSIZE ((u_long)512) +#define MAX_TSAP_SEL_LEN 64 + +/* maximum tpdu size we'll accept: */ +#define TP_TPDUSIZE 0xc /* 4096 octets for classes 1-4*/ +#define TP0_TPDUSIZE 0xb /* 2048 octets for class 0 */ +#define TP_DFL_TPDUSIZE 0x7 /* 128 octets default */ + /* NOTE: don't ever negotiate 8192 because could get + * wraparound in checksumming + * (No mtu is likely to be larger than 4K anyway...) + */ +#define TP_NRETRANS 12 /* TCP_MAXRXTSHIFT + 1 */ +#define TP_MAXRXTSHIFT 6 /* factor of 64 */ +#define TP_MAXPORT 0xefff + +/* ALPHA: to be used in the context: gain= 1/(2**alpha), or + * put another way, gaintimes(x) (x)>>alpha (forgetting the case alpha==0) + */ +#define TP_RTT_ALPHA 3 +#define TP_RTV_ALPHA 2 +/* + * Retransmission timer value: + * (tp_rttadd + tp_rtt + 4 * tp_rtv) / tp_rttdiv. + * The shift needs its own parentheses: in C "+" binds tighter than "<<", + * so without them the whole sum would be multiplied by 4. + * NOTE(review): assumes the intended estimate is rtt + 4*rtv - confirm. + */ +#define TP_REXMTVAL(tpcb)\ + ((tp_rttadd + (tpcb)->tp_rtt + ((tpcb)->tp_rtv << 2)) / tp_rttdiv) +#define TP_RANGESET(tv, value, min, max) \ + ((tv = value) > (max) ? (tv = max) : (tv < min ?
tv = min : tv)) + +/* + * not sure how to treat data on disconnect + */ +#define T_CONN_DATA 0x1 +#define T_DISCONNECT 0x2 +#define T_DISC_DATA 0x4 +#define T_XDATA 0x8 + +#define ISO_CLNS 0 +#define IN_CLNS 1 +#define ISO_CONS 2 +#define ISO_COSNS 3 +#define TP_MAX_NETSERVICES 3 + +/* Indices into tp stats ackreason[i] */ +#define _ACK_DONT_ 0 +#define _ACK_STRAT_EACH_ 0x1 +#define _ACK_STRAT_FULLWIN_ 0x2 +#define _ACK_DUP_ 0x3 +#define _ACK_EOT_ 0x4 +#define _ACK_REORDER_ 0x5 +#define _ACK_USRRCV_ 0x6 +#define _ACK_FCC_ 0x7 +#define _ACK_NUM_REASONS_ 0x8 + +/* masks for use in tp_stash() */ +#define ACK_DONT 0 +#define ACK_STRAT_EACH (1<< _ACK_STRAT_EACH_) +#define ACK_STRAT_FULLWIN (1<< _ACK_STRAT_FULLWIN_) +#define ACK_DUP (1<< _ACK_DUP_) +#define ACK_EOT (1<< _ACK_EOT_) +#define ACK_REORDER (1<< _ACK_REORDER_) + +/****************************************************** + * constants used in the protocol + *****************************************************/ + +#define TP_VERSION 0x1 + +#define TP_MAX_HEADER_LEN 256 + +#define TP_MIN_TPDUSIZE 0x7 /* 128 octets */ +#define TP_MAX_TPDUSIZE 0xd /* 8192 octets */ + +#define TP_MAX_XPD_DATA 0x10 /* 16 octets */ +#define TP_MAX_CC_DATA 0x20 /* 32 octets */ +#define TP_MAX_CR_DATA TP_MAX_CC_DATA +#define TP_MAX_DR_DATA 0x40 /* 64 octets */ + +#define TP_XTD_FMT_BIT 0x80000000 +#define TP_XTD_FMT_MASK 0x7fffffff +#define TP_NML_FMT_BIT 0x80 +#define TP_NML_FMT_MASK 0x7f + +/* + * values for the tpdu_type field, 2nd byte in a tpdu + */ + +#define TP_MIN_TPDUTYPE 0x1 + +#define XPD_TPDU_type 0x1 +#define XAK_TPDU_type 0x2 +#define GR_TPDU_type 0x3 +#define AK_TPDU_type 0x6 +#define ER_TPDU_type 0x7 +#define DR_TPDU_type 0x8 +#define DC_TPDU_type 0xc +#define CC_TPDU_type 0xd +#define CR_TPDU_type 0xe +#define DT_TPDU_type 0xf + +#define TP_MAX_TPDUTYPE 0xf + +/* + * identifiers for the variable-length options in tpdus + */ + +#define TPP_acktime 0x85 +#define TPP_residER 0x86 +#define TPP_priority 0x87 +#define 
TPP_transdelay 0x88 +#define TPP_throughput 0x89 +#define TPP_subseq 0x8a +#define TPP_flow_cntl_conf 0x8c /* not implemented */ +#define TPP_addl_info 0xe0 +#define TPP_tpdu_size 0xc0 +#define TPP_calling_sufx 0xc1 +#define TPP_invalid_tpdu 0xc1 /* the bozos used a value twice */ +#define TPP_called_sufx 0xc2 +#define TPP_checksum 0xc3 +#define TPP_vers 0xc4 +#define TPP_security 0xc5 +#define TPP_addl_opt 0xc6 +#define TPP_alt_class 0xc7 +#define TPP_perf_meas 0xc8 /* local item : perf meas on, svp */ +#define TPP_ptpdu_size 0xf0 /* preferred TPDU size */ +#define TPP_inact_time 0xf2 /* inactivity time exchanged */ + + +/****************************************************** + * Some fundamental data types + *****************************************************/ +#ifndef TRUE +#define TRUE 1 +#endif /* TRUE */ + +#ifndef FALSE +#define FALSE 0 +#endif /* FALSE */ + +#define TP_LOCAL 22 +#define TP_FOREIGN 33 + +#ifndef EOK +#define EOK 0 +#endif /* EOK */ + +#define TP_CLASS_0 (1<<0) +#define TP_CLASS_1 (1<<1) +#define TP_CLASS_2 (1<<2) +#define TP_CLASS_3 (1<<3) +#define TP_CLASS_4 (1<<4) + +#define TP_FORCE 0x1 +#define TP_STRICT 0x2 + +#ifndef MNULL +#define MNULL (struct mbuf *)0 +#endif /* MNULL */ + /* if ../sys/mbuf.h gets MT_types up to 0x40, these will + * have to be changed: + */ +#define MT_XPD 0x44 +#define MT_EOT 0x40 + +#define TP_ENOREF 0x80000000 + +typedef unsigned int SeqNum; +typedef unsigned short RefNum; +typedef int ProtoHook; + +/****************************************************** + * Macro used all over, for driver + *****************************************************/ + +#define DoEvent(x) \ + ((E.ev_number=(x)),(tp_driver(tpcb,&E))) + +/****************************************************** + * Some macros used all over, for timestamping + *****************************************************/ + +#define GET_CUR_TIME(tvalp) ((*tvalp) = time) + +/* + * Store the elapsed time (time - *oldtvalp) in *diffp. + * If the microsecond difference is negative, one second is borrowed: + * tv_sec is decremented and 1000000 is added to tv_usec. + */ +#define GET_TIME_SINCE(oldtvalp, diffp) {\ + (diffp)->tv_sec = time.tv_sec -
(oldtvalp)->tv_sec;\ + (diffp)->tv_usec = time.tv_usec - (oldtvalp)->tv_usec;\ + if( (diffp)->tv_usec <0 ) {\ + (diffp)->tv_sec --;\ + (diffp)->tv_usec += 1000000;\ + }\ +} + +/****************************************************** + * Some macros used for address families + *****************************************************/ + +#define satosiso(ADDR) ((struct sockaddr_iso *)(ADDR)) +#define satosin(ADDR) ((struct sockaddr_in *)(ADDR)) + +/****************************************************** + * Macro used for changing types of mbufs + *****************************************************/ + +#define CHANGE_MTYPE(m, TYPE)\ + if((m)->m_type != TYPE) { \ + mbstat.m_mtypes[(m)->m_type]--; mbstat.m_mtypes[TYPE]++; \ + (m)->m_type = TYPE; \ + } + +/****************************************************** + * Macros used for adding options to a tpdu header and for + * parsing the headers. + * Options are variable-length and must be bcopy-d because on the + * RT your assignments must be N-word aligned for objects of length + * N. Such a drag.
+ *****************************************************/ + +struct tp_vbp { + u_char tpv_code; + char tpv_len; + char tpv_val; +}; +#define vbptr(x) ((struct tp_vbp *)(x)) +#define vbval(x,type) (*((type *)&(((struct tp_vbp *)(x))->tpv_val))) +#define vbcode(x) (vbptr(x)->tpv_code) +#define vblen(x) (vbptr(x)->tpv_len) + +#define vb_putval(dst,type,src)\ + bcopy((caddr_t)&(src),(caddr_t)&(((struct tp_vbp *)(dst))->tpv_val),\ + sizeof(type)) + +#define vb_getval(src,type,dst)\ +bcopy((caddr_t)&(((struct tp_vbp *)(src))->tpv_val),(caddr_t)&(dst),sizeof(type)) + +#define ADDOPTION(type, DU, len, src)\ +{ register caddr_t P;\ + P = (caddr_t)(DU) + (int)((DU)->tpdu_li);\ + vbptr(P)->tpv_code = type;\ + vbptr(P)->tpv_len = len;\ + bcopy((caddr_t)&src, (caddr_t)&(vbptr(P)->tpv_val), (unsigned)len);\ + DU->tpdu_li += len+2;/* 1 for code, 1 for length */\ +} +/****************************************************** + * Macro for the local credit: + * uses max transmission unit for the ll + * (as modified by the max TPDU size negotiated) + *****************************************************/ + +#if defined(ARGO_DEBUG)&&!defined(LOCAL_CREDIT_EXPAND) +#define LOCAL_CREDIT(tpcb) tp_local_credit(tpcb) +#else +#define LOCAL_CREDIT(tpcb) { if (tpcb->tp_rsycnt == 0) {\ + register struct sockbuf *xxsb = &((tpcb)->tp_sock->so_rcv);\ + register int xxi = sbspace(xxsb);\ + xxi = (xxi<0) ? 
0 : ((xxi) / (tpcb)->tp_l_tpdusize);\ + xxi = min(xxi, (tpcb)->tp_maxlcredit); \ + if (!(tpcb->tp_cebit_off)) { \ + (tpcb)->tp_lcredit = ROUND((tpcb)->tp_win_recv); \ + if (xxi < (tpcb)->tp_lcredit) { \ + (tpcb)->tp_lcredit = xxi; \ + } \ + } else \ + (tpcb)->tp_lcredit = xxi; \ +} } +#endif /* ARGO_DEBUG */ + +#ifdef KERNEL +extern int tp_rttadd, tp_rttdiv; +#include +#define printf logpri(LOG_DEBUG),addlog + +#ifndef tp_NSTATES + +#include +#include +#if defined(__STDC__) || defined(__cplusplus) +#undef ATTR +#define ATTR(X) ev_union.EV_ ## X +#endif /* defined(__STDC__) || defined(__cplusplus) */ + +#endif /* tp_NSTATES */ +#endif /* KERNEL */ + +#endif /* __TP_PARAM__ */ diff --git a/sys/netiso/tp_pcb.c b/sys/netiso/tp_pcb.c new file mode 100644 index 00000000000..de345c1e377 --- /dev/null +++ b/sys/netiso/tp_pcb.c @@ -0,0 +1,999 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tp_pcb.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+ +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * ARGO TP + * + * $Header: tp_pcb.c,v 5.4 88/11/18 17:28:24 nhall Exp $ + * $Source: /usr/argo/sys/netiso/RCS/tp_pcb.c,v $ + * + * + * This is the initialization and cleanup stuff - + * for the tp machine in general as well as for the individual pcbs. + * tp_init() is called at system startup. tp_attach() and tp_getref() are + * called when a socket is created. tp_detach() and tp_freeref() + * are called during the closing stage and/or when the reference timer + * goes off. + * tp_soisdisconnecting() and tp_soisdisconnected() are tp-specific + * versions of soisconnect* + * and are called (obviously) during the closing phase. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* ticks are in units of: + * 500 nano-fortnights ;-) or + * 500 ms or + * 1/2 second + */ + +struct tp_conn_param tp_conn_param[] = { + /* ISO_CLNS: TP4 CONNECTION LESS */ + { + TP_NRETRANS, /* short p_Nretrans; */ + 20, /* 10 sec */ /* short p_dr_ticks; */ + + 20, /* 10 sec */ /* short p_cc_ticks; */ + 20, /* 10 sec */ /* short p_dt_ticks; */ + + 40, /* 20 sec */ /* short p_x_ticks; */ + 80, /* 40 sec */ /* short p_cr_ticks;*/ + + 240, /* 2 min */ /* short p_keepalive_ticks;*/ + 10, /* 5 sec */ /* short p_sendack_ticks; */ + + 600, /* 5 min */ /* short p_ref_ticks; */ + 360, /* 3 min */ /* short p_inact_ticks; */ + + (short) 100, /* short p_lcdtfract */ + (short) TP_SOCKBUFSIZE, /* short p_winsize */ + TP_TPDUSIZE, /* u_char p_tpdusize */ + + TPACK_WINDOW, /* 4 bits p_ack_strat */ + TPRX_USE_CW | TPRX_FASTSTART, + /* 4 bits p_rx_strat*/ + TP_CLASS_4 | TP_CLASS_0,/* 5 bits p_class */ + 1, /* 1 bit xtd format */ + 1, /* 1 bit xpd service */ + 1, /* 1 bit use_checksum */ 
+ 0, /* 1 bit use net xpd */ + 0, /* 1 bit use rcc */ + 0, /* 1 bit use efc */ + 1, /* no disc indications */ + 0, /* don't change params */ + ISO_CLNS, /* p_netservice */ + }, + /* IN_CLNS: TP4 CONNECTION LESS */ + { + TP_NRETRANS, /* short p_Nretrans; */ + 20, /* 10 sec */ /* short p_dr_ticks; */ + + 20, /* 10 sec */ /* short p_cc_ticks; */ + 20, /* 10 sec */ /* short p_dt_ticks; */ + + 40, /* 20 sec */ /* short p_x_ticks; */ + 80, /* 40 sec */ /* short p_cr_ticks;*/ + + 240, /* 2 min */ /* short p_keepalive_ticks;*/ + 10, /* 5 sec */ /* short p_sendack_ticks; */ + + 600, /* 5 min */ /* short p_ref_ticks; */ + 360, /* 3 min */ /* short p_inact_ticks; */ + + (short) 100, /* short p_lcdtfract */ + (short) TP_SOCKBUFSIZE, /* short p_winsize */ + TP_TPDUSIZE, /* u_char p_tpdusize */ + + TPACK_WINDOW, /* 4 bits p_ack_strat */ + TPRX_USE_CW | TPRX_FASTSTART, + /* 4 bits p_rx_strat*/ + TP_CLASS_4, /* 5 bits p_class */ + 1, /* 1 bit xtd format */ + 1, /* 1 bit xpd service */ + 1, /* 1 bit use_checksum */ + 0, /* 1 bit use net xpd */ + 0, /* 1 bit use rcc */ + 0, /* 1 bit use efc */ + 1, /* no disc indications */ + 0, /* don't change params */ + IN_CLNS, /* p_netservice */ + }, + /* ISO_CONS: TP0 CONNECTION MODE */ + { + TP_NRETRANS, /* short p_Nretrans; */ + 0, /* n/a */ /* short p_dr_ticks; */ + + 40, /* 20 sec */ /* short p_cc_ticks; */ + 0, /* n/a */ /* short p_dt_ticks; */ + + 0, /* n/a */ /* short p_x_ticks; */ + 360, /* 3 min */ /* short p_cr_ticks;*/ + + 0, /* n/a */ /* short p_keepalive_ticks;*/ + 0, /* n/a */ /* short p_sendack_ticks; */ + + 600, /* for cr/cc to clear *//* short p_ref_ticks; */ + 0, /* n/a */ /* short p_inact_ticks; */ + + /* Use tp4 defaults just in case the user changes ONLY + * the class + */ + (short) 100, /* short p_lcdtfract */ + (short) TP0_SOCKBUFSIZE, /* short p_winsize */ + TP0_TPDUSIZE, /* 8 bits p_tpdusize */ + + 0, /* 4 bits p_ack_strat */ + 0, /* 4 bits p_rx_strat*/ + TP_CLASS_0, /* 5 bits p_class */ + 0, /* 1 bit xtd format */ + 
0, /* 1 bit xpd service */ + 0, /* 1 bit use_checksum */ + 0, /* 1 bit use net xpd */ + 0, /* 1 bit use rcc */ + 0, /* 1 bit use efc */ + 0, /* no disc indications */ + 0, /* don't change params */ + ISO_CONS, /* p_netservice */ + }, + /* ISO_COSNS: TP4 CONNECTION LESS SERVICE over CONSNS */ + { + TP_NRETRANS, /* short p_Nretrans; */ + 40, /* 20 sec */ /* short p_dr_ticks; */ + + 40, /* 20 sec */ /* short p_cc_ticks; */ + 80, /* 40 sec */ /* short p_dt_ticks; */ + + 120, /* 1 min */ /* short p_x_ticks; */ + 360, /* 3 min */ /* short p_cr_ticks;*/ + + 360, /* 3 min */ /* short p_keepalive_ticks;*/ + 20, /* 10 sec */ /* short p_sendack_ticks; */ + + 600, /* 5 min */ /* short p_ref_ticks; */ + 480, /* 4 min */ /* short p_inact_ticks; */ + + (short) 100, /* short p_lcdtfract */ + (short) TP0_SOCKBUFSIZE, /* short p_winsize */ + TP0_TPDUSIZE, /* u_char p_tpdusize */ + + TPACK_WINDOW, /* 4 bits p_ack_strat */ + TPRX_USE_CW , /* No fast start */ + /* 4 bits p_rx_strat*/ + TP_CLASS_4 | TP_CLASS_0,/* 5 bits p_class */ + 0, /* 1 bit xtd format */ + 1, /* 1 bit xpd service */ + 1, /* 1 bit use_checksum */ + 0, /* 1 bit use net xpd */ + 0, /* 1 bit use rcc */ + 0, /* 1 bit use efc */ + 0, /* no disc indications */ + 0, /* don't change params */ + ISO_COSNS, /* p_netservice */ + }, +}; + +#ifdef INET +int in_putnetaddr(); +int in_getnetaddr(); +int in_cmpnetaddr(); +int in_putsufx(); +int in_getsufx(); +int in_recycle_tsuffix(); +int tpip_mtu(); +int in_pcbbind(); +int in_pcbconnect(); +int in_pcbdisconnect(); +int in_pcbdetach(); +int in_pcballoc(); +int tpip_output(); +int tpip_output_dg(); +struct inpcb tp_inpcb; +#endif /* INET */ +#ifdef ISO +int iso_putnetaddr(); +int iso_getnetaddr(); +int iso_cmpnetaddr(); +int iso_putsufx(); +int iso_getsufx(); +int iso_recycle_tsuffix(); +int tpclnp_mtu(); +int iso_pcbbind(); +int iso_pcbconnect(); +int iso_pcbdisconnect(); +int iso_pcbdetach(); +int iso_pcballoc(); +int tpclnp_output(); +int tpclnp_output_dg(); +int 
iso_nlctloutput(); +struct isopcb tp_isopcb; +#endif /* ISO */ +#ifdef TPCONS +int iso_putnetaddr(); +int iso_getnetaddr(); +int iso_cmpnetaddr(); +int iso_putsufx(); +int iso_getsufx(); +int iso_recycle_tsuffix(); +int iso_pcbbind(); +int tpcons_pcbconnect(); +int tpclnp_mtu(); +int iso_pcbdisconnect(); +int iso_pcbdetach(); +int iso_pcballoc(); +int tpcons_output(); +struct isopcb tp_isopcb; +#endif /* TPCONS */ + + +struct nl_protosw nl_protosw[] = { + /* ISO_CLNS */ +#ifdef ISO + { AF_ISO, iso_putnetaddr, iso_getnetaddr, iso_cmpnetaddr, + iso_putsufx, iso_getsufx, + iso_recycle_tsuffix, + tpclnp_mtu, iso_pcbbind, iso_pcbconnect, + iso_pcbdisconnect, iso_pcbdetach, + iso_pcballoc, + tpclnp_output, tpclnp_output_dg, iso_nlctloutput, + (caddr_t) &tp_isopcb, + }, +#else + { 0 }, +#endif /* ISO */ + /* IN_CLNS */ +#ifdef INET + { AF_INET, in_putnetaddr, in_getnetaddr, in_cmpnetaddr, + in_putsufx, in_getsufx, + in_recycle_tsuffix, + tpip_mtu, in_pcbbind, in_pcbconnect, + in_pcbdisconnect, in_pcbdetach, + in_pcballoc, + tpip_output, tpip_output_dg, /* nl_ctloutput */ NULL, + (caddr_t) &tp_inpcb, + }, +#else + { 0 }, +#endif /* INET */ + /* ISO_CONS */ +#if defined(ISO) && defined(TPCONS) + { AF_ISO, iso_putnetaddr, iso_getnetaddr, iso_cmpnetaddr, + iso_putsufx, iso_getsufx, + iso_recycle_tsuffix, + tpclnp_mtu, iso_pcbbind, tpcons_pcbconnect, + iso_pcbdisconnect, iso_pcbdetach, + iso_pcballoc, + tpcons_output, tpcons_output, iso_nlctloutput, + (caddr_t) &tp_isopcb, + }, +#else + { 0 }, +#endif /* ISO_CONS */ + /* End of protosw marker */ + { 0 } +}; + +u_long tp_sendspace = 1024 * 4; +u_long tp_recvspace = 1024 * 4; + +/* + * NAME: tp_init() + * + * CALLED FROM: + * autoconf through the protosw structure + * + * FUNCTION: + * initialize tp machine + * + * RETURNS: Nada + * + * SIDE EFFECTS: + * + * NOTES: + */ +int +tp_init() +{ + static int init_done=0; + void tp_timerinit(); + + if (init_done++) + return 0; + + + /* FOR INET */ + tp_inpcb.inp_next = tp_inpcb.inp_prev 
= &tp_inpcb; + /* FOR ISO */ + tp_isopcb.isop_next = tp_isopcb.isop_prev = &tp_isopcb; + + tp_start_win = 2; + + tp_timerinit(); + bzero((caddr_t)&tp_stat, sizeof(struct tp_stat)); + return 0; +} + +/* + * NAME: tp_soisdisconnecting() + * + * CALLED FROM: + * tp.trans + * + * FUNCTION and ARGUMENTS: + * Set state of the socket (so) to reflect that fact that we're disconnectING + * + * RETURNS: Nada + * + * SIDE EFFECTS: + * + * NOTES: + * This differs from the regular soisdisconnecting() in that the latter + * also sets the SS_CANTRECVMORE and SS_CANTSENDMORE flags. + * We don't want to set those flags because those flags will cause + * a SIGPIPE to be delivered in sosend() and we don't like that. + * If anyone else is sleeping on this socket, wake 'em up. + */ +void +tp_soisdisconnecting(so) + register struct socket *so; +{ + soisdisconnecting(so); + so->so_state &= ~SS_CANTSENDMORE; + IFPERF(sototpcb(so)) + register struct tp_pcb *tpcb = sototpcb(so); + u_int fsufx, lsufx; + + bcopy ((caddr_t)tpcb->tp_fsuffix, (caddr_t)&fsufx, sizeof(u_int) ); + bcopy ((caddr_t)tpcb->tp_lsuffix, (caddr_t)&lsufx, sizeof(u_int) ); + + tpmeas(tpcb->tp_lref, TPtime_close, &time, fsufx, lsufx, tpcb->tp_fref); + tpcb->tp_perf_on = 0; /* turn perf off */ + ENDPERF +} + + +/* + * NAME: tp_soisdisconnected() + * + * CALLED FROM: + * tp.trans + * + * FUNCTION and ARGUMENTS: + * Set state of the socket (so) to reflect that fact that we're disconnectED + * Set the state of the reference structure to closed, and + * recycle the suffix. + * Start a reference timer. + * + * RETURNS: Nada + * + * SIDE EFFECTS: + * + * NOTES: + * This differs from the regular soisdisconnected() in that the latter + * also sets the SS_CANTRECVMORE and SS_CANTSENDMORE flags. + * We don't want to set those flags because those flags will cause + * a SIGPIPE to be delivered in sosend() and we don't like that. + * If anyone else is sleeping on this socket, wake 'em up. 
+ */
+void
+tp_soisdisconnected(tpcb)
+	register struct tp_pcb *tpcb;
+{
+	register struct socket *so = tpcb->tp_sock;
+
+	/*
+	 * NOTE(review): this calls soisdisconnecting(), not soisdisconnected(),
+	 * exactly as tp_soisdisconnecting() does -- confirm that is intended.
+	 */
+	soisdisconnecting(so);
+	so->so_state &= ~SS_CANTSENDMORE;
+	IFPERF(tpcb)
+		register struct tp_pcb *ttpcb = sototpcb(so);
+		u_int fsufx, lsufx;
+
+		/* CHOKE */
+		bcopy ((caddr_t)ttpcb->tp_fsuffix, (caddr_t)&fsufx, sizeof(u_int) );
+		bcopy ((caddr_t)ttpcb->tp_lsuffix, (caddr_t)&lsufx, sizeof(u_int) );
+
+		/*
+		 * NOTE(review): tp_soisdisconnecting() passes fsufx, lsufx by
+		 * value; here the addresses are passed, in the other order.
+		 */
+		tpmeas(ttpcb->tp_lref, TPtime_close,
+			&time, &lsufx, &fsufx, ttpcb->tp_fref);
+		tpcb->tp_perf_on = 0; /* turn perf off */
+	ENDPERF
+
+	/* freeze the reference, release the suffix, start the reference timer */
+	tpcb->tp_refstate = REF_FROZEN;
+	tp_recycle_tsuffix(tpcb);
+	tp_etimeout(tpcb, TM_reference, (int)tpcb->tp_refer_ticks);
+}
+
+/*
+ * NAME: tp_freeref()
+ *
+ * CALLED FROM:
+ *  tp.trans when the reference timer goes off, and
+ *  from tp_attach() and tp_detach() when a tpcb is partially set up but not
+ *  set up enough to have a ref timer set for it, and it's discarded
+ *  due to some sort of error or an early close()
+ *
+ * FUNCTION and ARGUMENTS:
+ *  Frees the reference numbered (n) for re-use.
+ *
+ * RETURNS: Nothing
+ *
+ * SIDE EFFECTS:
+ *  clears the slot's tpr_pcb, sets the pcb's tp_refstate to REF_FREE,
+ *  recomputes tp_refinfo.tpr_maxopen and decrements tpr_numopen.
+ *
+ * NOTES: better be called at clock priority !!!!!
+ */
+void
+tp_freeref(n)
+RefNum n;
+{
+	register struct tp_ref *r = tp_ref + n;
+	register struct tp_pcb *tpcb;
+
+	tpcb = r->tpr_pcb;
+	IFDEBUG(D_TIMER)
+		printf("tp_freeref called for ref %d pcb %x maxrefopen %d\n",
+		n, tpcb, tp_refinfo.tpr_maxopen);
+	ENDDEBUG
+	IFTRACE(D_TIMER)
+		tptrace(TPPTmisc, "tp_freeref ref maxrefopen pcb",
+		n, tp_refinfo.tpr_maxopen, tpcb, 0);
+	ENDTRACE
+	/* slot already free: nothing to release, counters stay as they are */
+	if (tpcb == 0)
+		return;
+	IFDEBUG(D_CONN)
+		printf("tp_freeref: CLEARING tpr_pcb 0x%x\n", tpcb);
+	ENDDEBUG
+	r->tpr_pcb = (struct tp_pcb *)0;
+	tpcb->tp_refstate = REF_FREE;
+
+	/* walk down from the old high-water mark to the highest slot in use */
+	for (r = tp_ref + tp_refinfo.tpr_maxopen; r > tp_ref; r--)
+		if (r->tpr_pcb)
+			break;
+	tp_refinfo.tpr_maxopen = r - tp_ref;
+	tp_refinfo.tpr_numopen--;
+
+	IFDEBUG(D_TIMER)
+		printf("tp_freeref ends w/ maxrefopen %d\n", tp_refinfo.tpr_maxopen);
+	ENDDEBUG
+}
+
+/*
+ * NAME: tp_getref()
+ *
+ * CALLED FROM:
+ *  tp_attach()
+ *
+ * FUNCTION and ARGUMENTS:
+ *  obtains the next free reference and allocates the appropriate
+ *  ref structure, links that structure to (tpcb)
+ *
+ * RETURN VALUE:
+ *  a reference number
+ *  or TP_ENOREF
+ *
+ * SIDE EFFECTS:
+ *  may replace the reference table with one twice the size; sets the
+ *  pcb's tp_refstate to REF_OPENING and updates tpr_numopen/tpr_maxopen.
+ *
+ * NOTES:
+ */
+u_long
+tp_getref(tpcb)
+	register struct tp_pcb *tpcb;
+{
+	register struct tp_ref *r, *rlim;
+	register int i;
+	caddr_t obase;
+	unsigned size;
+
+	/* count the new reference first; undone below if allocation fails */
+	if (++tp_refinfo.tpr_numopen < tp_refinfo.tpr_size)
+		for (r = tp_refinfo.tpr_base, rlim = r + tp_refinfo.tpr_size;
+								++r < rlim; ) /* tp_ref[0] is never used */
+			if (r->tpr_pcb == 0)
+				goto got_one;
+	/* else have to allocate more space */
+
+	/* double the table: copy the old half, zero the new half */
+	obase = (caddr_t)tp_refinfo.tpr_base;
+	size = tp_refinfo.tpr_size * sizeof(struct tp_ref);
+	r = (struct tp_ref *) malloc(size + size, M_PCB, M_NOWAIT);
+	if (r == 0)
+		return (--tp_refinfo.tpr_numopen, TP_ENOREF);
+	tp_refinfo.tpr_base = tp_ref = r;
+	tp_refinfo.tpr_size *= 2;
+	bcopy(obase, (caddr_t)r, size);
+	free(obase, M_PCB);
+	/* r now points at the first slot of the new, zeroed half */
+	r = (struct tp_ref *)(size + (caddr_t)r);
+	bzero((caddr_t)r, size);
+
+got_one:
+	r->tpr_pcb = tpcb;
+	tpcb->tp_refstate = REF_OPENING;
+	/* the reference number is the slot's index in the table */
+	i = r - tp_refinfo.tpr_base;
+	if (tp_refinfo.tpr_maxopen < i)
+		tp_refinfo.tpr_maxopen = i;
+	return (u_long)i;
+}
+
+/*
+ * NAME: tp_set_npcb()
+ *
+ * CALLED FROM:
+ *  tp_attach(), tp_route_to()
+ *
+ * FUNCTION and ARGUMENTS:
+ *  given a tpcb, allocate an appropriate lower-level npcb, freeing
+ *  any old ones that might need re-assigning.
+ *  Returns whatever the network layer's pcballoc routine returns.
+ *  On return tp_npcb holds what pcballoc left in so_pcb, and so_pcb
+ *  points at the tpcb.
+ */
+tp_set_npcb(tpcb)
+register struct tp_pcb *tpcb;
+{
+	register struct socket *so = tpcb->tp_sock;
+	int error;
+
+	if (tpcb->tp_nlproto && tpcb->tp_npcb) {
+		/*
+		 * SS_NOFDREF is cleared around the detach and so_state restored
+		 * afterwards -- presumably so the detach routine does not free
+		 * the socket; confirm against the nlp_pcbdetach implementations.
+		 */
+		short so_state = so->so_state;
+		so->so_state &= ~SS_NOFDREF;
+		tpcb->tp_nlproto->nlp_pcbdetach(tpcb->tp_npcb);
+		so->so_state = so_state;
+	}
+	tpcb->tp_nlproto = &nl_protosw[tpcb->tp_netservice];
+	/* xx_pcballoc sets so_pcb */
+	error = tpcb->tp_nlproto->nlp_pcballoc(so, tpcb->tp_nlproto->nlp_pcblist);
+	tpcb->tp_npcb = so->so_pcb;
+	so->so_pcb = (caddr_t)tpcb;
+	return (error);
+}
+/*
+ * NAME: tp_attach()
+ *
+ * CALLED FROM:
+ *  tp_usrreq, PRU_ATTACH
+ *
+ * FUNCTION and ARGUMENTS:
+ *  given a socket (so) and a protocol family (dom), allocate a tpcb
+ *  and ref structure, initialize everything in the structures that
+ *  needs to be initialized.
+ *
+ * RETURN VALUE:
+ *  0 ok
+ *  EINVAL if DEBUG(X) in is on and a disaster has occurred
+ *  ENOPROTOOPT if TP hasn't been configured or if the
+ *   socket wasn't created with tp as its protocol
+ *  EISCONN if this socket is already part of a connection
+ *  ETOOMANYREFS if ran out of tp reference numbers.
+ * E* whatever error is returned from soreserve() + * for from the network-layer pcb allocation routine + * + * SIDE EFFECTS: + * + * NOTES: + */ +tp_attach(so, protocol) + struct socket *so; + int protocol; +{ + register struct tp_pcb *tpcb; + int error = 0; + int dom = so->so_proto->pr_domain->dom_family; + u_long lref; + extern struct tp_conn_param tp_conn_param[]; + + IFDEBUG(D_CONN) + printf("tp_attach:dom 0x%x so 0x%x ", dom, so); + ENDDEBUG + IFTRACE(D_CONN) + tptrace(TPPTmisc, "tp_attach:dom so", dom, so, 0, 0); + ENDTRACE + + if (so->so_pcb != NULL) { + return EISCONN; /* socket already part of a connection*/ + } + + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) + error = soreserve(so, tp_sendspace, tp_recvspace); + /* later an ioctl will allow reallocation IF still in closed state */ + + if (error) + goto bad2; + + MALLOC(tpcb, struct tp_pcb *, sizeof(*tpcb), M_PCB, M_NOWAIT); + if (tpcb == NULL) { + error = ENOBUFS; + goto bad2; + } + bzero( (caddr_t)tpcb, sizeof (struct tp_pcb) ); + + if ( ((lref = tp_getref(tpcb)) & TP_ENOREF) != 0 ) { + error = ETOOMANYREFS; + goto bad3; + } + tpcb->tp_lref = lref; + tpcb->tp_sock = so; + tpcb->tp_domain = dom; + tpcb->tp_rhiwat = so->so_rcv.sb_hiwat; + /* tpcb->tp_proto = protocol; someday maybe? */ + if (protocol && protocoltp_netservice = ISO_CONS; + tpcb->tp_snduna = (SeqNum) -1;/* kludge so the pseudo-ack from the CR/CC + * will generate correct fake-ack values + */ + } else { + tpcb->tp_netservice = (dom== AF_INET)?IN_CLNS:ISO_CLNS; + /* the default */ + } + tpcb->_tp_param = tp_conn_param[tpcb->tp_netservice]; + + tpcb->tp_state = TP_CLOSED; + tpcb->tp_vers = TP_VERSION; + tpcb->tp_notdetached = 1; + + /* Spec says default is 128 octets, + * that is, if the tpdusize argument never appears, use 128. + * As the initiator, we will always "propose" the 2048 + * size, that is, we will put this argument in the CR + * always, but accept what the other side sends on the CC. 
+ * If the initiator sends us something larger on a CR, + * we'll respond w/ this. + * Our maximum is 4096. See tp_chksum.c comments. + */ + tpcb->tp_cong_win = + tpcb->tp_l_tpdusize = 1 << tpcb->tp_tpdusize; + + tpcb->tp_seqmask = TP_NML_FMT_MASK; + tpcb->tp_seqbit = TP_NML_FMT_BIT; + tpcb->tp_seqhalf = tpcb->tp_seqbit >> 1; + + /* attach to a network-layer protoswitch */ + if ( error = tp_set_npcb(tpcb)) + goto bad4; + ASSERT( tpcb->tp_nlproto->nlp_afamily == tpcb->tp_domain); + + /* nothing to do for iso case */ + if( dom == AF_INET ) + sotoinpcb(so)->inp_ppcb = (caddr_t) tpcb; + + return 0; + +bad4: + IFDEBUG(D_CONN) + printf("BAD4 in tp_attach, so 0x%x\n", so); + ENDDEBUG + tp_freeref(tpcb->tp_lref); + +bad3: + IFDEBUG(D_CONN) + printf("BAD3 in tp_attach, so 0x%x\n", so); + ENDDEBUG + + free((caddr_t)tpcb, M_PCB); /* never a cluster */ + +bad2: + IFDEBUG(D_CONN) + printf("BAD2 in tp_attach, so 0x%x\n", so); + ENDDEBUG + so->so_pcb = 0; + +/*bad:*/ + IFDEBUG(D_CONN) + printf("BAD in tp_attach, so 0x%x\n", so); + ENDDEBUG + return error; +} + +/* + * NAME: tp_detach() + * + * CALLED FROM: + * tp.trans, on behalf of a user close request + * and when the reference timer goes off + * (if the disconnect was initiated by the protocol entity + * rather than by the user) + * + * FUNCTION and ARGUMENTS: + * remove the tpcb structure from the list of active or + * partially active connections, recycle all the mbufs + * associated with the pcb, ref structure, sockbufs, etc. + * Only free the ref structure if you know that a ref timer + * wasn't set for this tpcb. 
+ * + * RETURNS: Nada + * + * SIDE EFFECTS: + * + * NOTES: + * tp_soisdisconnected() was already when this is called + */ +void +tp_detach(tpcb) + register struct tp_pcb *tpcb; +{ + void tp_freeref(), tp_rsyflush(); + register struct socket *so = tpcb->tp_sock; + + IFDEBUG(D_CONN) + printf("tp_detach(tpcb 0x%x, so 0x%x)\n", + tpcb,so); + ENDDEBUG + IFTRACE(D_CONN) + tptraceTPCB(TPPTmisc, "tp_detach tpcb so lsufx", + tpcb, so, *(u_short *)(tpcb->tp_lsuffix), 0); + ENDTRACE + + IFDEBUG(D_CONN) + printf("so_snd at 0x%x so_rcv at 0x%x\n", &so->so_snd, &so->so_rcv); + dump_mbuf(so->so_snd.sb_mb, "so_snd at detach "); + printf("about to call LL detach, nlproto 0x%x, nl_detach 0x%x\n", + tpcb->tp_nlproto, tpcb->tp_nlproto->nlp_pcbdetach); + ENDDEBUG + + if (tpcb->tp_Xsnd.sb_mb) { + printf("Unsent Xdata on detach; would panic"); + sbflush(&tpcb->tp_Xsnd); + } + if (tpcb->tp_ucddata) + m_freem(tpcb->tp_ucddata); + + IFDEBUG(D_CONN) + printf("reassembly info cnt %d rsyq 0x%x\n", + tpcb->tp_rsycnt, tpcb->tp_rsyq); + ENDDEBUG + if (tpcb->tp_rsyq) + tp_rsyflush(tpcb); + + if (tpcb->tp_next) { + remque(tpcb); + tpcb->tp_next = tpcb->tp_prev = 0; + } + tpcb->tp_notdetached = 0; + + IFDEBUG(D_CONN) + printf("calling (...nlproto->...)(0x%x, so 0x%x)\n", + tpcb->tp_npcb, so); + printf("so 0x%x so_head 0x%x, qlen %d q0len %d qlimit %d\n", + so, so->so_head, + so->so_q0len, so->so_qlen, so->so_qlimit); + ENDDEBUG + + (tpcb->tp_nlproto->nlp_pcbdetach)(tpcb->tp_npcb); + /* does an so->so_pcb = 0; sofree(so) */ + + IFDEBUG(D_CONN) + printf("after xxx_pcbdetach\n"); + ENDDEBUG + + if (tpcb->tp_state == TP_LISTENING) { + register struct tp_pcb **tt; + for (tt = &tp_listeners; *tt; tt = &((*tt)->tp_nextlisten)) + if (*tt == tpcb) + break; + if (*tt) + *tt = tpcb->tp_nextlisten; + else + printf("tp_detach from listen: should panic\n"); + } + if (tpcb->tp_refstate == REF_OPENING ) { + /* no connection existed here so no reference timer will be called */ + IFDEBUG(D_CONN) + printf("SETTING ref 
%d to REF_FREE\n", tpcb->tp_lref); + ENDDEBUG + + tp_freeref(tpcb->tp_lref); + } +#ifdef TP_PERF_MEAS + /* + * Get rid of the cluster mbuf allocated for performance measurements, if + * there is one. Note that tpcb->tp_perf_on says nothing about whether or + * not a cluster mbuf was allocated, so you have to check for a pointer + * to one (that is, we need the TP_PERF_MEASs around the following section + * of code, not the IFPERFs) + */ + if (tpcb->tp_p_mbuf) { + register struct mbuf *m = tpcb->tp_p_mbuf; + struct mbuf *n; + IFDEBUG(D_PERF_MEAS) + printf("freeing tp_p_meas 0x%x ", tpcb->tp_p_meas); + ENDDEBUG + do { + MFREE(m, n); + m = n; + } while (n); + tpcb->tp_p_meas = 0; + tpcb->tp_p_mbuf = 0; + } +#endif /* TP_PERF_MEAS */ + + IFDEBUG(D_CONN) + printf( "end of detach, NOT single, tpcb 0x%x\n", tpcb); + ENDDEBUG + /* free((caddr_t)tpcb, M_PCB); WHere to put this ? */ +} + +struct que { + struct tp_pcb *next; + struct tp_pcb *prev; +} tp_bound_pcbs = +{(struct tp_pcb *)&tp_bound_pcbs, (struct tp_pcb *)&tp_bound_pcbs}; + +u_short tp_unique; + +tp_tselinuse(tlen, tsel, siso, reuseaddr) +caddr_t tsel; +register struct sockaddr_iso *siso; +{ + struct tp_pcb *b = tp_bound_pcbs.next, *l = tp_listeners; + register struct tp_pcb *t; + + for (;;) { + if (b != (struct tp_pcb *)&tp_bound_pcbs) { + t = b; b = t->tp_next; + } else if (l) { + t = l; l = t->tp_nextlisten; + } else + break; + if (tlen == t->tp_lsuffixlen && bcmp(tsel, t->tp_lsuffix, tlen) == 0) { + if (t->tp_flags & TPF_GENERAL_ADDR) { + if (siso == 0 || reuseaddr == 0) + return 1; + } else if (siso) { + if (siso->siso_family == t->tp_domain && + t->tp_nlproto->nlp_cmpnetaddr(t->tp_npcb, siso, TP_LOCAL)) + return 1; + } else if (reuseaddr == 0) + return 1; + } + } + return 0; + +} + + +tp_pcbbind(tpcb, nam) +register struct tp_pcb *tpcb; +register struct mbuf *nam; +{ + register struct sockaddr_iso *siso = 0; + int tlen = 0, wrapped = 0; + caddr_t tsel; + u_short tutil; + + if (tpcb->tp_state != TP_CLOSED) + 
return (EINVAL); + if (nam) { + siso = mtod(nam, struct sockaddr_iso *); + switch (siso->siso_family) { + default: + return (EAFNOSUPPORT); +#ifdef ISO + case AF_ISO: + tlen = siso->siso_tlen; + tsel = TSEL(siso); + if (siso->siso_nlen == 0) + siso = 0; + break; +#endif +#ifdef INET + case AF_INET: + tsel = (caddr_t)&tutil; + if (tutil = ((struct sockaddr_in *)siso)->sin_port) { + tlen = 2; + } + if (((struct sockaddr_in *)siso)->sin_addr.s_addr == 0) + siso = 0; + } +#endif + } + if (tpcb->tp_lsuffixlen == 0) { + if (tlen) { + if (tp_tselinuse(tlen, tsel, siso, + tpcb->tp_sock->so_options & SO_REUSEADDR)) + return (EINVAL); + } else { + for (tsel = (caddr_t)&tutil, tlen = 2;;){ + if (tp_unique++ < ISO_PORT_RESERVED || + tp_unique > ISO_PORT_USERRESERVED) { + if (wrapped++) + return ESRCH; + tp_unique = ISO_PORT_RESERVED; + } + tutil = htons(tp_unique); + if (tp_tselinuse(tlen, tsel, siso, 0) == 0) + break; + } + if (siso) switch (siso->siso_family) { +#ifdef ISO + case AF_ISO: + bcopy(tsel, TSEL(siso), tlen); + siso->siso_tlen = tlen; + break; +#endif +#ifdef INET + case AF_INET: + ((struct sockaddr_in *)siso)->sin_port = tutil; +#endif + } + } + bcopy(tsel, tpcb->tp_lsuffix, (tpcb->tp_lsuffixlen = tlen)); + insque(tpcb, &tp_bound_pcbs); + } else { + if (tlen || siso == 0) + return (EINVAL); + } + if (siso == 0) { + tpcb->tp_flags |= TPF_GENERAL_ADDR; + return (0); + } + return tpcb->tp_nlproto->nlp_pcbbind(tpcb->tp_npcb, nam); +} diff --git a/sys/netiso/tp_pcb.h b/sys/netiso/tp_pcb.h new file mode 100644 index 00000000000..0353cb47b20 --- /dev/null +++ b/sys/netiso/tp_pcb.h @@ -0,0 +1,356 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tp_pcb.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * ARGO TP + * + * $Header: tp_pcb.h,v 5.2 88/11/18 17:09:32 nhall Exp $ + * $Source: /usr/argo/sys/netiso/RCS/tp_pcb.h,v $ + * + * + * This file defines the transport protocol control block (tpcb). + * and a bunch of #define values that are used in the tpcb. + */ + +#ifndef __TP_PCB__ +#define __TP_PCB__ + +#include +#include +#include +#ifndef sblock +#include +#endif /* sblock */ + +/* NOTE: the code depends on REF_CLOSED > REF_OPEN > the rest, and + * on REF_FREE being zero + * + * Possible improvement: + * think about merging the tp_ref w/ the tpcb and doing a search + * through the tpcb list, from tpb. 
This would slow down lookup + * during data transfer + * It would be a little nicer also to have something based on the + * clock (like top n bits of the reference is part of the clock, to + * minimize the likelihood of reuse after a crash) + * also, need to keep the timer servicing part to a minimum (although + * the cost of this is probably independent of whether the timers are + * in the pcb or in an array.. + * Last, would have to make the number of timers a function of the amount of + * mbufs available, plus some for the frozen references. + * + * Possible improvement: + * Might not need the ref_state stuff either... + * REF_FREE could correspond to tp_state == CLOSED or nonexistend tpcb, + * REF_OPEN to tp_state anywhere from AK_WAIT or CR_SENT to CLOSING + * REF_OPENING could correspond to LISTENING, because that's the + * way it's used, not because the correspondence is exact. + * REF_CLOSED could correspond to REFWAIT + */ +#define REF_FROZEN 3 /* has ref timer only */ +#define REF_OPEN 2 /* has timers, possibly active */ +#define REF_OPENING 1 /* in use (has a pcb) but no timers */ +#define REF_FREE 0 /* free to reallocate */ + +#define TM_NTIMERS 6 + +struct tp_ref { + struct tp_pcb *tpr_pcb; /* back ptr to PCB */ +}; + +/* PER system stuff (one static structure instead of a bunch of names) */ +struct tp_refinfo { + struct tp_ref *tpr_base; + int tpr_size; + int tpr_maxopen; + int tpr_numopen; +}; + +struct nl_protosw { + int nlp_afamily; /* address family */ + int (*nlp_putnetaddr)(); /* puts addresses in nl pcb */ + int (*nlp_getnetaddr)(); /* gets addresses from nl pcb */ + int (*nlp_cmpnetaddr)(); /* compares address in pcb with sockaddr */ + int (*nlp_putsufx)(); /* puts transport suffixes in nl pcb */ + int (*nlp_getsufx)(); /* gets transport suffixes from nl pcb */ + int (*nlp_recycle_suffix)();/* clears suffix from nl pcb */ + int (*nlp_mtu)(); /* figures out mtu based on nl used */ + int (*nlp_pcbbind)(); /* bind to pcb for net level */ + int 
(*nlp_pcbconn)(); /* connect for net level */ + int (*nlp_pcbdisc)(); /* disconnect net level */ + int (*nlp_pcbdetach)(); /* detach net level pcb */ + int (*nlp_pcballoc)(); /* allocate a net level pcb */ + int (*nlp_output)(); /* prepare a packet to give to nl */ + int (*nlp_dgoutput)(); /* prepare a packet to give to nl */ + int (*nlp_ctloutput)(); /* hook for network set/get options */ + caddr_t nlp_pcblist; /* list of xx_pcb's for connections */ +}; + + +struct tp_pcb { + struct tp_pcb *tp_next; + struct tp_pcb *tp_prev; + struct tp_pcb *tp_nextlisten; /* chain all listeners */ + struct socket *tp_sock; /* back ptr */ + u_short tp_state; /* state of fsm */ + short tp_retrans; /* # times can still retrans */ + caddr_t tp_npcb; /* to lower layer pcb */ + struct nl_protosw *tp_nlproto; /* lower-layer dependent routines */ + struct rtentry **tp_routep; /* obtain mtu; inside npcb */ + + + RefNum tp_lref; /* local reference */ + RefNum tp_fref; /* foreign reference */ + + u_int tp_seqmask; /* mask for seq space */ + u_int tp_seqbit; /* bit for seq number wraparound */ + u_int tp_seqhalf; /* half the seq space */ + + struct mbuf *tp_ucddata; /* user connect/disconnect data */ + + /* credit & sequencing info for SENDING */ + u_short tp_fcredit; /* current remote credit in # packets */ + u_short tp_maxfcredit; /* max remote credit in # packets */ + u_short tp_dupacks; /* intuit packet loss before rxt timo */ + u_long tp_cong_win; /* congestion window in bytes. + * see profuse comments in TCP code + */ + u_long tp_ssthresh; /* cong_win threshold for slow start + * exponential to linear switch + */ + SeqNum tp_snduna; /* seq # of lowest unacked DT */ + SeqNum tp_sndnew; /* seq # of lowest unsent DT */ + SeqNum tp_sndnum; /* next seq # to be assigned */ + SeqNum tp_sndnxt; /* what to do next; poss. rxt */ + struct mbuf *tp_sndnxt_m; /* packet corres. to sndnxt*/ + int tp_Nwindow; /* for perf. 
measurement */ + + /* credit & sequencing info for RECEIVING */ + SeqNum tp_rcvnxt; /* next DT seq # expect to recv */ + SeqNum tp_sent_lcdt; /* cdt according to last ack sent */ + SeqNum tp_sent_uwe; /* uwe according to last ack sent */ + SeqNum tp_sent_rcvnxt; /* rcvnxt according to last ack sent + * needed for perf measurements only + */ + u_short tp_lcredit; /* current local credit in # packets */ + u_short tp_maxlcredit; /* needed for reassembly queue */ + struct mbuf **tp_rsyq; /* unacked stuff recvd out of order */ + int tp_rsycnt; /* number of packets "" "" "" "" */ + u_long tp_rhiwat; /* remember original RCVBUF size */ + + /* receiver congestion state stuff ... */ + u_int tp_win_recv; + + /* receive window as a scaled int (8 bit fraction part) */ + + struct cong_sample { + ushort cs_size; /* current window size */ + ushort cs_received; /* PDUs received in this sample */ + ushort cs_ce_set; /* PDUs received in this sample with CE bit set */ + } tp_cong_sample; + + + /* parameters per-connection controllable by user */ + struct tp_conn_param _tp_param; + +#define tp_Nretrans _tp_param.p_Nretrans +#define tp_dr_ticks _tp_param.p_dr_ticks +#define tp_cc_ticks _tp_param.p_cc_ticks +#define tp_dt_ticks _tp_param.p_dt_ticks +#define tp_xpd_ticks _tp_param.p_x_ticks +#define tp_cr_ticks _tp_param.p_cr_ticks +#define tp_keepalive_ticks _tp_param.p_keepalive_ticks +#define tp_sendack_ticks _tp_param.p_sendack_ticks +#define tp_refer_ticks _tp_param.p_ref_ticks +#define tp_inact_ticks _tp_param.p_inact_ticks +#define tp_xtd_format _tp_param.p_xtd_format +#define tp_xpd_service _tp_param.p_xpd_service +#define tp_ack_strat _tp_param.p_ack_strat +#define tp_rx_strat _tp_param.p_rx_strat +#define tp_use_checksum _tp_param.p_use_checksum +#define tp_use_efc _tp_param.p_use_efc +#define tp_use_nxpd _tp_param.p_use_nxpd +#define tp_use_rcc _tp_param.p_use_rcc +#define tp_tpdusize _tp_param.p_tpdusize +#define tp_class _tp_param.p_class +#define tp_winsize 
_tp_param.p_winsize +#define tp_no_disc_indications _tp_param.p_no_disc_indications +#define tp_dont_change_params _tp_param.p_dont_change_params +#define tp_netservice _tp_param.p_netservice +#define tp_version _tp_param.p_version +#define tp_ptpdusize _tp_param.p_ptpdusize + + int tp_l_tpdusize; + /* whereas tp_tpdusize is log2(the negotiated max size) + * l_tpdusize is the size we'll use when sending, in # chars + */ + + int tp_rtv; /* max round-trip time variance */ + int tp_rtt; /* smoothed round-trip time */ + SeqNum tp_rttseq; /* packet being timed */ + int tp_rttemit; /* when emitted, in ticks */ + int tp_idle; /* last activity, in ticks */ + short tp_rxtcur; /* current retransmit value */ + short tp_rxtshift; /* log(2) of rexmt exp. backoff */ + u_char tp_cebit_off; /* real DEC bit algorithms not in use */ + u_char tp_oktonagle; /* Last unsent pckt may be append to */ + u_char tp_flags; /* values: */ +#define TPF_NLQOS_PDN TPFLAG_NLQOS_PDN +#define TPF_PEER_ON_SAMENET TPFLAG_PEER_ON_SAMENET +#define TPF_GENERAL_ADDR TPFLAG_GENERAL_ADDR +#define TPF_DELACK 0x8 +#define TPF_ACKNOW 0x10 + +#define PEER_IS_LOCAL(t) (((t)->tp_flags & TPF_PEER_ON_SAME_NET) != 0) +#define USES_PDN(t) (((t)->tp_flags & TPF_NLQOS_PDN) != 0) + + + unsigned + tp_sendfcc:1, /* shall next ack include FCC parameter? */ + tp_trace:1, /* is this pcb being traced? (not used yet) */ + tp_perf_on:1, /* 0/1 -> performance measuring on */ + tp_reneged:1, /* have we reneged on cdt since last ack? */ + tp_decbit:3, /* dec bit was set, we're in reneg mode */ + tp_notdetached:1; /* Call tp_detach before freeing XXXXXXX */ + +#ifdef TP_PERF_MEAS + /* performance stats - see tp_stat.h */ + struct tp_pmeas *tp_p_meas; + struct mbuf *tp_p_mbuf; +#endif /* TP_PERF_MEAS */ + + /* addressing */ + u_short tp_domain; /* domain (INET, ISO) */ + /* for compatibility with the *old* way and with INET, be sure that + * that lsuffix and fsuffix are aligned to a short addr. 
+ * having them follow the u_short *suffixlen should suffice (choke) + */ + u_short tp_fsuffixlen; /* foreign suffix */ + char tp_fsuffix[MAX_TSAP_SEL_LEN]; + u_short tp_lsuffixlen; /* local suffix */ + char tp_lsuffix[MAX_TSAP_SEL_LEN]; +#define SHORT_LSUFXP(tpcb) ((short *)((tpcb)->tp_lsuffix)) +#define SHORT_FSUFXP(tpcb) ((short *)((tpcb)->tp_fsuffix)) + + /* Timer stuff */ + u_char tp_vers; /* protocol version */ + u_char tp_peer_acktime; /* used for DT retrans time */ + u_char tp_refstate; /* values REF_FROZEN, etc. above */ + struct tp_pcb *tp_fasttimeo; /* limit pcbs to examine */ + u_int tp_timer[TM_NTIMERS]; /* C timers */ + + struct sockbuf tp_Xsnd; /* for expedited data */ +/* struct sockbuf tp_Xrcv; /* for expedited data */ +#define tp_Xrcv tp_sock->so_rcv + SeqNum tp_Xsndnxt; /* next XPD seq # to send */ + SeqNum tp_Xuna; /* seq # of unacked XPD */ + SeqNum tp_Xrcvnxt; /* next XPD seq # expect to recv */ + + /* AK subsequencing */ + u_short tp_s_subseq; /* next subseq to send */ + u_short tp_r_subseq; /* highest recv subseq */ + +}; + +u_int tp_start_win; + +#define ROUND(scaled_int) (((scaled_int) >> 8) + (((scaled_int) & 0x80) ? 
/*
 * Receiver congestion-window sampling.
 *
 * Both macros evaluate (pcb) several times, so pass a side-effect-free
 * pointer expression.  Each is wrapped in do { } while (0) so that it
 * expands to exactly one statement and is safe as the unbraced body of an
 * if/else or a loop; invoke them with a trailing semicolon.
 * They rely on max() and ROUND() being in scope at the point of use.
 */

/*
 * Start a new sample: zero the count of PDUs received and of PDUs received
 * with the CE bit set, and set the sample size to twice the current local
 * credit (treating a credit of 0 as 1).
 */
#define CONG_INIT_SAMPLE(pcb) \
	do { \
		(pcb)->tp_cong_sample.cs_received = \
			(pcb)->tp_cong_sample.cs_ce_set = 0; \
		(pcb)->tp_cong_sample.cs_size = max((pcb)->tp_lcredit, 1) << 1; \
	} while (0)

/*
 * Account for one received PDU; (ce_bit) is nonzero if it carried the CE bit.
 * Once cs_size PDUs have been seen the sample is closed:
 *   - if at least half of them had CE set, the scaled receive window
 *     tp_win_recv is multiplied by .875 (but never drops below 1 << 8,
 *     i.e. one packet);
 *   - otherwise it grows by 1 << 8 (one packet).
 * The local credit becomes the rounded window and a new sample is started.
 */
#define CONG_UPDATE_SAMPLE(pcb, ce_bit) \
	do { \
		(pcb)->tp_cong_sample.cs_received++; \
		if (ce_bit) { \
			(pcb)->tp_cong_sample.cs_ce_set++; \
		} \
		if ((pcb)->tp_cong_sample.cs_size <= \
		    (pcb)->tp_cong_sample.cs_received) { \
			if (((pcb)->tp_cong_sample.cs_ce_set << 1) >= \
			    (pcb)->tp_cong_sample.cs_size) { \
				/* multiply by .875 */ \
				(pcb)->tp_win_recv -= (pcb)->tp_win_recv >> 3; \
				(pcb)->tp_win_recv = max(1 << 8, (pcb)->tp_win_recv); \
			} else { \
				/* add one to the scaled int */ \
				(pcb)->tp_win_recv += (1 << 8); \
			} \
			(pcb)->tp_lcredit = ROUND((pcb)->tp_win_recv); \
			CONG_INIT_SAMPLE(pcb); \
		} \
	} while (0)

#ifdef KERNEL
extern struct tp_refinfo	tp_refinfo;
extern struct timeval		time;
extern struct tp_ref		*tp_ref;
extern struct tp_param		tp_param;
extern struct nl_protosw	nl_protosw[];
extern struct tp_pcb		*tp_listeners;
extern struct tp_pcb		*tp_ftimeolist;
#endif

/*
 * Conversions between a socket and its transport pcb.
 * NOTE(review): sototpref() and tpcbtoref() dereference a tp_ref member of
 * struct tp_pcb; confirm that member still exists before using them.
 */
#define	sototpcb(so)	((struct tp_pcb *)((so)->so_pcb))
#define	sototpref(so)	((sototpcb(so)->tp_ref))
#define	tpcbtoso(tp)	((struct socket *)((tp)->tp_sock))
#define	tpcbtoref(tp)	((struct tp_ref *)((tp)->tp_ref))
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
/*
 * These macros perform sequence number arithmetic modulo (2**7 or 2**31).
 * The relevant fields in the tpcb are:
 *  	tp_seqmask : the mask of bits that define the sequence space.
 *  	tp_seqbit  : 1 + tp_seqmask
 *  	tp_seqhalf : tp_seqbit / 2 or half the sequence space (rounded up)
 * Not exactly fast, but at least it's maintainable.
 *
 * Every macro argument is evaluated more than once by at least one of
 * these macros, so pass expressions without side effects.  All arguments
 * are parenthesized in the expansions, so compound expressions such as
 * (a + b) or (c ? x : y) may be passed for seq/operand/amt.
 */

#ifndef __TP_SEQ__
#define __TP_SEQ__

/* Reduce x into the connection's sequence space. */
#define SEQ(tpcb,x) \
	((x) & (tpcb)->tp_seqmask)

/*
 * Circular comparisons: "seq OP operand" where the two values are taken to
 * be less than half the sequence space apart.
 */
#define SEQ_GT(tpcb, seq, operand ) \
( ((int)((seq)-(operand)) > 0)\
? ((int)((seq)-(operand)) < (int)(tpcb)->tp_seqhalf)\
: !(-((int)(seq)-(operand)) < (int)(tpcb)->tp_seqhalf))

#define SEQ_GEQ(tpcb, seq, operand ) \
( ((int)((seq)-(operand)) >= 0)\
? ((int)((seq)-(operand)) < (int)(tpcb)->tp_seqhalf)\
: !((-((int)(seq)-(operand))) < (int)(tpcb)->tp_seqhalf))

#define SEQ_LEQ(tpcb, seq, operand ) \
( ((int)((seq)-(operand)) <= 0)\
? ((-(int)((seq)-(operand))) < (int)(tpcb)->tp_seqhalf)\
: !(((int)(seq)-(operand)) < (int)(tpcb)->tp_seqhalf))

#define SEQ_LT(tpcb, seq, operand ) \
( ((int)((seq)-(operand)) < 0)\
? ((-(int)((seq)-(operand))) < (int)(tpcb)->tp_seqhalf)\
: !(((int)(seq)-(operand)) < (int)(tpcb)->tp_seqhalf))

/* Circular minimum / maximum of a and b. */
#define SEQ_MIN(tpcb, a, b) ( SEQ_GT(tpcb, a, b) ? (b) : (a))

#define SEQ_MAX(tpcb, a, b) ( SEQ_GT(tpcb, a, b) ? (a) : (b))

/* Increment / decrement the lvalue Seq in place, wrapping at the mask. */
#define SEQ_INC(tpcb, Seq) ((++(Seq)), ((Seq) &= (tpcb)->tp_seqmask))

#define SEQ_DEC(tpcb, Seq)\
	((Seq) = (((Seq)+(unsigned)((int)(tpcb)->tp_seqbit - 1))&(tpcb)->tp_seqmask))

/* (amt) had better be less than the seq bit ! */

/* Value of Seq -/+ amt in the sequence space; Seq itself is not modified. */
#define SEQ_SUB(tpcb, Seq, amt)\
	(((Seq) + (unsigned)((int)(tpcb)->tp_seqbit - (amt))) & (tpcb)->tp_seqmask)
#define SEQ_ADD(tpcb, Seq, amt)	(((Seq) + (unsigned)(amt)) & (tpcb)->tp_seqmask)

/* Receive window is [lwe, uwe); send window is (lwe, uwe]. */
#define IN_RWINDOW(tpcb, seq, lwe, uwe)\
	( SEQ_GEQ(tpcb, seq, lwe) && SEQ_LT(tpcb, seq, uwe) )

#define IN_SWINDOW(tpcb, seq, lwe, uwe)\
	( SEQ_GT(tpcb, seq, lwe) && SEQ_LEQ(tpcb, seq, uwe) )

#endif /* __TP_SEQ__ */
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tp_stat.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * ARGO TP + * + * $Header: tp_stat.h,v 5.4 88/11/18 17:28:38 nhall Exp $ + * $Source: /usr/argo/sys/netiso/RCS/tp_stat.h,v $ + * + * Here are the data structures in which the global + * statistics(counters) are gathered. 
+ */ + +#ifndef __TP_STAT__ +#define __TP_STAT__ + +struct tp_stat { + u_long ts_param_ignored; + u_long ts_unused3; + u_long ts_bad_csum; + + u_long ts_inv_length; + u_long ts_inv_pcode; + u_long ts_inv_dutype; + u_long ts_negotfailed; + u_long ts_inv_dref; + u_long ts_inv_pval; + u_long ts_inv_sufx; + u_long ts_inv_aclass; + + u_long ts_xtd_fmt; + u_long ts_use_txpd; + u_long ts_csum_off; + u_long ts_send_drop; + u_long ts_recv_drop; + + u_long ts_xpd_intheway;/* xpd mark caused data flow to stop */ + u_long ts_xpdmark_del; /* xpd markers thrown away */ + u_long ts_dt_ooo; /* dt tpdus received out of order */ + u_long ts_dt_niw; /* dt tpdus received & not in window */ + u_long ts_xpd_niw; /* xpd tpdus received & not in window */ + u_long ts_xpd_dup; + u_long ts_dt_dup; /* dt tpdus received & are duplicates */ + + u_long ts_zfcdt; /* # times f credit went down to 0 */ + u_long ts_lcdt_reduced; /* + # times local cdt reduced on an acknowledgement. + */ + + u_long ts_pkt_rcvd; /* from ip */ + u_long ts_tpdu_rcvd; /* accepted as a TPDU in tp_input */ + u_long ts_tpdu_sent; + u_long ts_unused2; + + u_long ts_retrans_cr; + u_long ts_retrans_cc; + u_long ts_retrans_dr; + u_long ts_retrans_dt; + u_long ts_retrans_xpd; + u_long ts_conn_gaveup; + + u_long ts_ER_sent; + u_long ts_DT_sent; + u_long ts_XPD_sent; + u_long ts_AK_sent; + u_long ts_XAK_sent; + u_long ts_DR_sent; + u_long ts_DC_sent; + u_long ts_CR_sent; + u_long ts_CC_sent; + + u_long ts_ER_rcvd; + u_long ts_DT_rcvd; + u_long ts_XPD_rcvd; + u_long ts_AK_rcvd; + u_long ts_XAK_rcvd; + u_long ts_DR_rcvd; + u_long ts_DC_rcvd; + u_long ts_CR_rcvd; + u_long ts_CC_rcvd; + + u_long ts_Eticks; + u_long ts_Eexpired; + u_long ts_Eset; + u_long ts_Ecan_act; + u_long ts_Cticks; + u_long ts_Cexpired; + u_long ts_Cset; + u_long ts_Ccan_act; + u_long ts_Ccan_inact; + u_long ts_Fdelack; + u_long ts_Fpruned; + + u_long ts_concat_rcvd; + + u_long ts_zdebug; /* zero dref to test timeout on conn estab tp_input.c */ + u_long 
ts_ydebug; /* throw away pseudo-random pkts tp_input.c */ + u_long ts_unused5; + u_long ts_unused; /* kludged concat to test separation tp_emit.c */ + u_long ts_vdebug; /* kludge to test input size checking tp_emit.c */ + u_long ts_unused4; + u_long ts_ldebug; /* faked a renegging of credit */ + + u_long ts_mb_small; + u_long ts_mb_cluster; + u_long ts_mb_len_distr[17]; + + u_long ts_eot_input; + u_long ts_eot_user; + u_long ts_EOT_sent; + u_long ts_tp0_conn; + u_long ts_tp4_conn; + u_long ts_quench; + u_long ts_rcvdecbit; + +#define NRTT_CATEGORIES 4 + /* The 4 categories are: + * 0 --> tp_flags: ~TPF_PEER_ON_SAMENET | TPF_NL_PDN + * 1 --> tp_flags: ~TPF_PEER_ON_SAMENET | ~TPF_NL_PDN + * 2 --> tp_flags: TPF_PEER_ON_SAMENET | ~TPF_NL_PDN + * 3 --> tp_flags: TPF_PEER_ON_SAMENET | TPF_NL_PDN + */ + int ts_rtt[NRTT_CATEGORIES]; + int ts_rtv[NRTT_CATEGORIES]; + + u_long ts_ackreason[_ACK_NUM_REASONS_]; + /* ACK_DONT 0 / ACK_STRAT_EACH 0x1 / ACK_STRAT_FULLWIN 0x4 + * ACK_DUP 0x8 / ACK_EOT 0x10 / ACK_REORDER 0x20 + * ACK_USRRCV ** + * ACK_FCC ** + */ +} tp_stat ; +#define TP_PM_MAX 0xa /* 10 decimal */ + +#define IncStat(x) tp_stat./**/x/**/++ + +#ifdef TP_PERF_MEAS + +#define PStat(Tpcb, X) (Tpcb)->tp_p_meas->/**/X/**/ +#define IncPStat(Tpcb, X) if((Tpcb)->tp_perf_on) (Tpcb)->tp_p_meas->/**/X/**/++ + +/* BEWARE OF MACROS like this ^^^ must be sure it's surrounded by {} if + * it's used in an if-else statement. + */ + + +/* for perf measurement stuff: maximum window size it can handle */ + +struct tp_pmeas { + /* the first few are distributions as a fn of window size + * only keep enough space for normal format plus 1 slot for + * extended format, in case any windows larger than 15 are used + */ + + /* + * tps_npdusent: for each call to tp_sbsend, we inc the + * element representing the number of pdus sent in this call + */ + int tps_win_lim_by_cdt[TP_PM_MAX+1]; + int tps_win_lim_by_data[TP_PM_MAX+1]; + /* + * tps_sendtime: Each call to tp_sbsend() is timed. 
For + * Each window size, we keep the running average of the time + * taken by tp_sbsend() for each window size. + */ + int tps_sendtime[TP_PM_MAX+1]; + /* + * n_TMsendack: # times ack sent because timer went off + * n_ack_cuz_eot: # times ack sent due to EOTSDU on incoming packet + * n_ack_cuz_dup: # times ack sent for receiving a duplicate pkt. + * n_ack_cuz_fullwin: # times ack sent for receiving the full window. + * n_ack_cuz_doack: # times ack sent for having just reordered data. + */ + int tps_n_TMsendack; + int tps_n_ack_cuz_eot; + int tps_n_ack_cuz_fullwin; + int tps_n_ack_cuz_reorder; + int tps_n_ack_cuz_dup; + int tps_n_ack_cuz_strat; + /* + * when we send an ack: how much less than the "expected" window + * did we actually ack. For example: if we last sent a credit + * of 10, and we're acking now for whatever reason, and have + * only received 6 since our last credit advertisement, we'll + * keep the difference, 4, in this variable. + */ + int tps_ack_early[TP_PM_MAX+1]; + /* + * when we ack, for the # pkts we actually acked w/ this ack, + * how much cdt are we advertising? 
+ * [ size of window acknowledged ] [ cdt we're giving ] + */ + int tps_cdt_acked[TP_PM_MAX+1][TP_PM_MAX+1]; + + int tps_AK_sent; + int tps_XAK_sent; + int tps_DT_sent; + int tps_XPD_sent; + int tps_AK_rcvd; + int tps_XAK_rcvd; + int tps_DT_rcvd; + int tps_XPD_rcvd; + + int Nb_from_sess; + int Nb_to_sess; + int Nb_to_ll; + int Nb_from_ll; +}; + +#define IFPERF(tpcb) if (tpcb->tp_perf_on && tpcb->tp_p_meas) { +#define ENDPERF } + +#else + +int PStat_Junk; +#define PStat(tpcb, x) PStat_Junk +#define IncPStat(tpcb, x) /* no-op */ +#define tpmeas(a,b,c,d,e,f) 0 + +#define IFPERF(x) if (0) { +#define ENDPERF } + +#endif /* TP_PERF_MEAS */ + +#endif /* __TP_STAT__ */ diff --git a/sys/netiso/tp_states.h b/sys/netiso/tp_states.h new file mode 100644 index 00000000000..ac6213a64d7 --- /dev/null +++ b/sys/netiso/tp_states.h @@ -0,0 +1,13 @@ +/* $Header$ */ +/* $Source$ */ +#define ST_ERROR 0x0 +#define TP_CLOSED 0x1 +#define TP_CRSENT 0x2 +#define TP_AKWAIT 0x3 +#define TP_OPEN 0x4 +#define TP_CLOSING 0x5 +#define TP_REFWAIT 0x6 +#define TP_LISTENING 0x7 +#define TP_CONFIRMING 0x8 + +#define tp_NSTATES 0x9 diff --git a/sys/netiso/tp_states.init b/sys/netiso/tp_states.init new file mode 100644 index 00000000000..89e53453866 --- /dev/null +++ b/sys/netiso/tp_states.init @@ -0,0 +1,75 @@ +/* $Header$ */ +/* $Source$ */ +{0x3,0x0}, +{0x6,0x1}, +{0x6,0x2}, +{0x6,0x0}, +{0x2,0x3}, +{0x2,0x0}, +{0x1,0x0}, +{0x5,0x0}, +{0x4,0x0}, +{0x7,0x0}, +{0x7,0x0}, +{0x1,0x4}, +{0x8,0x5}, +{0x8,0x6}, +{0x4,0x7}, +{0x3,0x8}, +{0x1,0x9}, +{0x2,0xa}, +{0x6,0xb}, +{0x1,0xc}, +{0x6,0xd}, +{0x6,0xe}, +{0x6,0xf}, +{0x6,0x10}, +{0x1,0x11}, +{0x6,0x12}, +{0x5,0x13}, +{0x4,0x14}, +{0x4,0x15}, +{0x2,0x16}, +{0x6,0x17}, +{0x3,0x18}, +{0x4,0x19}, +{0x4,0x1a}, +{0x4,0x1b}, +{0x3,0x1c}, +{0x4,0x1c}, +{0x4,0x1d}, +{0x4,0x1e}, +{0x4,0x1f}, +{0x4,0x20}, +{0x3,0x20}, +{0x6,0x21}, +{0x5,0x22}, +{0x6,0x23}, +{0x5,0x24}, +{0x3,0x25}, +{0x5,0x26}, +{0x5,0x27}, +{0x4,0x28}, +{0x4,0x29}, +{0x5,0x2a}, +{0x6,0x2b}, 
+{0x1,0x2c}, +{0x4,0x2d}, +{0x4,0x2e}, +{0x4,0x2f}, +{0x4,0x30}, +{0x4,0x31}, +{0x4,0x32}, +{0x4,0x33}, +{0x4,0x34}, +{0x4,0x35}, +{0x4,0x36}, +{0x6,0x37}, +{0x6,0x38}, +{0x7,0x0}, +{0x5,0x0}, +{0x3,0x0}, +{0x2,0x0}, +{0x4,0x0}, +{0x6,0x0}, +{0x1,0x0}, diff --git a/sys/netiso/tp_subr.c b/sys/netiso/tp_subr.c new file mode 100644 index 00000000000..1259ee41253 --- /dev/null +++ b/sys/netiso/tp_subr.c @@ -0,0 +1,947 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tp_subr.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * ARGO TP + * + * $Header: tp_subr.c,v 5.3 88/11/18 17:28:43 nhall Exp $ + * $Source: /usr/argo/sys/netiso/RCS/tp_subr.c,v $ + * + * The main work of data transfer is done here. + * These routines are called from tp.trans. 
+ * They include the routines that check the validity of acks and Xacks, + * (tp_goodack() and tp_goodXack() ) + * take packets from socket buffers and send them (tp_send()), + * drop the data from the socket buffers (tp_sbdrop()), + * and put incoming packet data into socket buffers (tp_stash()). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int tp_emit(), tp_sbdrop(); +int tprexmtthresh = 3; +extern int ticks; +void tp_send(); + +/* + * CALLED FROM: + * tp.trans, when an XAK arrives + * FUNCTION and ARGUMENTS: + * Determines if the sequence number (seq) from the XAK + * acks anything new. If so, drop the appropriate tpdu + * from the XPD send queue. + * RETURN VALUE: + * Returns 1 if it did this, 0 if the ack caused no action. + */ +int +tp_goodXack(tpcb, seq) + struct tp_pcb *tpcb; + SeqNum seq; +{ + + IFTRACE(D_XPD) + tptraceTPCB(TPPTgotXack, + seq, tpcb->tp_Xuna, tpcb->tp_Xsndnxt, tpcb->tp_sndnew, + tpcb->tp_snduna); + ENDTRACE + + if ( seq == tpcb->tp_Xuna ) { + tpcb->tp_Xuna = tpcb->tp_Xsndnxt; + + /* DROP 1 packet from the Xsnd socket buf - just so happens + * that only one packet can be there at any time + * so drop the whole thing. 
If you allow > 1 packet + * the socket buffer, then you'll have to keep + * track of how many characters went w/ each XPD tpdu, so this + * will get messier + */ + IFDEBUG(D_XPD) + dump_mbuf(tpcb->tp_Xsnd.sb_mb, + "tp_goodXack Xsnd before sbdrop"); + ENDDEBUG + + IFTRACE(D_XPD) + tptraceTPCB(TPPTmisc, + "goodXack: dropping cc ", + (int)(tpcb->tp_Xsnd.sb_cc), + 0,0,0); + ENDTRACE + sbdroprecord(&tpcb->tp_Xsnd); + return 1; + } + return 0; +} + +/* + * CALLED FROM: + * tp_good_ack() + * FUNCTION and ARGUMENTS: + * updates + * smoothed average round trip time (*rtt) + * roundtrip time variance (*rtv) - actually deviation, not variance + * given the new value (diff) + * RETURN VALUE: + * void + */ + +void +tp_rtt_rtv(tpcb) +register struct tp_pcb *tpcb; +{ + int old = tpcb->tp_rtt; + int delta, elapsed = ticks - tpcb->tp_rttemit; + + if (tpcb->tp_rtt != 0) { + /* + * rtt is the smoothed round trip time in machine clock ticks (hz). + * It is stored as a fixed point number, unscaled (unlike the tcp + * srtt). The rationale here is that it is only significant to the + * nearest unit of slowtimo, which is at least 8 machine clock ticks + * so there is no need to scale. The smoothing is done according + * to the same formula as TCP (rtt = rtt*7/8 + measured_rtt/8). + */ + delta = elapsed - tpcb->tp_rtt; + if ((tpcb->tp_rtt += (delta >> TP_RTT_ALPHA)) <= 0) + tpcb->tp_rtt = 1; + /* + * rtv is a smoothed accumulated mean difference, unscaled + * for reasons expressed above. + * It is smoothed with an alpha of .75, and the round trip timer + * will be set to rtt + 4*rtv, also as TCP does. + */ + if (delta < 0) + delta = -delta; + if ((tpcb->tp_rtv += ((delta - tpcb->tp_rtv) >> TP_RTV_ALPHA)) <= 0) + tpcb->tp_rtv = 1; + } else { + /* + * No rtt measurement yet - use the unsmoothed rtt. 
+ * Set the variance to half the rtt (so our first + * retransmit happens at 3*rtt) + */ + tpcb->tp_rtt = elapsed; + tpcb->tp_rtv = elapsed >> 1; + } + tpcb->tp_rttemit = 0; + tpcb->tp_rxtshift = 0; + /* + * Quoting TCP: "the retransmit should happen at rtt + 4 * rttvar. + * Because of the way we do the smoothing, srtt and rttvar + * will each average +1/2 tick of bias. When we compute + * the retransmit timer, we want 1/2 tick of rounding and + * 1 extra tick because of +-1/2 tick uncertainty in the + * firing of the timer. The bias will give us exactly the + * 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below + * the minimum feasible timer (which is 2 ticks)." + */ + TP_RANGESET(tpcb->tp_dt_ticks, TP_REXMTVAL(tpcb), + tpcb->tp_peer_acktime, 128 /* XXX */); + IFDEBUG(D_RTT) + printf("%s tpcb 0x%x, elapsed %d, delta %d, rtt %d, rtv %d, old %d\n", + "tp_rtt_rtv:",tpcb,elapsed,delta,tpcb->tp_rtt,tpcb->tp_rtv,old); + ENDDEBUG + tpcb->tp_rxtcur = tpcb->tp_dt_ticks; +} + +/* + * CALLED FROM: + * tp.trans when an AK arrives + * FUNCTION and ARGUMENTS: + * Given (cdt), the credit from the AK tpdu, and + * (seq), the sequence number from the AK tpdu, + * tp_goodack() determines if the AK acknowledges something in the send + * window, and if so, drops the appropriate packets from the retransmission + * list, computes the round trip time, and updates the retransmission timer + * based on the new smoothed round trip time. + * RETURN VALUE: + * Returns 1 if + * EITHER it actually acked something heretofore unacknowledged + * OR no news but the credit should be processed. + * If something heretofore unacked was acked with this sequence number, + * the appropriate tpdus are dropped from the retransmission control list, + * by calling tp_sbdrop(). + * No need to see the tpdu itself. 
+ */ +int +tp_goodack(tpcb, cdt, seq, subseq) + register struct tp_pcb *tpcb; + u_int cdt; + register SeqNum seq; + u_int subseq; +{ + int old_fcredit; + int bang = 0; /* bang --> ack for something heretofore unacked */ + u_int bytes_acked; + + IFDEBUG(D_ACKRECV) + printf("goodack tpcb 0x%x seq 0x%x cdt %d una 0x%x new 0x%x nxt 0x%x\n", + tpcb, seq, cdt, tpcb->tp_snduna, tpcb->tp_sndnew, tpcb->tp_sndnxt); + ENDDEBUG + IFTRACE(D_ACKRECV) + tptraceTPCB(TPPTgotack, + seq,cdt, tpcb->tp_snduna,tpcb->tp_sndnew,subseq); + ENDTRACE + + IFPERF(tpcb) + tpmeas(tpcb->tp_lref, TPtime_ack_rcvd, (struct timeval *)0, seq, 0, 0); + ENDPERF + + if (seq == tpcb->tp_snduna) { + if (subseq < tpcb->tp_r_subseq || + (subseq == tpcb->tp_r_subseq && cdt <= tpcb->tp_fcredit)) { + discard_the_ack: + IFDEBUG(D_ACKRECV) + printf("goodack discard : tpcb 0x%x subseq %d r_subseq %d\n", + tpcb, subseq, tpcb->tp_r_subseq); + ENDDEBUG + goto done; + } + if (cdt == tpcb->tp_fcredit /*&& thus subseq > tpcb->tp_r_subseq */) { + tpcb->tp_r_subseq = subseq; + if (tpcb->tp_timer[TM_data_retrans] == 0) + tpcb->tp_dupacks = 0; + else if (++tpcb->tp_dupacks == tprexmtthresh) { + /* partner went out of his way to signal with different + subsequences that he has the same lack of an expected + packet. 
This may be an early indiciation of a loss */ + + SeqNum onxt = tpcb->tp_sndnxt; + struct mbuf *onxt_m = tpcb->tp_sndnxt_m; + u_int win = min(tpcb->tp_fcredit, + tpcb->tp_cong_win / tpcb->tp_l_tpdusize) / 2; + IFDEBUG(D_ACKRECV) + printf("%s tpcb 0x%x seq 0x%x rttseq 0x%x onxt 0x%x\n", + "goodack dupacks:", tpcb, seq, tpcb->tp_rttseq, onxt); + ENDDEBUG + if (win < 2) + win = 2; + tpcb->tp_ssthresh = win * tpcb->tp_l_tpdusize; + tpcb->tp_timer[TM_data_retrans] = 0; + tpcb->tp_rttemit = 0; + tpcb->tp_sndnxt = tpcb->tp_snduna; + tpcb->tp_sndnxt_m = 0; + tpcb->tp_cong_win = tpcb->tp_l_tpdusize; + tp_send(tpcb); + tpcb->tp_cong_win = tpcb->tp_ssthresh + + tpcb->tp_dupacks * tpcb->tp_l_tpdusize; + if (SEQ_GT(tpcb, onxt, tpcb->tp_sndnxt)) { + tpcb->tp_sndnxt = onxt; + tpcb->tp_sndnxt_m = onxt_m; + } + + } else if (tpcb->tp_dupacks > tprexmtthresh) { + tpcb->tp_cong_win += tpcb->tp_l_tpdusize; + } + goto done; + } + } else if (SEQ_LT(tpcb, seq, tpcb->tp_snduna)) + goto discard_the_ack; + /* + * If the congestion window was inflated to account + * for the other side's cached packets, retract it. + */ + if (tpcb->tp_dupacks > tprexmtthresh && + tpcb->tp_cong_win > tpcb->tp_ssthresh) + tpcb->tp_cong_win = tpcb->tp_ssthresh; + tpcb->tp_r_subseq = subseq; + old_fcredit = tpcb->tp_fcredit; + tpcb->tp_fcredit = cdt; + if (cdt > tpcb->tp_maxfcredit) + tpcb->tp_maxfcredit = cdt; + tpcb->tp_dupacks = 0; + + if (IN_SWINDOW(tpcb, seq, tpcb->tp_snduna, tpcb->tp_sndnew)) { + + tpsbcheck(tpcb, 0); + bytes_acked = tp_sbdrop(tpcb, seq); + tpsbcheck(tpcb, 1); + /* + * If transmit timer is running and timed sequence + * number was acked, update smoothed round trip time. + * Since we now have an rtt measurement, cancel the + * timer backoff (cf., Phil Karn's retransmit alg.). + * Recompute the initial retransmit timer. + */ + if (tpcb->tp_rttemit && SEQ_GT(tpcb, seq, tpcb->tp_rttseq)) + tp_rtt_rtv(tpcb); + /* + * If all outstanding data is acked, stop retransmit timer. 
+ * If there is more data to be acked, restart retransmit + * timer, using current (possibly backed-off) value. + * OSI combines the keepalive and persistance functions. + * So, there is no persistance timer per se, to restart. + */ + if (tpcb->tp_class != TP_CLASS_0) + tpcb->tp_timer[TM_data_retrans] = + (seq == tpcb->tp_sndnew) ? 0 : tpcb->tp_rxtcur; + /* + * When new data is acked, open the congestion window. + * If the window gives us less than ssthresh packets + * in flight, open exponentially (maxseg per packet). + * Otherwise open linearly: maxseg per window + * (maxseg^2 / cwnd per packet), plus a constant + * fraction of a packet (maxseg/8) to help larger windows + * open quickly enough. + */ + { + u_int cw = tpcb->tp_cong_win, incr = tpcb->tp_l_tpdusize; + + incr = min(incr, bytes_acked); + if (cw > tpcb->tp_ssthresh) + incr = incr * incr / cw + incr / 8; + tpcb->tp_cong_win = + min(cw + incr, tpcb->tp_sock->so_snd.sb_hiwat); + } + tpcb->tp_snduna = seq; + if (SEQ_LT(tpcb, tpcb->tp_sndnxt, seq)) { + tpcb->tp_sndnxt = seq; + tpcb->tp_sndnxt_m = 0; + } + bang++; + } + + if( cdt != 0 && old_fcredit == 0 ) { + tpcb->tp_sendfcc = 1; + } + if (cdt == 0) { + if (old_fcredit != 0) + IncStat(ts_zfcdt); + /* The following might mean that the window shrunk */ + if (tpcb->tp_timer[TM_data_retrans]) { + tpcb->tp_timer[TM_data_retrans] = 0; + tpcb->tp_timer[TM_sendack] = tpcb->tp_dt_ticks; + if (tpcb->tp_sndnxt != tpcb->tp_snduna) { + tpcb->tp_sndnxt = tpcb->tp_snduna; + tpcb->tp_sndnxt_m = 0; + } + } + } + tpcb->tp_fcredit = cdt; + bang |= (old_fcredit < cdt); + +done: + IFDEBUG(D_ACKRECV) + printf("goodack returns 0x%x, cdt 0x%x ocdt 0x%x cwin 0x%x\n", + bang, cdt, old_fcredit, tpcb->tp_cong_win); + ENDDEBUG + /* if (bang) XXXXX Very bad to remove this test, but somethings broken */ + tp_send(tpcb); + return (bang); +} + +/* + * CALLED FROM: + * tp_goodack() + * FUNCTION and ARGUMENTS: + * drops everything up TO but not INCLUDING seq # (seq) + * from the 
retransmission queue.
+ * RETURN VALUE:
+ *  the number of bytes released from the send socket buffer
+ *  (sb_cc before the drop minus sb_cc after it).
+ */
+tp_sbdrop(tpcb, seq)
+	register struct tp_pcb 			*tpcb;
+	SeqNum							seq;
+{
+	struct sockbuf *snd = &tpcb->tp_sock->so_snd;
+	/* one socket-buffer record is dropped per sequence number acked */
+	int npkts = SEQ_SUB(tpcb, seq, tpcb->tp_snduna);
+	int cc_before = snd->sb_cc;
+	register int left;
+
+	/* only complains; the records are dropped regardless */
+	if (npkts >= tpcb->tp_seqhalf)
+		printf("tp_spdropping too much -- should panic");
+	for (left = npkts; left > 0; left--)
+		sbdroprecord(snd);
+	IFDEBUG(D_ACKRECV)
+		printf("tp_sbdroping %d pkts %d bytes on %x at 0x%x\n",
+			npkts, cc_before - snd->sb_cc, tpcb, seq);
+	ENDDEBUG
+	/* wake any writer waiting for space in the send buffer */
+	if (snd->sb_flags & SB_NOTIFY)
+		sowwakeup(tpcb->tp_sock);
+	return (cc_before - snd->sb_cc);
+}
+
+/*
+ * CALLED FROM:
+ * 	tp.trans on user send request, arrival of AK and arrival of XAK
+ * FUNCTION and ARGUMENTS:
+ * 	Emits tpdus starting at sequence number (tpcb->tp_sndnxt).
+ *	Emits until a) runs out of data, or  b) runs into an XPD mark, or
+ *			c) it hits seq number (highseq) limited by cong or credit.
+ *
+ * 	If you want XPD to buffer > 1 du per socket buffer, you can
+ *	modify this to issue XPD tpdus also, but then it'll have
+ *	to take some argument(s) to distinguish between the type of DU to
+ *	hand tp_emit.
+ *
+ *	When something is sent for the first time, its time-of-send
+ *	is stashed (in system clock ticks rather than pf_slowtimo ticks).
+ *	When the ack arrives, the smoothed round-trip time is figured
+ *	using this value.
+ */ +void +tp_send(tpcb) + register struct tp_pcb *tpcb; +{ + register int len; + register struct mbuf *m; + struct mbuf *mb = 0; + struct sockbuf *sb = &tpcb->tp_sock->so_snd; + unsigned int eotsdu = 0; + SeqNum highseq, checkseq; + int idle, idleticks, off, cong_win; +#ifdef TP_PERF_MEAS + int send_start_time = ticks; + SeqNum oldnxt = tpcb->tp_sndnxt; +#endif /* TP_PERF_MEAS */ + + idle = (tpcb->tp_snduna == tpcb->tp_sndnew); + if (idle) { + idleticks = tpcb->tp_inact_ticks - tpcb->tp_timer[TM_inact]; + if (idleticks > tpcb->tp_dt_ticks) + /* + * We have been idle for "a while" and no acks are + * expected to clock out any data we send -- + * slow start to get ack "clock" running again. + */ + tpcb->tp_cong_win = tpcb->tp_l_tpdusize; + } + + cong_win = tpcb->tp_cong_win; + highseq = SEQ(tpcb, tpcb->tp_fcredit + tpcb->tp_snduna); + if (tpcb->tp_Xsnd.sb_mb) + highseq = SEQ_MIN(tpcb, highseq, tpcb->tp_sndnew); + + IFDEBUG(D_DATA) + printf("tp_send enter tpcb 0x%x nxt 0x%x win %d high 0x%x\n", + tpcb, tpcb->tp_sndnxt, cong_win, highseq); + ENDDEBUG + IFTRACE(D_DATA) + tptraceTPCB( TPPTmisc, "tp_send sndnew snduna", + tpcb->tp_sndnew, tpcb->tp_snduna, 0, 0); + tptraceTPCB( TPPTmisc, "tp_send tpcb->tp_sndnxt win fcredit congwin", + tpcb->tp_sndnxt, cong_win, tpcb->tp_fcredit, tpcb->tp_cong_win); + ENDTRACE + IFTRACE(D_DATA) + tptraceTPCB( TPPTmisc, "tp_send 2 nxt high fcredit congwin", + tpcb->tp_sndnxt, highseq, tpcb->tp_fcredit, cong_win); + ENDTRACE + + if (tpcb->tp_sndnxt_m) + m = tpcb->tp_sndnxt_m; + else { + off = SEQ_SUB(tpcb, tpcb->tp_sndnxt, tpcb->tp_snduna); + for (m = sb->sb_mb; m && off > 0; m = m->m_next) + off--; + } +send: + /* + * Avoid silly window syndrome here . . . figure out how! + */ + checkseq = tpcb->tp_sndnum; + if (idle && SEQ_LT(tpcb, tpcb->tp_sndnum, highseq)) + checkseq = highseq; /* i.e. 
DON'T retain highest assigned packet */ + + while ((SEQ_LT(tpcb, tpcb->tp_sndnxt, highseq)) && m && cong_win > 0) { + + eotsdu = (m->m_flags & M_EOR) != 0; + len = m->m_pkthdr.len; + if (tpcb->tp_sndnxt == checkseq && eotsdu == 0 && + len < (tpcb->tp_l_tpdusize / 2)) + break; /* Nagle . . . . . */ + cong_win -= len; + /* make a copy - mb goes into the retransmission list + * while m gets emitted. m_copy won't copy a zero-length mbuf. + */ + mb = m; + m = m_copy(mb, 0, M_COPYALL); + if (m == MNULL) + break; + IFTRACE(D_STASH) + tptraceTPCB( TPPTmisc, + "tp_send mcopy nxt high eotsdu len", + tpcb->tp_sndnxt, highseq, eotsdu, len); + ENDTRACE + + IFDEBUG(D_DATA) + printf("tp_sending tpcb 0x%x nxt 0x%x\n", + tpcb, tpcb->tp_sndnxt); + ENDDEBUG + /* when headers are precomputed, may need to fill + in checksum here */ + if (tpcb->tp_sock->so_error = + tp_emit(DT_TPDU_type, tpcb, tpcb->tp_sndnxt, eotsdu, m)) { + /* error */ + break; + } + m = mb->m_nextpkt; + tpcb->tp_sndnxt_m = m; + if (tpcb->tp_sndnxt == tpcb->tp_sndnew) { + SEQ_INC(tpcb, tpcb->tp_sndnew); + /* + * Time this transmission if not a retransmission and + * not currently timing anything. + */ + if (tpcb->tp_rttemit == 0) { + tpcb->tp_rttemit = ticks; + tpcb->tp_rttseq = tpcb->tp_sndnxt; + } + tpcb->tp_sndnxt = tpcb->tp_sndnew; + } else + SEQ_INC(tpcb, tpcb->tp_sndnxt); + /* + * Set retransmit timer if not currently set. + * Initial value for retransmit timer is smoothed + * round-trip time + 2 * round-trip time variance. + * Initialize shift counter which is used for backoff + * of retransmit time. 
+ */ + if (tpcb->tp_timer[TM_data_retrans] == 0 && + tpcb->tp_class != TP_CLASS_0) { + tpcb->tp_timer[TM_data_retrans] = tpcb->tp_dt_ticks; + tpcb->tp_timer[TM_sendack] = tpcb->tp_keepalive_ticks; + tpcb->tp_rxtshift = 0; + } + } + if (SEQ_GT(tpcb, tpcb->tp_sndnew, tpcb->tp_sndnum)) + tpcb->tp_oktonagle = 0; +#ifdef TP_PERF_MEAS + IFPERF(tpcb) + { + register int npkts; + int elapsed = ticks - send_start_time, *t; + struct timeval now; + + npkts = SEQ_SUB(tpcb, tpcb->tp_sndnxt, oldnxt); + + if (npkts > 0) + tpcb->tp_Nwindow++; + + if (npkts > TP_PM_MAX) + npkts = TP_PM_MAX; + + t = &(tpcb->tp_p_meas->tps_sendtime[npkts]); + *t += (t - elapsed) >> TP_RTT_ALPHA; + + if (mb == 0) { + IncPStat(tpcb, tps_win_lim_by_data[npkts] ); + } else { + IncPStat(tpcb, tps_win_lim_by_cdt[npkts] ); + /* not true with congestion-window being used */ + } + now.tv_sec = elapsed / hz; + now.tv_usec = (elapsed - (hz * now.tv_sec)) * 1000000 / hz; + tpmeas( tpcb->tp_lref, + TPsbsend, &elapsed, newseq, tpcb->tp_Nwindow, npkts); + } + ENDPERF +#endif /* TP_PERF_MEAS */ + + + IFTRACE(D_DATA) + tptraceTPCB( TPPTmisc, + "tp_send at end: new nxt eotsdu error", + tpcb->tp_sndnew, tpcb->tp_sndnxt, eotsdu, tpcb->tp_sock->so_error); + + ENDTRACE +} + +int TPNagleok; +int TPNagled; + +tp_packetize(tpcb, m, eotsdu) +register struct tp_pcb *tpcb; +register struct mbuf *m; +int eotsdu; +{ + register struct mbuf *n; + register struct sockbuf *sb = &tpcb->tp_sock->so_snd; + int maxsize = tpcb->tp_l_tpdusize + - tp_headersize(DT_TPDU_type, tpcb) + - (tpcb->tp_use_checksum?4:0) ; + int totlen = m->m_pkthdr.len; + struct mbuf *m_split(); + /* + * Pre-packetize the data in the sockbuf + * according to negotiated mtu. Do it here + * where we can safely wait for mbufs. + * + * This presumes knowledge of sockbuf conventions. + * TODO: allocate space for header and fill it in (once!). 
+ */ + IFDEBUG(D_DATA) + printf("SEND BF: maxsize %d totlen %d eotsdu %d sndnum 0x%x\n", + maxsize, totlen, eotsdu, tpcb->tp_sndnum); + ENDTRACE + if (tpcb->tp_oktonagle) { + if ((n = sb->sb_mb) == 0) + panic("tp_packetize"); + while (n->m_act) + n = n->m_act; + if (n->m_flags & M_EOR) + panic("tp_packetize 2"); + SEQ_INC(tpcb, tpcb->tp_sndnum); + if (totlen + n->m_pkthdr.len < maxsize) { + /* There is an unsent packet with space, combine data */ + struct mbuf *old_n = n; + tpsbcheck(tpcb,3); + n->m_pkthdr.len += totlen; + while (n->m_next) + n = n->m_next; + sbcompress(sb, m, n); + tpsbcheck(tpcb,4); + n = old_n; + TPNagled++; + goto out; + } + } + while (m) { + n = m; + if (totlen > maxsize) { + if ((m = m_split(n, maxsize, M_WAIT)) == 0) + panic("tp_packetize"); + } else + m = 0; + totlen -= maxsize; + tpsbcheck(tpcb, 5); + sbappendrecord(sb, n); + tpsbcheck(tpcb, 6); + SEQ_INC(tpcb, tpcb->tp_sndnum); + } +out: + if (eotsdu) { + n->m_flags |= M_EOR; /* XXX belongs at end */ + tpcb->tp_oktonagle = 0; + } else { + SEQ_DEC(tpcb, tpcb->tp_sndnum); + tpcb->tp_oktonagle = 1; + TPNagleok++; + } + IFDEBUG(D_DATA) + printf("SEND out: oktonagle %d sndnum 0x%x\n", + tpcb->tp_oktonagle, tpcb->tp_sndnum); + ENDTRACE + return 0; +} + + +/* + * NAME: tp_stash() + * CALLED FROM: + * tp.trans on arrival of a DT tpdu + * FUNCTION, ARGUMENTS, and RETURN VALUE: + * Returns 1 if + * a) something new arrived and it's got eotsdu_reached bit on, + * b) this arrival was caused other out-of-sequence things to be + * accepted, or + * c) this arrival is the highest seq # for which we last gave credit + * (sender just sent a whole window) + * In other words, returns 1 if tp should send an ack immediately, 0 if + * the ack can wait a while. + * + * Note: this implementation no longer renegs on credit, (except + * when debugging option D_RENEG is on, for the purpose of testing + * ack subsequencing), so we don't need to check for incoming tpdus + * being in a reneged portion of the window. 
+ */
+
+tp_stash(tpcb, e)
+	register struct tp_pcb		*tpcb;
+	register struct tp_event	*e;
+{
+	register int		ack_reason= tpcb->tp_ack_strat & ACK_STRAT_EACH;
+									/* 0--> delay acks until full window */
+									/* 1--> ack each tpdu */
+#ifndef lint
+#define E e->ATTR(DT_TPDU)
+#else /* lint */
+#define E e->ev_union.EV_DT_TPDU
+#endif /* lint */
+
+	if ( E.e_eot ) {
+		register struct mbuf *n = E.e_data;
+		n->m_flags |= M_EOR;
+		n->m_act = 0;
+	}
+		IFDEBUG(D_STASH)
+			dump_mbuf(tpcb->tp_sock->so_rcv.sb_mb,
+				"stash: so_rcv before appending");
+			dump_mbuf(E.e_data,
+				"stash: e_data before appending");
+		ENDDEBUG
+
+	IFPERF(tpcb)
+		PStat(tpcb, Nb_from_ll) += E.e_datalen;
+		tpmeas(tpcb->tp_lref, TPtime_from_ll, &e->e_time,
+			E.e_seq, (u_int)PStat(tpcb, Nb_from_ll), (u_int)E.e_datalen);
+	ENDPERF
+
+	if (E.e_seq == tpcb->tp_rcvnxt) {
+		/* in sequence: deliver it, then drain whatever it unblocks */
+
+		IFDEBUG(D_STASH)
+			printf("stash EQ: seq 0x%x datalen 0x%x eot 0x%x\n",
+			E.e_seq, E.e_datalen, E.e_eot);
+		ENDDEBUG
+
+		IFTRACE(D_STASH)
+			tptraceTPCB(TPPTmisc, "stash EQ: seq len eot",
+			E.e_seq, E.e_datalen, E.e_eot, 0);
+		ENDTRACE
+
+		SET_DELACK(tpcb);
+
+		sbappend(&tpcb->tp_sock->so_rcv, E.e_data);
+
+		SEQ_INC( tpcb, tpcb->tp_rcvnxt );
+		/*
+		 * move chains from the reassembly queue to the socket buffer
+		 */
+		if (tpcb->tp_rsycnt) {
+			register struct mbuf **mp;
+			struct mbuf **mplim;
+
+			/* the queue is a ring of tp_maxlcredit slots indexed by seq % size */
+			mp = tpcb->tp_rsyq + (tpcb->tp_rcvnxt % tpcb->tp_maxlcredit);
+			mplim = tpcb->tp_rsyq + tpcb->tp_maxlcredit;
+
+			while (tpcb->tp_rsycnt && *mp) {
+				sbappend(&tpcb->tp_sock->so_rcv, *mp);
+				tpcb->tp_rsycnt--;
+				*mp = 0;
+				SEQ_INC(tpcb, tpcb->tp_rcvnxt);
+				ack_reason |= ACK_REORDER;
+				if (++mp == mplim)
+					mp = tpcb->tp_rsyq;
+			}
+		}
+		IFDEBUG(D_STASH)
+			dump_mbuf(tpcb->tp_sock->so_rcv.sb_mb,
+				"stash: so_rcv after appending");
+		ENDDEBUG
+
+	} else {
+		register struct mbuf **mp;
+		SeqNum uwe;
+
+		IFTRACE(D_STASH)
+			tptraceTPCB(TPPTmisc, "stash Reseq: seq rcvnxt lcdt",
+			E.e_seq, tpcb->tp_rcvnxt, tpcb->tp_lcredit, 0);
+		ENDTRACE
+
+		if (tpcb->tp_rsyq == 0)
+			tp_rsyset(tpcb);
+		uwe = SEQ(tpcb, tpcb->tp_rcvnxt + tpcb->tp_maxlcredit);
+		if (tpcb->tp_rsyq == 0 ||
+						!IN_RWINDOW(tpcb, E.e_seq, tpcb->tp_rcvnxt, uwe)) {
+			/* no queue could be allocated, or outside the window: drop silently */
+			ack_reason = ACK_DONT;
+			m_freem(E.e_data);
+		} else if (*(mp = tpcb->tp_rsyq + (E.e_seq % tpcb->tp_maxlcredit))) {
+			IFDEBUG(D_STASH)
+				printf("tp_stash - drop & ack\n");
+			ENDDEBUG
+
+			/* retransmission - drop it and force an ack */
+			IncStat(ts_dt_dup);
+			IFPERF(tpcb)
+				IncPStat(tpcb, tps_n_ack_cuz_dup);
+			ENDPERF
+
+			m_freem(E.e_data);
+			ack_reason |= ACK_DUP;
+		} else {
+			/* out of order but new: park it in its slot */
+			*mp = E.e_data;
+			tpcb->tp_rsycnt++;
+			ack_reason = ACK_DONT;
+		}
+	}
+	/* there were some comments of historical interest here. */
+	{
+		LOCAL_CREDIT(tpcb);
+
+		if ( E.e_seq ==  tpcb->tp_sent_uwe )
+			ack_reason |= ACK_STRAT_FULLWIN;
+
+		IFTRACE(D_STASH)
+			tptraceTPCB(TPPTmisc,
+				"end of stash, eot, ack_reason, sent_uwe ",
+				E.e_eot, ack_reason, tpcb->tp_sent_uwe, 0);
+		ENDTRACE
+
+		if ( ack_reason == ACK_DONT ) {
+			IncStat( ts_ackreason[ACK_DONT] );
+			return 0;
+		} else {
+			IFPERF(tpcb)
+				if(ack_reason & ACK_STRAT_EACH) {
+					IncPStat(tpcb, tps_n_ack_cuz_strat);
+				} else if(ack_reason & ACK_STRAT_FULLWIN) {
+					IncPStat(tpcb, tps_n_ack_cuz_fullwin);
+				} else if(ack_reason & ACK_REORDER) {
+					IncPStat(tpcb, tps_n_ack_cuz_reorder);
+				}
+				tpmeas(tpcb->tp_lref, TPtime_ack_sent, 0,
+							SEQ_ADD(tpcb, E.e_seq, 1), 0, 0);
+			ENDPERF
+			{
+				register int i;
+
+				/* keep track of all reasons that apply */
+				for( i=1; i<_ACK_NUM_REASONS_ ;i++) {
+					if( ack_reason & (1<tp_rsycnt) {
+		/*
+		 * NOTE(review): text is missing from this copy of the patch
+		 * between "(1<" and "tp_rsycnt" on the line above: the end of
+		 * tp_stash() and the head of tp_rsyflush(), including the
+		 * declaration of mp.  That line is left exactly as found;
+		 * restore it from a clean copy of tp_subr.c.
+		 *
+		 * What follows is the tail of tp_rsyflush(): free every mbuf
+		 * chain still parked in the reassembly queue, then the queue.
+		 * The loop initializer used to read "mp == ...", a comparison
+		 * whose result was thrown away, so mp was never set before
+		 * being decremented and dereferenced.
+		 */
+		for (mp = tpcb->tp_rsyq + tpcb->tp_maxlcredit;
+									 --mp >= tpcb->tp_rsyq; )
+			if (*mp) {
+				tpcb->tp_rsycnt--;
+				m_freem(*mp);
+			}
+		if (tpcb->tp_rsycnt) {
+			/* the count disagreed with what was actually queued */
+			printf("tp_rsyflush %x\n", tpcb);
+			tpcb->tp_rsycnt = 0;
+		}
+	}
+	free((caddr_t)tpcb->tp_rsyq, M_PCB);
+	tpcb->tp_rsyq = 0;
+}
+
+tp_rsyset(tpcb)
+register struct tp_pcb *tpcb;
+{
+	register struct socket *so = tpcb->tp_sock;
+	int maxcredit  = tpcb->tp_xtd_format ?
0xffff : 0xf;	/* credit ceiling: 0xffff with the extended format, 0xf otherwise */
+	int old_credit = tpcb->tp_maxlcredit;
+	caddr_t rsyq;
+
+	tpcb->tp_maxlcredit = maxcredit = min(maxcredit,
+		  (so->so_rcv.sb_hiwat + tpcb->tp_l_tpdusize)/ tpcb->tp_l_tpdusize);
+
+	if (old_credit == tpcb->tp_maxlcredit && tpcb->tp_rsyq != 0)
+		return;	/* size unchanged and a queue already exists: keep it */
+	maxcredit *= sizeof(struct mbuf *);	/* from here on: byte size of the slot array */
+	if (tpcb->tp_rsyq)
+		tp_rsyflush(tpcb);	/* size changed: discard the old queue and its contents */
+	if (rsyq = (caddr_t)malloc(maxcredit, M_PCB, M_NOWAIT))
+		bzero(rsyq, maxcredit);
+	tpcb->tp_rsyq = (struct mbuf **)rsyq;	/* stays 0 when the M_NOWAIT malloc fails */
+}
+
+tpsbcheck(tpcb, i)	/* consistency check of the send socket buffer; (i) only tags the messages */
+struct tp_pcb *tpcb;
+{
+	register struct mbuf *n, *m;
+	register int len = 0, mbcnt = 0, pktlen;
+	struct sockbuf *sb = &tpcb->tp_sock->so_snd;
+
+	for (n = sb->sb_mb; n; n = n->m_nextpkt) {	/* one pass per record */
+		if ((n->m_flags & M_PKTHDR) == 0)
+			panic("tpsbcheck nohdr");
+		pktlen = len + n->m_pkthdr.len;	/* running total expected after this record */
+		for (m = n; m; m = m->m_next) {	/* every mbuf of the record */
+			len += m->m_len;
+			mbcnt += MSIZE;
+			if (m->m_flags & M_EXT)
+				mbcnt += m->m_ext.ext_size;
+		}
+		if (len != pktlen) {	/* header length disagrees with the sum of m_len */
+			printf("test %d; len %d != pktlen %d on mbuf 0x%x\n",
+				i, len, pktlen, n);
+			panic("tpsbcheck short");
+		}
+	}
+	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {	/* totals must match the sockbuf's own counters */
+		printf("test %d: cc %d != %d || mbcnt %d != %d\n", i, len, sb->sb_cc,
+			mbcnt, sb->sb_mbcnt);
+		panic("tpsbcheck");
+	}
+}
diff --git a/sys/netiso/tp_subr2.c b/sys/netiso/tp_subr2.c
new file mode 100644
index 00000000000..60c7ce2a50b
--- /dev/null
+++ b/sys/netiso/tp_subr2.c
@@ -0,0 +1,880 @@
+/*-
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tp_subr2.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * ARGO TP + * + * $Header: tp_subr2.c,v 5.5 88/11/18 17:28:55 nhall Exp $ + * $Source: /usr/argo/sys/netiso/RCS/tp_subr2.c,v $ + * + * Some auxiliary routines: + * tp_protocol_error: required by xebec- called when a combo of state, + * event, predicate isn't covered for by the transition file. + * tp_indicate: gives indications(signals) to the user process + * tp_getoptions: initializes variables that are affected by the options + * chosen. 
+ */ + +/* this def'n is to cause the expansion of this macro in the + * routine tp_local_credit : + */ +#define LOCAL_CREDIT_EXPAND + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#undef MNULL +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#ifdef TRUE +#undef FALSE +#undef TRUE +#endif +#include +#include +#include + +void tp_rsyset(); + +/* + * NAME: tp_local_credit() + * + * CALLED FROM: + * tp_emit(), tp_usrreq() + * + * FUNCTION and ARGUMENTS: + * Computes the local credit and stashes it in tpcb->tp_lcredit. + * It's a macro in the production system rather than a procdure. + * + * RETURNS: + * + * SIDE EFFECTS: + * + * NOTES: + * This doesn't actually get called in a production system - + * the macro gets expanded instead in place of calls to this proc. + * But for debugging, we call this and that allows us to add + * debugging messages easily here. + */ +void +tp_local_credit(tpcb) + struct tp_pcb *tpcb; +{ + LOCAL_CREDIT(tpcb); + IFDEBUG(D_CREDIT) + printf("ref 0x%x lcdt 0x%x l_tpdusize 0x%x decbit 0x%x\n", + tpcb->tp_lref, + tpcb->tp_lcredit, + tpcb->tp_l_tpdusize, + tpcb->tp_decbit, + tpcb->tp_cong_win + ); + ENDDEBUG + IFTRACE(D_CREDIT) + tptraceTPCB(TPPTmisc, + "lcdt tpdusz \n", + tpcb->tp_lcredit, tpcb->tp_l_tpdusize, 0, 0); + ENDTRACE +} + +/* + * NAME: tp_protocol_error() + * + * CALLED FROM: + * tp_driver(), when it doesn't know what to do with + * a combo of event, state, predicate + * + * FUNCTION and ARGUMENTS: + * print error mesg + * + * RETURN VALUE: + * EIO - always + * + * SIDE EFFECTS: + * + * NOTES: + */ +int +tp_protocol_error(e,tpcb) + struct tp_event *e; + struct tp_pcb *tpcb; +{ + printf("TP PROTOCOL ERROR! 
tpcb 0x%x event 0x%x, state 0x%x\n", + tpcb, e->ev_number, tpcb->tp_state); + IFTRACE(D_DRIVER) + tptraceTPCB(TPPTmisc, "PROTOCOL ERROR tpcb event state", + tpcb, e->ev_number, tpcb->tp_state, 0 ); + ENDTRACE + return EIO; /* for lack of anything better */ +} + + +/* Not used at the moment */ +ProtoHook +tp_drain() +{ + return 0; +} + + +/* + * NAME: tp_indicate() + * + * CALLED FROM: + * tp.trans when XPD arrive, when a connection is being disconnected by + * the arrival of a DR or ER, and when a connection times out. + * + * FUNCTION and ARGUMENTS: + * (ind) is the type of indication : T_DISCONNECT, T_XPD + * (error) is an E* value that will be put in the socket structure + * to be passed along to the user later. + * Gives a SIGURG to the user process or group indicated by the socket + * attached to the tpcb. + * + * RETURNS: Rien + * + * SIDE EFFECTS: + * + * NOTES: + */ +void +tp_indicate(ind, tpcb, error) + int ind; + u_short error; + register struct tp_pcb *tpcb; +{ + register struct socket *so = tpcb->tp_sock; + IFTRACE(D_INDICATION) + tptraceTPCB(TPPTindicate, ind, *(u_short *)(tpcb->tp_lsuffix), + *(u_short *)(tpcb->tp_fsuffix), error,so->so_pgid); + ENDTRACE + IFDEBUG(D_INDICATION) + char *ls, *fs; + ls = tpcb->tp_lsuffix, + fs = tpcb->tp_fsuffix, + + printf( +"indicate 0x%x lsuf 0x%02x%02x fsuf 0x%02x%02x err 0x%x noind 0x%x ref 0x%x\n", + ind, + *ls, *(ls+1), *fs, *(fs+1), + error, /*so->so_pgrp,*/ + tpcb->tp_no_disc_indications, + tpcb->tp_lref); + ENDDEBUG + + if (ind == ER_TPDU) { + register struct mbuf *m; + struct tp_disc_reason x; + + if ((so->so_state & SS_CANTRCVMORE) == 0 && + (m = m_get(M_DONTWAIT, MT_OOBDATA)) != 0) { + + x.dr_hdr.cmsg_len = m->m_len = sizeof(x); + x.dr_hdr.cmsg_level = SOL_TRANSPORT; + x.dr_hdr.cmsg_type= TPOPT_DISC_REASON; + x.dr_reason = error; + *mtod(m, struct tp_disc_reason *) = x; + sbappendrecord(&tpcb->tp_Xrcv, m); + error = 0; + } else + error = ECONNRESET; + } + so->so_error = error; + + if (ind == T_DISCONNECT) { 
+ if (error == 0) + so->so_error = ENOTCONN; + if ( tpcb->tp_no_disc_indications ) + return; + } + IFTRACE(D_INDICATION) + tptraceTPCB(TPPTmisc, "doing sohasoutofband(so)", so,0,0,0); + ENDTRACE + sohasoutofband(so); +} + +/* + * NAME : tp_getoptions() + * + * CALLED FROM: + * tp.trans whenever we go into OPEN state + * + * FUNCTION and ARGUMENTS: + * sets the proper flags and values in the tpcb, to control + * the appropriate actions for the given class, options, + * sequence space, etc, etc. + * + * RETURNS: Nada + * + * SIDE EFFECTS: + * + * NOTES: + */ +void +tp_getoptions(tpcb) +struct tp_pcb *tpcb; +{ + tpcb->tp_seqmask = + tpcb->tp_xtd_format ? TP_XTD_FMT_MASK : TP_NML_FMT_MASK ; + tpcb->tp_seqbit = + tpcb->tp_xtd_format ? TP_XTD_FMT_BIT : TP_NML_FMT_BIT ; + tpcb->tp_seqhalf = tpcb->tp_seqbit >> 1; + tpcb->tp_dt_ticks = + max(tpcb->tp_dt_ticks, (tpcb->tp_peer_acktime + 2)); + tp_rsyset(tpcb); + +} + +/* + * NAME: tp_recycle_tsuffix() + * + * CALLED FROM: + * Called when a ref is frozen. + * + * FUNCTION and ARGUMENTS: + * allows the suffix to be reused. + * + * RETURNS: zilch + * + * SIDE EFFECTS: + * + * NOTES: + */ +void +tp_recycle_tsuffix(tpcb) + struct tp_pcb *tpcb; +{ + bzero((caddr_t)tpcb->tp_lsuffix, sizeof( tpcb->tp_lsuffix)); + bzero((caddr_t)tpcb->tp_fsuffix, sizeof( tpcb->tp_fsuffix)); + tpcb->tp_fsuffixlen = tpcb->tp_lsuffixlen = 0; + + (tpcb->tp_nlproto->nlp_recycle_suffix)(tpcb->tp_npcb); +} + +/* + * NAME: tp_quench() + * + * CALLED FROM: + * tp{af}_quench() when ICMP source quench or similar thing arrives. + * + * FUNCTION and ARGUMENTS: + * Drop the congestion window back to 1. + * Congestion window scheme: + * Initial value is 1. ("slow start" as Nagle, et. al. call it) + * For each good ack that arrives, the congestion window is increased + * by 1 (up to max size of logical infinity, which is to say, + * it doesn't wrap around). + * Source quench causes it to drop back to 1. 
+ *  tp_send() uses the smaller of (regular window, congestion window).
+ *  One retransmission strategy option is to have any retransmission
+ *	cause reset the congestion window back to 1.
+ *
+ *	(cmd) is either PRC_QUENCH: source quench, or
+ *		PRC_QUENCH2: dest. quench (dec bit)
+ *
+ *	In the code below "1" means one tpdu: both cases set tp_cong_win
+ *	to tp_l_tpdusize.  Any other (cmd) is ignored.
+ *
+ * RETURNS:
+ *
+ * SIDE EFFECTS:
+ *
+ * NOTES:
+ */
+void
+tp_quench( tpcb, cmd )
+	struct tp_pcb *tpcb;
+	int cmd;
+{
+	IFDEBUG(D_QUENCH)
+		printf("tp_quench tpcb 0x%x ref 0x%x sufx 0x%x\n",
+			tpcb, tpcb->tp_lref, *(u_short *)(tpcb->tp_lsuffix));
+		printf("cong_win 0x%x decbit 0x%x \n",
+			tpcb->tp_cong_win, tpcb->tp_decbit);
+	ENDDEBUG
+	switch(cmd) {
+		case PRC_QUENCH:
+			tpcb->tp_cong_win = tpcb->tp_l_tpdusize;
+			IncStat(ts_quench);
+			break;
+		case PRC_QUENCH2:
+			tpcb->tp_cong_win = tpcb->tp_l_tpdusize; /* might as well quench source also */
+			tpcb->tp_decbit = TP_DECBIT_CLEAR_COUNT;
+			IncStat(ts_rcvdecbit);
+			break;
+	}
+}
+
+
+/*
+ * NAME:	tp_netcmd()
+ *
+ * CALLED FROM:
+ *
+ * FUNCTION and ARGUMENTS:
+ *	With TPCONS configured and a tpcb running over ISO_CONS:
+ *	for (cmd) CONN_CLOSE or CONN_REFUSE, when this is the only
+ *	reference to the isopcb, detaches the pklcd from it and calls
+ *	pk_disconnect(); any other (cmd) just prints a message.
+ *	Does nothing when the network service is not ISO_CONS.
+ *	Without TPCONS it only prints a message.
+ *
+ * RETURNS:
+ *
+ * SIDE EFFECTS:
+ *
+ * NOTES:
+ */
+tp_netcmd( tpcb, cmd )
+	struct tp_pcb *tpcb;
+	int cmd;
+{
+#ifdef TPCONS
+	struct isopcb *isop;
+	struct pklcd *lcp;
+
+	if (tpcb->tp_netservice != ISO_CONS)
+		return;
+	isop = (struct isopcb *)tpcb->tp_npcb;
+	lcp = (struct pklcd *)isop->isop_chan;
+	switch (cmd) {
+
+	case CONN_CLOSE:
+	case CONN_REFUSE:
+		if (isop->isop_refcnt == 1) {
+			/* This is really superfluous, since it would happen
+			   anyway in iso_pcbdetach, although it is a courtesy
+			   to free up the x.25 channel before the refwait timer
+			   expires. */
+			lcp->lcd_upper = 0;
+			lcp->lcd_upnext = 0;
+			pk_disconnect(lcp);
+			isop->isop_chan = 0;
+			isop->isop_refcnt = 0;
+		}
+		break;
+
+	default:
+		printf("tp_netcmd(0x%x, 0x%x) NOT IMPLEMENTED\n", tpcb, cmd);
+		break;
+	}
+#else /* TPCONS */
+	printf("tp_netcmd(): X25 NOT CONFIGURED!!\n");
+#endif
+}
+/*
+ * CALLED FROM:
+ *  tp_ctloutput() and tp_emit()
+ * FUNCTION and ARGUMENTS:
+ * 	Convert a class mask to the highest numeric value it represents.
+ */
+
+int
+tp_mask_to_num(x)
+	u_char x;
+{
+	register int j;
+
+	for(j = 4; j>=0 ;j--) {
+		if(x & (1<p_tpdusize = src->p_tpdusize;	/* NOTE(review): text is missing from this copy between "(1<" and "p_tpdusize" (end of tp_mask_to_num, start of the next routine); restore from a clean copy */
+	dst->p_ack_strat = src->p_ack_strat;
+	dst->p_rx_strat = src->p_rx_strat;
+#undef COPYSIZE
+}
+/*
+ * Determine a reasonable value for maxseg size.
+ * If the route is known, check route for mtu.
+ * We also initialize the congestion/slow start
+ * window to be a single segment if the destination isn't local.
+ * While looking at the routing entry, we also initialize other path-dependent
+ * parameters from pre-set or cached values in the routing entry.
+ *
+ * (nhdr_size) is subtracted from the route or interface mtu to get the
+ * space left for a tpdu.
+ */
+void
+tp_mss(tpcb, nhdr_size)
+	register struct tp_pcb *tpcb;
+	int nhdr_size;
+{
+	register struct rtentry *rt;
+	struct ifnet *ifp;
+	register int rtt, mss;
+	u_long bufsize;
+	int i, ssthresh = 0, rt_mss;
+	struct socket *so;
+
+	if (tpcb->tp_ptpdusize)
+		mss = tpcb->tp_ptpdusize << 7;	/* tp_ptpdusize counts units of 128 bytes */
+	else
+		mss = 1 << tpcb->tp_tpdusize;	/* tp_tpdusize is a log2 size */
+	so = tpcb->tp_sock;
+	if ((rt = *(tpcb->tp_routep)) == 0) {
+		bufsize = so->so_rcv.sb_hiwat;
+		goto punt_route;	/* no route: skip everything that reads rt */
+	}
+	ifp = rt->rt_ifp;
+
+#ifdef RTV_MTU	/* if route characteristics exist ... */
+	/*
+	 * While we're here, check if there's an initial rtt
+	 * or rttvar.  Convert from the route-table units
+	 * to hz ticks for the smoothed timers and slow-timeout units
+	 * for other initial variables.
+	 */
+	if (tpcb->tp_rtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
+		tpcb->tp_rtt = rtt * hz / RTM_RTTUNIT;
+		if (rt->rt_rmx.rmx_rttvar)
+			tpcb->tp_rtv = rt->rt_rmx.rmx_rttvar
+					* hz / RTM_RTTUNIT;
+		else
+			tpcb->tp_rtv = tpcb->tp_rtt;	/* no cached variance: start from the rtt itself */
+	}
+	/*
+	 * if there's an mtu associated with the route, use it
+	 */
+	if (rt->rt_rmx.rmx_mtu)
+		rt_mss = rt->rt_rmx.rmx_mtu - nhdr_size;
+	else
+#endif /* RTV_MTU */
+		rt_mss = (ifp->if_mtu - nhdr_size);
+	if (tpcb->tp_ptpdusize == 0 || /* assume application doesn't care */
+	    mss > rt_mss /* network won't support what was asked for */)
+		mss = rt_mss;
+	/* can propose mtu which are multiples of 128 */
+	mss &= ~0x7f;
+	/*
+	 * If there's a pipesize, change the socket buffer
+	 * to that size.
+	 */
+#ifdef RTV_SPIPE
+	if ((bufsize = rt->rt_rmx.rmx_sendpipe) > 0) {
+#endif	/* NOTE(review): the closing brace below is unconditional, so this only balances when RTV_SPIPE is defined */
+		bufsize = min(bufsize, so->so_snd.sb_hiwat);
+		(void) sbreserve(&so->so_snd, bufsize);
+	}
+#ifdef RTV_SPIPE
+	if ((bufsize = rt->rt_rmx.rmx_recvpipe) > 0) {
+#endif	/* NOTE(review): same as above -- the "} else" below needs this "if" */
+		bufsize = min(bufsize, so->so_rcv.sb_hiwat);
+		(void) sbreserve(&so->so_rcv, bufsize);
+	} else
+		bufsize = so->so_rcv.sb_hiwat;
+#ifdef RTV_SSTHRESH
+	/*
+	 * There's some sort of gateway or interface
+	 * buffer limit on the path.  Use this to set
+	 * the slow start threshold, but set the
+	 * threshold to no less than 2*mss.
+	 */
+	ssthresh = rt->rt_rmx.rmx_ssthresh;
+punt_route:
+	/*
+	 * The current mss is initialized to the default value.
+	 * If we compute a smaller value, reduce the current mss.
+	 * If we compute a larger value, return it for use in sending
+	 * a max seg size option.
+	 * If we received an offer, don't exceed it.
+	 * However, do not accept offers under 128 bytes.
+	 */
+	if (tpcb->tp_l_tpdusize)
+		mss = min(mss, tpcb->tp_l_tpdusize);
+	/*
+	 * We want a minimum recv window of 4 packets to
+	 * signal packet loss by duplicate acks.
+	 */
+	mss = min(mss, bufsize >> 2) & ~0x7f;
+	mss = max(mss, 128);		/* sanity */
+	tpcb->tp_cong_win =
+		(rt == 0 || (rt->rt_flags & RTF_GATEWAY)) ? mss : bufsize;
+	tpcb->tp_l_tpdusize = mss;
+	tp_rsyset(tpcb);	/* the reassembly queue size depends on tp_l_tpdusize */
+	tpcb->tp_ssthresh = max(2 * mss, ssthresh);
+	/* Calculate log2 of mss */
+	for (i = TP_MIN_TPDUSIZE + 1; i <= TP_MAX_TPDUSIZE; i++)
+		if ((1 << i) > mss)
+			break;
+	i--;
+	tpcb->tp_tpdusize = i;
+#endif /* RTV_SSTHRESH -- this closes the "#ifdef RTV_SSTHRESH" above (the comment used to say RTV_MTU); NOTE(review): the punt_route label and everything after it sit inside that conditional */
+}
+
+/*
+ * CALLED FROM:
+ *  tp_usrreq on PRU_CONNECT and tp_input on receipt of CR
+ *
+ * FUNCTION and ARGUMENTS:
+ * 	-- An mbuf containing the peer's network address.
+ *  -- Our control block, which will be modified
+ *  -- In the case of cons, a control block for that layer.
+ *
+ *
+ * RETURNS:
+ *	errno value	 :
+ *	EAFNOSUPPORT if can't find an nl_protosw for x.25 (really could panic)
+ *	ECONNREFUSED if trying to run TP0 with non-type 37 address
+ *  possibly other E* returned from cons_netcmd()
+ *
+ * SIDE EFFECTS:
+ *   Determines recommended tpdusize, buffering and initial delays
+ *	 based on information cached on the route.
+ */
+int
+tp_route_to( m, tpcb, channel)
+	struct mbuf					*m;
+	register struct tp_pcb		*tpcb;
+	caddr_t 					channel;
+{
+	register struct sockaddr_iso *siso;	/* NOTE: this may be a sockaddr_in */
+	extern struct tp_conn_param tp_conn_param[];
+	int error = 0, save_netservice = tpcb->tp_netservice;
+	register struct rtentry *rt = 0;
+	int nhdr_size, mtu, bufsize;
+
+	siso = mtod(m, struct sockaddr_iso *);
+	IFTRACE(D_CONN)
+		tptraceTPCB(TPPTmisc,
+		"route_to: so  afi netservice class",
+		tpcb->tp_sock, siso->siso_addr.isoa_genaddr[0], tpcb->tp_netservice,
+			tpcb->tp_class);
+	ENDTRACE
+	IFDEBUG(D_CONN)
+		printf("tp_route_to( m x%x, channel 0x%x, tpcb 0x%x netserv 0x%x)\n",
+			m, channel, tpcb, tpcb->tp_netservice);
+		printf("m->mlen x%x, m->m_data:\n", m->m_len);
+		dump_buf(mtod(m, caddr_t), m->m_len);
+	ENDDEBUG
+	if (channel) {	/* caller supplied an existing lower-layer channel */
+#ifdef TPCONS
+		struct pklcd *lcp = (struct pklcd *)channel;
+		struct isopcb *isop = (struct isopcb *)lcp->lcd_upnext,
+			*isop_new = (struct isopcb *)tpcb->tp_npcb;
+		/* The next 2 lines believe that you haven't
+		   set any network level options or done a pcbconnect
+		   and XXXXXXX'edly apply to both inpcb's and isopcb's */
+		remque(isop_new);
+		free(isop_new, M_PCB);
+		tpcb->tp_npcb = (caddr_t)isop;
+		tpcb->tp_netservice = ISO_CONS;
+		tpcb->tp_nlproto = nl_protosw + ISO_CONS;
+		if (isop->isop_refcnt++ == 0) {
+			iso_putsufx(isop, tpcb->tp_lsuffix, tpcb->tp_lsuffixlen, TP_LOCAL);
+			isop->isop_socket = tpcb->tp_sock;
+		} else
+			/* there are already connections sharing this */;
+#endif
+	} else {
+		switch (siso->siso_family) {
+		default:
+			error = EAFNOSUPPORT;
+			goto done;
+#ifdef ISO
+		case AF_ISO:
+		{
+			struct isopcb *isop = (struct isopcb *)tpcb->tp_npcb;
+			int flags = tpcb->tp_sock->so_options & SO_DONTROUTE;
+			tpcb->tp_netservice = ISO_CLNS;
+			if (clnp_route(&siso->siso_addr, &isop->isop_route,
+							flags, (void **)0, (void **)0) == 0) {
+				rt = isop->isop_route.ro_rt;
+				if (rt && rt->rt_flags & RTF_PROTO1)
+					tpcb->tp_netservice = ISO_CONS;	/* a route flagged RTF_PROTO1 selects ISO_CONS */
+			}
+		}     break;
+#endif
+#ifdef INET
+		case AF_INET:
+			tpcb->tp_netservice = IN_CLNS;	/* last case: falls out of the switch without a break */
+#endif
+		}
+		if (tpcb->tp_nlproto->nlp_afamily != siso->siso_family) {
+			IFDEBUG(D_CONN)
+				printf("tp_route_to( CHANGING nlproto old 0x%x new 0x%x)\n",
+						save_netservice, tpcb->tp_netservice);
+			ENDDEBUG
+			if (error = tp_set_npcb(tpcb))
+				goto done;
+		}
+		IFDEBUG(D_CONN)
+			printf("tp_route_to  calling nlp_pcbconn, netserv %d\n",
+				tpcb->tp_netservice);
+		ENDDEBUG
+		tpcb->tp_nlproto = nl_protosw + tpcb->tp_netservice;
+		error = (tpcb->tp_nlproto->nlp_pcbconn)(tpcb->tp_npcb, m);
+	}
+	if (error)
+		goto done;
+	nhdr_size = tpcb->tp_nlproto->nlp_mtu(tpcb); /* only gets common info */
+	tp_mss(tpcb, nhdr_size);
+done:
+	IFDEBUG(D_CONN)
+		printf("tp_route_to  returns 0x%x\n", error);
+	ENDDEBUG
+	IFTRACE(D_CONN)
+		tptraceTPCB(TPPTmisc, "route_to: returns: error netserv class", error,
+			tpcb->tp_netservice, tpcb->tp_class, 0);
+	ENDTRACE
+	return error;
+}
+
+
+/* class zero version */
+void
+tp0_stash( tpcb, e )
+	register struct tp_pcb		*tpcb;
+	register struct tp_event	*e;
+{
+#ifndef
lint +#define E e->ATTR(DT_TPDU) +#else /* lint */ +#define E e->ev_union.EV_DT_TPDU +#endif /* lint */ + + register struct sockbuf *sb = &tpcb->tp_sock->so_rcv; + register struct isopcb *isop = (struct isopcb *)tpcb->tp_npcb; + + IFPERF(tpcb) + PStat(tpcb, Nb_from_ll) += E.e_datalen; + tpmeas(tpcb->tp_lref, TPtime_from_ll, &e->e_time, + E.e_seq, PStat(tpcb, Nb_from_ll), E.e_datalen); + ENDPERF + + IFDEBUG(D_STASH) + printf("stash EQ: seq 0x%x datalen 0x%x eot 0x%x", + E.e_seq, E.e_datalen, E.e_eot); + ENDDEBUG + + IFTRACE(D_STASH) + tptraceTPCB(TPPTmisc, "stash EQ: seq len eot", + E.e_seq, E.e_datalen, E.e_eot, 0); + ENDTRACE + + if ( E.e_eot ) { + register struct mbuf *n = E.e_data; + n->m_flags |= M_EOR; + n->m_act = MNULL; /* set on tp_input */ + } + sbappend(sb, E.e_data); + IFDEBUG(D_STASH) + dump_mbuf(sb->sb_mb, "stash 0: so_rcv after appending"); + ENDDEBUG + if (tpcb->tp_netservice != ISO_CONS) + printf("tp0_stash: tp running over something wierd\n"); + else { + register struct pklcd *lcp = (struct pklcd *)isop->isop_chan; + pk_flowcontrol(lcp, sbspace(sb) <= 0, 1); + } +} + +void +tp0_openflow(tpcb) +register struct tp_pcb *tpcb; +{ + register struct isopcb *isop = (struct isopcb *)tpcb->tp_npcb; + if (tpcb->tp_netservice != ISO_CONS) + printf("tp0_openflow: tp running over something wierd\n"); + else { + register struct pklcd *lcp = (struct pklcd *)isop->isop_chan; + if (lcp->lcd_rxrnr_condition) + pk_flowcontrol(lcp, 0, 0); + } +} +#ifndef TPCONS +static +pk_flowcontrol() {} +#endif + +#ifdef TP_PERF_MEAS +/* + * CALLED FROM: + * tp_ctloutput() when the user sets TPOPT_PERF_MEAS on + * and tp_newsocket() when a new connection is made from + * a listening socket with tp_perf_on == true. + * FUNCTION and ARGUMENTS: + * (tpcb) is the usual; this procedure gets a clear cluster mbuf for + * a tp_pmeas structure, and makes tpcb->tp_p_meas point to it. + * RETURN VALUE: + * ENOBUFS if it cannot get a cluster mbuf. 
+ */ + +int +tp_setup_perf(tpcb) + register struct tp_pcb *tpcb; +{ + register struct mbuf *q; + + if( tpcb->tp_p_meas == 0 ) { + MGET(q, M_WAITOK, MT_PCB); + if (q == 0) + return ENOBUFS; + MCLGET(q, M_WAITOK); + if ((q->m_flags & M_EXT) == 0) { + (void) m_free(q); + return ENOBUFS; + } + q->m_len = sizeof (struct tp_pmeas); + tpcb->tp_p_mbuf = q; + tpcb->tp_p_meas = mtod(q, struct tp_pmeas *); + bzero( (caddr_t)tpcb->tp_p_meas, sizeof (struct tp_pmeas) ); + IFDEBUG(D_PERF_MEAS) + printf( + "tpcb 0x%x so 0x%x ref 0x%x tp_p_meas 0x%x tp_perf_on 0x%x\n", + tpcb, tpcb->tp_sock, tpcb->tp_lref, + tpcb->tp_p_meas, tpcb->tp_perf_on); + ENDDEBUG + tpcb->tp_perf_on = 1; + } + return 0; +} +#endif /* TP_PERF_MEAS */ + +#ifdef ARGO_DEBUG +dump_addr (addr) + register struct sockaddr *addr; +{ + switch( addr->sa_family ) { + case AF_INET: + dump_inaddr((struct sockaddr_in *)addr); + break; +#ifdef ISO + case AF_ISO: + dump_isoaddr((struct sockaddr_iso *)addr); + break; +#endif /* ISO */ + default: + printf("BAD AF: 0x%x\n", addr->sa_family); + break; + } +} + +#define MAX_COLUMNS 8 +/* + * Dump the buffer to the screen in a readable format. Format is: + * + * hex/dec where hex is the hex format, dec is the decimal format. + * columns of hex/dec numbers will be printed, followed by the + * character representations (if printable). 
+ */ +Dump_buf(buf, len) +caddr_t buf; +int len; +{ + int i,j; +#define Buf ((u_char *)buf) + printf("Dump buf 0x%x len 0x%x\n", buf, len); + for (i = 0; i < len; i += MAX_COLUMNS) { + printf("+%d:\t", i); + for (j = 0; j < MAX_COLUMNS; j++) { + if (i + j < len) { + printf("%x/%d\t", Buf[i+j], Buf[i+j]); + } else { + printf(" "); + } + } + + for (j = 0; j < MAX_COLUMNS; j++) { + if (i + j < len) { + if (((Buf[i+j]) > 31) && ((Buf[i+j]) < 128)) + printf("%c", Buf[i+j]); + else + printf("."); + } + } + printf("\n"); + } +} +#endif /* ARGO_DEBUG */ diff --git a/sys/netiso/tp_timer.c b/sys/netiso/tp_timer.c new file mode 100644 index 00000000000..b3a0be3a945 --- /dev/null +++ b/sys/netiso/tp_timer.c @@ -0,0 +1,377 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tp_timer.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+ +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * ARGO TP + * + * $Header: tp_timer.c,v 5.2 88/11/18 17:29:07 nhall Exp $ + * $Source: /usr/argo/sys/netiso/RCS/tp_timer.c,v $ + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +struct tp_ref *tp_ref; +int tp_rttdiv, tp_rttadd, N_TPREF = 127; +struct tp_refinfo tp_refinfo; +struct tp_pcb *tp_ftimeolist = (struct tp_pcb *)&tp_ftimeolist; + +/* + * CALLED FROM: + * at autoconfig time from tp_init() + * a combo of event, state, predicate + * FUNCTION and ARGUMENTS: + * initialize data structures for the timers + */ +void +tp_timerinit() +{ + register int s; + /* + * Initialize storage + */ + if (tp_refinfo.tpr_base) + return; + tp_refinfo.tpr_size = N_TPREF + 1; /* Need to start somewhere */ + s = sizeof(*tp_ref) * tp_refinfo.tpr_size; + if ((tp_ref = (struct tp_ref *) malloc(s, M_PCB, M_NOWAIT)) == 0) + panic("tp_timerinit"); + bzero((caddr_t)tp_ref, (unsigned) s); + tp_refinfo.tpr_base = tp_ref; + tp_rttdiv = hz / PR_SLOWHZ; + tp_rttadd = (2 * tp_rttdiv) - 1; +} +#ifdef TP_DEBUG_TIMERS +/********************** e timers *************************/ + +/* + * CALLED FROM: + * tp.trans all over + * FUNCTION and ARGUMENTS: + * Set an E type timer. 
+ */ +void +tp_etimeout(tpcb, fun, ticks) + register struct tp_pcb *tpcb; + int fun; /* function to be called */ + int ticks; +{ + + register u_int *callp; + IFDEBUG(D_TIMER) + printf("etimeout pcb 0x%x state 0x%x\n", tpcb, tpcb->tp_state); + ENDDEBUG + IFTRACE(D_TIMER) + tptrace(TPPTmisc, "tp_etimeout ref refstate tks Etick", tpcb->tp_lref, + tpcb->tp_state, ticks, tp_stat.ts_Eticks); + ENDTRACE + if (tpcb == 0) + return; + IncStat(ts_Eset); + if (ticks == 0) + ticks = 1; + callp = tpcb->tp_timer + fun; + if (*callp == 0 || *callp > ticks) + *callp = ticks; +} + +/* + * CALLED FROM: + * tp.trans all over + * FUNCTION and ARGUMENTS: + * Cancel all occurrences of E-timer function (fun) for reference (refp) + */ +void +tp_euntimeout(tpcb, fun) + register struct tp_pcb *tpcb; + int fun; +{ + IFTRACE(D_TIMER) + tptrace(TPPTmisc, "tp_euntimeout ref", tpcb->tp_lref, 0, 0, 0); + ENDTRACE + + if (tpcb) + tpcb->tp_timer[fun] = 0; +} + +/**************** c timers ********************** + * + * These are not chained together; they sit + * in the tp_ref structure. they are the kind that + * are typically cancelled so it's faster not to + * mess with the chains + */ +#endif +/* + * CALLED FROM: + * the clock, every 500 ms + * FUNCTION and ARGUMENTS: + * Look for open references with active timers. + * If they exist, call the appropriate timer routines to update + * the timers and possibly generate events. 
+ */ +ProtoHook +tp_slowtimo() +{ + register u_int *cp; + register struct tp_ref *rp; + struct tp_pcb *tpcb; + struct tp_event E; + int s = splnet(), t; + + /* check only open reference structures */ + IncStat(ts_Cticks); + /* tp_ref[0] is never used */ + for (rp = tp_ref + tp_refinfo.tpr_maxopen; rp > tp_ref; rp--) { + if ((tpcb = rp->tpr_pcb) == 0 || tpcb->tp_refstate < REF_OPEN) + continue; + /* check the timers */ + for (t = 0; t < TM_NTIMERS; t++) { + cp = tpcb->tp_timer + t; + if (*cp && --(*cp) <= 0 ) { + *cp = 0; + E.ev_number = t; + IFDEBUG(D_TIMER) + printf("tp_slowtimo: pcb 0x%x t %d\n", + tpcb, t); + ENDDEBUG + IncStat(ts_Cexpired); + tp_driver(tpcb, &E); + if (t == TM_reference && tpcb->tp_state == TP_CLOSED) { + if (tpcb->tp_notdetached) { + IFDEBUG(D_CONN) + printf("PRU_DETACH: not detached\n"); + ENDDEBUG + tp_detach(tpcb); + } + /* XXX wart; where else to do it? */ + free((caddr_t)tpcb, M_PCB); + } + } + } + } + splx(s); + return 0; +} + +/* + * Called From: tp.trans from tp_slowtimo() -- retransmission timer went off. + */ +tp_data_retrans(tpcb) +register struct tp_pcb *tpcb; +{ + int rexmt, win; + tpcb->tp_rttemit = 0; /* cancel current round trip time */ + tpcb->tp_dupacks = 0; + tpcb->tp_sndnxt = tpcb->tp_snduna; + if (tpcb->tp_fcredit == 0) { + /* + * We transmitted new data, started timing it and the window + * got shrunk under us. This can only happen if all data + * that they wanted us to send got acked, so don't + * bother shrinking the congestion windows, et. al. 
+ * The retransmission timer should have been reset in goodack() + */ + IFDEBUG(D_ACKRECV) + printf("tp_data_retrans: 0 window tpcb 0x%x una 0x%x\n", + tpcb, tpcb->tp_snduna); + ENDDEBUG + tpcb->tp_rxtshift = 0; + tpcb->tp_timer[TM_data_retrans] = 0; + tpcb->tp_timer[TM_sendack] = tpcb->tp_dt_ticks; + return; + } + rexmt = tpcb->tp_dt_ticks << min(tpcb->tp_rxtshift, TP_MAXRXTSHIFT); + win = min(tpcb->tp_fcredit, (tpcb->tp_cong_win / tpcb->tp_l_tpdusize / 2)); + win = max(win, 2); + tpcb->tp_cong_win = tpcb->tp_l_tpdusize; /* slow start again. */ + tpcb->tp_ssthresh = win * tpcb->tp_l_tpdusize; + /* We're losing; our srtt estimate is probably bogus. + * Clobber it so we'll take the next rtt measurement as our srtt; + * Maintain current rxt times until then. + */ + if (++tpcb->tp_rxtshift > TP_NRETRANS / 4) { + /* tpcb->tp_nlprotosw->nlp_losing(tpcb->tp_npcb) someday */ + tpcb->tp_rtt = 0; + } + TP_RANGESET(tpcb->tp_rxtcur, rexmt, tpcb->tp_peer_acktime, 128); + tpcb->tp_timer[TM_data_retrans] = tpcb->tp_rxtcur; + tp_send(tpcb); +} + +int +tp_fasttimo() +{ + register struct tp_pcb *t; + int s = splnet(); + struct tp_event E; + + E.ev_number = TM_sendack; + while ((t = tp_ftimeolist) != (struct tp_pcb *)&tp_ftimeolist) { + if (t == 0) { + printf("tp_fasttimeo: should panic"); + tp_ftimeolist = (struct tp_pcb *)&tp_ftimeolist; + } else { + if (t->tp_flags & TPF_DELACK) { + IncStat(ts_Fdelack); + tp_driver(t, &E); + t->tp_flags &= ~TPF_DELACK; + } else + IncStat(ts_Fpruned); + tp_ftimeolist = t->tp_fasttimeo; + t->tp_fasttimeo = 0; + } + } + splx(s); +} + +#ifdef TP_DEBUG_TIMERS +/* + * CALLED FROM: + * tp.trans, tp_emit() + * FUNCTION and ARGUMENTS: + * Set a C type timer of type (which) to go off after (ticks) time. 
+ */ +void +tp_ctimeout(tpcb, which, ticks) + register struct tp_pcb *tpcb; + int which, ticks; +{ + + IFTRACE(D_TIMER) + tptrace(TPPTmisc, "tp_ctimeout ref which tpcb active", + tpcb->tp_lref, which, tpcb, tpcb->tp_timer[which]); + ENDTRACE + if(tpcb->tp_timer[which]) + IncStat(ts_Ccan_act); + IncStat(ts_Cset); + if (ticks <= 0) + ticks = 1; + tpcb->tp_timer[which] = ticks; +} + +/* + * CALLED FROM: + * tp.trans + * FUNCTION and ARGUMENTS: + * Version of tp_ctimeout that resets the C-type time if the + * parameter (ticks) is > the current value of the timer. + */ +void +tp_ctimeout_MIN(tpcb, which, ticks) + register struct tp_pcb *tpcb; + int which, ticks; +{ + IFTRACE(D_TIMER) + tptrace(TPPTmisc, "tp_ctimeout_MIN ref which tpcb active", + tpcb->tp_lref, which, tpcb, tpcb->tp_timer[which]); + ENDTRACE + IncStat(ts_Cset); + if (tpcb->tp_timer[which]) { + tpcb->tp_timer[which] = min(ticks, tpcb->tp_timer[which]); + IncStat(ts_Ccan_act); + } else + tpcb->tp_timer[which] = ticks; +} + +/* + * CALLED FROM: + * tp.trans + * FUNCTION and ARGUMENTS: + * Cancel the (which) timer in the ref structure indicated by (refp). + */ +void +tp_cuntimeout(tpcb, which) + register struct tp_pcb *tpcb; + int which; +{ + IFDEBUG(D_TIMER) + printf("tp_cuntimeout(0x%x, %d) active %d\n", + tpcb, which, tpcb->tp_timer[which]); + ENDDEBUG + + IFTRACE(D_TIMER) + tptrace(TPPTmisc, "tp_cuntimeout ref which, active", refp-tp_ref, + which, tpcb->tp_timer[which], 0); + ENDTRACE + + if (tpcb->tp_timer[which]) + IncStat(ts_Ccan_act); + else + IncStat(ts_Ccan_inact); + tpcb->tp_timer[which] = 0; +} +#endif diff --git a/sys/netiso/tp_timer.h b/sys/netiso/tp_timer.h new file mode 100644 index 00000000000..a6f7735586b --- /dev/null +++ b/sys/netiso/tp_timer.h @@ -0,0 +1,93 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tp_timer.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * ARGO TP + * + * $Header: tp_timer.h,v 5.1 88/10/12 12:21:41 root Exp $ + * $Source: /usr/argo/sys/netiso/RCS/tp_timer.h,v $ + * + * ARGO TP + * The callout structures used by the tp timers. 
+ */ + +#ifndef __TP_TIMER__ +#define __TP_TIMER__ + +#define SET_DELACK(t) {\ + (t)->tp_flags |= TPF_DELACK; \ + if ((t)->tp_fasttimeo == 0)\ + { (t)->tp_fasttimeo = tp_ftimeolist; tp_ftimeolist = (t); } } + +#ifdef ARGO_DEBUG +#define TP_DEBUG_TIMERS +#endif + +#ifndef TP_DEBUG_TIMERS +#define tp_ctimeout(tpcb, which, timo) ((tpcb)->tp_timer[which] = (timo)) +#define tp_cuntimeout(tpcb, which) ((tpcb)->tp_timer[which] = 0) +#define tp_etimeout tp_ctimeout +#define tp_euntimeout tp_cuntimeout +#define tp_ctimeout_MIN(p, w, t) \ + { if((p)->tp_timer[w] > (t)) (p)->tp_timer[w] = (t);} +#endif /* TP_DEBUG_TIMERS */ + +#endif /* __TP_TIMER__ */ diff --git a/sys/netiso/tp_tpdu.h b/sys/netiso/tp_tpdu.h new file mode 100644 index 00000000000..15f130d1703 --- /dev/null +++ b/sys/netiso/tp_tpdu.h @@ -0,0 +1,296 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tp_tpdu.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+ +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * ARGO TP + * + * $Header: tp_tpdu.h,v 4.4 88/07/26 16:45:40 nhall Exp $ + * $Source: /usr/argo/sys/netiso/RCS/tp_tpdu.h,v $ + * + * This ghastly set of macros makes it possible to + * refer to tpdu structures without going mad. + */ + +#ifndef __TP_TPDU__ +#define __TP_TPDU__ + +#ifndef BYTE_ORDER +/* + * Definitions for byte order, + * according to byte significance from low address to high. + */ +#define LITTLE_ENDIAN 1234 /* least-significant byte first (vax) */ +#define BIG_ENDIAN 4321 /* most-significant byte first (IBM, net) */ +#define PDP_ENDIAN 3412 /* LSB first in word, MSW first in long (pdp) */ + +#ifdef vax +#define BYTE_ORDER LITTLE_ENDIAN +#else +#define BYTE_ORDER BIG_ENDIAN /* mc68000, tahoe, most others */ +#endif +#endif /* BYTE_ORDER */ + +/* This much of a tpdu is the same for all types of tpdus (except + * DT tpdus in class 0; their exceptions are handled by the data + * structure below + */ +struct tpdu_fixed { + u_char _tpduf_li:8, /* length indicator */ +#if BYTE_ORDER == LITTLE_ENDIAN + _tpduf_cdt: 4, /* credit */ + _tpduf_type: 4; /* type of tpdu (DT, CR, etc.) */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + _tpduf_type: 4, /* type of tpdu (DT, CR, etc.) 
*/ + _tpduf_cdt: 4; /* credit */ +#endif + u_short _tpduf_dref; /* destination ref; not in DT in class 0 */ +}; + +#define tpdu_li _tpduf._tpduf_li +#define tpdu_type _tpduf._tpduf_type +#define tpdu_cdt _tpduf._tpduf_cdt +#define tpdu_dref _tpduf._tpduf_dref + +struct tp0du { + u_char _tp0_li, + _tp0_cdt_type, /* same as in tpdu_fixed */ +#if BYTE_ORDER == BIG_ENDIAN + _tp0_eot: 1, /* eot */ + _tp0_mbz: 7, /* must be zero */ +#endif +#if BYTE_ORDER == LITTLE_ENDIAN + _tp0_mbz: 7, /* must be zero */ + _tp0_eot: 1, /* eot */ +#endif + _tp0_notused: 8; /* data begins on this octet */ +}; + +#define tp0du_eot _tp0_eot +#define tp0du_mbz _tp0_mbz + +/* + * This is used when the extended format seqence numbers are + * being sent and received. + */ + /* + * the seqeot field is an int that overlays the seq + * and eot fields, this allows the htonl operation + * to be applied to the entire 32 bit quantity, and + * simplifies the structure definitions. + */ +union seq_type { + struct { +#if BYTE_ORDER == BIG_ENDIAN + unsigned int st_eot:1, /* end-of-tsdu */ + st_seq:31; /* 31 bit sequence number */ +#endif +#if BYTE_ORDER == LITTLE_ENDIAN + unsigned int st_seq:31, /* 31 bit sequence number */ + st_eot:1; /* end-of-tsdu */ +#endif + } st; + unsigned int s_seqeot; +#define s_eot st.st_eot +#define s_seq st.st_seq +}; + +/* Then most tpdu types have a portion that is always present but + * differs among the tpdu types : + */ +union tpdu_fixed_rest { + + struct { + u_short _tpdufr_sref, /* source reference */ +#if BYTE_ORDER == BIG_ENDIAN + _tpdufr_class: 4, /* class [ ISO 8073 13.3.3.e ] */ + _tpdufr_opt: 4, /* options [ ISO 8073 13.3.3.e ] */ +#endif +#if BYTE_ORDER == LITTLE_ENDIAN + _tpdufr_opt: 4, /* options [ ISO 8073 13.3.3.e ] */ + _tpdufr_class: 4, /* class [ ISO 8073 13.3.3.e ] */ +#endif + _tpdufr_xx: 8; /* unused */ + } CRCC; + +#define tpdu_CRli _tpduf._tpduf_li +#define tpdu_CRtype _tpduf._tpduf_type +#define tpdu_CRcdt _tpduf._tpduf_cdt +#define tpdu_CRdref_0 
_tpduf._tpduf_dref +#define tpdu_CRsref _tpdufr.CRCC._tpdufr_sref +#define tpdu_sref _tpdufr.CRCC._tpdufr_sref +#define tpdu_CRclass _tpdufr.CRCC._tpdufr_class +#define tpdu_CRoptions _tpdufr.CRCC._tpdufr_opt + +#define tpdu_CCli _tpduf._tpduf_li +#define tpdu_CCtype _tpduf._tpduf_type +#define tpdu_CCcdt _tpduf._tpduf_cdt +#define tpdu_CCdref _tpduf._tpduf_dref +#define tpdu_CCsref _tpdufr.CRCC._tpdufr_sref +#define tpdu_CCclass _tpdufr.CRCC._tpdufr_class +#define tpdu_CCoptions _tpdufr.CRCC._tpdufr_opt + +/* OPTIONS and ADDL OPTIONS bits */ +#define TPO_USE_EFC 0x1 +#define TPO_XTD_FMT 0x2 +#define TPAO_USE_TXPD 0x1 +#define TPAO_NO_CSUM 0x2 +#define TPAO_USE_RCC 0x4 +#define TPAO_USE_NXPD 0x8 + + struct { + unsigned short _tpdufr_sref; /* source reference */ + unsigned char _tpdufr_reason; /* [ ISO 8073 13.5.3.d ] */ + } DR; +#define tpdu_DRli _tpduf._tpduf_li +#define tpdu_DRtype _tpduf._tpduf_type +#define tpdu_DRdref _tpduf._tpduf_dref +#define tpdu_DRsref _tpdufr.DR._tpdufr_sref +#define tpdu_DRreason _tpdufr.DR._tpdufr_reason + + unsigned short _tpdufr_sref; /* source reference */ + +#define tpdu_DCli _tpduf._tpduf_li +#define tpdu_DCtype _tpduf._tpduf_type +#define tpdu_DCdref _tpduf._tpduf_dref +#define tpdu_DCsref _tpdufr._tpdufr_sref + + struct { +#if BYTE_ORDER == BIG_ENDIAN + unsigned char _tpdufr_eot:1, /* end-of-tsdu */ + _tpdufr_seq:7; /* 7 bit sequence number */ +#endif +#if BYTE_ORDER == LITTLE_ENDIAN + unsigned char _tpdufr_seq:7, /* 7 bit sequence number */ + _tpdufr_eot:1; /* end-of-tsdu */ +#endif + }SEQEOT; + struct { +#if BYTE_ORDER == BIG_ENDIAN + unsigned int _tpdufr_Xeot:1, /* end-of-tsdu */ + _tpdufr_Xseq:31; /* 31 bit sequence number */ +#endif +#if BYTE_ORDER == LITTLE_ENDIAN + unsigned int _tpdufr_Xseq:31, /* 31 bit sequence number */ + _tpdufr_Xeot:1; /* end-of-tsdu */ +#endif + }SEQEOT31; + unsigned int _tpdufr_Xseqeot; +#define tpdu_seqeotX _tpdufr._tpdufr_Xseqeot + +#define tpdu_DTli _tpduf._tpduf_li +#define tpdu_DTtype 
_tpduf._tpduf_type +#define tpdu_DTdref _tpduf._tpduf_dref +#define tpdu_DTseq _tpdufr.SEQEOT._tpdufr_seq +#define tpdu_DTeot _tpdufr.SEQEOT._tpdufr_eot +#define tpdu_DTseqX _tpdufr.SEQEOT31._tpdufr_Xseq +#define tpdu_DTeotX _tpdufr.SEQEOT31._tpdufr_Xeot + +#define tpdu_XPDli _tpduf._tpduf_li +#define tpdu_XPDtype _tpduf._tpduf_type +#define tpdu_XPDdref _tpduf._tpduf_dref +#define tpdu_XPDseq _tpdufr.SEQEOT._tpdufr_seq +#define tpdu_XPDeot _tpdufr.SEQEOT._tpdufr_eot +#define tpdu_XPDseqX _tpdufr.SEQEOT31._tpdufr_Xseq +#define tpdu_XPDeotX _tpdufr.SEQEOT31._tpdufr_Xeot + + struct { +#if BYTE_ORDER == BIG_ENDIAN + unsigned _tpdufr_yrseq0:1, /* always zero */ + _tpdufr_yrseq:31; /* [ ISO 8073 13.9.3.d ] */ +#endif +#if BYTE_ORDER == LITTLE_ENDIAN + unsigned _tpdufr_yrseq:31, /* [ ISO 8073 13.9.3.d ] */ + _tpdufr_yrseq0:1; /* always zero */ +#endif + unsigned short _tpdufr_cdt; /* [ ISO 8073 13.9.3.b ] */ + } AK31; + +#define tpdu_AKli _tpduf._tpduf_li +#define tpdu_AKtype _tpduf._tpduf_type +#define tpdu_AKdref _tpduf._tpduf_dref +#define tpdu_AKseq _tpdufr.SEQEOT._tpdufr_seq +#define tpdu_AKseqX _tpdufr.AK31._tpdufr_yrseq +/* location of cdt depends on size of seq. 
numbers */ +#define tpdu_AKcdt _tpduf._tpduf_cdt +#define tpdu_AKcdtX _tpdufr.AK31._tpdufr_cdt + +#define tpdu_XAKli _tpduf._tpduf_li +#define tpdu_XAKtype _tpduf._tpduf_type +#define tpdu_XAKdref _tpduf._tpduf_dref +#define tpdu_XAKseq _tpdufr.SEQEOT._tpdufr_seq +#define tpdu_XAKseqX _tpdufr.SEQEOT31._tpdufr_Xseq + + unsigned char _tpdu_ERreason; /* [ ISO 8073 13.12.3.c ] */ + +#define tpdu_ERli _tpduf._tpduf_li +#define tpdu_ERtype _tpduf._tpduf_type +#define tpdu_ERdref _tpduf._tpduf_dref +#define tpdu_ERreason _tpdufr._tpdu_ERreason + +}; + +struct tpdu { + struct tpdu_fixed _tpduf; + union tpdu_fixed_rest _tpdufr; +}; + +#endif /* __TP_TPDU__ */ diff --git a/sys/netiso/tp_trace.c b/sys/netiso/tp_trace.c new file mode 100644 index 00000000000..115597bf472 --- /dev/null +++ b/sys/netiso/tp_trace.c @@ -0,0 +1,175 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tp_trace.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. 
+ +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * ARGO TP + * + * $Header: tp_trace.c,v 5.3 88/11/18 17:29:14 nhall Exp $ + * $Source: /usr/argo/sys/netiso/RCS/tp_trace.c,v $ + * + * The whole protocol trace module. + * We keep a circular buffer of trace structures, which are big + * unions of different structures we might want to see. + * Unfortunately this gets too big pretty easily. Pcbs were removed + * from the tracing when the kernel got too big to boot. + */ + +#define TP_TRACEFILE + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef TPPT +static tp_seq = 0; +u_char tp_traceflags[128]; + +/* + * The argument tpcb is the obvious. + * event here is just the type of trace event - TPPTmisc, etc. + * The rest of the arguments have different uses depending + * on the type of trace event. 
+ */ +/*ARGSUSED*/ +/*VARARGS*/ + +void +tpTrace(tpcb, event, arg, src, len, arg4, arg5) + struct tp_pcb *tpcb; + u_int event, arg; + u_int src; + u_int len; + u_int arg4; + u_int arg5; +{ + register struct tp_Trace *tp; + + tp = &tp_Trace[tp_Tracen++]; + tp_Tracen %= TPTRACEN; + + tp->tpt_event = event; + tp->tpt_tseq = tp_seq++; + tp->tpt_arg = arg; + if(tpcb) + tp->tpt_arg2 = tpcb->tp_lref; + bcopy( (caddr_t)&time, (caddr_t)&tp->tpt_time, sizeof(struct timeval) ); + + switch(event) { + + case TPPTertpdu: + bcopy((caddr_t)src, (caddr_t)&tp->tpt_ertpdu, + (unsigned)MIN((int)len, sizeof(struct tp_Trace))); + break; + + case TPPTusrreq: + case TPPTmisc: + + /* arg is a string */ + bcopy((caddr_t)arg, (caddr_t)tp->tpt_str, + (unsigned)MIN(1+strlen((caddr_t) arg), TPTRACE_STRLEN)); + tp->tpt_m2 = src; + tp->tpt_m3 = len; + tp->tpt_m4 = arg4; + tp->tpt_m1 = arg5; + break; + + case TPPTgotXack: + case TPPTXack: + case TPPTsendack: + case TPPTgotack: + case TPPTack: + case TPPTindicate: + default: + case TPPTdriver: + tp->tpt_m2 = arg; + tp->tpt_m3 = src; + tp->tpt_m4 = len; + tp->tpt_m5 = arg4; + tp->tpt_m1 = arg5; + break; + case TPPTparam: + bcopy((caddr_t)src, (caddr_t)&tp->tpt_param, sizeof(struct tp_param)); + break; + case TPPTref: + bcopy((caddr_t)src, (caddr_t)&tp->tpt_ref, sizeof(struct tp_ref)); + break; + + case TPPTtpduin: + case TPPTtpduout: + tp->tpt_arg2 = arg4; + bcopy((caddr_t)src, (caddr_t)&tp->tpt_tpdu, + (unsigned)MIN((int)len, sizeof(struct tp_Trace))); + break; + } +} +#endif /* TPPT */ diff --git a/sys/netiso/tp_trace.h b/sys/netiso/tp_trace.h new file mode 100644 index 00000000000..885730549e5 --- /dev/null +++ b/sys/netiso/tp_trace.h @@ -0,0 +1,198 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tp_trace.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. + +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * ARGO TP + * + * $Header: tp_trace.h,v 5.1 88/10/12 12:21:51 root Exp $ + * $Source: /usr/argo/sys/netiso/RCS/tp_trace.h,v $ + * + * + * Definitions needed for the protocol trace mechanism. 
/* size of the string payload; tpTrace() cuts longer strings to this */
#define TPTRACE_STRLEN 50


/* for packet tracing */
struct tp_timeval {
	SeqNum	tptv_seq;
	u_int tptv_kind;
	u_int tptv_window;
	u_int tptv_size;
};

/*
 * One slot of the trace buffer.  tpTrace() fills in the fixed fields
 * for every event and then one member of tpt_stuff, chosen by tpt_event.
 */
struct	tp_Trace {
	u_int	tpt_event;	/* record type, one of the TPPT* codes */
	u_int	tpt_arg;	/* first event argument as passed to tpTrace() */
	u_int	tpt_arg2;	/* tp_lref of the pcb; arg4 for tpdu in/out events */
	int	tpt_tseq;	/* running count of trace records */
	struct timeval	tpt_time;	/* copy of the kernel's `time' at the event */
	union {
		struct inpcb	tpt_Inpcb; /* protocol control block */
		struct tp_ref 	tpt_Ref; /* ref part of pcb */
		struct tpdu 	tpt_Tpdu; /* header*/
		struct tp_refinfo tpt_Param; /* ?? bytes, make sure < 128??*/
		struct tp_timeval tpt_Time;
		struct {
			u_int tptm_2;
			u_int tptm_3;
			u_int tptm_4;
			u_int tptm_5;
			char tpt_Str[TPTRACE_STRLEN];
			u_int tptm_1;
		} tptmisc;
		u_char 			tpt_Ertpdu; /* use rest of structure */
	} tpt_stuff;
};
/* shorthands for the members of the payload union */
#define tpt_inpcb tpt_stuff.tpt_Inpcb
/* NOTE(review): the union has no tpt_Pcb member, so tpt_pcb cannot be used */
#define tpt_pcb tpt_stuff.tpt_Pcb
#define tpt_ref tpt_stuff.tpt_Ref
#define tpt_tpdu tpt_stuff.tpt_Tpdu
#define tpt_param tpt_stuff.tpt_Param
#define tpt_ertpdu tpt_stuff.tpt_Ertpdu
#define tpt_str tpt_stuff.tptmisc.tpt_Str
#define tpt_m1 tpt_stuff.tptmisc.tptm_1
#define tpt_m2 tpt_stuff.tptmisc.tptm_2
#define tpt_m3 tpt_stuff.tptmisc.tptm_3
#define tpt_m4 tpt_stuff.tptmisc.tptm_4
#define tpt_m5 tpt_stuff.tptmisc.tptm_5

#define tpt_seq tpt_stuff.tpt_Time.tptv_seq
#define tpt_kind tpt_stuff.tpt_Time.tptv_kind
#define tpt_window tpt_stuff.tpt_Time.tptv_window
#define tpt_size tpt_stuff.tpt_Time.tptv_size
#ifdef TPPT

/* number of slots in tp_Trace[]; tpTrace() wraps its index at this value */
#define TPTRACEN 300

/* record an event that is not tied to a pcb */
#define tptrace(A,B,C,D,E,F) \
	tpTrace((struct tp_pcb *)0,\
	(u_int)(A),(u_int)(B),(u_int)(C),(u_int)(D),(u_int)(E),(u_int)(F))

/* record an event for the pcb named `tpcb' in the caller's scope */
#define tptraceTPCB(A,B,C,D,E,F) \
	tpTrace(tpcb,\
	(u_int)(A),(u_int)(B),(u_int)(C),(u_int)(D),(u_int)(E),(u_int)(F))

extern void tpTrace();
extern struct tp_Trace tp_Trace[];
extern u_char	tp_traceflags[];
/*
 * The write index must be defined exactly once.  tp_trace.c defines
 * TP_TRACEFILE before it pulls in this header, so the definition is
 * emitted there and every other file only sees the declaration.
 * NOTE(review): assumes no other kernel source defines TP_TRACEFILE.
 */
#ifdef TP_TRACEFILE
int tp_Tracen = 0;
#else
extern int tp_Tracen;
#endif

/*
 * IFTRACE(flag) ... ENDTRACE brackets statements that run only when
 * tp_traceflags[flag] is set; the pair opens and closes a block.
 */
#define IFTRACE(ascii)\
	if(tp_traceflags[ascii]) {
/*
 * for some reason lint complains about tp_param being undefined no
 * matter where or how many times I define it.
 */
#define ENDTRACE  }


#else  /* TPPT */

/***********************************************
 * NO TPPT TRACE STUFF
 **********************************************/
#define TPTRACEN 1

/* tracing compiled out: the calls collapse to the constant 0 */
#define tptrace(A,B,C,D,E,F) 0
#define tptraceTPCB(A,B,C,D,E,F) 0

/* the bracketed statements stay syntactically valid but never run */
#define IFTRACE(ascii)	 if (0) {
#define ENDTRACE	 }

#endif /* TPPT */
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tp_user.h 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. 
/*
 * Parameters kept for each transport connection.
 * NOTE(review): presumably the structure exchanged with the TPOPT_PARAMS
 * socket option defined below -- confirm against the sockopt code.
 */
struct tp_conn_param {
	/* PER CONNECTION parameters */
	short	p_Nretrans;
	short p_dr_ticks;

	short p_cc_ticks;
	short p_dt_ticks;

	short p_x_ticks;
	short p_cr_ticks;

	short p_keepalive_ticks;
	short p_sendack_ticks;

	short p_ref_ticks;
	short p_inact_ticks;

	short p_ptpdusize;	/* preferred tpdusize/128 */
	short p_winsize;

	u_char p_tpdusize; 	/* log 2 of size */

	u_char p_ack_strat;	/* see comments in tp_pcb.h */
	u_char p_rx_strat;	/* see comments in tp_pcb.h */
	u_char p_class;	 	/* class bitmask */
	u_char p_xtd_format;
	u_char p_xpd_service;
	u_char p_use_checksum;
	u_char p_use_nxpd; 	/* netwk expedited data: not implemented */
	u_char p_use_rcc;	/* receipt confirmation: not implemented */
	u_char p_use_efc;	/* explicit flow control: not implemented */
	u_char p_no_disc_indications;	/* don't deliver indic on disc */
	u_char p_dont_change_params;	/* use these params as they are */
	u_char p_netservice;
	u_char p_version;	/* only here for checking */
};
/*
 * A disconnect reason code preceded by a control-message header.
 * NOTE(review): presumably the data carried with TPOPT_DISC_REASON --
 * confirm against the code that builds it.
 */
struct tp_disc_reason {
	struct cmsghdr dr_hdr;	/* control-message header */
	u_int	dr_reason;	/* the reason code itself */
};
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tp_usrreq.c 8.1 (Berkeley) 6/10/93 + */ + +/*********************************************************** + Copyright IBM Corporation 1987 + + All Rights Reserved + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, +provided that the above copyright notice appear in all copies and that +both that copyright notice and this permission notice appear in +supporting documentation, and that the name of IBM not be +used in advertising or publicity pertaining to distribution of the +software without specific, written prior permission. 
+ +IBM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING +ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL +IBM BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR +ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +SOFTWARE. + +******************************************************************/ + +/* + * ARGO Project, Computer Sciences Dept., University of Wisconsin - Madison + */ +/* + * ARGO TP + * + * $Header: tp_usrreq.c,v 5.4 88/11/18 17:29:18 nhall Exp $ + * $Source: /usr/argo/sys/netiso/RCS/tp_usrreq.c,v $ + * + * tp_usrreq(), the fellow that gets called from most of the socket code. + * Pretty straighforward. + * THe only really awful stuff here is the OOB processing, which is done + * wholly here. + * tp_rcvoob() and tp_sendoob() are contained here and called by tp_usrreq(). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int tp_attach(), tp_driver(), tp_pcbbind(); +int TNew; +int TPNagle1, TPNagle2; +struct tp_pcb *tp_listeners, *tp_intercepts; + +#ifdef ARGO_DEBUG +/* + * CALLED FROM: + * anywhere you want to debug... 
+ * FUNCTION and ARGUMENTS: + * print (str) followed by the control info in the mbufs of an mbuf chain (n) + */ +void +dump_mbuf(n, str) + struct mbuf *n; + char *str; +{ + struct mbuf *nextrecord; + + printf("dump %s\n", str); + + if (n == MNULL) { + printf("EMPTY:\n"); + return; + } + + while (n) { + nextrecord = n->m_act; + printf("RECORD:\n"); + while (n) { + printf("%x : Len %x Data %x A %x Nx %x Tp %x\n", + n, n->m_len, n->m_data, n->m_act, n->m_next, n->m_type); +#ifdef notdef + { + register char *p = mtod(n, char *); + register int i; + + printf("data: "); + for (i = 0; i < n->m_len; i++) { + if (i%8 == 0) + printf("\n"); + printf("0x%x ", *(p+i)); + } + printf("\n"); + } +#endif /* notdef */ + if (n->m_next == n) { + printf("LOOP!\n"); + return; + } + n = n->m_next; + } + n = nextrecord; + } + printf("\n"); +} + +#endif /* ARGO_DEBUG */ + +/* + * CALLED FROM: + * tp_usrreq(), PRU_RCVOOB + * FUNCTION and ARGUMENTS: + * Copy data from the expedited data socket buffer into + * the pre-allocated mbuf m. + * There is an isomorphism between XPD TPDUs and expedited data TSDUs. + * XPD tpdus are limited to 16 bytes of data so they fit in one mbuf. + * RETURN VALUE: + * EINVAL if debugging is on and a disaster has occurred + * ENOTCONN if the socket isn't connected + * EWOULDBLOCK if the socket is in non-blocking mode and there's no + * xpd data in the buffer + * E* whatever is returned from the fsm. 
+ */ +tp_rcvoob(tpcb, so, m, outflags, inflags) + struct tp_pcb *tpcb; + register struct socket *so; + register struct mbuf *m; + int *outflags; + int inflags; +{ + register struct mbuf *n; + register struct sockbuf *sb = &so->so_rcv; + struct tp_event E; + int error = 0; + register struct mbuf **nn; + + IFDEBUG(D_XPD) + printf("PRU_RCVOOB, sostate 0x%x\n", so->so_state); + ENDDEBUG + + /* if you use soreceive */ + if (m == MNULL) + return ENOBUFS; + +restart: + if ((((so->so_state & SS_ISCONNECTED) == 0) + || (so->so_state & SS_ISDISCONNECTING) != 0) && + (so->so_proto->pr_flags & PR_CONNREQUIRED)) { + return ENOTCONN; + } + + /* Take the first mbuf off the chain. + * Each XPD TPDU gives you a complete TSDU so the chains don't get + * coalesced, but one TSDU may span several mbufs. + * Nevertheless, since n should have a most 16 bytes, it + * will fit into m. (size was checked in tp_input() ) + */ + + /* + * Code for excision of OOB data should be added to + * uipc_socket2.c (like sbappend). 
+ */ + + sblock(sb, M_WAITOK); + for (nn = &sb->sb_mb; n = *nn; nn = &n->m_act) + if (n->m_type == MT_OOBDATA) + break; + + if (n == 0) { + IFDEBUG(D_XPD) + printf("RCVOOB: empty queue!\n"); + ENDDEBUG + sbunlock(sb); + if (so->so_state & SS_NBIO) { + return EWOULDBLOCK; + } + sbwait(sb); + goto restart; + } + m->m_len = 0; + + /* Assuming at most one xpd tpdu is in the buffer at once */ + while (n != MNULL) { + m->m_len += n->m_len; + bcopy(mtod(n, caddr_t), mtod(m, caddr_t), (unsigned)n->m_len); + m->m_data += n->m_len; /* so mtod() in bcopy() above gives right addr */ + n = n->m_next; + } + m->m_data = m->m_dat; + m->m_flags |= M_EOR; + + IFDEBUG(D_XPD) + printf("tp_rcvoob: xpdlen 0x%x\n", m->m_len); + dump_mbuf(so->so_rcv.sb_mb, "RCVOOB: Rcv socketbuf"); + dump_mbuf(sb->sb_mb, "RCVOOB: Xrcv socketbuf"); + ENDDEBUG + + if ((inflags & MSG_PEEK) == 0) { + n = *nn; + *nn = n->m_act; + for (; n; n = m_free(n)) + sbfree(sb, n); + } + +release: + sbunlock(sb); + + IFTRACE(D_XPD) + tptraceTPCB(TPPTmisc, "PRU_RCVOOB @ release sb_cc m_len", + tpcb->tp_Xrcv.sb_cc, m->m_len, 0, 0); + ENDTRACE + if (error == 0) + error = DoEvent(T_USR_Xrcvd); + return error; +} + +/* + * CALLED FROM: + * tp_usrreq(), PRU_SENDOOB + * FUNCTION and ARGUMENTS: + * Send what's in the mbuf chain (m) as an XPD TPDU. + * The mbuf may not contain more then 16 bytes of data. + * XPD TSDUs aren't segmented, so they translate into + * exactly one XPD TPDU, with EOT bit set. + * RETURN VALUE: + * EWOULDBLOCK if socket is in non-blocking mode and the previous + * xpd data haven't been acked yet. + * EMSGSIZE if trying to send > max-xpd bytes (16) + * ENOBUFS if ran out of mbufs + */ +tp_sendoob(tpcb, so, xdata, outflags) + struct tp_pcb *tpcb; + register struct socket *so; + register struct mbuf *xdata; + int *outflags; /* not used */ +{ + /* + * Each mbuf chain represents a sequence # in the XPD seq space. + * The first one in the queue has sequence # tp_Xuna. 
+ * When we add to the XPD queue, we stuff a zero-length + * mbuf (mark) into the DATA queue, with its sequence number in m_next + * to be assigned to this XPD tpdu, so data xfer can stop + * when it reaches the zero-length mbuf if this XPD TPDU hasn't + * yet been acknowledged. + */ + register struct sockbuf *sb = &(tpcb->tp_Xsnd); + register struct mbuf *xmark; + register int len=0; + struct tp_event E; + + IFDEBUG(D_XPD) + printf("tp_sendoob:"); + if (xdata) + printf("xdata len 0x%x\n", xdata->m_len); + ENDDEBUG + /* DO NOT LOCK the Xsnd buffer!!!! You can have at MOST one + * socket buf locked at any time!!! (otherwise you might + * sleep() in sblock() w/ a signal pending and cause the + * system call to be aborted w/ a locked socketbuf, which + * is a problem. So the so_snd buffer lock + * (done in sosend()) serves as the lock for Xpd. + */ + if (sb->sb_mb) { /* Anything already in eXpedited data sockbuf? */ + if (so->so_state & SS_NBIO) { + return EWOULDBLOCK; + } + while (sb->sb_mb) { + sbunlock(&so->so_snd); /* already locked by sosend */ + sbwait(&so->so_snd); + sblock(&so->so_snd, M_WAITOK); /* sosend will unlock on return */ + } + } + + if (xdata == (struct mbuf *)0) { + /* empty xpd packet */ + MGETHDR(xdata, M_WAIT, MT_OOBDATA); + if (xdata == NULL) { + return ENOBUFS; + } + xdata->m_len = 0; + xdata->m_pkthdr.len = 0; + } + IFDEBUG(D_XPD) + printf("tp_sendoob 1:"); + if (xdata) + printf("xdata len 0x%x\n", xdata->m_len); + ENDDEBUG + xmark = xdata; /* temporary use of variable xmark */ + while (xmark) { + len += xmark->m_len; + xmark = xmark->m_next; + } + if (len > TP_MAX_XPD_DATA) { + return EMSGSIZE; + } + IFDEBUG(D_XPD) + printf("tp_sendoob 2:"); + if (xdata) + printf("xdata len 0x%x\n", len); + ENDDEBUG + + + IFTRACE(D_XPD) + tptraceTPCB(TPPTmisc, "XPD mark m_next ", xdata->m_next, 0, 0, 0); + ENDTRACE + + sbappendrecord(sb, xdata); + + IFDEBUG(D_XPD) + printf("tp_sendoob len 0x%x\n", len); + dump_mbuf(so->so_snd.sb_mb, "XPD request Regular 
sndbuf:"); + dump_mbuf(tpcb->tp_Xsnd.sb_mb, "XPD request Xsndbuf:"); + ENDDEBUG + return DoEvent(T_XPD_req); +} + +/* + * CALLED FROM: + * the socket routines + * FUNCTION and ARGUMENTS: + * Handles all "user requests" except the [gs]ockopts() requests. + * The argument (req) is the request type (PRU*), + * (m) is an mbuf chain, generally used for send and + * receive type requests only. + * (nam) is used for addresses usually, in particular for the bind request. + * + */ +/*ARGSUSED*/ +ProtoHook +tp_usrreq(so, req, m, nam, controlp) + struct socket *so; + u_int req; + struct mbuf *m, *nam, *controlp; +{ + register struct tp_pcb *tpcb = sototpcb(so); + int s = splnet(); + int error = 0; + int flags, *outflags = &flags; + u_long eotsdu = 0; + struct tp_event E; + + IFDEBUG(D_REQUEST) + printf("usrreq(0x%x,%d,0x%x,0x%x,0x%x)\n",so,req,m,nam,outflags); + if (so->so_error) + printf("WARNING!!! so->so_error is 0x%x\n", so->so_error); + ENDDEBUG + IFTRACE(D_REQUEST) + tptraceTPCB(TPPTusrreq, "req so m state [", req, so, m, + tpcb?tpcb->tp_state:0); + ENDTRACE + + if ((u_int)tpcb == 0 && req != PRU_ATTACH) { + IFTRACE(D_REQUEST) + tptraceTPCB(TPPTusrreq, "req failed NO TPCB[", 0, 0, 0, 0); + ENDTRACE + splx(s); + return ENOTCONN; + } + + switch (req) { + + case PRU_ATTACH: + if (tpcb) { + error = EISCONN; + } else if ((error = tp_attach(so, (int)nam)) == 0) + tpcb = sototpcb(so); + break; + + case PRU_ABORT: /* called from close() */ + /* called for each incoming connect queued on the + * parent (accepting) socket + */ + if (tpcb->tp_state == TP_OPEN || tpcb->tp_state == TP_CONFIRMING) { + E.ATTR(T_DISC_req).e_reason = E_TP_NO_SESSION; + error = DoEvent(T_DISC_req); /* pretend it was a close() */ + break; + } /* else DROP THROUGH */ + + case PRU_DETACH: /* called from close() */ + /* called only after disconnect was called */ + error = DoEvent(T_DETACH); + if (tpcb->tp_state == TP_CLOSED) { + if (tpcb->tp_notdetached) { + IFDEBUG(D_CONN) + printf("PRU_DETACH: not 
detached\n"); + ENDDEBUG + tp_detach(tpcb); + } + free((caddr_t)tpcb, M_PCB); + tpcb = 0; + } + break; + + case PRU_SHUTDOWN: + /* recv end may have been released; local credit might be zero */ + case PRU_DISCONNECT: + E.ATTR(T_DISC_req).e_reason = E_TP_NORMAL_DISC; + error = DoEvent(T_DISC_req); + break; + + case PRU_BIND: + error = tp_pcbbind(tpcb, nam); + break; + + case PRU_LISTEN: + if (tpcb->tp_state != TP_CLOSED || tpcb->tp_lsuffixlen == 0 || + tpcb->tp_next == 0) + error = EINVAL; + else { + register struct tp_pcb **tt; + remque(tpcb); + tpcb->tp_next = tpcb->tp_prev = tpcb; + for (tt = &tp_listeners; *tt; tt = &((*tt)->tp_nextlisten)) + if ((*tt)->tp_lsuffixlen) + break; + tpcb->tp_nextlisten = *tt; + *tt = tpcb; + error = DoEvent(T_LISTEN_req); + } + break; + + case PRU_CONNECT2: + error = EOPNOTSUPP; /* for unix domain sockets */ + break; + + case PRU_CONNECT: + IFTRACE(D_CONN) + tptraceTPCB(TPPTmisc, + "PRU_CONNECT: so 0x%x *SHORT_LSUFXP(tpcb) 0x%x lsuflen 0x%x, class 0x%x", + tpcb->tp_sock, *SHORT_LSUFXP(tpcb), tpcb->tp_lsuffixlen, + tpcb->tp_class); + ENDTRACE + IFDEBUG(D_CONN) + printf("PRU_CONNECT: so *SHORT_LSUFXP(tpcb) 0x%x lsuflen 0x%x, class 0x%x", + tpcb->tp_sock, *SHORT_LSUFXP(tpcb), tpcb->tp_lsuffixlen, + tpcb->tp_class); + ENDDEBUG + if (tpcb->tp_lsuffixlen == 0) { + if (error = tp_pcbbind(tpcb, MNULL)) { + IFDEBUG(D_CONN) + printf("pcbbind returns error 0x%x\n", error); + ENDDEBUG + break; + } + } + IFDEBUG(D_CONN) + printf("isop 0x%x isop->isop_socket offset 12 :\n", tpcb->tp_npcb); + dump_buf(tpcb->tp_npcb, 16); + ENDDEBUG + if (error = tp_route_to(nam, tpcb, /* channel */0)) + break; + IFDEBUG(D_CONN) + printf( + "PRU_CONNECT after tpcb 0x%x so 0x%x npcb 0x%x flags 0x%x\n", + tpcb, so, tpcb->tp_npcb, tpcb->tp_flags); + printf("isop 0x%x isop->isop_socket offset 12 :\n", tpcb->tp_npcb); + dump_buf(tpcb->tp_npcb, 16); + ENDDEBUG + if (tpcb->tp_fsuffixlen == 0) { + /* didn't set peer extended suffix */ + 
(tpcb->tp_nlproto->nlp_getsufx)(tpcb->tp_npcb, &tpcb->tp_fsuffixlen, + tpcb->tp_fsuffix, TP_FOREIGN); + } + if (tpcb->tp_state == TP_CLOSED) { + soisconnecting(so); + error = DoEvent(T_CONN_req); + } else { + (tpcb->tp_nlproto->nlp_pcbdisc)(tpcb->tp_npcb); + error = EISCONN; + } + IFPERF(tpcb) + u_int lsufx, fsufx; + lsufx = *(u_short *)(tpcb->tp_lsuffix); + fsufx = *(u_short *)(tpcb->tp_fsuffix); + + tpmeas(tpcb->tp_lref, + TPtime_open | (tpcb->tp_xtd_format << 4), + &time, lsufx, fsufx, tpcb->tp_fref); + ENDPERF + break; + + case PRU_ACCEPT: + (tpcb->tp_nlproto->nlp_getnetaddr)(tpcb->tp_npcb, nam, TP_FOREIGN); + IFDEBUG(D_REQUEST) + printf("ACCEPT PEERADDDR:"); + dump_buf(mtod(nam, char *), nam->m_len); + ENDDEBUG + IFPERF(tpcb) + u_int lsufx, fsufx; + lsufx = *(u_short *)(tpcb->tp_lsuffix); + fsufx = *(u_short *)(tpcb->tp_fsuffix); + + tpmeas(tpcb->tp_lref, TPtime_open, + &time, lsufx, fsufx, tpcb->tp_fref); + ENDPERF + break; + + case PRU_RCVD: + if (so->so_state & SS_ISCONFIRMING) { + if (tpcb->tp_state == TP_CONFIRMING) + error = tp_confirm(tpcb); + break; + } + IFTRACE(D_DATA) + tptraceTPCB(TPPTmisc, + "RCVD BF: lcredit sent_lcdt cc hiwat \n", + tpcb->tp_lcredit, tpcb->tp_sent_lcdt, + so->so_rcv.sb_cc, so->so_rcv.sb_hiwat); + LOCAL_CREDIT(tpcb); + tptraceTPCB(TPPTmisc, + "PRU_RCVD AF sbspace lcredit hiwat cc", + sbspace(&so->so_rcv), tpcb->tp_lcredit, + so->so_rcv.sb_cc, so->so_rcv.sb_hiwat); + ENDTRACE + IFDEBUG(D_REQUEST) + printf("RCVD: cc %d space %d hiwat %d\n", + so->so_rcv.sb_cc, sbspace(&so->so_rcv), + so->so_rcv.sb_hiwat); + ENDDEBUG + if (((int)nam) & MSG_OOB) + error = DoEvent(T_USR_Xrcvd); + else + error = DoEvent(T_USR_rcvd); + break; + + case PRU_RCVOOB: + if ((so->so_state & SS_ISCONNECTED) == 0) { + error = ENOTCONN; + break; + } + if (! 
tpcb->tp_xpd_service) { + error = EOPNOTSUPP; + break; + } + /* kludge - nam is really flags here */ + error = tp_rcvoob(tpcb, so, m, outflags, (int)nam); + break; + + case PRU_SEND: + case PRU_SENDOOB: + if (controlp) { + error = tp_snd_control(controlp, so, &m); + controlp = NULL; + if (error) + break; + } + if ((so->so_state & SS_ISCONFIRMING) && + (tpcb->tp_state == TP_CONFIRMING) && + (error = tp_confirm(tpcb))) + break; + if (req == PRU_SENDOOB) { + error = (tpcb->tp_xpd_service == 0) ? + EOPNOTSUPP : tp_sendoob(tpcb, so, m, outflags); + break; + } + if (m == 0) + break; + if (m->m_flags & M_EOR) { + eotsdu = 1; + m->m_flags &= ~M_EOR; + } + if (eotsdu == 0 && m->m_pkthdr.len == 0) + break; + if (tpcb->tp_state != TP_AKWAIT && tpcb->tp_state != TP_OPEN) { + error = ENOTCONN; + break; + } + /* + * The protocol machine copies mbuf chains, + * prepends headers, assigns seq numbers, and + * puts the packets on the device. + * When they are acked they are removed from the socket buf. + * + * sosend calls this up until sbspace goes negative. + * Sbspace may be made negative by appending this mbuf chain, + * possibly by a whole cluster. 
+ */ + { + /* + * Could have eotsdu and no data.(presently MUST have + * an mbuf though, even if its length == 0) + */ + int totlen = m->m_pkthdr.len; + struct sockbuf *sb = &so->so_snd; + IFPERF(tpcb) + PStat(tpcb, Nb_from_sess) += totlen; + tpmeas(tpcb->tp_lref, TPtime_from_session, 0, 0, + PStat(tpcb, Nb_from_sess), totlen); + ENDPERF + IFDEBUG(D_SYSCALL) + printf( + "PRU_SEND: eot %d before sbappend 0x%x len 0x%x to sb @ 0x%x\n", + eotsdu, m, totlen, sb); + dump_mbuf(sb->sb_mb, "so_snd.sb_mb"); + dump_mbuf(m, "m : to be added"); + ENDDEBUG + tp_packetize(tpcb, m, eotsdu); + IFDEBUG(D_SYSCALL) + printf("PRU_SEND: eot %d after sbappend 0x%x\n", eotsdu, m); + dump_mbuf(sb->sb_mb, "so_snd.sb_mb"); + ENDDEBUG + if (tpcb->tp_state == TP_OPEN) + error = DoEvent(T_DATA_req); + IFDEBUG(D_SYSCALL) + printf("PRU_SEND: after driver error 0x%x \n",error); + printf("so_snd 0x%x cc 0t%d mbcnt 0t%d\n", + sb, sb->sb_cc, sb->sb_mbcnt); + dump_mbuf(sb->sb_mb, "so_snd.sb_mb after driver"); + ENDDEBUG + } + break; + + case PRU_SOCKADDR: + (tpcb->tp_nlproto->nlp_getnetaddr)(tpcb->tp_npcb, nam, TP_LOCAL); + break; + + case PRU_PEERADDR: + (tpcb->tp_nlproto->nlp_getnetaddr)(tpcb->tp_npcb, nam, TP_FOREIGN); + break; + + case PRU_CONTROL: + error = EOPNOTSUPP; + break; + + case PRU_PROTOSEND: + case PRU_PROTORCV: + case PRU_SENSE: + case PRU_SLOWTIMO: + case PRU_FASTTIMO: + error = EOPNOTSUPP; + break; + + default: +#ifdef ARGO_DEBUG + printf("tp_usrreq UNKNOWN PRU %d\n", req); +#endif /* ARGO_DEBUG */ + error = EOPNOTSUPP; + } + + IFDEBUG(D_REQUEST) + printf("%s, so 0x%x, tpcb 0x%x, error %d, state %d\n", + "returning from tp_usrreq", so, tpcb, error, + tpcb ? tpcb->tp_state : 0); + ENDDEBUG + IFTRACE(D_REQUEST) + tptraceTPCB(TPPTusrreq, "END req so m state [", req, so, m, + tpcb ? 
tpcb->tp_state : 0); + ENDTRACE + if (controlp) { + m_freem(controlp); + printf("control data unexpectedly retained in tp_usrreq()"); + } + splx(s); + return error; +} +tp_ltrace(so, uio) +struct socket *so; +struct uio *uio; +{ + IFTRACE(D_DATA) + register struct tp_pcb *tpcb = sototpcb(so); + if (tpcb) { + tptraceTPCB(TPPTmisc, "sosend so resid iovcnt", so, + uio->uio_resid, uio->uio_iovcnt, 0); + } + ENDTRACE +} + +tp_confirm(tpcb) +register struct tp_pcb *tpcb; +{ + struct tp_event E; + if (tpcb->tp_state == TP_CONFIRMING) + return DoEvent(T_ACPT_req); + printf("Tp confirm called when not confirming; tpcb 0x%x, state 0x%x\n", + tpcb, tpcb->tp_state); + return 0; +} + +/* + * Process control data sent with sendmsg() + */ +tp_snd_control(m, so, data) + struct mbuf *m; + struct socket *so; + register struct mbuf **data; +{ + register struct cmsghdr *ch; + int error = 0; + + if (m && m->m_len) { + ch = mtod(m, struct cmsghdr *); + m->m_len -= sizeof (*ch); + m->m_data += sizeof (*ch); + error = tp_ctloutput(PRCO_SETOPT, + so, ch->cmsg_level, ch->cmsg_type, &m); + if (ch->cmsg_type == TPOPT_DISC_DATA) { + if (data && *data) { + m_freem(*data); + *data = 0; + } + error = tp_usrreq(so, PRU_DISCONNECT, (struct mbuf *)0, + (caddr_t)0, (struct mbuf *)0); + } + } + if (m) + m_freem(m); + return error; +} diff --git a/sys/netiso/tuba_subr.c b/sys/netiso/tuba_subr.c new file mode 100644 index 00000000000..d346927255c --- /dev/null +++ b/sys/netiso/tuba_subr.c @@ -0,0 +1,348 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tuba_subr.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static struct sockaddr_iso null_siso = { sizeof(null_siso), AF_ISO, }; +extern int tuba_table_size, tcp_keepidle, tcp_keepintvl, tcp_maxidle; +extern int tcppcbcachemiss, tcppredack, tcppreddat, tcprexmtthresh; +extern struct tcpiphdr tcp_saveti; +struct inpcb tuba_inpcb; +struct inpcb *tuba_last_inpcb = &tuba_inpcb; +struct isopcb tuba_isopcb; +/* + * Tuba initialization + */ +tuba_init() +{ +#define TUBAHDRSIZE (3 /*LLC*/ + 9 /*CLNP Fixed*/ + 42 /*Addresses*/ \ + + 6 /*CLNP Segment*/ + 20 /*TCP*/) + + tuba_inpcb.inp_next = tuba_inpcb.inp_prev = &tuba_inpcb; + tuba_isopcb.isop_next = tuba_isopcb.isop_prev = &tuba_isopcb; + tuba_isopcb.isop_faddr = &tuba_isopcb.isop_sfaddr; + tuba_isopcb.isop_laddr = &tuba_isopcb.isop_sladdr; + if (max_protohdr < TUBAHDRSIZE) + max_protohdr = TUBAHDRSIZE; + if (max_linkhdr + TUBAHDRSIZE > MHLEN) + panic("tuba_init"); +} + +struct addr_arg { + int error; + int offset; + u_long sum; +}; + +/* + * Calculate contribution to fudge factor for TCP checksum, + * and coincidentally set pointer for convenience of clnp_output + * if we are responding when there is no isopcb around. + */ +static void +tuba_getaddr(arg, siso, index) + register struct addr_arg *arg; + struct sockaddr_iso **siso; + u_long index; +{ + register struct tuba_cache *tc; + if (index <= tuba_table_size && (tc = tuba_table[index])) { + if (siso) + *siso = &tc->tc_siso; + arg->sum += (arg->offset & 1 ?
tc->tc_ssum : tc->tc_sum) + + (0xffff ^ index); + arg->offset += tc->tc_siso.siso_nlen + 1; + } else + arg->error = 1; +} +/* Send TCP segment m over CLNP: adjust ti_sum for the cached CLNP address checksums, drop the struct ip-sized prefix from the mbuf, and pass it to clnp_output(); frees m and returns EADDRNOTAVAIL when an address index has no cache entry. */ +tuba_output(m, tp) + register struct mbuf *m; + struct tcpcb *tp; +{ + register struct tcpiphdr *n; + struct isopcb *isop; + struct addr_arg arg; + + if (tp == 0 || (n = tp->t_template) == 0 || + (isop = (struct isopcb *)tp->t_tuba_pcb) == 0) { + isop = &tuba_isopcb; + n = mtod(m, struct tcpiphdr *); + arg.error = arg.sum = arg.offset = 0; + tuba_getaddr(&arg, &tuba_isopcb.isop_faddr, n->ti_dst.s_addr); + tuba_getaddr(&arg, &tuba_isopcb.isop_laddr, n->ti_src.s_addr); + REDUCE(arg.sum, arg.sum); + goto adjust; + } + if (n->ti_sum == 0) { + arg.error = arg.sum = arg.offset = 0; + tuba_getaddr(&arg, (struct sockaddr_iso **)0, n->ti_dst.s_addr); + tuba_getaddr(&arg, (struct sockaddr_iso **)0, n->ti_src.s_addr); + REDUCE(arg.sum, arg.sum); + n->ti_sum = arg.sum; + n = mtod(m, struct tcpiphdr *); + adjust: + if (arg.error) { + m_freem(m); + return (EADDRNOTAVAIL); + } + REDUCE(n->ti_sum, n->ti_sum + (0xffff ^ arg.sum)); + } + m->m_len -= sizeof (struct ip); + m->m_pkthdr.len -= sizeof (struct ip); + m->m_data += sizeof (struct ip); + return (clnp_output(m, isop, m->m_pkthdr.len, 0)); +} +/* Add (delta == 1) or drop (any other delta) one reference on the cache entries for isop's foreign and local addresses; isop_tuba_cached keeps either step from being applied twice in a row. */ +tuba_refcnt(isop, delta) + struct isopcb *isop; +{ + register struct tuba_cache *tc; + unsigned index, sum; + + if (delta != 1) + delta = -1; + if (isop == 0 || isop->isop_faddr == 0 || isop->isop_laddr == 0 || + (delta == -1 && isop->isop_tuba_cached == 0) || + (delta == 1 && isop->isop_tuba_cached != 0)) + return; + isop->isop_tuba_cached = (delta == 1); + if ((index = tuba_lookup(isop->isop_faddr, M_DONTWAIT)) != 0 && + (tc = tuba_table[index]) != 0 && (delta == 1 || tc->tc_refcnt > 0)) + tc->tc_refcnt += delta; + if ((index = tuba_lookup(isop->isop_laddr, M_DONTWAIT)) != 0 && + (tc = tuba_table[index]) != 0 && (delta == 1 || tc->tc_refcnt > 0)) + tc->tc_refcnt += delta; +} +/* Release isop's cache references, clear its socket pointer and call iso_pcbdetach(); a null isop is ignored. */ +tuba_pcbdetach(isop) + struct isopcb *isop; +{ + if (isop == 0) + return; +
tuba_refcnt(isop, -1); + isop->isop_socket = 0; + iso_pcbdetach(isop); +} + +/* + * Avoid in_pcbconnect in faked out tcp_input() + */ +tuba_pcbconnect(inp, nam) + register struct inpcb *inp; + struct mbuf *nam; +{ + register struct sockaddr_iso *siso; + struct sockaddr_in *sin = mtod(nam, struct sockaddr_in *); + struct tcpcb *tp = intotcpcb(inp); + struct isopcb *isop = (struct isopcb *)tp->t_tuba_pcb; + int error; + + /* hardwire iso_pcbbind() here */ + siso = isop->isop_laddr = &isop->isop_sladdr; + *siso = tuba_table[inp->inp_laddr.s_addr]->tc_siso; + siso->siso_tlen = sizeof(inp->inp_lport); + bcopy((caddr_t)&inp->inp_lport, TSEL(siso), sizeof(inp->inp_lport)); + + /* hardwire in_pcbconnect() here without assigning route */ + inp->inp_fport = sin->sin_port; + inp->inp_faddr = sin->sin_addr; + + /* reuse nam argument to call iso_pcbconnect() */ + nam->m_len = sizeof(*siso); + siso = mtod(nam, struct sockaddr_iso *); + *siso = tuba_table[inp->inp_faddr.s_addr]->tc_siso; + siso->siso_tlen = sizeof(inp->inp_fport); + bcopy((caddr_t)&inp->inp_fport, TSEL(siso), sizeof(inp->inp_fport)); + + if ((error = iso_pcbconnect(isop, nam)) == 0) + tuba_refcnt(isop, 1); + return (error); +} + +/* + * CALLED FROM: + * clnp's input routine, indirectly through the protosw. + * FUNCTION and ARGUMENTS: + * Take a packet (m) from clnp, strip off the clnp header + * and do tcp input processing. + * No return value.
+ */ +tuba_tcpinput(m, src, dst) + register struct mbuf *m; + struct sockaddr_iso *src, *dst; +{ + unsigned long sum, lindex, findex; + register struct tcpiphdr *ti; + register struct inpcb *inp; + caddr_t optp = NULL; + int optlen; + int len, tlen, off; + register struct tcpcb *tp = 0; + int tiflags; + struct socket *so; + int todrop, acked, ourfinisacked, needoutput = 0; + short ostate; + struct in_addr laddr; + int dropsocket = 0, iss = 0; + u_long tiwin, ts_val, ts_ecr; + int ts_present = 0; + + if ((m->m_flags & M_PKTHDR) == 0) + panic("tuba_tcpinput"); + /* + * Do some housekeeping looking up CLNP addresses. + * If we are out of space might as well drop the packet now. + */ + tcpstat.tcps_rcvtotal++; + lindex = tuba_lookup(dst, M_DONTWAIT); + findex = tuba_lookup(src, M_DONTWAIT); + if (lindex == 0 || findex == 0) + goto drop; + /* + * CLNP gave us an mbuf chain WITH the clnp header pulled up, + * but the data pointer pushed past it. + */ + len = m->m_len; + tlen = m->m_pkthdr.len; + m->m_data -= sizeof(struct ip); + m->m_len += sizeof(struct ip); + m->m_pkthdr.len += sizeof(struct ip); + m->m_flags &= ~(M_MCAST|M_BCAST); /* XXX should do this in clnp_input */ + /* + * The reassembly code assumes it will be overwriting a useless + * part of the packet, which is why we need to have it point + * into the packet itself. + * + * Check to see if the data is properly aligned + * so that we can save copying the tcp header. + * This code knows way too much about the structure of mbufs! + */ + off = ((sizeof (long) - 1) & ((m->m_flags & M_EXT) ?
+ (m->m_data - m->m_ext.ext_buf) : (m->m_data - m->m_pktdat))); + if (off || len < sizeof(struct tcphdr)) { + struct mbuf *m0 = m; + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == 0) { + m = m0; + goto drop; + } + m->m_next = m0; + m->m_data += max_linkhdr; + m->m_pkthdr = m0->m_pkthdr; + m->m_flags = m0->m_flags & M_COPYFLAGS; + if (len < sizeof(struct tcphdr)) { + m->m_len = 0; + if ((m = m_pullup(m, sizeof(struct tcpiphdr))) == 0) { + tcpstat.tcps_rcvshort++; + return; + } + } else { + bcopy(mtod(m0, caddr_t) + sizeof(struct ip), + mtod(m, caddr_t) + sizeof(struct ip), + sizeof(struct tcphdr)); + m0->m_len -= sizeof(struct tcpiphdr); + m0->m_data += sizeof(struct tcpiphdr); + m->m_len = sizeof(struct tcpiphdr); + } + } + /* + * Calculate checksum of extended TCP header and data, + * replacing what would have been IP addresses by + * the IP checksum of the CLNP addresses. + */ + ti = mtod(m, struct tcpiphdr *); + ti->ti_dst.s_addr = tuba_table[lindex]->tc_sum; + if (dst->siso_nlen & 1) + ti->ti_src.s_addr = tuba_table[findex]->tc_sum; + else + ti->ti_src.s_addr = tuba_table[findex]->tc_ssum; + ti->ti_prev = ti->ti_next = 0; + ti->ti_x1 = 0; ti->ti_pr = ISOPROTO_TCP; + ti->ti_len = htons((u_short)tlen); + if (ti->ti_sum = in_cksum(m, m->m_pkthdr.len)) { + tcpstat.tcps_rcvbadsum++; + goto drop; + } + ti->ti_src.s_addr = findex; + ti->ti_dst.s_addr = lindex; + /* + * Now include the rest of TCP input + */ +#define TUBA_INCLUDE +#define in_pcbconnect tuba_pcbconnect +#define tcb tuba_inpcb +#define tcp_last_inpcb tuba_last_inpcb + +#include +} + +#define tcp_slowtimo tuba_slowtimo +#define tcp_fasttimo tuba_fasttimo + +#include diff --git a/sys/netiso/tuba_table.c b/sys/netiso/tuba_table.c new file mode 100644 index 00000000000..a1bf5f98de0 --- /dev/null +++ b/sys/netiso/tuba_table.c @@ -0,0 +1,143 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tuba_table.c 8.2 (Berkeley) 11/15/93 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +int tuba_table_size; +struct tuba_cache **tuba_table; +struct radix_node_head *tuba_tree; +extern int arpt_keep, arpt_prune; /* use same values as arp cache */ +/* Periodic pruning pass, run at splnet: reschedules itself arpt_prune * hz ticks ahead, then frees every entry with a zero tc_refcnt whose tc_time is more than arpt_keep seconds old. */ +void +tuba_timer() +{ + int s = splnet(); + int i; + register struct tuba_cache *tc; + long timelimit = time.tv_sec - arpt_keep; + + timeout(tuba_timer, (caddr_t)0, arpt_prune * hz); + for (i = tuba_table_size; i > 0; i--) + if ((tc = tuba_table[i]) && (tc->tc_refcnt == 0) && + (tc->tc_time < timelimit)) { + tuba_table[i] = 0; + rn_delete(&tc->tc_siso.siso_addr, NULL, tuba_tree); + free((caddr_t)tc, M_RTABLE); + } + splx(s); +} +/* Create the radix tree that indexes the cache and schedule the first tuba_timer() run. */ +tuba_table_init() +{ + rn_inithead((void **)&tuba_tree, 40); + timeout(tuba_timer, (caddr_t)0, arpt_prune * hz); +} +/* Return the tuba_table[] index cached for siso, creating the entry (and growing the table if no slot is free) on a miss; wait is passed to malloc(). Returns 0 when no entry can be made, in which case nothing is left behind in the radix tree. */ +int +tuba_lookup(siso, wait) + register struct sockaddr_iso *siso; +{ + struct radix_node *rn, *rn_match(); + register struct tuba_cache *tc; + struct tuba_cache **new; + int dupentry = 0, sum_a = 0, sum_b = 0, old_size, i; + + if ((rn = rn_match((caddr_t)&siso->siso_addr, tuba_tree->rnh_treetop)) + && ((rn->rn_flags & RNF_ROOT) == 0)) { + tc = (struct tuba_cache *)rn; + tc->tc_time = time.tv_sec; + return (tc->tc_index); + } + if ((tc = (struct tuba_cache *)malloc(sizeof(*tc), M_RTABLE, wait)) + == NULL) + return (0); + bzero((caddr_t)tc, sizeof (*tc)); + bcopy(siso->siso_data, tc->tc_siso.siso_data, + tc->tc_siso.siso_nlen = siso->siso_nlen); + rn_insert(&tc->tc_siso.siso_addr, tuba_tree, &dupentry, tc->tc_nodes); + if (dupentry) + panic("tuba_lookup 1"); + tc->tc_siso.siso_family = AF_ISO; + tc->tc_siso.siso_len = sizeof(tc->tc_siso); + tc->tc_time = time.tv_sec; + for (i = sum_a = tc->tc_siso.siso_nlen; --i >= 0; ) + (i & 1 ?
sum_a : sum_b) += (u_char)tc->tc_siso.siso_data[i]; + REDUCE(tc->tc_sum, (sum_a << 8) + sum_b); + HTONS(tc->tc_sum); + SWAB(tc->tc_ssum, tc->tc_sum); + for (i = tuba_table_size; i > 0; i--) + if (tuba_table[i] == 0) + goto fixup; + old_size = tuba_table_size; + if (tuba_table_size == 0) + tuba_table_size = 15; + if (tuba_table_size > 0x7fff) { /* table is full and may not grow: undo the tree insert so tc is not stranded with index 0 and matched by every later lookup */ + rn_delete(&tc->tc_siso.siso_addr, NULL, tuba_tree); free((caddr_t)tc, M_RTABLE); return (0); } + tuba_table_size = 1 + 2 * tuba_table_size; + i = (tuba_table_size + 1) * sizeof(tc); + new = (struct tuba_cache **)malloc((unsigned)i, M_RTABLE, wait); + if (new == 0) { + tuba_table_size = old_size; + rn_delete(&tc->tc_siso.siso_addr, NULL, tuba_tree); + free((caddr_t)tc, M_RTABLE); + return (0); + } + bzero((caddr_t)new, (unsigned)i); + if (tuba_table) { + bcopy((caddr_t)tuba_table, (caddr_t)new, i >> 1); + free((caddr_t)tuba_table, M_RTABLE); + } + tuba_table = new; + i = tuba_table_size; +fixup: + tuba_table[i] = tc; + tc->tc_index = i; + return (tc->tc_index); +} diff --git a/sys/netiso/tuba_table.h b/sys/netiso/tuba_table.h new file mode 100644 index 00000000000..6be8afaf523 --- /dev/null +++ b/sys/netiso/tuba_table.h @@ -0,0 +1,59 @@ +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4.
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tuba_table.h 8.1 (Berkeley) 6/10/93 + */ +/* One cached CLNP address; tuba_table[] holds a pointer to each entry and tuba_tree indexes them by address. */ +struct tuba_cache { + struct radix_node tc_nodes[2]; /* convenient lookup */ + int tc_refcnt; /* adjusted by tuba_refcnt(); tuba_timer() only frees entries whose count is 0 */ + int tc_time; /* last looked up */ + int tc_flags; +#define TCF_PERM 1 + int tc_index; /* this entry's slot in tuba_table[] */ + u_short tc_sum; /* cksum of nsap inc. length */ + u_short tc_ssum; /* swab(tc_sum) */ + struct sockaddr_iso tc_siso; /* for responding */ +}; +/* REDUCE(a, b): a = sum of the first two u_short words of (long)b, reduced by ADDCARRY, with a result of 0 replaced by 0xffff (presumably assumes a 32-bit long -- TODO confirm). SWAB(a, b): a = b with its two bytes exchanged. Both expand to bare { } blocks and evaluate their arguments textually. */ +#define ADDCARRY(x) (x >= 65535 ?
x -= 65535 : x) +#define REDUCE(a, b) { union { u_short s[2]; long l;} l_util; long x; \ + l_util.l = (b); x = l_util.s[0] + l_util.s[1]; ADDCARRY(x); \ + if (x == 0) x = 0xffff; a = x;} +#define SWAB(a, b) { union { u_char c[2]; u_short s;} s; u_char t; \ + s.s = (b); t = s.c[0]; s.c[0] = s.c[1]; s.c[1] = t; a = s.s;} + +#ifdef KERNEL +extern int tuba_table_size; +extern struct tuba_cache **tuba_table; +extern struct radix_node_head *tuba_tree; +#endif diff --git a/sys/netiso/tuba_usrreq.c b/sys/netiso/tuba_usrreq.c new file mode 100644 index 00000000000..2d9211707a4 --- /dev/null +++ b/sys/netiso/tuba_usrreq.c @@ -0,0 +1,312 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tuba_usrreq.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +/* + * TCP protocol interface to socket abstraction. + */ +extern char *tcpstates[]; +extern struct inpcb tuba_inpcb; +extern struct isopcb tuba_isopcb; + +/* + * Process a TCP user request for TCP tb. If this is a send request + * then m is the mbuf chain of send data. If this is a timer expiration + * (called from the software clock routine), then timertype tells which timer. + */ +/*ARGSUSED*/ +tuba_usrreq(so, req, m, nam, control) + struct socket *so; + int req; + struct mbuf *m, *nam, *control; +{ + register struct inpcb *inp; + register struct isopcb *isop; + register struct tcpcb *tp; + int s; + int error = 0; + int ostate; + struct sockaddr_iso *siso; + + if (req == PRU_CONTROL) + return (iso_control(so, (int)m, (caddr_t)nam, + (struct ifnet *)control)); + + s = splnet(); + inp = sotoinpcb(so); + /* + * When a TCP is attached to a socket, then there will be + * a (struct inpcb) pointed at by the socket, and this + * structure will point at a subsidary (struct tcpcb). 
+ */ + if (inp == 0 && req != PRU_ATTACH) { + splx(s); + return (EINVAL); /* XXX */ + } + if (inp) { + tp = intotcpcb(inp); + if (tp == 0) + panic("tuba_usrreq"); + ostate = tp->t_state; + isop = (struct isopcb *)tp->t_tuba_pcb; + if (isop == 0) + panic("tuba_usrreq 2"); + } else + ostate = 0; + switch (req) { + + /* + * TCP attaches to socket via PRU_ATTACH, reserving space, + * and an internet control block. We also need to + * allocate an isopcb and separate the control block from + * tcp/ip ones. + */ + case PRU_ATTACH: + if (error = iso_pcballoc(so, &tuba_isopcb)) + break; + isop = (struct isopcb *)so->so_pcb; + so->so_pcb = 0; + if (error = tcp_usrreq(so, req, m, nam, control)) { + isop->isop_socket = 0; + iso_pcbdetach(isop); + } else { + inp = sotoinpcb(so); + remque(inp); + insque(inp, &tuba_inpcb); + inp->inp_head = &tuba_inpcb; + tp = intotcpcb(inp); + if (tp == 0) + panic("tuba_usrreq 3"); + tp->t_tuba_pcb = (caddr_t) isop; + } + goto notrace; + + /* + * PRU_DETACH detaches the TCP protocol from the socket. + * If the protocol state is non-embryonic, then can't + * do this directly: have to initiate a PRU_DISCONNECT, + * which may finish later; embryonic TCB's can just + * be discarded here. + */ + case PRU_DETACH: + if (tp->t_state > TCPS_LISTEN) + tp = tcp_disconnect(tp); + else + tp = tcp_close(tp); + if (tp == 0) + tuba_pcbdetach(isop); + break; + + /* + * Give the socket an address. + */ + case PRU_BIND: + siso = mtod(nam, struct sockaddr_iso *); + if (siso->siso_tlen && siso->siso_tlen != 2) { + error = EINVAL; + break; + } + if ((error = iso_pcbbind(isop, nam)) || + (siso = isop->isop_laddr) == 0) + break; + bcopy(TSEL(siso), &inp->inp_lport, 2); + if (siso->siso_nlen && + !(inp->inp_laddr.s_addr = tuba_lookup(siso, M_WAITOK))) + error = ENOBUFS; + break; + + /* + * Prepare to accept connections. 
+ */ + case PRU_CONNECT: + case PRU_LISTEN: + if (inp->inp_lport == 0 && + (error = iso_pcbbind(isop, (struct mbuf *)0))) + break; + bcopy(TSEL(isop->isop_laddr), &inp->inp_lport, 2); + if (req == PRU_LISTEN) { + tp->t_state = TCPS_LISTEN; + break; + } + /*FALLTHROUGH*/ + /* + * Initiate connection to peer. + * Create a template for use in transmissions on this connection. + * Enter SYN_SENT state, and mark socket as connecting. + * Start keep-alive timer, and seed output sequence space. + * Send initial segment on connection. + */ + /* case PRU_CONNECT: */ + if (error = iso_pcbconnect(isop, nam)) + break; + if ((siso = isop->isop_laddr) && siso->siso_nlen > 1) + siso->siso_data[siso->siso_nlen - 1] = ISOPROTO_TCP; + else + panic("tuba_usrreq: connect"); + siso = mtod(nam, struct sockaddr_iso *); + if (!(inp->inp_faddr.s_addr = tuba_lookup(siso, M_WAITOK))) { + unconnect: + iso_pcbdisconnect(isop); + error = ENOBUFS; + break; + } + bcopy(TSEL(isop->isop_faddr), &inp->inp_fport, 2); + if (inp->inp_laddr.s_addr == 0 && + (inp->inp_laddr.s_addr = + tuba_lookup(isop->isop_laddr, M_WAITOK)) == 0) + goto unconnect; + if ((tp->t_template = tcp_template(tp)) == 0) + goto unconnect; + soisconnecting(so); + tcpstat.tcps_connattempt++; + tp->t_state = TCPS_SYN_SENT; + tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT; + tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2; + tcp_sendseqinit(tp); + error = tcp_output(tp); + tuba_refcnt(isop, 1); + break; + + /* + * Initiate disconnect from peer. + * If connection never passed embryonic stage, just drop; + * else if don't need to let data drain, then can just drop anyways, + * else have to begin TCP shutdown process: mark socket disconnecting, + * drain unread data, state switch to reflect user close, and + * send segment (e.g. FIN) to peer. Socket will be really disconnected + * when peer sends FIN and acks ours. + * + * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 
+ */ + case PRU_DISCONNECT: + if ((tp = tcp_disconnect(tp)) == 0) + tuba_pcbdetach(isop); + break; + + /* + * Accept a connection. Essentially all the work is + * done at higher levels; just return the address + * of the peer, storing through addr. + */ + case PRU_ACCEPT: + bcopy((caddr_t)isop->isop_faddr, mtod(nam, caddr_t), + nam->m_len = isop->isop_faddr->siso_len); + break; + + /* + * Mark the connection as being incapable of further output. + */ + case PRU_SHUTDOWN: + socantsendmore(so); + tp = tcp_usrclosed(tp); + if (tp) + error = tcp_output(tp); + else + tuba_pcbdetach(isop); + break; + /* + * Abort the TCP. + */ + case PRU_ABORT: + if ((tp = tcp_drop(tp, ECONNABORTED)) == 0) + tuba_pcbdetach(isop); + break; + + + case PRU_SOCKADDR: + if (isop->isop_laddr) + bcopy((caddr_t)isop->isop_laddr, mtod(nam, caddr_t), + nam->m_len = isop->isop_laddr->siso_len); + break; + + case PRU_PEERADDR: + if (isop->isop_faddr) + bcopy((caddr_t)isop->isop_faddr, mtod(nam, caddr_t), + nam->m_len = isop->isop_faddr->siso_len); + break; + + default: + error = tcp_usrreq(so, req, m, nam, control); + goto notrace; + } + if (tp && (so->so_options & SO_DEBUG)) + tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, req); +notrace: + splx(s); + return(error); +} + +tuba_ctloutput(op, so, level, optname, mp) + int op; + struct socket *so; + int level, optname; + struct mbuf **mp; +{ + int clnp_ctloutput(), tcp_ctloutput(); + + return ((level != IPPROTO_TCP ? 
clnp_ctloutput : tcp_ctloutput) + (op, so, level, optname, mp)); +} diff --git a/sys/netiso/xebec/Makefile b/sys/netiso/xebec/Makefile new file mode 100644 index 00000000000..fa05f9cc47e --- /dev/null +++ b/sys/netiso/xebec/Makefile @@ -0,0 +1,8 @@ +# @(#)Makefile 5.16 (Berkeley) 4/26/91 + +PROG= xebec +SRCS= llparse.c llscan.c main.c malloc.c procs.c putdriver.c sets.c xebec.c +CFLAGS+= -DDEBUG -traditional +NOMAN = noman + +.include diff --git a/sys/netiso/xebec/debug.h b/sys/netiso/xebec/debug.h new file mode 100644 index 00000000000..2e3f16794d6 --- /dev/null +++ b/sys/netiso/xebec/debug.h @@ -0,0 +1,22 @@ +/* $Header: debug.h,v 2.1 88/09/19 12:56:16 nhall Exp $ */ +/* $Source: /var/home/tadl/src/argo/xebec/RCS/debug.h,v $ */ + +#define OUT stdout + +extern int debug[128]; + +#ifdef DEBUG +extern int column; + +#define IFDEBUG(letter) \ + if(debug['letter']) { +#define ENDDEBUG ; (void) fflush(stdout);} + +#else + +#define STAR * +#define IFDEBUG(letter) //*beginning of comment*/STAR +#define ENDDEBUG STAR/*end of comment*// + +#endif DEBUG + diff --git a/sys/netiso/xebec/llparse.c b/sys/netiso/xebec/llparse.c new file mode 100644 index 00000000000..fee7a9f7e47 --- /dev/null +++ b/sys/netiso/xebec/llparse.c @@ -0,0 +1,366 @@ +/* $Header: llparse.c,v 2.2 88/09/19 12:54:59 nhall Exp $ */ +/* $Source: /var/home/tadl/src/argo/xebec/RCS/llparse.c,v $ */ +/* + * ************************* NOTICE ******************************* + * This code is in the public domain. It cannot be copyrighted. + * This ll parser was originally written by Keith Thompson for the + * University of Wisconsin Crystal project. + * It was based on an FMQ lr parser written by Jon Mauney at the + * University of Wisconsin. + * It was subsequently modified very slightly by Nancy Hall at the + * University of Wisconsin for the Crystal project. 
+ * **************************************************************** + */ +#include "xebec.h" +#include "llparse.h" +#include "main.h" +#include + +#include "debug.h" + +#define LLMINACTION -LLINF + +short llparsestack[STACKSIZE]; +short llstackptr = 0; +LLtoken lltoken; + +llparse() +{ + register havetoken = FALSE; + register sym; + register LLtoken *t = &lltoken; + register parseaction; + register accepted = FALSE; + + llpushprod(llnprods-1); /* $$$ ::= */ + + do { + sym = llparsestack[llstackptr]; + IFDEBUG(L) + printf("llparse() top of loop, llstackptr=%d, sym=%d\n", + llstackptr, sym); + ENDDEBUG + + if(sym < 0) { + /* action symbol */ + if(sym <= LLMINACTION) { + for(;sym<=LLMINACTION;sym++) { + llaction(1, t); /* calls llfinprod */ + } + llstackptr--; + continue; + } else { llaction(-sym, t); + llstackptr--; + continue; + } + } + + if(sym < llnterms) { + + /* it's a terminal symbol */ + + if(!havetoken) { + llgettoken(t); + havetoken = TRUE; + } + + if(sym == t->llterm) { + llpushattr(t->llattrib); + llaccept(t); + llstackptr--; /* pop terminal */ + if(t->llterm == llnterms-1) { /* end symbol $$$ */ + accepted = TRUE; + } else { + havetoken = FALSE; + } + } else { + llparsererror(t); /* wrong terminal on input */ + havetoken = FALSE; + } + continue; + } + + /* non terminal */ + + if(!havetoken) { + llgettoken(t); + havetoken = TRUE; + } + + /* consult parse table for new production */ + parseaction = llfindaction(sym, t->llterm); + + if(parseaction == 0) { + /* error entry */ + llparsererror(t); + havetoken = FALSE; + continue; + } + + if(llepsilon[parseaction]) { + /* epsilon production */ + if(llepsilonok(t->llterm)) { + llstackptr--; /* pop nonterminal */ + llpushprod(parseaction); /* push rhs of production */ + } else { + llparsererror(t); + havetoken = FALSE; + } + } else { + llstackptr--; /* pop nonterminal */ + llpushprod(parseaction); /* push rhs of production */ + } + } while(!accepted); + + return(0); +} + +llpushprod(prod) /* recognize production 
prod - push rhs on stack */ +short prod; +{ + register start; + register length; + register count; + + start = llprodindex[prod].llprodstart; + length = llprodindex[prod].llprodlength; + + IFDEBUG(L) + printf("llpushprod(%d) llstackptr=0x%x(%d), length = 0x%x(%d)\n", + prod, llstackptr, llstackptr, length , length); + /* + dump_parse_stack(); + */ + ENDDEBUG + if(llstackptr+length >= STACKSIZE) { + fprintf(stderr,"Parse stack overflow. llstackptr=0x%x, length=0x%x\n", + llstackptr, length); + Exit(-1); + } + + + llsetattr(llprodindex[prod].llprodtlen); + + /* put a marker on the stack to mark beginning of production */ + if(llparsestack[llstackptr] <= LLMINACTION) { + (llparsestack[llstackptr]) --; /* if there's already one there, don't + put another on; just let it represent all of + the adjacent markers */ + } + else { + llstackptr++; + llparsestack[llstackptr] = LLMINACTION; + } + + for(count=0; count STACKSIZE) { + fprintf(stderr, "PARSE STACK OVERFLOW! \n"); Exit(-1); + Exit(-1); + } +} + + +llepsilonok(term) +{ + register ptr; + register sym; + register pact; + register nomore; + register rval; + + IFDEBUG(L) + printf("llepsilonok() enter\n"); + ENDDEBUG + rval = TRUE; + + ptr = llstackptr; + + do { + sym = llparsestack[ptr]; + + if(sym < 0) { + ptr--; + nomore = ptr == 0; + continue; + } + + if(sym < llnterms) { + nomore = TRUE; + rval = sym == term; + continue; + } + + pact = llfindaction(sym, term); + + if(pact == 0) { + nomore = TRUE; + rval = FALSE; + continue; + } + + if(llepsilon[pact] == TRUE) { + ptr--; + nomore = ptr == 0; + } + else { + nomore = TRUE; + } + + } while(!nomore); + + return(rval); +} + + +short llfindaction(sym, term) +{ + register index; + + IFDEBUG(L) + printf("llfindaction(sym=%d, term=%d) enter \n", sym, term); + ENDDEBUG + index = llparseindex[sym]; + + while(llparsetable[index].llterm != 0) { + if(llparsetable[index].llterm == term) { + return(llparsetable[index].llprod); + } + index++; + } + return(0); +} + + 
+llparsererror(token) +LLtoken *token; +{ + IFDEBUG(L) + fprintf(stderr,"llparsererror() enter\n"); + prt_token(token); + ENDDEBUG + + fprintf(stderr, "Syntax error: "); + prt_token(token); + dump_buffer(); + Exit(-1); +} + + +llgettoken(token) +LLtoken *token; +{ + llscan(token); + token->llstate = NORMAL; + IFDEBUG(L) + printf("llgettoken(): "); + prt_token(token); + ENDDEBUG +} + + +/****************************************************************************** + + Attribute support routines + +******************************************************************************/ +/* +** attribute stack +** +** AttrStack = stack of record +** values : array of values; +** ptr : index; +** end; +** +*/ + +LLattrib llattributes[LLMAXATTR]; +int llattrtop = 0; + +struct llattr llattrdesc[LLMAXDESC]; + +int lldescindex = 1; + + +llsetattr(n) +{ + register struct llattr *ptr; + + IFDEBUG(L) + printf("llsetattr(%d) enter\n",n); + ENDDEBUG + if(lldescindex >= LLMAXDESC) { + fprintf(stdout, "llattribute stack overflow: desc\n"); + fprintf(stdout, + "lldescindex=0x%x, llattrtop=0x%x\n",lldescindex, llattrtop); + Exit(-1); + } + ptr = &llattrdesc[lldescindex]; + ptr->llabase = &llattributes[llattrtop]; + ptr->lloldtop = ++llattrtop; + ptr->llaindex = 1; + ptr->llacnt = n+1; /* the lhs ALWAYS uses an attr; it remains on the + stack when the production is recognized */ + lldescindex++; +} + +llpushattr(attr) +LLattrib attr; +{ + struct llattr *a; + + IFDEBUG(L) + printf("llpushattr() enter\n"); + ENDDEBUG + if(llattrtop + 1 > LLMAXATTR) { + fprintf(stderr, "ATTRIBUTE STACK OVERFLOW!\n"); + Exit(-1); + } + a = &llattrdesc[lldescindex-1]; + llattributes[llattrtop++] = attr; + a->llaindex++; /* inc count of attrs on the stack for this prod */ +} + +llfinprod() +{ + IFDEBUG(L) + printf("llfinprod() enter\n"); + ENDDEBUG + lldescindex--; + llattrtop = llattrdesc[lldescindex].lloldtop; + llattrdesc[lldescindex-1].llaindex++; /* lhs-of-prod.attr stays on + the stack; it is now one of the 
rhs attrs of the now-top production + on the stack */ +} + +#ifndef LINT +#ifdef DEBUG +dump_parse_stack() +{ + int ind; + + printf("PARSE STACK:\n"); + for(ind=llstackptr; ind>=0; ind--) { + printf("%d\t%d\t%s\n", + ind, llparsestack[ind], + llparsestack[ind]<0? "Action symbol" : llstrings[llparsestack[ind]]); + } +} + +#endif DEBUG +#endif LINT + +prt_token(t) +LLtoken *t; +{ + fprintf(stdout, "t at 0x%x\n", t); + fprintf(stdout, "t->llterm=0x%x\n", t->llterm); (void) fflush(stdout); + fprintf(stdout, "TOK: %s\n", llstrings[t->llterm]); + (void) fflush(stdout); +#ifdef LINT + /* to make lint shut up */ + fprintf(stdout, "", llnterms, llnsyms, llnprods, llinfinite); +#endif LINT +} diff --git a/sys/netiso/xebec/llparse.h b/sys/netiso/xebec/llparse.h new file mode 100644 index 00000000000..1b6133b1b7b --- /dev/null +++ b/sys/netiso/xebec/llparse.h @@ -0,0 +1,145 @@ +/* $Header: llparse.h,v 2.1 88/09/19 12:56:20 nhall Exp $ */ +/* $Source: /var/home/tadl/src/argo/xebec/RCS/llparse.h,v $ */ + + /************************************************************ + attributes stack garbage + ************************************************************/ + +#define LLMAXATTR 512 +#define LLMAXDESC 256 +#define LLATTR /* build an attribute stack */ + + /* + ** attribute stack + ** + ** AttrStack = stack of record + ** values : array of values; + ** ptr : index; + ** end; + ** + */ + + typedef union llattrib LLattrib; + + extern LLattrib llattributes[LLMAXATTR]; + extern int llattrtop; + + extern struct llattr { + LLattrib *llabase; /* ptr into the attr stack (llattributes) */ + int llaindex;/* # attrs on the stack so far for this prod */ + int llacnt;/* total # ever to go on for this prod */ + + int lloldtop;/* when popping this prod, restore stack to here ; + one attr will remain on the stack (for the lhs) */ + } llattrdesc[LLMAXDESC]; + + extern int lldescindex; + + /************************************************************ + attributes stack garbage + 
************************************************************/ + + extern struct lltoken { + short llterm; /* token number */ + short llstate; /* inserted deleted normal */ + LLattrib llattrib; + } lltoken; + typedef struct lltoken LLtoken; + +/************************************************************ + constants used in llparse.c +************************************************************/ + +#define STACKSIZE 500 +#define MAXCORR 16 + +#define NORMAL 0 +#define DELETE 1 +#define INSERT 2 + +/************************************************************ + datatypes used to communicate with the parser +************************************************************/ + +struct llinsert { + short llinscost; + short llinslength; + short llinsert[MAXCORR]; +}; +typedef struct llinsert LLinsert; + +extern short llparsestack[]; +extern short llstackptr; +extern short llinfinite; + +/************************************************************ + variables used to pass information + specific to each grammer +************************************************************/ + +extern short llnterms; +extern short llnsyms; +extern short llnprods; + +extern char *llefile; + +extern struct llparsetable { + short llterm; + short llprod; +} llparsetable[]; + +extern short llparseindex[]; + +extern short llepsilon[]; + +extern short llproductions[]; + +extern struct llprodindex { + short llprodstart; + short llprodlength; + short llprodtlen; +} llprodindex[]; + +extern struct llcosts { + short llinsert; + short lldelete; +} llcosts[]; + +extern struct llstable { + short llsstart; + short llslength; +} llstable[]; + +extern short llsspace[]; + +extern struct lletable { + short llecost; + short llelength; + short llestart; +} lletable[]; + +extern long lleindex[]; + +extern short llespace[]; + +extern char *llstrings[]; + +/************************************************************ + routines defined in llparse.c +************************************************************/ + 
+extern llparse(); +extern llcopye(); +extern llcopys(); +extern llcorrector(); +extern llepsilonok(); +extern llexpand(); +extern short llfindaction(); +extern llgetprefix(); +extern llgettoken(); +extern llinsert(); +extern llinsertsym(); +extern llinserttokens(); +extern llparsererror(); +extern llpushprod(); +extern llreadetab(); diff --git a/sys/netiso/xebec/llscan.c b/sys/netiso/xebec/llscan.c new file mode 100644 index 00000000000..ffdb9a92a60 --- /dev/null +++ b/sys/netiso/xebec/llscan.c @@ -0,0 +1,430 @@ +/* $Header: llscan.c,v 2.2 88/09/19 12:55:06 nhall Exp $ */ +/* $Source: /var/home/tadl/src/argo/xebec/RCS/llscan.c,v $ */ +/* + * ************************* NOTICE ******************************* + * This code is in the public domain. It cannot be copyrighted. + * This scanner was originally written by Keith Thompson for the + * University of Wisconsin Crystal project. + * It was subsequently modified significantly by Nancy Hall at the + * University of Wisconsin for the ARGO project. + * **************************************************************** + */ +#include "xebec.h" +#include "llparse.h" + +#include "main.h" +#include +#include "procs.h" +#include "debug.h" + +#define EOFILE 0x01 +#define UNUSED 0x02 +#define IGNORE 0x04 +#define OPCHAR 0x8 +#define DIGITS 0x10 +#define LETTER 0x20 + +int chtype[128] = { +/* null, soh ^a, stx ^b etx ^c eot ^d enq ^e ack ^f bel ^g */ + EOFILE, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, +/* bs ^h ht ^i lf ^j vt ^k ff ^l cr ^m so ^n si ^o */ + UNUSED, IGNORE, IGNORE, UNUSED, IGNORE, IGNORE, UNUSED, UNUSED, +/* dle ^p dc1 ^q dc2 ^r dc3 ^s dc4 ^t nak ^u syn ^v etb ^w */ + UNUSED, UNUSED, UNUSED, UNUSED, EOFILE, UNUSED, UNUSED, UNUSED, +/* can ^x em ^y sub ^z esc ^] fs ^\ gs ^} rs ^` us ^/ */ + UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, + +/* ! " # $ % & ' */ + IGNORE, UNUSED, OPCHAR, UNUSED, OPCHAR, UNUSED, OPCHAR, OPCHAR, +/* ( ) * + , - . 
/ */ + OPCHAR, OPCHAR, OPCHAR, OPCHAR, OPCHAR, OPCHAR, OPCHAR, OPCHAR, +/* 0 1 2 3 4 5 6 7 */ + DIGITS, DIGITS, DIGITS, DIGITS, DIGITS, DIGITS, DIGITS, DIGITS, +/* 8 9 : ; < = > ? */ + DIGITS, DIGITS, OPCHAR, OPCHAR, OPCHAR, OPCHAR, OPCHAR, OPCHAR, + +/* @ A B C D E F G */ + UNUSED, LETTER, LETTER, LETTER, LETTER, LETTER, LETTER, LETTER, +/* H I J K L M N O */ + LETTER, LETTER, LETTER, LETTER, LETTER, LETTER, LETTER, LETTER, +/* P Q R S T U V W */ + LETTER, LETTER, LETTER, LETTER, LETTER, LETTER, LETTER, LETTER, +/* X Y Z [ \ ] ^ _ */ + LETTER, LETTER, LETTER, OPCHAR, UNUSED, OPCHAR, OPCHAR, LETTER, + +/* ` a b c d e f g */ + UNUSED, LETTER, LETTER, LETTER, LETTER, LETTER, LETTER, LETTER, +/* h i j k l m n o */ + LETTER, LETTER, LETTER, LETTER, LETTER, LETTER, LETTER, LETTER, +/* p q r s t u v w */ + LETTER, LETTER, LETTER, LETTER, LETTER, LETTER, LETTER, LETTER, +/* x y z { | } ~ del */ + LETTER, LETTER, LETTER, OPCHAR, UNUSED, OPCHAR, UNUSED, UNUSED +}; + + +extern FILE *astringfile; +static char *buffptr; +static char buffer[2][LINELEN]; +static int currentbuf = 1; + +#define addbuf(x) *buffptr++ = x + +static int ch = ' '; + +skip() +{ + while((chtype[ch] == IGNORE) ) { + ch = getch(); + } +} + +llaccept(t) +LLtoken *t; +{ + switch(t->llstate) { + case NORMAL: + break; + case INSERT: + fprintf(stderr,"Insert %s\n", llstrings[t->llterm]); + break; + case DELETE: + fprintf(stderr,"Delete %s\n", llstrings[t->llterm]); + break; + } +} + +#define TVAL (t->llattrib) + + +dump_buffer() +{ + register int i; + for(i=0; i<20; i++) + (void) fputc(buffer[currentbuf][i], stderr); + (void) fputc('\n', stderr); + (void) fflush(stderr); +} + +int iskey(c, buf) +char *c; +char **buf; +{ + register int i; + static struct { char *key_word; int term_type; } keys[] = { + { "SAME", T_SAME }, + { "DEFAULT", T_DEFAULT }, + { "NULLACTION", T_NULLACTION }, + { "STRUCT", T_STRUCT }, + { "SYNONYM", T_SYNONYM }, + { "TRANSITIONS", T_TRANSITIONS }, + { "STATES", T_STATES }, + { "EVENTS", 
T_EVENTS }, + { "PCB", T_PCB }, + { "INCLUDE", T_INCLUDE }, + { "PROTOCOL", T_PROTOCOL }, + { 0, 0}, + }; + + for (i = 0; keys[i].key_word ; i++) { + if( !strcmp(c, (*buf = keys[i].key_word) ) ) { + return ( keys[i].term_type ); + } + } + *buf = (char *)0; + return(0); +} + +getstr(o,c) + /* c is the string delimiter + * allow the delimiter to be escaped + * the messy part: translate $ID to + * e->ev_union.ID + * where ID is an event with a non-zero obj_struc + * need we check for the field??? + */ +char o,c; +{ + register int nested = 1; + register int allow_nesting = (o==c)?-1:1; + + IFDEBUG(S) + fprintf(stdout,"getstr: ch=%c, delimiters %c %c\n", + ch,o, c); + fprintf(stdout,"getstr: buffptr 0x%x, currentbuf 0x%x\n", + buffptr, currentbuf); + ENDDEBUG + + if( ch == c ) nested--; + while(nested) { + if(ch == '\0') { + fprintf(stderr, + "Eof inside of a string, delims= %c,%c, nesting %d",c,o, nested); + Exit(-1); + /* notreached */ + } else if(ch == '$') { + /* might be an attribute */ + IFDEBUG(S) + fprintf(stdout,"getstr: atttribute?\n"); + ENDDEBUG + + /* assume it's an event */ + /* addbuf is a macro so this isn't as bad as + * it looks + * add "e->ev_union." 
+ */ + if( (ch = getch()) == '$' ) { + addbuf('e'); addbuf('-'); addbuf('>'); + addbuf('e'); addbuf('v'); addbuf('_'); + addbuf('u'); addbuf('n'); addbuf('i'); + addbuf('o'); addbuf('n'); + addbuf('.'); + AddCurrentEventName(& buffptr); + } else { + char *obufp = buffptr; + + do { + addbuf(ch); + ch = getch(); + } while(chtype[ch] & LETTER); + addbuf('\0'); + if( !strncmp(obufp, synonyms[PCB_SYN], + strlen(synonyms[PCB_SYN]) )) { + buffptr = obufp; + addbuf('p'); + } else if( !strncmp(obufp, synonyms[EVENT_SYN], + strlen(synonyms[EVENT_SYN]))) { + buffptr = obufp; + addbuf('e'); + } else { + fprintf(stderr, "Unknown synonym %s\n", obufp); + Exit(-1); + } + if(ch == '.') { + addbuf('-'); addbuf('>'); + } else { + /* needs to be checked for nesting */ + goto check; + } + } + /* end of attribute handling */ + goto skip; + } else if(ch == '\\') { + /* possible escape - this is kludgy beyond belief: + * \ is used to escape open and closing delimiters + * and '$' + * otherwise it's passed through to be compiled by C + */ + ch = getch(); + if( (ch != o ) && (ch != c) && (ch != '$') ) { + /* may need to handle case where \ is last char in file... 
*/ + /* don't treat is as escape; not open or close so + * don't have to worry about nesting either + */ + addbuf('\\'); + } + } + addbuf(ch); + skip: + ch = getch(); + check: + if( ch == o ) nested += allow_nesting; + else if( ch == c ) nested--; + if ( (buffptr - buffer[currentbuf]) > LINELEN) { + fprintf(stderr, + "%s too long.\n", (o=='{')?"Action":"Predicate"); /*}*/ + fprintf(stderr, + "buffptr, currentbuf 0x%x, 0x%x\n",buffptr,currentbuf ); + Exit(-1); + } + IFDEBUG(S) + fprintf(stdout,"loop in getstr: ch 0x%x,%c o=%c,c=%c nested=%d\n", + ch,ch,o,c,nested); + ENDDEBUG + } + addbuf(ch); + addbuf('\0'); + + IFDEBUG(S) + fprintf(stdout,"exit getstr: got %s\n", buffer[currentbuf]); + fprintf(stdout,"exit getstr: buffptr 0x%x, currentbuf 0x%x\n", + buffptr, currentbuf); + ENDDEBUG +} + +getch() +{ + char c; + extern FILE *infile; + extern int lineno; + + c = fgetc(infile) ; + if (c == '\n') lineno++; + if ((int)c == EOF) c = (char)0; + if (feof(infile)) c = (char) 0; + IFDEBUG(e) + fprintf(stdout, "getch: 0x%x\n", c); + (void) fputc( c, stdout); + fflush(stdout); + ENDDEBUG + + return c; +} + +llscan(t) +LLtoken *t; +{ + char c; + + t->llstate = NORMAL; + + ++currentbuf; + currentbuf&=1; +again: + buffptr = &buffer[currentbuf][0]; + + skip(); + + switch(chtype[ch]) { + + case EOFILE: + t->llterm = T_ENDMARKER; + break; + + case UNUSED: + fprintf(stderr, "Illegal character in input - 0x%x ignored.", ch); + ch = getch(); + goto again; + + case OPCHAR: + + switch(ch) { + + case '/': + /* possible comment : elide ; kludge */ + IFDEBUG(S) + fprintf(stdout, "Comment ch=%c\n", ch); + ENDDEBUG + c = getch(); + if (c != '*') { + fprintf(stderr,"Syntax error : character(0x%x) ignored", ch); + ch = c; + goto again; + } else { + register int state = 2, whatchar=0; + static int dfa[3][3] = { + /* done seen-star middle */ + /* star */ { 0, 1, 1 }, + /* / */ { 0, 0, 2 }, + /* other */ { 0, 2, 2 } + }; + + while( state ) { + if( (c = getch()) == (char)0) + break; + whatchar = 
(c=='*')?0:(c=='/'?1:2); + IFDEBUG(S) + fprintf(stdout, + "comment: whatchar = %d, c = 0x%x,%c, oldstate=%d", + whatchar, c,c, state); + ENDDEBUG + state = dfa[whatchar][state]; + IFDEBUG(S) + fprintf(stdout, ", newstate=%d\n", state); + ENDDEBUG + } + if(state) { + fprintf(stderr, + "Syntax error: end of file inside a comment"); + Exit(-1); + } else ch = getch(); + } + IFDEBUG(S) + fprintf(stdout, "end of comment at 0x%x,%c\n",ch,ch); + ENDDEBUG + goto again; + + + case '*': + t->llterm = T_STAR; + break; + + case ',': + t->llterm = T_COMMA; + break; + + case ';': + t->llterm = T_SEMI; + break; + + case '<': + t->llterm = T_LANGLE; + break; + + case '=': + t->llterm = T_EQUAL; + break; + + case '[': + t->llterm = T_LBRACK; + break; + + case ']': + t->llterm = T_RBRACK; + break; + +#ifdef T_FSTRING + case '"': + t->llterm = T_FSTRING; + addbuf(ch); + ch = getch(); + getstr('"', '"'); + TVAL.FSTRING.address = stash(buffer[currentbuf]); + break; +#endif T_FSTRING + + case '(': + t->llterm = T_PREDICATE; + getstr(ch, ')' ); + TVAL.PREDICATE.address = buffer[currentbuf]; + break; + + case '{': + t->llterm = T_ACTION; + getstr(ch, '}'); + TVAL.ACTION.address = buffer[currentbuf]; + break; + + default: + fprintf(stderr,"Syntax error : character(0x%x) ignored", ch); + ch = getch(); + goto again; + + } + ch = getch(); + break; + + case LETTER: + do { + addbuf(ch); + ch = getch(); + } while(chtype[ch] & (LETTER | DIGITS)); + + addbuf('\0'); + + t->llterm = iskey(buffer[currentbuf], &TVAL.ID.address); + if(!t->llterm) { + t->llterm = T_ID; + TVAL.ID.address = buffer[currentbuf]; + } + IFDEBUG(S) + fprintf(stdout, "llscan: id or keyword 0x%x, %s\n", + TVAL.ID.address, TVAL.ID.address); + ENDDEBUG + break; + + default: + fprintf(stderr, "Snark in llscan: chtype=0x%x, ch=0x%x\n", + chtype[ch], ch); + } +} diff --git a/sys/netiso/xebec/main.c b/sys/netiso/xebec/main.c new file mode 100644 index 00000000000..a0b4842f30c --- /dev/null +++ b/sys/netiso/xebec/main.c @@ -0,0 +1,410 
@@ +/* $Header: main.c,v 2.4 88/09/19 12:55:13 nhall Exp $ */ +/* $Source: /var/home/tadl/src/argo/xebec/RCS/main.c,v $ */ +/* + * TODO: + * rewrite the command line stuff altogether - it's kludged beyond + * belief (as is the rest of the code...) + * + * DISCLAIMER DISCLAIMER DISCLAIMER + * This code is such a kludge that I don't want to put my name on it. + * It was a ridiculously fast hack and needs rewriting. + * However it does work... + */ + +#include +#include +#include "malloc.h" +#include "debug.h" +#include "main.h" + +int debug[128]; + +int lineno = 1; + +FILE *statefile, *actfile, *eventfile_h, *statevalfile; +FILE *infile, *astringfile; +char *Transfilename; +char *astringfile_name = DEBUGFILE; +char *actfile_name = ACTFILE; +char *statefile_name = STATEFILE; +char *statevalfile_name = STATEVALFILE; +char *eventfile_h_name = EVENTFILE_H; +int print_trans = 0; +int print_protoerrs = 0; +int pgoption = 0; +char kerneldirname[50] = "\0"; + +char protocol[50]; + +char *synonyms[] = { + "EVENT", + "PCB", + 0 +}; + +usage(a) +char *a; +{ + fprintf(stderr, + "usage: %s {-D} \n", + a); + fprintf(stderr, "\t is any combination of:\n"); + fprintf(stderr, "\t\t-A\n"); + fprintf(stderr, "\t\t-E\n"); + fprintf(stderr, "\t\t-S\n"); + fprintf(stderr, "\t\t-I\n"); + fprintf(stderr, "\t\t-X\n"); + fprintf(stderr, "\t\t-K\n"); + fprintf(stderr, + "\tThese names do NOT include the suffices (.c, .h)\n"); + fprintf(stderr, + "\t\t-D to turn on debug options for xebec itself\n"); + fprintf(stderr, "\t- for levels of debugging output\n"); + fprintf(stderr, "\t\t ranges from 1 to 3, 1 is default(everything)\n"); + fprintf(stderr, "\t\t-T to print transitions\n"); + fprintf(stderr, "\t\t-e to print list of combinations of\n"); + fprintf(stderr, "\t\t\t [event,old_state] that produce protocol errors\n"); + fprintf(stderr, "\t\t-g include profiling code in driver\n"); + Exit(-1); +} + +openfiles(proto) +register char *proto; +{ + register char *junk; + register int lenp = 
strlen(proto); + + IFDEBUG(b) + fprintf(OUT, "openfiles %s\n",proto); + ENDDEBUG + +#define HEADER Header +#define SOURCE Source +#define DOIT(X)\ + /* GAG */\ + junk = Malloc( 2 + lenp + strlen(X/**/_name) );\ + (void) sprintf(junk, "%s_", proto);\ + X/**/_name = strcat(junk, X/**/_name);\ + X = fopen(X/**/_name, "w");\ + if((X)==(FILE *)0)\ + { fprintf(stderr,"Open failed: %s\n", "X"); Exit(-1); }\ + fprintf(X, "/* %cHeader%c */\n",'$', '$' );\ + fprintf(X, "/* %cSource%c */\n",'$', '$' ); + + DOIT(eventfile_h); + + IFDEBUG(X) +#ifdef DEBUG + DOIT(astringfile); +#endif DEBUG + fprintf(astringfile, + "#ifndef _NFILE\n#include \n#endif _NFILE\n" ); + ENDDEBUG + + DOIT(statevalfile); + DOIT(statefile); + DOIT(actfile); + fprintf(actfile, + "#ifndef lint\nstatic char *rcsid = \"$Header/**/$\";\n#endif lint\n"); + + if(pgoption) + putdriver(actfile, 15); + else + putdriver(actfile, 14); + + FakeFilename(actfile, Transfilename, lineno); + putdriver(actfile, 1); + FakeFilename(actfile, Transfilename, lineno); + putdriver(actfile, 12); + fprintf(actfile, "#include \"%s%s\"\n", kerneldirname, statevalfile_name); + FakeFilename(actfile, Transfilename, lineno); + putdriver(actfile, 2); + + initsets(eventfile_h, statefile); +} + +includecode(file, f) +FILE *file; +register char *f; +{ + register int count=1; + static char o='{'; + static char c='}'; + register char *g; + + IFDEBUG(a) + fprintf(stdout, "including: %s, f=0x%x", f,f); + ENDDEBUG + g = ++f; + while(count>0) { + if(*g == o) count++; + if(*g == c) count--; + g++; + } + *(--g) = '\0'; + IFDEBUG(a) + fprintf(stdout, "derived: %s", f); + ENDDEBUG + fprintf(file, "%s", f); + FakeFilename(file, Transfilename, lineno); +} + +putincludes() +{ + FakeFilename(actfile, Transfilename, lineno); + fprintf(actfile, "\n#include \"%s%s\"\n", kerneldirname, eventfile_h_name); + IFDEBUG(X) + if( !debug['K'] ) + fprintf(actfile, "\n#include \"%s\"\n", astringfile_name); + /* not in kernel mode */ + ENDDEBUG + FakeFilename(actfile, 
Transfilename, lineno); +} + +/* + * main: argv[1] names the transition file to read; the remaining + * arguments are options. Parses the options, fixes up kerneldirname, + * runs llparse() and then finishes writing the generated files. + */ +main(argc, argv) +int argc; +char *argv[]; +{ + register int i = 2; + extern char *strcpy(); + int start, finish; + extern int FirstEventAttribute; + extern int Nevents, Nstates; + + start = time(0); + if(argc < 2) { + usage(argv[0]); + } + IFDEBUG(a) + fprintf(stdout, "infile = %s\n",argv[1]); + ENDDEBUG + Transfilename = argv[1]; + infile = fopen(argv[1], "r"); + if(infile == (FILE *)0) { + /* the scanner reads infile unconditionally; fail here instead */ + fprintf(stderr, "Open failed: %s\n", argv[1]); + exit(-1); + } + + if(argc > 2) while(i < argc) { + register int j=0; + char c; + char *name; + + if(argv[i][j] == '-') j++; + switch(c = argv[i][j]) { + + /* GROT */ + case 'A': + name = &argv[i][++j]; + actfile_name = Malloc( strlen(name)+4); + actfile_name = (char *)strcpy(actfile_name,name); +#ifdef LINT + name = +#endif LINT + strcat(actfile_name, ".c"); + fprintf(stdout, "debugging file is %s\n",actfile_name); + break; + case 'K': + debug[c]=1; + fprintf(OUT, "option %c file %s\n",c, &argv[i][j+1]); + /* kerneldirname must later hold "../" + name + "/" + NUL */ + if(strlen(&argv[i][j+1]) + 5 > sizeof(kerneldirname)) { + fprintf(OUT, "K option: dir name too long!\n"); + exit(-1); + } + (void) strcpy(kerneldirname,&argv[i][++j]); + break; + case 'X': + debug[c]=1; + name = &argv[i][++j]; + astringfile_name = Malloc( strlen(name)+4); + astringfile_name = (char *)strcpy(astringfile_name,name); +#ifdef LINT + name = +#endif LINT + strcat(astringfile_name, ".c"); + fprintf(OUT, "option %c, astringfile name %s\n",c, name); + break; + case 'E': + name = &argv[i][++j]; + eventfile_h_name = Malloc( strlen(name)+4); + eventfile_h_name = (char *)strcpy(eventfile_h_name,name); +#ifdef LINT + name = +#endif LINT + strcat(eventfile_h_name, ".h"); + fprintf(stdout, "event files is %s\n",eventfile_h_name); + break; + case 'I': + name = &argv[i][++j]; + /* ".init" is five characters plus the NUL */ + statevalfile_name = Malloc( strlen(name)+6 ); + statevalfile_name = (char *)strcpy(statevalfile_name,name); +#ifdef LINT + name = +#endif LINT + strcat(statevalfile_name, ".init"); + fprintf(stdout, "state table initial values file is %s\n",statevalfile_name); + break; + case 'S': + name = &argv[i][++j]; + statefile_name = Malloc( strlen(name)+4); + statefile_name = (char *)strcpy(statefile_name,name); +#ifdef LINT + name = +#endif LINT 
+ strcat(statefile_name, ".h"); + fprintf(stdout, "state file is %s\n",statefile_name); + break; + /* END GROT */ + case '1': + case '2': + case '3': + debug['X']= (int)argv[i][j] - (int) '0'; + fprintf(OUT, "value of debug['X'] is 0x%x,%d\n", debug['X'], + debug['X']); + break; + case 'D': + while( c = argv[i][++j] ) { + if(c == 'X') { + fprintf(OUT, "debugging on"); + if(debug['X']) fprintf(OUT, + " - overrides any -%d flags used\n", debug['X']); + } + debug[c]=1; + fprintf(OUT, "debug %c\n",c); + } + break; + case 'g': + pgoption = 1; + fprintf(stdout, "Profiling\n"); + break; + case 'e': + print_protoerrs = 1; + fprintf(stdout, "Protocol error table:\n"); + break; + + case 'T': + print_trans = 1; + fprintf(stdout, "Transitions:\n"); + break; + default: + usage(argv[0]); + break; + } + i++; + } + if(kerneldirname[0]) { + char *c; +#ifdef notdef + if(debug['X']) { + fprintf(OUT, "Option K overrides option X\n"); + debug['X'] = 0; + } +#endif notdef + if(strlen(kerneldirname)<1) { + fprintf(OUT, "K option: dir name too short!\n"); + exit(-1); + } + /* add ../name/ */ + c = (char *) Malloc(strlen(kerneldirname)+6) ; + if(c <= (char *)0) { + fprintf(OUT, "Cannot allocate %d bytes for kerneldirname\n", + (int)(strlen(kerneldirname) + 6) ); + fprintf(OUT, "kerneldirname is %s\n", kerneldirname ); + exit(-1); + } + *c = '.'; + *(c+1) = '.'; + *(c+2) = '/'; + *(c+3) = '\0'; /* strcat() needs a terminated string to append to */ + (void) strcat(c, kerneldirname); + (void) strcat(c, "/\0"); + strcpy(kerneldirname, c); + } + + init_alloc(); + + (void) llparse(); + + /* {{ */ + if( !FirstEventAttribute ) + fprintf(eventfile_h, "\t}ev_union;\n"); + fprintf(eventfile_h, "};/* end struct event */\n"); + fprintf(eventfile_h, "\n#define %s_NEVENTS 0x%x\n", protocol, Nevents); + fprintf(eventfile_h, + "\n#define ATTR(X)ev_union.%s/**/X/**/\n",EV_PREFIX); + (void) fclose(eventfile_h); + + /* {{ */ fprintf(actfile, "\t}\nreturn 0;\n}\n"); /* end switch; end action() */ + dump_predtable(actfile); + + putdriver(actfile, 3); + IFDEBUG(X) + if(!debug['K']) + 
putdriver(actfile, 4); + ENDDEBUG + putdriver(actfile, 6); + IFDEBUG(X) + /* + putdriver(actfile, 10); + */ + if(debug['K']) { + putdriver(actfile, 11); + } else { + switch(debug['X']) { + case 1: + default: + putdriver(actfile, 7); + break; + case 2: + putdriver(actfile, 13); + break; + case 3: + break; + } + } + ENDDEBUG + putdriver(actfile, 8); + (void) fclose(actfile); + IFDEBUG(X) + /* { */ + fprintf(astringfile, "};\n"); + (void) fclose(astringfile); + ENDDEBUG + + (void) fclose(statevalfile); + + fprintf(statefile, "\n#define %s_NSTATES 0x%x\n", protocol, Nstates); + (void) fclose(statefile); + + finish = time(0); + fprintf(stdout, "%d seconds\n", finish - start); + if( print_protoerrs ) + printprotoerrs(); +} + +int transno = 0; + +Exit(n) +{ + fprintf(stderr, "Error at line %d\n",lineno); + if(transno) fprintf(stderr, "Transition number %d\n",transno); + (void) fflush(stdout); + (void) fflush(statefile); + (void) fflush(eventfile_h); + (void) fflush(actfile); + exit(n); +} + +syntax() +{ + static char *synt[] = { + "*PROTOCOL \n", + "*PCB \n", + "\n} >\n", + "*STATES \n", + "*EVENTS \n", + "*TRANSITIONS \n", + }; +} + +FakeFilename(outfile, name, l) +FILE *outfile; +char *name; +int l; +{ + /* + doesn't work + fprintf(outfile, "\n\n\n\n# line %d \"%s\"\n", l, name); + */ +} diff --git a/sys/netiso/xebec/main.h b/sys/netiso/xebec/main.h new file mode 100644 index 00000000000..cb5bd74f31e --- /dev/null +++ b/sys/netiso/xebec/main.h @@ -0,0 +1,32 @@ +/* $Header: main.h,v 2.1 88/09/19 12:56:24 nhall Exp $ */ +/* $Source: /var/home/tadl/src/argo/xebec/RCS/main.h,v $ */ + +#define TRUE 1 +#define FALSE 0 +#define LINELEN 2350 + /* approx limit on token size for C compiler + * which matters for the purpose of debugging (astring.c...) 
+ */ + +#define MSIZE 4000 +#define DEBUGFILE "astring.c" +#define ACTFILE "driver.c" +#define EVENTFILE_H "events.h" +#define STATEFILE "states.h" +#define STATEVALFILE "states.init" + +#define EV_PREFIX "EV_" +#define ST_PREFIX "ST_" + +#define PCBNAME "_PCB_" + +extern char kerneldirname[]; +extern char protocol[]; +extern char *synonyms[]; +#define EVENT_SYN 0 +#define PCB_SYN 1 + +extern int transno; +extern int print_trans; +extern char *stash(); + diff --git a/sys/netiso/xebec/malloc.c b/sys/netiso/xebec/malloc.c new file mode 100644 index 00000000000..5cdfc147a70 --- /dev/null +++ b/sys/netiso/xebec/malloc.c @@ -0,0 +1,136 @@ +/* $Header: malloc.c,v 2.2 88/09/19 12:55:18 nhall Exp $ */ +/* $Source: /var/home/tadl/src/argo/xebec/RCS/malloc.c,v $ */ +/* + * This code is such a kludge that I don't want to put my name on it. + * It was a ridiculously fast hack and needs rewriting. + * However it does work... + */ + +/* + * a simple malloc + * it might be brain-damaged but for the purposes of xebec + * it's a whole lot faster than the c library malloc + */ + +#include +#include "malloc.h" +#include "debug.h" +#define CHUNKSIZE 4096*2 + +static char *hiwat, *highend; +int bytesmalloced=0; +int byteswasted = 0; + + +init_alloc() +{ +#ifdef LINT + hiwat = 0; + highend = 0; +#else LINT + extern char *sbrk(); + + hiwat = (char *) sbrk(0); + hiwat = (char *)((unsigned)(hiwat + 3) & ~0x3); + highend = hiwat; +#endif LINT +} + +HIWAT(s) +char *s; +{ + IFDEBUG(M) + fprintf(stdout, "HIWAT 0x%x %s\n", hiwat,s); + fflush(stdout); + ENDDEBUG +} + +#define MIN(x,y) ((x highend) { + c = sbrk(CHUNKSIZE); + IFDEBUG(M) + fprintf(stdout, "hiwat 0x%x, x 0x%x, highend 0x%x, c 0x%x\n", + hiwat, x, highend, c); + fflush(stdout); + ENDDEBUG + if( c == (char *) -1 ) { + fprintf(stderr, "Ran out of memory!\n"); + Exit(-1); + } + if(first_iter) { + returnvalue = c; + first_iter = 0; + } + bytesmalloced += CHUNKSIZE; + IFDEBUG(m) + if (highend != c) { + fprintf(OUT, "warning: %d wasted 
bytes!\n", highend - hiwat); + fprintf(OUT, " chunksize 0x%x, x 0x%x \n", CHUNKSIZE, x); + } + ENDDEBUG + highend = c + CHUNKSIZE; + hiwat = c; + } + c = hiwat; + if(first_iter) { + returnvalue = c; + first_iter = 0; + } + hiwat += x; + total -= x; + } + if((unsigned)hiwat & 0x3) { + byteswasted += (int)((unsigned)(hiwat) & 0x3); + hiwat = (char *)((unsigned)(hiwat + 3) & ~0x3); + } + IFDEBUG(M) + fprintf(stdout, "Malloc = 0x%x, bytesm 0x%x, wasted 0x%x, hiwat 0x%x\n", + returnvalue, bytesmalloced, byteswasted, hiwat); + ENDDEBUG + IFDEBUG(N) + fprintf(stdout, "Malloc returns 0x%x, sbrk(0) 0x%x\n", returnvalue, sbrk(0)); + fflush(stdout); + ENDDEBUG + return(returnvalue); +} + diff --git a/sys/netiso/xebec/malloc.h b/sys/netiso/xebec/malloc.h new file mode 100644 index 00000000000..53d865bf47b --- /dev/null +++ b/sys/netiso/xebec/malloc.h @@ -0,0 +1,4 @@ +/* $Header: malloc.h,v 2.1 88/09/19 12:56:27 nhall Exp $ */ +/* $Source: /var/home/tadl/src/argo/xebec/RCS/malloc.h,v $ */ + +char *Malloc(); diff --git a/sys/netiso/xebec/procs.c b/sys/netiso/xebec/procs.c new file mode 100644 index 00000000000..49d862ac5b9 --- /dev/null +++ b/sys/netiso/xebec/procs.c @@ -0,0 +1,437 @@ +/* $Header: procs.c,v 2.3 88/09/19 12:55:22 nhall Exp $ */ +/* $Source: /var/home/tadl/src/argo/xebec/RCS/procs.c,v $ */ +/* + * This code is such a kludge that I don't want to put my name on it. + * It was a ridiculously fast hack and needs rewriting. + * However it does work... 
+ */ + +#include +#include +#include "malloc.h" +#include "main.h" +#include "debug.h" +#include "sets.h" +#include "procs.h" + +struct Predicate { + int p_index; + int p_transno; + char *p_str; + struct Predicate *p_next; +}; + +struct Stateent { + int s_index; + int s_newstate; + int s_action; + struct Stateent *s_next; +}; + +struct Object *SameState = (struct Object *)-1; +int Index = 0; +int Nstates = 0; +int Nevents = 0; +struct Predicate **Predlist; +struct Stateent **Statelist; +extern FILE *astringfile; + +end_events() { + int size, part; + char *addr; + + IFDEBUG(X) + /* finish estring[], start astring[] */ + if(debug['X'] < 2 ) + fprintf(astringfile, "};\n\nchar *%s_astring[] = {\n\"NULLACTION\",\n", + protocol); + ENDDEBUG + /* NOSTRICT */ + Statelist = + (struct Stateent **) Malloc((Nstates+1) * sizeof(struct Statent *)); + /* NOSTRICT */ + Predlist = + (struct Predicate **) + Malloc ( (((Nevents)<BZSIZE?BZSIZE:size; + IFDEBUG(N) + fprintf(OUT, "bzero addr 0x%x part %d size %d\n",addr, part, size); + ENDDEBUG + bzero(addr, part); + IFDEBUG(N) + fprintf(OUT, "after bzero addr 0x%x part %d size %d\n",addr, part, size); + ENDDEBUG + addr += part; + size -= part; + + } + IFDEBUG(N) + fprintf(OUT, "endevents..done \n"); + ENDDEBUG +} + +int acttable(f,actstring) +char *actstring; +FILE *f; +{ + static Actindex = 0; + extern FILE *astringfile; + extern int pgoption; + + IFDEBUG(a) + fprintf(OUT,"acttable()\n"); + ENDDEBUG + fprintf(f, "case 0x%x: \n", ++Actindex); + + if(pgoption) { + fprintf(f, "asm(\" # dummy statement\");\n"); + fprintf(f, "asm(\"_Xebec_action_%x: \");\n", Actindex ); + fprintf(f, "asm(\".data\");\n"); + fprintf(f, "asm(\".globl _Xebec_action_%x# X profiling\");\n", + Actindex ); + fprintf(f, "asm(\".long 0 # X profiling\");\n"); + fprintf(f, "asm(\".text # X profiling\");\n"); + fprintf(f, "asm(\"cas r0,r15,r0 # X profiling\");\n"); + fprintf(f, "asm(\"bali r15,mcount # X profiling\");\n"); + } + + fprintf(f, "\t\t%s\n\t\t break;\n", 
actstring); + IFDEBUG(X) + if(debug['X']<2) { + register int len = 0; + fputc('"',astringfile); + while(*actstring) { + if( *actstring == '\n' ) { + fputc('\\', astringfile); + len++; + fputc('n', astringfile); + } else if (*actstring == '\\') { + fputc('\\', astringfile); + len ++; + fputc('\\', astringfile); + } else if (*actstring == '\"') { + fputc('\\', astringfile); + len ++; + fputc('\"', astringfile); + } else fputc(*actstring, astringfile); + actstring++; + len++; + } + fprintf(astringfile,"\",\n"); + if (len > LINELEN) { + fprintf(stderr, "Action too long: %d\n",len); Exit(-1); + } + } + ENDDEBUG + + return(Actindex); +} + +static int Npred=0, Ndefpred=0, Ntrans=0, Ndefevent=0, Nnulla=0; + +statetable(string, oldstate, newstate, action, event) +char *string; +int action; +struct Object *oldstate, *newstate, *event; +{ + register int different; + + IFDEBUG(a) + fprintf(OUT,"statetable(0x%x, 0x%x,0x%x, 0x%x)\n", + string, oldstate, newstate, action); + fprintf(OUT,"statetable(%s, %s,%s, 0x%x)\n", + string, oldstate->obj_name, newstate->obj_name, action); + ENDDEBUG + + if( !action) Nnulla++; + if( newstate->obj_kind == OBJ_SET) { + fprintf(stderr, "Newstate cannot be a set\n"); + Exit(-1); + } + different = (newstate != SameState); + + (void) predtable( oldstate, event, string, + action, (newstate->obj_number) * different ); + IFDEBUG(a) + fprintf(OUT,"EXIT statetable\n"); + ENDDEBUG +} + +stateentry(index, oldstate, newstate, action) +int index, action; +int oldstate, newstate; +{ + extern FILE *statevalfile; + + IFDEBUG(a) + fprintf(OUT,"stateentry(0x%x,0x%x,0x%x,0x%x) Statelist@0x%x, val 0x%x\n", + index, oldstate, newstate,action, &Statelist, Statelist); + ENDDEBUG + + + fprintf(statevalfile, "{0x%x,0x%x},\n", newstate, action); +} + +int predtable(os, oe, str, action, newstate) +struct Object *os, *oe; +char *str; +int action, newstate; +{ + register struct Predicate *p, **q; + register int event, state; + register struct Object *e, *s; + struct Object 
*firste; + + if (oe == (struct Object *)0 ) { + Ndefevent ++; + fprintf(stderr, "DEFAULT EVENTS aren't implemented; trans ignored\n"); + return; + } + Ntrans++; + IFDEBUG(g) + fprintf(stdout, + "PREDTAB: s %5s; e %5s\n", os->obj_kind==OBJ_SET?"SET":"item", + oe->obj_kind==OBJ_SET?"SET":"item"); + ENDDEBUG + if (os->obj_kind == OBJ_SET) s = os->obj_members; + else s = os; + if (oe->obj_kind == OBJ_SET) firste = oe->obj_members; + else firste = oe; + if(newstate) { + fprintf(statevalfile, "{0x%x,0x%x},\n",newstate, action); + Index++; + } + while (s) { + if( !newstate ) { /* !newstate --> SAME */ + /* i.e., use old obj_number */ + fprintf(statevalfile, "{0x%x,0x%x},\n",s->obj_number, action); + Index++; + } + e = firste; + while (e) { + event = e->obj_number; state = s->obj_number; + IFDEBUG(g) + fprintf(stdout,"pred table event=0x%x, state 0x%x\n", + event, state); + fflush(stdout); + ENDDEBUG + if( !str /* DEFAULT PREDICATE */) { + Ndefpred++; + IFDEBUG(g) + fprintf(stdout, + "DEFAULT pred state 0x%x, event 0x%x, Index 0x%x\n", + state, event, Index); + fflush(stdout); + ENDDEBUG + } else + Npred++; + /* put at END of list */ +#ifndef LINT + IFDEBUG(g) + fprintf(stdout, + "predicate for event 0x%x, state 0x%x is 0x%x, %s\n", + event, state, Index, str); + fflush(stdout); + ENDDEBUG +#endif LINT + for( ((q = &Predlist[(event<p_next ) { + q = &p->p_next; + } + + p = (struct Predicate *)Malloc(sizeof(struct Predicate)); + p->p_next = (struct Predicate *)0; + p->p_str = str; + p->p_index = Index; + p->p_transno = transno; + *q = p; + + IFDEBUG(g) + fprintf(stdout, + "predtable index 0x%x, transno %d, E 0x%x, S 0x%x\n", + Index, transno, e, s); + ENDDEBUG + + e = e->obj_members; + } + s = s->obj_members; + } + return Index ; +} + +printprotoerrs() +{ + register int e,s; + + fprintf(stderr, "[ Event, State ] without any transitions :\n"); + for(e = 0; e < Nevents; e++) { + fprintf(stderr, "Event 0x%x: states ", e); + for(s = 0; s < Nstates; s++) { + if( 
Predlist[(e<p_str) { + if(!hadapred) + fprintf(f, "case 0x%x:\n\t", (e<p_str, p->p_index); + } else { + if(defaultindex) { + fprintf(stderr, +"\nConflict between transitions %d and %d: duplicate default \n", + p->p_transno, defaultItrans); + Exit(-1); + } + defaultindex = p->p_index; + defaultItrans = p->p_transno; + } + p = p->p_next; + } + if( hadapred) { + fprintf(f, "return 0x%x;\n", defaultindex); + } + IFDEBUG(d) + fflush(f); + ENDDEBUG + } + IFDEBUG(g) + fprintf(stdout, + "loop: e 0x%x s 0x%x hadapred 0x%x dindex 0x%x for trans 0x%x\n", + e, s, hadapred, defaultindex, defaultItrans); + ENDDEBUG + if ( hadapred ) { + /* put a -1 in the array - Predlist is temporary storage */ + Predlist[(e<p_next ) { +#ifndef LINT + IFDEBUG(a) + fprintf(OUT, + "dump_pentry for event 0x%x, state 0x%x is 0x%x\n", + event, state, p); + ENDDEBUG +#endif LINT + q = &p->p_next; + } +} +#endif notdef diff --git a/sys/netiso/xebec/procs.h b/sys/netiso/xebec/procs.h new file mode 100644 index 00000000000..e41ae75995c --- /dev/null +++ b/sys/netiso/xebec/procs.h @@ -0,0 +1,5 @@ +/* $Header: procs.h,v 2.1 88/09/19 12:56:30 nhall Exp $ */ +/* $Source: /var/home/tadl/src/argo/xebec/RCS/procs.h,v $ */ + +extern char *stash(); +extern struct Object *SameState; diff --git a/sys/netiso/xebec/putdriver.c b/sys/netiso/xebec/putdriver.c new file mode 100644 index 00000000000..996ac643d10 --- /dev/null +++ b/sys/netiso/xebec/putdriver.c @@ -0,0 +1,244 @@ +/* $Header: putdriver.c,v 2.2 88/09/19 12:55:27 nhall Exp $ */ +/* $Source: /var/home/tadl/src/argo/xebec/RCS/putdriver.c,v $ */ + +/* + * This code is such a kludge that I don't want to put my name on it. + * It was a ridiculously fast hack and needs rewriting. + * However it does work... + */ + +/* The original idea was to put all the driver code + * in one place so it would be easy to modify + * but as hacks got thrown in it got worse and worse... 
+ * It's to the point where a user would be better off + * writing his own driver and xebec should JUST produce + * the tables. + */ + +#include +#include "main.h" +#include "debug.h" + +extern char protocol[]; +char Eventshiftstring[10]; +static char statename[] = {'_', 's', 't', 'a', 't', 'e', 0 }; + +static char *strings[] = { + +#define PART1 { 0,3 } + + "\n#include \"", + kerneldirname, + protocol, + "_states.h\"", + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + +#define PART12 { 10,12 } + "\n\nstatic struct act_ent {\n", + "\tint a_newstate;\n\tint a_action;\n", + "} statetable[] = { {0,0},\n", + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + +#define PART2 { 20,20 } + "};\n", + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + +#define PART3 { 30,41 } + "\n", + protocol, + "_driver(p, e)\nregister ", + protocol, + PCBNAME, + " *p;\nregister struct ", + protocol, + "_event *e;\n", + "{\n", + "\tregister int index, error=0;\n", + "\tstruct act_ent *a;\n", + "\tstatic struct act_ent erroraction = {0,-1};\n", + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + +#define PART4 { 50,54 } + + "\textern int ", + protocol, + "_debug;\n\textern FILE *", + protocol, + "_astringfile;\n", + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + +#define PART6 { 60, 65 } + "\n\tindex = inx[1 + e->ev_number][p->", + protocol, + statename, + "];\n\tif(index<0) index=_Xebec_index(e, p);\n", + "\tif (index==0) {\n\t\ta = &erroraction;\n", + "\t} else\n\t\ta = &statetable[index];\n\n", + (char *)0, + (char *)0, + (char *)0, + (char *)0, + +#define PART7 {70, 77 } + "\tif(", + protocol, + "_debug) fprintf(", + protocol, + "_astringfile, \"%15s <-- %15s [%15s] \\n\\t%s\\n\",\n", + "\t\tsstring[a->a_newstate], sstring[p->", + protocol, + "_state], estring[e->ev_number], 
astring[a->a_action]);\n\n", + (char *)0, + (char *)0, + +#define PART8 { 80, 84 } + "\tif(a->a_action)\n", + "\t\terror = _Xebec_action( a->a_action, e, p );\n", + "\tif(error==0)\n\tp->", + protocol, + "_state = a->a_newstate;\n\treturn error;\n}\n", + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + +#define PART9 { 90, 99 } + "\n_XEBEC_PG int _Xebec_action(a,e,p)\nint a;\nstruct ", + protocol, + "_event *e;\n", + protocol, + PCBNAME, + " *p;\n{\n", + "switch(a) {\n", + "case -1: return ", + protocol, + "_protocol_error(e,p);\n", + (char *)0, + +#define PART10 { 101, 105 } + "\tif(", + protocol, + "_debug) fprintf(", + protocol, + "_astringfile, \"index 0x%5x\\n\", index);\n", + (char *)0, + (char *)0, + (char *)0, + (char *)0, + +#define PART5 { 110, 121 } + "\n_XEBEC_PG int\n_Xebec_index( e,p )\n", + "\tstruct ", + protocol, + "_event *e;\n\t", + protocol, + PCBNAME, + " *p;\n{\nswitch( (e->ev_number<<", + Eventshiftstring, + ")+(p->", + protocol, + statename, + ") ) {\n", + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + (char *)0, + +#define PART11 {130, 137 } + "\tIFTRACE(D_DRIVER)\n", + "\t", + protocol, + "trace(DRIVERTRACE,", + "\t\ta->a_newstate, p->", + protocol, + "_state, e->ev_number, a->a_action, 0);\n\n", + "\tENDTRACE\n", + (char *)0, + (char *)0, + +#define PART13 {140, 147 } + "\tif(", + protocol, + "_debug) fprintf(", + protocol, + "_astringfile, \"%15s <-- %15s [%15s] \\n\",\n", + "\t\tsstring[a->a_newstate], sstring[p->", + protocol, + "_state], estring[e->ev_number]);\n\n", + (char *)0, + (char *)0, + +#define PART14 { 150,150 } + "#define _XEBEC_PG static\n", + +#define PART15 { 151,151 } + "#define _XEBEC_PG \n", + +}; + +static struct { int start; int finish; } parts[] = { + { 0,0 }, + PART1, + PART2, + PART3, + PART4, + PART5, + PART6, + PART7, + PART8, + PART9, + PART10, + PART11, + PART12, + PART13, + PART14, + PART15, +}; + +putdriver(f, x) +FILE *f; +int x; +{ + register int 
i; + + for( i = parts[x].start; i<= parts[x].finish; i++) + fprintf(f, "%s", strings[i]); + IFDEBUG(d) + fflush(f); + ENDDEBUG +} diff --git a/sys/netiso/xebec/sets.c b/sys/netiso/xebec/sets.c new file mode 100644 index 00000000000..3bb74ed8d29 --- /dev/null +++ b/sys/netiso/xebec/sets.c @@ -0,0 +1,472 @@ +/* $Header: sets.c,v 2.3 88/09/19 12:55:30 nhall Exp $ */ +/* $Source: /var/home/tadl/src/argo/xebec/RCS/sets.c,v $ */ +/* + * This code is such a kludge that I don't want to put my name on it. + * It was a ridiculously fast hack and needs rewriting. + * However it does work... + */ +#include "main.h" +#include "malloc.h" +#include "sets.h" +#include "debug.h" +#include + +struct Object *CurrentEvent = (struct Object *)0; +struct Object *Objtree; +struct Object dummy; +/* + * define a set w/ type and name + * return a set number + */ +#undef NULL +#define NULL (struct Object *)0 + +static FILE *Sfile, *Efile; +extern FILE *astringfile; +char *Noname = "Unnamed set\0"; + +initsets(f,s) +FILE *f, *s; +{ + static char errorstring[20]; + extern struct Object *SameState; + Efile = f; + Sfile = s; + + IFDEBUG(X) + fprintf(astringfile, "char *%s_sstring[] = {\n", protocol); + ENDDEBUG + sprintf(errorstring, "%sERROR\0", ST_PREFIX); + defineitem(STATESET, errorstring, (char *)0); /* state 0 */ + SameState = (struct Object *) Malloc( sizeof (struct Object) ); + SameState->obj_kind = OBJ_ITEM; + SameState->obj_type = STATESET; + SameState->obj_name = "SAME"; + SameState->obj_struc = (char *)0; + SameState->obj_number = 0; + SameState->obj_members = (struct Object *)0; + SameState->obj_left = (struct Object *)0; + SameState->obj_right = (struct Object *)0; + SameState->obj_parent = (struct Object *)0; +} + +/* + * get a set based on its type and name + * returns address of an Object, may be set or item + */ + +struct Object *lookup(type, name) +unsigned char type; +char *name; +{ + register struct Object *p = Objtree; + int val = 1 ; + + IFDEBUG(o) + fprintf(stdout,"lookup 
0x%x,%s \n", + type, name); + ENDDEBUG + + while( p && val ) { + IFDEBUG(o) + fprintf(OUT, "lookup strcmp 0x%x,%s, 0x%x,%s\n", + name, name, OBJ_NAME(p), OBJ_NAME(p)); + ENDDEBUG + if( p->obj_name == (char *)0 ) { + fprintf(stderr, "Unnamed set in table!\n"); + Exit(-1); + } + val = (int) strcmp(name, OBJ_NAME(p)); + if(val < 0) { + /* left */ + p = p->obj_left; + } else if (val > 0) { + /* right */ + p = p->obj_right; + } + } + if( p && ( p->obj_type != type)) { + fprintf(stdout, "lookup(0x%x,%s) found wrong obj type 0x%x\n", + type,name, p->obj_type); + p = NULL; + } + IFDEBUG(o) + fprintf(stdout,"lookup 0x%x,%s returning 0x%x\n",type, name, p); + ENDDEBUG + return(p); +} + +static int states_done = 0; + +end_states(f) +FILE *f; +{ + register unsigned n = Nstates; + register int i; + extern char Eventshiftstring[]; + + states_done = 1; + + for( i = 0; ;i++) { + if( (n >>= 1) <= 0 ) break; + } + Eventshift = i+1; + IFDEBUG(d) + fprintf(OUT, "Eventshift=%d\n", Eventshift); + ENDDEBUG + sprintf(Eventshiftstring, "%d\0",Eventshift); + fprintf(f, "struct %s_event {\n\tint ev_number;\n", &protocol[0]); + IFDEBUG(X) + /* finish sstring[] & start estring[] */ + fprintf(astringfile, + "};\n\nchar *%s_estring[] = {\n", protocol); + ENDDEBUG +} + +int FirstEventAttribute = 1; + +static +insert(o) +struct Object *o; +{ + struct Object *p = Objtree; + struct Object **q = &Objtree; + int val=1; + + + if (o->obj_name == (char *)0) { + fprintf(stderr, "Internal Error: inserting unnamed object\n"); + Exit(-1); + } + if( o->obj_type == STATESET) { + if( states_done ) { + fprintf(stderr, "No states may be defined after *TRANSITIONS\n"); + Exit(-1); + } + o->obj_number = Nstates++ ; + if(Nstates > MAXSTATES) { + fprintf(stderr, "Too many states\n"); + Exit(-1); + } + fprintf(Sfile, "#define %s 0x%x\n", o->obj_name, o->obj_number); + IFDEBUG(X) + fprintf(astringfile, "\"%s(0x%x)\",\n", o->obj_name, o->obj_number); + ENDDEBUG + } else { + /* EVENTSET */ + if( ! 
states_done ) { + fprintf(stderr, "states must precede events\n"); + Exit(-1); + } + o->obj_number = Nevents++ ; + if(Nevents > MAXEVENTS) { + fprintf(stderr, "Too many events\n"); + Exit(-1); + } + if(o->obj_struc) { + if( FirstEventAttribute ) { + fprintf(Efile, "\n\tunion{\n"); /*} */ + FirstEventAttribute = 0; + } + fprintf(Efile, + "struct %s %s%s;\n\n", o->obj_struc, EV_PREFIX, o->obj_name); + } + fprintf(Efile, "#define %s 0x%x\n", o->obj_name, o->obj_number); + IFDEBUG(X) + fprintf(astringfile, "\"%s(0x%x)\",\n", o->obj_name, o->obj_number); + ENDDEBUG + } + IFDEBUG(o) + fprintf(OUT, "insert(%s)\n", OBJ_NAME(o) ); + if(o->obj_right != NULL) { + fprintf(OUT, "insert: unclean Object right\n"); + exit(-1); + } + if(o->obj_left != NULL) { + fprintf(OUT, "insert: unclean Object left\n"); + exit(-1); + } + fflush(OUT); + ENDDEBUG + + while( val ) { + if(p == NULL) { + *q = o; + o->obj_parent = (struct Object *)q; + break; + } + if(!(val = strcmp(o->obj_name, p->obj_name)) ) { + /* equal */ + fprintf(stderr, "re-inserting %s\n",o->obj_name); + exit(-1); + } + if(val < 0) { + /* left */ + q = &p->obj_left; + p = p->obj_left; + } else { + /* right */ + q = &p->obj_right; + p = p->obj_right; + } + } + IFDEBUG(a) + dumptree(Objtree,0); + ENDDEBUG +} + +delete(o) +struct Object *o; +{ + register struct Object *p = o->obj_right; + register struct Object *q; + register struct Object *newparent; + register struct Object **np_childlink; + + IFDEBUG(T) + fprintf(stdout, "delete(0x%x)\n", o); + dumptree(Objtree,0); + ENDDEBUG + + /* q <== lowest valued node of the right subtree */ + while( p ) { + q = p; + p = p->obj_left; + } + + if (o->obj_parent == (struct Object *)&Objtree) { + newparent = (struct Object *)&Objtree; + np_childlink = (struct Object **)&Objtree; + } else if(o->obj_parent->obj_left == o) { + newparent = o->obj_parent; + np_childlink = &(o->obj_parent->obj_left); + } else { + newparent = o->obj_parent; + np_childlink = &(o->obj_parent->obj_right); + } + 
IFDEBUG(T) + fprintf(OUT, "newparent=0x%x\n"); + ENDDEBUG + + if (q) { /* q gets the left, parent gets the right */ + IFDEBUG(T) + fprintf(OUT, "delete: q null\n"); + ENDDEBUG + q->obj_left = p; + if(p) p->obj_parent = q; + p = o->obj_right; + } else { /* parent(instead of q) gets the left ; there is no right */ + IFDEBUG(T) + fprintf(OUT, "delete: q not null\n"); + ENDDEBUG + p = o->obj_left; + } + *np_childlink = p; + if(p) + p->obj_parent = newparent; + + IFDEBUG(T) + fprintf(OUT, "After deleting 0x%x\n",o); + dumptree(Objtree,0); + ENDDEBUG +} + +struct Object * +defineset(type, adr, keep) +unsigned char type; +char *adr; +int keep; +{ + struct Object *onew; + IFDEBUG(o) + printf("defineset(0x%x,%s, %s)\n", type , adr, keep?"KEEP":"NO_KEEP"); + ENDDEBUG + + onew = (struct Object *)Malloc(sizeof (struct Object)); + bzero(onew, sizeof(struct Object)); + onew->obj_name = adr; + onew->obj_kind = OBJ_SET; + onew->obj_type = type; + if(keep) + insert( onew ); + /* address already stashed before calling defineset */ + IFDEBUG(o) + printf("defineset(0x%x,%s) returning 0x%x\n", type , adr, onew); + dumptree(Objtree,0); + ENDDEBUG + return(onew); +} + +dumpit(o, s) +char *o; +char *s; +{ + register int i; + +IFDEBUG(o) + fprintf(OUT, "object 0x%x, %s\n",o, s); + for(i=0; i< sizeof(struct Object); i+=4) { + fprintf(OUT, "0x%x: 0x%x 0x%x 0x%x 0x%x\n", + *((int *)o), *o, *(o+1), *(o+2), *(o+3) ); + } +ENDDEBUG +} + +defineitem(type, adr, struc) +unsigned char type; +char *adr; +char *struc; +{ + struct Object *onew; + IFDEBUG(o) + printf("defineitem(0x%x, %s at 0x%x, %s)\n", type, adr, adr, struc); + ENDDEBUG + + if( onew = lookup( type, adr ) ) { + fprintf(stderr, + "Internal error at defineitem: trying to redefine obj type 0x%x, adr %s\n", + type, adr); + exit(-1); + } else { + onew = (struct Object *)Malloc(sizeof (struct Object)); + bzero(onew, sizeof(struct Object)); + onew->obj_name = stash(adr); + onew->obj_kind = OBJ_ITEM; + onew->obj_type = type; + onew->obj_struc 
= struc?stash(struc):struc; + insert( onew ); + } + IFDEBUG(o) + fprintf(OUT, "defineitem(0x%x, %s) returning 0x%x\n", type, adr, onew); + ENDDEBUG +} + +member(o, adr) +struct Object *o; +char *adr; +{ + struct Object *onew, *oold; + IFDEBUG(o) + printf("member(0x%x, %s)\n", o, adr); + ENDDEBUG + + oold = lookup( o->obj_type, adr ); + + onew = (struct Object *)Malloc(sizeof (struct Object)); + if( oold == NULL ) { + extern int lineno; + + fprintf(stderr, + "Warning at line %d: set definition of %s causes definition of\n", + lineno, OBJ_NAME(o)); + fprintf(stderr, "\t (previously undefined) member %s\n", adr); + bzero(onew, sizeof(struct Object)); + onew->obj_name = stash(adr); + onew->obj_kind = OBJ_ITEM; + onew->obj_type = o->obj_type; + onew->obj_members = NULL; + insert( onew ); + } else { + if(oold->obj_kind != OBJ_ITEM) { + fprintf(stderr, "Sets cannot be members of sets; %s\n", adr); + exit(-1); + } + bcopy(oold, onew, sizeof(struct Object)); + onew->obj_members = onew->obj_left = onew->obj_right = NULL; + } + onew->obj_members = o->obj_members; + o->obj_members = onew; +} + +struct Object *Lookup(type, name) +unsigned char type; +char *name; +{ + register struct Object *o = lookup(type,name); + + if(o == NULL) { + fprintf(stderr, "Trying to use undefined %s: %s\n", + type==STATESET?"state":"event", name); + Exit(-1); + } + return(o); +} + +AddCurrentEventName(x) +register char **x; +{ + register char *n = EV_PREFIX; ; + + if( CurrentEvent == (struct Object *)0 ) { + fprintf(stderr, "No event named! BARF!\n"); Exit(-1); + } + + if( ! 
CurrentEvent->obj_struc ) { + fprintf(stderr, "No attributes for current event!\n"); Exit(-1); + } + + /* add prefix first */ + while(*n) { + *(*x)++ = *n++; + } + + n = CurrentEvent->obj_name; + + while(*n) { + *(*x)++ = *n++; + } +} + +dumptree(o,i) + register struct Object *o; + int i; +{ + register int j; + + if(o == NULL) { + for(j=0; jobj_left, i+1); + for(j=0; jobj_right, i+1); + } +} + +dump(c,a) +{ + register int x = 8; + int zero = 0; +#include + + fprintf(stderr, "dump: c 0x%x, a 0x%x\n",c,a); + + x = x/zero; + kill(0, SIGQUIT); +} + +dump_trans( pred, oldstate, newstate, action, event ) +struct Object *oldstate, *newstate, *event; +char *pred, *action; +{ + extern int transno; + struct Object *o; + + fprintf(stdout, "\n%d: ", transno); +#define dumpit(x)\ + if((x)->obj_kind == OBJ_SET) {\ + o = (x)->obj_members; fprintf( stdout, "[ " );\ + while(o) { fprintf(stdout, "%s ", o->obj_name); o = o->obj_members; }\ + fprintf( stdout, " ] ");\ + } else { fprintf(stdout, "%s ", (x)->obj_name); } + + dumpit(newstate); + fprintf(stdout, " <== "); + dumpit(oldstate); + dumpit(event); + fprintf(stdout, "\n\t\t%s\n\t\t%s\n", pred?pred:"DEFAULT", + action); +} diff --git a/sys/netiso/xebec/sets.h b/sys/netiso/xebec/sets.h new file mode 100644 index 00000000000..96eb791edc2 --- /dev/null +++ b/sys/netiso/xebec/sets.h @@ -0,0 +1,36 @@ +/* $Header: sets.h,v 2.1 88/09/19 12:56:33 nhall Exp $ */ +/* $Source: /var/home/tadl/src/argo/xebec/RCS/sets.h,v $ */ + +#define MAXEVENTS 200 +#define MAXSTATES 200 + +#define STATESET 10 +#define EVENTSET 5 + +#define OBJ_ITEM 2 +#define OBJ_SET 3 + +struct Object { + unsigned char obj_kind; + unsigned char obj_type; /* state or event */ + char *obj_name; + char *obj_struc; + int obj_number; + struct Object *obj_members; /* must be null for kind==item */ + /* for the tree */ + struct Object *obj_left; + struct Object *obj_right; + struct Object *obj_parent; +} ; + +extern char *Noname; + +#define OBJ_NAME(o) 
(((o)->obj_name)?(o)->obj_name:Noname) + +extern int Nevents, Nstates; +int Eventshift; +extern struct Object *CurrentEvent; + +extern struct Object *Lookup(); +extern struct Object *defineset(); + diff --git a/sys/netiso/xebec/test.trans b/sys/netiso/xebec/test.trans new file mode 100644 index 00000000000..49db3610994 --- /dev/null +++ b/sys/netiso/xebec/test.trans @@ -0,0 +1,64 @@ +/* $Header: test.trans,v 0.2 88/09/19 12:58:29 nhall Exp $ + */ +*PROTOCOL test + +*INCLUDE + +{ +#include "test_def.h" +} + +*PCB test_pcbstruct SYNONYM P + +*STATES + +STATE_A +STATE_B +STATE_C +ALL_STATES = [STATE_A, STATE_B, STATE_C] + +*EVENTS { int ev_all; } SYNONYM E + +EV_1 { char *ev1_char; } +EV_2 { int ev2_int; char ev2_char; } +EV_3 +EV_4 { struct blah *ev4_blahptr; + unsigned int ev4_uint; + int ev4_int; + } + +*TRANSITIONS + +SAME <== [ STATE_A, STATE_B ] [ EV_1, EV_2, EV_3 ] + ( $E.ev_all > 0 ) + { + if( $P.test_state == STATE_A ) + printf("state is STATE_A\n"); + else + printf("state is STATE_B\n"); + printf("action first transition\n"); + } + +; +STATE_C <== [ STATE_A, STATE_B ] [ EV_1, EV_2, EV_3 ] + DEFAULT + { + printf("default - transition 2\n"); + MACRO1( $P.test_pcbfield ); + } +; + +STATE_C <== [ STATE_A, STATE_B ] EV_4 + ( $$.ev4_blahptr->blahfield & 0x1 ) + NULLACTION +; + +STATE_C <== ALL_STATES EV_4 + DEFAULT + { + printf("default - transition 4\n"); + printf("pcb is 0x%x, event is 0x%x \n", $P, $E); + printf("ev4 values are : blahptr 0x%x uint 0x%x int 0x%x\n", + $$.ev4_blahptr, $$.ev4_uint, $$.ev4_int); + } +; diff --git a/sys/netiso/xebec/test_def.h b/sys/netiso/xebec/test_def.h new file mode 100644 index 00000000000..6faa2dfce81 --- /dev/null +++ b/sys/netiso/xebec/test_def.h @@ -0,0 +1,13 @@ + +struct blah { + unsigned int blahfield; + int dummyi; + char dummyc; +}; + +struct test_pcbstruct { + int test_pcbfield; + int test_state; +}; + +#define MACRO1(arg) if(arg != 0) { printf("macro1\n"); } diff --git a/sys/netiso/xebec/xebec.bnf 
b/sys/netiso/xebec/xebec.bnf new file mode 100644 index 00000000000..d7406d9d5c2 --- /dev/null +++ b/sys/netiso/xebec/xebec.bnf @@ -0,0 +1,315 @@ +{ +#include "main.h" +#include "sets.h" +#include + +extern FILE *eventfile_h, *actfile; +} + +*fmq + + novocab + nobnf + nofirst + nofollow + noparsetable + noerrortables + nos + noe + +*terminals + +ID 0 0 { char *address; } +STRUCT 0 0 +SYNONYM 0 0 +PREDICATE 0 0 { char *address; } +ACTION 0 0 { char *address; } +/* +FSTRING 0 0 { char *address; } +*/ +PROTOCOL 0 0 +LBRACK 0 0 +RBRACK 0 0 +LANGLE 0 0 +EQUAL 0 0 +COMMA 0 0 +STAR 0 0 +EVENTS 0 0 +TRANSITIONS 0 0 +INCLUDE 0 0 +STATES 0 0 +SEMI 0 0 +PCB 0 0 { char *address; } +DEFAULT 0 0 +NULLACTION 0 0 +SAME 0 0 + +*nonterminals + +pcb { char *address; int isevent; } +syn { int type; } +setlist { struct Object *setnum; } +setlisttail { struct Object *setnum; } +part { unsigned char type; } +parttail { unsigned char type; } +partrest { unsigned char type; char *address; } +setstruct { struct Object *object; } +setdef { unsigned char type,keep; char *address; struct Object *object; } +translist +transition +event { struct Object *object; } +oldstate { struct Object *object; } +newstate { struct Object *object; } +predicatepart { char *string; } +actionpart { char *string; struct Object *oldstate; struct Object *newstate; } + +*productions + +program ::= + STAR PROTOCOL ID + { + if(strlen($ID.address) > 50 ) { + fprintf(stderr, + "Protocol name may not exceed 50 chars in length.\n"); + Exit(-1); + } + strcpy(protocol, $ID.address); + openfiles(protocol); + } + STAR includelist + PCB + { + $$pcb.isevent = 0; + } + pcb + { + fprintf(actfile, "\ntypedef %s %s%s;\n", + $pcb[7].address,protocol, PCBNAME); + $$syn.type = PCB_SYN; + } + syn + STAR STATES { $$part.type = (unsigned char) STATESET; } part + STAR { end_states(eventfile_h); } EVENTS + { $$pcb.isevent = 1; } + pcb + { + fprintf(eventfile_h, "\t"); /* fmq gags on single chars */ + includecode(eventfile_h, 
$pcb[14].address); + fprintf(eventfile_h, "\n"); /* fmq gags on single chars */ + $$syn.type = EVENT_SYN; + } + syn + { + $$part.type = (unsigned char)EVENTSET; + } + part + STAR { end_events(); } + TRANSITIONS + { + putincludes(); + putdriver(actfile, 9); + } + translist +; +pcb ::= STRUCT + { if($pcb.isevent) { + fprintf(stderr, + "Event is a list of objects enclosed by \"{}\"\n"); + Exit(-1); + } + fprintf(eventfile_h, "struct "); + } + ACTION { $pcb.address = $ACTION.address; } + optsemi + ::= ACTION + { if( ! $pcb.isevent) { + fprintf(stderr, + "Pcb requires a type or structure definition.\"{}\"\n"); + Exit(-1); + } + $pcb.address = $ACTION.address; + } + optsemi + ::= ID { $pcb.address = $ID.address; } optsemi +; + +syn ::= SYNONYM ID { synonyms[$syn.type] = stash( $ID.address ); } + ::= +; + +optsemi ::= SEMI + ::= +; +includelist ::= INCLUDE ACTION { includecode(actfile, $ACTION.address);} STAR + ::= +; +part ::= ID + { + $$partrest.address = $ID.address; + $$partrest.type = $part.type; + } + partrest + { $$parttail.type = $part.type; } + parttail +; +parttail ::= { $$part.type = $parttail.type; } part + ::= +; +partrest ::= EQUAL + { + if( lookup( $partrest.type, $partrest.address ) ) { + fprintf(stderr, "bnf:trying to redefine obj type 0x%x, adr %s\n", + $partrest.type, $partrest.address); + Exit(-1); + } + $$setdef.type = $partrest.type; + $$setdef.address = stash( $partrest.address ); + $$setdef.keep = 1; + } setdef { $$setstruct.object = $setdef.object; } setstruct + + ::= ACTION + { + defineitem($partrest.type, + $partrest.address, $ACTION.address); + } + + ::= { + defineitem($partrest.type, $partrest.address, (char *)0); + } +; + +setstruct ::= ACTION + { + if($setstruct.object) { + /* WHEN COULD THIS BE FALSE?? + * isn't it supposed to be setstruct.object??? 
+ * (it used to be $ACTION.address) + */ + + $setstruct.object->obj_struc = $ACTION.address; + fprintf(eventfile_h, + "struct %s %s%s;\n\n", $ACTION.address, + EV_PREFIX, $setstruct.object->obj_name); + } + } + ::= +; + +setdef ::= LBRACK + { + $$setlist.setnum = + defineset($setdef.type, $setdef.address, $setdef.keep); + } setlist RBRACK { $setdef.object = $setlist.setnum; } +; + +setlist ::= ID + { + member($setlist.setnum, $ID.address); + $$setlisttail.setnum = $setlist.setnum; + } setlisttail +; + +setlisttail ::= COMMA { $$setlist.setnum = $setlisttail.setnum; } setlist + ::= +; +translist ::= transition translisttail +; +translisttail ::= translist + ::= +; +transition ::= newstate { transno ++; } LANGLE EQUAL EQUAL oldstate + event + { + CurrentEvent /* GAG! */ = $event.object; + } + predicatepart + { + $$actionpart.string = $predicatepart.string; + $$actionpart.newstate = $newstate.object; + $$actionpart.oldstate = $oldstate.object; + } + actionpart + SEMI +; + +predicatepart ::= PREDICATE + { + $predicatepart.string = stash ( $PREDICATE.address ); + } + ::= DEFAULT + { + $predicatepart.string = (char *)0; + } +; + +actionpart ::= + ACTION + { + statetable( $actionpart.string, $actionpart.oldstate, + $actionpart.newstate, + acttable(actfile, $ACTION.address ), + CurrentEvent ); + if( print_trans ) { + dump_trans( $actionpart.string, $actionpart.oldstate, + $actionpart.newstate, + $ACTION.address, CurrentEvent ); + } + } + ::= NULLACTION + { + statetable($actionpart.string, $actionpart.oldstate, $actionpart.newstate, + 0, CurrentEvent ); /* KLUDGE - remove this */ + if( print_trans ) { + dump_trans( $actionpart.string, $actionpart.oldstate, + $actionpart.newstate, + "NULLACTION", CurrentEvent ); + } + } +; + +oldstate ::= ID + { + $oldstate.object = Lookup(STATESET, $ID.address); + } + ::= { + $$setdef.address = (char *)0; + $$setdef.type = (unsigned char)STATESET; + $$setdef.keep = 0; + } + setdef + { + $oldstate.object = $setdef.object; + } +; + +newstate 
::= ID + { + $newstate.object = Lookup(STATESET, $ID.address); + } +; + +newstate ::= SAME + { + extern struct Object *SameState; + + $newstate.object = SameState; + } +; + +event ::= ID + { + $event.object = Lookup(EVENTSET, $ID.address); + } + ::= + { + $$setdef.address = (char *)0; + $$setdef.type = (unsigned char)EVENTSET; + $$setdef.keep = 0; + } + setdef + { + $event.object = $setdef.object; + } +; + +*end diff --git a/sys/netiso/xebec/xebec.c b/sys/netiso/xebec/xebec.c new file mode 100644 index 00000000000..132bcb8487a --- /dev/null +++ b/sys/netiso/xebec/xebec.c @@ -0,0 +1,451 @@ +/* $Header: xebec.c,v 2.2 88/09/19 12:55:37 nhall Exp $ */ +/* $Source: /var/home/tadl/src/argo/xebec/RCS/xebec.c,v $ */ + +#include "xebec.h" +#include "llparse.h" +#ifndef E_TABLE +#define E_TABLE "xebec.e" +#endif E_TABLE + +#include "main.h" +#include "sets.h" +#include + +extern FILE *eventfile_h, *actfile; + +llaction(lln,token) +LLtoken *token; +{ + struct llattr *llattr; + llattr = &llattrdesc[lldescindex-1]; +switch(lln) { +case 1: + llfinprod(); + break; + +case 10: { + + if(strlen(llattr->llabase[3].ID.address) > 50 ) { + fprintf(stderr, + "Protocol name may not exceed 50 chars in length.\n"); + Exit(-1); + } + strcpy(protocol, llattr->llabase[3].ID.address); + openfiles(protocol); + +} break; + +case 11: { + + llattr->llabase[7].pcb.isevent = 0; + +} break; + +case 12: { + + fprintf(actfile, "\ntypedef %s %s%s;\n", + llattr->llabase[7].pcb.address,protocol, PCBNAME); + llattr->llabase[8].syn.type = PCB_SYN; + +} break; + +case 13: { + llattr->llabase[11].part.type = (unsigned char) STATESET; +} break; + +case 14: { + end_states(eventfile_h); +} break; + +case 15: { + llattr->llabase[14].pcb.isevent = 1; +} break; + +case 16: { + + fprintf(eventfile_h, "\t"); /* fmq gags on single chars */ + includecode(eventfile_h, llattr->llabase[14].pcb.address); + fprintf(eventfile_h, "\n"); /* fmq gags on single chars */ + llattr->llabase[15].syn.type = EVENT_SYN; + +} break; + 
+case 17: { + + llattr->llabase[16].part.type = (unsigned char)EVENTSET; + +} break; + +case 18: { + end_events(); +} break; + +case 19: { + + putincludes(); + putdriver(actfile, 9); + +} break; + +case 20: { + if(llattr->llabase[0].pcb.isevent) { + fprintf(stderr, + "Event is a list of objects enclosed by \"{}\"\n"); + Exit(-1); + } + fprintf(eventfile_h, "struct "); + +} break; + +case 21: { + llattr->llabase[0].pcb.address = llattr->llabase[2].ACTION.address; +} break; + +case 22: { + if( ! llattr->llabase[0].pcb.isevent) { + fprintf(stderr, + "Pcb requires a type or structure definition.\"{}\"\n"); + Exit(-1); + } + llattr->llabase[0].pcb.address = llattr->llabase[1].ACTION.address; + +} break; + +case 23: { + llattr->llabase[0].pcb.address = llattr->llabase[1].ID.address; +} break; + +case 24: { + synonyms[llattr->llabase[0].syn.type] = stash( llattr->llabase[2].ID.address ); +} break; + +case 25: { + includecode(actfile, llattr->llabase[2].ACTION.address); +} break; + +case 26: { + + llattr->llabase[2].partrest.address = llattr->llabase[1].ID.address; + llattr->llabase[2].partrest.type = llattr->llabase[0].part.type; + +} break; + +case 27: { + llattr->llabase[3].parttail.type = llattr->llabase[0].part.type; +} break; + +case 28: { + llattr->llabase[1].part.type = llattr->llabase[0].parttail.type; +} break; + +case 29: { + + if( lookup( llattr->llabase[0].partrest.type, llattr->llabase[0].partrest.address ) ) { + fprintf(stderr, "bnf:trying to redefine obj type 0x%x, adr %s\n", + llattr->llabase[0].partrest.type, llattr->llabase[0].partrest.address); + Exit(-1); + } + llattr->llabase[2].setdef.type = llattr->llabase[0].partrest.type; + llattr->llabase[2].setdef.address = stash( llattr->llabase[0].partrest.address ); + llattr->llabase[2].setdef.keep = 1; + +} break; + +case 30: { + llattr->llabase[3].setstruct.object = llattr->llabase[2].setdef.object; +} break; + +case 31: { + + defineitem(llattr->llabase[0].partrest.type, + 
llattr->llabase[0].partrest.address, llattr->llabase[1].ACTION.address); + +} break; + +case 32: { + + defineitem(llattr->llabase[0].partrest.type, llattr->llabase[0].partrest.address, (char *)0); + +} break; + +case 33: { + + if(llattr->llabase[0].setstruct.object) { + /* WHEN COULD THIS BE FALSE?? + * isn't it supposed to be setstruct.object??? + * (it used to be $ACTION.address) + */ + + llattr->llabase[0].setstruct.object->obj_struc = llattr->llabase[1].ACTION.address; + fprintf(eventfile_h, + "struct %s %s%s;\n\n", llattr->llabase[1].ACTION.address, + EV_PREFIX, llattr->llabase[0].setstruct.object->obj_name); + } + +} break; + +case 34: { + + llattr->llabase[2].setlist.setnum = + defineset(llattr->llabase[0].setdef.type, llattr->llabase[0].setdef.address, llattr->llabase[0].setdef.keep); + +} break; + +case 35: { + llattr->llabase[0].setdef.object = llattr->llabase[2].setlist.setnum; +} break; + +case 36: { + + member(llattr->llabase[0].setlist.setnum, llattr->llabase[1].ID.address); + llattr->llabase[2].setlisttail.setnum = llattr->llabase[0].setlist.setnum; + +} break; + +case 37: { + llattr->llabase[2].setlist.setnum = llattr->llabase[0].setlisttail.setnum; +} break; + +case 38: { + transno ++; +} break; + +case 39: { + + CurrentEvent /* GAG! 
*/ = llattr->llabase[6].event.object; + +} break; + +case 40: { + + llattr->llabase[8].actionpart.string = llattr->llabase[7].predicatepart.string; + llattr->llabase[8].actionpart.newstate = llattr->llabase[1].newstate.object; + llattr->llabase[8].actionpart.oldstate = llattr->llabase[5].oldstate.object; + +} break; + +case 41: { + + llattr->llabase[0].predicatepart.string = stash ( llattr->llabase[1].PREDICATE.address ); + +} break; + +case 42: { + + llattr->llabase[0].predicatepart.string = (char *)0; + +} break; + +case 43: { + + statetable( llattr->llabase[0].actionpart.string, llattr->llabase[0].actionpart.oldstate, + llattr->llabase[0].actionpart.newstate, + acttable(actfile, llattr->llabase[1].ACTION.address ), + CurrentEvent ); + if( print_trans ) { + dump_trans( llattr->llabase[0].actionpart.string, llattr->llabase[0].actionpart.oldstate, + llattr->llabase[0].actionpart.newstate, + llattr->llabase[1].ACTION.address, CurrentEvent ); + } + +} break; + +case 44: { + + statetable(llattr->llabase[0].actionpart.string, llattr->llabase[0].actionpart.oldstate, llattr->llabase[0].actionpart.newstate, + 0, CurrentEvent ); /* KLUDGE - remove this */ + if( print_trans ) { + dump_trans( llattr->llabase[0].actionpart.string, llattr->llabase[0].actionpart.oldstate, + llattr->llabase[0].actionpart.newstate, + "NULLACTION", CurrentEvent ); + } + +} break; + +case 45: { + + llattr->llabase[0].oldstate.object = Lookup(STATESET, llattr->llabase[1].ID.address); + +} break; + +case 46: { + + llattr->llabase[1].setdef.address = (char *)0; + llattr->llabase[1].setdef.type = (unsigned char)STATESET; + llattr->llabase[1].setdef.keep = 0; + +} break; + +case 47: { + + llattr->llabase[0].oldstate.object = llattr->llabase[1].setdef.object; + +} break; + +case 48: { + + llattr->llabase[0].newstate.object = Lookup(STATESET, llattr->llabase[1].ID.address); + +} break; + +case 49: { + + extern struct Object *SameState; + + llattr->llabase[0].newstate.object = SameState; + +} break; + 
+case 50: { + + llattr->llabase[0].event.object = Lookup(EVENTSET, llattr->llabase[1].ID.address); + +} break; + +case 51: { + + llattr->llabase[1].setdef.address = (char *)0; + llattr->llabase[1].setdef.type = (unsigned char)EVENTSET; + llattr->llabase[1].setdef.keep = 0; + +} break; + +case 52: { + + llattr->llabase[0].event.object = llattr->llabase[1].setdef.object; + +} break; +} +} +char *llstrings[] = { + "", + "ID", + "STRUCT", + "SYNONYM", + "PREDICATE", + "ACTION", + "PROTOCOL", + "LBRACK", + "RBRACK", + "LANGLE", + "EQUAL", + "COMMA", + "STAR", + "EVENTS", + "TRANSITIONS", + "INCLUDE", + "STATES", + "SEMI", + "PCB", + "DEFAULT", + "NULLACTION", + "SAME", + "ENDMARKER", + "pcb", + "syn", + "setlist", + "setlisttail", + "part", + "parttail", + "partrest", + "setstruct", + "setdef", + "translist", + "transition", + "event", + "oldstate", + "newstate", + "predicatepart", + "actionpart", + "program", + "includelist", + "optsemi", + "translisttail", + "$goal$", + (char *) 0 +}; +short llnterms = 23; +short llnsyms = 44; +short llnprods = 38; +short llinfinite = 10000; +short llproductions[] = { +41, -21, 5, -20, 2, +41, -22, 5, +41, -23, 1, +-24, 1, 3, + +26, -36, 1, +25, -37, 11, + +28, -27, 29, -26, 1, +27, -28, + +30, -30, 31, -29, 10, +-31, 5, +-32, +-33, 5, + +-35, 8, 25, -34, 7, +42, 33, +17, 38, -40, 37, -39, 34, 35, 10, 10, 9, -38, 36, +-50, 1, +-52, 31, -51, +-45, 1, +-47, 31, -46, +-48, 1, +-49, 21, +-41, 4, +-42, 19, +-43, 5, +-44, 20, +32, -19, 14, -18, 12, 27, -17, 24, -16, 23, -15, 13, -14, 12, 27, -13, 16, 12, 24, -12, 23, -11, 18, 40, 12, -10, 1, 6, 12, +12, -25, 5, 15, + +17, + +32, + +22, 39, +0 +}; +struct llprodindex llprodindex[] = { +{ 0, 0, 0 }, { 0, 5, 19 }, { 5, 3, 3 }, { 8, 3, 2 }, +{ 11, 3, 2 }, { 14, 0, 2 }, { 14, 3, 0 }, { 17, 3, 1 }, +{ 20, 0, 0 }, { 20, 5, 3 }, { 25, 2, 0 }, { 27, 0, 3 }, +{ 27, 5, 1 }, { 32, 2, 0 }, { 34, 1, 3 }, { 35, 2, 1 }, +{ 37, 0, 0 }, { 37, 5, 1 }, { 42, 2, 0 }, { 44, 12, 3 }, +{ 56, 2, 2 }, { 58, 3, 2 }, 
{ 61, 2, 0 }, { 63, 3, 2 }, +{ 66, 2, 1 }, { 68, 2, 0 }, { 70, 2, 9 }, { 72, 2, 1 }, +{ 74, 2, 1 }, { 76, 2, 1 }, { 78, 29, 1 }, { 107, 4, 1 }, +{ 111, 0, 1 }, { 111, 1, 1 }, { 112, 0, 1 }, { 112, 1, 1 }, +{ 113, 0, 1 }, { 113, 2, 2 }, { 0, 0, 0 } +}; +short llepsilon[] = { + 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, + 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 1, 0, 1, 0, 0 +}; +struct llparsetable llparsetable[] = { +{ 1, 3 }, { 2, 1 }, { 5, 2 }, { 0, 23 }, { 1, 5 }, +{ 3, 4 }, { 12, 5 }, { 0, 24 }, { 1, 6 }, { 0, 25 }, +{ 8, 8 }, { 11, 7 }, { 0, 26 }, { 1, 9 }, { 0, 27 }, +{ 1, 10 }, { 12, 11 }, { 0, 28 }, { 1, 14 }, { 5, 13 }, +{ 10, 12 }, { 12, 14 }, { 0, 29 }, { 1, 16 }, { 5, 15 }, +{ 12, 16 }, { 0, 30 }, { 7, 17 }, { 0, 31 }, { 1, 18 }, +{ 21, 18 }, { 0, 32 }, { 1, 19 }, { 21, 19 }, { 0, 33 }, +{ 1, 20 }, { 7, 21 }, { 0, 34 }, { 1, 22 }, { 7, 23 }, +{ 0, 35 }, { 1, 24 }, { 21, 25 }, { 0, 36 }, { 4, 26 }, +{ 19, 27 }, { 0, 37 }, { 5, 28 }, { 20, 29 }, { 0, 38 }, +{ 12, 30 }, { 0, 39 }, { 15, 31 }, { 18, 32 }, { 0, 40 }, +{ 1, 34 }, { 3, 34 }, { 12, 34 }, { 17, 33 }, { 0, 41 }, +{ 1, 35 }, { 21, 35 }, { 22, 36 }, { 0, 42 }, { 12, 37 }, +{ 0, 43 }, { 0, 0 } +}; +short llparseindex[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 4, 8, 10, 13, 15, 18, + 23, 27, 29, 32, 35, 38, 41, 44, 47, 50, + 52, 55, 60, 64, 0 +}; diff --git a/sys/netiso/xebec/xebec.h b/sys/netiso/xebec/xebec.h new file mode 100644 index 00000000000..168bb77b249 --- /dev/null +++ b/sys/netiso/xebec/xebec.h @@ -0,0 +1,88 @@ +/* $Header: xebec.h,v 2.1 88/09/19 12:56:35 nhall Exp $ */ +/* $Source: /var/home/tadl/src/argo/xebec/RCS/xebec.h,v $ */ + +union llattrib { + struct { + char *address; } ID; + int STRUCT; + int SYNONYM; + struct { + char *address; } PREDICATE; + struct { + char *address; } ACTION; + int PROTOCOL; + int LBRACK; + int RBRACK; + int LANGLE; + int EQUAL; + int COMMA; + int STAR; + int EVENTS; + int TRANSITIONS; + int 
INCLUDE; + int STATES; + int SEMI; + struct { + char *address; } PCB; + int DEFAULT; + int NULLACTION; + int SAME; + struct { + char *address; int isevent; } pcb; + struct { + int type; } syn; + struct { + struct Object *setnum; } setlist; + struct { + struct Object *setnum; } setlisttail; + struct { + unsigned char type; } part; + struct { + unsigned char type; } parttail; + struct { + unsigned char type; char *address; } partrest; + struct { + struct Object *object; } setstruct; + struct { + unsigned char type,keep; char *address; struct Object *object; } setdef; + int translist; + int transition; + struct { + struct Object *object; } event; + struct { + struct Object *object; } oldstate; + struct { + struct Object *object; } newstate; + struct { + char *string; } predicatepart; + struct { + char *string; struct Object *oldstate; struct Object *newstate; } actionpart; +}; +#define LLTERM 23 +#define LLSYM 44 +#define LLPROD 38 + +#define LLINF 10000 + +#define T_ID 1 +#define T_STRUCT 2 +#define T_SYNONYM 3 +#define T_PREDICATE 4 +#define T_ACTION 5 +#define T_PROTOCOL 6 +#define T_LBRACK 7 +#define T_RBRACK 8 +#define T_LANGLE 9 +#define T_EQUAL 10 +#define T_COMMA 11 +#define T_STAR 12 +#define T_EVENTS 13 +#define T_TRANSITIONS 14 +#define T_INCLUDE 15 +#define T_STATES 16 +#define T_SEMI 17 +#define T_PCB 18 +#define T_DEFAULT 19 +#define T_NULLACTION 20 +#define T_SAME 21 +#define T_ENDMARKER 22 diff --git a/sys/netns/idp.h b/sys/netns/idp.h new file mode 100644 index 00000000000..254208dfad8 --- /dev/null +++ b/sys/netns/idp.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 1984, 1985, 1986, 1987, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)idp.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Definitions for NS(tm) Internet Datagram Protocol + */ +struct idp { + u_short idp_sum; /* Checksum */ + u_short idp_len; /* Length, in bytes, including header */ + u_char idp_tc; /* Transport Control (i.e. hop count) */ + u_char idp_pt; /* Packet Type (i.e. 
level 2 protocol) */ + struct ns_addr idp_dna; /* Destination Network Address */ + struct ns_addr idp_sna; /* Source Network Address */ +}; diff --git a/sys/netns/idp_usrreq.c b/sys/netns/idp_usrreq.c new file mode 100644 index 00000000000..b548a12574b --- /dev/null +++ b/sys/netns/idp_usrreq.c @@ -0,0 +1,566 @@ +/* + * Copyright (c) 1984, 1985, 1986, 1987, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)idp_usrreq.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * IDP protocol implementation. + */ + +struct sockaddr_ns idp_ns = { sizeof(idp_ns), AF_NS }; + +/* + * This may also be called for raw listeners. + */ +idp_input(m, nsp) + struct mbuf *m; + register struct nspcb *nsp; +{ + register struct idp *idp = mtod(m, struct idp *); + struct ifnet *ifp = m->m_pkthdr.rcvif; + + if (nsp==0) + panic("No nspcb"); + /* + * Construct sockaddr format source address. + * Stuff source address and datagram in user buffer. + */ + idp_ns.sns_addr = idp->idp_sna; + if (ns_neteqnn(idp->idp_sna.x_net, ns_zeronet) && ifp) { + register struct ifaddr *ifa; + + for (ifa = ifp->if_addrlist; ifa; ifa = ifa->ifa_next) { + if (ifa->ifa_addr->sa_family == AF_NS) { + idp_ns.sns_addr.x_net = + IA_SNS(ifa)->sns_addr.x_net; + break; + } + } + } + nsp->nsp_rpt = idp->idp_pt; + if ( ! 
(nsp->nsp_flags & NSP_RAWIN) ) { + m->m_len -= sizeof (struct idp); + m->m_pkthdr.len -= sizeof (struct idp); + m->m_data += sizeof (struct idp); + } + if (sbappendaddr(&nsp->nsp_socket->so_rcv, (struct sockaddr *)&idp_ns, + m, (struct mbuf *)0) == 0) + goto bad; + sorwakeup(nsp->nsp_socket); + return; +bad: + m_freem(m); +} + +idp_abort(nsp) + struct nspcb *nsp; +{ + struct socket *so = nsp->nsp_socket; + + ns_pcbdisconnect(nsp); + soisdisconnected(so); +} +/* + * Drop connection, reporting + * the specified error. + */ +struct nspcb * +idp_drop(nsp, errno) + register struct nspcb *nsp; + int errno; +{ + struct socket *so = nsp->nsp_socket; + + /* + * someday, in the xerox world + * we will generate error protocol packets + * announcing that the socket has gone away. + */ + /*if (TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_state = TCPS_CLOSED; + (void) tcp_output(tp); + }*/ + so->so_error = errno; + ns_pcbdisconnect(nsp); + soisdisconnected(so); +} + +int noIdpRoute; +idp_output(nsp, m0) + struct nspcb *nsp; + struct mbuf *m0; +{ + register struct mbuf *m; + register struct idp *idp; + register struct socket *so; + register int len = 0; + register struct route *ro; + struct mbuf *mprev; + extern int idpcksum; + + /* + * Calculate data length. + */ + for (m = m0; m; m = m->m_next) { + mprev = m; + len += m->m_len; + } + /* + * Make sure packet is actually of even length. + */ + + if (len & 1) { + m = mprev; + if ((m->m_flags & M_EXT) == 0 && + (m->m_len + m->m_data < &m->m_dat[MLEN])) { + m->m_len++; + } else { + struct mbuf *m1 = m_get(M_DONTWAIT, MT_DATA); + + if (m1 == 0) { + m_freem(m0); + return (ENOBUFS); + } + m1->m_len = 1; + * mtod(m1, char *) = 0; + m->m_next = m1; + } + m0->m_pkthdr.len++; + } + + /* + * Fill in mbuf with extended IDP header + * and addresses and length put into network format. 
+ */ + m = m0; + if (nsp->nsp_flags & NSP_RAWOUT) { + idp = mtod(m, struct idp *); + } else { + M_PREPEND(m, sizeof (struct idp), M_DONTWAIT); + if (m == 0) + return (ENOBUFS); + idp = mtod(m, struct idp *); + idp->idp_tc = 0; + idp->idp_pt = nsp->nsp_dpt; + idp->idp_sna = nsp->nsp_laddr; + idp->idp_dna = nsp->nsp_faddr; + len += sizeof (struct idp); + } + + idp->idp_len = htons((u_short)len); + + if (idpcksum) { + idp->idp_sum = 0; + len = ((len - 1) | 1) + 1; + idp->idp_sum = ns_cksum(m, len); + } else + idp->idp_sum = 0xffff; + + /* + * Output datagram. + */ + so = nsp->nsp_socket; + if (so->so_options & SO_DONTROUTE) + return (ns_output(m, (struct route *)0, + (so->so_options & SO_BROADCAST) | NS_ROUTETOIF)); + /* + * Use cached route for previous datagram if + * possible. If the previous net was the same + * and the interface was a broadcast medium, or + * if the previous destination was identical, + * then we are ok. + * + * NB: We don't handle broadcasts because that + * would require 3 subroutine calls. + */ + ro = &nsp->nsp_route; +#ifdef ancient_history + /* + * I think that this will all be handled in ns_pcbconnect! + */ + if (ro->ro_rt) { + if(ns_neteq(nsp->nsp_lastdst, idp->idp_dna)) { + /* + * This assumes we have no GH type routes + */ + if (ro->ro_rt->rt_flags & RTF_HOST) { + if (!ns_hosteq(nsp->nsp_lastdst, idp->idp_dna)) + goto re_route; + + } + if ((ro->ro_rt->rt_flags & RTF_GATEWAY) == 0) { + register struct ns_addr *dst = + &satons_addr(ro->ro_dst); + dst->x_host = idp->idp_dna.x_host; + } + /* + * Otherwise, we go through the same gateway + * and dst is already set up. 
+ */ + } else { + re_route: + RTFREE(ro->ro_rt); + ro->ro_rt = (struct rtentry *)0; + } + } + nsp->nsp_lastdst = idp->idp_dna; +#endif /* ancient_history */ + if (noIdpRoute) ro = 0; + return (ns_output(m, ro, so->so_options & SO_BROADCAST)); +} +/* ARGSUSED */ +idp_ctloutput(req, so, level, name, value) + int req, level; + struct socket *so; + int name; + struct mbuf **value; +{ + register struct mbuf *m; + struct nspcb *nsp = sotonspcb(so); + int mask, error = 0; + extern long ns_pexseq; + + if (nsp == NULL) + return (EINVAL); + + switch (req) { + + case PRCO_GETOPT: + if (value==NULL) + return (EINVAL); + m = m_get(M_DONTWAIT, MT_DATA); + if (m==NULL) + return (ENOBUFS); + switch (name) { + + case SO_ALL_PACKETS: + mask = NSP_ALL_PACKETS; + goto get_flags; + + case SO_HEADERS_ON_INPUT: + mask = NSP_RAWIN; + goto get_flags; + + case SO_HEADERS_ON_OUTPUT: + mask = NSP_RAWOUT; + get_flags: + m->m_len = sizeof(short); + *mtod(m, short *) = nsp->nsp_flags & mask; + break; + + case SO_DEFAULT_HEADERS: + m->m_len = sizeof(struct idp); + { + register struct idp *idp = mtod(m, struct idp *); + idp->idp_len = 0; + idp->idp_sum = 0; + idp->idp_tc = 0; + idp->idp_pt = nsp->nsp_dpt; + idp->idp_dna = nsp->nsp_faddr; + idp->idp_sna = nsp->nsp_laddr; + } + break; + + case SO_SEQNO: + m->m_len = sizeof(long); + *mtod(m, long *) = ns_pexseq++; + break; + + default: + error = EINVAL; + } + *value = m; + break; + + case PRCO_SETOPT: + switch (name) { + int *ok; + + case SO_ALL_PACKETS: + mask = NSP_ALL_PACKETS; + goto set_head; + + case SO_HEADERS_ON_INPUT: + mask = NSP_RAWIN; + goto set_head; + + case SO_HEADERS_ON_OUTPUT: + mask = NSP_RAWOUT; + set_head: + if (value && *value) { + ok = mtod(*value, int *); + if (*ok) + nsp->nsp_flags |= mask; + else + nsp->nsp_flags &= ~mask; + } else error = EINVAL; + break; + + case SO_DEFAULT_HEADERS: + { + register struct idp *idp + = mtod(*value, struct idp *); + nsp->nsp_dpt = idp->idp_pt; + } + break; +#ifdef NSIP + + case SO_NSIP_ROUTE: + 
error = nsip_route(*value); + break; +#endif /* NSIP */ + default: + error = EINVAL; + } + if (value && *value) + m_freem(*value); + break; + } + return (error); +} + +/*ARGSUSED*/ +idp_usrreq(so, req, m, nam, control) + struct socket *so; + int req; + struct mbuf *m, *nam, *control; +{ + struct nspcb *nsp = sotonspcb(so); + int error = 0; + + if (req == PRU_CONTROL) + return (ns_control(so, (int)m, (caddr_t)nam, + (struct ifnet *)control)); + if (control && control->m_len) { + error = EINVAL; + goto release; + } + if (nsp == NULL && req != PRU_ATTACH) { + error = EINVAL; + goto release; + } + switch (req) { + + case PRU_ATTACH: + if (nsp != NULL) { + error = EINVAL; + break; + } + error = ns_pcballoc(so, &nspcb); + if (error) + break; + error = soreserve(so, (u_long) 2048, (u_long) 2048); + if (error) + break; + break; + + case PRU_DETACH: + if (nsp == NULL) { + error = ENOTCONN; + break; + } + ns_pcbdetach(nsp); + break; + + case PRU_BIND: + error = ns_pcbbind(nsp, nam); + break; + + case PRU_LISTEN: + error = EOPNOTSUPP; + break; + + case PRU_CONNECT: + if (!ns_nullhost(nsp->nsp_faddr)) { + error = EISCONN; + break; + } + error = ns_pcbconnect(nsp, nam); + if (error == 0) + soisconnected(so); + break; + + case PRU_CONNECT2: + error = EOPNOTSUPP; + break; + + case PRU_ACCEPT: + error = EOPNOTSUPP; + break; + + case PRU_DISCONNECT: + if (ns_nullhost(nsp->nsp_faddr)) { + error = ENOTCONN; + break; + } + ns_pcbdisconnect(nsp); + soisdisconnected(so); + break; + + case PRU_SHUTDOWN: + socantsendmore(so); + break; + + case PRU_SEND: + { + struct ns_addr laddr; + int s; + + if (nam) { + laddr = nsp->nsp_laddr; + if (!ns_nullhost(nsp->nsp_faddr)) { + error = EISCONN; + break; + } + /* + * Must block input while temporarily connected. 
+ */ + s = splnet(); + error = ns_pcbconnect(nsp, nam); + if (error) { + splx(s); + break; + } + } else { + if (ns_nullhost(nsp->nsp_faddr)) { + error = ENOTCONN; + break; + } + } + error = idp_output(nsp, m); + m = NULL; + if (nam) { + ns_pcbdisconnect(nsp); + splx(s); + nsp->nsp_laddr.x_host = laddr.x_host; + nsp->nsp_laddr.x_port = laddr.x_port; + } + } + break; + + case PRU_ABORT: + ns_pcbdetach(nsp); + sofree(so); + soisdisconnected(so); + break; + + case PRU_SOCKADDR: + ns_setsockaddr(nsp, nam); + break; + + case PRU_PEERADDR: + ns_setpeeraddr(nsp, nam); + break; + + case PRU_SENSE: + /* + * stat: don't bother with a blocksize. + */ + return (0); + + case PRU_SENDOOB: + case PRU_FASTTIMO: + case PRU_SLOWTIMO: + case PRU_PROTORCV: + case PRU_PROTOSEND: + error = EOPNOTSUPP; + break; + + case PRU_CONTROL: + case PRU_RCVD: + case PRU_RCVOOB: + return (EOPNOTSUPP); /* do not free mbuf's */ + + default: + panic("idp_usrreq"); + } +release: + if (control != NULL) + m_freem(control); + if (m != NULL) + m_freem(m); + return (error); +} +/*ARGSUSED*/ +idp_raw_usrreq(so, req, m, nam, control) + struct socket *so; + int req; + struct mbuf *m, *nam, *control; +{ + int error = 0; + struct nspcb *nsp = sotonspcb(so); + extern struct nspcb nsrawpcb; + + switch (req) { + + case PRU_ATTACH: + + if (!(so->so_state & SS_PRIV) || (nsp != NULL)) { + error = EINVAL; + break; + } + error = ns_pcballoc(so, &nsrawpcb); + if (error) + break; + error = soreserve(so, (u_long) 2048, (u_long) 2048); + if (error) + break; + nsp = sotonspcb(so); + nsp->nsp_faddr.x_host = ns_broadhost; + nsp->nsp_flags = NSP_RAWIN | NSP_RAWOUT; + break; + default: + error = idp_usrreq(so, req, m, nam, control); + } + return (error); +} + diff --git a/sys/netns/idp_var.h b/sys/netns/idp_var.h new file mode 100644 index 00000000000..fc9a4f45d81 --- /dev/null +++ b/sys/netns/idp_var.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 1984, 1985, 1986, 1987, 1993 + * The Regents of the University of California. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)idp_var.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * IDP Kernel Structures and Variables + */ +struct idpstat { + int idps_badsum; /* checksum bad */ + int idps_tooshort; /* packet too short */ + int idps_toosmall; /* not enough data */ + int idps_badhlen; /* ip header length < data size */ + int idps_badlen; /* ip length < ip header length */ +}; + +#ifdef KERNEL +struct idpstat idpstat; +#endif diff --git a/sys/netns/ns.c b/sys/netns/ns.c new file mode 100644 index 00000000000..8b76543fce3 --- /dev/null +++ b/sys/netns/ns.c @@ -0,0 +1,368 @@ +/* + * Copyright (c) 1984, 1985, 1986, 1987, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ns.c 8.2 (Berkeley) 11/15/93 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#ifdef NS + +struct ns_ifaddr *ns_ifaddr; +int ns_interfaces; +extern struct sockaddr_ns ns_netmask, ns_hostmask; + +/* + * Generic internet control operations (ioctl's). + */ +/* ARGSUSED */ +ns_control(so, cmd, data, ifp) + struct socket *so; + int cmd; + caddr_t data; + register struct ifnet *ifp; +{ + register struct ifreq *ifr = (struct ifreq *)data; + register struct ns_aliasreq *ifra = (struct ns_aliasreq *)data; + register struct ns_ifaddr *ia; + struct ifaddr *ifa; + struct ns_ifaddr *oia; + int error, dstIsNew, hostIsNew; + + /* + * Find address for this interface, if it exists. 
+ */ + if (ifp == 0) + return (EADDRNOTAVAIL); + for (ia = ns_ifaddr; ia; ia = ia->ia_next) + if (ia->ia_ifp == ifp) + break; + + switch (cmd) { + + case SIOCGIFADDR: + if (ia == (struct ns_ifaddr *)0) + return (EADDRNOTAVAIL); + *(struct sockaddr_ns *)&ifr->ifr_addr = ia->ia_addr; + return (0); + + + case SIOCGIFBRDADDR: + if (ia == (struct ns_ifaddr *)0) + return (EADDRNOTAVAIL); + if ((ifp->if_flags & IFF_BROADCAST) == 0) + return (EINVAL); + *(struct sockaddr_ns *)&ifr->ifr_dstaddr = ia->ia_broadaddr; + return (0); + + case SIOCGIFDSTADDR: + if (ia == (struct ns_ifaddr *)0) + return (EADDRNOTAVAIL); + if ((ifp->if_flags & IFF_POINTOPOINT) == 0) + return (EINVAL); + *(struct sockaddr_ns *)&ifr->ifr_dstaddr = ia->ia_dstaddr; + return (0); + } + + if ((so->so_state & SS_PRIV) == 0) + return (EPERM); + + switch (cmd) { + case SIOCAIFADDR: + case SIOCDIFADDR: + if (ifra->ifra_addr.sns_family == AF_NS) + for (oia = ia; ia; ia = ia->ia_next) { + if (ia->ia_ifp == ifp && + ns_neteq(ia->ia_addr.sns_addr, + ifra->ifra_addr.sns_addr)) + break; + } + if (cmd == SIOCDIFADDR && ia == 0) + return (EADDRNOTAVAIL); + /* FALLTHROUGH */ + + case SIOCSIFADDR: + case SIOCSIFDSTADDR: + if (ia == (struct ns_ifaddr *)0) { + oia = (struct ns_ifaddr *) + malloc(sizeof *ia, M_IFADDR, M_WAITOK); + if (oia == (struct ns_ifaddr *)NULL) + return (ENOBUFS); + bzero((caddr_t)oia, sizeof(*oia)); + if (ia = ns_ifaddr) { + for ( ; ia->ia_next; ia = ia->ia_next) + ; + ia->ia_next = oia; + } else + ns_ifaddr = oia; + ia = oia; + if (ifa = ifp->if_addrlist) { + for ( ; ifa->ifa_next; ifa = ifa->ifa_next) + ; + ifa->ifa_next = (struct ifaddr *) ia; + } else + ifp->if_addrlist = (struct ifaddr *) ia; + ia->ia_ifp = ifp; + ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; + + ia->ia_ifa.ifa_netmask = + (struct sockaddr *)&ns_netmask; + + ia->ia_ifa.ifa_dstaddr = + (struct sockaddr *)&ia->ia_dstaddr; + if (ifp->if_flags & IFF_BROADCAST) { + ia->ia_broadaddr.sns_family = AF_NS; + 
ia->ia_broadaddr.sns_len = sizeof(ia->ia_addr); + ia->ia_broadaddr.sns_addr.x_host = ns_broadhost; + } + ns_interfaces++; + } + } + + switch (cmd) { + int error; + + case SIOCSIFDSTADDR: + if ((ifp->if_flags & IFF_POINTOPOINT) == 0) + return (EINVAL); + if (ia->ia_flags & IFA_ROUTE) { + rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); + ia->ia_flags &= ~IFA_ROUTE; + } + if (ifp->if_ioctl) { + error = (*ifp->if_ioctl)(ifp, SIOCSIFDSTADDR, ia); + if (error) + return (error); + } + *(struct sockaddr *)&ia->ia_dstaddr = ifr->ifr_dstaddr; + return (0); + + case SIOCSIFADDR: + return (ns_ifinit(ifp, ia, + (struct sockaddr_ns *)&ifr->ifr_addr, 1)); + + case SIOCDIFADDR: + ns_ifscrub(ifp, ia); + if ((ifa = ifp->if_addrlist) == (struct ifaddr *)ia) + ifp->if_addrlist = ifa->ifa_next; + else { + while (ifa->ifa_next && + (ifa->ifa_next != (struct ifaddr *)ia)) + ifa = ifa->ifa_next; + if (ifa->ifa_next) + ifa->ifa_next = ((struct ifaddr *)ia)->ifa_next; + else + printf("Couldn't unlink nsifaddr from ifp\n"); + } + oia = ia; + if (oia == (ia = ns_ifaddr)) { + ns_ifaddr = ia->ia_next; + } else { + while (ia->ia_next && (ia->ia_next != oia)) { + ia = ia->ia_next; + } + if (ia->ia_next) + ia->ia_next = oia->ia_next; + else + printf("Didn't unlink nsifadr from list\n"); + } + IFAFREE((&oia->ia_ifa)); + if (0 == --ns_interfaces) { + /* + * We reset to virginity and start all over again + */ + ns_thishost = ns_zerohost; + } + return (0); + + case SIOCAIFADDR: + dstIsNew = 0; hostIsNew = 1; + if (ia->ia_addr.sns_family == AF_NS) { + if (ifra->ifra_addr.sns_len == 0) { + ifra->ifra_addr = ia->ia_addr; + hostIsNew = 0; + } else if (ns_neteq(ifra->ifra_addr.sns_addr, + ia->ia_addr.sns_addr)) + hostIsNew = 0; + } + if ((ifp->if_flags & IFF_POINTOPOINT) && + (ifra->ifra_dstaddr.sns_family == AF_NS)) { + if (hostIsNew == 0) + ns_ifscrub(ifp, ia); + ia->ia_dstaddr = ifra->ifra_dstaddr; + dstIsNew = 1; + } + if (ifra->ifra_addr.sns_family == AF_NS && + (hostIsNew || dstIsNew)) + error = 
ns_ifinit(ifp, ia, &ifra->ifra_addr, 0); + return (error); + + default: + if (ifp->if_ioctl == 0) + return (EOPNOTSUPP); + return ((*ifp->if_ioctl)(ifp, cmd, data)); + } +} + +/* +* Delete any previous route for an old address. +*/ +ns_ifscrub(ifp, ia) + register struct ifnet *ifp; + register struct ns_ifaddr *ia; +{ + if (ia->ia_flags & IFA_ROUTE) { + if (ifp->if_flags & IFF_POINTOPOINT) { + rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); + } else + rtinit(&(ia->ia_ifa), (int)RTM_DELETE, 0); + ia->ia_flags &= ~IFA_ROUTE; + } +} +/* + * Initialize an interface's internet address + * and routing table entry. + */ +ns_ifinit(ifp, ia, sns, scrub) + register struct ifnet *ifp; + register struct ns_ifaddr *ia; + register struct sockaddr_ns *sns; +{ + struct sockaddr_ns oldaddr; + register union ns_host *h = &ia->ia_addr.sns_addr.x_host; + int s = splimp(), error; + + /* + * Set up new addresses. + */ + oldaddr = ia->ia_addr; + ia->ia_addr = *sns; + /* + * The convention we shall adopt for naming is that + * a supplied address of zero means that "we don't care". + * if there is a single interface, use the address of that + * interface as our 6 byte host address. + * if there are multiple interfaces, use any address already + * used. + * + * Give the interface a chance to initialize + * if this is its first address, + * and to validate the address if necessary. 
+ */ + if (ns_hosteqnh(ns_thishost, ns_zerohost)) { + if (ifp->if_ioctl && + (error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, ia))) { + ia->ia_addr = oldaddr; + splx(s); + return (error); + } + ns_thishost = *h; + } else if (ns_hosteqnh(sns->sns_addr.x_host, ns_zerohost) + || ns_hosteqnh(sns->sns_addr.x_host, ns_thishost)) { + *h = ns_thishost; + if (ifp->if_ioctl && + (error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, ia))) { + ia->ia_addr = oldaddr; + splx(s); + return (error); + } + if (!ns_hosteqnh(ns_thishost,*h)) { + ia->ia_addr = oldaddr; + splx(s); + return (EINVAL); + } + } else { + ia->ia_addr = oldaddr; + splx(s); + return (EINVAL); + } + ia->ia_ifa.ifa_metric = ifp->if_metric; + /* + * Add route for the network. + */ + if (scrub) { + ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr; + ns_ifscrub(ifp, ia); + ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; + } + if (ifp->if_flags & IFF_POINTOPOINT) + rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_HOST|RTF_UP); + else { + ia->ia_broadaddr.sns_addr.x_net = ia->ia_addr.sns_addr.x_net; + rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_UP); + } + ia->ia_flags |= IFA_ROUTE; + return (0); +} + +/* + * Return address info for specified internet network. 
+ */ +struct ns_ifaddr * +ns_iaonnetof(dst) + register struct ns_addr *dst; +{ + register struct ns_ifaddr *ia; + register struct ns_addr *compare; + register struct ifnet *ifp; + struct ns_ifaddr *ia_maybe = 0; + union ns_net net = dst->x_net; + + for (ia = ns_ifaddr; ia; ia = ia->ia_next) { + if (ifp = ia->ia_ifp) { + if (ifp->if_flags & IFF_POINTOPOINT) { + compare = &satons_addr(ia->ia_dstaddr); + if (ns_hosteq(*dst, *compare)) + return (ia); + if (ns_neteqnn(net, ia->ia_addr.sns_addr.x_net)) + ia_maybe = ia; + } else { + if (ns_neteqnn(net, ia->ia_addr.sns_addr.x_net)) + return (ia); + } + } + } + return (ia_maybe); +} +#endif diff --git a/sys/netns/ns.h b/sys/netns/ns.h new file mode 100644 index 00000000000..cf51f0047e9 --- /dev/null +++ b/sys/netns/ns.h @@ -0,0 +1,151 @@ +/* + * Copyright (c) 1984, 1985, 1986, 1987, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)ns.h	8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * Constants and Structures defined by the Xerox Network Software
+ * per "Internet Transport Protocols", XSIS 028112, December 1981
+ */
+
+/*
+ * Protocols
+ */
+#define NSPROTO_RI	1		/* Routing Information */
+#define NSPROTO_ECHO	2		/* Echo Protocol */
+#define NSPROTO_ERROR	3		/* Error Protocol */
+#define NSPROTO_PE	4		/* Packet Exchange */
+#define NSPROTO_SPP	5		/* Sequenced Packet */
+#define NSPROTO_RAW	255		/* Placemarker*/
+#define NSPROTO_MAX	256		/* Placemarker*/
+
+
+/*
+ * Port/Socket numbers: network standard functions
+ */
+
+#define NSPORT_RI	1		/* Routing Information */
+#define NSPORT_ECHO	2		/* Echo */
+#define NSPORT_RE	3		/* Router Error */
+
+/*
+ * Ports < NSPORT_RESERVED are reserved for privileged
+ * processes (e.g. root).
+ */
+#define NSPORT_RESERVED		3000
+
+/* flags passed to ns_output as last parameter */
+
+#define	NS_FORWARDING		0x1	/* most of idp header exists */
+#define	NS_ROUTETOIF		0x10	/* same as SO_DONTROUTE */
+#define	NS_ALLOWBROADCAST	SO_BROADCAST	/* can send broadcast packets */
+
+#define NS_MAXHOPS		15
+
+/* flags passed to get/set socket option */
+#define	SO_HEADERS_ON_INPUT	1
+#define	SO_HEADERS_ON_OUTPUT	2
+#define	SO_DEFAULT_HEADERS	3
+#define	SO_LAST_HEADER		4
+#define	SO_NSIP_ROUTE		5
+#define SO_SEQNO		6
+#define	SO_ALL_PACKETS		7
+#define SO_MTU			8
+
+
+/*
+ * NS addressing
+ *
+ * A host number is 6 bytes and a network number 4 bytes; each union
+ * offers both a byte view and a 16-bit-word view of the same storage.
+ * The comparison macros below use the word view.
+ */
+union ns_host {
+	u_char	c_host[6];
+	u_short	s_host[3];
+};
+
+union ns_net {
+	u_char	c_net[4];
+	u_short	s_net[2];
+};
+
+/* A network number overlaid with a single u_long. */
+union ns_net_u {
+	union ns_net	net_e;
+	u_long		long_e;
+};
+
+/* Full NS address: network, host and port (socket) number. */
+struct ns_addr {
+	union ns_net	x_net;
+	union ns_host	x_host;
+	u_short	x_port;
+};
+
+/*
+ * Socket address, Xerox style
+ */
+struct sockaddr_ns {
+	u_char		sns_len;
+	u_char		sns_family;
+	struct ns_addr	sns_addr;
+	char		sns_zero[2];
+};
+#define sns_port sns_addr.x_port
+
+#ifdef vax
+#define ns_netof(a) (*(long *) & ((a).x_net)) /* XXX - not needed */
+#endif
+/*
+ * Comparison helpers.  ns_neteqnn and ns_hosteqnh take the bare unions
+ * (union ns_net / union ns_host); ns_neteq, ns_hosteq and ns_nullhost
+ * take a whole struct ns_addr.  satons_addr reinterprets an object as a
+ * sockaddr_ns and yields its sns_addr member.  All of them are macros
+ * and may evaluate their arguments more than once.
+ */
+#define ns_neteqnn(a,b) (((a).s_net[0]==(b).s_net[0]) && \
+					((a).s_net[1]==(b).s_net[1]))
+#define ns_neteq(a,b) ns_neteqnn((a).x_net, (b).x_net)
+#define satons_addr(sa)	(((struct sockaddr_ns *)&(sa))->sns_addr)
+#define ns_hosteqnh(s,t) ((s).s_host[0] == (t).s_host[0] && \
+	(s).s_host[1] == (t).s_host[1] && (s).s_host[2] == (t).s_host[2])
+#define ns_hosteq(s,t) (ns_hosteqnh((s).x_host,(t).x_host))
+#define ns_nullhost(x) (((x).x_host.s_host[0]==0) && \
+	((x).x_host.s_host[1]==0) && ((x).x_host.s_host[2]==0))
+
+#ifdef KERNEL
+extern struct domain nsdomain;
+union ns_host ns_thishost;
+union ns_host ns_zerohost;
+union ns_host ns_broadhost;
+union ns_net ns_zeronet;
+union ns_net ns_broadnet;
+u_short ns_cksum();
+#else
+
+#include
+
+__BEGIN_DECLS
+extern struct ns_addr ns_addr __P((const char *));
+extern char *ns_ntoa __P((struct ns_addr));
+__END_DECLS
+
+#endif
diff --git a/sys/netns/ns_cksum.c b/sys/netns/ns_cksum.c
new file mode 100644
index 00000000000..52eba8bce81
--- /dev/null
+++ b/sys/netns/ns_cksum.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 1982, 1992, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)ns_cksum.c	8.1 (Berkeley) 6/10/93
+ */
+
+#include
+#include
+
+/*
+ * Checksum routine for Network Systems Protocol Packets (Big-Endian).
+ *
+ * This routine is very heavily used in the network
+ * code and should be modified for each CPU to be as fast as possible.
+ *
+ * ns_cksum(m, len) walks the mbuf chain `m' and checksums at most `len'
+ * bytes.  Every 16-bit word is added to the running sum and the sum is
+ * then doubled ("sum += sum"); FOLD periodically reduces the sum by
+ * adding its two 16-bit halves and letting ADDCARRY subtract 65535 when
+ * the result exceeds 16 bits.  A result of 0xffff is returned as 0.
+ */
+
+/* Bring x back under 16 bits after a single carry out. */
+#define ADDCARRY(x)  { if ((x) > 65535) (x) -= 65535; }
+/* Add the two 16-bit halves of x together; uses the local l_util union. */
+#define FOLD(x) {l_util.l = (x); (x) = l_util.s[0] + l_util.s[1]; ADDCARRY(x);}
+
+u_short
+ns_cksum(m, len)
+	register struct mbuf *m;
+	register int len;
+{
+	register u_short *w;
+	register int sum = 0;
+	/* mlen == -1 means the previous segment ended on an odd byte */
+	register int mlen = 0;
+	/* second accumulator, used only on the unaligned path below */
+	register int sum2;
+
+	union {
+		u_short s[2];
+		long	l;
+	} l_util;
+
+	for (;m && len; m = m->m_next) {
+		if (m->m_len == 0)
+			continue;
+		/*
+		 * Each trip around loop adds in
+		 * word from one mbuf segment.
+		 */
+		w = mtod(m, u_short *);
+		if (mlen == -1) {
+			/*
+			 * There is a byte left from the last segment;
+			 * ones-complement add it into the checksum.
+			 */
+#if BYTE_ORDER == BIG_ENDIAN
+			sum  += *(u_char *)w;
+#else
+			sum  += *(u_char *)w << 8;
+#endif
+			sum  += sum;
+			w = (u_short *)(1 + (char *)w);
+			mlen = m->m_len - 1;
+			len--;
+			FOLD(sum);
+		} else
+			mlen = m->m_len;
+		if (len < mlen)
+			mlen = len;
+		len -= mlen;
+		/*
+		 * We can do a 16 bit ones complement sum using
+		 * 32 bit arithmetic registers for adding,
+		 * with carries from the low added
+		 * into the high (by normal carry-chaining)
+		 * so long as we fold back before 16 carries have occurred.
+		 */
+		/* Odd start address: take the byte-at-a-time path instead. */
+		if (1 & (int) w)
+			goto uuuuglyy;
+#ifndef TINY
+/* -DTINY reduces the size from 1250 to 550, but slows it down by 22% */
+		while ((mlen -= 32) >= 0) {
+			sum += w[0]; sum += sum; sum += w[1]; sum += sum;
+			sum += w[2]; sum += sum; sum += w[3]; sum += sum;
+			sum += w[4]; sum += sum; sum += w[5]; sum += sum;
+			sum += w[6]; sum += sum; sum += w[7]; sum += sum;
+			FOLD(sum);
+			sum += w[8]; sum += sum; sum += w[9]; sum += sum;
+			sum += w[10]; sum += sum; sum += w[11]; sum += sum;
+			sum += w[12]; sum += sum; sum += w[13]; sum += sum;
+			sum += w[14]; sum += sum; sum += w[15]; sum += sum;
+			FOLD(sum);
+			w += 16;
+		}
+		mlen += 32;
+#endif
+		while ((mlen -= 8) >= 0) {
+			sum += w[0]; sum += sum; sum += w[1]; sum += sum;
+			sum += w[2]; sum += sum; sum += w[3]; sum += sum;
+			FOLD(sum);
+			w += 4;
+		}
+		mlen += 8;
+		while ((mlen -= 2) >= 0) {
+			sum += *w++; sum += sum;
+		}
+		goto commoncase;
+uuuuglyy:
+/*
+ * ww(n) and vv(n) pick the two bytes of the n'th 16-bit word
+ * individually; which one is which depends on BYTE_ORDER.  The ww bytes
+ * go into `sum', the vv bytes into `sum2', and the two are merged with
+ * "sum += (sum2 << 8)" once the segment has been consumed.
+ */
+#if BYTE_ORDER == BIG_ENDIAN
+#define ww(n) (((u_char *)w)[n + n + 1])
+#define vv(n) (((u_char *)w)[n + n])
+#else
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define vv(n) (((u_char *)w)[n + n + 1])
+#define ww(n) (((u_char *)w)[n + n])
+#endif
+#endif
+		sum2 = 0;
+#ifndef TINY
+		while ((mlen -= 32) >= 0) {
+		    sum += ww(0); sum += sum; sum += ww(1); sum += sum;
+		    sum += ww(2); sum += sum; sum += ww(3); sum += sum;
+		    sum += ww(4); sum += sum; sum += ww(5); sum += sum;
+		    sum += ww(6); sum += sum; sum += ww(7); sum += sum;
+		    FOLD(sum);
+		    sum += ww(8); sum += sum; sum += ww(9); sum += sum;
+		    sum += ww(10); sum += sum; sum += ww(11); sum += sum;
+		    sum += ww(12); sum += sum; sum += ww(13); sum += sum;
+		    sum += ww(14); sum += sum; sum += ww(15); sum += sum;
+		    FOLD(sum);
+		    sum2 += vv(0); sum2 += sum2; sum2 += vv(1); sum2 += sum2;
+		    sum2 += vv(2); sum2 += sum2; sum2 += vv(3); sum2 += sum2;
+		    sum2 += vv(4); sum2 += sum2; sum2 += vv(5); sum2 += sum2;
+		    sum2 += vv(6); sum2 += sum2; sum2 += vv(7); sum2 += sum2;
+		    FOLD(sum2);
+		    sum2 += vv(8); sum2 += sum2; sum2 += vv(9); sum2 += sum2;
+		    sum2 += vv(10); sum2 += sum2; sum2 += vv(11); sum2 += sum2;
+		    sum2 += vv(12); sum2 += sum2; sum2 += vv(13); sum2 += sum2;
+		    sum2 += vv(14); sum2 += sum2; sum2 += vv(15); sum2 += sum2;
+		    FOLD(sum2);
+		    w += 16;
+		}
+		mlen += 32;
+#endif
+		while ((mlen -= 8) >= 0) {
+		    sum += ww(0); sum += sum; sum += ww(1); sum += sum;
+		    sum += ww(2); sum += sum; sum += ww(3); sum += sum;
+		    FOLD(sum);
+		    sum2 += vv(0); sum2 += sum2; sum2 += vv(1); sum2 += sum2;
+		    sum2 += vv(2); sum2 += sum2; sum2 += vv(3); sum2 += sum2;
+		    FOLD(sum2);
+		    w += 4;
+		}
+		mlen += 8;
+		while ((mlen -= 2) >= 0) {
+			sum += ww(0); sum += sum;
+			sum2 += vv(0); sum2 += sum2;
+			w++;
+		}
+		sum += (sum2 << 8);
+commoncase:
+		/* One byte of this segment is left over; add it in now. */
+		if (mlen == -1) {
+#if BYTE_ORDER == BIG_ENDIAN
+			sum += *(u_char *)w << 8;
+#else
+			sum += *(u_char *)w;
+#endif
+		}
+		FOLD(sum);
+	}
+	if (mlen == -1) {
+		/* We had an odd number of bytes to sum; assume a garbage
+		   byte of zero and clean up */
+		sum += sum;
+		FOLD(sum);
+	}
+	/*
+	 * sum has already been kept to low sixteen bits.
+	 * just examine result and exit.
+	 */
+	if(sum==0xffff) sum = 0;
+	return (sum);
+}
diff --git a/sys/netns/ns_error.c b/sys/netns/ns_error.c
new file mode 100644
index 00000000000..03473a23680
--- /dev/null
+++ b/sys/netns/ns_error.c
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 1984, 1988, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ns_error.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#ifdef lint +#define NS_ERRPRINTFS 1 +#endif + +#ifdef NS_ERRPRINTFS +/* + * NS_ERR routines: error generation, receive packet processing, and + * routines to turnaround packets back to the originator. 
+ */ +int ns_errprintfs = 0; +#endif + +ns_err_x(c) +{ + register u_short *w, *lim, *base = ns_errstat.ns_es_codes; + u_short x = c; + + /* + * zero is a legit error code, handle specially + */ + if (x == 0) + return (0); + lim = base + NS_ERR_MAX - 1; + for (w = base + 1; w < lim; w++) { + if (*w == 0) + *w = x; + if (*w == x) + break; + } + return (w - base); +} + +/* + * Generate an error packet of type error + * in response to bad packet. + */ + +ns_error(om, type, param) + struct mbuf *om; + int type; +{ + register struct ns_epidp *ep; + struct mbuf *m; + struct idp *nip; + register struct idp *oip = mtod(om, struct idp *); + extern int idpcksum; + + /* + * If this packet was sent to the echo port, + * and nobody was there, just echo it. + * (Yes, this is a wart!) + */ + if (type == NS_ERR_NOSOCK && + oip->idp_dna.x_port == htons(2) && + (type = ns_echo(om))==0) + return; + +#ifdef NS_ERRPRINTFS + if (ns_errprintfs) + printf("ns_err_error(%x, %d, %d)\n", oip, type, param); +#endif + /* + * Don't Generate error packets in response to multicasts. + */ + if (oip->idp_dna.x_host.c_host[0] & 1) + goto freeit; + + ns_errstat.ns_es_error++; + /* + * Make sure that the old IDP packet had 30 bytes of data to return; + * if not, don't bother. Also don't EVER error if the old + * packet protocol was NS_ERR. 
+ */ + if (oip->idp_len < sizeof(struct idp)) { + ns_errstat.ns_es_oldshort++; + goto freeit; + } + if (oip->idp_pt == NSPROTO_ERROR) { + ns_errstat.ns_es_oldns_err++; + goto freeit; + } + + /* + * First, formulate ns_err message + */ + m = m_gethdr(M_DONTWAIT, MT_HEADER); + if (m == NULL) + goto freeit; + m->m_len = sizeof(*ep); + MH_ALIGN(m, m->m_len); + ep = mtod(m, struct ns_epidp *); + if ((u_int)type > NS_ERR_TOO_BIG) + panic("ns_err_error"); + ns_errstat.ns_es_outhist[ns_err_x(type)]++; + ep->ns_ep_errp.ns_err_num = htons((u_short)type); + ep->ns_ep_errp.ns_err_param = htons((u_short)param); + bcopy((caddr_t)oip, (caddr_t)&ep->ns_ep_errp.ns_err_idp, 42); + nip = &ep->ns_ep_idp; + nip->idp_len = sizeof(*ep); + nip->idp_len = htons((u_short)nip->idp_len); + nip->idp_pt = NSPROTO_ERROR; + nip->idp_tc = 0; + nip->idp_dna = oip->idp_sna; + nip->idp_sna = oip->idp_dna; + if (idpcksum) { + nip->idp_sum = 0; + nip->idp_sum = ns_cksum(m, sizeof(*ep)); + } else + nip->idp_sum = 0xffff; + (void) ns_output(m, (struct route *)0, 0); + +freeit: + m_freem(om); +} + +ns_printhost(p) +register struct ns_addr *p; +{ + + printf("", + p->x_net.s_net[0], + p->x_net.s_net[1], + p->x_host.s_host[0], + p->x_host.s_host[1], + p->x_host.s_host[2], + p->x_port); + +} + +/* + * Process a received NS_ERR message. + */ +ns_err_input(m) + struct mbuf *m; +{ + register struct ns_errp *ep; + register struct ns_epidp *epidp = mtod(m, struct ns_epidp *); + register int i; + int type, code, param; + + /* + * Locate ns_err structure in mbuf, and check + * that not corrupted and of at least minimum length. 
+ */ +#ifdef NS_ERRPRINTFS + if (ns_errprintfs) { + printf("ns_err_input from "); + ns_printhost(&epidp->ns_ep_idp.idp_sna); + printf("len %d\n", ntohs(epidp->ns_ep_idp.idp_len)); + } +#endif + i = sizeof (struct ns_epidp); + if (((m->m_flags & M_EXT) || m->m_len < i) && + (m = m_pullup(m, i)) == 0) { + ns_errstat.ns_es_tooshort++; + return; + } + ep = &(mtod(m, struct ns_epidp *)->ns_ep_errp); + type = ntohs(ep->ns_err_num); + param = ntohs(ep->ns_err_param); + ns_errstat.ns_es_inhist[ns_err_x(type)]++; + +#ifdef NS_ERRPRINTFS + /* + * Message type specific processing. + */ + if (ns_errprintfs) + printf("ns_err_input, type %d param %d\n", type, param); +#endif + if (type >= NS_ERR_TOO_BIG) { + goto badcode; + } + ns_errstat.ns_es_outhist[ns_err_x(type)]++; + switch (type) { + + case NS_ERR_UNREACH_HOST: + code = PRC_UNREACH_NET; + goto deliver; + + case NS_ERR_TOO_OLD: + code = PRC_TIMXCEED_INTRANS; + goto deliver; + + case NS_ERR_TOO_BIG: + code = PRC_MSGSIZE; + goto deliver; + + case NS_ERR_FULLUP: + code = PRC_QUENCH; + goto deliver; + + case NS_ERR_NOSOCK: + code = PRC_UNREACH_PORT; + goto deliver; + + case NS_ERR_UNSPEC_T: + case NS_ERR_BADSUM_T: + case NS_ERR_BADSUM: + case NS_ERR_UNSPEC: + code = PRC_PARAMPROB; + goto deliver; + + deliver: + /* + * Problem with datagram; advise higher level routines. 
+ */ +#ifdef NS_ERRPRINTFS + if (ns_errprintfs) + printf("deliver to protocol %d\n", + ep->ns_err_idp.idp_pt); +#endif + switch(ep->ns_err_idp.idp_pt) { + case NSPROTO_SPP: + spp_ctlinput(code, (caddr_t)ep); + break; + + default: + idp_ctlinput(code, (caddr_t)ep); + } + + goto freeit; + + default: + badcode: + ns_errstat.ns_es_badcode++; + goto freeit; + + } +freeit: + m_freem(m); +} + +#ifdef notdef +u_long +nstime() +{ + int s = splclock(); + u_long t; + + t = (time.tv_sec % (24*60*60)) * 1000 + time.tv_usec / 1000; + splx(s); + return (htonl(t)); +} +#endif + +ns_echo(m) +struct mbuf *m; +{ + register struct idp *idp = mtod(m, struct idp *); + register struct echo { + struct idp ec_idp; + u_short ec_op; /* Operation, 1 = request, 2 = reply */ + } *ec = (struct echo *)idp; + struct ns_addr temp; + + if (idp->idp_pt!=NSPROTO_ECHO) return(NS_ERR_NOSOCK); + if (ec->ec_op!=htons(1)) return(NS_ERR_UNSPEC); + + ec->ec_op = htons(2); + + temp = idp->idp_dna; + idp->idp_dna = idp->idp_sna; + idp->idp_sna = temp; + + if (idp->idp_sum != 0xffff) { + idp->idp_sum = 0; + idp->idp_sum = ns_cksum(m, + (int)(((ntohs(idp->idp_len) - 1)|1)+1)); + } + (void) ns_output(m, (struct route *)0, NS_FORWARDING); + return(0); +} diff --git a/sys/netns/ns_error.h b/sys/netns/ns_error.h new file mode 100644 index 00000000000..992911f1552 --- /dev/null +++ b/sys/netns/ns_error.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 1984, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)ns_error.h	8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * Xerox NS error messages
+ */
+
+/* Body of an error packet: code, parameter, and a copy of the offender. */
+struct ns_errp {
+	u_short		ns_err_num;	/* Error Number */
+	u_short		ns_err_param;	/* Error Parameter */
+	struct idp	ns_err_idp;	/* Initial segment of offending
+					   packet */
+	u_char		ns_err_lev2[12]; /* at least this much higher
+					    level protocol */
+};
+/* A complete error packet: its own IDP header followed by the body. */
+struct  ns_epidp {
+	struct idp ns_ep_idp;
+	struct ns_errp ns_ep_errp;
+};
+
+#define	NS_ERR_UNSPEC	0	/* Unspecified Error detected at dest. */
+#define	NS_ERR_BADSUM	1	/* Bad Checksum detected at dest */
+#define	NS_ERR_NOSOCK	2	/* Specified socket does not exist at dest*/
+#define	NS_ERR_FULLUP	3	/* Dest. refuses packet due to resource lim.*/
+#define	NS_ERR_UNSPEC_T	0x200	/* Unspec. Error occurred before reaching dest*/
+#define	NS_ERR_BADSUM_T	0x201	/* Bad Checksum detected in transit */
+#define	NS_ERR_UNREACH_HOST	0x202	/* Dest cannot be reached from here*/
+#define	NS_ERR_TOO_OLD	0x203	/* Packet x'd 15 routers without delivery*/
+#define	NS_ERR_TOO_BIG	0x204	/* Packet too large to be forwarded through
+				   some intermediate gateway.  The error
+				   parameter field contains the max packet
+				   size that can be accommodated */
+/* Number of slots in each histogram / code table below. */
+#define NS_ERR_MAX 20
+
+/*
+ * Variables related to this implementation
+ * of the network systems error message protocol.
+ */
+struct ns_errstat {
+/* statistics related to ns_err packets generated */
+	int	ns_es_error;		/* # of calls to ns_error */
+	int	ns_es_oldshort;		/* no error 'cuz old ip too short */
+	int	ns_es_oldns_err;	/* no error 'cuz old was ns_err */
+	int	ns_es_outhist[NS_ERR_MAX];
+/* statistics related to input messages processed */
+	int	ns_es_badcode;		/* ns_err_code out of range */
+	int	ns_es_tooshort;		/* packet < IDP_MINLEN */
+	int	ns_es_checksum;		/* bad checksum */
+	int	ns_es_badlen;		/* calculated bound mismatch */
+	int	ns_es_reflect;		/* number of responses */
+	int	ns_es_inhist[NS_ERR_MAX];
+	u_short	ns_es_codes[NS_ERR_MAX];/* which error code for outhist
+					   since we might not know all */
+};
+
+#ifdef KERNEL
+struct	ns_errstat ns_errstat;
+#endif
diff --git a/sys/netns/ns_if.h b/sys/netns/ns_if.h
new file mode 100644
index 00000000000..3abb284a1dd
--- /dev/null
+++ b/sys/netns/ns_if.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 1984, 1985, 1986, 1987, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ns_if.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Interface address, xerox version. One of these structures + * is allocated for each interface with an internet address. 
+ * The ifaddr structure contains the protocol-independent part
+ * of the structure and is assumed to be first.
+ */
+
+struct ns_ifaddr {
+	struct	ifaddr ia_ifa;		/* protocol-independent info */
+/* shorthands that reach through the embedded ifaddr */
+#define	ia_ifp		ia_ifa.ifa_ifp
+#define	ia_flags	ia_ifa.ifa_flags
+	struct	ns_ifaddr *ia_next;	/* next in list of xerox addresses */
+	struct	sockaddr_ns ia_addr;	/* reserve space for my address */
+	struct	sockaddr_ns ia_dstaddr;	/* space for my broadcast address */
+/* the broadcast and destination address share one slot */
+#define ia_broadaddr	ia_dstaddr
+	struct	sockaddr_ns ia_netmask;	/* space for my network mask */
+};
+
+/*
+ * NOTE(review): presumably the argument block of an add/delete-alias
+ * ioctl for NS addresses -- confirm against ns_control().
+ */
+struct	ns_aliasreq {
+	char	ifra_name[IFNAMSIZ];		/* if name, e.g. "en0" */
+	struct	sockaddr_ns ifra_addr;
+	struct	sockaddr_ns ifra_broadaddr;
+/* the broadcast and destination address share one slot */
+#define ifra_dstaddr ifra_broadaddr
+};
+/*
+ * Given a pointer to an ns_ifaddr (ifaddr),
+ * return a pointer to the addr as a sockaddr_ns.
+ */
+
+#define	IA_SNS(ia) (&(((struct ns_ifaddr *)(ia))->ia_addr))
+
+/* This is not the right place for this but where is? */
+#define	ETHERTYPE_NS	0x0600
+
+#ifdef NSIP
+/* Request handed to nsip_route(): an NS destination and its IP gateway. */
+struct nsip_req {
+	struct sockaddr rq_ns;	/* must be ns format destination */
+	struct sockaddr rq_ip;	/* must be ip format gateway */
+	short rq_flags;
+};
+#endif
+
+#ifdef	KERNEL
+struct	ns_ifaddr *ns_ifaddr;		/* head of the ia_next list */
+struct	ns_ifaddr *ns_iaonnetof();
+struct	ifqueue	nsintrq;	/* XNS input packet queue */
+#endif
diff --git a/sys/netns/ns_input.c b/sys/netns/ns_input.c
new file mode 100644
index 00000000000..7a6e1babc7c
--- /dev/null
+++ b/sys/netns/ns_input.c
@@ -0,0 +1,485 @@
+/*
+ * Copyright (c) 1984, 1985, 1986, 1987, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ns_input.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * NS initialization. 
+ */ +union ns_host ns_thishost; +union ns_host ns_zerohost; +union ns_host ns_broadhost; +union ns_net ns_zeronet; +union ns_net ns_broadnet; +struct sockaddr_ns ns_netmask, ns_hostmask; + +static u_short allones[] = {-1, -1, -1}; + +struct nspcb nspcb; +struct nspcb nsrawpcb; + +struct ifqueue nsintrq; +int nsqmaxlen = IFQ_MAXLEN; + +int idpcksum = 1; +long ns_pexseq; + +ns_init() +{ + extern struct timeval time; + + ns_broadhost = * (union ns_host *) allones; + ns_broadnet = * (union ns_net *) allones; + nspcb.nsp_next = nspcb.nsp_prev = &nspcb; + nsrawpcb.nsp_next = nsrawpcb.nsp_prev = &nsrawpcb; + nsintrq.ifq_maxlen = nsqmaxlen; + ns_pexseq = time.tv_usec; + ns_netmask.sns_len = 6; + ns_netmask.sns_addr.x_net = ns_broadnet; + ns_hostmask.sns_len = 12; + ns_hostmask.sns_addr.x_net = ns_broadnet; + ns_hostmask.sns_addr.x_host = ns_broadhost; +} + +/* + * Idp input routine. Pass to next level. + */ +int nsintr_getpck = 0; +int nsintr_swtch = 0; +nsintr() +{ + register struct idp *idp; + register struct mbuf *m; + register struct nspcb *nsp; + register int i; + int len, s, error; + char oddpacketp; + +next: + /* + * Get next datagram off input queue and get IDP header + * in first mbuf. + */ + s = splimp(); + IF_DEQUEUE(&nsintrq, m); + splx(s); + nsintr_getpck++; + if (m == 0) + return; + if ((m->m_flags & M_EXT || m->m_len < sizeof (struct idp)) && + (m = m_pullup(m, sizeof (struct idp))) == 0) { + idpstat.idps_toosmall++; + goto next; + } + + /* + * Give any raw listeners a crack at the packet + */ + for (nsp = nsrawpcb.nsp_next; nsp != &nsrawpcb; nsp = nsp->nsp_next) { + struct mbuf *m1 = m_copy(m, 0, (int)M_COPYALL); + if (m1) idp_input(m1, nsp); + } + + idp = mtod(m, struct idp *); + len = ntohs(idp->idp_len); + if (oddpacketp = len & 1) { + len++; /* If this packet is of odd length, + preserve garbage byte for checksum */ + } + + /* + * Check that the amount of data in the buffers + * is as at least much as the IDP header would have us expect. 
+ * Trim mbufs if longer than we expect. + * Drop packet if shorter than we expect. + */ + if (m->m_pkthdr.len < len) { + idpstat.idps_tooshort++; + goto bad; + } + if (m->m_pkthdr.len > len) { + if (m->m_len == m->m_pkthdr.len) { + m->m_len = len; + m->m_pkthdr.len = len; + } else + m_adj(m, len - m->m_pkthdr.len); + } + if (idpcksum && ((i = idp->idp_sum)!=0xffff)) { + idp->idp_sum = 0; + if (i != (idp->idp_sum = ns_cksum(m, len))) { + idpstat.idps_badsum++; + idp->idp_sum = i; + if (ns_hosteqnh(ns_thishost, idp->idp_dna.x_host)) + error = NS_ERR_BADSUM; + else + error = NS_ERR_BADSUM_T; + ns_error(m, error, 0); + goto next; + } + } + /* + * Is this a directed broadcast? + */ + if (ns_hosteqnh(ns_broadhost,idp->idp_dna.x_host)) { + if ((!ns_neteq(idp->idp_dna, idp->idp_sna)) && + (!ns_neteqnn(idp->idp_dna.x_net, ns_broadnet)) && + (!ns_neteqnn(idp->idp_sna.x_net, ns_zeronet)) && + (!ns_neteqnn(idp->idp_dna.x_net, ns_zeronet)) ) { + /* + * Look to see if I need to eat this packet. + * Algorithm is to forward all young packets + * and prematurely age any packets which will + * by physically broadcasted. + * Any very old packets eaten without forwarding + * would die anyway. + * + * Suggestion of Bill Nesheim, Cornell U. + */ + if (idp->idp_tc < NS_MAXHOPS) { + idp_forward(m); + goto next; + } + } + /* + * Is this our packet? If not, forward. + */ + } else if (!ns_hosteqnh(ns_thishost,idp->idp_dna.x_host)) { + idp_forward(m); + goto next; + } + /* + * Locate pcb for datagram. + */ + nsp = ns_pcblookup(&idp->idp_sna, idp->idp_dna.x_port, NS_WILDCARD); + /* + * Switch out to protocol's input routine. 
+ */ + nsintr_swtch++; + if (nsp) { + if (oddpacketp) { + m_adj(m, -1); + } + if ((nsp->nsp_flags & NSP_ALL_PACKETS)==0) + switch (idp->idp_pt) { + + case NSPROTO_SPP: + spp_input(m, nsp); + goto next; + + case NSPROTO_ERROR: + ns_err_input(m); + goto next; + } + idp_input(m, nsp); + } else { + ns_error(m, NS_ERR_NOSOCK, 0); + } + goto next; + +bad: + m_freem(m); + goto next; +} + +u_char nsctlerrmap[PRC_NCMDS] = { + ECONNABORTED, ECONNABORTED, 0, 0, + 0, 0, EHOSTDOWN, EHOSTUNREACH, + ENETUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, + EMSGSIZE, 0, 0, 0, + 0, 0, 0, 0 +}; + +int idp_donosocks = 1; + +idp_ctlinput(cmd, arg) + int cmd; + caddr_t arg; +{ + struct ns_addr *ns; + struct nspcb *nsp; + struct ns_errp *errp; + int idp_abort(); + extern struct nspcb *idp_drop(); + int type; + + if (cmd < 0 || cmd > PRC_NCMDS) + return; + if (nsctlerrmap[cmd] == 0) + return; /* XXX */ + type = NS_ERR_UNREACH_HOST; + switch (cmd) { + struct sockaddr_ns *sns; + + case PRC_IFDOWN: + case PRC_HOSTDEAD: + case PRC_HOSTUNREACH: + sns = (struct sockaddr_ns *)arg; + if (sns->sns_family != AF_NS) + return; + ns = &sns->sns_addr; + break; + + default: + errp = (struct ns_errp *)arg; + ns = &errp->ns_err_idp.idp_dna; + type = errp->ns_err_num; + type = ntohs((u_short)type); + } + switch (type) { + + case NS_ERR_UNREACH_HOST: + ns_pcbnotify(ns, (int)nsctlerrmap[cmd], idp_abort, (long)0); + break; + + case NS_ERR_NOSOCK: + nsp = ns_pcblookup(ns, errp->ns_err_idp.idp_sna.x_port, + NS_WILDCARD); + if(nsp && idp_donosocks && ! ns_nullhost(nsp->nsp_faddr)) + (void) idp_drop(nsp, (int)nsctlerrmap[cmd]); + } +} + +int idpprintfs = 0; +int idpforwarding = 1; +/* + * Forward a packet. If some error occurs return the sender + * an error packet. Note we can't always generate a meaningful + * error message because the NS errors don't have a large enough repetoire + * of codes and types. 
+ */ +struct route idp_droute; +struct route idp_sroute; + +idp_forward(m) +struct mbuf *m; +{ + register struct idp *idp = mtod(m, struct idp *); + register int error, type, code; + struct mbuf *mcopy = NULL; + int agedelta = 1; + int flags = NS_FORWARDING; + int ok_there = 0; + int ok_back = 0; + + if (idpprintfs) { + printf("forward: src "); + ns_printhost(&idp->idp_sna); + printf(", dst "); + ns_printhost(&idp->idp_dna); + printf("hop count %d\n", idp->idp_tc); + } + if (idpforwarding == 0) { + /* can't tell difference between net and host */ + type = NS_ERR_UNREACH_HOST, code = 0; + goto senderror; + } + idp->idp_tc++; + if (idp->idp_tc > NS_MAXHOPS) { + type = NS_ERR_TOO_OLD, code = 0; + goto senderror; + } + /* + * Save at most 42 bytes of the packet in case + * we need to generate an NS error message to the src. + */ + mcopy = m_copy(m, 0, imin((int)ntohs(idp->idp_len), 42)); + + if ((ok_there = idp_do_route(&idp->idp_dna,&idp_droute))==0) { + type = NS_ERR_UNREACH_HOST, code = 0; + goto senderror; + } + /* + * Here we think about forwarding broadcast packets, + * so we try to insure that it doesn't go back out + * on the interface it came in on. Also, if we + * are going to physically broadcast this, let us + * age the packet so we can eat it safely the second time around. + */ + if (idp->idp_dna.x_host.c_host[0] & 0x1) { + struct ns_ifaddr *ia = ns_iaonnetof(&idp->idp_dna); + struct ifnet *ifp; + if (ia) { + /* I'm gonna hafta eat this packet */ + agedelta += NS_MAXHOPS - idp->idp_tc; + idp->idp_tc = NS_MAXHOPS; + } + if ((ok_back = idp_do_route(&idp->idp_sna,&idp_sroute))==0) { + /* error = ENETUNREACH; He'll never get it! 
*/ + m_freem(m); + goto cleanup; + } + if (idp_droute.ro_rt && + (ifp=idp_droute.ro_rt->rt_ifp) && + idp_sroute.ro_rt && + (ifp!=idp_sroute.ro_rt->rt_ifp)) { + flags |= NS_ALLOWBROADCAST; + } else { + type = NS_ERR_UNREACH_HOST, code = 0; + goto senderror; + } + } + /* need to adjust checksum */ + if (idp->idp_sum!=0xffff) { + union bytes { + u_char c[4]; + u_short s[2]; + long l; + } x; + register int shift; + x.l = 0; x.c[0] = agedelta; + shift = (((((int)ntohs(idp->idp_len))+1)>>1)-2) & 0xf; + x.l = idp->idp_sum + (x.s[0] << shift); + x.l = x.s[0] + x.s[1]; + x.l = x.s[0] + x.s[1]; + if (x.l==0xffff) idp->idp_sum = 0; else idp->idp_sum = x.l; + } + if ((error = ns_output(m, &idp_droute, flags)) && + (mcopy!=NULL)) { + idp = mtod(mcopy, struct idp *); + type = NS_ERR_UNSPEC_T, code = 0; + switch (error) { + + case ENETUNREACH: + case EHOSTDOWN: + case EHOSTUNREACH: + case ENETDOWN: + case EPERM: + type = NS_ERR_UNREACH_HOST; + break; + + case EMSGSIZE: + type = NS_ERR_TOO_BIG; + code = 576; /* too hard to figure out mtu here */ + break; + + case ENOBUFS: + type = NS_ERR_UNSPEC_T; + break; + } + mcopy = NULL; + senderror: + ns_error(m, type, code); + } +cleanup: + if (ok_there) + idp_undo_route(&idp_droute); + if (ok_back) + idp_undo_route(&idp_sroute); + if (mcopy != NULL) + m_freem(mcopy); +} + +idp_do_route(src, ro) +struct ns_addr *src; +struct route *ro; +{ + + struct sockaddr_ns *dst; + + bzero((caddr_t)ro, sizeof (*ro)); + dst = (struct sockaddr_ns *)&ro->ro_dst; + + dst->sns_len = sizeof(*dst); + dst->sns_family = AF_NS; + dst->sns_addr = *src; + dst->sns_addr.x_port = 0; + rtalloc(ro); + if (ro->ro_rt == 0 || ro->ro_rt->rt_ifp == 0) { + return (0); + } + ro->ro_rt->rt_use++; + return (1); +} + +idp_undo_route(ro) +register struct route *ro; +{ + if (ro->ro_rt) {RTFREE(ro->ro_rt);} +} + +ns_watch_output(m, ifp) +struct mbuf *m; +struct ifnet *ifp; +{ + register struct nspcb *nsp; + register struct ifaddr *ifa; + /* + * Give any raw listeners a crack at the 
packet + */ + for (nsp = nsrawpcb.nsp_next; nsp != &nsrawpcb; nsp = nsp->nsp_next) { + struct mbuf *m0 = m_copy(m, 0, (int)M_COPYALL); + if (m0) { + register struct idp *idp; + + M_PREPEND(m0, sizeof (*idp), M_DONTWAIT); + if (m0 == NULL) + continue; + idp = mtod(m0, struct idp *); + idp->idp_sna.x_net = ns_zeronet; + idp->idp_sna.x_host = ns_thishost; + if (ifp && (ifp->if_flags & IFF_POINTOPOINT)) + for(ifa = ifp->if_addrlist; ifa; + ifa = ifa->ifa_next) { + if (ifa->ifa_addr->sa_family==AF_NS) { + idp->idp_sna = IA_SNS(ifa)->sns_addr; + break; + } + } + idp->idp_len = ntohl(m0->m_pkthdr.len); + idp_input(m0, nsp); + } + } +} diff --git a/sys/netns/ns_ip.c b/sys/netns/ns_ip.c new file mode 100644 index 00000000000..09deb8fe7c4 --- /dev/null +++ b/sys/netns/ns_ip.c @@ -0,0 +1,440 @@ +/* + * Copyright (c) 1984, 1985, 1986, 1987, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ns_ip.c 8.1 (Berkeley) 6/10/93 + */ + +/* + * Software interface driver for encapsulating ns in ip. + */ + +#ifdef NSIP +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +struct ifnet_en { + struct ifnet ifen_ifnet; + struct route ifen_route; + struct in_addr ifen_src; + struct in_addr ifen_dst; + struct ifnet_en *ifen_next; +}; + +int nsipoutput(), nsipioctl(), nsipstart(); +#define LOMTU (1024+512); + +struct ifnet nsipif; +struct ifnet_en *nsip_list; /* list of all hosts and gateways or + broadcast addrs */ + +struct ifnet_en * +nsipattach() +{ + register struct ifnet_en *m; + register struct ifnet *ifp; + + if (nsipif.if_mtu == 0) { + ifp = &nsipif; + ifp->if_name = "nsip"; + ifp->if_mtu = LOMTU; + ifp->if_ioctl = nsipioctl; + ifp->if_output = nsipoutput; + ifp->if_start = nsipstart; + ifp->if_flags = IFF_POINTOPOINT; + } + + MALLOC((m), struct ifnet_en *, sizeof(*m), M_PCB, M_NOWAIT); + if (m == NULL) return (NULL); + m->ifen_next = nsip_list; + nsip_list = m; + ifp = &m->ifen_ifnet; + + ifp->if_name = "nsip"; 
+ ifp->if_mtu = LOMTU; + ifp->if_ioctl = nsipioctl; + ifp->if_output = nsipoutput; + ifp->if_start = nsipstart; + ifp->if_flags = IFF_POINTOPOINT; + ifp->if_unit = nsipif.if_unit++; + if_attach(ifp); + + return (m); +} + + +/* + * Process an ioctl request. + */ +/* ARGSUSED */ +nsipioctl(ifp, cmd, data) + register struct ifnet *ifp; + int cmd; + caddr_t data; +{ + int error = 0; + struct ifreq *ifr; + + switch (cmd) { + + case SIOCSIFADDR: + ifp->if_flags |= IFF_UP; + /* fall into: */ + + case SIOCSIFDSTADDR: + /* + * Everything else is done at a higher level. + */ + break; + + case SIOCSIFFLAGS: + ifr = (struct ifreq *)data; + if ((ifr->ifr_flags & IFF_UP) == 0) + error = nsip_free(ifp); + + + default: + error = EINVAL; + } + return (error); +} + +struct mbuf *nsip_badlen; +struct mbuf *nsip_lastin; +int nsip_hold_input; + +idpip_input(m, ifp) + register struct mbuf *m; + struct ifnet *ifp; +{ + register struct ip *ip; + register struct idp *idp; + register struct ifqueue *ifq = &nsintrq; + int len, s; + + if (nsip_hold_input) { + if (nsip_lastin) { + m_freem(nsip_lastin); + } + nsip_lastin = m_copym(m, 0, (int)M_COPYALL, M_DONTWAIT); + } + /* + * Get IP and IDP header together in first mbuf. + */ + nsipif.if_ipackets++; + s = sizeof (struct ip) + sizeof (struct idp); + if (((m->m_flags & M_EXT) || m->m_len < s) && + (m = m_pullup(m, s)) == 0) { + nsipif.if_ierrors++; + return; + } + ip = mtod(m, struct ip *); + if (ip->ip_hl > (sizeof (struct ip) >> 2)) { + ip_stripoptions(m, (struct mbuf *)0); + if (m->m_len < s) { + if ((m = m_pullup(m, s)) == 0) { + nsipif.if_ierrors++; + return; + } + ip = mtod(m, struct ip *); + } + } + + /* + * Make mbuf data length reflect IDP length. + * If not enough data to reflect IDP length, drop. 
+ */ + m->m_data += sizeof (struct ip); + m->m_len -= sizeof (struct ip); + m->m_pkthdr.len -= sizeof (struct ip); + idp = mtod(m, struct idp *); + len = ntohs(idp->idp_len); + if (len & 1) len++; /* Preserve Garbage Byte */ + if (ip->ip_len != len) { + if (len > ip->ip_len) { + nsipif.if_ierrors++; + if (nsip_badlen) m_freem(nsip_badlen); + nsip_badlen = m; + return; + } + /* Any extra will be trimmed off by the NS routines */ + } + + /* + * Place interface pointer before the data + * for the receiving protocol. + */ + m->m_pkthdr.rcvif = ifp; + /* + * Deliver to NS + */ + s = splimp(); + if (IF_QFULL(ifq)) { + IF_DROP(ifq); +bad: + m_freem(m); + splx(s); + return; + } + IF_ENQUEUE(ifq, m); + schednetisr(NETISR_NS); + splx(s); + return; +} + +/* ARGSUSED */ +nsipoutput(ifn, m, dst) + struct ifnet_en *ifn; + register struct mbuf *m; + struct sockaddr *dst; +{ + + register struct ip *ip; + register struct route *ro = &(ifn->ifen_route); + register int len = 0; + register struct idp *idp = mtod(m, struct idp *); + int error; + + ifn->ifen_ifnet.if_opackets++; + nsipif.if_opackets++; + + + /* + * Calculate data length and make space + * for IP header. + */ + len = ntohs(idp->idp_len); + if (len & 1) len++; /* Preserve Garbage Byte */ + /* following clause not necessary on vax */ + if (3 & (int)m->m_data) { + /* force longword alignment of ip hdr */ + struct mbuf *m0 = m_gethdr(MT_HEADER, M_DONTWAIT); + if (m0 == 0) { + m_freem(m); + return (ENOBUFS); + } + MH_ALIGN(m0, sizeof (struct ip)); + m0->m_flags = m->m_flags & M_COPYFLAGS; + m0->m_next = m; + m0->m_len = sizeof (struct ip); + m0->m_pkthdr.len = m0->m_len + m->m_len; + m->m_flags &= ~M_PKTHDR; + } else { + M_PREPEND(m, sizeof (struct ip), M_DONTWAIT); + if (m == 0) + return (ENOBUFS); + } + /* + * Fill in IP header. 
+ */ + ip = mtod(m, struct ip *); + *(long *)ip = 0; + ip->ip_p = IPPROTO_IDP; + ip->ip_src = ifn->ifen_src; + ip->ip_dst = ifn->ifen_dst; + ip->ip_len = (u_short)len + sizeof (struct ip); + ip->ip_ttl = MAXTTL; + + /* + * Output final datagram. + */ + error = (ip_output(m, (struct mbuf *)0, ro, SO_BROADCAST, NULL)); + if (error) { + ifn->ifen_ifnet.if_oerrors++; + ifn->ifen_ifnet.if_ierrors = error; + } + return (error); +bad: + m_freem(m); + return (ENETUNREACH); +} + +nsipstart(ifp) +struct ifnet *ifp; +{ + panic("nsip_start called\n"); +} + +struct ifreq ifr = {"nsip0"}; + +nsip_route(m) + register struct mbuf *m; +{ + register struct nsip_req *rq = mtod(m, struct nsip_req *); + struct sockaddr_ns *ns_dst = (struct sockaddr_ns *)&rq->rq_ns; + struct sockaddr_in *ip_dst = (struct sockaddr_in *)&rq->rq_ip; + struct route ro; + struct ifnet_en *ifn; + struct sockaddr_in *src; + + /* + * First, make sure we already have an ns address: + */ + if (ns_hosteqnh(ns_thishost, ns_zerohost)) + return (EADDRNOTAVAIL); + /* + * Now, determine if we can get to the destination + */ + bzero((caddr_t)&ro, sizeof (ro)); + ro.ro_dst = *(struct sockaddr *)ip_dst; + rtalloc(&ro); + if (ro.ro_rt == 0 || ro.ro_rt->rt_ifp == 0) { + return (ENETUNREACH); + } + + /* + * And see how he's going to get back to us: + * i.e., what return ip address do we use? + */ + { + register struct in_ifaddr *ia; + struct ifnet *ifp = ro.ro_rt->rt_ifp; + + for (ia = in_ifaddr; ia; ia = ia->ia_next) + if (ia->ia_ifp == ifp) + break; + if (ia == 0) + ia = in_ifaddr; + if (ia == 0) { + RTFREE(ro.ro_rt); + return (EADDRNOTAVAIL); + } + src = (struct sockaddr_in *)&ia->ia_addr; + } + + /* + * Is there a free (pseudo-)interface or space? 
+ */ + for (ifn = nsip_list; ifn; ifn = ifn->ifen_next) { + if ((ifn->ifen_ifnet.if_flags & IFF_UP) == 0) + break; + } + if (ifn == NULL) + ifn = nsipattach(); + if (ifn == NULL) { + RTFREE(ro.ro_rt); + return (ENOBUFS); + } + ifn->ifen_route = ro; + ifn->ifen_dst = ip_dst->sin_addr; + ifn->ifen_src = src->sin_addr; + + /* + * now configure this as a point to point link + */ + ifr.ifr_name[4] = '0' + nsipif.if_unit - 1; + ifr.ifr_dstaddr = * (struct sockaddr *) ns_dst; + (void)ns_control((struct socket *)0, (int)SIOCSIFDSTADDR, (caddr_t)&ifr, + (struct ifnet *)ifn); + satons_addr(ifr.ifr_addr).x_host = ns_thishost; + return (ns_control((struct socket *)0, (int)SIOCSIFADDR, (caddr_t)&ifr, + (struct ifnet *)ifn)); +} + +nsip_free(ifp) +struct ifnet *ifp; +{ + register struct ifnet_en *ifn = (struct ifnet_en *)ifp; + struct route *ro = & ifn->ifen_route; + + if (ro->ro_rt) { + RTFREE(ro->ro_rt); + ro->ro_rt = 0; + } + ifp->if_flags &= ~IFF_UP; + return (0); +} + +nsip_ctlinput(cmd, sa) + int cmd; + struct sockaddr *sa; +{ + extern u_char inetctlerrmap[]; + struct sockaddr_in *sin; + int in_rtchange(); + + if ((unsigned)cmd >= PRC_NCMDS) + return; + if (sa->sa_family != AF_INET && sa->sa_family != AF_IMPLINK) + return; + sin = (struct sockaddr_in *)sa; + if (sin->sin_addr.s_addr == INADDR_ANY) + return; + + switch (cmd) { + + case PRC_ROUTEDEAD: + case PRC_REDIRECT_NET: + case PRC_REDIRECT_HOST: + case PRC_REDIRECT_TOSNET: + case PRC_REDIRECT_TOSHOST: + nsip_rtchange(&sin->sin_addr); + break; + } +} + +nsip_rtchange(dst) + register struct in_addr *dst; +{ + register struct ifnet_en *ifn; + + for (ifn = nsip_list; ifn; ifn = ifn->ifen_next) { + if (ifn->ifen_dst.s_addr == dst->s_addr && + ifn->ifen_route.ro_rt) { + RTFREE(ifn->ifen_route.ro_rt); + ifn->ifen_route.ro_rt = 0; + } + } +} +#endif diff --git a/sys/netns/ns_output.c b/sys/netns/ns_output.c new file mode 100644 index 00000000000..4c9f364f1ea --- /dev/null +++ b/sys/netns/ns_output.c @@ -0,0 +1,160 @@ +/* + * 
Copyright (c) 1984, 1985, 1986, 1987, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ns_output.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#ifdef vax +#include +#endif +int ns_hold_output = 0; +int ns_copy_output = 0; +int ns_output_cnt = 0; +struct mbuf *ns_lastout; + +ns_output(m0, ro, flags) + struct mbuf *m0; + struct route *ro; + int flags; +{ + register struct idp *idp = mtod(m0, struct idp *); + register struct ifnet *ifp = 0; + int error = 0; + struct route idproute; + struct sockaddr_ns *dst; + extern int idpcksum; + + if (ns_hold_output) { + if (ns_lastout) { + (void)m_free(ns_lastout); + } + ns_lastout = m_copy(m0, 0, (int)M_COPYALL); + } + /* + * Route packet. + */ + if (ro == 0) { + ro = &idproute; + bzero((caddr_t)ro, sizeof (*ro)); + } + dst = (struct sockaddr_ns *)&ro->ro_dst; + if (ro->ro_rt == 0) { + dst->sns_family = AF_NS; + dst->sns_len = sizeof (*dst); + dst->sns_addr = idp->idp_dna; + dst->sns_addr.x_port = 0; + /* + * If routing to interface only, + * short circuit routing lookup. + */ + if (flags & NS_ROUTETOIF) { + struct ns_ifaddr *ia = ns_iaonnetof(&idp->idp_dna); + + if (ia == 0) { + error = ENETUNREACH; + goto bad; + } + ifp = ia->ia_ifp; + goto gotif; + } + rtalloc(ro); + } else if ((ro->ro_rt->rt_flags & RTF_UP) == 0) { + /* + * The old route has gone away; try for a new one. + */ + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + rtalloc(ro); + } + if (ro->ro_rt == 0 || (ifp = ro->ro_rt->rt_ifp) == 0) { + error = ENETUNREACH; + goto bad; + } + ro->ro_rt->rt_use++; + if (ro->ro_rt->rt_flags & (RTF_GATEWAY|RTF_HOST)) + dst = (struct sockaddr_ns *)ro->ro_rt->rt_gateway; +gotif: + + /* + * Look for multicast addresses and + * and verify user is allowed to send + * such a packet. 
+ */ + if (dst->sns_addr.x_host.c_host[0]&1) { + if ((ifp->if_flags & IFF_BROADCAST) == 0) { + error = EADDRNOTAVAIL; + goto bad; + } + if ((flags & NS_ALLOWBROADCAST) == 0) { + error = EACCES; + goto bad; + } + } + + if (htons(idp->idp_len) <= ifp->if_mtu) { + ns_output_cnt++; + if (ns_copy_output) { + ns_watch_output(m0, ifp); + } + error = (*ifp->if_output)(ifp, m0, + (struct sockaddr *)dst, ro->ro_rt); + goto done; + } else error = EMSGSIZE; + + +bad: + if (ns_copy_output) { + ns_watch_output(m0, ifp); + } + m_freem(m0); +done: + if (ro == &idproute && (flags & NS_ROUTETOIF) == 0 && ro->ro_rt) { + RTFREE(ro->ro_rt); + ro->ro_rt = 0; + } + return (error); +} diff --git a/sys/netns/ns_pcb.c b/sys/netns/ns_pcb.c new file mode 100644 index 00000000000..ca88472d594 --- /dev/null +++ b/sys/netns/ns_pcb.c @@ -0,0 +1,363 @@ +/* + * Copyright (c) 1984, 1985, 1986, 1987, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ns_pcb.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +struct ns_addr zerons_addr; + +ns_pcballoc(so, head) + struct socket *so; + struct nspcb *head; +{ + struct mbuf *m; + register struct nspcb *nsp; + + m = m_getclr(M_DONTWAIT, MT_PCB); + if (m == NULL) + return (ENOBUFS); + nsp = mtod(m, struct nspcb *); + nsp->nsp_socket = so; + insque(nsp, head); + so->so_pcb = (caddr_t)nsp; + return (0); +} + +ns_pcbbind(nsp, nam) + register struct nspcb *nsp; + struct mbuf *nam; +{ + register struct sockaddr_ns *sns; + u_short lport = 0; + + if (nsp->nsp_lport || !ns_nullhost(nsp->nsp_laddr)) + return (EINVAL); + if (nam == 0) + goto noname; + sns = mtod(nam, struct sockaddr_ns *); + if (nam->m_len != sizeof (*sns)) + return (EINVAL); + if (!ns_nullhost(sns->sns_addr)) { + int tport = sns->sns_port; + + sns->sns_port = 0; /* yech... 
*/ + if (ifa_ifwithaddr((struct sockaddr *)sns) == 0) + return (EADDRNOTAVAIL); + sns->sns_port = tport; + } + lport = sns->sns_port; + if (lport) { + u_short aport = ntohs(lport); + + if (aport < NSPORT_RESERVED && + (nsp->nsp_socket->so_state & SS_PRIV) == 0) + return (EACCES); + if (ns_pcblookup(&zerons_addr, lport, 0)) + return (EADDRINUSE); + } + nsp->nsp_laddr = sns->sns_addr; +noname: + if (lport == 0) + do { + if (nspcb.nsp_lport++ < NSPORT_RESERVED) + nspcb.nsp_lport = NSPORT_RESERVED; + lport = htons(nspcb.nsp_lport); + } while (ns_pcblookup(&zerons_addr, lport, 0)); + nsp->nsp_lport = lport; + return (0); +} + +/* + * Connect from a socket to a specified address. + * Both address and port must be specified in argument sns. + * If don't have a local address for this socket yet, + * then pick one. + */ +ns_pcbconnect(nsp, nam) + struct nspcb *nsp; + struct mbuf *nam; +{ + struct ns_ifaddr *ia; + register struct sockaddr_ns *sns = mtod(nam, struct sockaddr_ns *); + register struct ns_addr *dst; + register struct route *ro; + struct ifnet *ifp; + + if (nam->m_len != sizeof (*sns)) + return (EINVAL); + if (sns->sns_family != AF_NS) + return (EAFNOSUPPORT); + if (sns->sns_port==0 || ns_nullhost(sns->sns_addr)) + return (EADDRNOTAVAIL); + /* + * If we haven't bound which network number to use as ours, + * we will use the number of the outgoing interface. + * This depends on having done a routing lookup, which + * we will probably have to do anyway, so we might + * as well do it now. On the other hand if we are + * sending to multiple destinations we may have already + * done the lookup, so see if we can use the route + * from before. In any case, we only + * chose a port number once, even if sending to multiple + * destinations. 
+ */ + ro = &nsp->nsp_route; + dst = &satons_addr(ro->ro_dst); + if (nsp->nsp_socket->so_options & SO_DONTROUTE) + goto flush; + if (!ns_neteq(nsp->nsp_lastdst, sns->sns_addr)) + goto flush; + if (!ns_hosteq(nsp->nsp_lastdst, sns->sns_addr)) { + if (ro->ro_rt && ! (ro->ro_rt->rt_flags & RTF_HOST)) { + /* can patch route to avoid rtalloc */ + *dst = sns->sns_addr; + } else { + flush: + if (ro->ro_rt) + RTFREE(ro->ro_rt); + ro->ro_rt = (struct rtentry *)0; + nsp->nsp_laddr.x_net = ns_zeronet; + } + }/* else cached route is ok; do nothing */ + nsp->nsp_lastdst = sns->sns_addr; + if ((nsp->nsp_socket->so_options & SO_DONTROUTE) == 0 && /*XXX*/ + (ro->ro_rt == (struct rtentry *)0 || + ro->ro_rt->rt_ifp == (struct ifnet *)0)) { + /* No route yet, so try to acquire one */ + ro->ro_dst.sa_family = AF_NS; + ro->ro_dst.sa_len = sizeof(ro->ro_dst); + *dst = sns->sns_addr; + dst->x_port = 0; + rtalloc(ro); + } + if (ns_neteqnn(nsp->nsp_laddr.x_net, ns_zeronet)) { + /* + * If route is known or can be allocated now, + * our src addr is taken from the i/f, else punt. 
+ */ + + ia = (struct ns_ifaddr *)0; + /* + * If we found a route, use the address + * corresponding to the outgoing interface + */ + if (ro->ro_rt && (ifp = ro->ro_rt->rt_ifp)) + for (ia = ns_ifaddr; ia; ia = ia->ia_next) + if (ia->ia_ifp == ifp) + break; + if (ia == 0) { + u_short fport = sns->sns_addr.x_port; + sns->sns_addr.x_port = 0; + ia = (struct ns_ifaddr *) + ifa_ifwithdstaddr((struct sockaddr *)sns); + sns->sns_addr.x_port = fport; + if (ia == 0) + ia = ns_iaonnetof(&sns->sns_addr); + if (ia == 0) + ia = ns_ifaddr; + if (ia == 0) + return (EADDRNOTAVAIL); + } + nsp->nsp_laddr.x_net = satons_addr(ia->ia_addr).x_net; + } + if (ns_pcblookup(&sns->sns_addr, nsp->nsp_lport, 0)) + return (EADDRINUSE); + if (ns_nullhost(nsp->nsp_laddr)) { + if (nsp->nsp_lport == 0) + (void) ns_pcbbind(nsp, (struct mbuf *)0); + nsp->nsp_laddr.x_host = ns_thishost; + } + nsp->nsp_faddr = sns->sns_addr; + /* Includes nsp->nsp_fport = sns->sns_port; */ + return (0); +} +/* Clear the foreign address; detach too if the socket has SS_NOFDREF. */ +ns_pcbdisconnect(nsp) + struct nspcb *nsp; +{ + + nsp->nsp_faddr = zerons_addr; + if (nsp->nsp_socket->so_state & SS_NOFDREF) + ns_pcbdetach(nsp); +} +/* Clear so_pcb, sofree() the socket, drop the route, unlink and free nsp. */ +ns_pcbdetach(nsp) + struct nspcb *nsp; +{ + struct socket *so = nsp->nsp_socket; + + so->so_pcb = 0; + sofree(so); + if (nsp->nsp_route.ro_rt) + rtfree(nsp->nsp_route.ro_rt); + remque(nsp); + (void) m_free(dtom(nsp)); +} +/* Store the local address of nsp in nam as a sockaddr_ns. */ +ns_setsockaddr(nsp, nam) + register struct nspcb *nsp; + struct mbuf *nam; +{ + register struct sockaddr_ns *sns = mtod(nam, struct sockaddr_ns *); + + nam->m_len = sizeof (*sns); + sns = mtod(nam, struct sockaddr_ns *); + bzero((caddr_t)sns, sizeof (*sns)); + sns->sns_len = sizeof(*sns); + sns->sns_family = AF_NS; + sns->sns_addr = nsp->nsp_laddr; +} +/* Store the foreign address of nsp in nam as a sockaddr_ns. */ +ns_setpeeraddr(nsp, nam) + register struct nspcb *nsp; + struct mbuf *nam; +{ + register struct sockaddr_ns *sns = mtod(nam, struct sockaddr_ns *); + + nam->m_len = sizeof (*sns); + sns = mtod(nam, struct sockaddr_ns *); + bzero((caddr_t)sns, sizeof (*sns)); + sns->sns_len = sizeof(*sns); + 
sns->sns_family = AF_NS; + sns->sns_addr = nsp->nsp_faddr; +} + +/* + * Pass some notification to all connections of a protocol + * associated with address dst. Call the + * protocol specific routine to handle each connection. + * Also pass an extra parameter via the nspcb. (which may in fact + * be a parameter list!) + */ +ns_pcbnotify(dst, errno, notify, param) + register struct ns_addr *dst; + long param; + int errno, (*notify)(); +{ + register struct nspcb *nsp, *oinp; + int s = splimp(); + + for (nsp = (&nspcb)->nsp_next; nsp != (&nspcb);) { + if (!ns_hosteq(*dst,nsp->nsp_faddr)) { + next: + nsp = nsp->nsp_next; + continue; + } + if (nsp->nsp_socket == 0) + goto next; + if (errno) + nsp->nsp_socket->so_error = errno; + oinp = nsp; + nsp = nsp->nsp_next; /* step on before notify runs on oinp */ + oinp->nsp_notify_param = param; + (*notify)(oinp); + } + splx(s); +} + +#ifdef notdef +/* + * After a routing change, flush old routing + * and allocate a (hopefully) better one. + */ +ns_rtchange(nsp) + struct nspcb *nsp; +{ + if (nsp->nsp_route.ro_rt) { + rtfree(nsp->nsp_route.ro_rt); + nsp->nsp_route.ro_rt = 0; + /* + * A new route can be allocated the next time + * output is attempted. 
+ */ + } + /* SHOULD NOTIFY HIGHER-LEVEL PROTOCOLS */ +} +#endif +/* Return the pcb on lport that matches faddr with fewest wildcards, or 0. */ +struct nspcb * +ns_pcblookup(faddr, lport, wildp) + struct ns_addr *faddr; + u_short lport; +{ + register struct nspcb *nsp, *match = 0; + int matchwild = 3, wildcard; + u_short fport; + + fport = faddr->x_port; + for (nsp = (&nspcb)->nsp_next; nsp != (&nspcb); nsp = nsp->nsp_next) { + if (nsp->nsp_lport != lport) + continue; + wildcard = 0; + if (ns_nullhost(nsp->nsp_faddr)) { + if (!ns_nullhost(*faddr)) + wildcard++; + } else { + if (ns_nullhost(*faddr)) + wildcard++; + else { + if (!ns_hosteq(nsp->nsp_faddr, *faddr)) + continue; + if (nsp->nsp_fport != fport) { + if (nsp->nsp_fport != 0) + continue; + else + wildcard++; + } + } + } + if (wildcard && wildp==0) + continue; /* wildcard matches count only when wildp is set */ + if (wildcard < matchwild) { + match = nsp; + matchwild = wildcard; + if (wildcard == 0) + break; /* exact match, stop looking */ + } + } + return (match); +} diff --git a/sys/netns/ns_pcb.h b/sys/netns/ns_pcb.h new file mode 100644 index 00000000000..68cf744f738 --- /dev/null +++ b/sys/netns/ns_pcb.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 1984, 1985, 1986, 1987, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ns_pcb.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Ns protocol interface control block. 
+ */ +struct nspcb { + struct nspcb *nsp_next; /* doubly linked list */ + struct nspcb *nsp_prev; + struct nspcb *nsp_head; + struct socket *nsp_socket; /* back pointer to socket */ + struct ns_addr nsp_faddr; /* destination address */ + struct ns_addr nsp_laddr; /* socket's address */ + caddr_t nsp_pcb; /* protocol specific stuff */ + struct route nsp_route; /* routing information */ + struct ns_addr nsp_lastdst; /* validate cached route for dg socks*/ + long nsp_notify_param; /* extra info passed via ns_pcbnotify*/ + short nsp_flags; + u_char nsp_dpt; /* default packet type for idp_output*/ + u_char nsp_rpt; /* last received packet type by + idp_input() */ +}; + +/* possible flags */ + +#define NSP_IN_ABORT 0x1 /* calling abort through socket */ +#define NSP_RAWIN 0x2 /* show headers on input */ +#define NSP_RAWOUT 0x4 /* show header on output */ +#define NSP_ALL_PACKETS 0x8 /* Turn off higher proto processing */ + +#define NS_WILDCARD 1 + +#define nsp_lport nsp_laddr.x_port +#define nsp_fport nsp_faddr.x_port + +#define sotonspcb(so) ((struct nspcb *)((so)->so_pcb)) + +/* + * Nominal space allocated to a ns socket. + */ +#define NSSNDQ 2048 +#define NSRCVQ 2048 + + +#ifdef KERNEL +struct nspcb nspcb; /* head of list */ +struct nspcb *ns_pcblookup(); +#endif diff --git a/sys/netns/ns_proto.c b/sys/netns/ns_proto.c new file mode 100644 index 00000000000..fc9f8238c55 --- /dev/null +++ b/sys/netns/ns_proto.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 1984, 1985, 1986, 1987, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ns_proto.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include + +#include + +#include + +/* + * NS protocol family: IDP, ERR, PE, SPP, ROUTE. 
+ */ +int ns_init(); +int idp_input(), idp_output(), idp_ctlinput(), idp_usrreq(); +int idp_raw_usrreq(), idp_ctloutput(); +int spp_input(), spp_ctlinput(); +int spp_usrreq(), spp_usrreq_sp(), spp_ctloutput(); +int spp_init(), spp_fasttimo(), spp_slowtimo(); +extern int raw_usrreq(); + +extern struct domain nsdomain; + +struct protosw nssw[] = { +{ 0, &nsdomain, 0, 0, + 0, idp_output, 0, 0, + 0, + ns_init, 0, 0, 0, +}, +{ SOCK_DGRAM, &nsdomain, 0, PR_ATOMIC|PR_ADDR, + 0, 0, idp_ctlinput, idp_ctloutput, + idp_usrreq, + 0, 0, 0, 0, +}, +{ SOCK_STREAM, &nsdomain, NSPROTO_SPP, PR_CONNREQUIRED|PR_WANTRCVD, + spp_input, 0, spp_ctlinput, spp_ctloutput, + spp_usrreq, + spp_init, spp_fasttimo, spp_slowtimo, 0, +}, +{ SOCK_SEQPACKET,&nsdomain, NSPROTO_SPP, PR_CONNREQUIRED|PR_WANTRCVD|PR_ATOMIC, + spp_input, 0, spp_ctlinput, spp_ctloutput, + spp_usrreq_sp, + 0, 0, 0, 0, +}, +{ SOCK_RAW, &nsdomain, NSPROTO_RAW, PR_ATOMIC|PR_ADDR, + idp_input, idp_output, 0, idp_ctloutput, + idp_raw_usrreq, + 0, 0, 0, 0, +}, +{ SOCK_RAW, &nsdomain, NSPROTO_ERROR, PR_ATOMIC|PR_ADDR, + idp_ctlinput, idp_output, 0, idp_ctloutput, + idp_raw_usrreq, + 0, 0, 0, 0, +}, +}; + +struct domain nsdomain = + { AF_NS, "network systems", 0, 0, 0, + nssw, &nssw[sizeof(nssw)/sizeof(nssw[0])], 0, + rn_inithead, 16, sizeof(struct sockaddr_ns)}; + diff --git a/sys/netns/sp.h b/sys/netns/sp.h new file mode 100644 index 00000000000..b55dac26039 --- /dev/null +++ b/sys/netns/sp.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 1984, 1985, 1986, 1987, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)sp.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Definitions for Xerox NS style sequenced packet protocol + */ + +struct sphdr { + u_char sp_cc; /* connection control */ + u_char sp_dt; /* datastream type */ +#define SP_SP 0x80 /* system packet */ +#define SP_SA 0x40 /* send acknowledgement */ +#define SP_OB 0x20 /* attention (out of band data) */ +#define SP_EM 0x10 /* end of message */ + u_short sp_sid; /* source connection identifier */ + u_short sp_did; /* destination connection identifier */ + u_short sp_seq; /* sequence number */ + u_short sp_ack; /* acknowledge number */ + u_short sp_alo; /* allocation number */ +}; diff --git a/sys/netns/spidp.h b/sys/netns/spidp.h new file mode 100644 index 00000000000..332df5be235 --- /dev/null +++ b/sys/netns/spidp.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 1984, 1985, 1986, 1987, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)spidp.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Definitions for NS(tm) Internet Datagram Protocol + * containing a Sequenced Packet Protocol packet. + */ +struct spidp { + struct idp si_i; + struct sphdr si_s; +}; +struct spidp_q { /* forward and backward links only */ + struct spidp_q *si_next; + struct spidp_q *si_prev; +}; +#define SI(x) ((struct spidp *)(x)) /* argument parenthesized so a whole expression is cast */ +#define si_sum si_i.idp_sum +#define si_len si_i.idp_len +#define si_tc si_i.idp_tc +#define si_pt si_i.idp_pt +#define si_dna si_i.idp_dna +#define si_sna si_i.idp_sna +#define si_sport si_i.idp_sna.x_port +#define si_cc si_s.sp_cc +#define si_dt si_s.sp_dt +#define si_sid si_s.sp_sid +#define si_did si_s.sp_did +#define si_seq si_s.sp_seq +#define si_ack si_s.sp_ack +#define si_alo si_s.sp_alo diff --git a/sys/netns/spp_debug.c b/sys/netns/spp_debug.c new file mode 100644 index 00000000000..eaa1d023f87 --- /dev/null +++ b/sys/netns/spp_debug.c @@ -0,0 +1,170 @@ +/* + * Copyright (c) 1984, 1985, 1986, 1987, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)spp_debug.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#define SPPTIMERS +#include +#include +#define SANAMES +#include + +int sppconsdebug = 0; +/* + * spp debug routines: record an event in spp_debug[], print if sppconsdebug + */ +spp_trace(act, ostate, sp, si, req) + short act; + u_char ostate; + struct sppcb *sp; + struct spidp *si; + int req; +{ +#ifdef INET +#ifdef TCPDEBUG /* body is empty unless INET and TCPDEBUG are both defined */ + u_short seq, ack, len, alo; + unsigned long iptime(); + int flags; + struct spp_debug *sd = &spp_debug[spp_debx++]; + extern char *prurequests[]; + extern char *sanames[]; + extern char *tcpstates[]; + extern char *spptimers[]; + + if (spp_debx == SPP_NDEBUG) + spp_debx = 0; /* wrap around the trace array */ + sd->sd_time = iptime(); + sd->sd_act = act; + sd->sd_ostate = ostate; + sd->sd_cb = (caddr_t)sp; + if (sp) + sd->sd_sp = *sp; + else + bzero((caddr_t)&sd->sd_sp, sizeof (*sp)); + if (si) + sd->sd_si = *si; + else + bzero((caddr_t)&sd->sd_si, sizeof (*si)); + sd->sd_req = req; + if (sppconsdebug == 0) + return; /* recorded only, nothing printed */ + if (ostate >= TCP_NSTATES) ostate = 0; /* clamp before indexing tcpstates[] */ + if (act >= SA_DROP) act = SA_DROP; /* clamp before indexing sanames[] */ + if (sp) + printf("%x %s:", sp, tcpstates[ostate]); + else + printf("???????? 
"); + printf("%s ", sanames[act]); + switch (act) { + + case SA_RESPOND: + case SA_INPUT: + case SA_OUTPUT: + case SA_DROP: + if (si == 0) + break; + seq = si->si_seq; + ack = si->si_ack; + alo = si->si_alo; + len = si->si_len; + if (act == SA_OUTPUT) { + seq = ntohs(seq); + ack = ntohs(ack); + alo = ntohs(alo); + len = ntohs(len); + } +#ifndef lint +#define p1(f) { printf("%s = %x, ", "f", f); } + p1(seq); p1(ack); p1(alo); p1(len); +#endif + flags = si->si_cc; + if (flags) { + char *cp = "<"; +#ifndef lint +#define pf(f) { if (flags&SP_/**/f) { printf("%s%s", cp, "f"); cp = ","; } } + pf(SP); pf(SA); pf(OB); pf(EM); +#else + cp = cp; +#endif + printf(">"); + } +#ifndef lint +#define p2(f) { printf("%s = %x, ", "f", si->si_/**/f); } + p2(sid);p2(did);p2(dt);p2(pt); +#endif + ns_printhost(&si->si_sna); + ns_printhost(&si->si_dna); + + if (act==SA_RESPOND) { + printf("idp_len = %x, ", + ((struct idp *)si)->idp_len); + } + break; + + case SA_USER: + printf("%s", prurequests[req&0xff]); + if ((req & 0xff) == PRU_SLOWTIMO) + printf("<%s>", spptimers[req>>8]); + break; + } + if (sp) + printf(" -> %s", tcpstates[sp->s_state]); + /* print out internal state of sp !?! */ + printf("\n"); + if (sp == 0) + return; +#ifndef lint +#define p3(f) { printf("%s = %x, ", "f", sp->s_/**/f); } + printf("\t"); p3(rack);p3(ralo);p3(smax);p3(flags); printf("\n"); +#endif +#endif +#endif +} diff --git a/sys/netns/spp_debug.h b/sys/netns/spp_debug.h new file mode 100644 index 00000000000..8dfe2422069 --- /dev/null +++ b/sys/netns/spp_debug.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 1984, 1985, 1986, 1987, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)spp_debug.h 8.1 (Berkeley) 6/10/93 + */ + +struct spp_debug { + u_long sd_time; + short sd_act; + short sd_ostate; + caddr_t sd_cb; + short sd_req; + struct spidp sd_si; + struct sppcb sd_sp; +}; + +#define SA_INPUT 0 +#define SA_OUTPUT 1 +#define SA_USER 2 +#define SA_RESPOND 3 +#define SA_DROP 4 + +#ifdef SANAMES +char *sanames[] = + { "input", "output", "user", "respond", "drop" }; +#endif + +#define SPP_NDEBUG 100 +struct spp_debug spp_debug[SPP_NDEBUG]; +int spp_debx; diff --git a/sys/netns/spp_timer.h b/sys/netns/spp_timer.h new file mode 100644 index 00000000000..f84e3282a23 --- /dev/null +++ b/sys/netns/spp_timer.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)spp_timer.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Definitions of the SPP timers. These timers are counted + * down PR_SLOWHZ times a second. + */ +#define SPPT_NTIMERS 4 + +#define SPPT_REXMT 0 /* retransmit */ +#define SPPT_PERSIST 1 /* retransmit persistence */ +#define SPPT_KEEP 2 /* keep alive */ +#define SPPT_2MSL 3 /* 2*msl quiet time timer */ + +/* + * The SPPT_REXMT timer is used to force retransmissions. + * The SPP has the SPPT_REXMT timer set whenever segments + * have been sent for which ACKs are expected but not yet + * received. If an ACK is received which advances tp->snd_una, + * then the retransmit timer is cleared (if there are no more + * outstanding segments) or reset to the base value (if there + * are more ACKs expected). Whenever the retransmit timer goes off, + * we retransmit one unacknowledged segment, and do a backoff + * on the retransmit timer. + * + * The SPPT_PERSIST timer is used to keep window size information + * flowing even if the window goes shut. If all previous transmissions + * have been acknowledged (so that there are no retransmissions in progress), + * and the window is too small to bother sending anything, then we start + * the SPPT_PERSIST timer. When it expires, if the window is nonzero, + * we go to transmit state. Otherwise, at intervals send a single byte + * into the peer's window to force him to update our window information. 
+ * We do this at most as often as SPPTV_PERSMIN time intervals, + * but no more frequently than the current estimate of round-trip + * packet time. The SPPT_PERSIST timer is cleared whenever we receive + * a window update from the peer. + * + * The SPPT_KEEP timer is used to keep connections alive. If a + * connection is idle (no segments received) for SPPTV_KEEP amount of time, + * but not yet established, then we drop the connection. If the connection + * is established, then we force the peer to send us a segment by sending: + * <SEQ=SND.UNA-1><ACK=RCV.NXT><CTL=ACK> (restored -- confirm) + * This segment is (deliberately) outside the window, and should elicit + * an ack segment in response from the peer. If, despite the SPPT_KEEP + * initiated segments we cannot elicit a response from a peer in SPPTV_MAXIDLE + * amount of time, then we drop the connection. + */ + +#define SPP_TTL 30 /* default time to live for SPP segs */ +/* + * Time constants. + */ +#define SPPTV_MSL ( 15*PR_SLOWHZ) /* max seg lifetime */ +#define SPPTV_SRTTBASE 0 /* base roundtrip time; + if 0, no idea yet */ +#define SPPTV_SRTTDFLT ( 3*PR_SLOWHZ) /* assumed RTT if no info */ + +#define SPPTV_PERSMIN ( 5*PR_SLOWHZ) /* retransmit persistence */ +#define SPPTV_PERSMAX ( 60*PR_SLOWHZ) /* maximum persist interval */ + +#define SPPTV_KEEP ( 75*PR_SLOWHZ) /* keep alive - 75 secs */ +#define SPPTV_MAXIDLE ( 8*SPPTV_KEEP) /* maximum allowable idle + time before drop conn */ + +#define SPPTV_MIN ( 1*PR_SLOWHZ) /* minimum allowable value */ +#define SPPTV_REXMTMAX ( 64*PR_SLOWHZ) /* max allowable REXMT value */ + +#define SPP_LINGERTIME 120 /* linger at most 2 minutes */ + +#define SPP_MAXRXTSHIFT 12 /* maximum retransmits */ + +#ifdef SPPTIMERS +char *spptimers[] = + { "REXMT", "PERSIST", "KEEP", "2MSL" }; +#endif + +/* + * Force a time value to be in a certain range. 
+ */ +#define SPPT_RANGESET(tv, value, tvmin, tvmax) { \ + (tv) = (value); \ + if ((tv) < (tvmin)) \ + (tv) = (tvmin); \ + else if ((tv) > (tvmax)) \ + (tv) = (tvmax); \ +} + +#ifdef KERNEL +extern int spp_backoff[]; +#endif diff --git a/sys/netns/spp_usrreq.c b/sys/netns/spp_usrreq.c new file mode 100644 index 00000000000..062bbec5fab --- /dev/null +++ b/sys/netns/spp_usrreq.c @@ -0,0 +1,1804 @@ +/* + * Copyright (c) 1984, 1985, 1986, 1987, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)spp_usrreq.c 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * SP protocol implementation. + */ +spp_init() +{ + + spp_iss = 1; /* WRONG !! should fish it out of TODR */ +} +struct spidp spp_savesi; +int traceallspps = 0; +extern int sppconsdebug; +int spp_hardnosed; +int spp_use_delack = 0; +u_short spp_newchecks[50]; + +/*ARGSUSED*/ +spp_input(m, nsp) + register struct mbuf *m; + register struct nspcb *nsp; +{ + register struct sppcb *cb; + register struct spidp *si = mtod(m, struct spidp *); + register struct socket *so; + short ostate; + int dropsocket = 0; + + + sppstat.spps_rcvtotal++; + if (nsp == 0) { + panic("No nspcb in spp_input\n"); + return; + } + + cb = nstosppcb(nsp); + if (cb == 0) goto bad; + + if (m->m_len < sizeof(*si)) { + if ((m = m_pullup(m, sizeof(*si))) == 0) { + sppstat.spps_rcvshort++; + return; + } + si = mtod(m, struct spidp *); + } + si->si_seq = ntohs(si->si_seq); + si->si_ack = ntohs(si->si_ack); + si->si_alo = ntohs(si->si_alo); + + so = nsp->nsp_socket; + if (so->so_options & SO_DEBUG || traceallspps) { + ostate = cb->s_state; + spp_savesi = *si; + } + if (so->so_options & SO_ACCEPTCONN) { + struct sppcb *ocb = cb; + + so = sonewconn(so, 0); + if (so == 0) { + goto drop; + } + /* + * This is ugly, 
but .... + * + * Mark socket as temporary until we're + * committed to keeping it. The code at + * ``drop'' and ``dropwithreset'' check the + * flag dropsocket to see if the temporary + * socket created here should be discarded. + * We mark the socket as discardable until + * we're committed to it below in TCPS_LISTEN. + */ + dropsocket++; + nsp = (struct nspcb *)so->so_pcb; + nsp->nsp_laddr = si->si_dna; + cb = nstosppcb(nsp); + cb->s_mtu = ocb->s_mtu; /* preserve sockopts */ + cb->s_flags = ocb->s_flags; /* preserve sockopts */ + cb->s_flags2 = ocb->s_flags2; /* preserve sockopts */ + cb->s_state = TCPS_LISTEN; + } + + /* + * Packet received on connection. + * reset idle time and keep-alive timer; + */ + cb->s_idle = 0; + cb->s_timer[SPPT_KEEP] = SPPTV_KEEP; + + switch (cb->s_state) { + + case TCPS_LISTEN:{ + struct mbuf *am; + register struct sockaddr_ns *sns; + struct ns_addr laddr; + + /* + * If somebody here was carying on a conversation + * and went away, and his pen pal thinks he can + * still talk, we get the misdirected packet. 
+ */ + if (spp_hardnosed && (si->si_did != 0 || si->si_seq != 0)) { + spp_istat.gonawy++; + goto dropwithreset; + } + am = m_get(M_DONTWAIT, MT_SONAME); + if (am == NULL) + goto drop; + am->m_len = sizeof (struct sockaddr_ns); + sns = mtod(am, struct sockaddr_ns *); + sns->sns_len = sizeof(*sns); + sns->sns_family = AF_NS; + sns->sns_addr = si->si_sna; + laddr = nsp->nsp_laddr; + if (ns_nullhost(laddr)) + nsp->nsp_laddr = si->si_dna; + if (ns_pcbconnect(nsp, am)) { + nsp->nsp_laddr = laddr; + (void) m_free(am); + spp_istat.noconn++; + goto drop; + } + (void) m_free(am); + spp_template(cb); + dropsocket = 0; /* committed to socket */ + cb->s_did = si->si_sid; + cb->s_rack = si->si_ack; + cb->s_ralo = si->si_alo; +#define THREEWAYSHAKE +#ifdef THREEWAYSHAKE + cb->s_state = TCPS_SYN_RECEIVED; + cb->s_force = 1 + SPPT_KEEP; + sppstat.spps_accepts++; + cb->s_timer[SPPT_KEEP] = SPPTV_KEEP; + } + break; + /* + * This state means that we have heard a response + * to our acceptance of their connection + * It is probably logically unnecessary in this + * implementation. + */ + case TCPS_SYN_RECEIVED: { + if (si->si_did!=cb->s_sid) { + spp_istat.wrncon++; + goto drop; + } +#endif + nsp->nsp_fport = si->si_sport; + cb->s_timer[SPPT_REXMT] = 0; + cb->s_timer[SPPT_KEEP] = SPPTV_KEEP; + soisconnected(so); + cb->s_state = TCPS_ESTABLISHED; + sppstat.spps_accepts++; + } + break; + + /* + * This state means that we have gotten a response + * to our attempt to establish a connection. + * We fill in the data from the other side, + * telling us which port to respond to, instead of the well- + * known one we might have sent to in the first place. + * We also require that this is a response to our + * connection id. 
+ */ + case TCPS_SYN_SENT: + if (si->si_did!=cb->s_sid) { + spp_istat.notme++; + goto drop; + } + sppstat.spps_connects++; + cb->s_did = si->si_sid; + cb->s_rack = si->si_ack; + cb->s_ralo = si->si_alo; + cb->s_dport = nsp->nsp_fport = si->si_sport; + cb->s_timer[SPPT_REXMT] = 0; + cb->s_flags |= SF_ACKNOW; + soisconnected(so); + cb->s_state = TCPS_ESTABLISHED; + /* Use roundtrip time of connection request for initial rtt */ + if (cb->s_rtt) { + cb->s_srtt = cb->s_rtt << 3; + cb->s_rttvar = cb->s_rtt << 1; + SPPT_RANGESET(cb->s_rxtcur, + ((cb->s_srtt >> 2) + cb->s_rttvar) >> 1, + SPPTV_MIN, SPPTV_REXMTMAX); + cb->s_rtt = 0; + } + } + if (so->so_options & SO_DEBUG || traceallspps) + spp_trace(SA_INPUT, (u_char)ostate, cb, &spp_savesi, 0); + + m->m_len -= sizeof (struct idp); + m->m_pkthdr.len -= sizeof (struct idp); + m->m_data += sizeof (struct idp); + + if (spp_reass(cb, si)) { + (void) m_freem(m); + } + if (cb->s_force || (cb->s_flags & (SF_ACKNOW|SF_WIN|SF_RXT))) + (void) spp_output(cb, (struct mbuf *)0); + cb->s_flags &= ~(SF_WIN|SF_RXT); + return; + +dropwithreset: + if (dropsocket) + (void) soabort(so); + si->si_seq = ntohs(si->si_seq); + si->si_ack = ntohs(si->si_ack); + si->si_alo = ntohs(si->si_alo); + ns_error(dtom(si), NS_ERR_NOSOCK, 0); + if (cb->s_nspcb->nsp_socket->so_options & SO_DEBUG || traceallspps) + spp_trace(SA_DROP, (u_char)ostate, cb, &spp_savesi, 0); + return; + +drop: +bad: + if (cb == 0 || cb->s_nspcb->nsp_socket->so_options & SO_DEBUG || + traceallspps) + spp_trace(SA_DROP, (u_char)ostate, cb, &spp_savesi, 0); + m_freem(m); +} + +int spprexmtthresh = 3; + +/* + * This is structurally similar to the tcp reassembly routine + * but its function is somewhat different: It merely queues + * packets up, and suppresses duplicates. 
+ */ +spp_reass(cb, si) +register struct sppcb *cb; +register struct spidp *si; +{ + register struct spidp_q *q; + register struct mbuf *m; + register struct socket *so = cb->s_nspcb->nsp_socket; + char packetp = cb->s_flags & SF_HI; + int incr; + char wakeup = 0; + + if (si == SI(0)) + goto present; + /* + * Update our news from them. + */ + if (si->si_cc & SP_SA) + cb->s_flags |= (spp_use_delack ? SF_DELACK : SF_ACKNOW); + if (SSEQ_GT(si->si_alo, cb->s_ralo)) + cb->s_flags |= SF_WIN; + if (SSEQ_LEQ(si->si_ack, cb->s_rack)) { + if ((si->si_cc & SP_SP) && cb->s_rack != (cb->s_smax + 1)) { + sppstat.spps_rcvdupack++; + /* + * If this is a completely duplicate ack + * and other conditions hold, we assume + * a packet has been dropped and retransmit + * it exactly as in tcp_input(). + */ + if (si->si_ack != cb->s_rack || + si->si_alo != cb->s_ralo) + cb->s_dupacks = 0; + else if (++cb->s_dupacks == spprexmtthresh) { + u_short onxt = cb->s_snxt; + int cwnd = cb->s_cwnd; + + cb->s_snxt = si->si_ack; + cb->s_cwnd = CUNIT; + cb->s_force = 1 + SPPT_REXMT; + (void) spp_output(cb, (struct mbuf *)0); + cb->s_timer[SPPT_REXMT] = cb->s_rxtcur; + cb->s_rtt = 0; + if (cwnd >= 4 * CUNIT) + cb->s_cwnd = cwnd / 2; + if (SSEQ_GT(onxt, cb->s_snxt)) + cb->s_snxt = onxt; + return (1); + } + } else + cb->s_dupacks = 0; + goto update_window; + } + cb->s_dupacks = 0; + /* + * If our correspondent acknowledges data we haven't sent + * TCP would drop the packet after acking. We'll be a little + * more permissive + */ + if (SSEQ_GT(si->si_ack, (cb->s_smax + 1))) { + sppstat.spps_rcvacktoomuch++; + si->si_ack = cb->s_smax + 1; + } + sppstat.spps_rcvackpack++; + /* + * If transmit timer is running and timed sequence + * number was acked, update smoothed round trip time. 
+ * See discussion of algorithm in tcp_input.c + */ + if (cb->s_rtt && SSEQ_GT(si->si_ack, cb->s_rtseq)) { + sppstat.spps_rttupdated++; + if (cb->s_srtt != 0) { + register short delta; + delta = cb->s_rtt - (cb->s_srtt >> 3); + if ((cb->s_srtt += delta) <= 0) + cb->s_srtt = 1; + if (delta < 0) + delta = -delta; + delta -= (cb->s_rttvar >> 2); + if ((cb->s_rttvar += delta) <= 0) + cb->s_rttvar = 1; + } else { + /* + * No rtt measurement yet + */ + cb->s_srtt = cb->s_rtt << 3; + cb->s_rttvar = cb->s_rtt << 1; + } + cb->s_rtt = 0; + cb->s_rxtshift = 0; + SPPT_RANGESET(cb->s_rxtcur, + ((cb->s_srtt >> 2) + cb->s_rttvar) >> 1, + SPPTV_MIN, SPPTV_REXMTMAX); + } + /* + * If all outstanding data is acked, stop retransmit + * timer and remember to restart (more output or persist). + * If there is more data to be acked, restart retransmit + * timer, using current (possibly backed-off) value; + */ + if (si->si_ack == cb->s_smax + 1) { + cb->s_timer[SPPT_REXMT] = 0; + cb->s_flags |= SF_RXT; + } else if (cb->s_timer[SPPT_PERSIST] == 0) + cb->s_timer[SPPT_REXMT] = cb->s_rxtcur; + /* + * When new data is acked, open the congestion window. + * If the window gives us less than ssthresh packets + * in flight, open exponentially (maxseg at a time). + * Otherwise open linearly (maxseg^2 / cwnd at a time). + */ + incr = CUNIT; + if (cb->s_cwnd > cb->s_ssthresh) + incr = max(incr * incr / cb->s_cwnd, 1); + cb->s_cwnd = min(cb->s_cwnd + incr, cb->s_cwmx); + /* + * Trim Acked data from output queue. 
+ */ + while ((m = so->so_snd.sb_mb) != NULL) { + if (SSEQ_LT((mtod(m, struct spidp *))->si_seq, si->si_ack)) + sbdroprecord(&so->so_snd); + else + break; + } + sowwakeup(so); + cb->s_rack = si->si_ack; +update_window: + if (SSEQ_LT(cb->s_snxt, cb->s_rack)) + cb->s_snxt = cb->s_rack; + if (SSEQ_LT(cb->s_swl1, si->si_seq) || cb->s_swl1 == si->si_seq && + (SSEQ_LT(cb->s_swl2, si->si_ack) || + cb->s_swl2 == si->si_ack && SSEQ_LT(cb->s_ralo, si->si_alo))) { + /* keep track of pure window updates */ + if ((si->si_cc & SP_SP) && cb->s_swl2 == si->si_ack + && SSEQ_LT(cb->s_ralo, si->si_alo)) { + sppstat.spps_rcvwinupd++; + sppstat.spps_rcvdupack--; + } + cb->s_ralo = si->si_alo; + cb->s_swl1 = si->si_seq; + cb->s_swl2 = si->si_ack; + cb->s_swnd = (1 + si->si_alo - si->si_ack); + if (cb->s_swnd > cb->s_smxw) + cb->s_smxw = cb->s_swnd; + cb->s_flags |= SF_WIN; + } + /* + * If this packet number is higher than that which + * we have allocated refuse it, unless urgent + */ + if (SSEQ_GT(si->si_seq, cb->s_alo)) { + if (si->si_cc & SP_SP) { + sppstat.spps_rcvwinprobe++; + return (1); + } else + sppstat.spps_rcvpackafterwin++; + if (si->si_cc & SP_OB) { + if (SSEQ_GT(si->si_seq, cb->s_alo + 60)) { + ns_error(dtom(si), NS_ERR_FULLUP, 0); + return (0); + } /* else queue this packet; */ + } else { + /*register struct socket *so = cb->s_nspcb->nsp_socket; + if (so->so_state && SS_NOFDREF) { + ns_error(dtom(si), NS_ERR_NOSOCK, 0); + (void)spp_close(cb); + } else + would crash system*/ + spp_istat.notyet++; + ns_error(dtom(si), NS_ERR_FULLUP, 0); + return (0); + } + } + /* + * If this is a system packet, we don't need to + * queue it up, and won't update acknowledge # + */ + if (si->si_cc & SP_SP) { + return (1); + } + /* + * We have already seen this packet, so drop. 
+ */ + if (SSEQ_LT(si->si_seq, cb->s_ack)) { + spp_istat.bdreas++; + sppstat.spps_rcvduppack++; + if (si->si_seq == cb->s_ack - 1) + spp_istat.lstdup++; + return (1); + } + /* + * Loop through all packets queued up to insert in + * appropriate sequence. + */ + for (q = cb->s_q.si_next; q!=&cb->s_q; q = q->si_next) { + if (si->si_seq == SI(q)->si_seq) { + sppstat.spps_rcvduppack++; + return (1); + } + if (SSEQ_LT(si->si_seq, SI(q)->si_seq)) { + sppstat.spps_rcvoopack++; + break; + } + } + insque(si, q->si_prev); + /* + * If this packet is urgent, inform process + */ + if (si->si_cc & SP_OB) { + cb->s_iobc = ((char *)si)[1 + sizeof(*si)]; + sohasoutofband(so); + cb->s_oobflags |= SF_IOOB; + } +present: +#define SPINC sizeof(struct sphdr) + /* + * Loop through all packets queued up to update acknowledge + * number, and present all acknowledged data to user; + * If in packet interface mode, show packet headers. + */ + for (q = cb->s_q.si_next; q!=&cb->s_q; q = q->si_next) { + if (SI(q)->si_seq == cb->s_ack) { + cb->s_ack++; + m = dtom(q); + if (SI(q)->si_cc & SP_OB) { + cb->s_oobflags &= ~SF_IOOB; + if (so->so_rcv.sb_cc) + so->so_oobmark = so->so_rcv.sb_cc; + else + so->so_state |= SS_RCVATMARK; + } + q = q->si_prev; + remque(q->si_next); + wakeup = 1; + sppstat.spps_rcvpack++; +#ifdef SF_NEWCALL + if (cb->s_flags2 & SF_NEWCALL) { + struct sphdr *sp = mtod(m, struct sphdr *); + u_char dt = sp->sp_dt; + spp_newchecks[4]++; + if (dt != cb->s_rhdr.sp_dt) { + struct mbuf *mm = + m_getclr(M_DONTWAIT, MT_CONTROL); + spp_newchecks[0]++; + if (mm != NULL) { + u_short *s = + mtod(mm, u_short *); + cb->s_rhdr.sp_dt = dt; + mm->m_len = 5; /*XXX*/ + s[0] = 5; + s[1] = 1; + *(u_char *)(&s[2]) = dt; + sbappend(&so->so_rcv, mm); + } + } + if (sp->sp_cc & SP_OB) { + MCHTYPE(m, MT_OOBDATA); + spp_newchecks[1]++; + so->so_oobmark = 0; + so->so_state &= ~SS_RCVATMARK; + } + if (packetp == 0) { + m->m_data += SPINC; + m->m_len -= SPINC; + m->m_pkthdr.len -= SPINC; + } + if ((sp->sp_cc & 
SP_EM) || packetp) { + sbappendrecord(&so->so_rcv, m); + spp_newchecks[9]++; + } else + sbappend(&so->so_rcv, m); + } else +#endif + if (packetp) { + sbappendrecord(&so->so_rcv, m); + } else { + cb->s_rhdr = *mtod(m, struct sphdr *); + m->m_data += SPINC; + m->m_len -= SPINC; + m->m_pkthdr.len -= SPINC; + sbappend(&so->so_rcv, m); + } + } else + break; + } + if (wakeup) sorwakeup(so); + return (0); +} + +spp_ctlinput(cmd, arg) + int cmd; + caddr_t arg; +{ + struct ns_addr *na; + extern u_char nsctlerrmap[]; + extern spp_abort(), spp_quench(); + extern struct nspcb *idp_drop(); + struct ns_errp *errp; + struct nspcb *nsp; + struct sockaddr_ns *sns; + int type; + + if (cmd < 0 || cmd > PRC_NCMDS) + return; + type = NS_ERR_UNREACH_HOST; + + switch (cmd) { + + case PRC_ROUTEDEAD: + return; + + case PRC_IFDOWN: + case PRC_HOSTDEAD: + case PRC_HOSTUNREACH: + sns = (struct sockaddr_ns *)arg; + if (sns->sns_family != AF_NS) + return; + na = &sns->sns_addr; + break; + + default: + errp = (struct ns_errp *)arg; + na = &errp->ns_err_idp.idp_dna; + type = errp->ns_err_num; + type = ntohs((u_short)type); + } + switch (type) { + + case NS_ERR_UNREACH_HOST: + ns_pcbnotify(na, (int)nsctlerrmap[cmd], spp_abort, (long) 0); + break; + + case NS_ERR_TOO_BIG: + case NS_ERR_NOSOCK: + nsp = ns_pcblookup(na, errp->ns_err_idp.idp_sna.x_port, + NS_WILDCARD); + if (nsp) { + if(nsp->nsp_pcb) + (void) spp_drop((struct sppcb *)nsp->nsp_pcb, + (int)nsctlerrmap[cmd]); + else + (void) idp_drop(nsp, (int)nsctlerrmap[cmd]); + } + break; + + case NS_ERR_FULLUP: + ns_pcbnotify(na, 0, spp_quench, (long) 0); + } +} +/* + * When a source quench is received, close congestion window + * to one packet. We will gradually open it again as we proceed. 
+ */ +spp_quench(nsp) + struct nspcb *nsp; +{ + struct sppcb *cb = nstosppcb(nsp); + + if (cb) + cb->s_cwnd = CUNIT; +} + +#ifdef notdef +int +spp_fixmtu(nsp) +register struct nspcb *nsp; +{ + register struct sppcb *cb = (struct sppcb *)(nsp->nsp_pcb); + register struct mbuf *m; + register struct spidp *si; + struct ns_errp *ep; + struct sockbuf *sb; + int badseq, len; + struct mbuf *firstbad, *m0; + + if (cb) { + /* + * The notification that we have sent + * too much is bad news -- we will + * have to go through queued up so far + * splitting ones which are too big and + * reassigning sequence numbers and checksums. + * we should then retransmit all packets from + * one above the offending packet to the last one + * we had sent (or our allocation) + * then the offending one so that the any queued + * data at our destination will be discarded. + */ + ep = (struct ns_errp *)nsp->nsp_notify_param; + sb = &nsp->nsp_socket->so_snd; + cb->s_mtu = ep->ns_err_param; + badseq = SI(&ep->ns_err_idp)->si_seq; + for (m = sb->sb_mb; m; m = m->m_act) { + si = mtod(m, struct spidp *); + if (si->si_seq == badseq) + break; + } + if (m == 0) return; + firstbad = m; + /*for (;;) {*/ + /* calculate length */ + for (m0 = m, len = 0; m ; m = m->m_next) + len += m->m_len; + if (len > cb->s_mtu) { + } + /* FINISH THIS + } */ + } +} +#endif + +spp_output(cb, m0) + register struct sppcb *cb; + struct mbuf *m0; +{ + struct socket *so = cb->s_nspcb->nsp_socket; + register struct mbuf *m; + register struct spidp *si = (struct spidp *) 0; + register struct sockbuf *sb = &so->so_snd; + int len = 0, win, rcv_win; + short span, off, recordp = 0; + u_short alo; + int error = 0, sendalot; +#ifdef notdef + int idle; +#endif + struct mbuf *mprev; + extern int idpcksum; + + if (m0) { + int mtu = cb->s_mtu; + int datalen; + /* + * Make sure that packet isn't too big. 
+ */ + for (m = m0; m ; m = m->m_next) { + mprev = m; + len += m->m_len; + if (m->m_flags & M_EOR) + recordp = 1; + } + datalen = (cb->s_flags & SF_HO) ? + len - sizeof (struct sphdr) : len; + if (datalen > mtu) { + if (cb->s_flags & SF_PI) { + m_freem(m0); + return (EMSGSIZE); + } else { + int oldEM = cb->s_cc & SP_EM; + + cb->s_cc &= ~SP_EM; + while (len > mtu) { + /* + * Here we are only being called + * from usrreq(), so it is OK to + * block. + */ + m = m_copym(m0, 0, mtu, M_WAIT); + if (cb->s_flags & SF_NEWCALL) { + struct mbuf *mm = m; + spp_newchecks[7]++; + while (mm) { + mm->m_flags &= ~M_EOR; + mm = mm->m_next; + } + } + error = spp_output(cb, m); + if (error) { + cb->s_cc |= oldEM; + m_freem(m0); + return(error); + } + m_adj(m0, mtu); + len -= mtu; + } + cb->s_cc |= oldEM; + } + } + /* + * Force length even, by adding a "garbage byte" if + * necessary. + */ + if (len & 1) { + m = mprev; + if (M_TRAILINGSPACE(m) >= 1) + m->m_len++; + else { + struct mbuf *m1 = m_get(M_DONTWAIT, MT_DATA); + + if (m1 == 0) { + m_freem(m0); + return (ENOBUFS); + } + m1->m_len = 1; + *(mtod(m1, u_char *)) = 0; + m->m_next = m1; + } + } + m = m_gethdr(M_DONTWAIT, MT_HEADER); + if (m == 0) { + m_freem(m0); + return (ENOBUFS); + } + /* + * Fill in mbuf with extended SP header + * and addresses and length put into network format. 
+ */ + MH_ALIGN(m, sizeof (struct spidp)); + m->m_len = sizeof (struct spidp); + m->m_next = m0; + si = mtod(m, struct spidp *); + si->si_i = *cb->s_idp; + si->si_s = cb->s_shdr; + if ((cb->s_flags & SF_PI) && (cb->s_flags & SF_HO)) { + register struct sphdr *sh; + if (m0->m_len < sizeof (*sh)) { + if((m0 = m_pullup(m0, sizeof(*sh))) == NULL) { + (void) m_free(m); + m_freem(m0); + return (EINVAL); + } + m->m_next = m0; + } + sh = mtod(m0, struct sphdr *); + si->si_dt = sh->sp_dt; + si->si_cc |= sh->sp_cc & SP_EM; + m0->m_len -= sizeof (*sh); + m0->m_data += sizeof (*sh); + len -= sizeof (*sh); + } + len += sizeof(*si); + if ((cb->s_flags2 & SF_NEWCALL) && recordp) { + si->si_cc |= SP_EM; + spp_newchecks[8]++; + } + if (cb->s_oobflags & SF_SOOB) { + /* + * Per jqj@cornell: + * make sure OB packets convey exactly 1 byte. + * If the packet is 1 byte or larger, we + * have already guaranted there to be at least + * one garbage byte for the checksum, and + * extra bytes shouldn't hurt! + */ + if (len > sizeof(*si)) { + si->si_cc |= SP_OB; + len = (1 + sizeof(*si)); + } + } + si->si_len = htons((u_short)len); + m->m_pkthdr.len = ((len - 1) | 1) + 1; + /* + * queue stuff up for output + */ + sbappendrecord(sb, m); + cb->s_seq++; + } +#ifdef notdef + idle = (cb->s_smax == (cb->s_rack - 1)); +#endif +again: + sendalot = 0; + off = cb->s_snxt - cb->s_rack; + win = min(cb->s_swnd, (cb->s_cwnd/CUNIT)); + + /* + * If in persist timeout with window of 0, send a probe. + * Otherwise, if window is small but nonzero + * and timer expired, send what we can and go into + * transmit state. + */ + if (cb->s_force == 1 + SPPT_PERSIST) { + if (win != 0) { + cb->s_timer[SPPT_PERSIST] = 0; + cb->s_rxtshift = 0; + } + } + span = cb->s_seq - cb->s_rack; + len = min(span, win) - off; + + if (len < 0) { + /* + * Window shrank after we went into it. + * If window shrank to 0, cancel pending + * restransmission and pull s_snxt back + * to (closed) window. We will enter persist + * state below. 
If the widndow didn't close completely, + * just wait for an ACK. + */ + len = 0; + if (win == 0) { + cb->s_timer[SPPT_REXMT] = 0; + cb->s_snxt = cb->s_rack; + } + } + if (len > 1) + sendalot = 1; + rcv_win = sbspace(&so->so_rcv); + + /* + * Send if we owe peer an ACK. + */ + if (cb->s_oobflags & SF_SOOB) { + /* + * must transmit this out of band packet + */ + cb->s_oobflags &= ~ SF_SOOB; + sendalot = 1; + sppstat.spps_sndurg++; + goto found; + } + if (cb->s_flags & SF_ACKNOW) + goto send; + if (cb->s_state < TCPS_ESTABLISHED) + goto send; + /* + * Silly window can't happen in spp. + * Code from tcp deleted. + */ + if (len) + goto send; + /* + * Compare available window to amount of window + * known to peer (as advertised window less + * next expected input.) If the difference is at least two + * packets or at least 35% of the mximum possible window, + * then want to send a window update to peer. + */ + if (rcv_win > 0) { + u_short delta = 1 + cb->s_alo - cb->s_ack; + int adv = rcv_win - (delta * cb->s_mtu); + + if ((so->so_rcv.sb_cc == 0 && adv >= (2 * cb->s_mtu)) || + (100 * adv / so->so_rcv.sb_hiwat >= 35)) { + sppstat.spps_sndwinup++; + cb->s_flags |= SF_ACKNOW; + goto send; + } + + } + /* + * Many comments from tcp_output.c are appropriate here + * including . . . + * If send window is too small, there is data to transmit, and no + * retransmit or persist is pending, then go to persist state. + * If nothing happens soon, send when timer expires: + * if window is nonzero, transmit what we can, + * otherwise send a probe. + */ + if (so->so_snd.sb_cc && cb->s_timer[SPPT_REXMT] == 0 && + cb->s_timer[SPPT_PERSIST] == 0) { + cb->s_rxtshift = 0; + spp_setpersist(cb); + } + /* + * No reason to send a packet, just return. + */ + cb->s_outx = 1; + return (0); + +send: + /* + * Find requested packet. 
+ */ + si = 0; + if (len > 0) { + cb->s_want = cb->s_snxt; + for (m = sb->sb_mb; m; m = m->m_act) { + si = mtod(m, struct spidp *); + if (SSEQ_LEQ(cb->s_snxt, si->si_seq)) + break; + } + found: + if (si) { + if (si->si_seq == cb->s_snxt) + cb->s_snxt++; + else + sppstat.spps_sndvoid++, si = 0; + } + } + /* + * update window + */ + if (rcv_win < 0) + rcv_win = 0; + alo = cb->s_ack - 1 + (rcv_win / ((short)cb->s_mtu)); + if (SSEQ_LT(alo, cb->s_alo)) + alo = cb->s_alo; + + if (si) { + /* + * must make a copy of this packet for + * idp_output to monkey with + */ + m = m_copy(dtom(si), 0, (int)M_COPYALL); + if (m == NULL) { + return (ENOBUFS); + } + si = mtod(m, struct spidp *); + if (SSEQ_LT(si->si_seq, cb->s_smax)) + sppstat.spps_sndrexmitpack++; + else + sppstat.spps_sndpack++; + } else if (cb->s_force || cb->s_flags & SF_ACKNOW) { + /* + * Must send an acknowledgement or a probe + */ + if (cb->s_force) + sppstat.spps_sndprobe++; + if (cb->s_flags & SF_ACKNOW) + sppstat.spps_sndacks++; + m = m_gethdr(M_DONTWAIT, MT_HEADER); + if (m == 0) + return (ENOBUFS); + /* + * Fill in mbuf with extended SP header + * and addresses and length put into network format. + */ + MH_ALIGN(m, sizeof (struct spidp)); + m->m_len = sizeof (*si); + m->m_pkthdr.len = sizeof (*si); + si = mtod(m, struct spidp *); + si->si_i = *cb->s_idp; + si->si_s = cb->s_shdr; + si->si_seq = cb->s_smax + 1; + si->si_len = htons(sizeof (*si)); + si->si_cc |= SP_SP; + } else { + cb->s_outx = 3; + if (so->so_options & SO_DEBUG || traceallspps) + spp_trace(SA_OUTPUT, cb->s_state, cb, si, 0); + return (0); + } + /* + * Stuff checksum and output datagram. + */ + if ((si->si_cc & SP_SP) == 0) { + if (cb->s_force != (1 + SPPT_PERSIST) || + cb->s_timer[SPPT_PERSIST] == 0) { + /* + * If this is a new packet and we are not currently + * timing anything, time this one. 
+ */ + if (SSEQ_LT(cb->s_smax, si->si_seq)) { + cb->s_smax = si->si_seq; + if (cb->s_rtt == 0) { + sppstat.spps_segstimed++; + cb->s_rtseq = si->si_seq; + cb->s_rtt = 1; + } + } + /* + * Set rexmt timer if not currently set, + * Initial value for retransmit timer is smoothed + * round-trip time + 2 * round-trip time variance. + * Initialize shift counter which is used for backoff + * of retransmit time. + */ + if (cb->s_timer[SPPT_REXMT] == 0 && + cb->s_snxt != cb->s_rack) { + cb->s_timer[SPPT_REXMT] = cb->s_rxtcur; + if (cb->s_timer[SPPT_PERSIST]) { + cb->s_timer[SPPT_PERSIST] = 0; + cb->s_rxtshift = 0; + } + } + } else if (SSEQ_LT(cb->s_smax, si->si_seq)) { + cb->s_smax = si->si_seq; + } + } else if (cb->s_state < TCPS_ESTABLISHED) { + if (cb->s_rtt == 0) + cb->s_rtt = 1; /* Time initial handshake */ + if (cb->s_timer[SPPT_REXMT] == 0) + cb->s_timer[SPPT_REXMT] = cb->s_rxtcur; + } + { + /* + * Do not request acks when we ack their data packets or + * when we do a gratuitous window update. + */ + if (((si->si_cc & SP_SP) == 0) || cb->s_force) + si->si_cc |= SP_SA; + si->si_seq = htons(si->si_seq); + si->si_alo = htons(alo); + si->si_ack = htons(cb->s_ack); + + if (idpcksum) { + si->si_sum = 0; + len = ntohs(si->si_len); + if (len & 1) + len++; + si->si_sum = ns_cksum(m, len); + } else + si->si_sum = 0xffff; + + cb->s_outx = 4; + if (so->so_options & SO_DEBUG || traceallspps) + spp_trace(SA_OUTPUT, cb->s_state, cb, si, 0); + + if (so->so_options & SO_DONTROUTE) + error = ns_output(m, (struct route *)0, NS_ROUTETOIF); + else + error = ns_output(m, &cb->s_nspcb->nsp_route, 0); + } + if (error) { + return (error); + } + sppstat.spps_sndtotal++; + /* + * Data sent (as far as we can tell). + * If this advertises a larger window than any other segment, + * then remember the size of the advertized window. + * Any pending ACK has now been sent. 
+ */ + cb->s_force = 0; + cb->s_flags &= ~(SF_ACKNOW|SF_DELACK); + if (SSEQ_GT(alo, cb->s_alo)) + cb->s_alo = alo; + if (sendalot) + goto again; + cb->s_outx = 5; + return (0); +} + +int spp_do_persist_panics = 0; + +spp_setpersist(cb) + register struct sppcb *cb; +{ + register t = ((cb->s_srtt >> 2) + cb->s_rttvar) >> 1; + extern int spp_backoff[]; + + if (cb->s_timer[SPPT_REXMT] && spp_do_persist_panics) + panic("spp_output REXMT"); + /* + * Start/restart persistance timer. + */ + SPPT_RANGESET(cb->s_timer[SPPT_PERSIST], + t*spp_backoff[cb->s_rxtshift], + SPPTV_PERSMIN, SPPTV_PERSMAX); + if (cb->s_rxtshift < SPP_MAXRXTSHIFT) + cb->s_rxtshift++; +} +/*ARGSUSED*/ +spp_ctloutput(req, so, level, name, value) + int req; + struct socket *so; + int name; + struct mbuf **value; +{ + register struct mbuf *m; + struct nspcb *nsp = sotonspcb(so); + register struct sppcb *cb; + int mask, error = 0; + + if (level != NSPROTO_SPP) { + /* This will have to be changed when we do more general + stacking of protocols */ + return (idp_ctloutput(req, so, level, name, value)); + } + if (nsp == NULL) { + error = EINVAL; + goto release; + } else + cb = nstosppcb(nsp); + + switch (req) { + + case PRCO_GETOPT: + if (value == NULL) + return (EINVAL); + m = m_get(M_DONTWAIT, MT_DATA); + if (m == NULL) + return (ENOBUFS); + switch (name) { + + case SO_HEADERS_ON_INPUT: + mask = SF_HI; + goto get_flags; + + case SO_HEADERS_ON_OUTPUT: + mask = SF_HO; + get_flags: + m->m_len = sizeof(short); + *mtod(m, short *) = cb->s_flags & mask; + break; + + case SO_MTU: + m->m_len = sizeof(u_short); + *mtod(m, short *) = cb->s_mtu; + break; + + case SO_LAST_HEADER: + m->m_len = sizeof(struct sphdr); + *mtod(m, struct sphdr *) = cb->s_rhdr; + break; + + case SO_DEFAULT_HEADERS: + m->m_len = sizeof(struct spidp); + *mtod(m, struct sphdr *) = cb->s_shdr; + break; + + default: + error = EINVAL; + } + *value = m; + break; + + case PRCO_SETOPT: + if (value == 0 || *value == 0) { + error = EINVAL; + break; + } + 
switch (name) { + int *ok; + + case SO_HEADERS_ON_INPUT: + mask = SF_HI; + goto set_head; + + case SO_HEADERS_ON_OUTPUT: + mask = SF_HO; + set_head: + if (cb->s_flags & SF_PI) { + ok = mtod(*value, int *); + if (*ok) + cb->s_flags |= mask; + else + cb->s_flags &= ~mask; + } else error = EINVAL; + break; + + case SO_MTU: + cb->s_mtu = *(mtod(*value, u_short *)); + break; + +#ifdef SF_NEWCALL + case SO_NEWCALL: + ok = mtod(*value, int *); + if (*ok) { + cb->s_flags2 |= SF_NEWCALL; + spp_newchecks[5]++; + } else { + cb->s_flags2 &= ~SF_NEWCALL; + spp_newchecks[6]++; + } + break; +#endif + + case SO_DEFAULT_HEADERS: + { + register struct sphdr *sp + = mtod(*value, struct sphdr *); + cb->s_dt = sp->sp_dt; + cb->s_cc = sp->sp_cc & SP_EM; + } + break; + + default: + error = EINVAL; + } + m_freem(*value); + break; + } + release: + return (error); +} + +/*ARGSUSED*/ +spp_usrreq(so, req, m, nam, controlp) + struct socket *so; + int req; + struct mbuf *m, *nam, *controlp; +{ + struct nspcb *nsp = sotonspcb(so); + register struct sppcb *cb; + int s = splnet(); + int error = 0, ostate; + struct mbuf *mm; + register struct sockbuf *sb; + + if (req == PRU_CONTROL) + return (ns_control(so, (int)m, (caddr_t)nam, + (struct ifnet *)controlp)); + if (nsp == NULL) { + if (req != PRU_ATTACH) { + error = EINVAL; + goto release; + } + } else + cb = nstosppcb(nsp); + + ostate = cb ? 
cb->s_state : 0; + + switch (req) { + + case PRU_ATTACH: + if (nsp != NULL) { + error = EISCONN; + break; + } + error = ns_pcballoc(so, &nspcb); + if (error) + break; + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { + error = soreserve(so, (u_long) 3072, (u_long) 3072); + if (error) + break; + } + nsp = sotonspcb(so); + + mm = m_getclr(M_DONTWAIT, MT_PCB); + sb = &so->so_snd; + + if (mm == NULL) { + error = ENOBUFS; + break; + } + cb = mtod(mm, struct sppcb *); + mm = m_getclr(M_DONTWAIT, MT_HEADER); + if (mm == NULL) { + (void) m_free(dtom(m)); + error = ENOBUFS; + break; + } + cb->s_idp = mtod(mm, struct idp *); + cb->s_state = TCPS_LISTEN; + cb->s_smax = -1; + cb->s_swl1 = -1; + cb->s_q.si_next = cb->s_q.si_prev = &cb->s_q; + cb->s_nspcb = nsp; + cb->s_mtu = 576 - sizeof (struct spidp); + cb->s_cwnd = sbspace(sb) * CUNIT / cb->s_mtu; + cb->s_ssthresh = cb->s_cwnd; + cb->s_cwmx = sbspace(sb) * CUNIT / + (2 * sizeof (struct spidp)); + /* Above is recomputed when connecting to account + for changed buffering or mtu's */ + cb->s_rtt = SPPTV_SRTTBASE; + cb->s_rttvar = SPPTV_SRTTDFLT << 2; + SPPT_RANGESET(cb->s_rxtcur, + ((SPPTV_SRTTBASE >> 2) + (SPPTV_SRTTDFLT << 2)) >> 1, + SPPTV_MIN, SPPTV_REXMTMAX); + nsp->nsp_pcb = (caddr_t) cb; + break; + + case PRU_DETACH: + if (nsp == NULL) { + error = ENOTCONN; + break; + } + if (cb->s_state > TCPS_LISTEN) + cb = spp_disconnect(cb); + else + cb = spp_close(cb); + break; + + case PRU_BIND: + error = ns_pcbbind(nsp, nam); + break; + + case PRU_LISTEN: + if (nsp->nsp_lport == 0) + error = ns_pcbbind(nsp, (struct mbuf *)0); + if (error == 0) + cb->s_state = TCPS_LISTEN; + break; + + /* + * Initiate connection to peer. + * Enter SYN_SENT state, and mark socket as connecting. + * Start keep-alive timer, setup prototype header, + * Send initial system packet requesting connection. 
+ */ + case PRU_CONNECT: + if (nsp->nsp_lport == 0) { + error = ns_pcbbind(nsp, (struct mbuf *)0); + if (error) + break; + } + error = ns_pcbconnect(nsp, nam); + if (error) + break; + soisconnecting(so); + sppstat.spps_connattempt++; + cb->s_state = TCPS_SYN_SENT; + cb->s_did = 0; + spp_template(cb); + cb->s_timer[SPPT_KEEP] = SPPTV_KEEP; + cb->s_force = 1 + SPPTV_KEEP; + /* + * Other party is required to respond to + * the port I send from, but he is not + * required to answer from where I am sending to, + * so allow wildcarding. + * original port I am sending to is still saved in + * cb->s_dport. + */ + nsp->nsp_fport = 0; + error = spp_output(cb, (struct mbuf *) 0); + break; + + case PRU_CONNECT2: + error = EOPNOTSUPP; + break; + + /* + * We may decide later to implement connection closing + * handshaking at the spp level optionally. + * here is the hook to do it: + */ + case PRU_DISCONNECT: + cb = spp_disconnect(cb); + break; + + /* + * Accept a connection. Essentially all the work is + * done at higher levels; just return the address + * of the peer, storing through addr. + */ + case PRU_ACCEPT: { + struct sockaddr_ns *sns = mtod(nam, struct sockaddr_ns *); + + nam->m_len = sizeof (struct sockaddr_ns); + sns->sns_family = AF_NS; + sns->sns_addr = nsp->nsp_faddr; + break; + } + + case PRU_SHUTDOWN: + socantsendmore(so); + cb = spp_usrclosed(cb); + if (cb) + error = spp_output(cb, (struct mbuf *) 0); + break; + + /* + * After a receive, possibly send acknowledgment + * updating allocation. 
+ */ + case PRU_RCVD: + cb->s_flags |= SF_RVD; + (void) spp_output(cb, (struct mbuf *) 0); + cb->s_flags &= ~SF_RVD; + break; + + case PRU_ABORT: + (void) spp_drop(cb, ECONNABORTED); + break; + + case PRU_SENSE: + case PRU_CONTROL: + m = NULL; + error = EOPNOTSUPP; + break; + + case PRU_RCVOOB: + if ((cb->s_oobflags & SF_IOOB) || so->so_oobmark || + (so->so_state & SS_RCVATMARK)) { + m->m_len = 1; + *mtod(m, caddr_t) = cb->s_iobc; + break; + } + error = EINVAL; + break; + + case PRU_SENDOOB: + if (sbspace(&so->so_snd) < -512) { + error = ENOBUFS; + break; + } + cb->s_oobflags |= SF_SOOB; + /* fall into */ + case PRU_SEND: + if (controlp) { + u_short *p = mtod(controlp, u_short *); + spp_newchecks[2]++; + if ((p[0] == 5) && p[1] == 1) { /* XXXX, for testing */ + cb->s_shdr.sp_dt = *(u_char *)(&p[2]); + spp_newchecks[3]++; + } + m_freem(controlp); + } + controlp = NULL; + error = spp_output(cb, m); + m = NULL; + break; + + case PRU_SOCKADDR: + ns_setsockaddr(nsp, nam); + break; + + case PRU_PEERADDR: + ns_setpeeraddr(nsp, nam); + break; + + case PRU_SLOWTIMO: + cb = spp_timers(cb, (int)nam); + req |= ((int)nam) << 8; + break; + + case PRU_FASTTIMO: + case PRU_PROTORCV: + case PRU_PROTOSEND: + error = EOPNOTSUPP; + break; + + default: + panic("sp_usrreq"); + } + if (cb && (so->so_options & SO_DEBUG || traceallspps)) + spp_trace(SA_USER, (u_char)ostate, cb, (struct spidp *)0, req); +release: + if (controlp != NULL) + m_freem(controlp); + if (m != NULL) + m_freem(m); + splx(s); + return (error); +} + +spp_usrreq_sp(so, req, m, nam, controlp) + struct socket *so; + int req; + struct mbuf *m, *nam, *controlp; +{ + int error = spp_usrreq(so, req, m, nam, controlp); + + if (req == PRU_ATTACH && error == 0) { + struct nspcb *nsp = sotonspcb(so); + ((struct sppcb *)nsp->nsp_pcb)->s_flags |= + (SF_HI | SF_HO | SF_PI); + } + return (error); +} + +/* + * Create template to be used to send spp packets on a connection. 
+ * Called after host entry created, fills + * in a skeletal spp header (choosing connection id), + * minimizing the amount of work necessary when the connection is used. + */ +spp_template(cb) + register struct sppcb *cb; +{ + register struct nspcb *nsp = cb->s_nspcb; + register struct idp *idp = cb->s_idp; + register struct sockbuf *sb = &(nsp->nsp_socket->so_snd); + + idp->idp_pt = NSPROTO_SPP; + idp->idp_sna = nsp->nsp_laddr; + idp->idp_dna = nsp->nsp_faddr; + cb->s_sid = htons(spp_iss); /* connection id comes from the global spp_iss */ + spp_iss += SPP_ISSINCR/2; + cb->s_alo = 1; + cb->s_cwnd = (sbspace(sb) * CUNIT) / cb->s_mtu; /* send-buffer space in s_mtu-sized packets, scaled by CUNIT */ + cb->s_ssthresh = cb->s_cwnd; /* Try to expand fast to full complement + of large packets */ + cb->s_cwmx = (sbspace(sb) * CUNIT) / (2 * sizeof(struct spidp)); + cb->s_cwmx = max(cb->s_cwmx, cb->s_cwnd); + /* But allow for lots of little packets as well */ +} + +/* + * Close a SPIP control block: + * discard spp control block itself (plus the packets queued on s_q and the s_idp mbuf) + * discard ns protocol control block + * wake up any sleepers; the return value is always a null sppcb pointer + */ +struct sppcb * +spp_close(cb) + register struct sppcb *cb; +{ + register struct spidp_q *s; + struct nspcb *nsp = cb->s_nspcb; + struct socket *so = nsp->nsp_socket; + register struct mbuf *m; + + s = cb->s_q.si_next; + while (s != &(cb->s_q)) { + s = s->si_next; /* step first: the entry behind s is unlinked and freed next */ + m = dtom(s->si_prev); + remque(s->si_prev); + m_freem(m); + } + (void) m_free(dtom(cb->s_idp)); + (void) m_free(dtom(cb)); /* cb is gone; only nsp and so are used below */ + nsp->nsp_pcb = 0; + soisdisconnected(so); + ns_pcbdetach(nsp); + sppstat.spps_closed++; + return ((struct sppcb *)0); +} +/* + * Someday we may do level 3 handshaking + * to close a connection or send a xerox style error. + * For now, just close. + */ +struct sppcb * +spp_usrclosed(cb) + register struct sppcb *cb; +{ + return (spp_close(cb)); +} +struct sppcb * +spp_disconnect(cb) + register struct sppcb *cb; +{ + return (spp_close(cb)); /* no disconnect handshake either; just close */ +} +/* + * Drop connection, reporting + * the specified error. 
+ */ +struct sppcb * +spp_drop(cb, errno) + register struct sppcb *cb; + int errno; +{ + struct socket *so = cb->s_nspcb->nsp_socket; + + /* + * someday, in the xerox world + * we will generate error protocol packets + * announcing that the socket has gone away. + */ + if (TCPS_HAVERCVDSYN(cb->s_state)) { + sppstat.spps_drops++; + cb->s_state = TCPS_CLOSED; + /*(void) tcp_output(cb);*/ + } else + sppstat.spps_conndrops++; + so->so_error = errno; + return (spp_close(cb)); +} + +spp_abort(nsp) + struct nspcb *nsp; +{ + + (void) spp_close((struct sppcb *)nsp->nsp_pcb); +} + +int spp_backoff[SPP_MAXRXTSHIFT+1] = + { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 }; +/* + * Fast timeout routine for processing delayed acks + */ +spp_fasttimo() +{ + register struct nspcb *nsp; + register struct sppcb *cb; + int s = splnet(); + + nsp = nspcb.nsp_next; + if (nsp) + for (; nsp != &nspcb; nsp = nsp->nsp_next) + if ((cb = (struct sppcb *)nsp->nsp_pcb) && + (cb->s_flags & SF_DELACK)) { + cb->s_flags &= ~SF_DELACK; + cb->s_flags |= SF_ACKNOW; + sppstat.spps_delack++; + (void) spp_output(cb, (struct mbuf *) 0); + } + splx(s); +} + +/* + * spp protocol timeout routine called every 500 ms. + * Updates the timers in all active pcb's and + * causes finite state machine actions if timers expire. + */ +spp_slowtimo() +{ + register struct nspcb *ip, *ipnxt; + register struct sppcb *cb; + int s = splnet(); + register int i; + + /* + * Search through tcb's and update active timers. 
+ */ + ip = nspcb.nsp_next; + if (ip == 0) { + splx(s); + return; + } + while (ip != &nspcb) { + cb = nstosppcb(ip); + ipnxt = ip->nsp_next; + if (cb == 0) + goto tpgone; + for (i = 0; i < SPPT_NTIMERS; i++) { + if (cb->s_timer[i] && --cb->s_timer[i] == 0) { + (void) spp_usrreq(cb->s_nspcb->nsp_socket, + PRU_SLOWTIMO, (struct mbuf *)0, + (struct mbuf *)i, (struct mbuf *)0, + (struct mbuf *)0); + if (ipnxt->nsp_prev != ip) + goto tpgone; + } + } + cb->s_idle++; + if (cb->s_rtt) + cb->s_rtt++; +tpgone: + ip = ipnxt; + } + spp_iss += SPP_ISSINCR/PR_SLOWHZ; /* increment iss */ + splx(s); +} +/* + * SPP timer processing. + */ +struct sppcb * +spp_timers(cb, timer) + register struct sppcb *cb; + int timer; +{ + long rexmt; + int win; + + cb->s_force = 1 + timer; + switch (timer) { + + /* + * 2 MSL timeout in shutdown went off. TCP deletes connection + * control block. + */ + case SPPT_2MSL: + printf("spp: SPPT_2MSL went off for no reason\n"); + cb->s_timer[timer] = 0; + break; + + /* + * Retransmission timer went off. Message has not + * been acked within retransmit interval. Back off + * to a longer retransmit interval and retransmit one packet. + */ + case SPPT_REXMT: + if (++cb->s_rxtshift > SPP_MAXRXTSHIFT) { + cb->s_rxtshift = SPP_MAXRXTSHIFT; + sppstat.spps_timeoutdrop++; + cb = spp_drop(cb, ETIMEDOUT); + break; + } + sppstat.spps_rexmttimeo++; + rexmt = ((cb->s_srtt >> 2) + cb->s_rttvar) >> 1; + rexmt *= spp_backoff[cb->s_rxtshift]; + SPPT_RANGESET(cb->s_rxtcur, rexmt, SPPTV_MIN, SPPTV_REXMTMAX); + cb->s_timer[SPPT_REXMT] = cb->s_rxtcur; + /* + * If we have backed off fairly far, our srtt + * estimate is probably bogus. Clobber it + * so we'll take the next rtt measurement as our srtt; + * move the current srtt into rttvar to keep the current + * retransmit times until then. + */ + if (cb->s_rxtshift > SPP_MAXRXTSHIFT / 4 ) { + cb->s_rttvar += (cb->s_srtt >> 2); + cb->s_srtt = 0; + } + cb->s_snxt = cb->s_rack; + /* + * If timing a packet, stop the timer. 
+ */ + cb->s_rtt = 0; + /* + * See very long discussion in tcp_timer.c about congestion + * window and sstrhesh + */ + win = min(cb->s_swnd, (cb->s_cwnd/CUNIT)) / 2; + if (win < 2) + win = 2; + cb->s_cwnd = CUNIT; + cb->s_ssthresh = win * CUNIT; + (void) spp_output(cb, (struct mbuf *) 0); + break; + + /* + * Persistance timer into zero window. + * Force a probe to be sent. + */ + case SPPT_PERSIST: + sppstat.spps_persisttimeo++; + spp_setpersist(cb); + (void) spp_output(cb, (struct mbuf *) 0); + break; + + /* + * Keep-alive timer went off; send something + * or drop connection if idle for too long. + */ + case SPPT_KEEP: + sppstat.spps_keeptimeo++; + if (cb->s_state < TCPS_ESTABLISHED) + goto dropit; + if (cb->s_nspcb->nsp_socket->so_options & SO_KEEPALIVE) { + if (cb->s_idle >= SPPTV_MAXIDLE) + goto dropit; + sppstat.spps_keepprobe++; + (void) spp_output(cb, (struct mbuf *) 0); + } else + cb->s_idle = 0; + cb->s_timer[SPPT_KEEP] = SPPTV_KEEP; + break; + dropit: + sppstat.spps_keepdrops++; + cb = spp_drop(cb, ETIMEDOUT); + break; + } + return (cb); +} +#ifndef lint +int SppcbSize = sizeof (struct sppcb); +int NspcbSize = sizeof (struct nspcb); +#endif /* lint */ diff --git a/sys/netns/spp_var.h b/sys/netns/spp_var.h new file mode 100644 index 00000000000..0d44f63904f --- /dev/null +++ b/sys/netns/spp_var.h @@ -0,0 +1,215 @@ +/* + * Copyright (c) 1984, 1985, 1986, 1987, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)spp_var.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Sp control block, one per connection + */ +struct sppcb { + struct spidp_q s_q; /* queue for out-of-order receipt */ + struct nspcb *s_nspcb; /* backpointer to ns pcb */ + u_char s_state; /* connection state (TCPS_* values) */ + u_char s_flags; /* SF_* bits below */ +#define SF_ACKNOW 0x01 /* Ack peer immediately */ +#define SF_DELACK 0x02 /* Ack, but try to delay it */ +#define SF_HI 0x04 /* Show headers on input */ +#define SF_HO 0x08 /* Show headers on output */ +#define SF_PI 0x10 /* Packet (datagram) interface */ +#define SF_WIN 0x20 /* Window info changed */ +#define SF_RXT 0x40 /* Rxt info changed */ +#define SF_RVD 0x80 /* Calling from read usrreq routine */ + u_short s_mtu; /* Max packet size for this stream */ +/* use sequence fields in headers to store sequence numbers for this + connection */ + struct idp *s_idp; /* idp header filled in by spp_template() */ + struct sphdr s_shdr; /* prototype header to transmit */ +#define s_cc s_shdr.sp_cc /* connection control (for EM bit) */ +#define s_dt s_shdr.sp_dt /* datastream type */ +#define s_sid s_shdr.sp_sid /* source connection identifier */ +#define s_did s_shdr.sp_did /* destination connection identifier */ +#define s_seq s_shdr.sp_seq /* sequence number */ +#define s_ack s_shdr.sp_ack /* acknowledge number */ +#define s_alo s_shdr.sp_alo /* allocation number */ +#define s_dport s_idp->idp_dna.x_port /* where we are sending */ + struct sphdr s_rhdr; /* last received header (in effect!)*/ + u_short s_rack; /* their acknowledge number */ + u_short s_ralo; /* their allocation number */ + u_short s_smax; /* highest packet # we have sent */ + u_short s_snxt; /* which packet to send next */ + +/* congestion control */ +#define CUNIT 1024 /* scaling for ... 
*/ + int s_cwnd; /* Congestion-controlled window */ + /* in packets * CUNIT */ + short s_swnd; /* == tcp snd_wnd, in packets */ + short s_smxw; /* == tcp max_sndwnd */ + /* difference of two spp_seq's can be + no bigger than a short */ + u_short s_swl1; /* == tcp snd_wl1 */ + u_short s_swl2; /* == tcp snd_wl2 */ + int s_cwmx; /* max allowable cwnd */ + int s_ssthresh; /* s_cwnd size threshold for + * slow start exponential-to- + * linear switch */ +/* transmit timing stuff + * srtt and rttvar are stored as fixed point, for convenience in smoothing. + * srtt has 3 bits to the right of the binary point, rttvar has 2. + */ + short s_idle; /* time idle */ + short s_timer[SPPT_NTIMERS]; /* timers */ + short s_rxtshift; /* log(2) of rexmt exp. backoff */ + short s_rxtcur; /* current retransmit value */ + u_short s_rtseq; /* packet being timed */ + short s_rtt; /* timer for round trips */ + short s_srtt; /* averaged timer */ + short s_rttvar; /* variance in round trip time */ + char s_force; /* which timer expired (spp_timers stores 1 + timer) */ + char s_dupacks; /* counter to intuit xmt loss */ + +/* out of band data */ + char s_oobflags; +#define SF_SOOB 0x08 /* sending out of band data */ +#define SF_IOOB 0x10 /* receiving out of band data */ + char s_iobc; /* input characters */ +/* debug stuff */ + u_short s_want; /* Last candidate for sending */ + char s_outx; /* exit taken from spp_output */ + char s_inx; /* exit taken from spp_input */ + u_short s_flags2; /* more flags for testing */ +#define SF_NEWCALL 0x100 /* for new_recvmsg */ +#define SO_NEWCALL 10 /* for new_recvmsg */ +}; + +#define nstosppcb(np) ((struct sppcb *)(np)->nsp_pcb) +#define sotosppcb(so) (nstosppcb(sotonspcb(so))) + +struct sppstat { + long spps_connattempt; /* connections initiated */ + long spps_accepts; /* connections accepted */ + long spps_connects; /* connections established */ + long spps_drops; /* connections dropped */ + long spps_conndrops; /* embryonic connections dropped */ + long spps_closed; /* conn. 
closed (includes drops) */ + long spps_segstimed; /* segs where we tried to get rtt */ + long spps_rttupdated; /* times we succeeded */ + long spps_delack; /* delayed acks sent */ + long spps_timeoutdrop; /* conn. dropped in rxmt timeout */ + long spps_rexmttimeo; /* retransmit timeouts */ + long spps_persisttimeo; /* persist timeouts */ + long spps_keeptimeo; /* keepalive timeouts */ + long spps_keepprobe; /* keepalive probes sent */ + long spps_keepdrops; /* connections dropped in keepalive */ + + long spps_sndtotal; /* total packets sent */ + long spps_sndpack; /* data packets sent */ + long spps_sndbyte; /* data bytes sent */ + long spps_sndrexmitpack; /* data packets retransmitted */ + long spps_sndrexmitbyte; /* data bytes retransmitted */ + long spps_sndacks; /* ack-only packets sent */ + long spps_sndprobe; /* window probes sent */ + long spps_sndurg; /* packets sent with URG only */ + long spps_sndwinup; /* window update-only packets sent */ + long spps_sndctrl; /* control (SYN|FIN|RST) packets sent */ + long spps_sndvoid; /* couldn't find requested packet*/ + + long spps_rcvtotal; /* total packets received */ + long spps_rcvpack; /* packets received in sequence */ + long spps_rcvbyte; /* bytes received in sequence */ + long spps_rcvbadsum; /* packets received with checksum errs */ + long spps_rcvbadoff; /* packets received with bad offset */ + long spps_rcvshort; /* packets received too short */ + long spps_rcvduppack; /* duplicate-only packets received */ + long spps_rcvdupbyte; /* duplicate-only bytes received */ + long spps_rcvpartduppack; /* packets with some duplicate data */ + long spps_rcvpartdupbyte; /* dup. bytes in part-dup. 
packets */ + long spps_rcvoopack; /* out-of-order packets received */ + long spps_rcvoobyte; /* out-of-order bytes received */ + long spps_rcvpackafterwin; /* packets with data after window */ + long spps_rcvbyteafterwin; /* bytes rcvd after window */ + long spps_rcvafterclose; /* packets rcvd after "close" */ + long spps_rcvwinprobe; /* rcvd window probe packets */ + long spps_rcvdupack; /* rcvd duplicate acks */ + long spps_rcvacktoomuch; /* rcvd acks for unsent data */ + long spps_rcvackpack; /* rcvd ack packets */ + long spps_rcvackbyte; /* bytes acked by rcvd acks */ + long spps_rcvwinupd; /* rcvd window update packets */ +}; +struct spp_istat { /* wraps sppstat as newstats; see the sppstat macro below */ + short hdrops; + short badsum; + short badlen; + short slotim; + short fastim; + short nonucn; + short noconn; + short notme; + short wrncon; + short bdreas; + short gonawy; + short notyet; + short lstdup; + struct sppstat newstats; +}; + +#ifdef KERNEL +struct spp_istat spp_istat; + +/* Following was struct sppstat sppstat; */ +#ifndef sppstat +#define sppstat spp_istat.newstats +#endif + +u_short spp_iss; /* seeds s_sid in spp_template(); advanced in spp_slowtimo() */ +extern struct sppcb *spp_close(), *spp_disconnect(), + *spp_usrclosed(), *spp_timers(), *spp_drop(); +#endif + +#define SPP_ISSINCR 128 +/* + * SPP sequence numbers are 16 bit integers operated + * on with modular arithmetic. These macros can be + * used to compare such integers. Each macro casts the difference (a)-(b) to short and tests its sign. 
+ */ +#ifdef sun +short xnsCbug; /* scratch for the difference; presumably works around a Sun compiler bug */ +#define SSEQ_LT(a,b) ((xnsCbug = (short)((a)-(b))) < 0) +#define SSEQ_LEQ(a,b) ((xnsCbug = (short)((a)-(b))) <= 0) +#define SSEQ_GT(a,b) ((xnsCbug = (short)((a)-(b))) > 0) +#define SSEQ_GEQ(a,b) ((xnsCbug = (short)((a)-(b))) >= 0) +#else +#define SSEQ_LT(a,b) (((short)((a)-(b))) < 0) +#define SSEQ_LEQ(a,b) (((short)((a)-(b))) <= 0) +#define SSEQ_GT(a,b) (((short)((a)-(b))) > 0) +#define SSEQ_GEQ(a,b) (((short)((a)-(b))) >= 0) +#endif diff --git a/sys/nfs/nfs.h b/sys/nfs/nfs.h new file mode 100644 index 00000000000..261fd42657a --- /dev/null +++ b/sys/nfs/nfs.h @@ -0,0 +1,297 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Tunable constants for nfs + */ + +#define NFS_MAXIOVEC 34 +#define NFS_HZ 25 /* Ticks per second for NFS timeouts */ +#define NFS_TIMEO (1*NFS_HZ) /* Default timeout = 1 second */ +#define NFS_MINTIMEO (1*NFS_HZ) /* Min timeout to use */ +#define NFS_MAXTIMEO (60*NFS_HZ) /* Max timeout to backoff to */ +#define NFS_MINIDEMTIMEO (5*NFS_HZ) /* Min timeout for non-idempotent ops*/ +#define NFS_MAXREXMIT 100 /* Stop counting after this many */ +#define NFS_MAXWINDOW 1024 /* Max number of outstanding requests */ +#define NFS_RETRANS 10 /* Num of retrans for soft mounts */ +#define NFS_MAXGRPS 16 /* Max. size of groups list */ +#define NFS_MINATTRTIMO 5 /* Attribute cache timeout in sec */ +#define NFS_MAXATTRTIMO 60 /* Upper bound applied by NFS_ATTRTIMEO */ +#define NFS_WSIZE 8192 /* Def. write data size <= 8192 */ +#define NFS_RSIZE 8192 /* Def. read data size <= 8192 */ +#define NFS_DEFRAHEAD 1 /* Def. read ahead # blocks */ +#define NFS_MAXRAHEAD 4 /* Max. read ahead # blocks */ +#define NFS_MAXREADDIR NFS_MAXDATA /* Max. size of directory read */ +#define NFS_MAXUIDHASH 64 /* Max. # of hashed uid entries/mp */ +#define NFS_MAXASYNCDAEMON 20 /* Max. 
number async_daemons runnable */ +#define NFS_DIRBLKSIZ 1024 /* Size of an NFS directory block */ +#define NMOD(a) ((a) % nfs_asyncdaemons) + +/* + * Set the attribute timeout based on how recently the file has been modified: (seconds since n_mtime) / 10, clamped to [NFS_MINATTRTIMO, NFS_MAXATTRTIMO]; NFS_MINATTRTIMO whenever NMODIFIED is set. + */ +#define NFS_ATTRTIMEO(np) \ + ((((np)->n_flag & NMODIFIED) || \ + (time.tv_sec - (np)->n_mtime) / 10 < NFS_MINATTRTIMO) ? NFS_MINATTRTIMO : \ + ((time.tv_sec - (np)->n_mtime) / 10 > NFS_MAXATTRTIMO ? NFS_MAXATTRTIMO : \ + (time.tv_sec - (np)->n_mtime) / 10)) + +/* + * Structures for the nfssvc(2) syscall. Not that anyone but nfsd and mount_nfs + * should ever try and use it. + */ +struct nfsd_args { + int sock; /* Socket to serve */ + caddr_t name; /* Client address for connection based sockets */ + int namelen; /* Length of name */ +}; + +struct nfsd_srvargs { + struct nfsd *nsd_nfsd; /* Pointer to in kernel nfsd struct */ + uid_t nsd_uid; /* Effective uid mapped to cred */ + u_long nsd_haddr; /* Ip address of client */ + struct ucred nsd_cr; /* Cred. uid maps to */ + int nsd_authlen; /* Length of auth string (ret) */ + char *nsd_authstr; /* Auth string (ret) */ +}; + +struct nfsd_cargs { + char *ncd_dirp; /* Mount dir path */ + uid_t ncd_authuid; /* Effective uid */ + int ncd_authtype; /* Type of authenticator */ + int ncd_authlen; /* Length of authenticator string */ + char *ncd_authstr; /* Authenticator string */ +}; + +/* + * Stats structure + */ +struct nfsstats { + int attrcache_hits; + int attrcache_misses; + int lookupcache_hits; + int lookupcache_misses; + int direofcache_hits; + int direofcache_misses; + int biocache_reads; + int read_bios; + int read_physios; + int biocache_writes; + int write_bios; + int write_physios; + int biocache_readlinks; + int readlink_bios; + int biocache_readdirs; + int readdir_bios; + int rpccnt[NFS_NPROCS]; + int rpcretries; + int srvrpccnt[NFS_NPROCS]; + int srvrpc_errs; + int srv_errs; + int rpcrequests; + int rpctimeouts; + int rpcunexpected; + int rpcinvalid; + int srvcache_inproghits; + int 
srvcache_idemdonehits; + int srvcache_nonidemdonehits; + int srvcache_misses; + int srvnqnfs_leases; + int srvnqnfs_maxleases; + int srvnqnfs_getleases; +}; + +/* + * Flags for nfssvc() system call. + */ +#define NFSSVC_BIOD 0x002 +#define NFSSVC_NFSD 0x004 +#define NFSSVC_ADDSOCK 0x008 +#define NFSSVC_AUTHIN 0x010 +#define NFSSVC_GOTAUTH 0x040 +#define NFSSVC_AUTHINFAIL 0x080 +#define NFSSVC_MNTD 0x100 + +/* + * The set of signals that interrupt an I/O in progress for NFSMNT_INT mounts. + * What should be in this set is open to debate, but I believe that since + * I/O system calls on ufs are never interrupted by signals the set should + * be minimal. My reasoning is that many current programs that use signals + * such as SIGALRM will not expect file I/O system calls to be interrupted + * by them and break. + */ +#ifdef KERNEL +#define NFSINT_SIGMASK (sigmask(SIGINT)|sigmask(SIGTERM)|sigmask(SIGKILL)| \ + sigmask(SIGHUP)|sigmask(SIGQUIT)) + +/* + * Socket errors ignored for connectionless sockets?? 
+ * For now, ignore them all + */ +#define NFSIGNORE_SOERROR(s, e) \ + ((e) != EINTR && (e) != ERESTART && (e) != EWOULDBLOCK && \ + ((s) & PR_CONNREQUIRED) == 0) + +/* + * Nfs outstanding request list element + */ +struct nfsreq { + struct nfsreq *r_next; + struct nfsreq *r_prev; + struct mbuf *r_mreq; + struct mbuf *r_mrep; + struct mbuf *r_md; + caddr_t r_dpos; + struct nfsmount *r_nmp; + struct vnode *r_vp; + u_long r_xid; + int r_flags; /* flags on request, see below */ + int r_retry; /* max retransmission count */ + int r_rexmit; /* current retrans count */ + int r_timer; /* tick counter on reply */ + int r_procnum; /* NFS procedure number */ + int r_rtt; /* RTT for rpc */ + struct proc *r_procp; /* Proc that did I/O system call */ +}; + +/* Flag values for r_flags */ +#define R_TIMING 0x01 /* timing request (in mntp) */ +#define R_SENT 0x02 /* request has been sent */ +#define R_SOFTTERM 0x04 /* soft mnt, too many retries */ +#define R_INTR 0x08 /* intr mnt, signal pending */ +#define R_SOCKERR 0x10 /* Fatal error on socket */ +#define R_TPRINTFMSG 0x20 /* Did a tprintf msg. */ +#define R_MUSTRESEND 0x40 /* Must resend request */ +#define R_GETONEREP 0x80 /* Probe for one reply only */ + +struct nfsstats nfsstats; + +/* + * A list of nfssvc_sock structures is maintained with all the sockets + * that require service by the nfsd. + * The nfsuid structs hang off of the nfssvc_sock structs in both lru + * and uid hash lists. + */ +#define NUIDHASHSIZ 32 +#define NUIDHASH(uid) ((uid) & (NUIDHASHSIZ - 1)) + +/* + * Network address hash list element + */ +union nethostaddr { + u_long had_inetaddr; + struct mbuf *had_nam; +}; + +struct nfsuid { + struct nfsuid *nu_lrunext; /* MUST be first */ + struct nfsuid *nu_lruprev; + struct nfsuid *nu_hnext; + struct nfsuid *nu_hprev; + int nu_flag; /* Flags */ + uid_t nu_uid; /* Uid mapped by this entry */ + union nethostaddr nu_haddr; /* Host addr. 
for dgram sockets */ + struct ucred nu_cr; /* Cred uid mapped to */ +}; + +#define nu_inetaddr nu_haddr.had_inetaddr +#define nu_nam nu_haddr.had_nam +/* Bits for nu_flag */ +#define NU_INETADDR 0x1 + +struct nfssvc_sock { + struct nfsuid *ns_lrunext; /* MUST be first */ + struct nfsuid *ns_lruprev; + struct nfssvc_sock *ns_next; + struct nfssvc_sock *ns_prev; + int ns_flag; + u_long ns_sref; + struct file *ns_fp; + struct socket *ns_so; + int ns_solock; + struct mbuf *ns_nam; + int ns_cc; + struct mbuf *ns_raw; + struct mbuf *ns_rawend; + int ns_reclen; + struct mbuf *ns_rec; + struct mbuf *ns_recend; + int ns_numuids; + struct nfsuid *ns_uidh[NUIDHASHSIZ]; +}; + +/* Bits for "ns_flag" */ +#define SLP_VALID 0x01 +#define SLP_DOREC 0x02 +#define SLP_NEEDQ 0x04 +#define SLP_DISCONN 0x08 +#define SLP_GETSTREAM 0x10 +#define SLP_INIT 0x20 +#define SLP_WANTINIT 0x40 + +#define SLP_ALLFLAGS 0xff + +/* + * One of these structures is allocated for each nfsd. + */ +struct nfsd { + struct nfsd *nd_next; /* Must be first */ + struct nfsd *nd_prev; + int nd_flag; /* NFSD_ flags */ + struct nfssvc_sock *nd_slp; /* Current socket */ + struct mbuf *nd_nam; /* Client addr for datagram req. */ + struct mbuf *nd_mrep; /* Req. mbuf list */ + struct mbuf *nd_md; + caddr_t nd_dpos; /* Position in list */ + int nd_procnum; /* RPC procedure number */ + u_long nd_retxid; /* RPC xid */ + int nd_repstat; /* Reply status value */ + struct ucred nd_cr; /* Credentials for req. 
*/ + int nd_nqlflag; /* Leasing flag */ + int nd_duration; /* Lease duration */ + int nd_authlen; /* Authenticator len */ + u_char nd_authstr[RPCAUTH_MAXSIZ]; /* Authenticator data */ + struct proc *nd_procp; /* Proc ptr */ +}; + +#define NFSD_WAITING 0x01 +#define NFSD_CHECKSLP 0x02 +#define NFSD_REQINPROG 0x04 +#define NFSD_NEEDAUTH 0x08 +#define NFSD_AUTHFAIL 0x10 +#endif /* KERNEL */ diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c new file mode 100644 index 00000000000..177a278b631 --- /dev/null +++ b/sys/nfs/nfs_bio.c @@ -0,0 +1,799 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs_bio.c 8.5 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +struct buf *incore(), *nfs_getcacheblk(); +extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; +extern int nfs_numasync; + +/* + * Vnode op for read using bio + * Any similarity to readip() is purely coincidental + */ +nfs_bioread(vp, uio, ioflag, cred) + register struct vnode *vp; + register struct uio *uio; + int ioflag; + struct ucred *cred; +{ + register struct nfsnode *np = VTONFS(vp); + register int biosize, diff; + struct buf *bp, *rabp; + struct vattr vattr; + struct proc *p; + struct nfsmount *nmp; + daddr_t lbn, bn, rabn; + caddr_t baddr; + int got_buf, nra, error = 0, n, on, not_readin; + +#ifdef lint + ioflag = ioflag; +#endif /* lint */ +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_READ) + panic("nfs_read mode"); +#endif + if (uio->uio_resid == 0) + return (0); + if (uio->uio_offset < 0 && vp->v_type != VDIR) + return (EINVAL); + nmp = VFSTONFS(vp->v_mount); + biosize = nmp->nm_rsize; + p = uio->uio_procp; + /* + * For nfs, cache consistency can only be maintained approximately. + * Although RFC1094 does not specify the criteria, the following is + * believed to be compatible with the reference port. + * For nqnfs, full cache consistency is maintained within the loop. 
+ * For nfs: + * If the file's modify time on the server has changed since the + * last read rpc or you have written to the file, + * you may have lost data cache consistency with the + * server, so flush all of the file's data out of the cache. + * Then force a getattr rpc to ensure that you have up to date + * attributes. + * The mount flag NFSMNT_MYWRITE says "Assume that my writes are + * the ones changing the modify time. + * NB: This implies that cache data can be read when up to + * NFS_ATTRTIMEO seconds out of date. If you find that you need current + * attributes this could be forced by setting n_attrstamp to 0 before + * the VOP_GETATTR() call. + */ + if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && vp->v_type != VLNK) { + if (np->n_flag & NMODIFIED) { + if ((nmp->nm_flag & NFSMNT_MYWRITE) == 0 || + vp->v_type != VREG) { + if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1)) + return (error); + } + np->n_attrstamp = 0; + np->n_direofoffset = 0; + if (error = VOP_GETATTR(vp, &vattr, cred, p)) + return (error); + np->n_mtime = vattr.va_mtime.ts_sec; + } else { + if (error = VOP_GETATTR(vp, &vattr, cred, p)) + return (error); + if (np->n_mtime != vattr.va_mtime.ts_sec) { + np->n_direofoffset = 0; + if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1)) + return (error); + np->n_mtime = vattr.va_mtime.ts_sec; + } + } + } + do { + + /* + * Get a valid lease. If cached data is stale, flush it. 
+ */ + if (nmp->nm_flag & NFSMNT_NQNFS) { + if (NQNFS_CKINVALID(vp, np, NQL_READ)) { + do { + error = nqnfs_getlease(vp, NQL_READ, cred, p); + } while (error == NQNFS_EXPIRED); + if (error) + return (error); + if (np->n_lrev != np->n_brev || + (np->n_flag & NQNFSNONCACHE) || + ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) { + if (vp->v_type == VDIR) { + np->n_direofoffset = 0; + cache_purge(vp); + } + if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1)) + return (error); + np->n_brev = np->n_lrev; + } + } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) { + np->n_direofoffset = 0; + cache_purge(vp); + if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1)) + return (error); + } + } + if (np->n_flag & NQNFSNONCACHE) { + switch (vp->v_type) { + case VREG: + error = nfs_readrpc(vp, uio, cred); + break; + case VLNK: + error = nfs_readlinkrpc(vp, uio, cred); + break; + case VDIR: + error = nfs_readdirrpc(vp, uio, cred); + break; + }; + return (error); + } + baddr = (caddr_t)0; + switch (vp->v_type) { + case VREG: + nfsstats.biocache_reads++; + lbn = uio->uio_offset / biosize; + on = uio->uio_offset & (biosize-1); + bn = lbn * (biosize / DEV_BSIZE); + not_readin = 1; + + /* + * Start the read ahead(s), as required. + */ + if (nfs_numasync > 0 && nmp->nm_readahead > 0 && + lbn == vp->v_lastr + 1) { + for (nra = 0; nra < nmp->nm_readahead && + (lbn + 1 + nra) * biosize < np->n_size; nra++) { + rabn = (lbn + 1 + nra) * (biosize / DEV_BSIZE); + if (!incore(vp, rabn)) { + rabp = nfs_getcacheblk(vp, rabn, biosize, p); + if (!rabp) + return (EINTR); + if ((rabp->b_flags & (B_DELWRI | B_DONE)) == 0) { + rabp->b_flags |= (B_READ | B_ASYNC); + if (nfs_asyncio(rabp, cred)) { + rabp->b_flags |= B_INVAL; + brelse(rabp); + } + } + } + } + } + + /* + * If the block is in the cache and has the required data + * in a valid region, just copy it out. + * Otherwise, get the block and write back/read in, + * as required. 
+ */ + if ((bp = incore(vp, bn)) && + (bp->b_flags & (B_BUSY | B_WRITEINPROG)) == + (B_BUSY | B_WRITEINPROG)) + got_buf = 0; + else { +again: + bp = nfs_getcacheblk(vp, bn, biosize, p); + if (!bp) + return (EINTR); + got_buf = 1; + if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) { + bp->b_flags |= B_READ; + not_readin = 0; + if (error = nfs_doio(bp, cred, p)) { + brelse(bp); + return (error); + } + } + } + n = min((unsigned)(biosize - on), uio->uio_resid); + diff = np->n_size - uio->uio_offset; + if (diff < n) + n = diff; + if (not_readin && n > 0) { + if (on < bp->b_validoff || (on + n) > bp->b_validend) { + if (!got_buf) { + bp = nfs_getcacheblk(vp, bn, biosize, p); + if (!bp) + return (EINTR); + got_buf = 1; + } + bp->b_flags |= B_INVAL; + if (bp->b_dirtyend > 0) { + if ((bp->b_flags & B_DELWRI) == 0) + panic("nfsbioread"); + if (VOP_BWRITE(bp) == EINTR) + return (EINTR); + } else + brelse(bp); + goto again; + } + } + vp->v_lastr = lbn; + diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on); + if (diff < n) + n = diff; + break; + case VLNK: + nfsstats.biocache_readlinks++; + bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p); + if (!bp) + return (EINTR); + if ((bp->b_flags & B_DONE) == 0) { + bp->b_flags |= B_READ; + if (error = nfs_doio(bp, cred, p)) { + brelse(bp); + return (error); + } + } + n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); + got_buf = 1; + on = 0; + break; + case VDIR: + nfsstats.biocache_readdirs++; + bn = (daddr_t)uio->uio_offset; + bp = nfs_getcacheblk(vp, bn, NFS_DIRBLKSIZ, p); + if (!bp) + return (EINTR); + if ((bp->b_flags & B_DONE) == 0) { + bp->b_flags |= B_READ; + if (error = nfs_doio(bp, cred, p)) { + brelse(bp); + return (error); + } + } + + /* + * If not eof and read aheads are enabled, start one. + * (You need the current block first, so that you have the + * directory offset cookie of the next block. 
+ */ + rabn = bp->b_blkno; + if (nfs_numasync > 0 && nmp->nm_readahead > 0 && + rabn != 0 && rabn != np->n_direofoffset && + !incore(vp, rabn)) { + rabp = nfs_getcacheblk(vp, rabn, NFS_DIRBLKSIZ, p); + if (rabp) { + if ((rabp->b_flags & (B_DONE | B_DELWRI)) == 0) { + rabp->b_flags |= (B_READ | B_ASYNC); + if (nfs_asyncio(rabp, cred)) { + rabp->b_flags |= B_INVAL; + brelse(rabp); + } + } + } + } + on = 0; + n = min(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid); + got_buf = 1; + break; + }; + + if (n > 0) { + if (!baddr) + baddr = bp->b_data; + error = uiomove(baddr + on, (int)n, uio); + } + switch (vp->v_type) { + case VREG: + if (n + on == biosize || uio->uio_offset == np->n_size) + bp->b_flags |= B_AGE; + break; + case VLNK: + n = 0; + break; + case VDIR: + uio->uio_offset = bp->b_blkno; + break; + }; + if (got_buf) + brelse(bp); + } while (error == 0 && uio->uio_resid > 0 && n > 0); + return (error); +} + +/* + * Vnode op for write using bio + */ +nfs_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + register int biosize; + register struct uio *uio = ap->a_uio; + struct proc *p = uio->uio_procp; + register struct vnode *vp = ap->a_vp; + struct nfsnode *np = VTONFS(vp); + register struct ucred *cred = ap->a_cred; + int ioflag = ap->a_ioflag; + struct buf *bp; + struct vattr vattr; + struct nfsmount *nmp; + daddr_t lbn, bn; + int n, on, error = 0; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_WRITE) + panic("nfs_write mode"); + if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) + panic("nfs_write proc"); +#endif + if (vp->v_type != VREG) + return (EIO); + if (np->n_flag & NWRITEERR) { + np->n_flag &= ~NWRITEERR; + return (np->n_error); + } + if (ioflag & (IO_APPEND | IO_SYNC)) { + if (np->n_flag & NMODIFIED) { + np->n_attrstamp = 0; + if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1)) + return (error); + } + if (ioflag & IO_APPEND) { + np->n_attrstamp = 0; + if 
(error = VOP_GETATTR(vp, &vattr, cred, p)) + return (error); + uio->uio_offset = np->n_size; + } + } + nmp = VFSTONFS(vp->v_mount); + if (uio->uio_offset < 0) + return (EINVAL); + if (uio->uio_resid == 0) + return (0); + /* + * Maybe this should be above the vnode op call, but so long as + * file servers have no limits, i don't think it matters + */ + if (p && uio->uio_offset + uio->uio_resid > + p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { + psignal(p, SIGXFSZ); + return (EFBIG); + } + /* + * I use nm_rsize, not nm_wsize so that all buffer cache blocks + * will be the same size within a filesystem. nfs_writerpc will + * still use nm_wsize when sizing the rpc's. + */ + biosize = nmp->nm_rsize; + do { + + /* + * Check for a valid write lease. + * If non-cachable, just do the rpc + */ + if ((nmp->nm_flag & NFSMNT_NQNFS) && + NQNFS_CKINVALID(vp, np, NQL_WRITE)) { + do { + error = nqnfs_getlease(vp, NQL_WRITE, cred, p); + } while (error == NQNFS_EXPIRED); + if (error) + return (error); + if (np->n_lrev != np->n_brev || + (np->n_flag & NQNFSNONCACHE)) { + if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1)) + return (error); + np->n_brev = np->n_lrev; + } + } + if (np->n_flag & NQNFSNONCACHE) + return (nfs_writerpc(vp, uio, cred, ioflag)); + nfsstats.biocache_writes++; + lbn = uio->uio_offset / biosize; + on = uio->uio_offset & (biosize-1); + n = min((unsigned)(biosize - on), uio->uio_resid); + bn = lbn * (biosize / DEV_BSIZE); +again: + bp = nfs_getcacheblk(vp, bn, biosize, p); + if (!bp) + return (EINTR); + if (bp->b_wcred == NOCRED) { + crhold(cred); + bp->b_wcred = cred; + } + np->n_flag |= NMODIFIED; + if (uio->uio_offset + n > np->n_size) { + np->n_size = uio->uio_offset + n; + vnode_pager_setsize(vp, (u_long)np->n_size); + } + + /* + * If the new write will leave a contiguous dirty + * area, just update the b_dirtyoff and b_dirtyend, + * otherwise force a write rpc of the old dirty area. 
+ */ + if (bp->b_dirtyend > 0 && + (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { + bp->b_proc = p; + if (VOP_BWRITE(bp) == EINTR) + return (EINTR); + goto again; + } + + /* + * Check for valid write lease and get one as required. + * In case getblk() and/or bwrite() delayed us. + */ + if ((nmp->nm_flag & NFSMNT_NQNFS) && + NQNFS_CKINVALID(vp, np, NQL_WRITE)) { + do { + error = nqnfs_getlease(vp, NQL_WRITE, cred, p); + } while (error == NQNFS_EXPIRED); + if (error) { + brelse(bp); + return (error); + } + if (np->n_lrev != np->n_brev || + (np->n_flag & NQNFSNONCACHE)) { + brelse(bp); + if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1)) + return (error); + np->n_brev = np->n_lrev; + goto again; + } + } + if (error = uiomove((char *)bp->b_data + on, n, uio)) { + bp->b_flags |= B_ERROR; + brelse(bp); + return (error); + } + if (bp->b_dirtyend > 0) { + bp->b_dirtyoff = min(on, bp->b_dirtyoff); + bp->b_dirtyend = max((on + n), bp->b_dirtyend); + } else { + bp->b_dirtyoff = on; + bp->b_dirtyend = on + n; + } +#ifndef notdef + if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff || + bp->b_validoff > bp->b_dirtyend) { + bp->b_validoff = bp->b_dirtyoff; + bp->b_validend = bp->b_dirtyend; + } else { + bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); + bp->b_validend = max(bp->b_validend, bp->b_dirtyend); + } +#else + bp->b_validoff = bp->b_dirtyoff; + bp->b_validend = bp->b_dirtyend; +#endif + if (ioflag & IO_APPEND) + bp->b_flags |= B_APPENDWRITE; + + /* + * If the lease is non-cachable or IO_SYNC do bwrite(). + */ + if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) { + bp->b_proc = p; + if (error = VOP_BWRITE(bp)) + return (error); + } else if ((n + on) == biosize && + (nmp->nm_flag & NFSMNT_NQNFS) == 0) { + bp->b_proc = (struct proc *)0; + bawrite(bp); + } else + bdwrite(bp); + } while (uio->uio_resid > 0 && n > 0); + return (0); +} + +/* + * Get an nfs cache block. 
+ * Allocate a new one if the block isn't currently in the cache + * and return the block marked busy. If the calling process is + * interrupted by a signal for an interruptible mount point, return + * NULL. + */ +struct buf * +nfs_getcacheblk(vp, bn, size, p) + struct vnode *vp; + daddr_t bn; + int size; + struct proc *p; /* handed to nfs_sigintr() between attempts */ +{ + register struct buf *bp; + struct nfsmount *nmp = VFSTONFS(vp->v_mount); + + if (nmp->nm_flag & NFSMNT_INT) { + bp = getblk(vp, bn, size, PCATCH, 0); /* PCATCH on the first attempt only */ + while (bp == (struct buf *)0) { + if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) + return ((struct buf *)0); + bp = getblk(vp, bn, size, 0, 2 * hz); /* retry with a 2 * hz timeout */ + } + } else + bp = getblk(vp, bn, size, 0, 0); /* mount lacks NFSMNT_INT: no PCATCH, no timeout */ + return (bp); +} + +/* + * Flush and invalidate all dirty buffers. If another process is already + * doing the flush, just wait for completion. + */ +nfs_vinvalbuf(vp, flags, cred, p, intrflg) + struct vnode *vp; + int flags; + struct ucred *cred; + struct proc *p; + int intrflg; +{ + register struct nfsnode *np = VTONFS(vp); + struct nfsmount *nmp = VFSTONFS(vp->v_mount); + int error = 0, slpflag, slptimeo; + + if ((nmp->nm_flag & NFSMNT_INT) == 0) + intrflg = 0; + if (intrflg) { + slpflag = PCATCH; + slptimeo = 2 * hz; + } else { + slpflag = 0; + slptimeo = 0; + } + /* + * First wait for any other process doing a flush to complete. + */ + while (np->n_flag & NFLUSHINPROG) { + np->n_flag |= NFLUSHWANT; + error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", + slptimeo); + if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) + return (EINTR); + } + + /* + * Now, flush as required.
+ */ + np->n_flag |= NFLUSHINPROG; + error = vinvalbuf(vp, flags, cred, p, slpflag, 0); + while (error) { + if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) { + np->n_flag &= ~NFLUSHINPROG; + if (np->n_flag & NFLUSHWANT) { + np->n_flag &= ~NFLUSHWANT; + wakeup((caddr_t)&np->n_flag); + } + return (EINTR); + } + error = vinvalbuf(vp, flags, cred, p, 0, slptimeo); + } + np->n_flag &= ~(NMODIFIED | NFLUSHINPROG); + if (np->n_flag & NFLUSHWANT) { + np->n_flag &= ~NFLUSHWANT; + wakeup((caddr_t)&np->n_flag); + } + return (0); +} + +/* + * Initiate asynchronous I/O. Return an error if no nfsiods are available. + * This is mainly to avoid queueing async I/O requests when the nfsiods + * are all hung on a dead server. + */ +nfs_asyncio(bp, cred) + register struct buf *bp; + struct ucred *cred; +{ + register int i; + + if (nfs_numasync == 0) + return (EIO); + for (i = 0; i < NFS_MAXASYNCDAEMON; i++) + if (nfs_iodwant[i]) { + if (bp->b_flags & B_READ) { + if (bp->b_rcred == NOCRED && cred != NOCRED) { + crhold(cred); + bp->b_rcred = cred; + } + } else { + if (bp->b_wcred == NOCRED && cred != NOCRED) { + crhold(cred); + bp->b_wcred = cred; + } + } + + TAILQ_INSERT_TAIL(&nfs_bufq, bp, b_freelist); + nfs_iodwant[i] = (struct proc *)0; + wakeup((caddr_t)&nfs_iodwant[i]); + return (0); + } + return (EIO); +} + +/* + * Do an I/O operation to/from a cache block. This may be called + * synchronously or from an nfsiod. + */ +int +nfs_doio(bp, cr, p) + register struct buf *bp; + struct cred *cr; + struct proc *p; +{ + register struct uio *uiop; + register struct vnode *vp; + struct nfsnode *np; + struct nfsmount *nmp; + int error, diff, len; + struct uio uio; + struct iovec io; + + vp = bp->b_vp; + np = VTONFS(vp); + nmp = VFSTONFS(vp->v_mount); + uiop = &uio; + uiop->uio_iov = &io; + uiop->uio_iovcnt = 1; + uiop->uio_segflg = UIO_SYSSPACE; + uiop->uio_procp = p; + + /* + * Historically, paging was done with physio, but no more. 
+ */ + if (bp->b_flags & B_PHYS) + panic("doio phys"); + if (bp->b_flags & B_READ) { + io.iov_len = uiop->uio_resid = bp->b_bcount; + io.iov_base = bp->b_data; + uiop->uio_rw = UIO_READ; + switch (vp->v_type) { + case VREG: + uiop->uio_offset = bp->b_blkno * DEV_BSIZE; + nfsstats.read_bios++; + error = nfs_readrpc(vp, uiop, cr); + if (!error) { + bp->b_validoff = 0; + if (uiop->uio_resid) { + /* + * If len > 0, there is a hole in the file and + * no writes after the hole have been pushed to + * the server yet. + * Just zero fill the rest of the valid area. + */ + diff = bp->b_bcount - uiop->uio_resid; + len = np->n_size - (bp->b_blkno * DEV_BSIZE + + diff); + if (len > 0) { + len = min(len, uiop->uio_resid); + bzero((char *)bp->b_data + diff, len); + bp->b_validend = diff + len; + } else + bp->b_validend = diff; + } else + bp->b_validend = bp->b_bcount; + } + if (p && (vp->v_flag & VTEXT) && + (((nmp->nm_flag & NFSMNT_NQNFS) && + np->n_lrev != np->n_brev) || + (!(nmp->nm_flag & NFSMNT_NQNFS) && + np->n_mtime != np->n_vattr.va_mtime.ts_sec))) { + uprintf("Process killed due to text file modification\n"); + psignal(p, SIGKILL); + p->p_flag |= P_NOSWAP; + } + break; + case VLNK: + uiop->uio_offset = 0; + nfsstats.readlink_bios++; + error = nfs_readlinkrpc(vp, uiop, cr); + break; + case VDIR: + uiop->uio_offset = bp->b_lblkno; + nfsstats.readdir_bios++; + if (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) + error = nfs_readdirlookrpc(vp, uiop, cr); + else + error = nfs_readdirrpc(vp, uiop, cr); + /* + * Save offset cookie in b_blkno. 
+ */ + bp->b_blkno = uiop->uio_offset; + break; + }; + if (error) { + bp->b_flags |= B_ERROR; + bp->b_error = error; + } + } else { + io.iov_len = uiop->uio_resid = bp->b_dirtyend + - bp->b_dirtyoff; + uiop->uio_offset = (bp->b_blkno * DEV_BSIZE) + + bp->b_dirtyoff; + io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; + uiop->uio_rw = UIO_WRITE; + nfsstats.write_bios++; + if (bp->b_flags & B_APPENDWRITE) + error = nfs_writerpc(vp, uiop, cr, IO_APPEND); + else + error = nfs_writerpc(vp, uiop, cr, 0); + bp->b_flags &= ~(B_WRITEINPROG | B_APPENDWRITE); + + /* + * For an interrupted write, the buffer is still valid and the + * write hasn't been pushed to the server yet, so we can't set + * B_ERROR and report the interruption by setting B_EINTR. For + * the B_ASYNC case, B_EINTR is not relevant, so the rpc attempt + * is essentially a noop. + */ + if (error == EINTR) { + bp->b_flags &= ~B_INVAL; + bp->b_flags |= B_DELWRI; + + /* + * Since for the B_ASYNC case, nfs_bwrite() has reassigned the + * buffer to the clean list, we have to reassign it back to the + * dirty one. Ugh. + */ + if (bp->b_flags & B_ASYNC) + reassignbuf(bp, vp); + else + bp->b_flags |= B_EINTR; + } else { + if (error) { + bp->b_flags |= B_ERROR; + bp->b_error = np->n_error = error; + np->n_flag |= NWRITEERR; + } + bp->b_dirtyoff = bp->b_dirtyend = 0; + } + } + bp->b_resid = uiop->uio_resid; + biodone(bp); + return (error); +} diff --git a/sys/nfs/nfs_common.c b/sys/nfs/nfs_common.c new file mode 100644 index 00000000000..5778f7d7f01 --- /dev/null +++ b/sys/nfs/nfs_common.c @@ -0,0 +1,1130 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs_subs.c 8.3 (Berkeley) 1/4/94 + */ + +/* + * These functions support the macros and help fiddle mbuf chains for + * the nfs op functions. They do things like create the rpc header and + * copy data between mbuf chains and uio lists. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#ifdef ISO +#include +#endif + +#define TRUE 1 +#define FALSE 0 + +/* + * Data items converted to xdr at startup, since they are constant + * This is kinda hokey, but may save a little time doing byte swaps + */ +u_long nfs_procids[NFS_NPROCS]; +u_long nfs_xdrneg1; +u_long rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr, + rpc_mismatch, rpc_auth_unix, rpc_msgaccepted, rpc_rejectedcred, + rpc_auth_kerb; +u_long nfs_vers, nfs_prog, nfs_true, nfs_false; + +/* And other global data */ +static u_long nfs_xid = 0; +enum vtype ntov_type[7] = { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON }; +extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; +extern struct nfsreq nfsreqh; +extern int nqnfs_piggy[NFS_NPROCS]; +extern struct nfsrtt nfsrtt; +extern time_t nqnfsstarttime; +extern u_long nqnfs_prog, nqnfs_vers; +extern int nqsrv_clockskew; +extern int nqsrv_writeslack; +extern int nqsrv_maxlease; + +/* + * Create the header for an rpc request packet + * The hsiz is the size of the rest of the nfs request header. + * (just used to decide if a cluster is a good idea) + */ +struct mbuf * +nfsm_reqh(vp, procid, hsiz, bposp) + struct vnode *vp; + u_long procid; + int hsiz; + caddr_t *bposp; +{ + register struct mbuf *mb; + register u_long *tl; + register caddr_t bpos; + struct mbuf *mb2; + struct nfsmount *nmp; + int nqflag; + + MGET(mb, M_WAIT, MT_DATA); + if (hsiz >= MINCLSIZE) + MCLGET(mb, M_WAIT); + mb->m_len = 0; + bpos = mtod(mb, caddr_t); + + /* + * For NQNFS, add lease request. 
+ */ + if (vp) { + nmp = VFSTONFS(vp->v_mount); + if (nmp->nm_flag & NFSMNT_NQNFS) { + nqflag = NQNFS_NEEDLEASE(vp, procid); + if (nqflag) { + nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); + *tl++ = txdr_unsigned(nqflag); + *tl = txdr_unsigned(nmp->nm_leaseterm); + } else { + nfsm_build(tl, u_long *, NFSX_UNSIGNED); + *tl = 0; + } + } + } + /* Finally, return values */ + *bposp = bpos; + return (mb); +} + +/* + * Build the RPC header and fill in the authorization info. + * The authorization string argument is only used when the credentials + * come from outside of the kernel. + * Returns the head of the mbuf list. + */ +struct mbuf * +nfsm_rpchead(cr, nqnfs, procid, auth_type, auth_len, auth_str, mrest, + mrest_len, mbp, xidp) + register struct ucred *cr; + int nqnfs; + int procid; + int auth_type; + int auth_len; + char *auth_str; + struct mbuf *mrest; + int mrest_len; + struct mbuf **mbp; + u_long *xidp; +{ + register struct mbuf *mb; + register u_long *tl; + register caddr_t bpos; + register int i; + struct mbuf *mreq, *mb2; + int siz, grpsiz, authsiz; + + authsiz = nfsm_rndup(auth_len); + if (auth_type == RPCAUTH_NQNFS) + authsiz += 2 * NFSX_UNSIGNED; + MGETHDR(mb, M_WAIT, MT_DATA); + if ((authsiz + 10*NFSX_UNSIGNED) >= MINCLSIZE) { + MCLGET(mb, M_WAIT); + } else if ((authsiz + 10*NFSX_UNSIGNED) < MHLEN) { + MH_ALIGN(mb, authsiz + 10*NFSX_UNSIGNED); + } else { + MH_ALIGN(mb, 8*NFSX_UNSIGNED); + } + mb->m_len = 0; + mreq = mb; + bpos = mtod(mb, caddr_t); + + /* + * First the RPC header. + */ + nfsm_build(tl, u_long *, 8*NFSX_UNSIGNED); + if (++nfs_xid == 0) + nfs_xid++; + *tl++ = *xidp = txdr_unsigned(nfs_xid); + *tl++ = rpc_call; + *tl++ = rpc_vers; + if (nqnfs) { + *tl++ = txdr_unsigned(NQNFS_PROG); + *tl++ = txdr_unsigned(NQNFS_VER1); + } else { + *tl++ = txdr_unsigned(NFS_PROG); + *tl++ = txdr_unsigned(NFS_VER2); + } + *tl++ = txdr_unsigned(procid); + + /* + * And then the authorization cred. 
+ */ + *tl++ = txdr_unsigned(auth_type); + *tl = txdr_unsigned(authsiz); + switch (auth_type) { + case RPCAUTH_UNIX: + nfsm_build(tl, u_long *, auth_len); + *tl++ = 0; /* stamp ?? */ + *tl++ = 0; /* NULL hostname */ + *tl++ = txdr_unsigned(cr->cr_uid); + *tl++ = txdr_unsigned(cr->cr_groups[0]); + grpsiz = (auth_len >> 2) - 5; + *tl++ = txdr_unsigned(grpsiz); + for (i = 1; i <= grpsiz; i++) + *tl++ = txdr_unsigned(cr->cr_groups[i]); + break; + case RPCAUTH_NQNFS: + nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); + *tl++ = txdr_unsigned(cr->cr_uid); + *tl = txdr_unsigned(auth_len); + siz = auth_len; + while (siz > 0) { + if (M_TRAILINGSPACE(mb) == 0) { + MGET(mb2, M_WAIT, MT_DATA); + if (siz >= MINCLSIZE) + MCLGET(mb2, M_WAIT); + mb->m_next = mb2; + mb = mb2; + mb->m_len = 0; + bpos = mtod(mb, caddr_t); + } + i = min(siz, M_TRAILINGSPACE(mb)); + bcopy(auth_str, bpos, i); + mb->m_len += i; + auth_str += i; + bpos += i; + siz -= i; + } + if ((siz = (nfsm_rndup(auth_len) - auth_len)) > 0) { + for (i = 0; i < siz; i++) + *bpos++ = '\0'; + mb->m_len += siz; + } + break; + }; + nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); + *tl++ = txdr_unsigned(RPCAUTH_NULL); + *tl = 0; + mb->m_next = mrest; + mreq->m_pkthdr.len = authsiz + 10*NFSX_UNSIGNED + mrest_len; + mreq->m_pkthdr.rcvif = (struct ifnet *)0; + *mbp = mb; + return (mreq); +} + +/* + * copies mbuf chain to the uio scatter/gather list + */ +nfsm_mbuftouio(mrep, uiop, siz, dpos) + struct mbuf **mrep; + register struct uio *uiop; + int siz; + caddr_t *dpos; +{ + register char *mbufcp, *uiocp; + register int xfer, left, len; + register struct mbuf *mp; + long uiosiz, rem; + int error = 0; + + mp = *mrep; + mbufcp = *dpos; + len = mtod(mp, caddr_t)+mp->m_len-mbufcp; + rem = nfsm_rndup(siz)-siz; + while (siz > 0) { + if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) + return (EFBIG); + left = uiop->uio_iov->iov_len; + uiocp = uiop->uio_iov->iov_base; + if (left > siz) + left = siz; + uiosiz = left; + while (left > 0) { + while 
(len == 0) { + mp = mp->m_next; + if (mp == NULL) + return (EBADRPC); + mbufcp = mtod(mp, caddr_t); + len = mp->m_len; + } + xfer = (left > len) ? len : left; +#ifdef notdef + /* Not Yet.. */ + if (uiop->uio_iov->iov_op != NULL) + (*(uiop->uio_iov->iov_op)) + (mbufcp, uiocp, xfer); + else +#endif + if (uiop->uio_segflg == UIO_SYSSPACE) + bcopy(mbufcp, uiocp, xfer); + else + copyout(mbufcp, uiocp, xfer); + left -= xfer; + len -= xfer; + mbufcp += xfer; + uiocp += xfer; + uiop->uio_offset += xfer; + uiop->uio_resid -= xfer; + } + if (uiop->uio_iov->iov_len <= siz) { + uiop->uio_iovcnt--; + uiop->uio_iov++; + } else { + uiop->uio_iov->iov_base += uiosiz; + uiop->uio_iov->iov_len -= uiosiz; + } + siz -= uiosiz; + } + *dpos = mbufcp; + *mrep = mp; + if (rem > 0) { + if (len < rem) + error = nfs_adv(mrep, dpos, rem, len); + else + *dpos += rem; + } + return (error); +} + +/* + * copies a uio scatter/gather list to an mbuf chain... + */ +nfsm_uiotombuf(uiop, mq, siz, bpos) + register struct uio *uiop; + struct mbuf **mq; + int siz; + caddr_t *bpos; +{ + register char *uiocp; + register struct mbuf *mp, *mp2; + register int xfer, left, mlen; + int uiosiz, clflg, rem; + char *cp; + + if (siz > MLEN) /* or should it >= MCLBYTES ?? */ + clflg = 1; + else + clflg = 0; + rem = nfsm_rndup(siz)-siz; + mp = mp2 = *mq; + while (siz > 0) { + if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) + return (EINVAL); + left = uiop->uio_iov->iov_len; + uiocp = uiop->uio_iov->iov_base; + if (left > siz) + left = siz; + uiosiz = left; + while (left > 0) { + mlen = M_TRAILINGSPACE(mp); + if (mlen == 0) { + MGET(mp, M_WAIT, MT_DATA); + if (clflg) + MCLGET(mp, M_WAIT); + mp->m_len = 0; + mp2->m_next = mp; + mp2 = mp; + mlen = M_TRAILINGSPACE(mp); + } + xfer = (left > mlen) ? mlen : left; +#ifdef notdef + /* Not Yet.. 
*/ + if (uiop->uio_iov->iov_op != NULL) + (*(uiop->uio_iov->iov_op)) + (uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); + else +#endif + if (uiop->uio_segflg == UIO_SYSSPACE) + bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); + else + copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); + mp->m_len += xfer; + left -= xfer; + uiocp += xfer; + uiop->uio_offset += xfer; + uiop->uio_resid -= xfer; + } + if (uiop->uio_iov->iov_len <= siz) { + uiop->uio_iovcnt--; + uiop->uio_iov++; + } else { + uiop->uio_iov->iov_base += uiosiz; + uiop->uio_iov->iov_len -= uiosiz; + } + siz -= uiosiz; + } + if (rem > 0) { + if (rem > M_TRAILINGSPACE(mp)) { + MGET(mp, M_WAIT, MT_DATA); + mp->m_len = 0; + mp2->m_next = mp; + } + cp = mtod(mp, caddr_t)+mp->m_len; + for (left = 0; left < rem; left++) + *cp++ = '\0'; + mp->m_len += rem; + *bpos = cp; + } else + *bpos = mtod(mp, caddr_t)+mp->m_len; + *mq = mp; + return (0); +} + +/* + * Help break down an mbuf chain by setting the first siz bytes contiguous + * pointed to by returned val. + * This is used by the macros nfsm_dissect and nfsm_dissecton for tough + * cases. (The macros use the vars. 
dpos and dpos2) + */ +nfsm_disct(mdp, dposp, siz, left, cp2) + struct mbuf **mdp; /* in/out: current mbuf of the chain */ + caddr_t *dposp; /* in/out: current position within *mdp */ + int siz; /* number of contiguous bytes wanted */ + int left; /* bytes from *dposp to the end of *mdp */ + caddr_t *cp2; /* out: start of the siz contiguous bytes */ +{ + register struct mbuf *mp, *mp2; + register int siz2, xfer; + register caddr_t p; + + mp = *mdp; + while (left == 0) { /* current mbuf used up: step to the next */ + *mdp = mp = mp->m_next; + if (mp == NULL) + return (EBADRPC); + left = mp->m_len; + *dposp = mtod(mp, caddr_t); + } + if (left >= siz) { /* already contiguous: hand back the pointer */ + *cp2 = *dposp; + *dposp += siz; + } else if (mp->m_next == NULL) { /* nothing follows to supply the rest */ + return (EBADRPC); + } else if (siz > MHLEN) { + panic("nfs S too big"); + } else { /* gather siz bytes into a new mbuf linked after mp */ + MGET(mp2, M_WAIT, MT_DATA); + mp2->m_next = mp->m_next; + mp->m_next = mp2; + mp->m_len -= left; /* those bytes are copied into the new mbuf */ + mp = mp2; + *cp2 = p = mtod(mp, caddr_t); + bcopy(*dposp, p, left); /* Copy what was left */ + siz2 = siz-left; + p += left; + mp2 = mp->m_next; + /* Loop around copying up the siz2 bytes still missing */ + while (siz2 > 0) { + if (mp2 == NULL) + return (EBADRPC); + xfer = (siz2 > mp2->m_len) ? mp2->m_len : siz2; + if (xfer > 0) { + bcopy(mtod(mp2, caddr_t), p, xfer); + NFSMADV(mp2, xfer); /* presumably moves mp2's data start forward -- TODO confirm */ + mp2->m_len -= xfer; + p += xfer; + siz2 -= xfer; + } + if (siz2 > 0) + mp2 = mp2->m_next; + } + mp->m_len = siz; /* mp is the new mbuf here */ + *mdp = mp2; + *dposp = mtod(mp2, caddr_t); + } + return (0); +} + +/* + * Advance the position in the mbuf chain by offs bytes. + */ +nfs_adv(mdp, dposp, offs, left) + struct mbuf **mdp; /* in/out: current mbuf */ + caddr_t *dposp; /* out: new position */ + int offs; /* bytes to skip */ + int left; /* bytes remaining in *mdp */ +{ + register struct mbuf *m; + register int s; + + m = *mdp; + s = left; + while (s < offs) { + offs -= s; + m = m->m_next; + if (m == NULL) + return (EBADRPC); /* chain ended before offs bytes were skipped */ + s = m->m_len; + } + *mdp = m; + *dposp = mtod(m, caddr_t)+offs; /* NOTE(review): from mbuf start; assumes offs > left on entry -- confirm */ + return (0); +} + +/* + * Copy a string into mbufs for the hard cases...
+ */ +nfsm_strtmbuf(mb, bpos, cp, siz) + struct mbuf **mb; /* in/out: last mbuf of the chain being built */ + char **bpos; /* in/out: write position within *mb */ + char *cp; /* string bytes to copy */ + long siz; /* length of the string in bytes */ +{ + register struct mbuf *m1, *m2; + long left, xfer, len, tlen; + u_long *tl; + int putsize; /* nonzero until the length word has been stored */ + + putsize = 1; + m2 = *mb; + left = M_TRAILINGSPACE(m2); + if (left > 0) { /* use up the room left in the current mbuf first */ + tl = ((u_long *)(*bpos)); + *tl++ = txdr_unsigned(siz); + putsize = 0; + left -= NFSX_UNSIGNED; + m2->m_len += NFSX_UNSIGNED; + if (left > 0) { + bcopy(cp, (caddr_t) tl, left); + siz -= left; + cp += left; + m2->m_len += left; + left = 0; + } + } + /* Loop around adding mbufs until the whole string is copied */ + while (siz > 0) { + MGET(m1, M_WAIT, MT_DATA); + if (siz > MLEN) + MCLGET(m1, M_WAIT); + m1->m_len = NFSMSIZ(m1); /* provisional; final length is set below */ + m2->m_next = m1; + m2 = m1; + tl = mtod(m1, u_long *); + tlen = 0; + if (putsize) { + *tl++ = txdr_unsigned(siz); + m1->m_len -= NFSX_UNSIGNED; + tlen = NFSX_UNSIGNED; + putsize = 0; + } + if (siz < m1->m_len) { /* last piece: round up and pad */ + len = nfsm_rndup(siz); + xfer = siz; + if (xfer < len) + *(tl+(xfer>>2)) = 0; /* zero the final word before the copy fills part of it */ + } else { + xfer = len = m1->m_len; + } + bcopy(cp, (caddr_t) tl, xfer); + m1->m_len = len+tlen; + siz -= xfer; + cp += xfer; + } + *mb = m1; /* NOTE(review): m1 is assigned only in the loop above; assumes it ran -- confirm */ + *bpos = mtod(m1, caddr_t)+m1->m_len; + return (0); +} + +/* + * Called once to initialize data structures...
+ */ +nfs_init() +{ + register int i; + + nfsrtt.pos = 0; + rpc_vers = txdr_unsigned(RPC_VER2); /* constants below are stored already xdr-converted */ + rpc_call = txdr_unsigned(RPC_CALL); + rpc_reply = txdr_unsigned(RPC_REPLY); + rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED); + rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED); + rpc_mismatch = txdr_unsigned(RPC_MISMATCH); + rpc_autherr = txdr_unsigned(RPC_AUTHERR); + rpc_rejectedcred = txdr_unsigned(AUTH_REJECTCRED); + rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX); + rpc_auth_kerb = txdr_unsigned(RPCAUTH_NQNFS); /* note: set from RPCAUTH_NQNFS */ + nfs_vers = txdr_unsigned(NFS_VER2); + nfs_prog = txdr_unsigned(NFS_PROG); + nfs_true = txdr_unsigned(TRUE); + nfs_false = txdr_unsigned(FALSE); + /* Store the xdr form of every nfs procedure number */ + for (i = 0; i < NFS_NPROCS; i++) + nfs_procids[i] = txdr_unsigned(i); + /* Ensure async daemons disabled */ + for (i = 0; i < NFS_MAXASYNCDAEMON; i++) + nfs_iodwant[i] = (struct proc *)0; + TAILQ_INIT(&nfs_bufq); + nfs_xdrneg1 = txdr_unsigned(-1); + nfs_nhinit(); /* Init the nfsnode table */ + nfsrv_init(0); /* Init server data structures */ + nfsrv_initcache(); /* Init the server request cache */ + + /* + * Initialize the nqnfs server stuff. + */ + if (nqnfsstarttime == 0) { /* done only while the start time is still unset */ + nqnfsstarttime = boottime.tv_sec + nqsrv_maxlease + + nqsrv_clockskew + nqsrv_writeslack; + NQLOADNOVRAM(nqnfsstarttime); + nqnfs_prog = txdr_unsigned(NQNFS_PROG); + nqnfs_vers = txdr_unsigned(NQNFS_VER1); + nqthead.th_head[0] = &nqthead; /* both links point back at the head itself */ + nqthead.th_head[1] = &nqthead; + nqfhead = hashinit(NQLCHSZ, M_NQLEASE, &nqfheadhash); + } + + /* + * Initialize reply list and start timer + */ + nfsreqh.r_prev = nfsreqh.r_next = &nfsreqh; + nfs_timer(); +} + +/* + * Attribute cache routines.
+ * nfs_loadattrcache() - loads or updates the cache contents from attributes + * that are on the mbuf list + * nfs_getattrcache() - returns valid attributes if found in cache, returns + * error otherwise + */ + +/* + * Load the attribute cache (that lives in the nfsnode entry) with + * the values on the mbuf list and + * Iff vaper not NULL + * copy the attributes to *vaper + */ +nfs_loadattrcache(vpp, mdp, dposp, vaper) + struct vnode **vpp; + struct mbuf **mdp; + caddr_t *dposp; + struct vattr *vaper; +{ + register struct vnode *vp = *vpp; + register struct vattr *vap; + register struct nfsv2_fattr *fp; + extern int (**spec_nfsv2nodeop_p)(); + register struct nfsnode *np, *nq, **nhpp; + register long t1; + caddr_t dpos, cp2; + int error = 0, isnq; + struct mbuf *md; + enum vtype vtyp; + u_short vmode; + long rdev; + struct timespec mtime; + struct vnode *nvp; + + md = *mdp; + dpos = *dposp; + t1 = (mtod(md, caddr_t) + md->m_len) - dpos; + isnq = (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS); + if (error = nfsm_disct(&md, &dpos, NFSX_FATTR(isnq), t1, &cp2)) + return (error); + fp = (struct nfsv2_fattr *)cp2; + vtyp = nfstov_type(fp->fa_type); + vmode = fxdr_unsigned(u_short, fp->fa_mode); + if (vtyp == VNON || vtyp == VREG) + vtyp = IFTOVT(vmode); + if (isnq) { + rdev = fxdr_unsigned(long, fp->fa_nqrdev); + fxdr_nqtime(&fp->fa_nqmtime, &mtime); + } else { + rdev = fxdr_unsigned(long, fp->fa_nfsrdev); + fxdr_nfstime(&fp->fa_nfsmtime, &mtime); + } + /* + * If v_type == VNON it is a new node, so fill in the v_type, + * n_mtime fields. Check to see if it represents a special + * device, and if so, check for a possible alias. Once the + * correct vnode has been obtained, fill in the rest of the + * information.
+ */ + np = VTONFS(vp); + if (vp->v_type == VNON) { + if (vtyp == VCHR && rdev == 0xffffffff) + vp->v_type = vtyp = VFIFO; + else + vp->v_type = vtyp; + if (vp->v_type == VFIFO) { +#ifdef FIFO + extern int (**fifo_nfsv2nodeop_p)(); + vp->v_op = fifo_nfsv2nodeop_p; +#else + return (EOPNOTSUPP); +#endif /* FIFO */ + } + if (vp->v_type == VCHR || vp->v_type == VBLK) { + vp->v_op = spec_nfsv2nodeop_p; + if (nvp = checkalias(vp, (dev_t)rdev, vp->v_mount)) { + /* + * Discard unneeded vnode, but save its nfsnode. + */ + if (nq = np->n_forw) + nq->n_back = np->n_back; + *np->n_back = nq; + nvp->v_data = vp->v_data; + vp->v_data = NULL; + vp->v_op = spec_vnodeop_p; + vrele(vp); + vgone(vp); + /* + * Reinitialize aliased node. + */ + np->n_vnode = nvp; + nhpp = (struct nfsnode **)nfs_hash(&np->n_fh); + if (nq = *nhpp) + nq->n_back = &np->n_forw; + np->n_forw = nq; + np->n_back = nhpp; + *nhpp = np; + *vpp = vp = nvp; + } + } + np->n_mtime = mtime.ts_sec; + } + vap = &np->n_vattr; + vap->va_type = vtyp; + vap->va_mode = (vmode & 07777); + vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); + vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); + vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); + vap->va_rdev = (dev_t)rdev; + vap->va_mtime = mtime; + vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; + if (isnq) { + fxdr_hyper(&fp->fa_nqsize, &vap->va_size); + vap->va_blocksize = fxdr_unsigned(long, fp->fa_nqblocksize); + fxdr_hyper(&fp->fa_nqbytes, &vap->va_bytes); + vap->va_fileid = fxdr_unsigned(long, fp->fa_nqfileid); + fxdr_nqtime(&fp->fa_nqatime, &vap->va_atime); + vap->va_flags = fxdr_unsigned(u_long, fp->fa_nqflags); + fxdr_nqtime(&fp->fa_nqctime, &vap->va_ctime); + vap->va_gen = fxdr_unsigned(u_long, fp->fa_nqgen); + fxdr_hyper(&fp->fa_nqfilerev, &vap->va_filerev); + } else { + vap->va_size = fxdr_unsigned(u_long, fp->fa_nfssize); + vap->va_blocksize = fxdr_unsigned(long, fp->fa_nfsblocksize); + vap->va_bytes = fxdr_unsigned(long, fp->fa_nfsblocks) * NFS_FABLKSIZE; + 
vap->va_fileid = fxdr_unsigned(long, fp->fa_nfsfileid); + fxdr_nfstime(&fp->fa_nfsatime, &vap->va_atime); + vap->va_flags = 0; + vap->va_ctime.ts_sec = fxdr_unsigned(long, fp->fa_nfsctime.nfs_sec); + vap->va_ctime.ts_nsec = 0; + vap->va_gen = fxdr_unsigned(u_long, fp->fa_nfsctime.nfs_usec); + vap->va_filerev = 0; + } + if (vap->va_size != np->n_size) { + if (vap->va_type == VREG) { + if (np->n_flag & NMODIFIED) { + if (vap->va_size < np->n_size) + vap->va_size = np->n_size; + else + np->n_size = vap->va_size; + } else + np->n_size = vap->va_size; + vnode_pager_setsize(vp, (u_long)np->n_size); + } else + np->n_size = vap->va_size; + } + np->n_attrstamp = time.tv_sec; + *dposp = dpos; + *mdp = md; + if (vaper != NULL) { + bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap)); +#ifdef notdef + if ((np->n_flag & NMODIFIED) && np->n_size > vap->va_size) + if (np->n_size > vap->va_size) + vaper->va_size = np->n_size; +#endif + if (np->n_flag & NCHG) { + if (np->n_flag & NACC) { + vaper->va_atime.ts_sec = np->n_atim.tv_sec; + vaper->va_atime.ts_nsec = + np->n_atim.tv_usec * 1000; + } + if (np->n_flag & NUPD) { + vaper->va_mtime.ts_sec = np->n_mtim.tv_sec; + vaper->va_mtime.ts_nsec = + np->n_mtim.tv_usec * 1000; + } + } + } + return (0); +} + +/* + * Check the time stamp + * If the cache is valid, copy contents to *vap and return 0 + * otherwise return an error + */ +nfs_getattrcache(vp, vaper) + register struct vnode *vp; + struct vattr *vaper; +{ + register struct nfsnode *np = VTONFS(vp); + register struct vattr *vap; + + if (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQLOOKLEASE) { + if (!NQNFS_CKCACHABLE(vp, NQL_READ) || np->n_attrstamp == 0) { + nfsstats.attrcache_misses++; + return (ENOENT); + } + } else if ((time.tv_sec - np->n_attrstamp) >= NFS_ATTRTIMEO(np)) { + nfsstats.attrcache_misses++; + return (ENOENT); + } + nfsstats.attrcache_hits++; + vap = &np->n_vattr; + if (vap->va_size != np->n_size) { + if (vap->va_type == VREG) { + if (np->n_flag & NMODIFIED) { + if 
(vap->va_size < np->n_size) + vap->va_size = np->n_size; + else + np->n_size = vap->va_size; + } else + np->n_size = vap->va_size; + vnode_pager_setsize(vp, (u_long)np->n_size); + } else + np->n_size = vap->va_size; + } + bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); +#ifdef notdef + if ((np->n_flag & NMODIFIED) == 0) { + np->n_size = vaper->va_size; + vnode_pager_setsize(vp, (u_long)np->n_size); + } else if (np->n_size > vaper->va_size) + if (np->n_size > vaper->va_size) + vaper->va_size = np->n_size; +#endif + if (np->n_flag & NCHG) { + if (np->n_flag & NACC) { + vaper->va_atime.ts_sec = np->n_atim.tv_sec; + vaper->va_atime.ts_nsec = np->n_atim.tv_usec * 1000; + } + if (np->n_flag & NUPD) { + vaper->va_mtime.ts_sec = np->n_mtim.tv_sec; + vaper->va_mtime.ts_nsec = np->n_mtim.tv_usec * 1000; + } + } + return (0); +} + +/* + * Set up nameidata for a lookup() call and do it + */ +nfs_namei(ndp, fhp, len, slp, nam, mdp, dposp, p) + register struct nameidata *ndp; + fhandle_t *fhp; + int len; + struct nfssvc_sock *slp; + struct mbuf *nam; + struct mbuf **mdp; + caddr_t *dposp; + struct proc *p; +{ + register int i, rem; + register struct mbuf *md; + register char *fromcp, *tocp; + struct vnode *dp; + int error, rdonly; + struct componentname *cnp = &ndp->ni_cnd; + + MALLOC(cnp->cn_pnbuf, char *, len + 1, M_NAMEI, M_WAITOK); + /* + * Copy the name from the mbuf list to ndp->ni_pnbuf + * and set the various ndp fields appropriately. 
+ */ + fromcp = *dposp; + tocp = cnp->cn_pnbuf; + md = *mdp; + rem = mtod(md, caddr_t) + md->m_len - fromcp; + cnp->cn_hash = 0; + for (i = 0; i < len; i++) { + while (rem == 0) { + md = md->m_next; + if (md == NULL) { + error = EBADRPC; + goto out; + } + fromcp = mtod(md, caddr_t); + rem = md->m_len; + } + if (*fromcp == '\0' || *fromcp == '/') { + error = EINVAL; + goto out; + } + cnp->cn_hash += (unsigned char)*fromcp; + *tocp++ = *fromcp++; + rem--; + } + *tocp = '\0'; + *mdp = md; + *dposp = fromcp; + len = nfsm_rndup(len)-len; + if (len > 0) { + if (rem >= len) + *dposp += len; + else if (error = nfs_adv(mdp, dposp, len, rem)) + goto out; + } + ndp->ni_pathlen = tocp - cnp->cn_pnbuf; + cnp->cn_nameptr = cnp->cn_pnbuf; + /* + * Extract and set starting directory. + */ + if (error = nfsrv_fhtovp(fhp, FALSE, &dp, ndp->ni_cnd.cn_cred, slp, + nam, &rdonly)) + goto out; + if (dp->v_type != VDIR) { + vrele(dp); + error = ENOTDIR; + goto out; + } + ndp->ni_startdir = dp; + if (rdonly) + cnp->cn_flags |= (NOCROSSMOUNT | RDONLY); + else + cnp->cn_flags |= NOCROSSMOUNT; + /* + * And call lookup() to do the real work + */ + cnp->cn_proc = p; + if (error = lookup(ndp)) + goto out; + /* + * Check for encountering a symbolic link + */ + if (cnp->cn_flags & ISSYMLINK) { + if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) + vput(ndp->ni_dvp); + else + vrele(ndp->ni_dvp); + vput(ndp->ni_vp); + ndp->ni_vp = NULL; + error = EINVAL; + goto out; + } + /* + * Check for saved name request + */ + if (cnp->cn_flags & (SAVENAME | SAVESTART)) { + cnp->cn_flags |= HASBUF; + return (0); + } +out: + FREE(cnp->cn_pnbuf, M_NAMEI); + return (error); +} + +/* + * A fiddled version of m_adj() that ensures null fill to a long + * boundary and only trims off the back end + */ +void +nfsm_adj(mp, len, nul) + struct mbuf *mp; + register int len; + int nul; +{ + register struct mbuf *m; + register int count, i; + register char *cp; + + /* + * Trim from tail. 
Scan the mbuf chain, + * calculating its length and finding the last mbuf. + * If the adjustment only affects this mbuf, then just + * adjust and return. Otherwise, rescan and truncate + * after the remaining size. + */ + count = 0; + m = mp; + for (;;) { + count += m->m_len; + if (m->m_next == (struct mbuf *)0) + break; + m = m->m_next; + } + if (m->m_len > len) { + m->m_len -= len; + if (nul > 0) { + cp = mtod(m, caddr_t)+m->m_len-nul; + for (i = 0; i < nul; i++) + *cp++ = '\0'; + } + return; + } + count -= len; + if (count < 0) + count = 0; + /* + * Correct length for chain is "count". + * Find the mbuf with last data, adjust its length, + * and toss data from remaining mbufs on chain. + */ + for (m = mp; m; m = m->m_next) { + if (m->m_len >= count) { + m->m_len = count; + if (nul > 0) { + cp = mtod(m, caddr_t)+m->m_len-nul; + for (i = 0; i < nul; i++) + *cp++ = '\0'; + } + break; + } + count -= m->m_len; + } + while (m = m->m_next) + m->m_len = 0; +} + +/* + * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) + * - look up fsid in mount list (if not found ret error) + * - get vp and export rights by calling VFS_FHTOVP() + * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon + * - if not lockflag unlock it with VOP_UNLOCK() + */ +nfsrv_fhtovp(fhp, lockflag, vpp, cred, slp, nam, rdonlyp) + fhandle_t *fhp; + int lockflag; + struct vnode **vpp; + struct ucred *cred; + struct nfssvc_sock *slp; + struct mbuf *nam; + int *rdonlyp; +{ + register struct mount *mp; + register struct nfsuid *uidp; + register int i; + struct ucred *credanon; + int error, exflags; + + *vpp = (struct vnode *)0; + if ((mp = getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if (error = VFS_FHTOVP(mp, &fhp->fh_fid, nam, vpp, &exflags, &credanon)) + return (error); + /* + * Check/setup credentials. 
+ */ + if (exflags & MNT_EXKERB) { + uidp = slp->ns_uidh[NUIDHASH(cred->cr_uid)]; + while (uidp) { + if (uidp->nu_uid == cred->cr_uid) + break; + uidp = uidp->nu_hnext; + } + if (uidp) { + cred->cr_uid = uidp->nu_cr.cr_uid; + for (i = 0; i < uidp->nu_cr.cr_ngroups; i++) + cred->cr_groups[i] = uidp->nu_cr.cr_groups[i]; + } else { + vput(*vpp); + return (NQNFS_AUTHERR); + } + } else if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { + cred->cr_uid = credanon->cr_uid; + for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++) + cred->cr_groups[i] = credanon->cr_groups[i]; + } + if (exflags & MNT_EXRDONLY) + *rdonlyp = 1; + else + *rdonlyp = 0; + if (!lockflag) + VOP_UNLOCK(*vpp); + return (0); +} + +/* + * This function compares two net addresses by family and returns TRUE + * if they are the same host. + * If there is any doubt, return FALSE. + * The AF_INET family is handled as a special case so that address mbufs + * don't need to be saved to store "struct in_addr", which is only 4 bytes. 
+ */ +netaddr_match(family, haddr, nam) + int family; + union nethostaddr *haddr; + struct mbuf *nam; +{ + register struct sockaddr_in *inetaddr; + + switch (family) { + case AF_INET: + inetaddr = mtod(nam, struct sockaddr_in *); + if (inetaddr->sin_family == AF_INET && + inetaddr->sin_addr.s_addr == haddr->had_inetaddr) + return (1); + break; +#ifdef ISO + case AF_ISO: + { + register struct sockaddr_iso *isoaddr1, *isoaddr2; + + isoaddr1 = mtod(nam, struct sockaddr_iso *); + isoaddr2 = mtod(haddr->had_nam, struct sockaddr_iso *); + if (isoaddr1->siso_family == AF_ISO && + isoaddr1->siso_nlen > 0 && + isoaddr1->siso_nlen == isoaddr2->siso_nlen && + SAME_ISOADDR(isoaddr1, isoaddr2)) + return (1); + break; + } +#endif /* ISO */ + default: + break; + }; + return (0); +} diff --git a/sys/nfs/nfs_common.h b/sys/nfs/nfs_common.h new file mode 100644 index 00000000000..879db360057 --- /dev/null +++ b/sys/nfs/nfs_common.h @@ -0,0 +1,269 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfsm_subs.h 8.1 (Berkeley) 6/16/93 + */ + +/* + * These macros do strange and peculiar things to mbuf chains for + * the assistance of the nfs code. To attempt to use them for any + * other purpose will be dangerous. (they make weird assumptions) + */ + +/* + * First define what the actual subs. return + */ +extern struct mbuf *nfsm_reqh(); + +#define M_HASCL(m) ((m)->m_flags & M_EXT) +#define NFSMINOFF(m) \ + if (M_HASCL(m)) \ + (m)->m_data = (m)->m_ext.ext_buf; \ + else if ((m)->m_flags & M_PKTHDR) \ + (m)->m_data = (m)->m_pktdat; \ + else \ + (m)->m_data = (m)->m_dat +#define NFSMADV(m, s) (m)->m_data += (s) +#define NFSMSIZ(m) ((M_HASCL(m))?MCLBYTES: \ + (((m)->m_flags & M_PKTHDR)?MHLEN:MLEN)) + +/* + * Now for the macros that do the simple stuff and call the functions + * for the hard stuff. + * These macros use several vars. declared in nfsm_reqhead and these + * vars. must not be used elsewhere unless you are careful not to corrupt + * them. The vars. starting with pN and tN (N=1,2,3,..) 
are temporaries + * that may be used so long as the value is not expected to retained + * after a macro. + * I know, this is kind of dorkey, but it makes the actual op functions + * fairly clean and deals with the mess caused by the xdr discriminating + * unions. + */ + +#define nfsm_build(a,c,s) \ + { if ((s) > M_TRAILINGSPACE(mb)) { \ + MGET(mb2, M_WAIT, MT_DATA); \ + if ((s) > MLEN) \ + panic("build > MLEN"); \ + mb->m_next = mb2; \ + mb = mb2; \ + mb->m_len = 0; \ + bpos = mtod(mb, caddr_t); \ + } \ + (a) = (c)(bpos); \ + mb->m_len += (s); \ + bpos += (s); } + +#define nfsm_dissect(a,c,s) \ + { t1 = mtod(md, caddr_t)+md->m_len-dpos; \ + if (t1 >= (s)) { \ + (a) = (c)(dpos); \ + dpos += (s); \ + } else if (error = nfsm_disct(&md, &dpos, (s), t1, &cp2)) { \ + m_freem(mrep); \ + goto nfsmout; \ + } else { \ + (a) = (c)cp2; \ + } } + +#define nfsm_fhtom(v) \ + nfsm_build(cp,caddr_t,NFSX_FH); \ + bcopy((caddr_t)&(VTONFS(v)->n_fh), cp, NFSX_FH) + +#define nfsm_srvfhtom(f) \ + nfsm_build(cp,caddr_t,NFSX_FH); \ + bcopy((caddr_t)(f), cp, NFSX_FH) + +#define nfsm_mtofh(d,v) \ + { struct nfsnode *np; nfsv2fh_t *fhp; \ + nfsm_dissect(fhp,nfsv2fh_t *,NFSX_FH); \ + if (error = nfs_nget((d)->v_mount, fhp, &np)) { \ + m_freem(mrep); \ + goto nfsmout; \ + } \ + (v) = NFSTOV(np); \ + nfsm_loadattr(v, (struct vattr *)0); \ + } + +#define nfsm_loadattr(v,a) \ + { struct vnode *tvp = (v); \ + if (error = nfs_loadattrcache(&tvp, &md, &dpos, (a))) { \ + m_freem(mrep); \ + goto nfsmout; \ + } \ + (v) = tvp; } + +#define nfsm_strsiz(s,m) \ + { nfsm_dissect(tl,u_long *,NFSX_UNSIGNED); \ + if (((s) = fxdr_unsigned(long,*tl)) > (m)) { \ + m_freem(mrep); \ + error = EBADRPC; \ + goto nfsmout; \ + } } + +#define nfsm_srvstrsiz(s,m) \ + { nfsm_dissect(tl,u_long *,NFSX_UNSIGNED); \ + if (((s) = fxdr_unsigned(long,*tl)) > (m) || (s) <= 0) { \ + error = EBADRPC; \ + nfsm_reply(0); \ + } } + +#define nfsm_mtouio(p,s) \ + if ((s) > 0 && \ + (error = nfsm_mbuftouio(&md,(p),(s),&dpos))) { \ + 
m_freem(mrep); \ + goto nfsmout; \ + } + +#define nfsm_uiotom(p,s) \ + if (error = nfsm_uiotombuf((p),&mb,(s),&bpos)) { \ + m_freem(mreq); \ + goto nfsmout; \ + } + +#define nfsm_reqhead(v,a,s) \ + mb = mreq = nfsm_reqh((v),(a),(s),&bpos) + +#define nfsm_reqdone m_freem(mrep); \ + nfsmout: + +#define nfsm_rndup(a) (((a)+3)&(~0x3)) + +#define nfsm_request(v, t, p, c) \ + if (error = nfs_request((v), mreq, (t), (p), \ + (c), &mrep, &md, &dpos)) \ + goto nfsmout + +#define nfsm_strtom(a,s,m) \ + if ((s) > (m)) { \ + m_freem(mreq); \ + error = ENAMETOOLONG; \ + goto nfsmout; \ + } \ + t2 = nfsm_rndup(s)+NFSX_UNSIGNED; \ + if (t2 <= M_TRAILINGSPACE(mb)) { \ + nfsm_build(tl,u_long *,t2); \ + *tl++ = txdr_unsigned(s); \ + *(tl+((t2>>2)-2)) = 0; \ + bcopy((caddr_t)(a), (caddr_t)tl, (s)); \ + } else if (error = nfsm_strtmbuf(&mb, &bpos, (a), (s))) { \ + m_freem(mreq); \ + goto nfsmout; \ + } + +#define nfsm_srvdone \ + nfsmout: \ + return(error) + +#define nfsm_reply(s) \ + { \ + nfsd->nd_repstat = error; \ + if (error) \ + (void) nfs_rephead(0, nfsd, error, cache, &frev, \ + mrq, &mb, &bpos); \ + else \ + (void) nfs_rephead((s), nfsd, error, cache, &frev, \ + mrq, &mb, &bpos); \ + m_freem(mrep); \ + mreq = *mrq; \ + if (error) \ + return(0); \ + } + +#define nfsm_adv(s) \ + t1 = mtod(md, caddr_t)+md->m_len-dpos; \ + if (t1 >= (s)) { \ + dpos += (s); \ + } else if (error = nfs_adv(&md, &dpos, (s), t1)) { \ + m_freem(mrep); \ + goto nfsmout; \ + } + +#define nfsm_srvmtofh(f) \ + nfsm_dissect(tl, u_long *, NFSX_FH); \ + bcopy((caddr_t)tl, (caddr_t)f, NFSX_FH) + +#define nfsm_clget \ + if (bp >= be) { \ + if (mp == mb) \ + mp->m_len += bp-bpos; \ + MGET(mp, M_WAIT, MT_DATA); \ + MCLGET(mp, M_WAIT); \ + mp->m_len = NFSMSIZ(mp); \ + mp2->m_next = mp; \ + mp2 = mp; \ + bp = mtod(mp, caddr_t); \ + be = bp+mp->m_len; \ + } \ + tl = (u_long *)bp + +#define nfsm_srvfillattr \ + fp->fa_type = vtonfs_type(vap->va_type); \ + fp->fa_mode = vtonfs_mode(vap->va_type, vap->va_mode); \ + 
fp->fa_nlink = txdr_unsigned(vap->va_nlink); \ + fp->fa_uid = txdr_unsigned(vap->va_uid); \ + fp->fa_gid = txdr_unsigned(vap->va_gid); \ + if (nfsd->nd_nqlflag == NQL_NOVAL) { \ + fp->fa_nfsblocksize = txdr_unsigned(vap->va_blocksize); \ + if (vap->va_type == VFIFO) \ + fp->fa_nfsrdev = 0xffffffff; \ + else \ + fp->fa_nfsrdev = txdr_unsigned(vap->va_rdev); \ + fp->fa_nfsfsid = txdr_unsigned(vap->va_fsid); \ + fp->fa_nfsfileid = txdr_unsigned(vap->va_fileid); \ + fp->fa_nfssize = txdr_unsigned(vap->va_size); \ + fp->fa_nfsblocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); \ + txdr_nfstime(&vap->va_atime, &fp->fa_nfsatime); \ + txdr_nfstime(&vap->va_mtime, &fp->fa_nfsmtime); \ + fp->fa_nfsctime.nfs_sec = txdr_unsigned(vap->va_ctime.ts_sec); \ + fp->fa_nfsctime.nfs_usec = txdr_unsigned(vap->va_gen); \ + } else { \ + fp->fa_nqblocksize = txdr_unsigned(vap->va_blocksize); \ + if (vap->va_type == VFIFO) \ + fp->fa_nqrdev = 0xffffffff; \ + else \ + fp->fa_nqrdev = txdr_unsigned(vap->va_rdev); \ + fp->fa_nqfsid = txdr_unsigned(vap->va_fsid); \ + fp->fa_nqfileid = txdr_unsigned(vap->va_fileid); \ + txdr_hyper(&vap->va_size, &fp->fa_nqsize); \ + txdr_hyper(&vap->va_bytes, &fp->fa_nqbytes); \ + txdr_nqtime(&vap->va_atime, &fp->fa_nqatime); \ + txdr_nqtime(&vap->va_mtime, &fp->fa_nqmtime); \ + txdr_nqtime(&vap->va_ctime, &fp->fa_nqctime); \ + fp->fa_nqflags = txdr_unsigned(vap->va_flags); \ + fp->fa_nqgen = txdr_unsigned(vap->va_gen); \ + txdr_hyper(&vap->va_filerev, &fp->fa_nqfilerev); \ + } + diff --git a/sys/nfs/nfs_node.c b/sys/nfs/nfs_node.c new file mode 100644 index 00000000000..032bdef0d5a --- /dev/null +++ b/sys/nfs/nfs_node.c @@ -0,0 +1,294 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)nfs_node.c 8.2 (Berkeley) 12/30/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +struct nfsnode **nheadhashtbl; +u_long nheadhash; +#define NFSNOHASH(fhsum) ((fhsum)&nheadhash) + +#define TRUE 1 +#define FALSE 0 + +/* + * Initialize hash links for nfsnodes + * and build nfsnode free list. + */ +nfs_nhinit() +{ + +#ifndef lint + if ((sizeof(struct nfsnode) - 1) & sizeof(struct nfsnode)) + printf("nfs_nhinit: bad size %d\n", sizeof(struct nfsnode)); +#endif /* not lint */ + nheadhashtbl = hashinit(desiredvnodes, M_NFSNODE, &nheadhash); +} + +/* + * Compute an entry in the NFS hash table structure + */ +struct nfsnode ** +nfs_hash(fhp) + register nfsv2fh_t *fhp; +{ + register u_char *fhpp; + register u_long fhsum; + int i; + + fhpp = &fhp->fh_bytes[0]; + fhsum = 0; + for (i = 0; i < NFSX_FH; i++) + fhsum += *fhpp++; + return (&nheadhashtbl[NFSNOHASH(fhsum)]); +} + +/* + * Look up a vnode/nfsnode by file handle. + * Callers must check for mount points!! + * In all cases, a pointer to a + * nfsnode structure is returned. 
+ */ +nfs_nget(mntp, fhp, npp) + struct mount *mntp; + register nfsv2fh_t *fhp; + struct nfsnode **npp; +{ + register struct nfsnode *np, *nq, **nhpp; + register struct vnode *vp; + extern int (**nfsv2_vnodeop_p)(); + struct vnode *nvp; + int error; + + nhpp = nfs_hash(fhp); +loop: + for (np = *nhpp; np; np = np->n_forw) { + if (mntp != NFSTOV(np)->v_mount || + bcmp((caddr_t)fhp, (caddr_t)&np->n_fh, NFSX_FH)) + continue; + vp = NFSTOV(np); + if (vget(vp, 1)) + goto loop; + *npp = np; + return(0); + } + if (error = getnewvnode(VT_NFS, mntp, nfsv2_vnodeop_p, &nvp)) { + *npp = 0; + return (error); + } + vp = nvp; + MALLOC(np, struct nfsnode *, sizeof *np, M_NFSNODE, M_WAITOK); + vp->v_data = np; + np->n_vnode = vp; + /* + * Insert the nfsnode in the hash queue for its new file handle + */ + np->n_flag = 0; + if (nq = *nhpp) + nq->n_back = &np->n_forw; + np->n_forw = nq; + np->n_back = nhpp; + *nhpp = np; + bcopy((caddr_t)fhp, (caddr_t)&np->n_fh, NFSX_FH); + np->n_attrstamp = 0; + np->n_direofoffset = 0; + np->n_sillyrename = (struct sillyrename *)0; + np->n_size = 0; + np->n_mtime = 0; + if (VFSTONFS(mntp)->nm_flag & NFSMNT_NQNFS) { + np->n_brev = 0; + np->n_lrev = 0; + np->n_expiry = (time_t)0; + np->n_tnext = (struct nfsnode *)0; + } + *npp = np; + return (0); +} + +nfs_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct nfsnode *np; + register struct sillyrename *sp; + struct proc *p = curproc; /* XXX */ + extern int prtactive; + + np = VTONFS(ap->a_vp); + if (prtactive && ap->a_vp->v_usecount != 0) + vprint("nfs_inactive: pushing active", ap->a_vp); + sp = np->n_sillyrename; + np->n_sillyrename = (struct sillyrename *)0; + if (sp) { + /* + * Remove the silly file that was rename'd earlier + */ + (void) nfs_vinvalbuf(ap->a_vp, 0, sp->s_cred, p, 1); + nfs_removeit(sp); + crfree(sp->s_cred); + vrele(sp->s_dvp); +#ifdef SILLYSEPARATE + free((caddr_t)sp, M_NFSREQ); +#endif + } + np->n_flag &= (NMODIFIED | NFLUSHINPROG | 
NFLUSHWANT | NQNFSEVICTED | + NQNFSNONCACHE | NQNFSWRITE); + return (0); +} + +/* + * Reclaim an nfsnode so that it can be used for other purposes. + */ +nfs_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct nfsnode *np = VTONFS(vp); + register struct nfsmount *nmp = VFSTONFS(vp->v_mount); + register struct nfsnode *nq; + extern int prtactive; + + if (prtactive && vp->v_usecount != 0) + vprint("nfs_reclaim: pushing active", vp); + /* + * Remove the nfsnode from its hash chain. + */ + if (nq = np->n_forw) + nq->n_back = np->n_back; + *np->n_back = nq; + + /* + * For nqnfs, take it off the timer queue as required. + */ + if ((nmp->nm_flag & NFSMNT_NQNFS) && np->n_tnext) { + if (np->n_tnext == (struct nfsnode *)nmp) + nmp->nm_tprev = np->n_tprev; + else + np->n_tnext->n_tprev = np->n_tprev; + if (np->n_tprev == (struct nfsnode *)nmp) + nmp->nm_tnext = np->n_tnext; + else + np->n_tprev->n_tnext = np->n_tnext; + } + cache_purge(vp); + FREE(vp->v_data, M_NFSNODE); + vp->v_data = (void *)0; + return (0); +} + +/* + * Lock an nfsnode + */ +nfs_lock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + + /* + * Ugh, another place where interruptible mounts will get hung. + * If you make this sleep interruptible, then you have to fix all + * the VOP_LOCK() calls to expect interruptibility. + */ + while (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + sleep((caddr_t)vp, PINOD); + } + if (vp->v_tag == VT_NON) + return (ENOENT); + return (0); +} + +/* + * Unlock an nfsnode + */ +nfs_unlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + return (0); +} + +/* + * Check for a locked nfsnode + */ +nfs_islocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + return (0); +} + +/* + * Nfs abort op, called after namei() when a CREATE/DELETE isn't actually + * done. 
Currently nothing to do. + */ +/* ARGSUSED */ +int +nfs_abortop(ap) + struct vop_abortop_args /* { + struct vnode *a_dvp; + struct componentname *a_cnp; + } */ *ap; +{ + + if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) + FREE(ap->a_cnp->cn_pnbuf, M_NAMEI); + return (0); +} diff --git a/sys/nfs/nfs_nqlease.c b/sys/nfs/nfs_nqlease.c new file mode 100644 index 00000000000..965f46132a6 --- /dev/null +++ b/sys/nfs/nfs_nqlease.c @@ -0,0 +1,1228 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs_nqlease.c 8.3 (Berkeley) 1/4/94 + */ + +/* + * References: + * Cary G. Gray and David R. Cheriton, "Leases: An Efficient Fault-Tolerant + * Mechanism for Distributed File Cache Consistency", + * In Proc. of the Twelfth ACM Symposium on Operating Systems + * Principals, pg. 202-210, Litchfield Park, AZ, Dec. 1989. + * Michael N. Nelson, Brent B. Welch and John K. Ousterhout, "Caching + * in the Sprite Network File System", ACM TOCS 6(1), + * pages 134-154, February 1988. + * V. Srinivasan and Jeffrey C. Mogul, "Spritely NFS: Implementation and + * Performance of Cache-Consistency Protocols", Digital + * Equipment Corporation WRL Research Report 89/5, May 1989. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * List head for the lease queue and other global data. + * At any time a lease is linked into a list ordered by increasing expiry time. 
+ */ +#define NQFHHASH(f) ((*((u_long *)(f)))&nqfheadhash) + +union nqsrvthead nqthead; +struct nqlease **nqfhead; +u_long nqfheadhash; +time_t nqnfsstarttime = (time_t)0; +u_long nqnfs_prog, nqnfs_vers; +int nqsrv_clockskew = NQ_CLOCKSKEW; +int nqsrv_writeslack = NQ_WRITESLACK; +int nqsrv_maxlease = NQ_MAXLEASE; +int nqsrv_maxnumlease = NQ_MAXNUMLEASE; +void nqsrv_instimeq(), nqsrv_send_eviction(), nfs_sndunlock(); +void nqsrv_unlocklease(), nqsrv_waitfor_expiry(), nfsrv_slpderef(); +void nqsrv_addhost(), nqsrv_locklease(), nqnfs_serverd(); +void nqnfs_clientlease(); +struct mbuf *nfsm_rpchead(); + +/* + * Signifies which rpcs can have piggybacked lease requests + */ +int nqnfs_piggy[NFS_NPROCS] = { + 0, + NQL_READ, + NQL_WRITE, + 0, + NQL_READ, + NQL_READ, + NQL_READ, + 0, + NQL_WRITE, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + NQL_READ, + 0, + NQL_READ, + 0, + 0, + 0, + 0, +}; + +int nnnnnn = sizeof (struct nqlease); +int oooooo = sizeof (struct nfsnode); +extern nfstype nfs_type[9]; +extern struct nfssvc_sock *nfs_udpsock, *nfs_cltpsock; +extern struct nfsd nfsd_head; +extern int nfsd_waiting; +extern struct nfsreq nfsreqh; + +#define TRUE 1 +#define FALSE 0 + +/* + * Get or check for a lease for "vp", based on NQL_CHECK flag. + * The rules are as follows: + * - if a current non-caching lease, reply non-caching + * - if a current lease for same host only, extend lease + * - if a read cachable lease and a read lease request + * add host to list any reply cachable + * - else { set non-cachable for read-write sharing } + * send eviction notice messages to all other hosts that have lease + * wait for lease termination { either by receiving vacated messages + * from all the other hosts or expiry + * via. 
timeout } + * modify lease to non-cachable + * - else if no current lease, issue new one + * - reply + * - return boolean TRUE iff nam should be m_freem()'d + * NB: Since nqnfs_serverd() is called from a timer, any potential tsleep() + * in here must be framed by nqsrv_locklease() and nqsrv_unlocklease(). + * nqsrv_locklease() is coded such that at least one of LC_LOCKED and + * LC_WANTED is set whenever a process is tsleeping in it. The exception + * is when a new lease is being allocated, since it is not in the timer + * queue yet. (Ditto for the splsoftclock() and splx(s) calls) + */ +nqsrv_getlease(vp, duration, flags, nd, nam, cachablep, frev, cred) + struct vnode *vp; + u_long *duration; + int flags; + struct nfsd *nd; + struct mbuf *nam; + int *cachablep; + u_quad_t *frev; + struct ucred *cred; +{ + register struct nqlease *lp, *lq, **lpp; + register struct nqhost *lph; + struct nqlease *tlp; + struct nqm **lphp; + struct vattr vattr; + fhandle_t fh; + int i, ok, error, s; + + if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) + return (0); + if (*duration > nqsrv_maxlease) + *duration = nqsrv_maxlease; + if (error = VOP_GETATTR(vp, &vattr, cred, nd->nd_procp)) + return (error); + *frev = vattr.va_filerev; + s = splsoftclock(); + tlp = vp->v_lease; + if ((flags & NQL_CHECK) == 0) + nfsstats.srvnqnfs_getleases++; + if (tlp == (struct nqlease *)0) { + + /* + * Find the lease by searching the hash list. 
+ */ + fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; + if (error = VFS_VPTOFH(vp, &fh.fh_fid)) { + splx(s); + return (error); + } + lpp = &nqfhead[NQFHHASH(fh.fh_fid.fid_data)]; + for (lp = *lpp; lp; lp = lp->lc_fhnext) + if (fh.fh_fsid.val[0] == lp->lc_fsid.val[0] && + fh.fh_fsid.val[1] == lp->lc_fsid.val[1] && + !bcmp(fh.fh_fid.fid_data, lp->lc_fiddata, + fh.fh_fid.fid_len - sizeof (long))) { + /* Found it */ + lp->lc_vp = vp; + vp->v_lease = lp; + tlp = lp; + break; + } + } + lp = tlp; + if (lp) { + if ((lp->lc_flag & LC_NONCACHABLE) || + (lp->lc_morehosts == (struct nqm *)0 && + nqsrv_cmpnam(nd->nd_slp, nam, &lp->lc_host))) + goto doreply; + if ((flags & NQL_READ) && (lp->lc_flag & LC_WRITE)==0) { + if (flags & NQL_CHECK) + goto doreply; + if (nqsrv_cmpnam(nd->nd_slp, nam, &lp->lc_host)) + goto doreply; + i = 0; + if (lp->lc_morehosts) { + lph = lp->lc_morehosts->lpm_hosts; + lphp = &lp->lc_morehosts->lpm_next; + ok = 1; + } else { + lphp = &lp->lc_morehosts; + ok = 0; + } + while (ok && (lph->lph_flag & LC_VALID)) { + if (nqsrv_cmpnam(nd->nd_slp, nam, lph)) + goto doreply; + if (++i == LC_MOREHOSTSIZ) { + i = 0; + if (*lphp) { + lph = (*lphp)->lpm_hosts; + lphp = &((*lphp)->lpm_next); + } else + ok = 0; + } else + lph++; + } + nqsrv_locklease(lp); + if (!ok) { + *lphp = (struct nqm *) + malloc(sizeof (struct nqm), + M_NQMHOST, M_WAITOK); + bzero((caddr_t)*lphp, sizeof (struct nqm)); + lph = (*lphp)->lpm_hosts; + } + nqsrv_addhost(lph, nd->nd_slp, nam); + nqsrv_unlocklease(lp); + } else { + lp->lc_flag |= LC_NONCACHABLE; + nqsrv_locklease(lp); + nqsrv_send_eviction(vp, lp, nd->nd_slp, nam, cred); + nqsrv_waitfor_expiry(lp); + nqsrv_unlocklease(lp); + } +doreply: + /* + * Update the lease and return + */ + if ((flags & NQL_CHECK) == 0) + nqsrv_instimeq(lp, *duration); + if (lp->lc_flag & LC_NONCACHABLE) + *cachablep = 0; + else { + *cachablep = 1; + if (flags & NQL_WRITE) + lp->lc_flag |= LC_WRITTEN; + } + splx(s); + return (0); + } + splx(s); + if (flags & 
NQL_CHECK) + return (0); + + /* + * Allocate new lease + * The value of nqsrv_maxnumlease should be set generously, so that + * the following "printf" happens infrequently. + */ + if (nfsstats.srvnqnfs_leases > nqsrv_maxnumlease) { + printf("Nqnfs server, too many leases\n"); + do { + (void) tsleep((caddr_t)&lbolt, PSOCK, + "nqsrvnuml", 0); + } while (nfsstats.srvnqnfs_leases > nqsrv_maxnumlease); + } + MALLOC(lp, struct nqlease *, sizeof (struct nqlease), M_NQLEASE, M_WAITOK); + bzero((caddr_t)lp, sizeof (struct nqlease)); + if (flags & NQL_WRITE) + lp->lc_flag |= (LC_WRITE | LC_WRITTEN); + nqsrv_addhost(&lp->lc_host, nd->nd_slp, nam); + lp->lc_vp = vp; + lp->lc_fsid = fh.fh_fsid; + bcopy(fh.fh_fid.fid_data, lp->lc_fiddata, fh.fh_fid.fid_len - sizeof (long)); + if (lq = *lpp) + lq->lc_fhprev = &lp->lc_fhnext; + lp->lc_fhnext = lq; + lp->lc_fhprev = lpp; + *lpp = lp; + vp->v_lease = lp; + s = splsoftclock(); + nqsrv_instimeq(lp, *duration); + splx(s); + *cachablep = 1; + if (++nfsstats.srvnqnfs_leases > nfsstats.srvnqnfs_maxleases) + nfsstats.srvnqnfs_maxleases = nfsstats.srvnqnfs_leases; + return (0); +} + +/* + * Local lease check for server syscalls. + * Just set up args and let nqsrv_getlease() do the rest. + */ +void +lease_check(vp, p, cred, flag) + struct vnode *vp; + struct proc *p; + struct ucred *cred; + int flag; +{ + int duration = 0, cache; + struct nfsd nfsd; + u_quad_t frev; + + nfsd.nd_slp = NQLOCALSLP; + nfsd.nd_procp = p; + (void) nqsrv_getlease(vp, &duration, NQL_CHECK | flag, &nfsd, + (struct mbuf *)0, &cache, &frev, cred); +} + +/* + * Add a host to an nqhost structure for a lease. 
+ */ +void +nqsrv_addhost(lph, slp, nam) + register struct nqhost *lph; + struct nfssvc_sock *slp; + struct mbuf *nam; +{ + register struct sockaddr_in *saddr; + + if (slp == NQLOCALSLP) + lph->lph_flag |= (LC_VALID | LC_LOCAL); + else if (slp == nfs_udpsock) { + saddr = mtod(nam, struct sockaddr_in *); + lph->lph_flag |= (LC_VALID | LC_UDP); + lph->lph_inetaddr = saddr->sin_addr.s_addr; + lph->lph_port = saddr->sin_port; + } else if (slp == nfs_cltpsock) { + lph->lph_nam = m_copym(nam, 0, M_COPYALL, M_WAIT); + lph->lph_flag |= (LC_VALID | LC_CLTP); + } else { + lph->lph_flag |= (LC_VALID | LC_SREF); + lph->lph_slp = slp; + slp->ns_sref++; + } +} + +/* + * Update the lease expiry time and position it in the timer queue correctly. + */ +void +nqsrv_instimeq(lp, duration) + register struct nqlease *lp; + u_long duration; +{ + register struct nqlease *tlp; + time_t newexpiry; + + newexpiry = time.tv_sec + duration + nqsrv_clockskew; + if (lp->lc_expiry == newexpiry) + return; + if (lp->lc_chain1[0]) + remque(lp); + lp->lc_expiry = newexpiry; + + /* + * Find where in the queue it should be. + */ + tlp = nqthead.th_chain[1]; + while (tlp->lc_expiry > newexpiry && tlp != (struct nqlease *)&nqthead) + tlp = tlp->lc_chain1[1]; + if (tlp == nqthead.th_chain[1]) + NQSTORENOVRAM(newexpiry); + insque(lp, tlp); +} + +/* + * Compare the requesting host address with the lph entry in the lease. + * Return true iff it is the same. + * This is somewhat messy due to the union in the nqhost structure. + * The local host is indicated by the special value of NQLOCALSLP for slp. 
+ */
+nqsrv_cmpnam(slp, nam, lph)
+	register struct nfssvc_sock *slp;
+	struct mbuf *nam;
+	register struct nqhost *lph;
+{
+	register struct sockaddr_in *sin;
+	struct mbuf *reqaddr;
+	union nethostaddr peer;
+
+	/* A local request matches only an entry recorded as local. */
+	if (slp == NQLOCALSLP)
+		return ((lph->lph_flag & LC_LOCAL) ? 1 : 0);
+
+	/*
+	 * For the two datagram sockets the requester's address is nam,
+	 * otherwise it is the address stored with the socket.
+	 */
+	reqaddr = (slp == nfs_udpsock || slp == nfs_cltpsock) ?
+	    nam : slp->ns_nam;
+
+	if (lph->lph_flag & LC_UDP)
+		return (netaddr_match(AF_INET, &lph->lph_haddr, reqaddr));
+	if (lph->lph_flag & LC_CLTP)
+		return (netaddr_match(AF_ISO, &lph->lph_claddr, reqaddr));
+
+	/*
+	 * The entry refers to a socket: it can only match while that
+	 * socket is still valid, and is compared by the socket's address.
+	 */
+	if ((lph->lph_slp->ns_flag & SLP_VALID) == 0)
+		return (0);
+	sin = mtod(lph->lph_slp->ns_nam, struct sockaddr_in *);
+	if (sin->sin_family == AF_INET)
+		peer.had_inetaddr = sin->sin_addr.s_addr;
+	else
+		peer.had_nam = lph->lph_slp->ns_nam;
+	return (netaddr_match(sin->sin_family, &peer, reqaddr));
+}
+
+/*
+ * Send out eviction notice messages to all other hosts for the lease.
+ */
+void
+nqsrv_send_eviction(vp, lp, slp, nam, cred)
+	struct vnode *vp;
+	register struct nqlease *lp;
+	struct nfssvc_sock *slp;
+	struct mbuf *nam;
+	struct ucred *cred;
+{	/*
+	 * vp and lp are the vnode and its lease.  slp and nam identify the
+	 * host whose request caused the eviction: its entry is marked
+	 * LC_VACATED instead of being sent a notice.  cred is handed to
+	 * nfsm_rpchead().  Local and already vacated entries are skipped.
+	 * NOTE(review): mb, mb2 and bpos are not named in this function and
+	 * are presumably used by the nfsm_* macros -- confirm before
+	 * touching them.
+	 */
+	register struct nqhost *lph = &lp->lc_host;
+	register struct mbuf *m;
+	register int siz;
+	struct nqm *lphnext = lp->lc_morehosts;
+	struct mbuf *mreq, *mb, *mb2, *nam2, *mheadend;
+	struct socket *so;
+	struct sockaddr_in *saddr;
+	fhandle_t *fhp;
+	caddr_t bpos, cp;
+	u_long xid;
+	int len = 1, ok = 1, i = 0;	/* len 1: lc_host is scanned first */
+	int sotype, *solockp;
+
+	while (ok && (lph->lph_flag & LC_VALID)) {
+		if (nqsrv_cmpnam(slp, nam, lph))
+			lph->lph_flag |= LC_VACATED;
+		else if ((lph->lph_flag & (LC_LOCAL | LC_VACATED)) == 0) {
+			if (lph->lph_flag & LC_UDP) {
+				/* rebuild a sockaddr_in from the saved addr/port */
+				MGET(nam2, M_WAIT, MT_SONAME);
+				saddr = mtod(nam2, struct sockaddr_in *);
+				nam2->m_len = saddr->sin_len =
+					sizeof (struct sockaddr_in);
+				saddr->sin_family = AF_INET;
+				saddr->sin_addr.s_addr = lph->lph_inetaddr;
+				saddr->sin_port = lph->lph_port;
+				so = nfs_udpsock->ns_so;
+			} else if (lph->lph_flag & LC_CLTP) {
+				nam2 = lph->lph_nam;
+				so = nfs_cltpsock->ns_so;
+			} else if (lph->lph_slp->ns_flag & SLP_VALID) {
+				nam2 = (struct mbuf *)0;
+				so = lph->lph_slp->ns_so;
+			} else
+				goto nextone;	/* referenced socket is no longer valid */
+			sotype = so->so_type;
+			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
+				solockp = &lph->lph_slp->ns_solock;
+			else
+				solockp = (int *)0;
+			nfsm_reqhead((struct vnode *)0, NQNFSPROC_EVICTED,
+				NFSX_FH);
+			nfsm_build(cp, caddr_t, NFSX_FH);
+			bzero(cp, NFSX_FH);
+			fhp = (fhandle_t *)cp;
+			fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid;
+			VFS_VPTOFH(vp, &fhp->fh_fid);	/* NOTE(review): result is ignored */
+			m = mreq;
+			siz = 0;
+			while (m) {
+				siz += m->m_len;
+				m = m->m_next;
+			}
+			if (siz <= 0 || siz > NFS_MAXPACKET) {
+				printf("mbuf siz=%d\n",siz);
+				panic("Bad nfs svc reply");
+			}
+			m = nfsm_rpchead(cred, TRUE, NQNFSPROC_EVICTED,
+				RPCAUTH_UNIX, 5*NFSX_UNSIGNED, (char *)0,
+				mreq, siz, &mheadend, &xid);
+			/*
+			 * For stream protocols, prepend a Sun RPC
+			 * Record Mark.
+			 */
+			if (sotype == SOCK_STREAM) {
+				M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
+				*mtod(m, u_long *) = htonl(0x80000000 |
+					(m->m_pkthdr.len - NFSX_UNSIGNED));
+			}
+			/*
+			 * The notice is dropped rather than sent when the
+			 * referenced socket has become invalid or its send
+			 * lock is already held.
+			 */
+			if (((lph->lph_flag & (LC_UDP | LC_CLTP)) == 0 &&
+			     (lph->lph_slp->ns_flag & SLP_VALID) == 0) ||
+			    (solockp && (*solockp & NFSMNT_SNDLOCK)))
+				m_freem(m);
+			else {
+				if (solockp)
+					*solockp |= NFSMNT_SNDLOCK;
+				(void) nfs_send(so, nam2, m,
+						(struct nfsreq *)0);
+				if (solockp)
+					nfs_sndunlock(solockp);
+			}
+			if (lph->lph_flag & LC_UDP)
+				MFREE(nam2, m);	/* the MT_SONAME mbuf built above */
+		}
+nextone:
+		/* step to the next entry, moving on to the next nqm block when one is exhausted */
+		if (++i == len) {
+			if (lphnext) {
+				i = 0;
+				len = LC_MOREHOSTSIZ;
+				lph = lphnext->lpm_hosts;
+				lphnext = lphnext->lpm_next;
+			} else
+				ok = 0;
+		} else
+			lph++;
+	}
+}
+
+/*
+ * Wait for the lease to expire.
+ * This will occur when all clients have sent "vacated" messages to
+ * this server OR when it expires due to timeout.
+ * While any valid host entry is neither local nor vacated, the routine
+ * sets LC_EXPIREDWANTED, sleeps on &lp->lc_flag and then rescans the
+ * whole host list from the start.
+ */
+void
+nqsrv_waitfor_expiry(lp)
+	register struct nqlease *lp;
+{
+	register struct nqhost *lph;
+	register int i;
+	struct nqm *lphnext;
+	int len, ok;
+
+tryagain:
+	if (time.tv_sec > lp->lc_expiry)
+		return;
+	lph = &lp->lc_host;
+	lphnext = lp->lc_morehosts;
+	len = 1;
+	i = 0;
+	ok = 1;
+	while (ok && (lph->lph_flag & LC_VALID)) {
+		if ((lph->lph_flag & (LC_LOCAL | LC_VACATED)) == 0) {
+			lp->lc_flag |= LC_EXPIREDWANTED;
+			(void) tsleep((caddr_t)&lp->lc_flag, PSOCK,
+					"nqexp", 0);
+			goto tryagain;
+		}
+		if (++i == len) {
+			if (lphnext) {
+				i = 0;
+				len = LC_MOREHOSTSIZ;
+				lph = lphnext->lpm_hosts;
+				lphnext = lphnext->lpm_next;
+			} else
+				ok = 0;
+		} else
+			lph++;
+	}
+}
+
+/*
+ * Nqnfs server timer that maintains the server lease queue.
+ * Scan the lease queue for expired entries: + * - when one is found, wakeup anyone waiting for it + * else dequeue and free + */ +void +nqnfs_serverd() +{ + register struct nqlease *lp, *lq; + register struct nqhost *lph; + struct nqlease *nextlp; + struct nqm *lphnext, *olphnext; + struct mbuf *n; + int i, len, ok; + + lp = nqthead.th_chain[0]; + while (lp != (struct nqlease *)&nqthead) { + if (lp->lc_expiry >= time.tv_sec) + break; + nextlp = lp->lc_chain1[0]; + if (lp->lc_flag & LC_EXPIREDWANTED) { + lp->lc_flag &= ~LC_EXPIREDWANTED; + wakeup((caddr_t)&lp->lc_flag); + } else if ((lp->lc_flag & (LC_LOCKED | LC_WANTED)) == 0) { + /* + * Make a best effort at keeping a write caching lease long + * enough by not deleting it until it has been explicitly + * vacated or there have been no writes in the previous + * write_slack seconds since expiry and the nfsds are not + * all busy. The assumption is that if the nfsds are not + * all busy now (no queue of nfs requests), then the client + * would have been able to do at least one write to the + * file during the last write_slack seconds if it was still + * trying to push writes to the server. + */ + if ((lp->lc_flag & (LC_WRITE | LC_VACATED)) == LC_WRITE && + ((lp->lc_flag & LC_WRITTEN) || nfsd_waiting == 0)) { + lp->lc_flag &= ~LC_WRITTEN; + nqsrv_instimeq(lp, nqsrv_writeslack); + } else { + remque(lp); + if (lq = lp->lc_fhnext) + lq->lc_fhprev = lp->lc_fhprev; + *lp->lc_fhprev = lq; + /* + * This soft reference may no longer be valid, but + * no harm done. The worst case is if the vnode was + * recycled and has another valid lease reference, + * which is dereferenced prematurely. 
+ */ + lp->lc_vp->v_lease = (struct nqlease *)0; + lph = &lp->lc_host; + lphnext = lp->lc_morehosts; + olphnext = (struct nqm *)0; + len = 1; + i = 0; + ok = 1; + while (ok && (lph->lph_flag & LC_VALID)) { + if (lph->lph_flag & LC_CLTP) + MFREE(lph->lph_nam, n); + if (lph->lph_flag & LC_SREF) + nfsrv_slpderef(lph->lph_slp); + if (++i == len) { + if (olphnext) { + free((caddr_t)olphnext, M_NQMHOST); + olphnext = (struct nqm *)0; + } + if (lphnext) { + olphnext = lphnext; + i = 0; + len = LC_MOREHOSTSIZ; + lph = lphnext->lpm_hosts; + lphnext = lphnext->lpm_next; + } else + ok = 0; + } else + lph++; + } + FREE((caddr_t)lp, M_NQLEASE); + if (olphnext) + free((caddr_t)olphnext, M_NQMHOST); + nfsstats.srvnqnfs_leases--; + } + } + lp = nextlp; + } +} + +/* + * Called from nfssvc_nfsd() for a getlease rpc request. + * Do the from/to xdr translation and call nqsrv_getlease() to + * do the real work. + */ +nqnfsrv_getlease(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register struct nfsv2_fattr *fp; + struct vattr va; + register struct vattr *vap = &va; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + register u_long *tl; + register long t1; + u_quad_t frev; + caddr_t bpos; + int error = 0; + char *cp2; + struct mbuf *mb, *mb2, *mreq; + int flags, rdonly, cache; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED); + flags = fxdr_unsigned(int, *tl++); + nfsd->nd_duration = fxdr_unsigned(int, *tl); + if (error = nfsrv_fhtovp(fhp, + TRUE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) + nfsm_reply(0); + if (rdonly && flags == NQL_WRITE) { + error = EROFS; + nfsm_reply(0); + } + (void) nqsrv_getlease(vp, &nfsd->nd_duration, flags, nfsd, + nam, &cache, &frev, cred); + error = VOP_GETATTR(vp, vap, cred, nfsd->nd_procp); + vput(vp); + nfsm_reply(NFSX_NQFATTR + 4*NFSX_UNSIGNED); + nfsm_build(tl, u_long *, 4*NFSX_UNSIGNED); + *tl++ 
= txdr_unsigned(cache); + *tl++ = txdr_unsigned(nfsd->nd_duration); + txdr_hyper(&frev, tl); + nfsm_build(fp, struct nfsv2_fattr *, NFSX_NQFATTR); + nfsm_srvfillattr; + nfsm_srvdone; +} + +/* + * Called from nfssvc_nfsd() when a "vacated" message is received from a + * client. Find the entry and expire it. + */ +nqnfsrv_vacated(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register struct nqlease *lp; + register struct nqhost *lph; + struct nqlease *tlp = (struct nqlease *)0; + nfsv2fh_t nfh; + fhandle_t *fhp; + register u_long *tl; + register long t1; + struct nqm *lphnext; + int error = 0, i, len, ok, gotit = 0; + char *cp2; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + m_freem(mrep); + /* + * Find the lease by searching the hash list. + */ + for (lp = nqfhead[NQFHHASH(fhp->fh_fid.fid_data)]; lp; + lp = lp->lc_fhnext) + if (fhp->fh_fsid.val[0] == lp->lc_fsid.val[0] && + fhp->fh_fsid.val[1] == lp->lc_fsid.val[1] && + !bcmp(fhp->fh_fid.fid_data, lp->lc_fiddata, + MAXFIDSZ)) { + /* Found it */ + tlp = lp; + break; + } + if (tlp) { + lp = tlp; + len = 1; + i = 0; + lph = &lp->lc_host; + lphnext = lp->lc_morehosts; + ok = 1; + while (ok && (lph->lph_flag & LC_VALID)) { + if (nqsrv_cmpnam(nfsd->nd_slp, nam, lph)) { + lph->lph_flag |= LC_VACATED; + gotit++; + break; + } + if (++i == len) { + if (lphnext) { + len = LC_MOREHOSTSIZ; + i = 0; + lph = lphnext->lpm_hosts; + lphnext = lphnext->lpm_next; + } else + ok = 0; + } else + lph++; + } + if ((lp->lc_flag & LC_EXPIREDWANTED) && gotit) { + lp->lc_flag &= ~LC_EXPIREDWANTED; + wakeup((caddr_t)&lp->lc_flag); + } +nfsmout: + return (EPERM); + } + return (EPERM); +} + +/* + * Client get lease rpc function. 
+ */
+nqnfs_getlease(vp, rwflag, cred, p)
+	register struct vnode *vp;
+	int rwflag;
+	struct ucred *cred;
+	struct proc *p;
+{	/*
+	 * Sends a NQNFSPROC_GETLEASE request for vp carrying rwflag and the
+	 * mount's nm_leaseterm.  From the reply it reads the cachable flag
+	 * and the granted duration; if the lease has not run out already the
+	 * file revision is read too, the lease is recorded with
+	 * nqnfs_clientlease() and the attributes are loaded.  Otherwise
+	 * NQNFS_EXPIRED is returned.
+	 * NOTE(review): several locals (cp, t1, cp2, mb2, md, dpos, mrep) are
+	 * not named below and are presumably used by the nfsm_* macros.
+	 */
+	register u_long *tl;
+	register caddr_t cp;
+	register long t1;
+	register struct nfsnode *np;
+	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
+	caddr_t bpos, dpos, cp2;
+	time_t reqtime;
+	int error = 0;
+	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
+	int cachable;
+	u_quad_t frev;
+
+	nfsstats.rpccnt[NQNFSPROC_GETLEASE]++;
+	mb = mreq = nfsm_reqh(vp, NQNFSPROC_GETLEASE, NFSX_FH+2*NFSX_UNSIGNED,
+		 &bpos);
+	nfsm_fhtom(vp);
+	nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED);
+	*tl++ = txdr_unsigned(rwflag);
+	*tl = txdr_unsigned(nmp->nm_leaseterm);
+	reqtime = time.tv_sec;	/* sampled before the request goes out */
+	nfsm_request(vp, NQNFSPROC_GETLEASE, p, cred);
+	np = VTONFS(vp);
+	nfsm_dissect(tl, u_long *, 4*NFSX_UNSIGNED);
+	cachable = fxdr_unsigned(int, *tl++);
+	reqtime += fxdr_unsigned(int, *tl++);	/* reqtime is now the expiry time */
+	if (reqtime > time.tv_sec) {
+		fxdr_hyper(tl, &frev);
+		nqnfs_clientlease(nmp, np, rwflag, cachable, reqtime, frev);
+		nfsm_loadattr(vp, (struct vattr *)0);
+	} else
+		error = NQNFS_EXPIRED;
+	nfsm_reqdone;
+	return (error);
+}
+
+/*
+ * Client vacated message function.
+ */ +nqnfs_vacated(vp, cred) + register struct vnode *vp; + struct ucred *cred; +{ + register caddr_t cp; + register struct mbuf *m; + register int i; + caddr_t bpos; + u_long xid; + int error = 0; + struct mbuf *mreq, *mb, *mb2, *mheadend; + struct nfsmount *nmp; + struct nfsreq myrep; + + nmp = VFSTONFS(vp->v_mount); + nfsstats.rpccnt[NQNFSPROC_VACATED]++; + nfsm_reqhead(vp, NQNFSPROC_VACATED, NFSX_FH); + nfsm_fhtom(vp); + m = mreq; + i = 0; + while (m) { + i += m->m_len; + m = m->m_next; + } + m = nfsm_rpchead(cred, TRUE, NQNFSPROC_VACATED, + RPCAUTH_UNIX, 5*NFSX_UNSIGNED, (char *)0, + mreq, i, &mheadend, &xid); + if (nmp->nm_sotype == SOCK_STREAM) { + M_PREPEND(m, NFSX_UNSIGNED, M_WAIT); + *mtod(m, u_long *) = htonl(0x80000000 | (m->m_pkthdr.len - + NFSX_UNSIGNED)); + } + myrep.r_flags = 0; + myrep.r_nmp = nmp; + if (nmp->nm_soflags & PR_CONNREQUIRED) + (void) nfs_sndlock(&nmp->nm_flag, (struct nfsreq *)0); + (void) nfs_send(nmp->nm_so, nmp->nm_nam, m, &myrep); + if (nmp->nm_soflags & PR_CONNREQUIRED) + nfs_sndunlock(&nmp->nm_flag); + return (error); +} + +/* + * Called for client side callbacks + */ +nqnfs_callback(nmp, mrep, md, dpos) + struct nfsmount *nmp; + struct mbuf *mrep, *md; + caddr_t dpos; +{ + register struct vnode *vp; + register u_long *tl; + register long t1; + nfsv2fh_t nfh; + fhandle_t *fhp; + struct nfsnode *np; + struct nfsd nd; + int error; + char *cp2; + + nd.nd_mrep = mrep; + nd.nd_md = md; + nd.nd_dpos = dpos; + if (error = nfs_getreq(&nd, FALSE)) + return (error); + md = nd.nd_md; + dpos = nd.nd_dpos; + if (nd.nd_procnum != NQNFSPROC_EVICTED) { + m_freem(mrep); + return (EPERM); + } + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + m_freem(mrep); + if (error = nfs_nget(nmp->nm_mountp, fhp, &np)) + return (error); + vp = NFSTOV(np); + if (np->n_tnext) { + np->n_expiry = 0; + np->n_flag |= NQNFSEVICTED; + if (np->n_tprev != (struct nfsnode *)nmp) { + if (np->n_tnext == (struct nfsnode *)nmp) + nmp->nm_tprev = np->n_tprev; + else + 
np->n_tnext->n_tprev = np->n_tprev; + np->n_tprev->n_tnext = np->n_tnext; + np->n_tnext = nmp->nm_tnext; + nmp->nm_tnext = np; + np->n_tprev = (struct nfsnode *)nmp; + if (np->n_tnext == (struct nfsnode *)nmp) + nmp->nm_tprev = np; + else + np->n_tnext->n_tprev = np; + } + } + vrele(vp); + nfsm_srvdone; +} + +/* + * Nqnfs client helper daemon. Runs once a second to expire leases. + * It also get authorization strings for "kerb" mounts. + * It must start at the beginning of the list again after any potential + * "sleep" since nfs_reclaim() called from vclean() can pull a node off + * the list asynchronously. + */ +nqnfs_clientd(nmp, cred, ncd, flag, argp, p) + register struct nfsmount *nmp; + struct ucred *cred; + struct nfsd_cargs *ncd; + int flag; + caddr_t argp; + struct proc *p; +{ + register struct nfsnode *np; + struct vnode *vp; + struct nfsreq myrep; + int error, vpid; + + /* + * First initialize some variables + */ + nqnfs_prog = txdr_unsigned(NQNFS_PROG); + nqnfs_vers = txdr_unsigned(NQNFS_VER1); + + /* + * If an authorization string is being passed in, get it. + */ + if ((flag & NFSSVC_GOTAUTH) && + (nmp->nm_flag & (NFSMNT_WAITAUTH | NFSMNT_DISMNT)) == 0) { + if (nmp->nm_flag & NFSMNT_HASAUTH) + panic("cld kerb"); + if ((flag & NFSSVC_AUTHINFAIL) == 0) { + if (ncd->ncd_authlen <= RPCAUTH_MAXSIZ && + copyin(ncd->ncd_authstr, nmp->nm_authstr, + ncd->ncd_authlen) == 0) { + nmp->nm_authtype = ncd->ncd_authtype; + nmp->nm_authlen = ncd->ncd_authlen; + } else + nmp->nm_flag |= NFSMNT_AUTHERR; + } else + nmp->nm_flag |= NFSMNT_AUTHERR; + nmp->nm_flag |= NFSMNT_HASAUTH; + wakeup((caddr_t)&nmp->nm_authlen); + } else + nmp->nm_flag |= NFSMNT_WAITAUTH; + + /* + * Loop every second updating queue until there is a termination sig. 
+ */ + while ((nmp->nm_flag & NFSMNT_DISMNT) == 0) { + if (nmp->nm_flag & NFSMNT_NQNFS) { + /* + * If there are no outstanding requests (and therefore no + * processes in nfs_reply) and there is data in the receive + * queue, poke for callbacks. + */ + if (nfsreqh.r_next == &nfsreqh && nmp->nm_so && + nmp->nm_so->so_rcv.sb_cc > 0) { + myrep.r_flags = R_GETONEREP; + myrep.r_nmp = nmp; + myrep.r_mrep = (struct mbuf *)0; + myrep.r_procp = (struct proc *)0; + (void) nfs_reply(&myrep); + } + + /* + * Loop through the leases, updating as required. + */ + np = nmp->nm_tnext; + while (np != (struct nfsnode *)nmp && + (nmp->nm_flag & NFSMNT_DISMINPROG) == 0) { + vp = NFSTOV(np); +if (vp->v_mount->mnt_stat.f_fsid.val[1] != MOUNT_NFS) panic("trash2"); + vpid = vp->v_id; + if (np->n_expiry < time.tv_sec) { + if (vget(vp, 1) == 0) { + nmp->nm_inprog = vp; + if (vpid == vp->v_id) { +if (vp->v_mount->mnt_stat.f_fsid.val[1] != MOUNT_NFS) panic("trash3"); + if (np->n_tnext == (struct nfsnode *)nmp) + nmp->nm_tprev = np->n_tprev; + else + np->n_tnext->n_tprev = np->n_tprev; + if (np->n_tprev == (struct nfsnode *)nmp) + nmp->nm_tnext = np->n_tnext; + else + np->n_tprev->n_tnext = np->n_tnext; + np->n_tnext = (struct nfsnode *)0; + if ((np->n_flag & (NMODIFIED | NQNFSEVICTED)) + && vp->v_type == VREG) { + if (np->n_flag & NQNFSEVICTED) { + (void) nfs_vinvalbuf(vp, + V_SAVE, cred, p, 0); + np->n_flag &= ~NQNFSEVICTED; + (void) nqnfs_vacated(vp, cred); + } else { + (void) VOP_FSYNC(vp, cred, + MNT_WAIT, p); + np->n_flag &= ~NMODIFIED; + } + } + } + vrele(vp); + nmp->nm_inprog = NULLVP; + } + if (np != nmp->nm_tnext) + np = nmp->nm_tnext; + else + break; + } else if ((np->n_expiry - NQ_RENEWAL) < time.tv_sec) { + if ((np->n_flag & (NQNFSWRITE | NQNFSNONCACHE)) + == NQNFSWRITE && vp->v_dirtyblkhd.lh_first && + vget(vp, 1) == 0) { + nmp->nm_inprog = vp; +if (vp->v_mount->mnt_stat.f_fsid.val[1] != MOUNT_NFS) panic("trash4"); + if (vpid == vp->v_id && + nqnfs_getlease(vp, NQL_WRITE, cred, 
p)==0) + np->n_brev = np->n_lrev; + vrele(vp); + nmp->nm_inprog = NULLVP; + } + if (np != nmp->nm_tnext) + np = nmp->nm_tnext; + else + break; + } else + break; + } + } + + /* + * Get an authorization string, if required. + */ + if ((nmp->nm_flag & (NFSMNT_WAITAUTH | NFSMNT_DISMNT | NFSMNT_HASAUTH)) == 0) { + ncd->ncd_authuid = nmp->nm_authuid; + if (copyout((caddr_t)ncd, argp, sizeof (struct nfsd_cargs))) + nmp->nm_flag |= NFSMNT_WAITAUTH; + else + return (ENEEDAUTH); + } + + /* + * Wait a bit (no pun) and do it again. + */ + if ((nmp->nm_flag & NFSMNT_DISMNT) == 0 && + (nmp->nm_flag & (NFSMNT_WAITAUTH | NFSMNT_HASAUTH))) { + error = tsleep((caddr_t)&nmp->nm_authstr, PSOCK | PCATCH, + "nqnfstimr", hz / 3); + if (error == EINTR || error == ERESTART) + (void) dounmount(nmp->nm_mountp, 0, p); + } + } + free((caddr_t)nmp, M_NFSMNT); + if (error == EWOULDBLOCK) + error = 0; + return (error); +} + +/* + * Adjust all timer queue expiry times when the time of day clock is changed. + * Called from the settimeofday() syscall. + */ +void +lease_updatetime(deltat) + register int deltat; +{ + register struct nqlease *lp; + register struct nfsnode *np; + struct mount *mp; + struct nfsmount *nmp; + int s; + + if (nqnfsstarttime != 0) + nqnfsstarttime += deltat; + s = splsoftclock(); + lp = nqthead.th_chain[0]; + while (lp != (struct nqlease *)&nqthead) { + lp->lc_expiry += deltat; + lp = lp->lc_chain1[0]; + } + splx(s); + + /* + * Search the mount list for all nqnfs mounts and do their timer + * queues. + */ + for (mp = mountlist.tqh_first; mp != NULL; mp = mp->mnt_list.tqe_next) { + if (mp->mnt_stat.f_fsid.val[1] == MOUNT_NFS) { + nmp = VFSTONFS(mp); + if (nmp->nm_flag & NFSMNT_NQNFS) { + np = nmp->nm_tnext; + while (np != (struct nfsnode *)nmp) { + np->n_expiry += deltat; + np = np->n_tnext; + } + } + } + } +} + +/* + * Lock a server lease. 
+ */ +void +nqsrv_locklease(lp) + struct nqlease *lp; +{ + + while (lp->lc_flag & LC_LOCKED) { + lp->lc_flag |= LC_WANTED; + (void) tsleep((caddr_t)lp, PSOCK, "nqlc", 0); + } + lp->lc_flag |= LC_LOCKED; + lp->lc_flag &= ~LC_WANTED; +} + +/* + * Unlock a server lease. + */ +void +nqsrv_unlocklease(lp) + struct nqlease *lp; +{ + + lp->lc_flag &= ~LC_LOCKED; + if (lp->lc_flag & LC_WANTED) + wakeup((caddr_t)lp); +} + +/* + * Update a client lease. + */ +void +nqnfs_clientlease(nmp, np, rwflag, cachable, expiry, frev) + register struct nfsmount *nmp; + register struct nfsnode *np; + int rwflag, cachable; + time_t expiry; + u_quad_t frev; +{ + register struct nfsnode *tp; + + if (np->n_tnext) { + if (np->n_tnext == (struct nfsnode *)nmp) + nmp->nm_tprev = np->n_tprev; + else + np->n_tnext->n_tprev = np->n_tprev; + if (np->n_tprev == (struct nfsnode *)nmp) + nmp->nm_tnext = np->n_tnext; + else + np->n_tprev->n_tnext = np->n_tnext; + if (rwflag == NQL_WRITE) + np->n_flag |= NQNFSWRITE; + } else if (rwflag == NQL_READ) + np->n_flag &= ~NQNFSWRITE; + else + np->n_flag |= NQNFSWRITE; + if (cachable) + np->n_flag &= ~NQNFSNONCACHE; + else + np->n_flag |= NQNFSNONCACHE; + np->n_expiry = expiry; + np->n_lrev = frev; + tp = nmp->nm_tprev; + while (tp != (struct nfsnode *)nmp && tp->n_expiry > np->n_expiry) + tp = tp->n_tprev; + if (tp == (struct nfsnode *)nmp) { + np->n_tnext = nmp->nm_tnext; + nmp->nm_tnext = np; + } else { + np->n_tnext = tp->n_tnext; + tp->n_tnext = np; + } + np->n_tprev = tp; + if (np->n_tnext == (struct nfsnode *)nmp) + nmp->nm_tprev = np; + else + np->n_tnext->n_tprev = np; +} diff --git a/sys/nfs/nfs_serv.c b/sys/nfs/nfs_serv.c new file mode 100644 index 00000000000..f31b96e02ed --- /dev/null +++ b/sys/nfs/nfs_serv.c @@ -0,0 +1,1908 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs_serv.c 8.3 (Berkeley) 1/12/94 + */ + +/* + * nfs version 2 server calls to vnode ops + * - these routines generally have 3 phases + * 1 - break down and validate rpc request in mbuf list + * 2 - do the vnode ops for the request + * (surprisingly ?? 
many are very similar to syscalls in vfs_syscalls.c) + * 3 - build the rpc reply in an mbuf list + * nb: + * - do not mix the phases, since the nfsm_?? macros can return failures + * on a bad rpc or similar and do not do any vrele() or vput()'s + * + * - the nfsm_reply() macro generates an nfs rpc reply with the nfs + * error number iff error != 0 whereas + * returning an error from the server function implies a fatal error + * such as a badly constructed rpc request that should be dropped without + * a reply. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +/* Defs */ +#define TRUE 1 +#define FALSE 0 + +/* Global vars */ +extern u_long nfs_procids[NFS_NPROCS]; +extern u_long nfs_xdrneg1; +extern u_long nfs_false, nfs_true; +nfstype nfs_type[9] = { NFNON, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, NFNON, + NFCHR, NFNON }; + +/* + * nqnfs access service + */ +nqnfsrv_access(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, rdonly, cache, mode = 0; + char *cp2; + struct mbuf *mb, *mreq; + u_quad_t frev; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED); + if (error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) + nfsm_reply(0); + if (*tl++ == nfs_true) + mode |= VREAD; + if (*tl++ == nfs_true) + mode |= VWRITE; + if (*tl == nfs_true) + mode |= VEXEC; + error = nfsrv_access(vp, mode, cred, rdonly, nfsd->nd_procp); + vput(vp); + nfsm_reply(0); + nfsm_srvdone; +} + +/* + * nfs getattr service + */ +nfsrv_getattr(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf 
*nam, **mrq; +{ + register struct nfsv2_fattr *fp; + struct vattr va; + register struct vattr *vap = &va; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, rdonly, cache; + char *cp2; + struct mbuf *mb, *mb2, *mreq; + u_quad_t frev; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + if (error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) + nfsm_reply(0); + nqsrv_getl(vp, NQL_READ); + error = VOP_GETATTR(vp, vap, cred, nfsd->nd_procp); + vput(vp); + nfsm_reply(NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_build(fp, struct nfsv2_fattr *, NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_srvfillattr; + nfsm_srvdone; +} + +/* + * nfs setattr service + */ +nfsrv_setattr(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + struct vattr va; + register struct vattr *vap = &va; + register struct nfsv2_sattr *sp; + register struct nfsv2_fattr *fp; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, rdonly, cache; + char *cp2; + struct mbuf *mb, *mb2, *mreq; + u_quad_t frev, frev2; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_dissect(sp, struct nfsv2_sattr *, NFSX_SATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + if (error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) + nfsm_reply(0); + nqsrv_getl(vp, NQL_WRITE); + VATTR_NULL(vap); + /* + * Nah nah nah nah na nah + * There is a bug in the Sun client that puts 0xffff in the mode + * field of sattr when it should put in 0xffffffff. The u_short + * doesn't sign extend. 
+ * --> check the low order 2 bytes for 0xffff + */ + if ((fxdr_unsigned(int, sp->sa_mode) & 0xffff) != 0xffff) + vap->va_mode = nfstov_mode(sp->sa_mode); + if (sp->sa_uid != nfs_xdrneg1) + vap->va_uid = fxdr_unsigned(uid_t, sp->sa_uid); + if (sp->sa_gid != nfs_xdrneg1) + vap->va_gid = fxdr_unsigned(gid_t, sp->sa_gid); + if (nfsd->nd_nqlflag == NQL_NOVAL) { + if (sp->sa_nfssize != nfs_xdrneg1) + vap->va_size = fxdr_unsigned(u_quad_t, sp->sa_nfssize); + if (sp->sa_nfsatime.nfs_sec != nfs_xdrneg1) { +#ifdef notyet + fxdr_nfstime(&sp->sa_nfsatime, &vap->va_atime); +#else + vap->va_atime.ts_sec = + fxdr_unsigned(long, sp->sa_nfsatime.nfs_sec); + vap->va_atime.ts_nsec = 0; +#endif + } + if (sp->sa_nfsmtime.nfs_sec != nfs_xdrneg1) + fxdr_nfstime(&sp->sa_nfsmtime, &vap->va_mtime); + } else { + fxdr_hyper(&sp->sa_nqsize, &vap->va_size); + fxdr_nqtime(&sp->sa_nqatime, &vap->va_atime); + fxdr_nqtime(&sp->sa_nqmtime, &vap->va_mtime); + vap->va_flags = fxdr_unsigned(u_long, sp->sa_nqflags); + } + + /* + * If the size is being changed write acces is required, otherwise + * just check for a read only file system. 
+ */ + if (vap->va_size == ((u_quad_t)((quad_t) -1))) { + if (rdonly || (vp->v_mount->mnt_flag & MNT_RDONLY)) { + error = EROFS; + goto out; + } + } else { + if (vp->v_type == VDIR) { + error = EISDIR; + goto out; + } else if (error = nfsrv_access(vp, VWRITE, cred, rdonly, + nfsd->nd_procp)) + goto out; + } + if (error = VOP_SETATTR(vp, vap, cred, nfsd->nd_procp)) { + vput(vp); + nfsm_reply(0); + } + error = VOP_GETATTR(vp, vap, cred, nfsd->nd_procp); +out: + vput(vp); + nfsm_reply(NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL) + 2*NFSX_UNSIGNED); + nfsm_build(fp, struct nfsv2_fattr *, NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_srvfillattr; + if (nfsd->nd_nqlflag != NQL_NOVAL) { + nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); + txdr_hyper(&frev2, tl); + } + nfsm_srvdone; +} + +/* + * nfs lookup rpc + */ +nfsrv_lookup(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register struct nfsv2_fattr *fp; + struct nameidata nd; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + register caddr_t cp; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, cache, duration2, cache2, len; + char *cp2; + struct mbuf *mb, *mb2, *mreq; + struct vattr va, *vap = &va; + u_quad_t frev, frev2; + + fhp = &nfh.fh_generic; + duration2 = 0; + if (nfsd->nd_nqlflag != NQL_NOVAL) { + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + duration2 = fxdr_unsigned(int, *tl); + } + nfsm_srvmtofh(fhp); + nfsm_srvstrsiz(len, NFS_MAXNAMLEN); + nd.ni_cnd.cn_cred = cred; + nd.ni_cnd.cn_nameiop = LOOKUP; + nd.ni_cnd.cn_flags = LOCKLEAF | SAVESTART; + if (error = nfs_namei(&nd, fhp, len, nfsd->nd_slp, nam, &md, &dpos, + nfsd->nd_procp)) + nfsm_reply(0); + nqsrv_getl(nd.ni_startdir, NQL_READ); + vrele(nd.ni_startdir); + FREE(nd.ni_cnd.cn_pnbuf, M_NAMEI); + vp = nd.ni_vp; + bzero((caddr_t)fhp, sizeof(nfh)); + fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid; + if (error = VFS_VPTOFH(vp, 
&fhp->fh_fid)) { + vput(vp); + nfsm_reply(0); + } + if (duration2) + (void) nqsrv_getlease(vp, &duration2, NQL_READ, nfsd, + nam, &cache2, &frev2, cred); + error = VOP_GETATTR(vp, vap, cred, nfsd->nd_procp); + vput(vp); + nfsm_reply(NFSX_FH + NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL) + 5*NFSX_UNSIGNED); + if (nfsd->nd_nqlflag != NQL_NOVAL) { + if (duration2) { + nfsm_build(tl, u_long *, 5*NFSX_UNSIGNED); + *tl++ = txdr_unsigned(NQL_READ); + *tl++ = txdr_unsigned(cache2); + *tl++ = txdr_unsigned(duration2); + txdr_hyper(&frev2, tl); + } else { + nfsm_build(tl, u_long *, NFSX_UNSIGNED); + *tl = 0; + } + } + nfsm_srvfhtom(fhp); + nfsm_build(fp, struct nfsv2_fattr *, NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_srvfillattr; + nfsm_srvdone; +} + +/* + * nfs readlink service + */ +nfsrv_readlink(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + struct iovec iv[(NFS_MAXPATHLEN+MLEN-1)/MLEN]; + register struct iovec *ivp = iv; + register struct mbuf *mp; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, rdonly, cache, i, tlen, len; + char *cp2; + struct mbuf *mb, *mb2, *mp2, *mp3, *mreq; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + struct uio io, *uiop = &io; + u_quad_t frev; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + len = 0; + i = 0; + while (len < NFS_MAXPATHLEN) { + MGET(mp, M_WAIT, MT_DATA); + MCLGET(mp, M_WAIT); + mp->m_len = NFSMSIZ(mp); + if (len == 0) + mp3 = mp2 = mp; + else { + mp2->m_next = mp; + mp2 = mp; + } + if ((len+mp->m_len) > NFS_MAXPATHLEN) { + mp->m_len = NFS_MAXPATHLEN-len; + len = NFS_MAXPATHLEN; + } else + len += mp->m_len; + ivp->iov_base = mtod(mp, caddr_t); + ivp->iov_len = mp->m_len; + i++; + ivp++; + } + uiop->uio_iov = iv; + uiop->uio_iovcnt = i; + uiop->uio_offset = 0; + uiop->uio_resid = len; + uiop->uio_rw = UIO_READ; + uiop->uio_segflg = UIO_SYSSPACE; + uiop->uio_procp = (struct proc 
*)0; + if (error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) { + m_freem(mp3); + nfsm_reply(0); + } + if (vp->v_type != VLNK) { + error = EINVAL; + goto out; + } + nqsrv_getl(vp, NQL_READ); + error = VOP_READLINK(vp, uiop, cred); +out: + vput(vp); + if (error) + m_freem(mp3); + nfsm_reply(NFSX_UNSIGNED); + if (uiop->uio_resid > 0) { + len -= uiop->uio_resid; + tlen = nfsm_rndup(len); + nfsm_adj(mp3, NFS_MAXPATHLEN-tlen, tlen-len); + } + nfsm_build(tl, u_long *, NFSX_UNSIGNED); + *tl = txdr_unsigned(len); + mb->m_next = mp3; + nfsm_srvdone; +} + +/* + * nfs read service + */ +nfsrv_read(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register struct iovec *iv; + struct iovec *iv2; + register struct mbuf *m; + register struct nfsv2_fattr *fp; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, rdonly, cache, i, cnt, len, left, siz, tlen; + char *cp2; + struct mbuf *mb, *mb2, *mreq; + struct mbuf *m2; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + struct uio io, *uiop = &io; + struct vattr va, *vap = &va; + off_t off; + u_quad_t frev; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + if (nfsd->nd_nqlflag == NQL_NOVAL) { + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + off = (off_t)fxdr_unsigned(u_long, *tl); + } else { + nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); + fxdr_hyper(tl, &off); + } + nfsm_srvstrsiz(cnt, NFS_MAXDATA); + if (error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) + nfsm_reply(0); + if (vp->v_type != VREG) { + error = (vp->v_type == VDIR) ? 
EISDIR : EACCES; + vput(vp); + nfsm_reply(0); + } + nqsrv_getl(vp, NQL_READ); + if ((error = nfsrv_access(vp, VREAD, cred, rdonly, nfsd->nd_procp)) && + (error = nfsrv_access(vp, VEXEC, cred, rdonly, nfsd->nd_procp))) { + vput(vp); + nfsm_reply(0); + } + if (error = VOP_GETATTR(vp, vap, cred, nfsd->nd_procp)) { + vput(vp); + nfsm_reply(0); + } + if (off >= vap->va_size) + cnt = 0; + else if ((off + cnt) > vap->va_size) + cnt = nfsm_rndup(vap->va_size - off); + nfsm_reply(NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)+NFSX_UNSIGNED+nfsm_rndup(cnt)); + nfsm_build(fp, struct nfsv2_fattr *, NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_build(tl, u_long *, NFSX_UNSIGNED); + len = left = cnt; + if (cnt > 0) { + /* + * Generate the mbuf list with the uio_iov ref. to it. + */ + i = 0; + m = m2 = mb; + MALLOC(iv, struct iovec *, + ((NFS_MAXDATA+MLEN-1)/MLEN) * sizeof (struct iovec), + M_TEMP, M_WAITOK); + iv2 = iv; + while (left > 0) { + siz = min(M_TRAILINGSPACE(m), left); + if (siz > 0) { + m->m_len += siz; + iv->iov_base = bpos; + iv->iov_len = siz; + iv++; + i++; + left -= siz; + } + if (left > 0) { + MGET(m, M_WAIT, MT_DATA); + MCLGET(m, M_WAIT); + m->m_len = 0; + m2->m_next = m; + m2 = m; + bpos = mtod(m, caddr_t); + } + } + uiop->uio_iov = iv2; + uiop->uio_iovcnt = i; + uiop->uio_offset = off; + uiop->uio_resid = cnt; + uiop->uio_rw = UIO_READ; + uiop->uio_segflg = UIO_SYSSPACE; + error = VOP_READ(vp, uiop, IO_NODELOCKED, cred); + off = uiop->uio_offset; + FREE((caddr_t)iv2, M_TEMP); + if (error || (error = VOP_GETATTR(vp, vap, cred, nfsd->nd_procp))) { + m_freem(mreq); + vput(vp); + nfsm_reply(0); + } + } else + uiop->uio_resid = 0; + vput(vp); + nfsm_srvfillattr; + len -= uiop->uio_resid; + tlen = nfsm_rndup(len); + if (cnt != tlen || tlen != len) + nfsm_adj(mb, cnt-tlen, tlen-len); + *tl = txdr_unsigned(len); + nfsm_srvdone; +} + +/* + * nfs write service + */ +nfsrv_write(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + 
caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register struct iovec *ivp; + register struct mbuf *mp; + register struct nfsv2_fattr *fp; + struct iovec iv[NFS_MAXIOVEC]; + struct vattr va; + register struct vattr *vap = &va; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, rdonly, cache, siz, len, xfer; + int ioflags = IO_SYNC | IO_NODELOCKED; + char *cp2; + struct mbuf *mb, *mb2, *mreq; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + struct uio io, *uiop = &io; + off_t off; + u_quad_t frev; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_dissect(tl, u_long *, 4 * NFSX_UNSIGNED); + if (nfsd->nd_nqlflag == NQL_NOVAL) { + off = (off_t)fxdr_unsigned(u_long, *++tl); + tl += 2; + } else { + fxdr_hyper(tl, &off); + tl += 2; + if (fxdr_unsigned(u_long, *tl++)) + ioflags |= IO_APPEND; + } + len = fxdr_unsigned(long, *tl); + if (len > NFS_MAXDATA || len <= 0) { + error = EBADRPC; + nfsm_reply(0); + } + if (dpos == (mtod(md, caddr_t)+md->m_len)) { + mp = md->m_next; + if (mp == NULL) { + error = EBADRPC; + nfsm_reply(0); + } + } else { + mp = md; + siz = dpos-mtod(mp, caddr_t); + mp->m_len -= siz; + NFSMADV(mp, siz); + } + if (error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) + nfsm_reply(0); + if (vp->v_type != VREG) { + error = (vp->v_type == VDIR) ? EISDIR : EACCES; + vput(vp); + nfsm_reply(0); + } + nqsrv_getl(vp, NQL_WRITE); + if (error = nfsrv_access(vp, VWRITE, cred, rdonly, nfsd->nd_procp)) { + vput(vp); + nfsm_reply(0); + } + uiop->uio_resid = 0; + uiop->uio_rw = UIO_WRITE; + uiop->uio_segflg = UIO_SYSSPACE; + uiop->uio_procp = (struct proc *)0; + /* + * Do up to NFS_MAXIOVEC mbufs of write each iteration of the + * loop until done. 
+ */ + while (len > 0 && uiop->uio_resid == 0) { + ivp = iv; + siz = 0; + uiop->uio_iov = ivp; + uiop->uio_iovcnt = 0; + uiop->uio_offset = off; + while (len > 0 && uiop->uio_iovcnt < NFS_MAXIOVEC && mp != NULL) { + ivp->iov_base = mtod(mp, caddr_t); + if (len < mp->m_len) + ivp->iov_len = xfer = len; + else + ivp->iov_len = xfer = mp->m_len; +#ifdef notdef + /* Not Yet .. */ + if (M_HASCL(mp) && (((u_long)ivp->iov_base) & CLOFSET) == 0) + ivp->iov_op = NULL; /* what should it be ?? */ + else + ivp->iov_op = NULL; +#endif + uiop->uio_iovcnt++; + ivp++; + len -= xfer; + siz += xfer; + mp = mp->m_next; + } + if (len > 0 && mp == NULL) { + error = EBADRPC; + vput(vp); + nfsm_reply(0); + } + uiop->uio_resid = siz; + if (error = VOP_WRITE(vp, uiop, ioflags, cred)) { + vput(vp); + nfsm_reply(0); + } + off = uiop->uio_offset; + } + error = VOP_GETATTR(vp, vap, cred, nfsd->nd_procp); + vput(vp); + nfsm_reply(NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_build(fp, struct nfsv2_fattr *, NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_srvfillattr; + if (nfsd->nd_nqlflag != NQL_NOVAL) { + nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); + txdr_hyper(&vap->va_filerev, tl); + } + nfsm_srvdone; +} + +/* + * nfs create service + * now does a truncate to 0 length via. 
setattr if it already exists + */ +nfsrv_create(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register struct nfsv2_fattr *fp; + struct vattr va; + register struct vattr *vap = &va; + register struct nfsv2_sattr *sp; + register u_long *tl; + struct nameidata nd; + register caddr_t cp; + register long t1; + caddr_t bpos; + int error = 0, rdev, cache, len, tsize; + char *cp2; + struct mbuf *mb, *mb2, *mreq; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + u_quad_t frev; + + nd.ni_cnd.cn_nameiop = 0; + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_srvstrsiz(len, NFS_MAXNAMLEN); + nd.ni_cnd.cn_cred = cred; + nd.ni_cnd.cn_nameiop = CREATE; + nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | SAVESTART; + if (error = nfs_namei(&nd, fhp, len, nfsd->nd_slp, nam, &md, &dpos, + nfsd->nd_procp)) + nfsm_reply(0); + VATTR_NULL(vap); + nfsm_dissect(sp, struct nfsv2_sattr *, NFSX_SATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + /* + * Iff doesn't exist, create it + * otherwise just truncate to 0 length + * should I set the mode too ?? 
+ */ + if (nd.ni_vp == NULL) { + vap->va_type = IFTOVT(fxdr_unsigned(u_long, sp->sa_mode)); + if (vap->va_type == VNON) + vap->va_type = VREG; + vap->va_mode = nfstov_mode(sp->sa_mode); + if (nfsd->nd_nqlflag == NQL_NOVAL) + rdev = fxdr_unsigned(long, sp->sa_nfssize); + else + rdev = fxdr_unsigned(long, sp->sa_nqrdev); + if (vap->va_type == VREG || vap->va_type == VSOCK) { + vrele(nd.ni_startdir); + nqsrv_getl(nd.ni_dvp, NQL_WRITE); + if (error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap)) + nfsm_reply(0); + FREE(nd.ni_cnd.cn_pnbuf, M_NAMEI); + } else if (vap->va_type == VCHR || vap->va_type == VBLK || + vap->va_type == VFIFO) { + if (vap->va_type == VCHR && rdev == 0xffffffff) + vap->va_type = VFIFO; + if (vap->va_type == VFIFO) { +#ifndef FIFO + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + error = ENXIO; + goto out; +#endif /* FIFO */ + } else if (error = suser(cred, (u_short *)0)) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + goto out; + } else + vap->va_rdev = (dev_t)rdev; + nqsrv_getl(nd.ni_dvp, NQL_WRITE); + if (error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap)) { + vrele(nd.ni_startdir); + nfsm_reply(0); + } + nd.ni_cnd.cn_nameiop = LOOKUP; + nd.ni_cnd.cn_flags &= ~(LOCKPARENT | SAVESTART); + nd.ni_cnd.cn_proc = nfsd->nd_procp; + nd.ni_cnd.cn_cred = nfsd->nd_procp->p_ucred; + if (error = lookup(&nd)) { + free(nd.ni_cnd.cn_pnbuf, M_NAMEI); + nfsm_reply(0); + } + FREE(nd.ni_cnd.cn_pnbuf, M_NAMEI); + if (nd.ni_cnd.cn_flags & ISSYMLINK) { + vrele(nd.ni_dvp); + vput(nd.ni_vp); + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + error = EINVAL; + nfsm_reply(0); + } + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + error = ENXIO; + goto out; + } + vp = nd.ni_vp; + } else { + vrele(nd.ni_startdir); + free(nd.ni_cnd.cn_pnbuf, M_NAMEI); + vp = nd.ni_vp; + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nfsd->nd_nqlflag == NQL_NOVAL) { + tsize = 
fxdr_unsigned(long, sp->sa_nfssize); + if (tsize != -1) + vap->va_size = (u_quad_t)tsize; + else + vap->va_size = -1; + } else + fxdr_hyper(&sp->sa_nqsize, &vap->va_size); + if (vap->va_size != -1) { + if (error = nfsrv_access(vp, VWRITE, cred, + (nd.ni_cnd.cn_flags & RDONLY), nfsd->nd_procp)) { + vput(vp); + nfsm_reply(0); + } + nqsrv_getl(vp, NQL_WRITE); + if (error = VOP_SETATTR(vp, vap, cred, nfsd->nd_procp)) { + vput(vp); + nfsm_reply(0); + } + } + } + bzero((caddr_t)fhp, sizeof(nfh)); + fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid; + if (error = VFS_VPTOFH(vp, &fhp->fh_fid)) { + vput(vp); + nfsm_reply(0); + } + error = VOP_GETATTR(vp, vap, cred, nfsd->nd_procp); + vput(vp); + nfsm_reply(NFSX_FH+NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_srvfhtom(fhp); + nfsm_build(fp, struct nfsv2_fattr *, NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_srvfillattr; + return (error); +nfsmout: + if (nd.ni_cnd.cn_nameiop || nd.ni_cnd.cn_flags) + vrele(nd.ni_startdir); + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vput(nd.ni_vp); + return (error); + +out: + vrele(nd.ni_startdir); + free(nd.ni_cnd.cn_pnbuf, M_NAMEI); + nfsm_reply(0); +} + +/* + * nfs remove service + */ +nfsrv_remove(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + struct nameidata nd; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, cache, len; + char *cp2; + struct mbuf *mb, *mreq; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + u_quad_t frev; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_srvstrsiz(len, NFS_MAXNAMLEN); + nd.ni_cnd.cn_cred = cred; + nd.ni_cnd.cn_nameiop = DELETE; + nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; + if (error = nfs_namei(&nd, fhp, len, nfsd->nd_slp, nam, &md, &dpos, + nfsd->nd_procp)) + nfsm_reply(0); + vp = nd.ni_vp; + if (vp->v_type == 
VDIR && + (error = suser(cred, (u_short *)0))) + goto out; + /* + * The root of a mounted filesystem cannot be deleted. + */ + if (vp->v_flag & VROOT) { + error = EBUSY; + goto out; + } + if (vp->v_flag & VTEXT) + (void) vnode_pager_uncache(vp); +out: + if (!error) { + nqsrv_getl(nd.ni_dvp, NQL_WRITE); + nqsrv_getl(vp, NQL_WRITE); + error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vput(vp); + } + nfsm_reply(0); + nfsm_srvdone; +} + +/* + * nfs rename service + */ +nfsrv_rename(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, cache, len, len2; + char *cp2; + struct mbuf *mb, *mreq; + struct nameidata fromnd, tond; + struct vnode *fvp, *tvp, *tdvp; + nfsv2fh_t fnfh, tnfh; + fhandle_t *ffhp, *tfhp; + u_quad_t frev; + uid_t saved_uid; + + ffhp = &fnfh.fh_generic; + tfhp = &tnfh.fh_generic; + fromnd.ni_cnd.cn_nameiop = 0; + tond.ni_cnd.cn_nameiop = 0; + nfsm_srvmtofh(ffhp); + nfsm_srvstrsiz(len, NFS_MAXNAMLEN); + /* + * Remember our original uid so that we can reset cr_uid before + * the second nfs_namei() call, in case it is remapped. 
+ */ + saved_uid = cred->cr_uid; + fromnd.ni_cnd.cn_cred = cred; + fromnd.ni_cnd.cn_nameiop = DELETE; + fromnd.ni_cnd.cn_flags = WANTPARENT | SAVESTART; + if (error = nfs_namei(&fromnd, ffhp, len, nfsd->nd_slp, nam, &md, + &dpos, nfsd->nd_procp)) + nfsm_reply(0); + fvp = fromnd.ni_vp; + nfsm_srvmtofh(tfhp); + nfsm_strsiz(len2, NFS_MAXNAMLEN); + cred->cr_uid = saved_uid; + tond.ni_cnd.cn_cred = cred; + tond.ni_cnd.cn_nameiop = RENAME; + tond.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART; + if (error = nfs_namei(&tond, tfhp, len2, nfsd->nd_slp, nam, &md, + &dpos, nfsd->nd_procp)) { + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + tdvp = tond.ni_dvp; + tvp = tond.ni_vp; + if (tvp != NULL) { + if (fvp->v_type == VDIR && tvp->v_type != VDIR) { + error = EISDIR; + goto out; + } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { + error = ENOTDIR; + goto out; + } + if (tvp->v_type == VDIR && tvp->v_mountedhere) { + error = EXDEV; + goto out; + } + } + if (fvp->v_type == VDIR && fvp->v_mountedhere) { + error = EBUSY; + goto out; + } + if (fvp->v_mount != tdvp->v_mount) { + error = EXDEV; + goto out; + } + if (fvp == tdvp) + error = EINVAL; + /* + * If source is the same as the destination (that is the + * same vnode with the same name in the same directory), + * then there is nothing to do. 
+ */ + if (fvp == tvp && fromnd.ni_dvp == tdvp && + fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && + !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, + fromnd.ni_cnd.cn_namelen)) + error = -1; +out: + if (!error) { + nqsrv_getl(fromnd.ni_dvp, NQL_WRITE); + nqsrv_getl(tdvp, NQL_WRITE); + if (tvp) + nqsrv_getl(tvp, NQL_WRITE); + error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, + tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); + } else { + VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + } + vrele(tond.ni_startdir); + FREE(tond.ni_cnd.cn_pnbuf, M_NAMEI); +out1: + vrele(fromnd.ni_startdir); + FREE(fromnd.ni_cnd.cn_pnbuf, M_NAMEI); + nfsm_reply(0); + return (error); + +nfsmout: + if (tond.ni_cnd.cn_nameiop || tond.ni_cnd.cn_flags) { + vrele(tond.ni_startdir); + FREE(tond.ni_cnd.cn_pnbuf, M_NAMEI); + } + if (fromnd.ni_cnd.cn_nameiop || fromnd.ni_cnd.cn_flags) { + vrele(fromnd.ni_startdir); + FREE(fromnd.ni_cnd.cn_pnbuf, M_NAMEI); + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + } + return (error); +} + +/* + * nfs link service + */ +nfsrv_link(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + struct nameidata nd; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, rdonly, cache, len; + char *cp2; + struct mbuf *mb, *mreq; + struct vnode *vp, *xp; + nfsv2fh_t nfh, dnfh; + fhandle_t *fhp, *dfhp; + u_quad_t frev; + + fhp = &nfh.fh_generic; + dfhp = &dnfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_srvmtofh(dfhp); + nfsm_srvstrsiz(len, NFS_MAXNAMLEN); + if (error = nfsrv_fhtovp(fhp, FALSE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) + nfsm_reply(0); + if (vp->v_type == VDIR && (error = suser(cred, (u_short *)0))) + goto out1; + nd.ni_cnd.cn_cred 
= cred; + nd.ni_cnd.cn_nameiop = CREATE; + nd.ni_cnd.cn_flags = LOCKPARENT; + if (error = nfs_namei(&nd, dfhp, len, nfsd->nd_slp, nam, &md, &dpos, + nfsd->nd_procp)) + goto out1; + xp = nd.ni_vp; + if (xp != NULL) { + error = EEXIST; + goto out; + } + xp = nd.ni_dvp; + if (vp->v_mount != xp->v_mount) + error = EXDEV; +out: + if (!error) { + nqsrv_getl(vp, NQL_WRITE); + nqsrv_getl(xp, NQL_WRITE); + error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vrele(nd.ni_vp); + } +out1: + vrele(vp); + nfsm_reply(0); + nfsm_srvdone; +} + +/* + * nfs symbolic link service + */ +nfsrv_symlink(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + struct vattr va; + struct nameidata nd; + register struct vattr *vap = &va; + register u_long *tl; + register long t1; + struct nfsv2_sattr *sp; + caddr_t bpos; + struct uio io; + struct iovec iv; + int error = 0, cache, len, len2; + char *pathcp, *cp2; + struct mbuf *mb, *mreq; + nfsv2fh_t nfh; + fhandle_t *fhp; + u_quad_t frev; + + pathcp = (char *)0; + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_srvstrsiz(len, NFS_MAXNAMLEN); + nd.ni_cnd.cn_cred = cred; + nd.ni_cnd.cn_nameiop = CREATE; + nd.ni_cnd.cn_flags = LOCKPARENT; + if (error = nfs_namei(&nd, fhp, len, nfsd->nd_slp, nam, &md, &dpos, + nfsd->nd_procp)) + goto out; + nfsm_strsiz(len2, NFS_MAXPATHLEN); + MALLOC(pathcp, caddr_t, len2 + 1, M_TEMP, M_WAITOK); + iv.iov_base = pathcp; + iv.iov_len = len2; + io.uio_resid = len2; + io.uio_offset = 0; + io.uio_iov = &iv; + io.uio_iovcnt = 1; + io.uio_segflg = UIO_SYSSPACE; + io.uio_rw = UIO_READ; + io.uio_procp = (struct proc *)0; + nfsm_mtouio(&io, len2); + nfsm_dissect(sp, struct nfsv2_sattr *, NFSX_SATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + *(pathcp + len2) = '\0'; + if (nd.ni_vp) { + 
VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + error = EEXIST; + goto out; + } + VATTR_NULL(vap); + vap->va_mode = fxdr_unsigned(u_short, sp->sa_mode); + nqsrv_getl(nd.ni_dvp, NQL_WRITE); + error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap, pathcp); +out: + if (pathcp) + FREE(pathcp, M_TEMP); + nfsm_reply(0); + return (error); +nfsmout: + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vrele(nd.ni_vp); + if (pathcp) + FREE(pathcp, M_TEMP); + return (error); +} + +/* + * nfs mkdir service + */ +nfsrv_mkdir(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + struct vattr va; + register struct vattr *vap = &va; + register struct nfsv2_fattr *fp; + struct nameidata nd; + register caddr_t cp; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, cache, len; + char *cp2; + struct mbuf *mb, *mb2, *mreq; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + u_quad_t frev; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_srvstrsiz(len, NFS_MAXNAMLEN); + nd.ni_cnd.cn_cred = cred; + nd.ni_cnd.cn_nameiop = CREATE; + nd.ni_cnd.cn_flags = LOCKPARENT; + if (error = nfs_namei(&nd, fhp, len, nfsd->nd_slp, nam, &md, &dpos, + nfsd->nd_procp)) + nfsm_reply(0); + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + VATTR_NULL(vap); + vap->va_type = VDIR; + vap->va_mode = nfstov_mode(*tl++); + vp = nd.ni_vp; + if (vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(vp); + error = EEXIST; + nfsm_reply(0); + } + nqsrv_getl(nd.ni_dvp, NQL_WRITE); + if (error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap)) + nfsm_reply(0); + vp = nd.ni_vp; + bzero((caddr_t)fhp, sizeof(nfh)); + fhp->fh_fsid = 
vp->v_mount->mnt_stat.f_fsid; + if (error = VFS_VPTOFH(vp, &fhp->fh_fid)) { + vput(vp); + nfsm_reply(0); + } + error = VOP_GETATTR(vp, vap, cred, nfsd->nd_procp); + vput(vp); + nfsm_reply(NFSX_FH+NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_srvfhtom(fhp); + nfsm_build(fp, struct nfsv2_fattr *, NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_srvfillattr; + return (error); +nfsmout: + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vrele(nd.ni_vp); + return (error); +} + +/* + * nfs rmdir service + */ +nfsrv_rmdir(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, cache, len; + char *cp2; + struct mbuf *mb, *mreq; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + struct nameidata nd; + u_quad_t frev; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_srvstrsiz(len, NFS_MAXNAMLEN); + nd.ni_cnd.cn_cred = cred; + nd.ni_cnd.cn_nameiop = DELETE; + nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; + if (error = nfs_namei(&nd, fhp, len, nfsd->nd_slp, nam, &md, &dpos, + nfsd->nd_procp)) + nfsm_reply(0); + vp = nd.ni_vp; + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + /* + * No rmdir "." please. + */ + if (nd.ni_dvp == vp) { + error = EINVAL; + goto out; + } + /* + * The root of a mounted filesystem cannot be deleted. 
+ */ + if (vp->v_flag & VROOT) + error = EBUSY; +out: + if (!error) { + nqsrv_getl(nd.ni_dvp, NQL_WRITE); + nqsrv_getl(vp, NQL_WRITE); + error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vput(vp); + } + nfsm_reply(0); + nfsm_srvdone; +} + +/* + * nfs readdir service + * - mallocs what it thinks is enough to read + * count rounded up to a multiple of NFS_DIRBLKSIZ <= NFS_MAXREADDIR + * - calls VOP_READDIR() + * - loops around building the reply + * if the output generated exceeds count break out of loop + * The nfsm_clget macro is used here so that the reply will be packed + * tightly in mbuf clusters. + * - it only knows that it has encountered eof when the VOP_READDIR() + * reads nothing + * - as such one readdir rpc will return eof false although you are there + * and then the next will return eof + * - it trims out records with d_fileno == 0 + * this doesn't matter for Unix clients, but they might confuse clients + * for other os'. + * NB: It is tempting to set eof to true if the VOP_READDIR() reads less + * than requested, but this may not apply to all filesystems. For + * example, client NFS does not { although it is never remote mounted + * anyhow } + * The alternate call nqnfsrv_readdirlook() does lookups as well. + * PS: The NFS protocol spec. does not clarify what the "count" byte + * argument is a count of.. just name strings and file id's or the + * entire reply rpc or ... + * I tried just file name and id sizes and it confused the Sun client, + * so I am using the full rpc size now. The "paranoia.." comment refers + * to including the status longwords that are not a part of the dir. + * "entry" structures, but are in the rpc. 
+ */ +struct flrep { + u_long fl_cachable; + u_long fl_duration; + u_long fl_frev[2]; + nfsv2fh_t fl_nfh; + u_long fl_fattr[NFSX_NQFATTR / sizeof (u_long)]; +}; + +nfsrv_readdir(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register char *bp, *be; + register struct mbuf *mp; + register struct dirent *dp; + register caddr_t cp; + register u_long *tl; + register long t1; + caddr_t bpos; + struct mbuf *mb, *mb2, *mreq, *mp2; + char *cpos, *cend, *cp2, *rbuf; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + struct uio io; + struct iovec iv; + int len, nlen, rem, xfer, tsiz, i, error = 0; + int siz, cnt, fullsiz, eofflag, rdonly, cache; + u_quad_t frev; + u_long on, off, toff; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED); + toff = fxdr_unsigned(u_long, *tl++); + off = (toff & ~(NFS_DIRBLKSIZ-1)); + on = (toff & (NFS_DIRBLKSIZ-1)); + cnt = fxdr_unsigned(int, *tl); + siz = ((cnt+NFS_DIRBLKSIZ-1) & ~(NFS_DIRBLKSIZ-1)); + if (cnt > NFS_MAXREADDIR) + siz = NFS_MAXREADDIR; + fullsiz = siz; + if (error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) + nfsm_reply(0); + nqsrv_getl(vp, NQL_READ); + if (error = nfsrv_access(vp, VEXEC, cred, rdonly, nfsd->nd_procp)) { + vput(vp); + nfsm_reply(0); + } + VOP_UNLOCK(vp); + MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK); +again: + iv.iov_base = rbuf; + iv.iov_len = fullsiz; + io.uio_iov = &iv; + io.uio_iovcnt = 1; + io.uio_offset = (off_t)off; + io.uio_resid = fullsiz; + io.uio_segflg = UIO_SYSSPACE; + io.uio_rw = UIO_READ; + io.uio_procp = (struct proc *)0; + error = VOP_READDIR(vp, &io, cred); + off = (off_t)io.uio_offset; + if (error) { + vrele(vp); + free((caddr_t)rbuf, M_TEMP); + nfsm_reply(0); + } + if (io.uio_resid < fullsiz) + eofflag = 0; + else + eofflag = 1; + if (io.uio_resid) { + siz -= io.uio_resid; + + /* + * If nothing read, return eof + * 
rpc reply + */ + if (siz == 0) { + vrele(vp); + nfsm_reply(2*NFSX_UNSIGNED); + nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); + *tl++ = nfs_false; + *tl = nfs_true; + FREE((caddr_t)rbuf, M_TEMP); + return (0); + } + } + + /* + * Check for degenerate cases of nothing useful read. + * If so go try again + */ + cpos = rbuf + on; + cend = rbuf + siz; + dp = (struct dirent *)cpos; + while (cpos < cend && dp->d_fileno == 0) { + cpos += dp->d_reclen; + dp = (struct dirent *)cpos; + } + if (cpos >= cend) { + toff = off; + siz = fullsiz; + on = 0; + goto again; + } + + cpos = rbuf + on; + cend = rbuf + siz; + dp = (struct dirent *)cpos; + len = 3*NFSX_UNSIGNED; /* paranoia, probably can be 0 */ + nfsm_reply(siz); + mp = mp2 = mb; + bp = bpos; + be = bp + M_TRAILINGSPACE(mp); + + /* Loop through the records and build reply */ + while (cpos < cend) { + if (dp->d_fileno != 0) { + nlen = dp->d_namlen; + rem = nfsm_rndup(nlen)-nlen; + len += (4*NFSX_UNSIGNED + nlen + rem); + if (len > cnt) { + eofflag = 0; + break; + } + /* + * Build the directory record xdr from + * the dirent entry. 
+ */ + nfsm_clget; + *tl = nfs_true; + bp += NFSX_UNSIGNED; + nfsm_clget; + *tl = txdr_unsigned(dp->d_fileno); + bp += NFSX_UNSIGNED; + nfsm_clget; + *tl = txdr_unsigned(nlen); + bp += NFSX_UNSIGNED; + + /* And loop around copying the name */ + xfer = nlen; + cp = dp->d_name; + while (xfer > 0) { + nfsm_clget; + if ((bp+xfer) > be) + tsiz = be-bp; + else + tsiz = xfer; + bcopy(cp, bp, tsiz); + bp += tsiz; + xfer -= tsiz; + if (xfer > 0) + cp += tsiz; + } + /* And null pad to a long boundary */ + for (i = 0; i < rem; i++) + *bp++ = '\0'; + nfsm_clget; + + /* Finish off the record */ + toff += dp->d_reclen; + *tl = txdr_unsigned(toff); + bp += NFSX_UNSIGNED; + } else + toff += dp->d_reclen; + cpos += dp->d_reclen; + dp = (struct dirent *)cpos; + } + vrele(vp); + nfsm_clget; + *tl = nfs_false; + bp += NFSX_UNSIGNED; + nfsm_clget; + if (eofflag) + *tl = nfs_true; + else + *tl = nfs_false; + bp += NFSX_UNSIGNED; + if (mp != mb) { + if (bp < be) + mp->m_len = bp - mtod(mp, caddr_t); + } else + mp->m_len += bp - bpos; + FREE(rbuf, M_TEMP); + nfsm_srvdone; +} + +nqnfsrv_readdirlook(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register char *bp, *be; + register struct mbuf *mp; + register struct dirent *dp; + register caddr_t cp; + register u_long *tl; + register long t1; + caddr_t bpos; + struct mbuf *mb, *mb2, *mreq, *mp2; + char *cpos, *cend, *cp2, *rbuf; + struct vnode *vp, *nvp; + struct flrep fl; + nfsv2fh_t nfh; + fhandle_t *fhp; + struct uio io; + struct iovec iv; + struct vattr va, *vap = &va; + struct nfsv2_fattr *fp; + int len, nlen, rem, xfer, tsiz, i, error = 0, duration2, cache2; + int siz, cnt, fullsiz, eofflag, rdonly, cache; + u_quad_t frev, frev2; + u_long on, off, toff; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_dissect(tl, u_long *, 3*NFSX_UNSIGNED); + toff = fxdr_unsigned(u_long, *tl++); + off = (toff & ~(NFS_DIRBLKSIZ-1)); + on = 
(toff & (NFS_DIRBLKSIZ-1)); + cnt = fxdr_unsigned(int, *tl++); + duration2 = fxdr_unsigned(int, *tl); + siz = ((cnt+NFS_DIRBLKSIZ-1) & ~(NFS_DIRBLKSIZ-1)); + if (cnt > NFS_MAXREADDIR) + siz = NFS_MAXREADDIR; + fullsiz = siz; + if (error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) + nfsm_reply(0); + nqsrv_getl(vp, NQL_READ); + if (error = nfsrv_access(vp, VEXEC, cred, rdonly, nfsd->nd_procp)) { + vput(vp); + nfsm_reply(0); + } + VOP_UNLOCK(vp); + MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK); +again: + iv.iov_base = rbuf; + iv.iov_len = fullsiz; + io.uio_iov = &iv; + io.uio_iovcnt = 1; + io.uio_offset = (off_t)off; + io.uio_resid = fullsiz; + io.uio_segflg = UIO_SYSSPACE; + io.uio_rw = UIO_READ; + io.uio_procp = (struct proc *)0; + error = VOP_READDIR(vp, &io, cred); + off = (u_long)io.uio_offset; + if (error) { + vrele(vp); + free((caddr_t)rbuf, M_TEMP); + nfsm_reply(0); + } + if (io.uio_resid < fullsiz) + eofflag = 0; + else + eofflag = 1; + if (io.uio_resid) { + siz -= io.uio_resid; + + /* + * If nothing read, return eof + * rpc reply + */ + if (siz == 0) { + vrele(vp); + nfsm_reply(2 * NFSX_UNSIGNED); + nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); + *tl++ = nfs_false; + *tl = nfs_true; + FREE((caddr_t)rbuf, M_TEMP); + return (0); + } + } + + /* + * Check for degenerate cases of nothing useful read. 
+ * If so go try again + */ + cpos = rbuf + on; + cend = rbuf + siz; + dp = (struct dirent *)cpos; + while (cpos < cend && dp->d_fileno == 0) { + cpos += dp->d_reclen; + dp = (struct dirent *)cpos; + } + if (cpos >= cend) { + toff = off; + siz = fullsiz; + on = 0; + goto again; + } + + cpos = rbuf + on; + cend = rbuf + siz; + dp = (struct dirent *)cpos; + len = 3 * NFSX_UNSIGNED; /* paranoia, probably can be 0 */ + nfsm_reply(siz); + mp = mp2 = mb; + bp = bpos; + be = bp + M_TRAILINGSPACE(mp); + + /* Loop through the records and build reply */ + while (cpos < cend) { + if (dp->d_fileno != 0) { + nlen = dp->d_namlen; + rem = nfsm_rndup(nlen)-nlen; + + /* + * For readdir_and_lookup get the vnode using + * the file number. + */ + if (VFS_VGET(vp->v_mount, dp->d_fileno, &nvp)) + goto invalid; + bzero((caddr_t)&fl.fl_nfh, sizeof (nfsv2fh_t)); + fl.fl_nfh.fh_generic.fh_fsid = + nvp->v_mount->mnt_stat.f_fsid; + if (VFS_VPTOFH(nvp, &fl.fl_nfh.fh_generic.fh_fid)) { + vput(nvp); + goto invalid; + } + if (duration2) { + (void) nqsrv_getlease(nvp, &duration2, NQL_READ, + nfsd, nam, &cache2, &frev2, cred); + fl.fl_duration = txdr_unsigned(duration2); + fl.fl_cachable = txdr_unsigned(cache2); + txdr_hyper(&frev2, fl.fl_frev); + } else + fl.fl_duration = 0; + if (VOP_GETATTR(nvp, vap, cred, nfsd->nd_procp)) { + vput(nvp); + goto invalid; + } + vput(nvp); + fp = (struct nfsv2_fattr *)&fl.fl_fattr; + nfsm_srvfillattr; + len += (4*NFSX_UNSIGNED + nlen + rem + NFSX_FH + + NFSX_NQFATTR); + if (len > cnt) { + eofflag = 0; + break; + } + /* + * Build the directory record xdr from + * the dirent entry. + */ + nfsm_clget; + *tl = nfs_true; + bp += NFSX_UNSIGNED; + + /* + * For readdir_and_lookup copy the stuff out. 
+ */ + xfer = sizeof (struct flrep); + cp = (caddr_t)&fl; + while (xfer > 0) { + nfsm_clget; + if ((bp+xfer) > be) + tsiz = be-bp; + else + tsiz = xfer; + bcopy(cp, bp, tsiz); + bp += tsiz; + xfer -= tsiz; + if (xfer > 0) + cp += tsiz; + } + nfsm_clget; + *tl = txdr_unsigned(dp->d_fileno); + bp += NFSX_UNSIGNED; + nfsm_clget; + *tl = txdr_unsigned(nlen); + bp += NFSX_UNSIGNED; + + /* And loop around copying the name */ + xfer = nlen; + cp = dp->d_name; + while (xfer > 0) { + nfsm_clget; + if ((bp+xfer) > be) + tsiz = be-bp; + else + tsiz = xfer; + bcopy(cp, bp, tsiz); + bp += tsiz; + xfer -= tsiz; + if (xfer > 0) + cp += tsiz; + } + /* And null pad to a long boundary */ + for (i = 0; i < rem; i++) + *bp++ = '\0'; + nfsm_clget; + + /* Finish off the record */ + toff += dp->d_reclen; + *tl = txdr_unsigned(toff); + bp += NFSX_UNSIGNED; + } else +invalid: + toff += dp->d_reclen; + cpos += dp->d_reclen; + dp = (struct dirent *)cpos; + } + vrele(vp); + nfsm_clget; + *tl = nfs_false; + bp += NFSX_UNSIGNED; + nfsm_clget; + if (eofflag) + *tl = nfs_true; + else + *tl = nfs_false; + bp += NFSX_UNSIGNED; + if (mp != mb) { + if (bp < be) + mp->m_len = bp - mtod(mp, caddr_t); + } else + mp->m_len += bp - bpos; + FREE(rbuf, M_TEMP); + nfsm_srvdone; +} + +/* + * nfs statfs service + */ +nfsrv_statfs(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register struct statfs *sf; + register struct nfsv2_statfs *sfp; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, rdonly, cache, isnq; + char *cp2; + struct mbuf *mb, *mb2, *mreq; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + struct statfs statfs; + u_quad_t frev; + + fhp = &nfh.fh_generic; + isnq = (nfsd->nd_nqlflag != NQL_NOVAL); + nfsm_srvmtofh(fhp); + if (error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) + nfsm_reply(0); + sf = &statfs; + error = VFS_STATFS(vp->v_mount, 
sf, nfsd->nd_procp); + vput(vp); + nfsm_reply(NFSX_STATFS(isnq)); + nfsm_build(sfp, struct nfsv2_statfs *, NFSX_STATFS(isnq)); + sfp->sf_tsize = txdr_unsigned(NFS_MAXDGRAMDATA); + sfp->sf_bsize = txdr_unsigned(sf->f_bsize); + sfp->sf_blocks = txdr_unsigned(sf->f_blocks); + sfp->sf_bfree = txdr_unsigned(sf->f_bfree); + sfp->sf_bavail = txdr_unsigned(sf->f_bavail); + if (isnq) { + sfp->sf_files = txdr_unsigned(sf->f_files); + sfp->sf_ffree = txdr_unsigned(sf->f_ffree); + } + nfsm_srvdone; +} + +/* + * Null operation, used by clients to ping server + */ +/* ARGSUSED */ +nfsrv_null(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + caddr_t bpos; + int error = VNOVAL, cache; + struct mbuf *mb, *mreq; + u_quad_t frev; + + nfsm_reply(0); + return (error); +} + +/* + * No operation, used for obsolete procedures + */ +/* ARGSUSED */ +nfsrv_noop(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + caddr_t bpos; + int error, cache; + struct mbuf *mb, *mreq; + u_quad_t frev; + + if (nfsd->nd_repstat) + error = nfsd->nd_repstat; + else + error = EPROCUNAVAIL; + nfsm_reply(0); + return (error); +} + +/* + * Perform access checking for vnodes obtained from file handles that would + * refer to files already opened by a Unix client. You cannot just use + * vn_writechk() and VOP_ACCESS() for two reasons. + * 1 - You must check for exported rdonly as well as MNT_RDONLY for the write case + * 2 - The owner is to be given access irrespective of mode bits so that + * processes that chmod after opening a file don't break. I don't like + * this because it opens a security hole, but since the nfs server opens + * a security hole the size of a barn door anyhow, what the heck. 
+ */ +nfsrv_access(vp, flags, cred, rdonly, p) + register struct vnode *vp; + int flags; + register struct ucred *cred; + int rdonly; + struct proc *p; +{ + struct vattr vattr; + int error; + if (flags & VWRITE) { + /* Just vn_writechk() changed to check rdonly */ + /* + * Disallow write attempts on read-only file systems; + * unless the file is a socket or a block or character + * device resident on the file system. + */ + if (rdonly || (vp->v_mount->mnt_flag & MNT_RDONLY)) { + switch (vp->v_type) { + case VREG: case VDIR: case VLNK: + return (EROFS); + } + } + /* + * If there's shared text associated with + * the inode, try to free it up once. If + * we fail, we can't allow writing. + */ + if ((vp->v_flag & VTEXT) && !vnode_pager_uncache(vp)) + return (ETXTBSY); + } + if (error = VOP_GETATTR(vp, &vattr, cred, p)) + return (error); + if ((error = VOP_ACCESS(vp, flags, cred, p)) && + cred->cr_uid != vattr.va_uid) + return (error); + return (0); +} diff --git a/sys/nfs/nfs_socket.c b/sys/nfs/nfs_socket.c new file mode 100644 index 00000000000..cf88ed33d92 --- /dev/null +++ b/sys/nfs/nfs_socket.c @@ -0,0 +1,1990 @@ +/* + * Copyright (c) 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs_socket.c 8.3 (Berkeley) 1/12/94 + */ + +/* + * Socket operations for use by nfs + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TRUE 1 +#define FALSE 0 + +/* + * Estimate rto for an nfs rpc sent via. an unreliable datagram. + * Use the mean and mean deviation of rtt for the appropriate type of rpc + * for the frequent rpcs and a default for the others. + * The justification for doing "other" this way is that these rpcs + * happen so infrequently that timer est. would probably be stale. 
+ * Also, since many of these rpcs are + * non-idempotent, a conservative timeout is desired. + * getattr, lookup - A+2D + * read, write - A+4D + * other - nm_timeo + */ +#define NFS_RTO(n, t) \ + ((t) == 0 ? (n)->nm_timeo : \ + ((t) < 3 ? \ + (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \ + ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1))) +#define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1] +#define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1] +/* + * External data, mostly RPC constants in XDR form + */ +extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix, + rpc_msgaccepted, rpc_call, rpc_autherr, rpc_rejectedcred, + rpc_auth_kerb; +extern u_long nfs_prog, nfs_vers, nqnfs_prog, nqnfs_vers; +extern time_t nqnfsstarttime; +extern int nonidempotent[NFS_NPROCS]; + +/* + * Maps errno values to nfs error numbers. + * Use NFSERR_IO as the catch all for ones not specifically defined in + * RFC 1094. + */ +static int nfsrv_errmap[ELAST] = { + NFSERR_PERM, NFSERR_NOENT, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_NXIO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_EXIST, NFSERR_IO, NFSERR_NODEV, NFSERR_NOTDIR, + NFSERR_ISDIR, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_IO, NFSERR_ROFS, + NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_IO, NFSERR_NAMETOL, NFSERR_IO, NFSERR_IO, + NFSERR_NOTEMPTY, NFSERR_IO, NFSERR_IO, NFSERR_DQUOT, NFSERR_STALE, + NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + 
NFSERR_IO, +}; + +/* + * Defines which timer to use for the procnum. + * 0 - default + * 1 - getattr + * 2 - lookup + * 3 - read + * 4 - write + */ +static int proct[NFS_NPROCS] = { + 0, 1, 0, 0, 2, 3, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 0, 0, 0, +}; + +/* + * There is a congestion window for outstanding rpcs maintained per mount + * point. The cwnd size is adjusted in roughly the way that: + * Van Jacobson, Congestion avoidance and Control, In "Proceedings of + * SIGCOMM '88". ACM, August 1988. + * describes for TCP. The cwnd size is chopped in half on a retransmit timeout + * and incremented by 1/cwnd when each rpc reply is received and a full cwnd + * of rpcs is in progress. + * (The sent count and cwnd are scaled for integer arith.) + * Variants of "slow start" were tried and were found to be too much of a + * performance hit (ave. rtt 3 times larger), + * I suspect due to the large rtt that nfs rpcs have. + */ +#define NFS_CWNDSCALE 256 +#define NFS_MAXCWND (NFS_CWNDSCALE * 32) +static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, }; +int nfs_sbwait(); +void nfs_disconnect(), nfs_realign(), nfsrv_wakenfsd(), nfs_sndunlock(); +void nfs_rcvunlock(), nqnfs_serverd(), nqnfs_clientlease(); +struct mbuf *nfsm_rpchead(); +int nfsrtton = 0; +struct nfsrtt nfsrtt; +struct nfsd nfsd_head; + +int nfsrv_null(), + nfsrv_getattr(), + nfsrv_setattr(), + nfsrv_lookup(), + nfsrv_readlink(), + nfsrv_read(), + nfsrv_write(), + nfsrv_create(), + nfsrv_remove(), + nfsrv_rename(), + nfsrv_link(), + nfsrv_symlink(), + nfsrv_mkdir(), + nfsrv_rmdir(), + nfsrv_readdir(), + nfsrv_statfs(), + nfsrv_noop(), + nqnfsrv_readdirlook(), + nqnfsrv_getlease(), + nqnfsrv_vacated(), + nqnfsrv_access(); + +int (*nfsrv_procs[NFS_NPROCS])() = { + nfsrv_null, + nfsrv_getattr, + nfsrv_setattr, + nfsrv_noop, + nfsrv_lookup, + nfsrv_readlink, + nfsrv_read, + nfsrv_noop, + nfsrv_write, + nfsrv_create, + nfsrv_remove, + nfsrv_rename, + nfsrv_link, + nfsrv_symlink, + nfsrv_mkdir, + 
nfsrv_rmdir, + nfsrv_readdir, + nfsrv_statfs, + nqnfsrv_readdirlook, + nqnfsrv_getlease, + nqnfsrv_vacated, + nfsrv_noop, + nqnfsrv_access, +}; + +struct nfsreq nfsreqh; + +/* + * Initialize sockets and congestion for a new NFS connection. + * We do not free the sockaddr if error. + */ +nfs_connect(nmp, rep) + register struct nfsmount *nmp; + struct nfsreq *rep; +{ + register struct socket *so; + int s, error, rcvreserve, sndreserve; + struct sockaddr *saddr; + struct sockaddr_in *sin; + struct mbuf *m; + u_short tport; + + nmp->nm_so = (struct socket *)0; + saddr = mtod(nmp->nm_nam, struct sockaddr *); + if (error = socreate(saddr->sa_family, + &nmp->nm_so, nmp->nm_sotype, nmp->nm_soproto)) + goto bad; + so = nmp->nm_so; + nmp->nm_soflags = so->so_proto->pr_flags; + + /* + * Some servers require that the client port be a reserved port number. + */ + if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) { + MGET(m, M_WAIT, MT_SONAME); + sin = mtod(m, struct sockaddr_in *); + sin->sin_len = m->m_len = sizeof (struct sockaddr_in); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = INADDR_ANY; + tport = IPPORT_RESERVED - 1; + sin->sin_port = htons(tport); + while ((error = sobind(so, m)) == EADDRINUSE && + --tport > IPPORT_RESERVED / 2) + sin->sin_port = htons(tport); + m_freem(m); + if (error) + goto bad; + } + + /* + * Protocols that do not require connections may be optionally left + * unconnected for servers that reply from a port other than NFS_PORT. + */ + if (nmp->nm_flag & NFSMNT_NOCONN) { + if (nmp->nm_soflags & PR_CONNREQUIRED) { + error = ENOTCONN; + goto bad; + } + } else { + if (error = soconnect(so, nmp->nm_nam)) + goto bad; + + /* + * Wait for the connection to complete. Cribbed from the + * connect system call but with the wait timing out so + * that interruptible mounts don't hang here for a long time. 
+ */ + s = splnet(); + while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { + (void) tsleep((caddr_t)&so->so_timeo, PSOCK, + "nfscon", 2 * hz); + if ((so->so_state & SS_ISCONNECTING) && + so->so_error == 0 && rep && + (error = nfs_sigintr(nmp, rep, rep->r_procp))) { + so->so_state &= ~SS_ISCONNECTING; + splx(s); + goto bad; + } + } + if (so->so_error) { + error = so->so_error; + so->so_error = 0; + splx(s); + goto bad; + } + splx(s); + } + if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) { + so->so_rcv.sb_timeo = (5 * hz); + so->so_snd.sb_timeo = (5 * hz); + } else { + so->so_rcv.sb_timeo = 0; + so->so_snd.sb_timeo = 0; + } + if (nmp->nm_sotype == SOCK_DGRAM) { + sndreserve = nmp->nm_wsize + NFS_MAXPKTHDR; + rcvreserve = nmp->nm_rsize + NFS_MAXPKTHDR; + } else if (nmp->nm_sotype == SOCK_SEQPACKET) { + sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 2; + rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) * 2; + } else { + if (nmp->nm_sotype != SOCK_STREAM) + panic("nfscon sotype"); + if (so->so_proto->pr_flags & PR_CONNREQUIRED) { + MGET(m, M_WAIT, MT_SOOPTS); + *mtod(m, int *) = 1; + m->m_len = sizeof(int); + sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m); + } + if (so->so_proto->pr_protocol == IPPROTO_TCP) { + MGET(m, M_WAIT, MT_SOOPTS); + *mtod(m, int *) = 1; + m->m_len = sizeof(int); + sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m); + } + sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) + * 2; + rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) + * 2; + } + if (error = soreserve(so, sndreserve, rcvreserve)) + goto bad; + so->so_rcv.sb_flags |= SB_NOINTR; + so->so_snd.sb_flags |= SB_NOINTR; + + /* Initialize other non-zero congestion variables */ + nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] = nmp->nm_srtt[3] = + nmp->nm_srtt[4] = (NFS_TIMEO << 3); + nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] = + nmp->nm_sdrtt[3] = nmp->nm_sdrtt[4] = 0; + nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */ + nmp->nm_sent = 0; + 
nmp->nm_timeouts = 0; + return (0); + +bad: + nfs_disconnect(nmp); + return (error); +} + +/* + * Reconnect routine: + * Called when a connection is broken on a reliable protocol. + * - clean up the old socket + * - nfs_connect() again + * - set R_MUSTRESEND for all outstanding requests on mount point + * If this fails the mount point is DEAD! + * nb: Must be called with the nfs_sndlock() set on the mount point. + */ +nfs_reconnect(rep) + register struct nfsreq *rep; +{ + register struct nfsreq *rp; + register struct nfsmount *nmp = rep->r_nmp; + int error; + + nfs_disconnect(nmp); + while (error = nfs_connect(nmp, rep)) { + if (error == EINTR || error == ERESTART) + return (EINTR); + (void) tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0); + } + + /* + * Loop through outstanding request list and fix up all requests + * on old socket. + */ + rp = nfsreqh.r_next; + while (rp != &nfsreqh) { + if (rp->r_nmp == nmp) + rp->r_flags |= R_MUSTRESEND; + rp = rp->r_next; + } + return (0); +} + +/* + * NFS disconnect. Clean up and unlink. + */ +void +nfs_disconnect(nmp) + register struct nfsmount *nmp; +{ + register struct socket *so; + + if (nmp->nm_so) { + so = nmp->nm_so; + nmp->nm_so = (struct socket *)0; + soshutdown(so, 2); + soclose(so); + } +} + +/* + * This is the nfs send routine. For connection based socket types, it + * must be called with an nfs_sndlock() on the socket. + * "rep == NULL" indicates that it has been called from a server. + * For the client side: + * - return EINTR if the RPC is terminated, 0 otherwise + * - set R_MUSTRESEND if the send fails for any reason + * - do any cleanup required by recoverable socket errors (???) + * For the server side: + * - return EINTR or ERESTART if interrupted by a signal + * - return EPIPE if a connection is lost for connection based sockets (TCP...) + * - do any cleanup required by recoverable socket errors (???) 
+ */ +nfs_send(so, nam, top, rep) + register struct socket *so; + struct mbuf *nam; + register struct mbuf *top; + struct nfsreq *rep; +{ + struct mbuf *sendnam; + int error, soflags, flags; + + if (rep) { + if (rep->r_flags & R_SOFTTERM) { + m_freem(top); + return (EINTR); + } + if ((so = rep->r_nmp->nm_so) == NULL) { + rep->r_flags |= R_MUSTRESEND; + m_freem(top); + return (0); + } + rep->r_flags &= ~R_MUSTRESEND; + soflags = rep->r_nmp->nm_soflags; + } else + soflags = so->so_proto->pr_flags; + if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED)) + sendnam = (struct mbuf *)0; + else + sendnam = nam; + if (so->so_type == SOCK_SEQPACKET) + flags = MSG_EOR; + else + flags = 0; + + error = sosend(so, sendnam, (struct uio *)0, top, + (struct mbuf *)0, flags); + if (error) { + if (rep) { + log(LOG_INFO, "nfs send error %d for server %s\n",error, + rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + /* + * Deal with errors for the client side. + */ + if (rep->r_flags & R_SOFTTERM) + error = EINTR; + else + rep->r_flags |= R_MUSTRESEND; + } else + log(LOG_INFO, "nfsd send error %d\n", error); + + /* + * Handle any recoverable (soft) socket errors here. (???) + */ + if (error != EINTR && error != ERESTART && + error != EWOULDBLOCK && error != EPIPE) + error = 0; + } + return (error); +} + +/* + * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all + * done by soreceive(), but for SOCK_STREAM we must deal with the Record + * Mark and consolidate the data into a new mbuf list. + * nb: Sometimes TCP passes the data up to soreceive() in long lists of + * small mbufs. + * For SOCK_STREAM we must be very careful to read an entire record once + * we have read any of it, even if the system call has been interrupted. 
+ */ +nfs_receive(rep, aname, mp) + register struct nfsreq *rep; + struct mbuf **aname; + struct mbuf **mp; +{ + register struct socket *so; + struct uio auio; + struct iovec aio; + register struct mbuf *m; + struct mbuf *control; + u_long len; + struct mbuf **getnam; + int error, sotype, rcvflg; + struct proc *p = curproc; /* XXX */ + + /* + * Set up arguments for soreceive() + */ + *mp = (struct mbuf *)0; + *aname = (struct mbuf *)0; + sotype = rep->r_nmp->nm_sotype; + + /* + * For reliable protocols, lock against other senders/receivers + * in case a reconnect is necessary. + * For SOCK_STREAM, first get the Record Mark to find out how much + * more there is to get. + * We must lock the socket against other receivers + * until we have an entire rpc request/reply. + */ + if (sotype != SOCK_DGRAM) { + if (error = nfs_sndlock(&rep->r_nmp->nm_flag, rep)) + return (error); +tryagain: + /* + * Check for fatal errors and resending request. + */ + /* + * Ugh: If a reconnect attempt just happened, nm_so + * would have changed. NULL indicates a failed + * attempt that has essentially shut down this + * mount point. 
+ */ + if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) { + nfs_sndunlock(&rep->r_nmp->nm_flag); + return (EINTR); + } + if ((so = rep->r_nmp->nm_so) == NULL) { + if (error = nfs_reconnect(rep)) { + nfs_sndunlock(&rep->r_nmp->nm_flag); + return (error); + } + goto tryagain; + } + while (rep->r_flags & R_MUSTRESEND) { + m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT); + nfsstats.rpcretries++; + if (error = nfs_send(so, rep->r_nmp->nm_nam, m, rep)) { + if (error == EINTR || error == ERESTART || + (error = nfs_reconnect(rep))) { + nfs_sndunlock(&rep->r_nmp->nm_flag); + return (error); + } + goto tryagain; + } + } + nfs_sndunlock(&rep->r_nmp->nm_flag); + if (sotype == SOCK_STREAM) { + aio.iov_base = (caddr_t) &len; + aio.iov_len = sizeof(u_long); + auio.uio_iov = &aio; + auio.uio_iovcnt = 1; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_offset = 0; + auio.uio_resid = sizeof(u_long); + auio.uio_procp = p; + do { + rcvflg = MSG_WAITALL; + error = soreceive(so, (struct mbuf **)0, &auio, + (struct mbuf **)0, (struct mbuf **)0, &rcvflg); + if (error == EWOULDBLOCK && rep) { + if (rep->r_flags & R_SOFTTERM) + return (EINTR); + } + } while (error == EWOULDBLOCK); + if (!error && auio.uio_resid > 0) { + log(LOG_INFO, + "short receive (%d/%d) from nfs server %s\n", + sizeof(u_long) - auio.uio_resid, + sizeof(u_long), + rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + error = EPIPE; + } + if (error) + goto errout; + len = ntohl(len) & ~0x80000000; + /* + * This is SERIOUS! We are out of sync with the sender + * and forcing a disconnect/reconnect is all I can do. 
+ */ + if (len > NFS_MAXPACKET) { + log(LOG_ERR, "%s (%d) from nfs server %s\n", + "impossible packet length", + len, + rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + error = EFBIG; + goto errout; + } + auio.uio_resid = len; + do { + rcvflg = MSG_WAITALL; + error = soreceive(so, (struct mbuf **)0, + &auio, mp, (struct mbuf **)0, &rcvflg); + } while (error == EWOULDBLOCK || error == EINTR || + error == ERESTART); + if (!error && auio.uio_resid > 0) { + log(LOG_INFO, + "short receive (%d/%d) from nfs server %s\n", + len - auio.uio_resid, len, + rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + error = EPIPE; + } + } else { + /* + * NB: Since uio_resid is big, MSG_WAITALL is ignored + * and soreceive() will return when it has either a + * control msg or a data msg. + * We have no use for control msg., but must grab them + * and then throw them away so we know what is going + * on. + */ + auio.uio_resid = len = 100000000; /* Anything Big */ + auio.uio_procp = p; + do { + rcvflg = 0; + error = soreceive(so, (struct mbuf **)0, + &auio, mp, &control, &rcvflg); + if (control) + m_freem(control); + if (error == EWOULDBLOCK && rep) { + if (rep->r_flags & R_SOFTTERM) + return (EINTR); + } + } while (error == EWOULDBLOCK || + (!error && *mp == NULL && control)); + if ((rcvflg & MSG_EOR) == 0) + printf("Egad!!\n"); + if (!error && *mp == NULL) + error = EPIPE; + len -= auio.uio_resid; + } +errout: + if (error && error != EINTR && error != ERESTART) { + m_freem(*mp); + *mp = (struct mbuf *)0; + if (error != EPIPE) + log(LOG_INFO, + "receive error %d from nfs server %s\n", + error, + rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + error = nfs_sndlock(&rep->r_nmp->nm_flag, rep); + if (!error) + error = nfs_reconnect(rep); + if (!error) + goto tryagain; + } + } else { + if ((so = rep->r_nmp->nm_so) == NULL) + return (EACCES); + if (so->so_state & SS_ISCONNECTED) + getnam = (struct mbuf **)0; + else + getnam = aname; + auio.uio_resid = len = 1000000; + auio.uio_procp = p; + do 
{ + rcvflg = 0; + error = soreceive(so, getnam, &auio, mp, + (struct mbuf **)0, &rcvflg); + if (error == EWOULDBLOCK && + (rep->r_flags & R_SOFTTERM)) + return (EINTR); + } while (error == EWOULDBLOCK); + len -= auio.uio_resid; + } + if (error) { + m_freem(*mp); + *mp = (struct mbuf *)0; + } + /* + * Search for any mbufs that are not a multiple of 4 bytes long + * or with m_data not longword aligned. + * These could cause pointer alignment problems, so copy them to + * well aligned mbufs. + */ + nfs_realign(*mp, 5 * NFSX_UNSIGNED); + return (error); +} + +/* + * Implement receipt of reply on a socket. + * We must search through the list of received datagrams matching them + * with outstanding requests using the xid, until ours is found. + */ +/* ARGSUSED */ +nfs_reply(myrep) + struct nfsreq *myrep; +{ + register struct nfsreq *rep; + register struct nfsmount *nmp = myrep->r_nmp; + register long t1; + struct mbuf *mrep, *nam, *md; + u_long rxid, *tl; + caddr_t dpos, cp2; + int error; + + /* + * Loop around until we get our own reply + */ + for (;;) { + /* + * Lock against other receivers so that I don't get stuck in + * sbwait() after someone else has received my reply for me. + * Also necessary for connection based protocols to avoid + * race conditions during a reconnect. + */ + if (error = nfs_rcvlock(myrep)) + return (error); + /* Already received, bye bye */ + if (myrep->r_mrep != NULL) { + nfs_rcvunlock(&nmp->nm_flag); + return (0); + } + /* + * Get the next Rpc reply off the socket + */ + error = nfs_receive(myrep, &nam, &mrep); + nfs_rcvunlock(&nmp->nm_flag); + if (error) { + + /* + * Ignore routing errors on connectionless protocols?? 
+ */ + if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) { + nmp->nm_so->so_error = 0; + if (myrep->r_flags & R_GETONEREP) + return (0); + continue; + } + return (error); + } + if (nam) + m_freem(nam); + + /* + * Get the xid and check that it is an rpc reply + */ + md = mrep; + dpos = mtod(md, caddr_t); + nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED); + rxid = *tl++; + if (*tl != rpc_reply) { + if (nmp->nm_flag & NFSMNT_NQNFS) { + if (nqnfs_callback(nmp, mrep, md, dpos)) + nfsstats.rpcinvalid++; + } else { + nfsstats.rpcinvalid++; + m_freem(mrep); + } +nfsmout: + if (myrep->r_flags & R_GETONEREP) + return (0); + continue; + } + + /* + * Loop through the request list to match up the reply + * Iff no match, just drop the datagram + */ + rep = nfsreqh.r_next; + while (rep != &nfsreqh) { + if (rep->r_mrep == NULL && rxid == rep->r_xid) { + /* Found it.. */ + rep->r_mrep = mrep; + rep->r_md = md; + rep->r_dpos = dpos; + if (nfsrtton) { + struct rttl *rt; + + rt = &nfsrtt.rttl[nfsrtt.pos]; + rt->proc = rep->r_procnum; + rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]); + rt->sent = nmp->nm_sent; + rt->cwnd = nmp->nm_cwnd; + rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1]; + rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1]; + rt->fsid = nmp->nm_mountp->mnt_stat.f_fsid; + rt->tstamp = time; + if (rep->r_flags & R_TIMING) + rt->rtt = rep->r_rtt; + else + rt->rtt = 1000000; + nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ; + } + /* + * Update congestion window. + * Do the additive increase of + * one rpc/rtt. + */ + if (nmp->nm_cwnd <= nmp->nm_sent) { + nmp->nm_cwnd += + (NFS_CWNDSCALE * NFS_CWNDSCALE + + (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd; + if (nmp->nm_cwnd > NFS_MAXCWND) + nmp->nm_cwnd = NFS_MAXCWND; + } + rep->r_flags &= ~R_SENT; + nmp->nm_sent -= NFS_CWNDSCALE; + /* + * Update rtt using a gain of 0.125 on the mean + * and a gain of 0.25 on the deviation. 
+				 */
+				if (rep->r_flags & R_TIMING) {
+					/*
+					 * Since the timer resolution of
+					 * NFS_HZ is so course, it can often
+					 * result in r_rtt == 0. Since
+					 * r_rtt == N means that the actual
+					 * rtt is between N+dt and N+2-dt ticks,
+					 * add 1.
+					 */
+					t1 = rep->r_rtt + 1;
+					t1 -= (NFS_SRTT(rep) >> 3);
+					NFS_SRTT(rep) += t1;
+					if (t1 < 0)
+						t1 = -t1;
+					t1 -= (NFS_SDRTT(rep) >> 2);
+					NFS_SDRTT(rep) += t1;
+				}
+				nmp->nm_timeouts = 0;
+				break;
+			}
+			rep = rep->r_next;
+		}
+		/*
+		 * If not matched to a request, drop it.
+		 * If it's mine, get out.
+		 */
+		if (rep == &nfsreqh) {
+			nfsstats.rpcunexpected++;
+			m_freem(mrep);
+		} else if (rep == myrep) {
+			if (rep->r_mrep == NULL)
+				panic("nfsreply nil");
+			return (0);
+		}
+		if (myrep->r_flags & R_GETONEREP)
+			return (0);
+	}
+}
+
+/*
+ * nfs_request - goes something like this
+ *	- fill in request struct
+ *	- links it into list
+ *	- calls nfs_send() for first transmit
+ *	- calls nfs_reply() to wait for the reply
+ *	- break down rpc header and return with nfs reply pointed to
+ *	  by mrep or error
+ * nb: always frees up mreq mbuf list
+ *
+ * Arguments:
+ *	vp	vnode the request is for; selects the mount (VFSTONFS)
+ *	mrest	mbuf chain holding the procedure arguments
+ *	procnum	RPC procedure number
+ *	procp	requesting process (recorded in the request, may be
+ *		used for signal checks and tprintf messages)
+ *	cred	credentials used to build the RPC auth header
+ *	mrp, mdp, dposp
+ *		on success receive the reply chain, the current mbuf
+ *		and the current parse position within it
+ *
+ * Returns 0 on success, otherwise an errno (or the NFS error from the
+ * reply).  On every return path the request structure and the request
+ * mbuf chain have been released.
+ */
+nfs_request(vp, mrest, procnum, procp, cred, mrp, mdp, dposp)
+	struct vnode *vp;
+	struct mbuf *mrest;
+	int procnum;
+	struct proc *procp;
+	struct ucred *cred;
+	struct mbuf **mrp;
+	struct mbuf **mdp;
+	caddr_t *dposp;
+{
+	register struct mbuf *m, *mrep;
+	register struct nfsreq *rep;
+	register u_long *tl;
+	register int i;
+	struct nfsmount *nmp;
+	struct mbuf *md, *mheadend;
+	struct nfsreq *reph;
+	struct nfsnode *np;
+	time_t reqtime, waituntil;
+	caddr_t dpos, cp2;
+	int t1, nqlflag, cachable, s, error = 0, mrest_len, auth_len, auth_type;
+	int trylater_delay = NQ_TRYLATERDEL, trylater_cnt = 0, failed_auth = 0;
+	u_long xid;
+	u_quad_t frev;
+	char *auth_str;
+
+	nmp = VFSTONFS(vp->v_mount);
+	MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
+	rep->r_nmp = nmp;
+	rep->r_vp = vp;
+	rep->r_procp = procp;
+	rep->r_procnum = procnum;
+	/* Total length of the argument chain, needed for the RPC header. */
+	i = 0;
+	m = mrest;
+	while (m) {
+		i += m->m_len;
+		m = m->m_next;
+	}
+	mrest_len = i;
+
+	/*
+	 * Get the RPC header with authorization.
+	 * For Kerberos mounts the first attempt goes out with AUTH_UNIX
+	 * sized credentials; nfs_getauth() is only consulted after the
+	 * server has rejected the credential once (failed_auth != 0).
+	 */
+kerbauth:
+	auth_str = (char *)0;
+	if (nmp->nm_flag & NFSMNT_KERB) {
+		if (failed_auth) {
+			error = nfs_getauth(nmp, rep, cred, &auth_type,
+				&auth_str, &auth_len);
+			if (error) {
+				free((caddr_t)rep, M_NFSREQ);
+				m_freem(mrest);
+				return (error);
+			}
+		} else {
+			auth_type = RPCAUTH_UNIX;
+			auth_len = 5 * NFSX_UNSIGNED;
+		}
+	} else {
+		auth_type = RPCAUTH_UNIX;
+		if (cred->cr_ngroups < 1)
+			panic("nfsreq nogrps");
+		auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
+			nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
+			5 * NFSX_UNSIGNED;
+	}
+	m = nfsm_rpchead(cred, (nmp->nm_flag & NFSMNT_NQNFS), procnum,
+	     auth_type, auth_len, auth_str, mrest, mrest_len, &mheadend, &xid);
+	if (auth_str)
+		free(auth_str, M_TEMP);
+
+	/*
+	 * For stream protocols, insert a Sun RPC Record Mark.
+	 */
+	if (nmp->nm_sotype == SOCK_STREAM) {
+		M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
+		*mtod(m, u_long *) = htonl(0x80000000 |
+			 (m->m_pkthdr.len - NFSX_UNSIGNED));
+	}
+	rep->r_mreq = m;
+	rep->r_xid = xid;
+tryagain:
+	if (nmp->nm_flag & NFSMNT_SOFT)
+		rep->r_retry = nmp->nm_retry;
+	else
+		rep->r_retry = NFS_MAXREXMIT + 1;	/* past clip limit */
+	rep->r_rtt = rep->r_rexmit = 0;
+	if (proct[procnum] > 0)
+		rep->r_flags = R_TIMING;
+	else
+		rep->r_flags = 0;
+	rep->r_mrep = NULL;
+
+	/*
+	 * Do the client side RPC.
+	 */
+	nfsstats.rpcrequests++;
+	/*
+	 * Chain request into list of outstanding requests. Be sure
+	 * to put it LAST so timer finds oldest requests first.
+	 */
+	s = splsoftclock();
+	reph = &nfsreqh;
+	reph->r_prev->r_next = rep;
+	rep->r_prev = reph->r_prev;
+	reph->r_prev = rep;
+	rep->r_next = reph;
+
+	/* Get send time for nqnfs */
+	reqtime = time.tv_sec;
+
+	/*
+	 * If backing off another request or avoiding congestion, don't
+	 * send this one now but let timer do it. If not timing a request,
+	 * do it now.
+	 */
+	if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
+		(nmp->nm_flag & NFSMNT_DUMBTIMR) ||
+		nmp->nm_sent < nmp->nm_cwnd)) {
+		splx(s);
+		if (nmp->nm_soflags & PR_CONNREQUIRED)
+			error = nfs_sndlock(&nmp->nm_flag, rep);
+		if (!error) {
+			/*
+			 * Always copy from the retained original
+			 * (rep->r_mreq).  "m" must not be used as the
+			 * source: after a first pass it is the copy that
+			 * was already handed to nfs_send(), so a
+			 * "goto tryagain" would copy from a chain this
+			 * function no longer owns.
+			 */
+			m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT);
+			error = nfs_send(nmp->nm_so, nmp->nm_nam, m, rep);
+			if (nmp->nm_soflags & PR_CONNREQUIRED)
+				nfs_sndunlock(&nmp->nm_flag);
+		}
+		if (!error && (rep->r_flags & R_MUSTRESEND) == 0) {
+			nmp->nm_sent += NFS_CWNDSCALE;
+			rep->r_flags |= R_SENT;
+		}
+	} else {
+		splx(s);
+		rep->r_rtt = -1;	/* not sent yet; nfs_timer() will send */
+	}
+
+	/*
+	 * Wait for the reply from our send or the timer's.
+	 */
+	if (!error || error == EPIPE)
+		error = nfs_reply(rep);
+
+	/*
+	 * RPC done, unlink the request.
+	 */
+	s = splsoftclock();
+	rep->r_prev->r_next = rep->r_next;
+	rep->r_next->r_prev = rep->r_prev;
+	splx(s);
+
+	/*
+	 * Decrement the outstanding request count.
+	 */
+	if (rep->r_flags & R_SENT) {
+		rep->r_flags &= ~R_SENT;	/* paranoia */
+		nmp->nm_sent -= NFS_CWNDSCALE;
+	}
+
+	/*
+	 * If there was a successful reply and a tprintf msg.
+	 * tprintf a response.
+	 */
+	if (!error && (rep->r_flags & R_TPRINTFMSG))
+		nfs_msg(rep->r_procp, nmp->nm_mountp->mnt_stat.f_mntfromname,
+		    "is alive again");
+	mrep = rep->r_mrep;
+	md = rep->r_md;
+	dpos = rep->r_dpos;
+	if (error) {
+		m_freem(rep->r_mreq);
+		free((caddr_t)rep, M_NFSREQ);
+		return (error);
+	}
+
+	/*
+	 * break down the rpc header and check if ok
+	 * (the nfsm_dissect/nfsm_adv macros bail out to nfsmout when the
+	 * reply is too short to parse)
+	 */
+	nfsm_dissect(tl, u_long *, 3*NFSX_UNSIGNED);
+	if (*tl++ == rpc_msgdenied) {
+		if (*tl == rpc_mismatch)
+			error = EOPNOTSUPP;
+		else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
+			if (*tl == rpc_rejectedcred && failed_auth == 0) {
+				/*
+				 * Retry once with real Kerberos credentials.
+				 * Detach the argument chain from the old
+				 * header first so that freeing the old
+				 * request does not free mrest as well.
+				 */
+				failed_auth++;
+				mheadend->m_next = (struct mbuf *)0;
+				m_freem(mrep);
+				m_freem(rep->r_mreq);
+				goto kerbauth;
+			} else
+				error = EAUTH;
+		} else
+			error = EACCES;
+		m_freem(mrep);
+		m_freem(rep->r_mreq);
+		free((caddr_t)rep, M_NFSREQ);
+		return (error);
+	}
+
+	/*
+	 * skip over the auth_verf, someday we may want to cache auth_short's
+	 * for nfs_reqhead(), but for now just dump it
+	 */
+	if (*++tl != 0) {
+		i = nfsm_rndup(fxdr_unsigned(long, *tl));
+		nfsm_adv(i);
+	}
+	nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
+	/* 0 == ok */
+	if (*tl == 0) {
+		nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
+		if (*tl != 0) {
+			error = fxdr_unsigned(int, *tl);
+			m_freem(mrep);
+			if ((nmp->nm_flag & NFSMNT_NQNFS) &&
+			    error == NQNFS_TRYLATER) {
+				/*
+				 * Server asked us to come back later:
+				 * sleep, grow the delay and resend the
+				 * same request.
+				 */
+				error = 0;
+				waituntil = time.tv_sec + trylater_delay;
+				while (time.tv_sec < waituntil)
+					(void) tsleep((caddr_t)&lbolt,
+						PSOCK, "nqnfstry", 0);
+				trylater_delay *= nfs_backoff[trylater_cnt];
+				if (trylater_cnt < 7)
+					trylater_cnt++;
+				goto tryagain;
+			}
+
+			/*
+			 * If the File Handle was stale, invalidate the
+			 * lookup cache, just in case.
+			 */
+			if (error == ESTALE)
+				cache_purge(vp);
+			m_freem(rep->r_mreq);
+			free((caddr_t)rep, M_NFSREQ);
+			return (error);
+		}
+
+		/*
+		 * For nqnfs, get any lease in reply
+		 */
+		if (nmp->nm_flag & NFSMNT_NQNFS) {
+			nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
+			if (*tl) {
+				np = VTONFS(vp);
+				nqlflag = fxdr_unsigned(int, *tl);
+				nfsm_dissect(tl, u_long *, 4*NFSX_UNSIGNED);
+				cachable = fxdr_unsigned(int, *tl++);
+				reqtime += fxdr_unsigned(int, *tl++);
+				if (reqtime > time.tv_sec) {
+				    fxdr_hyper(tl, &frev);
+				    nqnfs_clientlease(nmp, np, nqlflag,
+					cachable, reqtime, frev);
+				}
+			}
+		}
+		*mrp = mrep;
+		*mdp = md;
+		*dposp = dpos;
+		m_freem(rep->r_mreq);
+		FREE((caddr_t)rep, M_NFSREQ);
+		return (0);
+	}
+	m_freem(mrep);
+	error = EPROTONOSUPPORT;
+nfsmout:
+	/*
+	 * Common exit for the "accepted but not ok" case above and for
+	 * every parse failure in the nfsm_* macros.  The request and its
+	 * mbuf chain are still owned here and must be released, otherwise
+	 * each truncated reply leaks them.
+	 */
+	m_freem(rep->r_mreq);
+	free((caddr_t)rep, M_NFSREQ);
+	return (error);
+}
+
+/*
+ * Generate the rpc reply header
+ * siz arg. is used to decide if adding a cluster is worthwhile
+ *
+ * err selects the kind of reply that is built:
+ *	ERPCMISMATCH, NQNFS_AUTHERR	-> rpc_msgdenied
+ *	EPROGUNAVAIL, EPROGMISMATCH, EPROCUNAVAIL
+ *					-> rpc_msgaccepted + RPC level error
+ *	VNOVAL				-> accepted, no nfs status word
+ *	anything else			-> accepted, followed by the nfs
+ *					   status (0 or nfsrv_errmap[err - 1])
+ * On return *mrq is the head of the reply, *mbp the mbuf being filled
+ * and *bposp the next free byte in it.  Always returns 0.
+ */
+nfs_rephead(siz, nd, err, cache, frev, mrq, mbp, bposp)
+	int siz;
+	struct nfsd *nd;
+	int err;
+	int cache;
+	u_quad_t *frev;
+	struct mbuf **mrq;
+	struct mbuf **mbp;
+	caddr_t *bposp;
+{
+	register u_long *tl;
+	register struct mbuf *mreq;
+	caddr_t bpos;
+	struct mbuf *mb, *mb2;
+
+	MGETHDR(mreq, M_WAIT, MT_DATA);
+	mb = mreq;
+	/*
+	 * If this is a big reply, use a cluster else
+	 * try and leave leading space for the lower level headers.
+	 */
+	siz += RPC_REPLYSIZ;
+	if (siz >= MINCLSIZE) {
+		MCLGET(mreq, M_WAIT);
+	} else
+		mreq->m_data += max_hdr;
+	tl = mtod(mreq, u_long *);
+	mreq->m_len = 6*NFSX_UNSIGNED;
+	bpos = ((caddr_t)tl)+mreq->m_len;
+	*tl++ = nd->nd_retxid;
+	*tl++ = rpc_reply;
+	if (err == ERPCMISMATCH || err == NQNFS_AUTHERR) {
+		*tl++ = rpc_msgdenied;
+		if (err == NQNFS_AUTHERR) {
+			/* auth error reply is one word shorter */
+			*tl++ = rpc_autherr;
+			*tl = rpc_rejectedcred;
+			mreq->m_len -= NFSX_UNSIGNED;
+			bpos -= NFSX_UNSIGNED;
+		} else {
+			*tl++ = rpc_mismatch;
+			*tl++ = txdr_unsigned(2);
+			*tl = txdr_unsigned(2);
+		}
+	} else {
+		*tl++ = rpc_msgaccepted;
+		*tl++ = 0;
+		*tl++ = 0;
+		switch (err) {
+		case EPROGUNAVAIL:
+			*tl = txdr_unsigned(RPC_PROGUNAVAIL);
+			break;
+		case EPROGMISMATCH:
+			*tl = txdr_unsigned(RPC_PROGMISMATCH);
+			nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED);
+			*tl++ = txdr_unsigned(2);
+			*tl = txdr_unsigned(2);	/* someday 3 */
+			break;
+		case EPROCUNAVAIL:
+			*tl = txdr_unsigned(RPC_PROCUNAVAIL);
+			break;
+		default:
+			*tl = 0;
+			if (err != VNOVAL) {
+				nfsm_build(tl, u_long *, NFSX_UNSIGNED);
+				if (err)
+					*tl = txdr_unsigned(nfsrv_errmap[err - 1]);
+				else
+					*tl = 0;
+			}
+			break;
+		};
+	}
+
+	/*
+	 * For nqnfs, piggyback lease as requested.
+	 */
+	if (nd->nd_nqlflag != NQL_NOVAL && err == 0) {
+		if (nd->nd_nqlflag) {
+			nfsm_build(tl, u_long *, 5*NFSX_UNSIGNED);
+			*tl++ = txdr_unsigned(nd->nd_nqlflag);
+			*tl++ = txdr_unsigned(cache);
+			*tl++ = txdr_unsigned(nd->nd_duration);
+			txdr_hyper(frev, tl);
+		} else {
+			/* sanity check only: nd_nqlflag is 0 in this branch */
+			if (nd->nd_nqlflag != 0)
+				panic("nqreph");
+			nfsm_build(tl, u_long *, NFSX_UNSIGNED);
+			*tl = 0;
+		}
+	}
+	*mrq = mreq;
+	*mbp = mb;
+	*bposp = bpos;
+	if (err != 0 && err != VNOVAL)
+		nfsstats.srvrpc_errs++;
+	return (0);
+}
+
+/*
+ * Nfs timer routine
+ * Scan the nfsreq list and retransmit any requests that have timed out
+ * To avoid retransmission attempts on STREAM sockets (in the future) make
+ * sure to set the r_retry field to 0 (implies nm_retry == 0).
+ *
+ * The arg parameter is not used.  The routine re-arms itself with
+ * timeout() every hz / NFS_HZ ticks, and once a second it also calls
+ * nqnfs_serverd().
+ */
+void
+nfs_timer(arg)
+	void *arg;
+{
+	register struct nfsreq *rep;
+	register struct mbuf *m;
+	register struct socket *so;
+	register struct nfsmount *nmp;
+	register int timeo;
+	static long lasttime = 0;
+	int s, error;
+
+	s = splnet();
+	for (rep = nfsreqh.r_next; rep != &nfsreqh; rep = rep->r_next) {
+		nmp = rep->r_nmp;
+		/* Skip requests that already have a reply or were terminated. */
+		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
+			continue;
+		if (nfs_sigintr(nmp, rep, rep->r_procp)) {
+			rep->r_flags |= R_SOFTTERM;
+			continue;
+		}
+		/* r_rtt < 0 marks a request that still has to be (re)sent. */
+		if (rep->r_rtt >= 0) {
+			rep->r_rtt++;
+			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
+				timeo = nmp->nm_timeo;
+			else
+				timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
+			if (nmp->nm_timeouts > 0)
+				timeo *= nfs_backoff[nmp->nm_timeouts - 1];
+			if (rep->r_rtt <= timeo)
+				continue;
+			/* nm_timeouts indexes nfs_backoff[], so clip it at 8 */
+			if (nmp->nm_timeouts < 8)
+				nmp->nm_timeouts++;
+		}
+		/*
+		 * Check for server not responding
+		 */
+		if ((rep->r_flags & R_TPRINTFMSG) == 0 &&
+		     rep->r_rexmit > nmp->nm_deadthresh) {
+			nfs_msg(rep->r_procp,
+			    nmp->nm_mountp->mnt_stat.f_mntfromname,
+			    "not responding");
+			rep->r_flags |= R_TPRINTFMSG;
+		}
+		if (rep->r_rexmit >= rep->r_retry) {	/* too many */
+			nfsstats.rpctimeouts++;
+			rep->r_flags |= R_SOFTTERM;
+			continue;
+		}
+		/* Non-datagram sockets: only count the timeout, never resend. */
+		if (nmp->nm_sotype != SOCK_DGRAM) {
+			if (++rep->r_rexmit > NFS_MAXREXMIT)
+				rep->r_rexmit = NFS_MAXREXMIT;
+			continue;
+		}
+		if ((so = nmp->nm_so) == NULL)
+			continue;
+
+		/*
+		 * If there is enough space and the window allows..
+		 *	Resend it
+		 * Set r_rtt to -1 in case we fail to send it now.
+		 */
+		rep->r_rtt = -1;
+		if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
+		   ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
+		    (rep->r_flags & R_SENT) ||
+		    nmp->nm_sent < nmp->nm_cwnd) &&
+		   (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){
+			/* NFSMNT_NOCONN mounts pass the server address on each send */
+			if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
+			    error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, m,
+			    (struct mbuf *)0, (struct mbuf *)0);
+			else
+			    error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, m,
+			    nmp->nm_nam, (struct mbuf *)0);
+			if (error) {
+				if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
+					so->so_error = 0;
+			} else {
+				/*
+				 * Iff first send, start timing
+				 * else turn timing off, backoff timer
+				 * and divide congestion window by 2.
+				 */
+				if (rep->r_flags & R_SENT) {
+					rep->r_flags &= ~R_TIMING;
+					if (++rep->r_rexmit > NFS_MAXREXMIT)
+						rep->r_rexmit = NFS_MAXREXMIT;
+					nmp->nm_cwnd >>= 1;
+					if (nmp->nm_cwnd < NFS_CWNDSCALE)
+						nmp->nm_cwnd = NFS_CWNDSCALE;
+					nfsstats.rpcretries++;
+				} else {
+					rep->r_flags |= R_SENT;
+					nmp->nm_sent += NFS_CWNDSCALE;
+				}
+				rep->r_rtt = 0;
+			}
+		}
+	}
+
+	/*
+	 * Call the nqnfs server timer once a second to handle leases.
+	 */
+	if (lasttime != time.tv_sec) {
+		lasttime = time.tv_sec;
+		nqnfs_serverd();
+	}
+	splx(s);
+	timeout(nfs_timer, (void *)0, hz / NFS_HZ);
+}
+
+/*
+ * Test for a termination condition pending on the process.
+ * This is used for NFSMNT_INT mounts.
+ *
+ * Returns EINTR when the request (if any) is already marked R_SOFTTERM,
+ * or when the mount is NFSMNT_INT and process p has a pending signal in
+ * NFSINT_SIGMASK that is neither masked nor ignored; otherwise 0.
+ * rep and p may each be null; nmp is always dereferenced.
+ */
+nfs_sigintr(nmp, rep, p)
+	struct nfsmount *nmp;
+	struct nfsreq *rep;
+	register struct proc *p;
+{
+
+	if (rep && (rep->r_flags & R_SOFTTERM))
+		return (EINTR);
+	if (!(nmp->nm_flag & NFSMNT_INT))
+		return (0);
+	if (p && p->p_siglist &&
+	    (((p->p_siglist & ~p->p_sigmask) & ~p->p_sigignore) &
+	    NFSINT_SIGMASK))
+		return (EINTR);
+	return (0);
+}
+
+/*
+ * Lock a socket against others.
+ * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
+ * and also to avoid race conditions between the processes with nfs requests
+ * in progress when a reconnect is necessary.
+ *
+ * flagp points at the flag word holding the NFSMNT_SNDLOCK and
+ * NFSMNT_WANTSND bits.  rep may be null, in which case the wait cannot
+ * be interrupted by a signal.  Returns 0 with the lock held, or EINTR.
+ */
+nfs_sndlock(flagp, rep)
+	register int *flagp;
+	struct nfsreq *rep;
+{
+	struct proc *p;
+	int slpflag = 0, slptimeo = 0;
+
+	if (rep) {
+		p = rep->r_procp;
+		if (rep->r_nmp->nm_flag & NFSMNT_INT)
+			slpflag = PCATCH;
+	} else
+		p = (struct proc *)0;
+	while (*flagp & NFSMNT_SNDLOCK) {
+		/*
+		 * Only consult nfs_sigintr() when there is a request:
+		 * rep->r_nmp must not be dereferenced for a null rep.
+		 */
+		if (rep && nfs_sigintr(rep->r_nmp, rep, p))
+			return (EINTR);
+		*flagp |= NFSMNT_WANTSND;
+		(void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsndlck",
+			slptimeo);
+		/*
+		 * After the first catchable sleep fall back to a timed,
+		 * uncatchable sleep; the signal is then noticed by the
+		 * nfs_sigintr() check on each pass round the loop.
+		 */
+		if (slpflag == PCATCH) {
+			slpflag = 0;
+			slptimeo = 2 * hz;
+		}
+	}
+	*flagp |= NFSMNT_SNDLOCK;
+	return (0);
+}
+
+/*
+ * Unlock the stream socket for others.
+ * Panics if the send lock is not held; wakes any waiter that set
+ * NFSMNT_WANTSND.
+ */
+void
+nfs_sndunlock(flagp)
+	register int *flagp;
+{
+
+	if ((*flagp & NFSMNT_SNDLOCK) == 0)
+		panic("nfs sndunlock");
+	*flagp &= ~NFSMNT_SNDLOCK;
+	if (*flagp & NFSMNT_WANTSND) {
+		*flagp &= ~NFSMNT_WANTSND;
+		wakeup((caddr_t)flagp);
+	}
+}
+
+/*
+ * Receive-side counterpart of nfs_sndlock(): take NFSMNT_RCVLOCK in the
+ * mount flags of rep's mount.  rep must not be null here.
+ * Returns 0 with the lock held, or EINTR.
+ */
+nfs_rcvlock(rep)
+	register struct nfsreq *rep;
+{
+	register int *flagp = &rep->r_nmp->nm_flag;
+	int slpflag, slptimeo = 0;
+
+	if (*flagp & NFSMNT_INT)
+		slpflag = PCATCH;
+	else
+		slpflag = 0;
+	while (*flagp & NFSMNT_RCVLOCK) {
+		if (nfs_sigintr(rep->r_nmp, rep, rep->r_procp))
+			return (EINTR);
+		*flagp |= NFSMNT_WANTRCV;
+		(void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsrcvlk",
+			slptimeo);
+		if (slpflag == PCATCH) {
+			slpflag = 0;
+			slptimeo = 2 * hz;
+		}
+	}
+	*flagp |= NFSMNT_RCVLOCK;
+	return (0);
+}
+
+/*
+ * Unlock the stream socket for others.
+ * Panics if the receive lock is not held; wakes any waiter that set
+ * NFSMNT_WANTRCV.
+ */
+void
+nfs_rcvunlock(flagp)
+	register int *flagp;
+{
+
+	if ((*flagp & NFSMNT_RCVLOCK) == 0)
+		panic("nfs rcvunlock");
+	*flagp &= ~NFSMNT_RCVLOCK;
+	if (*flagp & NFSMNT_WANTRCV) {
+		*flagp &= ~NFSMNT_WANTRCV;
+		wakeup((caddr_t)flagp);
+	}
+}
+
+/*
+ * Check for badly aligned mbuf data areas and
+ * realign data in an mbuf list by copying the data areas up, as required.
+ *
+ * m is the head of the chain and hsiz the size of the leading header
+ * part that should, if possible, sit alone in the first rewritten mbuf.
+ * Mbufs are left untouched up to the first one whose length or data
+ * address is not a multiple of 4; from there to the end of the chain the
+ * data is repacked in place and the routine returns.
+ */
+void
+nfs_realign(m, hsiz)
+	register struct mbuf *m;
+	int hsiz;
+{
+	register struct mbuf *m2;
+	register int siz, mlen, olen;
+	register caddr_t tcp, fcp;
+	struct mbuf *mnew;
+
+	while (m) {
+		/*
+		 * This never happens for UDP, rarely happens for TCP
+		 * but frequently happens for iso transport.
+		 */
+		if ((m->m_len & 0x3) || (mtod(m, int) & 0x3)) {
+			/* olen/fcp: bytes left in, and read pointer of, the source mbuf */
+			olen = m->m_len;
+			fcp = mtod(m, caddr_t);
+			if ((int)fcp & 0x3) {
+				m->m_flags &= ~M_PKTHDR;
+				if (m->m_flags & M_EXT)
+					m->m_data = m->m_ext.ext_buf +
+						((m->m_ext.ext_size - olen) & ~0x3);
+				else
+					m->m_data = m->m_dat;
+			}
+			/* mnew/tcp/mlen: mbuf being filled, write pointer, room left */
+			m->m_len = 0;
+			tcp = mtod(m, caddr_t);
+			mnew = m;
+			m2 = m->m_next;
+
+			/*
+			 * If possible, only put the first invariant part
+			 * of the RPC header in the first mbuf.
+			 */
+			mlen = M_TRAILINGSPACE(m);
+			if (olen <= hsiz && mlen > hsiz)
+				mlen = hsiz;
+
+			/*
+			 * Loop through the mbuf list consolidating data.
+			 */
+			while (m) {
+				while (olen > 0) {
+					if (mlen == 0) {
+						/* destination full: start filling the next mbuf */
+						m2->m_flags &= ~M_PKTHDR;
+						if (m2->m_flags & M_EXT)
+							m2->m_data = m2->m_ext.ext_buf;
+						else
+							m2->m_data = m2->m_dat;
+						m2->m_len = 0;
+						mlen = M_TRAILINGSPACE(m2);
+						tcp = mtod(m2, caddr_t);
+						mnew = m2;
+						m2 = m2->m_next;
+					}
+					siz = min(mlen, olen);
+					if (tcp != fcp)
+						bcopy(fcp, tcp, siz);
+					mnew->m_len += siz;
+					mlen -= siz;
+					olen -= siz;
+					tcp += siz;
+					fcp += siz;
+				}
+				m = m->m_next;
+				if (m) {
+					olen = m->m_len;
+					fcp = mtod(m, caddr_t);
+				}
+			}
+
+			/*
+			 * Finally, set m_len == 0 for any trailing mbufs that have
+			 * been copied out of.
+			 */
+			while (m2) {
+				m2->m_len = 0;
+				m2 = m2->m_next;
+			}
+			return;
+		}
+		m = m->m_next;
+	}
+}
+
+/*
+ * Socket upcall routine for the nfsd sockets.
+ * The caddr_t arg is a pointer to the "struct nfssvc_sock".
+ * Essentially do as much as possible non-blocking, else punt and it will
+ * be called with M_WAIT from an nfsd.
+ *
+ * Stream sockets: received data is appended to the raw list
+ * (ns_raw/ns_rawend, byte count in ns_cc) and nfsrv_getstream() cuts it
+ * into records.  Other sockets: each received message is queued as one
+ * record on ns_rec/ns_recend, with the sender's address mbuf (if any)
+ * linked in front of it.
+ * When called with M_DONTWAIT and there is work or a pending
+ * SLP_NEEDQ/SLP_DISCONN condition, an nfsd is woken.
+ */
+void
+nfsrv_rcv(so, arg, waitflag)
+	struct socket *so;
+	caddr_t arg;
+	int waitflag;
+{
+	register struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
+	register struct mbuf *m;
+	struct mbuf *mp, *nam;
+	struct uio auio;
+	int flags, error;
+
+	if ((slp->ns_flag & SLP_VALID) == 0)
+		return;
+#ifdef notdef
+	/*
+	 * Define this to test for nfsds handling this under heavy load.
+	 */
+	if (waitflag == M_DONTWAIT) {
+		slp->ns_flag |= SLP_NEEDQ; goto dorecs;
+	}
+#endif
+	auio.uio_procp = NULL;
+	if (so->so_type == SOCK_STREAM) {
+		/*
+		 * If there are already records on the queue, defer soreceive()
+		 * to an nfsd so that there is feedback to the TCP layer that
+		 * the nfs servers are heavily loaded.
+		 */
+		if (slp->ns_rec && waitflag == M_DONTWAIT) {
+			slp->ns_flag |= SLP_NEEDQ;
+			goto dorecs;
+		}
+
+		/*
+		 * Do soreceive().
+		 */
+		auio.uio_resid = 1000000000;
+		flags = MSG_DONTWAIT;
+		error = soreceive(so, &nam, &auio, &mp, (struct mbuf **)0, &flags);
+		if (error || mp == (struct mbuf *)0) {
+			/* anything but "no data yet" is treated as a disconnect */
+			if (error == EWOULDBLOCK)
+				slp->ns_flag |= SLP_NEEDQ;
+			else
+				slp->ns_flag |= SLP_DISCONN;
+			goto dorecs;
+		}
+		/* bytes received == requested resid minus what is left over */
+		m = mp;
+		if (slp->ns_rawend) {
+			slp->ns_rawend->m_next = m;
+			slp->ns_cc += 1000000000 - auio.uio_resid;
+		} else {
+			slp->ns_raw = m;
+			slp->ns_cc = 1000000000 - auio.uio_resid;
+		}
+		while (m->m_next)
+			m = m->m_next;
+		slp->ns_rawend = m;
+
+		/*
+		 * Now try and parse record(s) out of the raw stream data.
+		 */
+		if (error = nfsrv_getstream(slp, waitflag)) {
+			if (error == EPERM)
+				slp->ns_flag |= SLP_DISCONN;
+			else
+				slp->ns_flag |= SLP_NEEDQ;
+		}
+	} else {
+		/* drain the socket: one soreceive() per queued message */
+		do {
+			auio.uio_resid = 1000000000;
+			flags = MSG_DONTWAIT;
+			error = soreceive(so, &nam, &auio, &mp,
+						(struct mbuf **)0, &flags);
+			if (mp) {
+				nfs_realign(mp, 10 * NFSX_UNSIGNED);
+				if (nam) {
+					m = nam;
+					m->m_next = mp;
+				} else
+					m = mp;
+				if (slp->ns_recend)
+					slp->ns_recend->m_nextpkt = m;
+				else
+					slp->ns_rec = m;
+				slp->ns_recend = m;
+				m->m_nextpkt = (struct mbuf *)0;
+			}
+			if (error) {
+				if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
+					&& error != EWOULDBLOCK) {
+					slp->ns_flag |= SLP_DISCONN;
+					goto dorecs;
+				}
+			}
+		} while (mp);
+	}
+
+	/*
+	 * Now try and process the request records, non-blocking.
+	 */
+dorecs:
+	if (waitflag == M_DONTWAIT &&
+		(slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN))))
+		nfsrv_wakenfsd(slp);
+}
+
+/*
+ * Try and extract an RPC request from the mbuf data list received on a
+ * stream socket. The "waitflag" argument indicates whether or not it
+ * can sleep.
+ *
+ * Loops, moving every complete record from the raw list (ns_raw) to the
+ * record queue (ns_rec).  Returns:
+ *	0		no further complete record is available yet
+ *	EPERM		record length outside NFS_MINPACKET..NFS_MAXPACKET
+ *	EWOULDBLOCK	m_copym() could not get an mbuf to split a record
+ * SLP_GETSTREAM guards against re-entry and is cleared on every return.
+ */
+nfsrv_getstream(slp, waitflag)
+	register struct nfssvc_sock *slp;
+	int waitflag;
+{
+	register struct mbuf *m;
+	register char *cp1, *cp2;
+	register int len;
+	struct mbuf *om, *m2, *recm;
+	u_long recmark;
+
+	if (slp->ns_flag & SLP_GETSTREAM)
+		panic("nfs getstream");
+	slp->ns_flag |= SLP_GETSTREAM;
+	for (;;) {
+		/* ns_reclen == 0: the next thing in the stream is a record mark */
+		if (slp->ns_reclen == 0) {
+			if (slp->ns_cc < NFSX_UNSIGNED) {
+				slp->ns_flag &= ~SLP_GETSTREAM;
+				return (0);
+			}
+			m = slp->ns_raw;
+			if (m->m_len >= NFSX_UNSIGNED) {
+				bcopy(mtod(m, caddr_t), (caddr_t)&recmark, NFSX_UNSIGNED);
+				m->m_data += NFSX_UNSIGNED;
+				m->m_len -= NFSX_UNSIGNED;
+			} else {
+				/* mark straddles mbufs: gather it a byte at a time */
+				cp1 = (caddr_t)&recmark;
+				cp2 = mtod(m, caddr_t);
+				while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
+					while (m->m_len == 0) {
+						m = m->m_next;
+						cp2 = mtod(m, caddr_t);
+					}
+					*cp1++ = *cp2++;
+					m->m_data++;
+					m->m_len--;
+				}
+			}
+			slp->ns_cc -= NFSX_UNSIGNED;
+			/* mask off the top bit of the mark to get the length */
+			slp->ns_reclen = ntohl(recmark) & ~0x80000000;
+			if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) {
+				slp->ns_flag &= ~SLP_GETSTREAM;
+				return (EPERM);
+			}
+		}
+
+		/*
+		 * Now get the record part.
+		 */
+		if (slp->ns_cc == slp->ns_reclen) {
+			/* the raw list is exactly one record: take it whole */
+			recm = slp->ns_raw;
+			slp->ns_raw = slp->ns_rawend = (struct mbuf *)0;
+			slp->ns_cc = slp->ns_reclen = 0;
+		} else if (slp->ns_cc > slp->ns_reclen) {
+			/* more than one record buffered: cut the first one off */
+			len = 0;
+			m = slp->ns_raw;
+			om = (struct mbuf *)0;
+			while (len < slp->ns_reclen) {
+				if ((len + m->m_len) > slp->ns_reclen) {
+					/* record ends inside m: copy its head, trim m */
+					m2 = m_copym(m, 0, slp->ns_reclen - len,
+						waitflag);
+					if (m2) {
+						if (om) {
+							om->m_next = m2;
+							recm = slp->ns_raw;
+						} else
+							recm = m2;
+						m->m_data += slp->ns_reclen - len;
+						m->m_len -= slp->ns_reclen - len;
+						len = slp->ns_reclen;
+					} else {
+						slp->ns_flag &= ~SLP_GETSTREAM;
+						return (EWOULDBLOCK);
+					}
+				} else if ((len + m->m_len) == slp->ns_reclen) {
+					/* record ends exactly at the end of m */
+					om = m;
+					len += m->m_len;
+					m = m->m_next;
+					recm = slp->ns_raw;
+					om->m_next = (struct mbuf *)0;
+				} else {
+					om = m;
+					len += m->m_len;
+					m = m->m_next;
+				}
+			}
+			slp->ns_raw = m;
+			slp->ns_cc -= len;
+			slp->ns_reclen = 0;
+		} else {
+			/* record not complete yet */
+			slp->ns_flag &= ~SLP_GETSTREAM;
+			return (0);
+		}
+		nfs_realign(recm, 10 * NFSX_UNSIGNED);
+		if (slp->ns_recend)
+			slp->ns_recend->m_nextpkt = recm;
+		else
+			slp->ns_rec = recm;
+		slp->ns_recend = recm;
+	}
+}
+
+/*
+ * Parse an RPC header.
+ *
+ * Dequeues the first record from slp->ns_rec, splits off a leading
+ * MT_SONAME mbuf (if present) into nd->nd_nam and hands the rest to
+ * nfs_getreq().  Returns ENOBUFS when the socket is not valid or no
+ * record is queued, otherwise the result of nfs_getreq().
+ */
+nfsrv_dorec(slp, nd)
+	register struct nfssvc_sock *slp;
+	register struct nfsd *nd;
+{
+	register struct mbuf *m;
+	int error;
+
+	if ((slp->ns_flag & SLP_VALID) == 0 ||
+	    (m = slp->ns_rec) == (struct mbuf *)0)
+		return (ENOBUFS);
+	if (slp->ns_rec = m->m_nextpkt)
+		m->m_nextpkt = (struct mbuf *)0;
+	else
+		slp->ns_recend = (struct mbuf *)0;
+	if (m->m_type == MT_SONAME) {
+		nd->nd_nam = m;
+		nd->nd_md = nd->nd_mrep = m->m_next;
+		m->m_next = (struct mbuf *)0;
+	} else {
+		nd->nd_nam = (struct mbuf *)0;
+		nd->nd_md = nd->nd_mrep = m;
+	}
+	nd->nd_dpos = mtod(nd->nd_md, caddr_t);
+	if (error = nfs_getreq(nd, TRUE)) {
+		m_freem(nd->nd_nam);
+		return (error);
+	}
+	return (0);
+}
+
+/*
+ * Parse an RPC request
+ * - verify it
+ * - fill in the cred struct.
+ *
+ * has_header is non-zero when the message still starts with the xid and
+ * message type words.
+ * Returns:
+ *	0	  request parsed, or an RPC level problem was found; in the
+ *		  latter case nd_repstat holds the error and nd_procnum is
+ *		  set to NFSPROC_NOOP
+ *	EBADRPC	  malformed request; the request chain has been freed
+ *	other	  error from the nfsm_* parsing macros (via nfsmout)
+ */
+nfs_getreq(nd, has_header)
+	register struct nfsd *nd;
+	int has_header;
+{
+	register int len, i;
+	register u_long *tl;
+	register long t1;
+	struct uio uio;
+	struct iovec iov;
+	caddr_t dpos, cp2;
+	u_long nfsvers, auth_type;
+	int error = 0, nqnfs = 0;
+	struct mbuf *mrep, *md;
+
+	mrep = nd->nd_mrep;
+	md = nd->nd_md;
+	dpos = nd->nd_dpos;
+	if (has_header) {
+		nfsm_dissect(tl, u_long *, 10*NFSX_UNSIGNED);
+		nd->nd_retxid = *tl++;
+		if (*tl++ != rpc_call) {
+			m_freem(mrep);
+			return (EBADRPC);
+		}
+	} else {
+		nfsm_dissect(tl, u_long *, 8*NFSX_UNSIGNED);
+	}
+	nd->nd_repstat = 0;
+	if (*tl++ != rpc_vers) {
+		nd->nd_repstat = ERPCMISMATCH;
+		nd->nd_procnum = NFSPROC_NOOP;
+		return (0);
+	}
+	/* accept either the nfs or the nqnfs program number */
+	nfsvers = nfs_vers;
+	if (*tl != nfs_prog) {
+		if (*tl == nqnfs_prog) {
+			nqnfs++;
+			nfsvers = nqnfs_vers;
+		} else {
+			nd->nd_repstat = EPROGUNAVAIL;
+			nd->nd_procnum = NFSPROC_NOOP;
+			return (0);
+		}
+	}
+	tl++;
+	if (*tl++ != nfsvers) {
+		nd->nd_repstat = EPROGMISMATCH;
+		nd->nd_procnum = NFSPROC_NOOP;
+		return (0);
+	}
+	nd->nd_procnum = fxdr_unsigned(u_long, *tl++);
+	/* the null procedure returns before any credential is parsed */
+	if (nd->nd_procnum == NFSPROC_NULL)
+		return (0);
+	if (nd->nd_procnum >= NFS_NPROCS ||
+		(!nqnfs && nd->nd_procnum > NFSPROC_STATFS) ||
+		(*tl != rpc_auth_unix && *tl != rpc_auth_kerb)) {
+		nd->nd_repstat = EPROCUNAVAIL;
+		nd->nd_procnum = NFSPROC_NOOP;
+		return (0);
+	}
+	auth_type = *tl++;
+	len = fxdr_unsigned(int, *tl++);
+	if (len < 0 || len > RPCAUTH_MAXSIZ) {
+		m_freem(mrep);
+		return (EBADRPC);
+	}
+
+	/*
+	 * Handle auth_unix or auth_kerb.
+	 */
+	if (auth_type == rpc_auth_unix) {
+		/* skip the next word, then the counted string that follows it */
+		len = fxdr_unsigned(int, *++tl);
+		if (len < 0 || len > NFS_MAXNAMLEN) {
+			m_freem(mrep);
+			return (EBADRPC);
+		}
+		nfsm_adv(nfsm_rndup(len));
+		nfsm_dissect(tl, u_long *, 3*NFSX_UNSIGNED);
+		nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
+		nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
+		len = fxdr_unsigned(int, *tl);
+		if (len < 0 || len > RPCAUTH_UNIXGIDS) {
+			m_freem(mrep);
+			return (EBADRPC);
+		}
+		nfsm_dissect(tl, u_long *, (len + 2)*NFSX_UNSIGNED);
+		/* gids fill cr_groups[1..]; entries beyond NGROUPS are dropped */
+		for (i = 1; i <= len; i++)
+			if (i < NGROUPS)
+				nd->nd_cr.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
+			else
+				tl++;
+		nd->nd_cr.cr_ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
+	} else if (auth_type == rpc_auth_kerb) {
+		nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
+		nd->nd_authlen = fxdr_unsigned(int, *tl);
+		uio.uio_resid = nfsm_rndup(nd->nd_authlen);
+		/* the auth string must fit inside the credential body */
+		if (uio.uio_resid > (len - 2 * NFSX_UNSIGNED)) {
+			m_freem(mrep);
+			return (EBADRPC);
+		}
+		uio.uio_offset = 0;
+		uio.uio_iov = &iov;
+		uio.uio_iovcnt = 1;
+		uio.uio_segflg = UIO_SYSSPACE;
+		iov.iov_base = (caddr_t)nd->nd_authstr;
+		iov.iov_len = RPCAUTH_MAXSIZ;
+		nfsm_mtouio(&uio, uio.uio_resid);
+		nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED);
+		nd->nd_flag |= NFSD_NEEDAUTH;
+	}
+
+	/*
+	 * Do we have any use for the verifier.
+	 * According to the "Remote Procedure Call Protocol Spec." it
+	 * should be AUTH_NULL, but some clients make it AUTH_UNIX?
+	 * For now, just skip over it
+	 */
+	len = fxdr_unsigned(int, *++tl);
+	if (len < 0 || len > RPCAUTH_MAXSIZ) {
+		m_freem(mrep);
+		return (EBADRPC);
+	}
+	if (len > 0) {
+		nfsm_adv(nfsm_rndup(len));
+	}
+
+	/*
+	 * For nqnfs, get piggybacked lease request.
+	 */
+	if (nqnfs && nd->nd_procnum != NQNFSPROC_EVICTED) {
+		nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
+		nd->nd_nqlflag = fxdr_unsigned(int, *tl);
+		if (nd->nd_nqlflag) {
+			nfsm_dissect(tl, u_long *, NFSX_UNSIGNED);
+			nd->nd_duration = fxdr_unsigned(int, *tl);
+		} else
+			nd->nd_duration = NQ_MINLEASE;
+	} else {
+		nd->nd_nqlflag = NQL_NOVAL;
+		nd->nd_duration = NQ_MINLEASE;
+	}
+	/* hand the parse position back for the procedure arguments */
+	nd->nd_md = md;
+	nd->nd_dpos = dpos;
+	return (0);
+nfsmout:
+	return (error);
+}
+
+/*
+ * Search for a sleeping nfsd and wake it up.
+ * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the
+ * running nfsds will go look for the work in the nfssvc_sock list.
+ */
+void
+nfsrv_wakenfsd(slp)
+	struct nfssvc_sock *slp;
+{
+	register struct nfsd *nd = nfsd_head.nd_next;
+
+	if ((slp->ns_flag & SLP_VALID) == 0)
+		return;
+	while (nd != (struct nfsd *)&nfsd_head) {
+		if (nd->nd_flag & NFSD_WAITING) {
+			/* hand the socket to this nfsd, taking a reference on it */
+			nd->nd_flag &= ~NFSD_WAITING;
+			if (nd->nd_slp)
+				panic("nfsd wakeup");
+			slp->ns_sref++;
+			nd->nd_slp = slp;
+			wakeup((caddr_t)nd);
+			return;
+		}
+		nd = nd->nd_next;
+	}
+	slp->ns_flag |= SLP_DOREC;
+	nfsd_head.nd_flag |= NFSD_CHECKSLP;
+}
+
+/*
+ * Print "nfs server <server>: <msg>" with tprintf(), using a tprintf
+ * handle opened on process p when p is not null.
+ */
+nfs_msg(p, server, msg)
+	struct proc *p;
+	char *server, *msg;
+{
+	tpr_t tpr;
+
+	if (p)
+		tpr = tprintf_open(p);
+	else
+		tpr = NULL;
+	tprintf(tpr, "nfs server %s: %s\n", server, msg);
+	tprintf_close(tpr);
+}
diff --git a/sys/nfs/nfs_srvcache.c b/sys/nfs/nfs_srvcache.c
new file mode 100644
index 00000000000..63d8bb72d82
--- /dev/null
+++ b/sys/nfs/nfs_srvcache.c
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) 1989, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Rick Macklem at The University of Guelph.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs_srvcache.c 8.1 (Berkeley) 6/10/93 + */ + +/* + * Reference: Chet Juszczak, "Improving the Performance and Correctness + * of an NFS Server", in Proc. Winter 1989 USENIX Conference, + * pages 53-63. San Diego, February 1989. 
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#ifdef ISO
+#include
+#endif
+#include
+#include
+#include
+#include
+#include
+#include
+
+long numnfsrvcache, desirednfsrvcache = NFSRVCACHESIZ;
+
+/* Hash an xid into an index of rheadhtbl[] (rheadhash is the mask). */
+#define	NFSRCHASH(xid)		(((xid) + ((xid) >> 24)) & rheadhash)
+/* LRU list: head pointer and address of the last entry's rc_next. */
+static struct nfsrvcache *nfsrvlruhead, **nfsrvlrutail = &nfsrvlruhead;
+static struct nfsrvcache **rheadhtbl;
+static u_long rheadhash;
+
+#define TRUE	1
+#define	FALSE	0
+
+#define	NETFAMILY(rp) \
+		(((rp)->rc_flag & RC_INETADDR) ? AF_INET : AF_ISO)
+
+/*
+ * Static array that defines which nfs rpc's are nonidempotent
+ * (indexed by procedure number)
+ */
+int nonidempotent[NFS_NPROCS] = {
+	FALSE,
+	FALSE,
+	TRUE,
+	FALSE,
+	FALSE,
+	FALSE,
+	FALSE,
+	FALSE,
+	TRUE,
+	TRUE,
+	TRUE,
+	TRUE,
+	TRUE,
+	TRUE,
+	TRUE,
+	TRUE,
+	FALSE,
+	FALSE,
+	FALSE,
+	FALSE,
+	FALSE,
+	FALSE,
+	FALSE,
+};
+
+/* True iff the rpc reply is an nfs status ONLY! */
+static int repliesstatus[NFS_NPROCS] = {
+	FALSE,
+	FALSE,
+	FALSE,
+	FALSE,
+	FALSE,
+	FALSE,
+	FALSE,
+	FALSE,
+	FALSE,
+	FALSE,
+	TRUE,
+	TRUE,
+	TRUE,
+	TRUE,
+	FALSE,
+	TRUE,
+	FALSE,
+	FALSE,
+	FALSE,
+	FALSE,
+	FALSE,
+	FALSE,
+	TRUE,
+};
+
+/*
+ * Initialize the server request cache list
+ */
+nfsrv_initcache()
+{
+
+	rheadhtbl = hashinit(desirednfsrvcache, M_NFSD, &rheadhash);
+}
+
+/*
+ * Look for the request in the cache
+ * If found then
+ *    return action and optionally reply
+ * else
+ *    insert it in the cache
+ *
+ * The rules are as follows:
+ * - if in progress, return DROP request
+ * - if completed within DELAY of the current time, return DROP it
+ * - if completed a longer time ago return REPLY if the reply was cached or
+ *   return DOIT
+ * Update/add new request at end of lru list
+ *
+ * nam is the client address, nd the parsed request; *repp receives the
+ * reply chain when RC_REPLY is returned.  Requests that carry an nqnfs
+ * lease flag (nd_nqlflag != NQL_NOVAL) bypass the cache.
+ */
+nfsrv_getcache(nam, nd, repp)
+	struct mbuf *nam;
+	register struct nfsd *nd;
+	struct mbuf **repp;
+{
+	register struct nfsrvcache *rp, *rq, **rpp;
+	struct mbuf *mb;
+	struct sockaddr_in *saddr;
+	caddr_t bpos;
+	int ret;
+
+	if (nd->nd_nqlflag != NQL_NOVAL)
+		return (RC_DOIT);
+	rpp = &rheadhtbl[NFSRCHASH(nd->nd_retxid)];
+loop:
+	for (rp = *rpp; rp; rp = rp->rc_forw) {
+	    if (nd->nd_retxid == rp->rc_xid && nd->nd_procnum == rp->rc_proc &&
+		netaddr_match(NETFAMILY(rp), &rp->rc_haddr, nam)) {
+			/* entry busy: sleep, then rescan from the start */
+			if ((rp->rc_flag & RC_LOCKED) != 0) {
+				rp->rc_flag |= RC_WANTED;
+				(void) tsleep((caddr_t)rp, PZERO-1, "nfsrc", 0);
+				goto loop;
+			}
+			rp->rc_flag |= RC_LOCKED;
+			/* If not at end of LRU chain, move it there */
+			if (rp->rc_next) {
+				/* remove from LRU chain */
+				*rp->rc_prev = rp->rc_next;
+				rp->rc_next->rc_prev = rp->rc_prev;
+				/* and replace at end of it */
+				rp->rc_next = NULL;
+				rp->rc_prev = nfsrvlrutail;
+				*nfsrvlrutail = rp;
+				nfsrvlrutail = &rp->rc_next;
+			}
+			if (rp->rc_state == RC_UNUSED)
+				panic("nfsrv cache");
+			if (rp->rc_state == RC_INPROG) {
+				nfsstats.srvcache_inproghits++;
+				ret = RC_DROPIT;
+			} else if (rp->rc_flag & RC_REPSTATUS) {
+				/* only a status was cached: rebuild the reply */
+				nfsstats.srvcache_nonidemdonehits++;
+				nfs_rephead(0, nd, rp->rc_status,
+				   0, (u_quad_t *)0, repp, &mb, &bpos);
+				ret = RC_REPLY;
+			} else if (rp->rc_flag & RC_REPMBUF) {
+				nfsstats.srvcache_nonidemdonehits++;
+				*repp = m_copym(rp->rc_reply, 0, M_COPYALL,
+						M_WAIT);
+				ret = RC_REPLY;
+			} else {
+				nfsstats.srvcache_idemdonehits++;
+				rp->rc_state = RC_INPROG;
+				ret = RC_DOIT;
+			}
+			rp->rc_flag &= ~RC_LOCKED;
+			if (rp->rc_flag & RC_WANTED) {
+				rp->rc_flag &= ~RC_WANTED;
+				wakeup((caddr_t)rp);
+			}
+			return (ret);
+		}
+	}
+	nfsstats.srvcache_misses++;
+	if (numnfsrvcache < desirednfsrvcache) {
+		rp = (struct nfsrvcache *)malloc((u_long)sizeof *rp,
+		    M_NFSD, M_WAITOK);
+		bzero((char *)rp, sizeof *rp);
+		numnfsrvcache++;
+		rp->rc_flag = RC_LOCKED;
+	} else {
+		/* cache full: recycle the entry at the head of the LRU list */
+		rp = nfsrvlruhead;
+		while ((rp->rc_flag & RC_LOCKED) != 0) {
+			rp->rc_flag |= RC_WANTED;
+			(void) tsleep((caddr_t)rp, PZERO-1, "nfsrc", 0);
+			rp = nfsrvlruhead;
+		}
+		rp->rc_flag |= RC_LOCKED;
+		/* remove from hash chain */
+		if (rq = rp->rc_forw)
+			rq->rc_back = rp->rc_back;
+		*rp->rc_back = rq;
+		/* remove from LRU chain */
+		*rp->rc_prev = rp->rc_next;
+		rp->rc_next->rc_prev = rp->rc_prev;
+		if (rp->rc_flag & RC_REPMBUF)
+			m_freem(rp->rc_reply);
+		if (rp->rc_flag & RC_NAM)
+			MFREE(rp->rc_nam, mb);
+		rp->rc_flag &= (RC_LOCKED | RC_WANTED);
+	}
+	/* place at end of LRU list */
+	rp->rc_next = NULL;
+	rp->rc_prev = nfsrvlrutail;
+	*nfsrvlrutail = rp;
+	nfsrvlrutail = &rp->rc_next;
+	rp->rc_state = RC_INPROG;
+	rp->rc_xid = nd->nd_retxid;
+	/* internet addresses are stored inline, others as an mbuf copy */
+	saddr = mtod(nam, struct sockaddr_in *);
+	switch (saddr->sin_family) {
+	case AF_INET:
+		rp->rc_flag |= RC_INETADDR;
+		rp->rc_inetaddr = saddr->sin_addr.s_addr;
+		break;
+	case AF_ISO:
+	default:
+		rp->rc_flag |= RC_NAM;
+		rp->rc_nam = m_copym(nam, 0, M_COPYALL, M_WAIT);
+		break;
+	};
+	rp->rc_proc = nd->nd_procnum;
+	/* insert into hash chain */
+	if (rq = *rpp)
+		rq->rc_back = &rp->rc_forw;
+	rp->rc_forw = rq;
+	rp->rc_back = rpp;
+	*rpp = rp;
+	rp->rc_flag &= ~RC_LOCKED;
+	if (rp->rc_flag & RC_WANTED) {
+		rp->rc_flag &= ~RC_WANTED;
+		wakeup((caddr_t)rp);
+	}
+	return (RC_DOIT);
+}
+
+/*
+ * Update a request cache entry after the rpc has been done
+ *
+ * Marks the matching entry RC_DONE.  When repvalid is set and the
+ * procedure is non-idempotent, either the status (nd_repstat) or a copy
+ * of the reply chain repmbuf is saved in the entry.  Does nothing when
+ * no entry matches or the request carries an nqnfs lease flag.
+ */
+void
+nfsrv_updatecache(nam, nd, repvalid, repmbuf)
+	struct mbuf *nam;
+	register struct nfsd *nd;
+	int repvalid;
+	struct mbuf *repmbuf;
+{
+	register struct nfsrvcache *rp;
+
+	if (nd->nd_nqlflag != NQL_NOVAL)
+		return;
+loop:
+	for (rp = rheadhtbl[NFSRCHASH(nd->nd_retxid)]; rp; rp = rp->rc_forw) {
+	    if (nd->nd_retxid == rp->rc_xid && nd->nd_procnum == rp->rc_proc &&
+		netaddr_match(NETFAMILY(rp), &rp->rc_haddr, nam)) {
+			if ((rp->rc_flag & RC_LOCKED) != 0) {
+				rp->rc_flag |= RC_WANTED;
+				(void) tsleep((caddr_t)rp, PZERO-1, "nfsrc", 0);
+				goto loop;
+			}
+			rp->rc_flag |= RC_LOCKED;
+			rp->rc_state = RC_DONE;
+			/*
+			 * If we have a valid reply update status and save
+			 * the reply for non-idempotent rpc's.
+			 */
+			if (repvalid && nonidempotent[nd->nd_procnum]) {
+				if (repliesstatus[nd->nd_procnum]) {
+					rp->rc_status = nd->nd_repstat;
+					rp->rc_flag |= RC_REPSTATUS;
+				} else {
+					rp->rc_reply = m_copym(repmbuf,
+						0, M_COPYALL, M_WAIT);
+					rp->rc_flag |= RC_REPMBUF;
+				}
+			}
+			rp->rc_flag &= ~RC_LOCKED;
+			if (rp->rc_flag & RC_WANTED) {
+				rp->rc_flag &= ~RC_WANTED;
+				wakeup((caddr_t)rp);
+			}
+			return;
+		}
+	}
+}
+
+/*
+ * Clean out the cache. Called when the last nfsd terminates.
+ *
+ * Every entry is released together with the mbufs it owns: the cached
+ * reply chain (RC_REPMBUF) and the saved client address (RC_NAM).
+ * Freeing only the entry itself would leak those mbufs.
+ */
+void
+nfsrv_cleancache()
+{
+	register struct nfsrvcache *rp, *nextrp;
+
+	for (rp = nfsrvlruhead; rp; rp = nextrp) {
+		nextrp = rp->rc_next;
+		if (rp->rc_flag & RC_REPMBUF)
+			m_freem(rp->rc_reply);
+		if (rp->rc_flag & RC_NAM)
+			m_freem(rp->rc_nam);
+		free(rp, M_NFSD);
+	}
+	bzero((char *)rheadhtbl, (rheadhash + 1) * sizeof(void *));
+	nfsrvlruhead = NULL;
+	nfsrvlrutail = &nfsrvlruhead;
+	numnfsrvcache = 0;
+}
diff --git a/sys/nfs/nfs_subs.c b/sys/nfs/nfs_subs.c
new file mode 100644
index 00000000000..5778f7d7f01
--- /dev/null
+++ b/sys/nfs/nfs_subs.c
@@ -0,0 +1,1130 @@
+/*
+ * Copyright (c) 1989, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Rick Macklem at The University of Guelph.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4.
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs_subs.c 8.3 (Berkeley) 1/4/94 + */ + +/* + * These functions support the macros and help fiddle mbuf chains for + * the nfs op functions. They do things like create the rpc header and + * copy data between mbuf chains and uio lists. 
+ */
+#include	/* NOTE(review): the header names are missing from every #include in this copy */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#ifdef ISO
+#include
+#endif
+
+#define TRUE	1
+#define FALSE	0
+
+/*
+ * Data items converted to xdr at startup, since they are constant
+ * This is kinda hokey, but may save a little time doing byte swaps
+ * (they are filled in by nfs_init())
+ */
+u_long nfs_procids[NFS_NPROCS];
+u_long nfs_xdrneg1;
+u_long rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr,
+	rpc_mismatch, rpc_auth_unix, rpc_msgaccepted, rpc_rejectedcred,
+	rpc_auth_kerb;
+u_long nfs_vers, nfs_prog, nfs_true, nfs_false;
+
+/* And other global data */
+static u_long nfs_xid = 0;	/* last xid handed out by nfsm_rpchead() */
+enum vtype ntov_type[7] = { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON };
+extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
+extern struct nfsreq nfsreqh;
+extern int nqnfs_piggy[NFS_NPROCS];
+extern struct nfsrtt nfsrtt;
+extern time_t nqnfsstarttime;
+extern u_long nqnfs_prog, nqnfs_vers;
+extern int nqsrv_clockskew;
+extern int nqsrv_writeslack;
+extern int nqsrv_maxlease;
+
+/*
+ * Create the header for an rpc request packet
+ * The hsiz is the size of the rest of the nfs request header.
+ * (just used to decide if a cluster is a good idea)
+ *
+ * Allocates the first mbuf of the request (with a cluster when
+ * hsiz >= MINCLSIZE).  When vp is non-NULL and its mount has NFSMNT_NQNFS
+ * set, the lease request is placed at the front: two words (lease flag
+ * and nm_leaseterm) if NQNFS_NEEDLEASE() is non-zero, otherwise a single
+ * zero word.  Returns the mbuf; the current build position is handed back
+ * through *bposp.
+ * NOTE(review): nfsm_build() is presumably a macro that uses the locals
+ * mb, mb2 and bpos by name -- confirm before renaming any of them.
+ */
+struct mbuf *
+nfsm_reqh(vp, procid, hsiz, bposp)
+	struct vnode *vp;
+	u_long procid;
+	int hsiz;
+	caddr_t *bposp;
+{
+	register struct mbuf *mb;
+	register u_long *tl;
+	register caddr_t bpos;
+	struct mbuf *mb2;
+	struct nfsmount *nmp;
+	int nqflag;
+
+	MGET(mb, M_WAIT, MT_DATA);
+	if (hsiz >= MINCLSIZE)
+		MCLGET(mb, M_WAIT);
+	mb->m_len = 0;
+	bpos = mtod(mb, caddr_t);
+	
+	/*
+	 * For NQNFS, add lease request.
+	 */
+	if (vp) {
+		nmp = VFSTONFS(vp->v_mount);
+		if (nmp->nm_flag & NFSMNT_NQNFS) {
+			nqflag = NQNFS_NEEDLEASE(vp, procid);
+			if (nqflag) {
+				nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED);
+				*tl++ = txdr_unsigned(nqflag);
+				*tl = txdr_unsigned(nmp->nm_leaseterm);
+			} else {
+				nfsm_build(tl, u_long *, NFSX_UNSIGNED);
+				*tl = 0;
+			}
+		}
+	}
+	/* Finally, return values */
+	*bposp = bpos;
+	return (mb);
+}
+
+/*
+ * Build the RPC header and fill in the authorization info.
+ * The authorization string argument is only used when the credentials
+ * come from outside of the kernel.
+ * Returns the head of the mbuf list.
+ *
+ * cr        - credentials supplying the uid (and groups for RPCAUTH_UNIX)
+ * nqnfs     - non-zero selects NQNFS_PROG/NQNFS_VER1 instead of
+ *             NFS_PROG/NFS_VER2
+ * procid    - procedure number placed in the call header
+ * auth_type - RPCAUTH_UNIX or RPCAUTH_NQNFS; other values add no
+ *             credential body
+ * auth_len  - byte length of the credential body
+ * auth_str  - credential bytes, copied only for RPCAUTH_NQNFS
+ * mrest     - the already built request body, linked after the header
+ * mrest_len - length of mrest, used for m_pkthdr.len
+ * mbp       - out: last mbuf of the header (the one linked to mrest)
+ * xidp      - out: the xid chosen for this call, in xdr form
+ */
+struct mbuf *
+nfsm_rpchead(cr, nqnfs, procid, auth_type, auth_len, auth_str, mrest,
+	mrest_len, mbp, xidp)
+	register struct ucred *cr;
+	int nqnfs;
+	int procid;
+	int auth_type;
+	int auth_len;
+	char *auth_str;
+	struct mbuf *mrest;
+	int mrest_len;
+	struct mbuf **mbp;
+	u_long *xidp;
+{
+	register struct mbuf *mb;
+	register u_long *tl;
+	register caddr_t bpos;
+	register int i;
+	struct mbuf *mreq, *mb2;
+	int siz, grpsiz, authsiz;
+
+	authsiz = nfsm_rndup(auth_len);
+	if (auth_type == RPCAUTH_NQNFS)
+		authsiz += 2 * NFSX_UNSIGNED;	/* uid and length words */
+	MGETHDR(mb, M_WAIT, MT_DATA);
+	if ((authsiz + 10*NFSX_UNSIGNED) >= MINCLSIZE) {
+		MCLGET(mb, M_WAIT);
+	} else if ((authsiz + 10*NFSX_UNSIGNED) < MHLEN) {
+		MH_ALIGN(mb, authsiz + 10*NFSX_UNSIGNED);
+	} else {
+		MH_ALIGN(mb, 8*NFSX_UNSIGNED);
+	}
+	mb->m_len = 0;
+	mreq = mb;
+	bpos = mtod(mb, caddr_t);
+
+	/*
+	 * First the RPC header.
+	 */
+	nfsm_build(tl, u_long *, 8*NFSX_UNSIGNED);
+	if (++nfs_xid == 0)	/* skip 0 when the counter wraps */
+		nfs_xid++;
+	*tl++ = *xidp = txdr_unsigned(nfs_xid);
+	*tl++ = rpc_call;
+	*tl++ = rpc_vers;
+	if (nqnfs) {
+		*tl++ = txdr_unsigned(NQNFS_PROG);
+		*tl++ = txdr_unsigned(NQNFS_VER1);
+	} else {
+		*tl++ = txdr_unsigned(NFS_PROG);
+		*tl++ = txdr_unsigned(NFS_VER2);
+	}
+	*tl++ = txdr_unsigned(procid);
+
+	/*
+	 * And then the authorization cred.
+	 */
+	*tl++ = txdr_unsigned(auth_type);
+	*tl = txdr_unsigned(authsiz);
+	switch (auth_type) {
+	case RPCAUTH_UNIX:
+		nfsm_build(tl, u_long *, auth_len);
+		*tl++ = 0;		/* stamp ?? */
+		*tl++ = 0;		/* NULL hostname */
+		*tl++ = txdr_unsigned(cr->cr_uid);
+		*tl++ = txdr_unsigned(cr->cr_groups[0]);
+		grpsiz = (auth_len >> 2) - 5;	/* words left after the 5 fixed ones */
+		*tl++ = txdr_unsigned(grpsiz);
+		for (i = 1; i <= grpsiz; i++)
+			*tl++ = txdr_unsigned(cr->cr_groups[i]);
+		break;
+	case RPCAUTH_NQNFS:
+		nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED);
+		*tl++ = txdr_unsigned(cr->cr_uid);
+		*tl = txdr_unsigned(auth_len);
+		siz = auth_len;
+		while (siz > 0) {
+			if (M_TRAILINGSPACE(mb) == 0) {
+				MGET(mb2, M_WAIT, MT_DATA);
+				if (siz >= MINCLSIZE)
+					MCLGET(mb2, M_WAIT);
+				mb->m_next = mb2;
+				mb = mb2;
+				mb->m_len = 0;
+				bpos = mtod(mb, caddr_t);
+			}
+			i = min(siz, M_TRAILINGSPACE(mb));
+			bcopy(auth_str, bpos, i);
+			mb->m_len += i;
+			auth_str += i;
+			bpos += i;
+			siz -= i;
+		}
+		if ((siz = (nfsm_rndup(auth_len) - auth_len)) > 0) {
+			for (i = 0; i < siz; i++)	/* zero pad to the rounded-up length */
+				*bpos++ = '\0';
+			mb->m_len += siz;
+		}
+		break;
+	};
+	nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED);
+	*tl++ = txdr_unsigned(RPCAUTH_NULL);
+	*tl = 0;
+	mb->m_next = mrest;
+	mreq->m_pkthdr.len = authsiz + 10*NFSX_UNSIGNED + mrest_len;
+	mreq->m_pkthdr.rcvif = (struct ifnet *)0;
+	*mbp = mb;
+	return (mreq);
+}
+
+/*
+ * copies mbuf chain to the uio scatter/gather list
+ *
+ * Moves siz bytes starting at *dpos in mbuf *mrep into the iovecs of
+ * uiop, updating uio_offset, uio_resid and the iovecs as it goes, and
+ * then skips the pad bytes that round siz up (nfsm_rndup).  *mrep and
+ * *dpos are advanced past what was consumed.
+ * Returns EFBIG if the uio runs out of iovecs, EBADRPC if the mbuf chain
+ * runs out of data, otherwise the result of skipping the padding.
+ */
+nfsm_mbuftouio(mrep, uiop, siz, dpos)
+	struct mbuf **mrep;
+	register struct uio *uiop;
+	int siz;
+	caddr_t *dpos;
+{
+	register char *mbufcp, *uiocp;
+	register int xfer, left, len;
+	register struct mbuf *mp;
+	long uiosiz, rem;
+	int error = 0;
+
+	mp = *mrep;
+	mbufcp = *dpos;
+	len = mtod(mp, caddr_t)+mp->m_len-mbufcp;	/* bytes left in this mbuf */
+	rem = nfsm_rndup(siz)-siz;	/* pad bytes following the data */
+	while (siz > 0) {
+		if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL)
+			return (EFBIG);
+		left = uiop->uio_iov->iov_len;
+		uiocp = uiop->uio_iov->iov_base;
+		if (left > siz)
+			left = siz;
+		uiosiz = left;
+		while (left > 0) {
+			while (len == 0) {
+				mp = mp->m_next;
+				if (mp == NULL)
+					return (EBADRPC);
+				mbufcp = mtod(mp, caddr_t);
+				len = mp->m_len;
+			}
+			xfer = (left > len) ? len : left;
+#ifdef notdef
+			/* Not Yet.. */
+			if (uiop->uio_iov->iov_op != NULL)
+				(*(uiop->uio_iov->iov_op))
+				(mbufcp, uiocp, xfer);
+			else
+#endif
+			if (uiop->uio_segflg == UIO_SYSSPACE)
+				bcopy(mbufcp, uiocp, xfer);
+			else
+				copyout(mbufcp, uiocp, xfer);	/* NOTE(review): return value is not checked */
+			left -= xfer;
+			len -= xfer;
+			mbufcp += xfer;
+			uiocp += xfer;
+			uiop->uio_offset += xfer;
+			uiop->uio_resid -= xfer;
+		}
+		if (uiop->uio_iov->iov_len <= siz) {
+			uiop->uio_iovcnt--;
+			uiop->uio_iov++;
+		} else {
+			uiop->uio_iov->iov_base += uiosiz;
+			uiop->uio_iov->iov_len -= uiosiz;
+		}
+		siz -= uiosiz;
+	}
+	*dpos = mbufcp;
+	*mrep = mp;
+	if (rem > 0) {
+		if (len < rem)
+			error = nfs_adv(mrep, dpos, rem, len);
+		else
+			*dpos += rem;
+	}
+	return (error);
+}
+
+/*
+ * copies a uio scatter/gather list to an mbuf chain...
+ *
+ * Appends siz bytes from uiop to the chain whose current last mbuf is
+ * *mq, allocating further mbufs (clusters when siz > MLEN) as the
+ * trailing space runs out, then zero pads up to nfsm_rndup(siz).
+ * On return *mq is the last mbuf and *bpos points just past the data
+ * written.  Returns EINVAL if the uio runs out of iovecs, else 0.
+ */
+nfsm_uiotombuf(uiop, mq, siz, bpos)
+	register struct uio *uiop;
+	struct mbuf **mq;
+	int siz;
+	caddr_t *bpos;
+{
+	register char *uiocp;
+	register struct mbuf *mp, *mp2;
+	register int xfer, left, mlen;
+	int uiosiz, clflg, rem;
+	char *cp;
+
+	if (siz > MLEN)		/* or should it >= MCLBYTES ?? */
+		clflg = 1;
+	else
+		clflg = 0;
+	rem = nfsm_rndup(siz)-siz;
+	mp = mp2 = *mq;
+	while (siz > 0) {
+		if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL)
+			return (EINVAL);
+		left = uiop->uio_iov->iov_len;
+		uiocp = uiop->uio_iov->iov_base;
+		if (left > siz)
+			left = siz;
+		uiosiz = left;
+		while (left > 0) {
+			mlen = M_TRAILINGSPACE(mp);
+			if (mlen == 0) {
+				MGET(mp, M_WAIT, MT_DATA);
+				if (clflg)
+					MCLGET(mp, M_WAIT);
+				mp->m_len = 0;
+				mp2->m_next = mp;
+				mp2 = mp;
+				mlen = M_TRAILINGSPACE(mp);
+			}
+			xfer = (left > mlen) ? mlen : left;
+#ifdef notdef
+			/* Not Yet.. */
+			if (uiop->uio_iov->iov_op != NULL)
+				(*(uiop->uio_iov->iov_op))
+				(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer);
+			else
+#endif
+			if (uiop->uio_segflg == UIO_SYSSPACE)
+				bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer);
+			else
+				copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer);	/* NOTE(review): return value is not checked */
+			mp->m_len += xfer;
+			left -= xfer;
+			uiocp += xfer;
+			uiop->uio_offset += xfer;
+			uiop->uio_resid -= xfer;
+		}
+		if (uiop->uio_iov->iov_len <= siz) {
+			uiop->uio_iovcnt--;
+			uiop->uio_iov++;
+		} else {
+			uiop->uio_iov->iov_base += uiosiz;
+			uiop->uio_iov->iov_len -= uiosiz;
+		}
+		siz -= uiosiz;
+	}
+	if (rem > 0) {
+		if (rem > M_TRAILINGSPACE(mp)) {
+			MGET(mp, M_WAIT, MT_DATA);
+			mp->m_len = 0;
+			mp2->m_next = mp;
+		}
+		cp = mtod(mp, caddr_t)+mp->m_len;
+		for (left = 0; left < rem; left++)	/* left is reused as the pad counter */
+			*cp++ = '\0';
+		mp->m_len += rem;
+		*bpos = cp;
+	} else
+		*bpos = mtod(mp, caddr_t)+mp->m_len;
+	*mq = mp;
+	return (0);
+}
+
+/*
+ * Help break down an mbuf chain by setting the first siz bytes contiguous
+ * pointed to by returned val.
+ * This is used by the macros nfsm_dissect and nfsm_dissecton for tough
+ * cases. (The macros use the vars.
dpos and dpos2)
+ *
+ * mdp/dposp - in/out: current mbuf and position within it
+ * siz       - number of contiguous bytes wanted
+ * left      - bytes remaining in the current mbuf from *dposp
+ * cp2       - out: pointer to the siz contiguous bytes
+ *
+ * If the current mbuf still holds siz bytes they are used in place.
+ * Otherwise a new mbuf is linked in after the current one, the current
+ * one is trimmed by `left', and the missing bytes are pulled out of the
+ * following mbufs into the new one.
+ * Returns EBADRPC when the chain ends before siz bytes are found, and
+ * panics if a copy would be needed for siz > MHLEN.
+ */
+nfsm_disct(mdp, dposp, siz, left, cp2)
+	struct mbuf **mdp;
+	caddr_t *dposp;
+	int siz;
+	int left;
+	caddr_t *cp2;
+{
+	register struct mbuf *mp, *mp2;
+	register int siz2, xfer;
+	register caddr_t p;
+
+	mp = *mdp;
+	while (left == 0) {	/* current mbuf used up: step to the next one */
+		*mdp = mp = mp->m_next;
+		if (mp == NULL)
+			return (EBADRPC);
+		left = mp->m_len;
+		*dposp = mtod(mp, caddr_t);
+	}
+	if (left >= siz) {
+		*cp2 = *dposp;
+		*dposp += siz;
+	} else if (mp->m_next == NULL) {
+		return (EBADRPC);
+	} else if (siz > MHLEN) {
+		panic("nfs S too big");
+	} else {
+		MGET(mp2, M_WAIT, MT_DATA);
+		mp2->m_next = mp->m_next;
+		mp->m_next = mp2;
+		mp->m_len -= left;	/* the tail bytes move to the new mbuf */
+		mp = mp2;
+		*cp2 = p = mtod(mp, caddr_t);
+		bcopy(*dposp, p, left);		/* Copy what was left */
+		siz2 = siz-left;
+		p += left;
+		mp2 = mp->m_next;
+		/* Loop around copying up the siz2 bytes */
+		while (siz2 > 0) {
+			if (mp2 == NULL)
+				return (EBADRPC);
+			xfer = (siz2 > mp2->m_len) ? mp2->m_len : siz2;
+			if (xfer > 0) {
+				bcopy(mtod(mp2, caddr_t), p, xfer);
+				NFSMADV(mp2, xfer);
+				mp2->m_len -= xfer;
+				p += xfer;
+				siz2 -= xfer;
+			}
+			if (siz2 > 0)
+				mp2 = mp2->m_next;
+		}
+		mp->m_len = siz;
+		*mdp = mp2;
+		*dposp = mtod(mp2, caddr_t);
+	}
+	return (0);
+}
+
+/*
+ * Advance the position in the mbuf chain.
+ *
+ * Moves offs bytes forward, given that `left' bytes remain in the
+ * current mbuf *mdp.  On success *mdp and *dposp describe the new
+ * position and 0 is returned; EBADRPC is returned if the chain ends
+ * first.
+ * NOTE(review): when offs <= left, *dposp is set to offs bytes from the
+ * start of the mbuf's data, not from the old position; the callers
+ * visible here only call with offs > left.
+ */
+nfs_adv(mdp, dposp, offs, left)
+	struct mbuf **mdp;
+	caddr_t *dposp;
+	int offs;
+	int left;
+{
+	register struct mbuf *m;
+	register int s;
+
+	m = *mdp;
+	s = left;
+	while (s < offs) {
+		offs -= s;
+		m = m->m_next;
+		if (m == NULL)
+			return (EBADRPC);
+		s = m->m_len;
+	}
+	*mdp = m;
+	*dposp = mtod(m, caddr_t)+offs;
+	return (0);
+}
+
+/*
+ * Copy a string into mbufs for the hard cases...
+ * (i.e. when the length word plus the string do not fit in the space
+ * left in the current mbuf)
+ *
+ * mb   - in/out: last mbuf of the chain being built
+ * bpos - in/out: build position inside *mb
+ * cp   - the string bytes
+ * siz  - string length in bytes
+ *
+ * Writes the xdr length word followed by the string, filling whatever
+ * room is left in *mb first and then appending new mbufs (clusters when
+ * more than MLEN bytes remain).  The last mbuf's length is rounded up
+ * with nfsm_rndup() and the final partial word is zeroed first.
+ * Always returns 0.
+ * NOTE(review): this appears to rely on the caller guaranteeing that the
+ * string does not fit in the current mbuf: the first bcopy() copies
+ * `left' bytes without comparing against siz.
+ */
+nfsm_strtmbuf(mb, bpos, cp, siz)
+	struct mbuf **mb;
+	char **bpos;
+	char *cp;
+	long siz;
+{
+	register struct mbuf *m1, *m2;
+	long left, xfer, len, tlen;
+	u_long *tl;
+	int putsize;
+
+	putsize = 1;	/* length word still to be written */
+	m2 = *mb;
+	left = M_TRAILINGSPACE(m2);
+	if (left > 0) {
+		tl = ((u_long *)(*bpos));
+		*tl++ = txdr_unsigned(siz);
+		putsize = 0;
+		left -= NFSX_UNSIGNED;
+		m2->m_len += NFSX_UNSIGNED;
+		if (left > 0) {
+			bcopy(cp, (caddr_t) tl, left);
+			siz -= left;
+			cp += left;
+			m2->m_len += left;
+			left = 0;
+		}
+	}
+	/* Loop around adding mbufs */
+	while (siz > 0) {
+		MGET(m1, M_WAIT, MT_DATA);
+		if (siz > MLEN)
+			MCLGET(m1, M_WAIT);
+		m1->m_len = NFSMSIZ(m1);	/* temporarily: capacity of the new mbuf */
+		m2->m_next = m1;
+		m2 = m1;
+		tl = mtod(m1, u_long *);
+		tlen = 0;
+		if (putsize) {
+			*tl++ = txdr_unsigned(siz);
+			m1->m_len -= NFSX_UNSIGNED;
+			tlen = NFSX_UNSIGNED;
+			putsize = 0;
+		}
+		if (siz < m1->m_len) {
+			len = nfsm_rndup(siz);
+			xfer = siz;
+			if (xfer < len)
+				*(tl+(xfer>>2)) = 0;	/* pre-zero the last, partial word */
+		} else {
+			xfer = len = m1->m_len;
+		}
+		bcopy(cp, (caddr_t) tl, xfer);
+		m1->m_len = len+tlen;
+		siz -= xfer;
+		cp += xfer;
+	}
+	*mb = m1;	/* NOTE(review): m1 is only assigned inside the loop above */
+	*bpos = mtod(m1, caddr_t)+m1->m_len;
+	return (0);
+}
+
+/*
+ * Called once to initialize data structures...
+ *
+ * Pre-converts the constant RPC/NFS values to xdr form, clears the
+ * async daemon slots, initializes the nfsnode table, the server data
+ * structures and the server request cache, sets up the nqnfs server
+ * state (only while nqnfsstarttime is still 0), and finally makes the
+ * request list empty and calls nfs_timer().
+ */
+nfs_init()
+{
+	register int i;
+
+	nfsrtt.pos = 0;
+	rpc_vers = txdr_unsigned(RPC_VER2);
+	rpc_call = txdr_unsigned(RPC_CALL);
+	rpc_reply = txdr_unsigned(RPC_REPLY);
+	rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED);
+	rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED);
+	rpc_mismatch = txdr_unsigned(RPC_MISMATCH);
+	rpc_autherr = txdr_unsigned(RPC_AUTHERR);
+	rpc_rejectedcred = txdr_unsigned(AUTH_REJECTCRED);
+	rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX);
+	rpc_auth_kerb = txdr_unsigned(RPCAUTH_NQNFS);
+	nfs_vers = txdr_unsigned(NFS_VER2);
+	nfs_prog = txdr_unsigned(NFS_PROG);
+	nfs_true = txdr_unsigned(TRUE);
+	nfs_false = txdr_unsigned(FALSE);
+	/* Loop thru nfs procids */
+	for (i = 0; i < NFS_NPROCS; i++)
+		nfs_procids[i] = txdr_unsigned(i);
+	/* Ensure async daemons disabled */
+	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
+		nfs_iodwant[i] = (struct proc *)0;
+	TAILQ_INIT(&nfs_bufq);
+	nfs_xdrneg1 = txdr_unsigned(-1);
+	nfs_nhinit();			/* Init the nfsnode table */
+	nfsrv_init(0);			/* Init server data structures */
+	nfsrv_initcache();		/* Init the server request cache */
+
+	/*
+	 * Initialize the nqnfs server stuff.
+	 */
+	if (nqnfsstarttime == 0) {
+		nqnfsstarttime = boottime.tv_sec + nqsrv_maxlease
+			+ nqsrv_clockskew + nqsrv_writeslack;
+		NQLOADNOVRAM(nqnfsstarttime);
+		nqnfs_prog = txdr_unsigned(NQNFS_PROG);
+		nqnfs_vers = txdr_unsigned(NQNFS_VER1);
+		nqthead.th_head[0] = &nqthead;	/* empty list: both links point at the head */
+		nqthead.th_head[1] = &nqthead;
+		nqfhead = hashinit(NQLCHSZ, M_NQLEASE, &nqfheadhash);
+	}
+
+	/*
+	 * Initialize reply list and start timer
+	 */
+	nfsreqh.r_prev = nfsreqh.r_next = &nfsreqh;
+	nfs_timer();
+}
+
+/*
+ * Attribute cache routines.
+ * nfs_loadattrcache() - loads or updates the cache contents from attributes + * that are on the mbuf list + * nfs_getattrcache() - returns valid attributes if found in cache, returns + * error otherwise + */ + +/* + * Load the attribute cache (that lives in the nfsnode entry) with + * the values on the mbuf list and + * Iff vap not NULL + * copy the attributes to *vaper + */ +nfs_loadattrcache(vpp, mdp, dposp, vaper) + struct vnode **vpp; + struct mbuf **mdp; + caddr_t *dposp; + struct vattr *vaper; +{ + register struct vnode *vp = *vpp; + register struct vattr *vap; + register struct nfsv2_fattr *fp; + extern int (**spec_nfsv2nodeop_p)(); + register struct nfsnode *np, *nq, **nhpp; + register long t1; + caddr_t dpos, cp2; + int error = 0, isnq; + struct mbuf *md; + enum vtype vtyp; + u_short vmode; + long rdev; + struct timespec mtime; + struct vnode *nvp; + + md = *mdp; + dpos = *dposp; + t1 = (mtod(md, caddr_t) + md->m_len) - dpos; + isnq = (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS); + if (error = nfsm_disct(&md, &dpos, NFSX_FATTR(isnq), t1, &cp2)) + return (error); + fp = (struct nfsv2_fattr *)cp2; + vtyp = nfstov_type(fp->fa_type); + vmode = fxdr_unsigned(u_short, fp->fa_mode); + if (vtyp == VNON || vtyp == VREG) + vtyp = IFTOVT(vmode); + if (isnq) { + rdev = fxdr_unsigned(long, fp->fa_nqrdev); + fxdr_nqtime(&fp->fa_nqmtime, &mtime); + } else { + rdev = fxdr_unsigned(long, fp->fa_nfsrdev); + fxdr_nfstime(&fp->fa_nfsmtime, &mtime); + } + /* + * If v_type == VNON it is a new node, so fill in the v_type, + * n_mtime fields. Check to see if it represents a special + * device, and if so, check for a possible alias. Once the + * correct vnode has been obtained, fill in the rest of the + * information. 
+ */ + np = VTONFS(vp); + if (vp->v_type == VNON) { + if (vtyp == VCHR && rdev == 0xffffffff) + vp->v_type = vtyp = VFIFO; + else + vp->v_type = vtyp; + if (vp->v_type == VFIFO) { +#ifdef FIFO + extern int (**fifo_nfsv2nodeop_p)(); + vp->v_op = fifo_nfsv2nodeop_p; +#else + return (EOPNOTSUPP); +#endif /* FIFO */ + } + if (vp->v_type == VCHR || vp->v_type == VBLK) { + vp->v_op = spec_nfsv2nodeop_p; + if (nvp = checkalias(vp, (dev_t)rdev, vp->v_mount)) { + /* + * Discard unneeded vnode, but save its nfsnode. + */ + if (nq = np->n_forw) + nq->n_back = np->n_back; + *np->n_back = nq; + nvp->v_data = vp->v_data; + vp->v_data = NULL; + vp->v_op = spec_vnodeop_p; + vrele(vp); + vgone(vp); + /* + * Reinitialize aliased node. + */ + np->n_vnode = nvp; + nhpp = (struct nfsnode **)nfs_hash(&np->n_fh); + if (nq = *nhpp) + nq->n_back = &np->n_forw; + np->n_forw = nq; + np->n_back = nhpp; + *nhpp = np; + *vpp = vp = nvp; + } + } + np->n_mtime = mtime.ts_sec; + } + vap = &np->n_vattr; + vap->va_type = vtyp; + vap->va_mode = (vmode & 07777); + vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); + vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); + vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); + vap->va_rdev = (dev_t)rdev; + vap->va_mtime = mtime; + vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; + if (isnq) { + fxdr_hyper(&fp->fa_nqsize, &vap->va_size); + vap->va_blocksize = fxdr_unsigned(long, fp->fa_nqblocksize); + fxdr_hyper(&fp->fa_nqbytes, &vap->va_bytes); + vap->va_fileid = fxdr_unsigned(long, fp->fa_nqfileid); + fxdr_nqtime(&fp->fa_nqatime, &vap->va_atime); + vap->va_flags = fxdr_unsigned(u_long, fp->fa_nqflags); + fxdr_nqtime(&fp->fa_nqctime, &vap->va_ctime); + vap->va_gen = fxdr_unsigned(u_long, fp->fa_nqgen); + fxdr_hyper(&fp->fa_nqfilerev, &vap->va_filerev); + } else { + vap->va_size = fxdr_unsigned(u_long, fp->fa_nfssize); + vap->va_blocksize = fxdr_unsigned(long, fp->fa_nfsblocksize); + vap->va_bytes = fxdr_unsigned(long, fp->fa_nfsblocks) * NFS_FABLKSIZE; + 
vap->va_fileid = fxdr_unsigned(long, fp->fa_nfsfileid); + fxdr_nfstime(&fp->fa_nfsatime, &vap->va_atime); + vap->va_flags = 0; + vap->va_ctime.ts_sec = fxdr_unsigned(long, fp->fa_nfsctime.nfs_sec); + vap->va_ctime.ts_nsec = 0; + vap->va_gen = fxdr_unsigned(u_long, fp->fa_nfsctime.nfs_usec); + vap->va_filerev = 0; + } + if (vap->va_size != np->n_size) { + if (vap->va_type == VREG) { + if (np->n_flag & NMODIFIED) { + if (vap->va_size < np->n_size) + vap->va_size = np->n_size; + else + np->n_size = vap->va_size; + } else + np->n_size = vap->va_size; + vnode_pager_setsize(vp, (u_long)np->n_size); + } else + np->n_size = vap->va_size; + } + np->n_attrstamp = time.tv_sec; + *dposp = dpos; + *mdp = md; + if (vaper != NULL) { + bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap)); +#ifdef notdef + if ((np->n_flag & NMODIFIED) && np->n_size > vap->va_size) + if (np->n_size > vap->va_size) + vaper->va_size = np->n_size; +#endif + if (np->n_flag & NCHG) { + if (np->n_flag & NACC) { + vaper->va_atime.ts_sec = np->n_atim.tv_sec; + vaper->va_atime.ts_nsec = + np->n_atim.tv_usec * 1000; + } + if (np->n_flag & NUPD) { + vaper->va_mtime.ts_sec = np->n_mtim.tv_sec; + vaper->va_mtime.ts_nsec = + np->n_mtim.tv_usec * 1000; + } + } + } + return (0); +} + +/* + * Check the time stamp + * If the cache is valid, copy contents to *vap and return 0 + * otherwise return an error + */ +nfs_getattrcache(vp, vaper) + register struct vnode *vp; + struct vattr *vaper; +{ + register struct nfsnode *np = VTONFS(vp); + register struct vattr *vap; + + if (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQLOOKLEASE) { + if (!NQNFS_CKCACHABLE(vp, NQL_READ) || np->n_attrstamp == 0) { + nfsstats.attrcache_misses++; + return (ENOENT); + } + } else if ((time.tv_sec - np->n_attrstamp) >= NFS_ATTRTIMEO(np)) { + nfsstats.attrcache_misses++; + return (ENOENT); + } + nfsstats.attrcache_hits++; + vap = &np->n_vattr; + if (vap->va_size != np->n_size) { + if (vap->va_type == VREG) { + if (np->n_flag & NMODIFIED) { + if 
(vap->va_size < np->n_size) + vap->va_size = np->n_size; + else + np->n_size = vap->va_size; + } else + np->n_size = vap->va_size; + vnode_pager_setsize(vp, (u_long)np->n_size); + } else + np->n_size = vap->va_size; + } + bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); +#ifdef notdef + if ((np->n_flag & NMODIFIED) == 0) { + np->n_size = vaper->va_size; + vnode_pager_setsize(vp, (u_long)np->n_size); + } else if (np->n_size > vaper->va_size) + if (np->n_size > vaper->va_size) + vaper->va_size = np->n_size; +#endif + if (np->n_flag & NCHG) { + if (np->n_flag & NACC) { + vaper->va_atime.ts_sec = np->n_atim.tv_sec; + vaper->va_atime.ts_nsec = np->n_atim.tv_usec * 1000; + } + if (np->n_flag & NUPD) { + vaper->va_mtime.ts_sec = np->n_mtim.tv_sec; + vaper->va_mtime.ts_nsec = np->n_mtim.tv_usec * 1000; + } + } + return (0); +} + +/* + * Set up nameidata for a lookup() call and do it + */ +nfs_namei(ndp, fhp, len, slp, nam, mdp, dposp, p) + register struct nameidata *ndp; + fhandle_t *fhp; + int len; + struct nfssvc_sock *slp; + struct mbuf *nam; + struct mbuf **mdp; + caddr_t *dposp; + struct proc *p; +{ + register int i, rem; + register struct mbuf *md; + register char *fromcp, *tocp; + struct vnode *dp; + int error, rdonly; + struct componentname *cnp = &ndp->ni_cnd; + + MALLOC(cnp->cn_pnbuf, char *, len + 1, M_NAMEI, M_WAITOK); + /* + * Copy the name from the mbuf list to ndp->ni_pnbuf + * and set the various ndp fields appropriately. 
+ */ + fromcp = *dposp; + tocp = cnp->cn_pnbuf; + md = *mdp; + rem = mtod(md, caddr_t) + md->m_len - fromcp; + cnp->cn_hash = 0; + for (i = 0; i < len; i++) { + while (rem == 0) { + md = md->m_next; + if (md == NULL) { + error = EBADRPC; + goto out; + } + fromcp = mtod(md, caddr_t); + rem = md->m_len; + } + if (*fromcp == '\0' || *fromcp == '/') { + error = EINVAL; + goto out; + } + cnp->cn_hash += (unsigned char)*fromcp; + *tocp++ = *fromcp++; + rem--; + } + *tocp = '\0'; + *mdp = md; + *dposp = fromcp; + len = nfsm_rndup(len)-len; + if (len > 0) { + if (rem >= len) + *dposp += len; + else if (error = nfs_adv(mdp, dposp, len, rem)) + goto out; + } + ndp->ni_pathlen = tocp - cnp->cn_pnbuf; + cnp->cn_nameptr = cnp->cn_pnbuf; + /* + * Extract and set starting directory. + */ + if (error = nfsrv_fhtovp(fhp, FALSE, &dp, ndp->ni_cnd.cn_cred, slp, + nam, &rdonly)) + goto out; + if (dp->v_type != VDIR) { + vrele(dp); + error = ENOTDIR; + goto out; + } + ndp->ni_startdir = dp; + if (rdonly) + cnp->cn_flags |= (NOCROSSMOUNT | RDONLY); + else + cnp->cn_flags |= NOCROSSMOUNT; + /* + * And call lookup() to do the real work + */ + cnp->cn_proc = p; + if (error = lookup(ndp)) + goto out; + /* + * Check for encountering a symbolic link + */ + if (cnp->cn_flags & ISSYMLINK) { + if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) + vput(ndp->ni_dvp); + else + vrele(ndp->ni_dvp); + vput(ndp->ni_vp); + ndp->ni_vp = NULL; + error = EINVAL; + goto out; + } + /* + * Check for saved name request + */ + if (cnp->cn_flags & (SAVENAME | SAVESTART)) { + cnp->cn_flags |= HASBUF; + return (0); + } +out: + FREE(cnp->cn_pnbuf, M_NAMEI); + return (error); +} + +/* + * A fiddled version of m_adj() that ensures null fill to a long + * boundary and only trims off the back end + */ +void +nfsm_adj(mp, len, nul) + struct mbuf *mp; + register int len; + int nul; +{ + register struct mbuf *m; + register int count, i; + register char *cp; + + /* + * Trim from tail. 
Scan the mbuf chain, + * calculating its length and finding the last mbuf. + * If the adjustment only affects this mbuf, then just + * adjust and return. Otherwise, rescan and truncate + * after the remaining size. + */ + count = 0; + m = mp; + for (;;) { + count += m->m_len; + if (m->m_next == (struct mbuf *)0) + break; + m = m->m_next; + } + if (m->m_len > len) { + m->m_len -= len; + if (nul > 0) { + cp = mtod(m, caddr_t)+m->m_len-nul; + for (i = 0; i < nul; i++) + *cp++ = '\0'; + } + return; + } + count -= len; + if (count < 0) + count = 0; + /* + * Correct length for chain is "count". + * Find the mbuf with last data, adjust its length, + * and toss data from remaining mbufs on chain. + */ + for (m = mp; m; m = m->m_next) { + if (m->m_len >= count) { + m->m_len = count; + if (nul > 0) { + cp = mtod(m, caddr_t)+m->m_len-nul; + for (i = 0; i < nul; i++) + *cp++ = '\0'; + } + break; + } + count -= m->m_len; + } + while (m = m->m_next) + m->m_len = 0; +} + +/* + * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) + * - look up fsid in mount list (if not found ret error) + * - get vp and export rights by calling VFS_FHTOVP() + * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon + * - if not lockflag unlock it with VOP_UNLOCK() + */ +nfsrv_fhtovp(fhp, lockflag, vpp, cred, slp, nam, rdonlyp) + fhandle_t *fhp; + int lockflag; + struct vnode **vpp; + struct ucred *cred; + struct nfssvc_sock *slp; + struct mbuf *nam; + int *rdonlyp; +{ + register struct mount *mp; + register struct nfsuid *uidp; + register int i; + struct ucred *credanon; + int error, exflags; + + *vpp = (struct vnode *)0; + if ((mp = getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if (error = VFS_FHTOVP(mp, &fhp->fh_fid, nam, vpp, &exflags, &credanon)) + return (error); + /* + * Check/setup credentials. 
+ */ + if (exflags & MNT_EXKERB) { + uidp = slp->ns_uidh[NUIDHASH(cred->cr_uid)]; + while (uidp) { + if (uidp->nu_uid == cred->cr_uid) + break; + uidp = uidp->nu_hnext; + } + if (uidp) { + cred->cr_uid = uidp->nu_cr.cr_uid; + for (i = 0; i < uidp->nu_cr.cr_ngroups; i++) + cred->cr_groups[i] = uidp->nu_cr.cr_groups[i]; + } else { + vput(*vpp); + return (NQNFS_AUTHERR); + } + } else if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { + cred->cr_uid = credanon->cr_uid; + for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++) + cred->cr_groups[i] = credanon->cr_groups[i]; + } + if (exflags & MNT_EXRDONLY) + *rdonlyp = 1; + else + *rdonlyp = 0; + if (!lockflag) + VOP_UNLOCK(*vpp); + return (0); +} + +/* + * This function compares two net addresses by family and returns TRUE + * if they are the same host. + * If there is any doubt, return FALSE. + * The AF_INET family is handled as a special case so that address mbufs + * don't need to be saved to store "struct in_addr", which is only 4 bytes. 
+ */ +netaddr_match(family, haddr, nam) + int family; + union nethostaddr *haddr; + struct mbuf *nam; +{ + register struct sockaddr_in *inetaddr; + + switch (family) { + case AF_INET: + inetaddr = mtod(nam, struct sockaddr_in *); + if (inetaddr->sin_family == AF_INET && + inetaddr->sin_addr.s_addr == haddr->had_inetaddr) + return (1); + break; +#ifdef ISO + case AF_ISO: + { + register struct sockaddr_iso *isoaddr1, *isoaddr2; + + isoaddr1 = mtod(nam, struct sockaddr_iso *); + isoaddr2 = mtod(haddr->had_nam, struct sockaddr_iso *); + if (isoaddr1->siso_family == AF_ISO && + isoaddr1->siso_nlen > 0 && + isoaddr1->siso_nlen == isoaddr2->siso_nlen && + SAME_ISOADDR(isoaddr1, isoaddr2)) + return (1); + break; + } +#endif /* ISO */ + default: + break; + }; + return (0); +} diff --git a/sys/nfs/nfs_syscalls.c b/sys/nfs/nfs_syscalls.c new file mode 100644 index 00000000000..5d86b42ee20 --- /dev/null +++ b/sys/nfs/nfs_syscalls.c @@ -0,0 +1,874 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs_syscalls.c 8.3 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#ifdef ISO +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +/* Global defs. 
*/ +extern u_long nfs_prog, nfs_vers; +extern int (*nfsrv_procs[NFS_NPROCS])(); +extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; +extern int nfs_numasync; +extern time_t nqnfsstarttime; +extern struct nfsrv_req nsrvq_head; +extern struct nfsd nfsd_head; +extern int nqsrv_writeslack; +extern int nfsrtton; +struct nfssvc_sock *nfs_udpsock, *nfs_cltpsock; +int nuidhash_max = NFS_MAXUIDHASH; +static int nfs_numnfsd = 0; +int nfsd_waiting = 0; +static int notstarted = 1; +static int modify_flag = 0; +static struct nfsdrt nfsdrt; +void nfsrv_cleancache(), nfsrv_rcv(), nfsrv_wakenfsd(), nfs_sndunlock(); +static void nfsd_rt(); +void nfsrv_slpderef(), nfsrv_init(); + +#define TRUE 1 +#define FALSE 0 + +static int nfs_asyncdaemon[NFS_MAXASYNCDAEMON]; +/* + * NFS server system calls + * getfh() lives here too, but maybe should move to kern/vfs_syscalls.c + */ + +/* + * Get file handle system call + */ +struct getfh_args { + char *fname; + fhandle_t *fhp; +}; +getfh(p, uap, retval) + struct proc *p; + register struct getfh_args *uap; + int *retval; +{ + register struct vnode *vp; + fhandle_t fh; + int error; + struct nameidata nd; + + /* + * Must be super user + */ + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->fname, p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + bzero((caddr_t)&fh, sizeof(fh)); + fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; + error = VFS_VPTOFH(vp, &fh.fh_fid); + vput(vp); + if (error) + return (error); + error = copyout((caddr_t)&fh, (caddr_t)uap->fhp, sizeof (fh)); + return (error); +} + +static struct nfssvc_sock nfssvc_sockhead; + +/* + * Nfs server psuedo system call for the nfsd's + * Based on the flag value it either: + * - adds a socket to the selection list + * - remains in the kernel as an nfsd + * - remains in the kernel as an nfsiod + */ +struct nfssvc_args { + int flag; + caddr_t argp; +}; +nfssvc(p, uap, retval) + struct proc *p; + register 
struct nfssvc_args *uap; + int *retval; +{ + struct nameidata nd; + struct file *fp; + struct mbuf *nam; + struct nfsd_args nfsdarg; + struct nfsd_srvargs nfsd_srvargs, *nsd = &nfsd_srvargs; + struct nfsd_cargs ncd; + struct nfsd *nfsd; + struct nfssvc_sock *slp; + struct nfsuid *nuidp, **nuh; + struct nfsmount *nmp; + int error; + + /* + * Must be super user + */ + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + while (nfssvc_sockhead.ns_flag & SLP_INIT) { + nfssvc_sockhead.ns_flag |= SLP_WANTINIT; + (void) tsleep((caddr_t)&nfssvc_sockhead, PSOCK, "nfsd init", 0); + } + if (uap->flag & NFSSVC_BIOD) + error = nfssvc_iod(p); + else if (uap->flag & NFSSVC_MNTD) { + if (error = copyin(uap->argp, (caddr_t)&ncd, sizeof (ncd))) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + ncd.ncd_dirp, p); + if (error = namei(&nd)) + return (error); + if ((nd.ni_vp->v_flag & VROOT) == 0) + error = EINVAL; + nmp = VFSTONFS(nd.ni_vp->v_mount); + vput(nd.ni_vp); + if (error) + return (error); + if ((nmp->nm_flag & NFSMNT_MNTD) && + (uap->flag & NFSSVC_GOTAUTH) == 0) + return (0); + nmp->nm_flag |= NFSMNT_MNTD; + error = nqnfs_clientd(nmp, p->p_ucred, &ncd, uap->flag, + uap->argp, p); + } else if (uap->flag & NFSSVC_ADDSOCK) { + if (error = copyin(uap->argp, (caddr_t)&nfsdarg, + sizeof(nfsdarg))) + return (error); + if (error = getsock(p->p_fd, nfsdarg.sock, &fp)) + return (error); + /* + * Get the client address for connected sockets. + */ + if (nfsdarg.name == NULL || nfsdarg.namelen == 0) + nam = (struct mbuf *)0; + else if (error = sockargs(&nam, nfsdarg.name, nfsdarg.namelen, + MT_SONAME)) + return (error); + error = nfssvc_addsock(fp, nam); + } else { + if (error = copyin(uap->argp, (caddr_t)nsd, sizeof (*nsd))) + return (error); + if ((uap->flag & NFSSVC_AUTHIN) && (nfsd = nsd->nsd_nfsd) && + (nfsd->nd_slp->ns_flag & SLP_VALID)) { + slp = nfsd->nd_slp; + + /* + * First check to see if another nfsd has already + * added this credential. 
+ */ + nuidp = slp->ns_uidh[NUIDHASH(nsd->nsd_uid)]; + while (nuidp) { + if (nuidp->nu_uid == nsd->nsd_uid) + break; + nuidp = nuidp->nu_hnext; + } + if (!nuidp) { + /* + * Nope, so we will. + */ + if (slp->ns_numuids < nuidhash_max) { + slp->ns_numuids++; + nuidp = (struct nfsuid *) + malloc(sizeof (struct nfsuid), M_NFSUID, + M_WAITOK); + } else + nuidp = (struct nfsuid *)0; + if ((slp->ns_flag & SLP_VALID) == 0) { + if (nuidp) + free((caddr_t)nuidp, M_NFSUID); + } else { + if (nuidp == (struct nfsuid *)0) { + nuidp = slp->ns_lruprev; + remque(nuidp); + if (nuidp->nu_hprev) + nuidp->nu_hprev->nu_hnext = + nuidp->nu_hnext; + if (nuidp->nu_hnext) + nuidp->nu_hnext->nu_hprev = + nuidp->nu_hprev; + } + nuidp->nu_cr = nsd->nsd_cr; + if (nuidp->nu_cr.cr_ngroups > NGROUPS) + nuidp->nu_cr.cr_ngroups = NGROUPS; + nuidp->nu_cr.cr_ref = 1; + nuidp->nu_uid = nsd->nsd_uid; + insque(nuidp, (struct nfsuid *)slp); + nuh = &slp->ns_uidh[NUIDHASH(nsd->nsd_uid)]; + if (nuidp->nu_hnext = *nuh) + nuidp->nu_hnext->nu_hprev = nuidp; + nuidp->nu_hprev = (struct nfsuid *)0; + *nuh = nuidp; + } + } + } + if ((uap->flag & NFSSVC_AUTHINFAIL) && (nfsd = nsd->nsd_nfsd)) + nfsd->nd_flag |= NFSD_AUTHFAIL; + error = nfssvc_nfsd(nsd, uap->argp, p); + } + if (error == EINTR || error == ERESTART) + error = 0; + return (error); +} + +/* + * Adds a socket to the list for servicing by nfsds. + */ +nfssvc_addsock(fp, mynam) + struct file *fp; + struct mbuf *mynam; +{ + register struct mbuf *m; + register int siz; + register struct nfssvc_sock *slp; + register struct socket *so; + struct nfssvc_sock *tslp; + int error, s; + + so = (struct socket *)fp->f_data; + tslp = (struct nfssvc_sock *)0; + /* + * Add it to the list, as required. 
+ */ + if (so->so_proto->pr_protocol == IPPROTO_UDP) { + tslp = nfs_udpsock; + if (tslp->ns_flag & SLP_VALID) { + m_freem(mynam); + return (EPERM); + } +#ifdef ISO + } else if (so->so_proto->pr_protocol == ISOPROTO_CLTP) { + tslp = nfs_cltpsock; + if (tslp->ns_flag & SLP_VALID) { + m_freem(mynam); + return (EPERM); + } +#endif /* ISO */ + } + if (so->so_type == SOCK_STREAM) + siz = NFS_MAXPACKET + sizeof (u_long); + else + siz = NFS_MAXPACKET; + if (error = soreserve(so, siz, siz)) { + m_freem(mynam); + return (error); + } + + /* + * Set protocol specific options { for now TCP only } and + * reserve some space. For datagram sockets, this can get called + * repeatedly for the same socket, but that isn't harmful. + */ + if (so->so_type == SOCK_STREAM) { + MGET(m, M_WAIT, MT_SOOPTS); + *mtod(m, int *) = 1; + m->m_len = sizeof(int); + sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m); + } + if (so->so_proto->pr_domain->dom_family == AF_INET && + so->so_proto->pr_protocol == IPPROTO_TCP) { + MGET(m, M_WAIT, MT_SOOPTS); + *mtod(m, int *) = 1; + m->m_len = sizeof(int); + sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m); + } + so->so_rcv.sb_flags &= ~SB_NOINTR; + so->so_rcv.sb_timeo = 0; + so->so_snd.sb_flags &= ~SB_NOINTR; + so->so_snd.sb_timeo = 0; + if (tslp) + slp = tslp; + else { + slp = (struct nfssvc_sock *) + malloc(sizeof (struct nfssvc_sock), M_NFSSVC, M_WAITOK); + bzero((caddr_t)slp, sizeof (struct nfssvc_sock)); + slp->ns_prev = nfssvc_sockhead.ns_prev; + slp->ns_prev->ns_next = slp; + slp->ns_next = &nfssvc_sockhead; + nfssvc_sockhead.ns_prev = slp; + slp->ns_lrunext = slp->ns_lruprev = (struct nfsuid *)slp; + } + slp->ns_so = so; + slp->ns_nam = mynam; + fp->f_count++; + slp->ns_fp = fp; + s = splnet(); + so->so_upcallarg = (caddr_t)slp; + so->so_upcall = nfsrv_rcv; + slp->ns_flag = (SLP_VALID | SLP_NEEDQ); + nfsrv_wakenfsd(slp); + splx(s); + return (0); +} + +/* + * Called by nfssvc() for nfsds. Just loops around servicing rpc requests + * until it is killed by a signal. 
+ */
+nfssvc_nfsd(nsd, argp, p)
+	struct nfsd_srvargs *nsd;	/* kernel copy of the caller's args; nsd_nfsd holds this daemon's nfsd record */
+	caddr_t argp;			/* user address *nsd is copied back to when ENEEDAUTH is returned */
+	struct proc *p;			/* calling process, recorded in nd_procp */
+{
+	register struct mbuf *m, *nam2;
+	register int siz;
+	register struct nfssvc_sock *slp;
+	register struct socket *so;
+	register int *solockp;
+	struct nfsd *nd = nsd->nsd_nfsd;
+	struct mbuf *mreq, *nam;
+	struct timeval starttime;
+	struct nfsuid *uidp;
+	int error, cacherep, s;
+	int sotype;
+
+	s = splnet();
+	if (nd == (struct nfsd *)0) {	/* first entry for this daemon: allocate and enqueue its record */
+		nsd->nsd_nfsd = nd = (struct nfsd *)
+			malloc(sizeof (struct nfsd), M_NFSD, M_WAITOK);
+		bzero((caddr_t)nd, sizeof (struct nfsd));
+		nd->nd_procp = p;
+		nd->nd_cr.cr_ref = 1;
+		insque(nd, &nfsd_head);
+		nd->nd_nqlflag = NQL_NOVAL;
+		nfs_numnfsd++;
+	}
+	/*
+	 * Loop getting rpc requests until SIGKILL.
+	 *
+	 * Each pass: unless a request is already in progress, sleep until
+	 * a socket is assigned to us or NFSD_CHECKSLP is set, pick up a
+	 * socket flagged SLP_VALID|SLP_DOREC, and fetch a request with
+	 * nfsrv_dorec(); then choose between doing the rpc, replying or
+	 * dropping it (cacherep), and finally try for another request on
+	 * the same socket.
+	 */
+	for (;;) {
+		if ((nd->nd_flag & NFSD_REQINPROG) == 0) {
+			while (nd->nd_slp == (struct nfssvc_sock *)0 &&
+				 (nfsd_head.nd_flag & NFSD_CHECKSLP) == 0) {
+				nd->nd_flag |= NFSD_WAITING;
+				nfsd_waiting++;
+				error = tsleep((caddr_t)nd, PSOCK | PCATCH, "nfsd", 0);
+				nfsd_waiting--;
+				if (error)
+					goto done;	/* tsleep() returned nonzero: leave the service loop */
+			}
+			if (nd->nd_slp == (struct nfssvc_sock *)0 &&
+				(nfsd_head.nd_flag & NFSD_CHECKSLP)) {
+				slp = nfssvc_sockhead.ns_next;
+				while (slp != &nfssvc_sockhead) {
+				    if ((slp->ns_flag & (SLP_VALID | SLP_DOREC))
+					== (SLP_VALID | SLP_DOREC)) {
+					    slp->ns_flag &= ~SLP_DOREC;
+					    slp->ns_sref++;	/* reference dropped later via nfsrv_slpderef() */
+					    nd->nd_slp = slp;
+					    break;
+				    }
+				    slp = slp->ns_next;
+				}
+				if (slp == &nfssvc_sockhead)	/* scanned the whole list without a match */
+					nfsd_head.nd_flag &= ~NFSD_CHECKSLP;
+			}
+			if ((slp = nd->nd_slp) == (struct nfssvc_sock *)0)
+				continue;
+			if (slp->ns_flag & SLP_VALID) {
+				if (slp->ns_flag & SLP_DISCONN)
+					nfsrv_zapsock(slp);
+				else if (slp->ns_flag & SLP_NEEDQ) {
+					slp->ns_flag &= ~SLP_NEEDQ;
+					(void) nfs_sndlock(&slp->ns_solock,
+						(struct nfsreq *)0);
+					nfsrv_rcv(slp->ns_so, (caddr_t)slp,
+						M_WAIT);
+					nfs_sndunlock(&slp->ns_solock);
+				}
+				error = nfsrv_dorec(slp, nd);
+				nd->nd_flag |= NFSD_REQINPROG;
+			}
+		} else {
+			error = 0;
+			slp = nd->nd_slp;
+		}
+		if (error || (slp->ns_flag & SLP_VALID) == 0) {	/* no request, or socket went away: give the socket up */
+			nd->nd_slp = (struct nfssvc_sock *)0;
+			nd->nd_flag &= ~NFSD_REQINPROG;
+			nfsrv_slpderef(slp);
+			continue;
+		}
+		splx(s);
+		so = slp->ns_so;
+		sotype = so->so_type;
+		starttime = time;	/* start time handed to nfsd_rt() below */
+		if (so->so_proto->pr_flags & PR_CONNREQUIRED)
+			solockp = &slp->ns_solock;
+		else
+			solockp = (int *)0;
+		/*
+		 * nam == nam2 for connectionless protocols such as UDP
+		 * nam2 == NULL for connection based protocols to disable
+		 *    recent request caching.
+		 */
+		if (nam2 = nd->nd_nam) {
+			nam = nam2;
+			cacherep = RC_CHECKIT;
+		} else {
+			nam = slp->ns_nam;
+			cacherep = RC_DOIT;
+		}
+
+		/*
+		 * Check to see if authorization is needed.
+		 */
+		if (nd->nd_flag & NFSD_NEEDAUTH) {
+			static int logauth = 0;	/* nonzero once the UDP warning below has been logged */
+
+			nd->nd_flag &= ~NFSD_NEEDAUTH;
+			/*
+			 * Check for a mapping already installed.
+			 */
+			uidp = slp->ns_uidh[NUIDHASH(nd->nd_cr.cr_uid)];
+			while (uidp) {
+				if (uidp->nu_uid == nd->nd_cr.cr_uid)
+					break;
+				uidp = uidp->nu_hnext;
+			}
+			if (!uidp) {
+			    nsd->nsd_uid = nd->nd_cr.cr_uid;
+			    if (nam2 && logauth++ == 0)
+				log(LOG_WARNING, "Kerberized NFS using UDP\n");
+			    nsd->nsd_haddr =
+			      mtod(nam, struct sockaddr_in *)->sin_addr.s_addr;
+			    nsd->nsd_authlen = nd->nd_authlen;
+			    if (copyout(nd->nd_authstr, nsd->nsd_authstr,
+				nd->nd_authlen) == 0 &&
+				copyout((caddr_t)nsd, argp, sizeof (*nsd)) == 0)
+				return (ENEEDAUTH);	/* returns without going through done: nd stays allocated and queued */
+			    cacherep = RC_DROPIT;	/* could not hand the request to user level: drop it */
+			}
+		}
+		if (cacherep == RC_CHECKIT)
+			cacherep = nfsrv_getcache(nam2, nd, &mreq);
+
+		/*
+		 * Check for just starting up for NQNFS and send
+		 * fake "try again later" replies to the NQNFS clients.
+		 */
+		if (notstarted && nqnfsstarttime <= time.tv_sec) {
+			if (modify_flag) {
+				nqnfsstarttime = time.tv_sec + nqsrv_writeslack;
+				modify_flag = 0;
+			} else
+				notstarted = 0;
+		}
+		if (notstarted) {
+			if (nd->nd_nqlflag == NQL_NOVAL)
+				cacherep = RC_DROPIT;
+			else if (nd->nd_procnum != NFSPROC_WRITE) {
+				nd->nd_procnum = NFSPROC_NOOP;
+				nd->nd_repstat = NQNFS_TRYLATER;
+				cacherep = RC_DOIT;
+			} else
+				modify_flag = 1;
+		} else if (nd->nd_flag & NFSD_AUTHFAIL) {
+			nd->nd_flag &= ~NFSD_AUTHFAIL;
+			nd->nd_procnum = NFSPROC_NOOP;
+			nd->nd_repstat = NQNFS_AUTHERR;
+			cacherep = RC_DOIT;
+		}
+
+		switch (cacherep) {
+		case RC_DOIT:
+			error = (*(nfsrv_procs[nd->nd_procnum]))(nd,
+				nd->nd_mrep, nd->nd_md, nd->nd_dpos, &nd->nd_cr,
+				nam, &mreq);
+			if (nd->nd_cr.cr_ref != 1) {
+				printf("nfssvc cref=%d\n", nd->nd_cr.cr_ref);
+				panic("nfssvc cref");
+			}
+			if (error) {
+				if (nd->nd_procnum != NQNFSPROC_VACATED)
+					nfsstats.srv_errs++;
+				if (nam2) {
+					nfsrv_updatecache(nam2, nd, FALSE, mreq);
+					m_freem(nam2);
+				}
+				break;
+			}
+			nfsstats.srvrpccnt[nd->nd_procnum]++;
+			if (nam2)
+				nfsrv_updatecache(nam2, nd, TRUE, mreq);
+			nd->nd_mrep = (struct mbuf *)0;	/* no break: falls through into RC_REPLY to send mreq */
+		case RC_REPLY:
+			m = mreq;
+			siz = 0;
+			while (m) {	/* total the reply length over the mbuf chain */
+				siz += m->m_len;
+				m = m->m_next;
+			}
+			if (siz <= 0 || siz > NFS_MAXPACKET) {
+				printf("mbuf siz=%d\n",siz);
+				panic("Bad nfs svc reply");
+			}
+			m = mreq;
+			m->m_pkthdr.len = siz;
+			m->m_pkthdr.rcvif = (struct ifnet *)0;
+			/*
+			 * For stream protocols, prepend a Sun RPC
+			 * Record Mark.
+			 */
+			if (sotype == SOCK_STREAM) {
+				M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
+				*mtod(m, u_long *) = htonl(0x80000000 | siz);
+			}
+			if (solockp)
+				(void) nfs_sndlock(solockp, (struct nfsreq *)0);
+			if (slp->ns_flag & SLP_VALID)
+			    error = nfs_send(so, nam2, m, (struct nfsreq *)0);
+			else {
+			    error = EPIPE;
+			    m_freem(m);
+			}
+			if (nfsrtton)
+				nfsd_rt(&starttime, sotype, nd, nam, cacherep);
+			if (nam2)
+				MFREE(nam2, m);
+			if (nd->nd_mrep)
+				m_freem(nd->nd_mrep);
+			if (error == EPIPE)
+				nfsrv_zapsock(slp);
+			if (solockp)
+				nfs_sndunlock(solockp);
+			if (error == EINTR || error == ERESTART) {
+				nfsrv_slpderef(slp);
+				s = splnet();	/* done: expects to run at splnet and calls splx(s) */
+				goto done;
+			}
+			break;
+		case RC_DROPIT:
+			if (nfsrtton)
+				nfsd_rt(&starttime, sotype, nd, nam, cacherep);
+			m_freem(nd->nd_mrep);
+			m_freem(nam2);
+			break;
+		};
+		s = splnet();
+		if (nfsrv_dorec(slp, nd)) {	/* nonzero: nothing more to service on this socket right now */
+			nd->nd_flag &= ~NFSD_REQINPROG;
+			nd->nd_slp = (struct nfssvc_sock *)0;
+			nfsrv_slpderef(slp);
+		}
+	}
+done:
+	remque(nd);	/* unlink this daemon's record from the nfsd_head list */
+	splx(s);
+	free((caddr_t)nd, M_NFSD);
+	nsd->nsd_nfsd = (struct nfsd *)0;
+	if (--nfs_numnfsd == 0)	/* that was the last nfsd */
+		nfsrv_init(TRUE);	/* Reinitialize everything */
+	return (error);
+}
+
+/*
+ * Asynchronous I/O daemons for client nfs.
+ * They do read-ahead and write-behind operations on the block I/O cache.
+ * Never returns unless it fails or gets killed.
+ */
+nfssvc_iod(p)
+	struct proc *p;
+{
+	register struct buf *bp;
+	register int i, myiod;
+	int error = 0;
+
+	/*
+	 * p is the calling process; it is published in nfs_iodwant[] while
+	 * this daemon is idle.  Returns EBUSY when all NFS_MAXASYNCDAEMON
+	 * slots are taken, otherwise the nonzero tsleep() result that ended
+	 * the service loop.
+	 */
+
+	/*
+	 * Assign my position or return error if too many already running
+	 */
+	myiod = -1;
+	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
+		if (nfs_asyncdaemon[i] == 0) {
+			nfs_asyncdaemon[i]++;
+			myiod = i;
+			break;
+		}
+	if (myiod == -1)
+		return (EBUSY);
+	nfs_numasync++;
+	/*
+	 * Just loop around doing our stuff until SIGKILL
+	 */
+	for (;;) {
+		/* Idle: advertise ourselves and sleep until work is queued. */
+		while (nfs_bufq.tqh_first == NULL && error == 0) {
+			nfs_iodwant[myiod] = p;
+			error = tsleep((caddr_t)&nfs_iodwant[myiod],
+				PWAIT | PCATCH, "nfsidl", 0);
+		}
+		/* Drain the queue, even if the sleep above was interrupted. */
+		while ((bp = nfs_bufq.tqh_first) != NULL) {
+			/* Take one off the front of the list */
+			TAILQ_REMOVE(&nfs_bufq, bp, b_freelist);
+			if (bp->b_flags & B_READ)
+			    (void) nfs_doio(bp, bp->b_rcred, (struct proc *)0);
+			else
+			    (void) nfs_doio(bp, bp->b_wcred, (struct proc *)0);
+		}
+		if (error) {
+			/*
+			 * An interrupted sleep leaves our proc pointer in
+			 * nfs_iodwant[]; withdraw it so the slot does not
+			 * advertise a daemon that has gone away.
+			 * NOTE(review): the code that reads nfs_iodwant[]
+			 * is not in this file -- confirm it treats a null
+			 * entry as "no idle daemon".
+			 */
+			nfs_iodwant[myiod] = (struct proc *)0;
+			nfs_asyncdaemon[myiod] = 0;
+			nfs_numasync--;
+			return (error);
+		}
+	}
+}
+
+/*
+ * Shut down a socket associated with an nfssvc_sock structure.
+ * Should be called with the send lock set, if required.
+ * The trick here is to increment the sref at the start, so that the nfsds
+ * will stop using it and clear ns_flag at the end so that it will not be
+ * reassigned during cleanup.
+ */ +nfsrv_zapsock(slp) + register struct nfssvc_sock *slp; +{ + register struct nfsuid *nuidp, *onuidp; + register int i; + struct socket *so; + struct file *fp; + struct mbuf *m; + + slp->ns_flag &= ~SLP_ALLFLAGS; + if (fp = slp->ns_fp) { + slp->ns_fp = (struct file *)0; + so = slp->ns_so; + so->so_upcall = NULL; + soshutdown(so, 2); + closef(fp, (struct proc *)0); + if (slp->ns_nam) + MFREE(slp->ns_nam, m); + m_freem(slp->ns_raw); + m_freem(slp->ns_rec); + nuidp = slp->ns_lrunext; + while (nuidp != (struct nfsuid *)slp) { + onuidp = nuidp; + nuidp = nuidp->nu_lrunext; + free((caddr_t)onuidp, M_NFSUID); + } + slp->ns_lrunext = slp->ns_lruprev = (struct nfsuid *)slp; + for (i = 0; i < NUIDHASHSIZ; i++) + slp->ns_uidh[i] = (struct nfsuid *)0; + } +} + +/* + * Get an authorization string for the uid by having the mount_nfs sitting + * on this mount point porpous out of the kernel and do it. + */ +nfs_getauth(nmp, rep, cred, auth_type, auth_str, auth_len) + register struct nfsmount *nmp; + struct nfsreq *rep; + struct ucred *cred; + int *auth_type; + char **auth_str; + int *auth_len; +{ + int error = 0; + + while ((nmp->nm_flag & NFSMNT_WAITAUTH) == 0) { + nmp->nm_flag |= NFSMNT_WANTAUTH; + (void) tsleep((caddr_t)&nmp->nm_authtype, PSOCK, + "nfsauth1", 2 * hz); + if (error = nfs_sigintr(nmp, rep, rep->r_procp)) { + nmp->nm_flag &= ~NFSMNT_WANTAUTH; + return (error); + } + } + nmp->nm_flag &= ~(NFSMNT_WAITAUTH | NFSMNT_WANTAUTH); + nmp->nm_authstr = *auth_str = (char *)malloc(RPCAUTH_MAXSIZ, M_TEMP, M_WAITOK); + nmp->nm_authuid = cred->cr_uid; + wakeup((caddr_t)&nmp->nm_authstr); + + /* + * And wait for mount_nfs to do its stuff. 
+ */ + while ((nmp->nm_flag & NFSMNT_HASAUTH) == 0 && error == 0) { + (void) tsleep((caddr_t)&nmp->nm_authlen, PSOCK, + "nfsauth2", 2 * hz); + error = nfs_sigintr(nmp, rep, rep->r_procp); + } + if (nmp->nm_flag & NFSMNT_AUTHERR) { + nmp->nm_flag &= ~NFSMNT_AUTHERR; + error = EAUTH; + } + if (error) + free((caddr_t)*auth_str, M_TEMP); + else { + *auth_type = nmp->nm_authtype; + *auth_len = nmp->nm_authlen; + } + nmp->nm_flag &= ~NFSMNT_HASAUTH; + nmp->nm_flag |= NFSMNT_WAITAUTH; + if (nmp->nm_flag & NFSMNT_WANTAUTH) { + nmp->nm_flag &= ~NFSMNT_WANTAUTH; + wakeup((caddr_t)&nmp->nm_authtype); + } + return (error); +} + +/* + * Derefence a server socket structure. If it has no more references and + * is no longer valid, you can throw it away. + */ +void +nfsrv_slpderef(slp) + register struct nfssvc_sock *slp; +{ + if (--(slp->ns_sref) == 0 && (slp->ns_flag & SLP_VALID) == 0) { + slp->ns_prev->ns_next = slp->ns_next; + slp->ns_next->ns_prev = slp->ns_prev; + free((caddr_t)slp, M_NFSSVC); + } +} + +/* + * Initialize the data structures for the server. + * Handshake with any new nfsds starting up to avoid any chance of + * corruption. 
+ */
+void
+nfsrv_init(terminating)
+	int terminating;
+{
+	register struct nfssvc_sock *cursor;
+	struct nfssvc_sock *doomed;
+
+	/* SLP_INIT on the list head marks an initialization in progress. */
+	if ((nfssvc_sockhead.ns_flag & SLP_INIT) != 0)
+		panic("nfsd init");
+	nfssvc_sockhead.ns_flag |= SLP_INIT;
+
+	if (terminating) {
+		/* Shut down, unlink and free every socket on the list. */
+		for (cursor = nfssvc_sockhead.ns_next;
+		    cursor != &nfssvc_sockhead; ) {
+			doomed = cursor;
+			if ((doomed->ns_flag & SLP_VALID) != 0)
+				nfsrv_zapsock(doomed);
+			doomed->ns_next->ns_prev = doomed->ns_prev;
+			doomed->ns_prev->ns_next = doomed->ns_next;
+			cursor = doomed->ns_next;
+			free((caddr_t)doomed, M_NFSSVC);
+		}
+		/* And clear out server cache */
+		nfsrv_cleancache();
+	}
+
+	/* Fresh, zeroed entries for the two datagram sockets. */
+	nfs_udpsock = (struct nfssvc_sock *)
+	    malloc(sizeof (struct nfssvc_sock), M_NFSSVC, M_WAITOK);
+	bzero((caddr_t)nfs_udpsock, sizeof (struct nfssvc_sock));
+	nfs_cltpsock = (struct nfssvc_sock *)
+	    malloc(sizeof (struct nfssvc_sock), M_NFSSVC, M_WAITOK);
+	bzero((caddr_t)nfs_cltpsock, sizeof (struct nfssvc_sock));
+
+	/* Build the ring: head <-> udp <-> cltp <-> head. */
+	nfssvc_sockhead.ns_next = nfs_udpsock;
+	nfssvc_sockhead.ns_prev = nfs_cltpsock;
+	nfs_udpsock->ns_prev = &nfssvc_sockhead;
+	nfs_udpsock->ns_next = nfs_cltpsock;
+	nfs_cltpsock->ns_prev = nfs_udpsock;
+	nfs_cltpsock->ns_next = &nfssvc_sockhead;
+
+	/* Empty uid LRU lists point back at their owning structure. */
+	nfs_udpsock->ns_lruprev = (struct nfsuid *)nfs_udpsock;
+	nfs_udpsock->ns_lrunext = nfs_udpsock->ns_lruprev;
+	nfs_cltpsock->ns_lruprev = (struct nfsuid *)nfs_cltpsock;
+	nfs_cltpsock->ns_lrunext = nfs_cltpsock->ns_lruprev;
+
+	/* Empty nfsd list. */
+	nfsd_head.nd_prev = &nfsd_head;
+	nfsd_head.nd_next = nfsd_head.nd_prev;
+	nfsd_head.nd_flag = 0;
+
+	/* Done: release anyone who set SLP_WANTINIT while waiting. */
+	nfssvc_sockhead.ns_flag &= ~SLP_INIT;
+	if ((nfssvc_sockhead.ns_flag & SLP_WANTINIT) != 0) {
+		nfssvc_sockhead.ns_flag &= ~SLP_WANTINIT;
+		wakeup((caddr_t)&nfssvc_sockhead);
+	}
+}
+
+/*
+ * Add entries to the server monitor log.
+ */ +static void +nfsd_rt(startp, sotype, nd, nam, cacherep) + struct timeval *startp; + int sotype; + register struct nfsd *nd; + struct mbuf *nam; + int cacherep; +{ + register struct drt *rt; + + rt = &nfsdrt.drt[nfsdrt.pos]; + if (cacherep == RC_DOIT) + rt->flag = 0; + else if (cacherep == RC_REPLY) + rt->flag = DRT_CACHEREPLY; + else + rt->flag = DRT_CACHEDROP; + if (sotype == SOCK_STREAM) + rt->flag |= DRT_TCP; + if (nd->nd_nqlflag != NQL_NOVAL) + rt->flag |= DRT_NQNFS; + rt->proc = nd->nd_procnum; + if (mtod(nam, struct sockaddr *)->sa_family == AF_INET) + rt->ipadr = mtod(nam, struct sockaddr_in *)->sin_addr.s_addr; + else + rt->ipadr = INADDR_ANY; + rt->resptime = ((time.tv_sec - startp->tv_sec) * 1000000) + + (time.tv_usec - startp->tv_usec); + rt->tstamp = time; + nfsdrt.pos = (nfsdrt.pos + 1) % NFSRTTLOGSIZ; +} diff --git a/sys/nfs/nfs_vfsops.c b/sys/nfs/nfs_vfsops.c new file mode 100644 index 00000000000..1f186760689 --- /dev/null +++ b/sys/nfs/nfs_vfsops.c @@ -0,0 +1,740 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs_vfsops.c 8.3 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * nfs vfs operations. + */ +struct vfsops nfs_vfsops = { + nfs_mount, + nfs_start, + nfs_unmount, + nfs_root, + nfs_quotactl, + nfs_statfs, + nfs_sync, + nfs_vget, + nfs_fhtovp, + nfs_vptofh, + nfs_init, +}; + +/* + * This structure must be filled in by a primary bootstrap or bootstrap + * server for a diskless/dataless machine. It is initialized below just + * to ensure that it is allocated to initialized data (.data not .bss). 
+ */
+struct nfs_diskless nfs_diskless = { 0 };
+
+extern u_long nfs_procids[NFS_NPROCS];
+extern u_long nfs_prog, nfs_vers;
+void nfs_disconnect __P((struct nfsmount *));
+void nfsargs_ntoh __P((struct nfs_args *));
+static struct mount *nfs_mountdiskless __P((char *, char *, int,
+    struct sockaddr_in *, struct nfs_args *, register struct vnode **));
+
+#define TRUE	1
+#define	FALSE	0
+
+/*
+ * nfs statfs call
+ *
+ * Fills *sbp for the mount mp from an NFSPROC_STATFS reply requested on
+ * the vnode for the mount's stored file handle (nm_fh).  The file counts
+ * are taken from the reply only when the mount has NFSMNT_NQNFS set and
+ * are reported as 0 otherwise.  Returns 0 or an error.
+ *
+ * NOTE(review): the nfsm_* names are macros defined outside this file;
+ * they presumably use the locals declared below (bpos, dpos, cp2, t1,
+ * mreq, mrep, md, mb, mb2, error) and leave through nfsm_reqdone on
+ * failure -- confirm against their definitions before renaming or
+ * reordering anything here.
+ */
+int
+nfs_statfs(mp, sbp, p)
+	struct mount *mp;
+	register struct statfs *sbp;
+	struct proc *p;
+{
+	register struct vnode *vp;
+	register struct nfsv2_statfs *sfp;
+	register caddr_t cp;
+	register long t1;
+	caddr_t bpos, dpos, cp2;
+	int error = 0, isnq;
+	struct mbuf *mreq, *mrep, *md, *mb, *mb2;
+	struct nfsmount *nmp;
+	struct ucred *cred;
+	struct nfsnode *np;
+
+	nmp = VFSTONFS(mp);
+	isnq = (nmp->nm_flag & NFSMNT_NQNFS);	/* nonzero for an NQNFS mount */
+	if (error = nfs_nget(mp, &nmp->nm_fh, &np))
+		return (error);
+	vp = NFSTOV(np);
+	nfsstats.rpccnt[NFSPROC_STATFS]++;
+	cred = crget();		/* private credential for this request; released with crfree() below */
+	cred->cr_ngroups = 1;
+	nfsm_reqhead(vp, NFSPROC_STATFS, NFSX_FH);
+	nfsm_fhtom(vp);
+	nfsm_request(vp, NFSPROC_STATFS, p, cred);
+	nfsm_dissect(sfp, struct nfsv2_statfs *, NFSX_STATFS(isnq));
+	sbp->f_type = MOUNT_NFS;
+	sbp->f_flags = nmp->nm_flag;
+	sbp->f_iosize = NFS_MAXDGRAMDATA;
+	sbp->f_bsize = fxdr_unsigned(long, sfp->sf_bsize);
+	sbp->f_blocks = fxdr_unsigned(long, sfp->sf_blocks);
+	sbp->f_bfree = fxdr_unsigned(long, sfp->sf_bfree);
+	sbp->f_bavail = fxdr_unsigned(long, sfp->sf_bavail);
+	if (isnq) {
+		sbp->f_files = fxdr_unsigned(long, sfp->sf_files);
+		sbp->f_ffree = fxdr_unsigned(long, sfp->sf_ffree);
+	} else {
+		sbp->f_files = 0;
+		sbp->f_ffree = 0;
+	}
+	if (sbp != &mp->mnt_stat) {	/* caller supplied its own buffer: copy the mount names into it */
+		bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN);
+		bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN);
+	}
+	nfsm_reqdone;
+	vrele(vp);
+	crfree(cred);
+	return (error);
+}
+
+/*
+ * Mount a remote root fs via. nfs.
This depends on the info in the + * nfs_diskless structure that has been filled in properly by some primary + * bootstrap. + * It goes something like this: + * - do enough of "ifconfig" by calling ifioctl() so that the system + * can talk to the server + * - If nfs_diskless.mygateway is filled in, use that address as + * a default gateway. + * - hand craft the swap nfs vnode hanging off a fake mount point + * if swdevt[0].sw_dev == NODEV + * - build the rootfs mount point and call mountnfs() to do the rest. + */ +int +nfs_mountroot() +{ + register struct mount *mp; + register struct nfs_diskless *nd = &nfs_diskless; + struct socket *so; + struct vnode *vp; + struct proc *p = curproc; /* XXX */ + int error, i; + + /* + * XXX time must be non-zero when we init the interface or else + * the arp code will wedge... + */ + if (time.tv_sec == 0) + time.tv_sec = 1; + +#ifdef notyet + /* Set up swap credentials. */ + proc0.p_ucred->cr_uid = ntohl(nd->swap_ucred.cr_uid); + proc0.p_ucred->cr_gid = ntohl(nd->swap_ucred.cr_gid); + if ((proc0.p_ucred->cr_ngroups = ntohs(nd->swap_ucred.cr_ngroups)) > + NGROUPS) + proc0.p_ucred->cr_ngroups = NGROUPS; + for (i = 0; i < proc0.p_ucred->cr_ngroups; i++) + proc0.p_ucred->cr_groups[i] = ntohl(nd->swap_ucred.cr_groups[i]); +#endif + + /* + * Do enough of ifconfig(8) so that the critical net interface can + * talk to the server. + */ + if (error = socreate(nd->myif.ifra_addr.sa_family, &so, SOCK_DGRAM, 0)) + panic("nfs_mountroot: socreate: %d", error); + if (error = ifioctl(so, SIOCAIFADDR, (caddr_t)&nd->myif, p)) + panic("nfs_mountroot: SIOCAIFADDR: %d", error); + soclose(so); + + /* + * If the gateway field is filled in, set it as the default route. 
+ */ + if (nd->mygateway.sin_len != 0) { + struct sockaddr_in mask, sin; + + bzero((caddr_t)&mask, sizeof(mask)); + sin = mask; + sin.sin_family = AF_INET; + sin.sin_len = sizeof(sin); + if (error = rtrequest(RTM_ADD, (struct sockaddr *)&sin, + (struct sockaddr *)&nd->mygateway, + (struct sockaddr *)&mask, + RTF_UP | RTF_GATEWAY, (struct rtentry **)0)) + panic("nfs_mountroot: RTM_ADD: %d", error); + } + + /* + * If swapping to an nfs node (indicated by swdevt[0].sw_dev == NODEV): + * Create a fake mount point just for the swap vnode so that the + * swap file can be on a different server from the rootfs. + */ + if (swdevt[0].sw_dev == NODEV) { + nd->swap_args.fh = (nfsv2fh_t *)nd->swap_fh; + (void) nfs_mountdiskless(nd->swap_hostnam, "/swap", 0, + &nd->swap_saddr, &nd->swap_args, &vp); + + /* + * Since the swap file is not the root dir of a file system, + * hack it to a regular file. + */ + vp->v_type = VREG; + vp->v_flag = 0; + swapdev_vp = vp; + VREF(vp); + swdevt[0].sw_vp = vp; + swdevt[0].sw_nblks = ntohl(nd->swap_nblks); + } else if (bdevvp(swapdev, &swapdev_vp)) + panic("nfs_mountroot: can't setup swapdev_vp"); + + /* + * Create the rootfs mount point. + */ + nd->root_args.fh = (nfsv2fh_t *)nd->root_fh; + mp = nfs_mountdiskless(nd->root_hostnam, "/", MNT_RDONLY, + &nd->root_saddr, &nd->root_args, &vp); + + if (vfs_lock(mp)) + panic("nfs_mountroot: vfs_lock"); + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mp->mnt_flag |= MNT_ROOTFS; + mp->mnt_vnodecovered = NULLVP; + vfs_unlock(mp); + rootvp = vp; + + /* + * This is not really an nfs issue, but it is much easier to + * set hostname here and then let the "/etc/rc.xxx" files + * mount the right /var based upon its preset value. 
+ */ + bcopy(nd->my_hostnam, hostname, MAXHOSTNAMELEN); + hostname[MAXHOSTNAMELEN - 1] = '\0'; + for (i = 0; i < MAXHOSTNAMELEN; i++) + if (hostname[i] == '\0') + break; + hostnamelen = i; + inittodr(ntohl(nd->root_time)); + return (0); +} + +/* + * Internal version of mount system call for diskless setup. + */ +static struct mount * +nfs_mountdiskless(path, which, mountflag, sin, args, vpp) + char *path; + char *which; + int mountflag; + struct sockaddr_in *sin; + struct nfs_args *args; + register struct vnode **vpp; +{ + register struct mount *mp; + register struct mbuf *m; + register int error; + + mp = (struct mount *)malloc((u_long)sizeof(struct mount), + M_MOUNT, M_NOWAIT); + if (mp == NULL) + panic("nfs_mountroot: %s mount malloc", which); + bzero((char *)mp, (u_long)sizeof(struct mount)); + mp->mnt_op = &nfs_vfsops; + mp->mnt_flag = mountflag; + + MGET(m, MT_SONAME, M_DONTWAIT); + if (m == NULL) + panic("nfs_mountroot: %s mount mbuf", which); + bcopy((caddr_t)sin, mtod(m, caddr_t), sin->sin_len); + m->m_len = sin->sin_len; + nfsargs_ntoh(args); + if (error = mountnfs(args, mp, m, which, path, vpp)) + panic("nfs_mountroot: mount %s on %s: %d", path, which, error); + + return (mp); +} + +/* + * Convert the integer fields of the nfs_args structure from net byte order + * to host byte order. Called by nfs_mountroot() above. + */ +void +nfsargs_ntoh(nfsp) + register struct nfs_args *nfsp; +{ + + NTOHL(nfsp->sotype); + NTOHL(nfsp->proto); + NTOHL(nfsp->flags); + NTOHL(nfsp->wsize); + NTOHL(nfsp->rsize); + NTOHL(nfsp->timeo); + NTOHL(nfsp->retrans); + NTOHL(nfsp->maxgrouplist); + NTOHL(nfsp->readahead); + NTOHL(nfsp->leaseterm); + NTOHL(nfsp->deadthresh); +} + +/* + * VFS Operations. 
+ * + * mount system call + * It seems a bit dumb to copyinstr() the host and path here and then + * bcopy() them in mountnfs(), but I wanted to detect errors before + * doing the sockargs() call because sockargs() allocates an mbuf and + * an error after that means that I have to release the mbuf. + */ +/* ARGSUSED */ +int +nfs_mount(mp, path, data, ndp, p) + struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + int error; + struct nfs_args args; + struct mbuf *nam; + struct vnode *vp; + char pth[MNAMELEN], hst[MNAMELEN]; + u_int len; + nfsv2fh_t nfh; + + if (error = copyin(data, (caddr_t)&args, sizeof (struct nfs_args))) + return (error); + if (error = copyin((caddr_t)args.fh, (caddr_t)&nfh, sizeof (nfsv2fh_t))) + return (error); + if (error = copyinstr(path, pth, MNAMELEN-1, &len)) + return (error); + bzero(&pth[len], MNAMELEN - len); + if (error = copyinstr(args.hostname, hst, MNAMELEN-1, &len)) + return (error); + bzero(&hst[len], MNAMELEN - len); + /* sockargs() call must be after above copyin() calls */ + if (error = sockargs(&nam, (caddr_t)args.addr, + args.addrlen, MT_SONAME)) + return (error); + args.fh = &nfh; + error = mountnfs(&args, mp, nam, pth, hst, &vp); + return (error); +} + +/* + * Common code for mount and mountroot + */ +int +mountnfs(argp, mp, nam, pth, hst, vpp) + register struct nfs_args *argp; + register struct mount *mp; + struct mbuf *nam; + char *pth, *hst; + struct vnode **vpp; +{ + register struct nfsmount *nmp; + struct nfsnode *np; + int error; + + if (mp->mnt_flag & MNT_UPDATE) { + nmp = VFSTONFS(mp); + /* update paths, file handles, etc, here XXX */ + m_freem(nam); + return (0); + } else { + MALLOC(nmp, struct nfsmount *, sizeof (struct nfsmount), + M_NFSMNT, M_WAITOK); + bzero((caddr_t)nmp, sizeof (struct nfsmount)); + mp->mnt_data = (qaddr_t)nmp; + } + getnewfsid(mp, MOUNT_NFS); + nmp->nm_mountp = mp; + nmp->nm_flag = argp->flags; + if ((nmp->nm_flag & (NFSMNT_NQNFS | NFSMNT_MYWRITE)) == 
+ (NFSMNT_NQNFS | NFSMNT_MYWRITE)) { + error = EPERM; + goto bad; + } + if (nmp->nm_flag & NFSMNT_NQNFS) + /* + * We have to set mnt_maxsymlink to a non-zero value so + * that COMPAT_43 routines will know that we are setting + * the d_type field in directories (and can zero it for + * unsuspecting binaries). + */ + mp->mnt_maxsymlinklen = 1; + nmp->nm_timeo = NFS_TIMEO; + nmp->nm_retry = NFS_RETRANS; + nmp->nm_wsize = NFS_WSIZE; + nmp->nm_rsize = NFS_RSIZE; + nmp->nm_numgrps = NFS_MAXGRPS; + nmp->nm_readahead = NFS_DEFRAHEAD; + nmp->nm_leaseterm = NQ_DEFLEASE; + nmp->nm_deadthresh = NQ_DEADTHRESH; + nmp->nm_tnext = (struct nfsnode *)nmp; + nmp->nm_tprev = (struct nfsnode *)nmp; + nmp->nm_inprog = NULLVP; + bcopy((caddr_t)argp->fh, (caddr_t)&nmp->nm_fh, sizeof(nfsv2fh_t)); + mp->mnt_stat.f_type = MOUNT_NFS; + bcopy(hst, mp->mnt_stat.f_mntfromname, MNAMELEN); + bcopy(pth, mp->mnt_stat.f_mntonname, MNAMELEN); + nmp->nm_nam = nam; + + if ((argp->flags & NFSMNT_TIMEO) && argp->timeo > 0) { + nmp->nm_timeo = (argp->timeo * NFS_HZ + 5) / 10; + if (nmp->nm_timeo < NFS_MINTIMEO) + nmp->nm_timeo = NFS_MINTIMEO; + else if (nmp->nm_timeo > NFS_MAXTIMEO) + nmp->nm_timeo = NFS_MAXTIMEO; + } + + if ((argp->flags & NFSMNT_RETRANS) && argp->retrans > 1) { + nmp->nm_retry = argp->retrans; + if (nmp->nm_retry > NFS_MAXREXMIT) + nmp->nm_retry = NFS_MAXREXMIT; + } + + if ((argp->flags & NFSMNT_WSIZE) && argp->wsize > 0) { + nmp->nm_wsize = argp->wsize; + /* Round down to multiple of blocksize */ + nmp->nm_wsize &= ~0x1ff; + if (nmp->nm_wsize <= 0) + nmp->nm_wsize = 512; + else if (nmp->nm_wsize > NFS_MAXDATA) + nmp->nm_wsize = NFS_MAXDATA; + } + if (nmp->nm_wsize > MAXBSIZE) + nmp->nm_wsize = MAXBSIZE; + + if ((argp->flags & NFSMNT_RSIZE) && argp->rsize > 0) { + nmp->nm_rsize = argp->rsize; + /* Round down to multiple of blocksize */ + nmp->nm_rsize &= ~0x1ff; + if (nmp->nm_rsize <= 0) + nmp->nm_rsize = 512; + else if (nmp->nm_rsize > NFS_MAXDATA) + nmp->nm_rsize = NFS_MAXDATA; + } + 
if (nmp->nm_rsize > MAXBSIZE) + nmp->nm_rsize = MAXBSIZE; + if ((argp->flags & NFSMNT_MAXGRPS) && argp->maxgrouplist >= 0 && + argp->maxgrouplist <= NFS_MAXGRPS) + nmp->nm_numgrps = argp->maxgrouplist; + if ((argp->flags & NFSMNT_READAHEAD) && argp->readahead >= 0 && + argp->readahead <= NFS_MAXRAHEAD) + nmp->nm_readahead = argp->readahead; + if ((argp->flags & NFSMNT_LEASETERM) && argp->leaseterm >= 2 && + argp->leaseterm <= NQ_MAXLEASE) + nmp->nm_leaseterm = argp->leaseterm; + if ((argp->flags & NFSMNT_DEADTHRESH) && argp->deadthresh >= 1 && + argp->deadthresh <= NQ_NEVERDEAD) + nmp->nm_deadthresh = argp->deadthresh; + /* Set up the sockets and per-host congestion */ + nmp->nm_sotype = argp->sotype; + nmp->nm_soproto = argp->proto; + + /* + * For Connection based sockets (TCP,...) defer the connect until + * the first request, in case the server is not responding. + */ + if (nmp->nm_sotype == SOCK_DGRAM && + (error = nfs_connect(nmp, (struct nfsreq *)0))) + goto bad; + + /* + * This is silly, but it has to be set so that vinifod() works. + * We do not want to do an nfs_statfs() here since we can get + * stuck on a dead server and we are holding a lock on the mount + * point. + */ + mp->mnt_stat.f_iosize = NFS_MAXDGRAMDATA; + /* + * A reference count is needed on the nfsnode representing the + * remote root. If this object is not persistent, then backward + * traversals of the mount point (i.e. "..") will not work if + * the nfsnode gets flushed out of the cache. Ufs does not have + * this problem, because one can identify root inodes by their + * number == ROOTINO (2). 
+ */ + if (error = nfs_nget(mp, &nmp->nm_fh, &np)) + goto bad; + *vpp = NFSTOV(np); + + return (0); +bad: + nfs_disconnect(nmp); + free((caddr_t)nmp, M_NFSMNT); + m_freem(nam); + return (error); +} + +/* + * unmount system call + */ +int +nfs_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + register struct nfsmount *nmp; + struct nfsnode *np; + struct vnode *vp; + int error, flags = 0; + extern int doforce; + + if (mntflags & MNT_FORCE) { + if (!doforce || (mp->mnt_flag & MNT_ROOTFS)) + return (EINVAL); + flags |= FORCECLOSE; + } + nmp = VFSTONFS(mp); + /* + * Goes something like this.. + * - Check for activity on the root vnode (other than ourselves). + * - Call vflush() to clear out vnodes for this file system, + * except for the root vnode. + * - Decrement reference on the vnode representing remote root. + * - Close the socket + * - Free up the data structures + */ + /* + * We need to decrement the ref. count on the nfsnode representing + * the remote root. See comment in mountnfs(). The VFS unmount() + * has done vput on this vnode, otherwise we would get deadlock! + */ + if (error = nfs_nget(mp, &nmp->nm_fh, &np)) + return(error); + vp = NFSTOV(np); + if (vp->v_usecount > 2) { + vput(vp); + return (EBUSY); + } + + /* + * Must handshake with nqnfs_clientd() if it is active. + */ + nmp->nm_flag |= NFSMNT_DISMINPROG; + while (nmp->nm_inprog != NULLVP) + (void) tsleep((caddr_t)&lbolt, PSOCK, "nfsdism", 0); + if (error = vflush(mp, vp, flags)) { + vput(vp); + nmp->nm_flag &= ~NFSMNT_DISMINPROG; + return (error); + } + + /* + * We are now committed to the unmount. + * For NQNFS, let the server daemon free the nfsmount structure. + */ + if (nmp->nm_flag & (NFSMNT_NQNFS | NFSMNT_KERB)) + nmp->nm_flag |= NFSMNT_DISMNT; + + /* + * There are two reference counts to get rid of here. 
+ */ + vrele(vp); + vrele(vp); + vgone(vp); + nfs_disconnect(nmp); + m_freem(nmp->nm_nam); + + if ((nmp->nm_flag & (NFSMNT_NQNFS | NFSMNT_KERB)) == 0) + free((caddr_t)nmp, M_NFSMNT); + return (0); +} + +/* + * Return root of a filesystem + */ +int +nfs_root(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + register struct vnode *vp; + struct nfsmount *nmp; + struct nfsnode *np; + int error; + + nmp = VFSTONFS(mp); + if (error = nfs_nget(mp, &nmp->nm_fh, &np)) + return (error); + vp = NFSTOV(np); + vp->v_type = VDIR; + vp->v_flag = VROOT; + *vpp = vp; + return (0); +} + +extern int syncprt; + +/* + * Flush out the buffer cache + */ +/* ARGSUSED */ +int +nfs_sync(mp, waitfor, cred, p) + struct mount *mp; + int waitfor; + struct ucred *cred; + struct proc *p; +{ + register struct vnode *vp; + int error, allerror = 0; + + /* + * Force stale buffer cache information to be flushed. + */ +loop: + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = vp->v_mntvnodes.le_next) { + /* + * If the vnode that we are about to sync is no longer + * associated with this mount point, start over. + */ + if (vp->v_mount != mp) + goto loop; + if (VOP_ISLOCKED(vp) || vp->v_dirtyblkhd.lh_first == NULL) + continue; + if (vget(vp, 1)) + goto loop; + if (error = VOP_FSYNC(vp, cred, waitfor, p)) + allerror = error; + vput(vp); + } + return (allerror); +} + +/* + * NFS flat namespace lookup. + * Currently unsupported. 
+ */ +/* ARGSUSED */ +int +nfs_vget(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + + return (EOPNOTSUPP); +} + +/* + * At this point, this should never happen + */ +/* ARGSUSED */ +int +nfs_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) + register struct mount *mp; + struct fid *fhp; + struct mbuf *nam; + struct vnode **vpp; + int *exflagsp; + struct ucred **credanonp; +{ + + return (EINVAL); +} + +/* + * Vnode pointer to File handle, should never happen either + */ +/* ARGSUSED */ +int +nfs_vptofh(vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + + return (EINVAL); +} + +/* + * Vfs start routine, a no-op. + */ +/* ARGSUSED */ +int +nfs_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + + return (0); +} + +/* + * Do operations associated with quotas, not supported + */ +/* ARGSUSED */ +int +nfs_quotactl(mp, cmd, uid, arg, p) + struct mount *mp; + int cmd; + uid_t uid; + caddr_t arg; + struct proc *p; +{ + + return (EOPNOTSUPP); +} diff --git a/sys/nfs/nfs_vnops.c b/sys/nfs/nfs_vnops.c new file mode 100644 index 00000000000..a909b48dc67 --- /dev/null +++ b/sys/nfs/nfs_vnops.c @@ -0,0 +1,2539 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)nfs_vnops.c 8.5 (Berkeley) 2/13/94 + */ + +/* + * vnode op calls for sun nfs version 2 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Defs */ +#define TRUE 1 +#define FALSE 0 + +/* + * Global vfs data structures for nfs + */ +int (**nfsv2_vnodeop_p)(); +struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, nfs_lookup }, /* lookup */ + { &vop_create_desc, nfs_create }, /* create */ + { &vop_mknod_desc, nfs_mknod }, /* mknod */ + { &vop_open_desc, nfs_open }, /* open */ + { &vop_close_desc, nfs_close }, /* close */ + { &vop_access_desc, nfs_access }, /* access */ + { &vop_getattr_desc, nfs_getattr }, /* getattr */ + { &vop_setattr_desc, nfs_setattr }, /* setattr */ + { &vop_read_desc, nfs_read }, /* read */ + { &vop_write_desc, nfs_write }, /* write */ + { &vop_ioctl_desc, nfs_ioctl }, /* ioctl */ + { &vop_select_desc, nfs_select }, /* select */ + { &vop_mmap_desc, nfs_mmap }, /* mmap */ + { &vop_fsync_desc, nfs_fsync }, /* fsync */ + { &vop_seek_desc, nfs_seek }, /* seek */ + { &vop_remove_desc, nfs_remove }, /* remove */ + { &vop_link_desc, nfs_link }, /* link */ + { &vop_rename_desc, nfs_rename }, /* rename */ + { &vop_mkdir_desc, nfs_mkdir }, /* mkdir */ + { &vop_rmdir_desc, nfs_rmdir }, /* rmdir */ + { &vop_symlink_desc, nfs_symlink }, /* symlink */ + { &vop_readdir_desc, nfs_readdir }, /* readdir */ + { &vop_readlink_desc, nfs_readlink }, /* readlink */ + { &vop_abortop_desc, nfs_abortop }, /* abortop */ + { &vop_inactive_desc, nfs_inactive }, /* inactive */ + { &vop_reclaim_desc, nfs_reclaim }, /* reclaim */ + { &vop_lock_desc, nfs_lock }, /* lock */ + { &vop_unlock_desc, nfs_unlock }, /* unlock */ + { &vop_bmap_desc, nfs_bmap }, /* bmap */ + { &vop_strategy_desc, 
nfs_strategy }, /* strategy */ + { &vop_print_desc, nfs_print }, /* print */ + { &vop_islocked_desc, nfs_islocked }, /* islocked */ + { &vop_pathconf_desc, nfs_pathconf }, /* pathconf */ + { &vop_advlock_desc, nfs_advlock }, /* advlock */ + { &vop_blkatoff_desc, nfs_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, nfs_valloc }, /* valloc */ + { &vop_reallocblks_desc, nfs_reallocblks }, /* reallocblks */ + { &vop_vfree_desc, nfs_vfree }, /* vfree */ + { &vop_truncate_desc, nfs_truncate }, /* truncate */ + { &vop_update_desc, nfs_update }, /* update */ + { &vop_bwrite_desc, vn_bwrite }, + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc nfsv2_vnodeop_opv_desc = + { &nfsv2_vnodeop_p, nfsv2_vnodeop_entries }; + +/* + * Special device vnode ops + */ +int (**spec_nfsv2nodeop_p)(); +struct vnodeopv_entry_desc spec_nfsv2nodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, spec_lookup }, /* lookup */ + { &vop_create_desc, spec_create }, /* create */ + { &vop_mknod_desc, spec_mknod }, /* mknod */ + { &vop_open_desc, spec_open }, /* open */ + { &vop_close_desc, nfsspec_close }, /* close */ + { &vop_access_desc, nfsspec_access }, /* access */ + { &vop_getattr_desc, nfs_getattr }, /* getattr */ + { &vop_setattr_desc, nfs_setattr }, /* setattr */ + { &vop_read_desc, nfsspec_read }, /* read */ + { &vop_write_desc, nfsspec_write }, /* write */ + { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ + { &vop_select_desc, spec_select }, /* select */ + { &vop_mmap_desc, spec_mmap }, /* mmap */ + { &vop_fsync_desc, nfs_fsync }, /* fsync */ + { &vop_seek_desc, spec_seek }, /* seek */ + { &vop_remove_desc, spec_remove }, /* remove */ + { &vop_link_desc, spec_link }, /* link */ + { &vop_rename_desc, spec_rename }, /* rename */ + { &vop_mkdir_desc, spec_mkdir }, /* mkdir */ + { &vop_rmdir_desc, spec_rmdir }, /* rmdir */ + { &vop_symlink_desc, spec_symlink }, /* symlink */ + { &vop_readdir_desc, spec_readdir }, /* readdir */ + { 
&vop_readlink_desc, spec_readlink }, /* readlink */ + { &vop_abortop_desc, spec_abortop }, /* abortop */ + { &vop_inactive_desc, nfs_inactive }, /* inactive */ + { &vop_reclaim_desc, nfs_reclaim }, /* reclaim */ + { &vop_lock_desc, nfs_lock }, /* lock */ + { &vop_unlock_desc, nfs_unlock }, /* unlock */ + { &vop_bmap_desc, spec_bmap }, /* bmap */ + { &vop_strategy_desc, spec_strategy }, /* strategy */ + { &vop_print_desc, nfs_print }, /* print */ + { &vop_islocked_desc, nfs_islocked }, /* islocked */ + { &vop_pathconf_desc, spec_pathconf }, /* pathconf */ + { &vop_advlock_desc, spec_advlock }, /* advlock */ + { &vop_blkatoff_desc, spec_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, spec_valloc }, /* valloc */ + { &vop_reallocblks_desc, spec_reallocblks }, /* reallocblks */ + { &vop_vfree_desc, spec_vfree }, /* vfree */ + { &vop_truncate_desc, spec_truncate }, /* truncate */ + { &vop_update_desc, nfs_update }, /* update */ + { &vop_bwrite_desc, vn_bwrite }, + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc spec_nfsv2nodeop_opv_desc = + { &spec_nfsv2nodeop_p, spec_nfsv2nodeop_entries }; + +#ifdef FIFO +int (**fifo_nfsv2nodeop_p)(); +struct vnodeopv_entry_desc fifo_nfsv2nodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, fifo_lookup }, /* lookup */ + { &vop_create_desc, fifo_create }, /* create */ + { &vop_mknod_desc, fifo_mknod }, /* mknod */ + { &vop_open_desc, fifo_open }, /* open */ + { &vop_close_desc, nfsfifo_close }, /* close */ + { &vop_access_desc, nfsspec_access }, /* access */ + { &vop_getattr_desc, nfs_getattr }, /* getattr */ + { &vop_setattr_desc, nfs_setattr }, /* setattr */ + { &vop_read_desc, nfsfifo_read }, /* read */ + { &vop_write_desc, nfsfifo_write }, /* write */ + { &vop_ioctl_desc, fifo_ioctl }, /* ioctl */ + { &vop_select_desc, fifo_select }, /* select */ + { &vop_mmap_desc, fifo_mmap }, /* mmap */ + { &vop_fsync_desc, nfs_fsync }, /* fsync */ + { &vop_seek_desc, fifo_seek }, /* seek 
*/ + { &vop_remove_desc, fifo_remove }, /* remove */ + { &vop_link_desc, fifo_link }, /* link */ + { &vop_rename_desc, fifo_rename }, /* rename */ + { &vop_mkdir_desc, fifo_mkdir }, /* mkdir */ + { &vop_rmdir_desc, fifo_rmdir }, /* rmdir */ + { &vop_symlink_desc, fifo_symlink }, /* symlink */ + { &vop_readdir_desc, fifo_readdir }, /* readdir */ + { &vop_readlink_desc, fifo_readlink }, /* readlink */ + { &vop_abortop_desc, fifo_abortop }, /* abortop */ + { &vop_inactive_desc, nfs_inactive }, /* inactive */ + { &vop_reclaim_desc, nfs_reclaim }, /* reclaim */ + { &vop_lock_desc, nfs_lock }, /* lock */ + { &vop_unlock_desc, nfs_unlock }, /* unlock */ + { &vop_bmap_desc, fifo_bmap }, /* bmap */ + { &vop_strategy_desc, fifo_badop }, /* strategy */ + { &vop_print_desc, nfs_print }, /* print */ + { &vop_islocked_desc, nfs_islocked }, /* islocked */ + { &vop_pathconf_desc, fifo_pathconf }, /* pathconf */ + { &vop_advlock_desc, fifo_advlock }, /* advlock */ + { &vop_blkatoff_desc, fifo_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, fifo_valloc }, /* valloc */ + { &vop_reallocblks_desc, fifo_reallocblks }, /* reallocblks */ + { &vop_vfree_desc, fifo_vfree }, /* vfree */ + { &vop_truncate_desc, fifo_truncate }, /* truncate */ + { &vop_update_desc, nfs_update }, /* update */ + { &vop_bwrite_desc, vn_bwrite }, + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc = + { &fifo_nfsv2nodeop_p, fifo_nfsv2nodeop_entries }; +#endif /* FIFO */ + +void nqnfs_clientlease(); + +/* + * Global variables + */ +extern u_long nfs_procids[NFS_NPROCS]; +extern u_long nfs_prog, nfs_vers, nfs_true, nfs_false; +extern char nfsiobuf[MAXPHYS+NBPG]; +struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; +int nfs_numasync = 0; +#define DIRHDSIZ (sizeof (struct dirent) - (MAXNAMLEN + 1)) + +/* + * nfs null call from vfs. 
+ */ +int +nfs_null(vp, cred, procp) + struct vnode *vp; + struct ucred *cred; + struct proc *procp; +{ + caddr_t bpos, dpos; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb; + + nfsm_reqhead(vp, NFSPROC_NULL, 0); + nfsm_request(vp, NFSPROC_NULL, procp, cred); + nfsm_reqdone; + return (error); +} + +/* + * nfs access vnode op. + * For nfs, just return ok. File accesses may fail later. + * For nqnfs, use the access rpc to check accessibility. If file modes are + * changed on the server, accesses might still fail later. + */ +int +nfs_access(ap) + struct vop_access_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register u_long *tl; + register caddr_t cp; + caddr_t bpos, dpos; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + + /* + * For nqnfs, do an access rpc, otherwise you are stuck emulating + * ufs_access() locally using the vattr. This may not be correct, + * since the server may apply other access criteria such as + * client uid-->server uid mapping that we do not know about, but + * this is better than just returning anything that is lying about + * in the cache. + */ + if (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) { + nfsstats.rpccnt[NQNFSPROC_ACCESS]++; + nfsm_reqhead(vp, NQNFSPROC_ACCESS, NFSX_FH + 3 * NFSX_UNSIGNED); + nfsm_fhtom(vp); + nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED); + if (ap->a_mode & VREAD) + *tl++ = nfs_true; + else + *tl++ = nfs_false; + if (ap->a_mode & VWRITE) + *tl++ = nfs_true; + else + *tl++ = nfs_false; + if (ap->a_mode & VEXEC) + *tl = nfs_true; + else + *tl = nfs_false; + nfsm_request(vp, NQNFSPROC_ACCESS, ap->a_p, ap->a_cred); + nfsm_reqdone; + return (error); + } else + return (nfsspec_access(ap)); +} + +/* + * nfs open vnode op + * Check to see if the type is ok + * and that deletion is not in progress. + * For paged in text files, you will need to flush the page cache + * if consistency is lost. 
+ */ +/* ARGSUSED */ +int +nfs_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + struct nfsnode *np = VTONFS(vp); + struct nfsmount *nmp = VFSTONFS(vp->v_mount); + struct vattr vattr; + int error; + + if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) + return (EACCES); + if (vp->v_flag & VTEXT) { + /* + * Get a valid lease. If cached data is stale, flush it. + */ + if (nmp->nm_flag & NFSMNT_NQNFS) { + if (NQNFS_CKINVALID(vp, np, NQL_READ)) { + do { + error = nqnfs_getlease(vp, NQL_READ, ap->a_cred, ap->a_p); + } while (error == NQNFS_EXPIRED); + if (error) + return (error); + if (np->n_lrev != np->n_brev || + (np->n_flag & NQNFSNONCACHE)) { + if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, + ap->a_p, 1)) == EINTR) + return (error); + (void) vnode_pager_uncache(vp); + np->n_brev = np->n_lrev; + } + } + } else { + if (np->n_flag & NMODIFIED) { + if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, + ap->a_p, 1)) == EINTR) + return (error); + (void) vnode_pager_uncache(vp); + np->n_attrstamp = 0; + np->n_direofoffset = 0; + if (error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_p)) + return (error); + np->n_mtime = vattr.va_mtime.ts_sec; + } else { + if (error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_p)) + return (error); + if (np->n_mtime != vattr.va_mtime.ts_sec) { + np->n_direofoffset = 0; + if ((error = nfs_vinvalbuf(vp, V_SAVE, + ap->a_cred, ap->a_p, 1)) == EINTR) + return (error); + (void) vnode_pager_uncache(vp); + np->n_mtime = vattr.va_mtime.ts_sec; + } + } + } + } else if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) + np->n_attrstamp = 0; /* For Open/Close consistency */ + return (0); +} + +/* + * nfs close vnode op + * For reg files, invalidate any buffer cache entries. 
+ */ +/* ARGSUSED */ +int +nfs_close(ap) + struct vop_close_args /* { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct nfsnode *np = VTONFS(vp); + int error = 0; + + if (vp->v_type == VREG) { + if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) == 0 && + (np->n_flag & NMODIFIED)) { + error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1); + np->n_attrstamp = 0; + } + if (np->n_flag & NWRITEERR) { + np->n_flag &= ~NWRITEERR; + error = np->n_error; + } + } + return (error); +} + +/* + * nfs getattr call from vfs. + */ +int +nfs_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct nfsnode *np = VTONFS(vp); + register caddr_t cp; + caddr_t bpos, dpos; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + + /* + * Update local times for special files. + */ + if (np->n_flag & (NACC | NUPD)) + np->n_flag |= NCHG; + /* + * First look in the cache. + */ + if (nfs_getattrcache(vp, ap->a_vap) == 0) + return (0); + nfsstats.rpccnt[NFSPROC_GETATTR]++; + nfsm_reqhead(vp, NFSPROC_GETATTR, NFSX_FH); + nfsm_fhtom(vp); + nfsm_request(vp, NFSPROC_GETATTR, ap->a_p, ap->a_cred); + nfsm_loadattr(vp, ap->a_vap); + nfsm_reqdone; + return (error); +} + +/* + * nfs setattr call. 
+ */ +int +nfs_setattr(ap) + struct vop_setattr_args /* { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct nfsv2_sattr *sp; + register caddr_t cp; + register long t1; + caddr_t bpos, dpos, cp2; + u_long *tl; + int error = 0, isnq; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + register struct vnode *vp = ap->a_vp; + register struct nfsnode *np = VTONFS(vp); + register struct vattr *vap = ap->a_vap; + u_quad_t frev, tsize; + + if (vap->va_size != VNOVAL || vap->va_mtime.ts_sec != VNOVAL || + vap->va_atime.ts_sec != VNOVAL) { + if (vap->va_size != VNOVAL) { + if (np->n_flag & NMODIFIED) { + if (vap->va_size == 0) + error = nfs_vinvalbuf(vp, 0, ap->a_cred, + ap->a_p, 1); + else + error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, + ap->a_p, 1); + if (error) + return (error); + } + tsize = np->n_size; + np->n_size = np->n_vattr.va_size = vap->va_size; + vnode_pager_setsize(vp, (u_long)np->n_size); + } else if ((np->n_flag & NMODIFIED) && + (error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, + ap->a_p, 1)) == EINTR) + return (error); + } + nfsstats.rpccnt[NFSPROC_SETATTR]++; + isnq = (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS); + nfsm_reqhead(vp, NFSPROC_SETATTR, NFSX_FH+NFSX_SATTR(isnq)); + nfsm_fhtom(vp); + nfsm_build(sp, struct nfsv2_sattr *, NFSX_SATTR(isnq)); + if (vap->va_mode == (u_short)-1) + sp->sa_mode = VNOVAL; + else + sp->sa_mode = vtonfs_mode(vp->v_type, vap->va_mode); + if (vap->va_uid == (uid_t)-1) + sp->sa_uid = VNOVAL; + else + sp->sa_uid = txdr_unsigned(vap->va_uid); + if (vap->va_gid == (gid_t)-1) + sp->sa_gid = VNOVAL; + else + sp->sa_gid = txdr_unsigned(vap->va_gid); + if (isnq) { + txdr_hyper(&vap->va_size, &sp->sa_nqsize); + txdr_nqtime(&vap->va_atime, &sp->sa_nqatime); + txdr_nqtime(&vap->va_mtime, &sp->sa_nqmtime); + sp->sa_nqflags = txdr_unsigned(vap->va_flags); + sp->sa_nqrdev = VNOVAL; + } else { + sp->sa_nfssize = txdr_unsigned(vap->va_size); + 
txdr_nfstime(&vap->va_atime, &sp->sa_nfsatime); + txdr_nfstime(&vap->va_mtime, &sp->sa_nfsmtime); + } + nfsm_request(vp, NFSPROC_SETATTR, ap->a_p, ap->a_cred); + nfsm_loadattr(vp, (struct vattr *)0); + if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) && + NQNFS_CKCACHABLE(vp, NQL_WRITE)) { + nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED); + fxdr_hyper(tl, &frev); + if (frev > np->n_brev) + np->n_brev = frev; + } + nfsm_reqdone; + if (error) { + np->n_size = np->n_vattr.va_size = tsize; + vnode_pager_setsize(vp, (u_long)np->n_size); + } + return (error); +} + +/* + * nfs lookup call, one step at a time... + * First look in cache + * If not found, unlock the directory nfsnode and do the rpc + */ +int +nfs_lookup(ap) + struct vop_lookup_args /* { + struct vnodeop_desc *a_desc; + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + register struct componentname *cnp = ap->a_cnp; + register struct vnode *dvp = ap->a_dvp; + register struct vnode **vpp = ap->a_vpp; + register int flags = cnp->cn_flags; + register struct vnode *vdp; + register u_long *tl; + register caddr_t cp; + register long t1, t2; + struct nfsmount *nmp; + caddr_t bpos, dpos, cp2; + time_t reqtime; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct vnode *newvp; + long len; + nfsv2fh_t *fhp; + struct nfsnode *np; + int lockparent, wantparent, error = 0; + int nqlflag, cachable; + u_quad_t frev; + + *vpp = NULL; + if (dvp->v_type != VDIR) + return (ENOTDIR); + lockparent = flags & LOCKPARENT; + wantparent = flags & (LOCKPARENT|WANTPARENT); + nmp = VFSTONFS(dvp->v_mount); + np = VTONFS(dvp); + if ((error = cache_lookup(dvp, vpp, cnp)) && error != ENOENT) { + struct vattr vattr; + int vpid; + + vdp = *vpp; + vpid = vdp->v_id; + /* + * See the comment starting `Step through' in ufs/ufs_lookup.c + * for an explanation of the locking protocol + */ + if (dvp == vdp) { + VREF(vdp); + error = 0; + } else + error = vget(vdp, 1); + if (!error) { + if (vpid == vdp->v_id) { 
+ if (nmp->nm_flag & NFSMNT_NQNFS) { + if ((nmp->nm_flag & NFSMNT_NQLOOKLEASE) == 0) { + nfsstats.lookupcache_hits++; + if (cnp->cn_nameiop != LOOKUP && + (flags & ISLASTCN)) + cnp->cn_flags |= SAVENAME; + return (0); + } else if (NQNFS_CKCACHABLE(dvp, NQL_READ)) { + if (np->n_lrev != np->n_brev || + (np->n_flag & NMODIFIED)) { + np->n_direofoffset = 0; + cache_purge(dvp); + error = nfs_vinvalbuf(dvp, 0, + cnp->cn_cred, cnp->cn_proc, + 1); + if (error == EINTR) + return (error); + np->n_brev = np->n_lrev; + } else { + nfsstats.lookupcache_hits++; + if (cnp->cn_nameiop != LOOKUP && + (flags & ISLASTCN)) + cnp->cn_flags |= SAVENAME; + return (0); + } + } + } else if (!VOP_GETATTR(vdp, &vattr, cnp->cn_cred, cnp->cn_proc) && + vattr.va_ctime.ts_sec == VTONFS(vdp)->n_ctime) { + nfsstats.lookupcache_hits++; + if (cnp->cn_nameiop != LOOKUP && + (flags & ISLASTCN)) + cnp->cn_flags |= SAVENAME; + return (0); + } + cache_purge(vdp); + } + vrele(vdp); + } + *vpp = NULLVP; + } + error = 0; + nfsstats.lookupcache_misses++; + nfsstats.rpccnt[NFSPROC_LOOKUP]++; + len = cnp->cn_namelen; + nfsm_reqhead(dvp, NFSPROC_LOOKUP, NFSX_FH+NFSX_UNSIGNED+nfsm_rndup(len)); + + /* + * For nqnfs optionally piggyback a getlease request for the name + * being looked up. 
+ */ + if (nmp->nm_flag & NFSMNT_NQNFS) { + nfsm_build(tl, u_long *, NFSX_UNSIGNED); + if ((nmp->nm_flag & NFSMNT_NQLOOKLEASE) && + ((cnp->cn_flags & MAKEENTRY) && + (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN)))) + *tl = txdr_unsigned(nmp->nm_leaseterm); + else + *tl = 0; + } + nfsm_fhtom(dvp); + nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN); + reqtime = time.tv_sec; + nfsm_request(dvp, NFSPROC_LOOKUP, cnp->cn_proc, cnp->cn_cred); +nfsmout: + if (error) { + if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) && + (flags & ISLASTCN) && error == ENOENT) + error = EJUSTRETURN; + if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) + cnp->cn_flags |= SAVENAME; + return (error); + } + if (nmp->nm_flag & NFSMNT_NQNFS) { + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + if (*tl) { + nqlflag = fxdr_unsigned(int, *tl); + nfsm_dissect(tl, u_long *, 4*NFSX_UNSIGNED); + cachable = fxdr_unsigned(int, *tl++); + reqtime += fxdr_unsigned(int, *tl++); + fxdr_hyper(tl, &frev); + } else + nqlflag = 0; + } + nfsm_dissect(fhp, nfsv2fh_t *, NFSX_FH); + + /* + * Handle RENAME case... 
+ */ + if (cnp->cn_nameiop == RENAME && wantparent && (flags & ISLASTCN)) { + if (!bcmp(np->n_fh.fh_bytes, (caddr_t)fhp, NFSX_FH)) { + m_freem(mrep); + return (EISDIR); + } + if (error = nfs_nget(dvp->v_mount, fhp, &np)) { + m_freem(mrep); + return (error); + } + newvp = NFSTOV(np); + if (error = + nfs_loadattrcache(&newvp, &md, &dpos, (struct vattr *)0)) { + vrele(newvp); + m_freem(mrep); + return (error); + } + *vpp = newvp; + m_freem(mrep); + cnp->cn_flags |= SAVENAME; + return (0); + } + + if (!bcmp(np->n_fh.fh_bytes, (caddr_t)fhp, NFSX_FH)) { + VREF(dvp); + newvp = dvp; + } else { + if (error = nfs_nget(dvp->v_mount, fhp, &np)) { + m_freem(mrep); + return (error); + } + newvp = NFSTOV(np); + } + if (error = nfs_loadattrcache(&newvp, &md, &dpos, (struct vattr *)0)) { + vrele(newvp); + m_freem(mrep); + return (error); + } + m_freem(mrep); + *vpp = newvp; + if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) + cnp->cn_flags |= SAVENAME; + if ((cnp->cn_flags & MAKEENTRY) && + (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN))) { + if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) + np->n_ctime = np->n_vattr.va_ctime.ts_sec; + else if (nqlflag && reqtime > time.tv_sec) + nqnfs_clientlease(nmp, np, nqlflag, cachable, reqtime, + frev); + cache_enter(dvp, *vpp, cnp); + } + return (0); +} + +/* + * nfs read call. + * Just call nfs_bioread() to do the work. 
+ */ +int +nfs_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + + if (vp->v_type != VREG) + return (EPERM); + return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred)); +} + +/* + * nfs readlink call + */ +int +nfs_readlink(ap) + struct vop_readlink_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + + if (vp->v_type != VLNK) + return (EPERM); + return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred)); +} + +/* + * Do a readlink rpc. + * Called by nfs_doio() from below the buffer cache. + */ +int +nfs_readlinkrpc(vp, uiop, cred) + register struct vnode *vp; + struct uio *uiop; + struct ucred *cred; +{ + register u_long *tl; + register caddr_t cp; + register long t1; + caddr_t bpos, dpos, cp2; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + long len; + + nfsstats.rpccnt[NFSPROC_READLINK]++; + nfsm_reqhead(vp, NFSPROC_READLINK, NFSX_FH); + nfsm_fhtom(vp); + nfsm_request(vp, NFSPROC_READLINK, uiop->uio_procp, cred); + nfsm_strsiz(len, NFS_MAXPATHLEN); + nfsm_mtouio(uiop, len); + nfsm_reqdone; + return (error); +} + +/* + * nfs read rpc call + * Ditto above + */ +int +nfs_readrpc(vp, uiop, cred) + register struct vnode *vp; + struct uio *uiop; + struct ucred *cred; +{ + register u_long *tl; + register caddr_t cp; + register long t1; + caddr_t bpos, dpos, cp2; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct nfsmount *nmp; + long len, retlen, tsiz; + + nmp = VFSTONFS(vp->v_mount); + tsiz = uiop->uio_resid; + if (uiop->uio_offset + tsiz > 0xffffffff && + (nmp->nm_flag & NFSMNT_NQNFS) == 0) + return (EFBIG); + while (tsiz > 0) { + nfsstats.rpccnt[NFSPROC_READ]++; + len = (tsiz > nmp->nm_rsize) ? 
nmp->nm_rsize : tsiz; + nfsm_reqhead(vp, NFSPROC_READ, NFSX_FH+NFSX_UNSIGNED*3); + nfsm_fhtom(vp); + nfsm_build(tl, u_long *, NFSX_UNSIGNED*3); + if (nmp->nm_flag & NFSMNT_NQNFS) { + txdr_hyper(&uiop->uio_offset, tl); + *(tl + 2) = txdr_unsigned(len); + } else { + *tl++ = txdr_unsigned(uiop->uio_offset); + *tl++ = txdr_unsigned(len); + *tl = 0; + } + nfsm_request(vp, NFSPROC_READ, uiop->uio_procp, cred); + nfsm_loadattr(vp, (struct vattr *)0); + nfsm_strsiz(retlen, nmp->nm_rsize); + nfsm_mtouio(uiop, retlen); + m_freem(mrep); + if (retlen < len) + tsiz = 0; + else + tsiz -= len; + } +nfsmout: + return (error); +} + +/* + * nfs write call + */ +int +nfs_writerpc(vp, uiop, cred, ioflags) + register struct vnode *vp; + struct uio *uiop; + struct ucred *cred; + int ioflags; +{ + register u_long *tl; + register caddr_t cp; + register long t1; + caddr_t bpos, dpos, cp2; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct nfsmount *nmp; + struct nfsnode *np = VTONFS(vp); + u_quad_t frev; + long len, tsiz; + + nmp = VFSTONFS(vp->v_mount); + tsiz = uiop->uio_resid; + if (uiop->uio_offset + tsiz > 0xffffffff && + (nmp->nm_flag & NFSMNT_NQNFS) == 0) + return (EFBIG); + while (tsiz > 0) { + nfsstats.rpccnt[NFSPROC_WRITE]++; + len = (tsiz > nmp->nm_wsize) ? 
nmp->nm_wsize : tsiz; + nfsm_reqhead(vp, NFSPROC_WRITE, + NFSX_FH+NFSX_UNSIGNED*4+nfsm_rndup(len)); + nfsm_fhtom(vp); + nfsm_build(tl, u_long *, NFSX_UNSIGNED * 4); + if (nmp->nm_flag & NFSMNT_NQNFS) { + txdr_hyper(&uiop->uio_offset, tl); + tl += 2; + if (ioflags & IO_APPEND) + *tl++ = txdr_unsigned(1); + else + *tl++ = 0; + } else { + *++tl = txdr_unsigned(uiop->uio_offset); + tl += 2; + } + *tl = txdr_unsigned(len); + nfsm_uiotom(uiop, len); + nfsm_request(vp, NFSPROC_WRITE, uiop->uio_procp, cred); + nfsm_loadattr(vp, (struct vattr *)0); + if (nmp->nm_flag & NFSMNT_MYWRITE) + VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime.ts_sec; + else if ((nmp->nm_flag & NFSMNT_NQNFS) && + NQNFS_CKCACHABLE(vp, NQL_WRITE)) { + nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED); + fxdr_hyper(tl, &frev); + if (frev > np->n_brev) + np->n_brev = frev; + } + m_freem(mrep); + tsiz -= len; + } +nfsmout: + if (error) + uiop->uio_resid = tsiz; + return (error); +} + +/* + * nfs mknod call + * This is a kludge. Use a create rpc but with the IFMT bits of the mode + * set to specify the file type and the size field for rdev. 
+ */ +/* ARGSUSED */ +int +nfs_mknod(ap) + struct vop_mknod_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + register struct vnode *dvp = ap->a_dvp; + register struct vattr *vap = ap->a_vap; + register struct componentname *cnp = ap->a_cnp; + register struct nfsv2_sattr *sp; + register u_long *tl; + register caddr_t cp; + register long t1, t2; + struct vnode *newvp; + struct vattr vattr; + char *cp2; + caddr_t bpos, dpos; + int error = 0, isnq; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + u_long rdev; + + isnq = (VFSTONFS(dvp->v_mount)->nm_flag & NFSMNT_NQNFS); + if (vap->va_type == VCHR || vap->va_type == VBLK) + rdev = txdr_unsigned(vap->va_rdev); +#ifdef FIFO + else if (vap->va_type == VFIFO) + rdev = 0xffffffff; +#endif /* FIFO */ + else { + VOP_ABORTOP(dvp, cnp); + vput(dvp); + return (EOPNOTSUPP); + } + if (error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc)) { + VOP_ABORTOP(dvp, cnp); + vput(dvp); + return (error); + } + nfsstats.rpccnt[NFSPROC_CREATE]++; + nfsm_reqhead(dvp, NFSPROC_CREATE, + NFSX_FH+NFSX_UNSIGNED+nfsm_rndup(cnp->cn_namelen)+NFSX_SATTR(isnq)); + nfsm_fhtom(dvp); + nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_build(sp, struct nfsv2_sattr *, NFSX_SATTR(isnq)); + sp->sa_mode = vtonfs_mode(vap->va_type, vap->va_mode); + sp->sa_uid = txdr_unsigned(cnp->cn_cred->cr_uid); + sp->sa_gid = txdr_unsigned(vattr.va_gid); + if (isnq) { + sp->sa_nqrdev = rdev; + sp->sa_nqflags = 0; + txdr_nqtime(&vap->va_atime, &sp->sa_nqatime); + txdr_nqtime(&vap->va_mtime, &sp->sa_nqmtime); + } else { + sp->sa_nfssize = rdev; + txdr_nfstime(&vap->va_atime, &sp->sa_nfsatime); + txdr_nfstime(&vap->va_mtime, &sp->sa_nfsmtime); + } + nfsm_request(dvp, NFSPROC_CREATE, cnp->cn_proc, cnp->cn_cred); + nfsm_mtofh(dvp, newvp); + nfsm_reqdone; + if (!error && (cnp->cn_flags & MAKEENTRY)) + cache_enter(dvp, newvp, cnp); + FREE(cnp->cn_pnbuf, M_NAMEI); + 
VTONFS(dvp)->n_flag |= NMODIFIED; + VTONFS(dvp)->n_attrstamp = 0; + vrele(dvp); + return (error); +} + +/* + * nfs file create call + */ +int +nfs_create(ap) + struct vop_create_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + register struct vnode *dvp = ap->a_dvp; + register struct vattr *vap = ap->a_vap; + register struct componentname *cnp = ap->a_cnp; + register struct nfsv2_sattr *sp; + register u_long *tl; + register caddr_t cp; + register long t1, t2; + caddr_t bpos, dpos, cp2; + int error = 0, isnq; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct vattr vattr; + + if (error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc)) { + VOP_ABORTOP(dvp, cnp); + vput(dvp); + return (error); + } + nfsstats.rpccnt[NFSPROC_CREATE]++; + isnq = (VFSTONFS(dvp->v_mount)->nm_flag & NFSMNT_NQNFS); + nfsm_reqhead(dvp, NFSPROC_CREATE, + NFSX_FH+NFSX_UNSIGNED+nfsm_rndup(cnp->cn_namelen)+NFSX_SATTR(isnq)); + nfsm_fhtom(dvp); + nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_build(sp, struct nfsv2_sattr *, NFSX_SATTR(isnq)); + sp->sa_mode = vtonfs_mode(vap->va_type, vap->va_mode); + sp->sa_uid = txdr_unsigned(cnp->cn_cred->cr_uid); + sp->sa_gid = txdr_unsigned(vattr.va_gid); + if (isnq) { + u_quad_t qval = 0; + + txdr_hyper(&qval, &sp->sa_nqsize); + sp->sa_nqflags = 0; + sp->sa_nqrdev = -1; + txdr_nqtime(&vap->va_atime, &sp->sa_nqatime); + txdr_nqtime(&vap->va_mtime, &sp->sa_nqmtime); + } else { + sp->sa_nfssize = 0; + txdr_nfstime(&vap->va_atime, &sp->sa_nfsatime); + txdr_nfstime(&vap->va_mtime, &sp->sa_nfsmtime); + } + nfsm_request(dvp, NFSPROC_CREATE, cnp->cn_proc, cnp->cn_cred); + nfsm_mtofh(dvp, *ap->a_vpp); + nfsm_reqdone; + if (!error && (cnp->cn_flags & MAKEENTRY)) + cache_enter(dvp, *ap->a_vpp, cnp); + FREE(cnp->cn_pnbuf, M_NAMEI); + VTONFS(dvp)->n_flag |= NMODIFIED; + VTONFS(dvp)->n_attrstamp = 0; + vrele(dvp); + return (error); +} + +/* + * nfs file remove call 
+ * To try and make nfs semantics closer to ufs semantics, a file that has + * other processes using the vnode is renamed instead of removed and then + * removed later on the last close. + * - If v_usecount > 1 + * If a rename is not already in the works + * call nfs_sillyrename() to set it up + * else + * do the remove rpc + */ +int +nfs_remove(ap) + struct vop_remove_args /* { + struct vnodeop_desc *a_desc; + struct vnode * a_dvp; + struct vnode * a_vp; + struct componentname * a_cnp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct vnode *dvp = ap->a_dvp; + register struct componentname *cnp = ap->a_cnp; + register struct nfsnode *np = VTONFS(vp); + register u_long *tl; + register caddr_t cp; + register long t2; + caddr_t bpos, dpos; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + + if (vp->v_usecount > 1) { + if (!np->n_sillyrename) + error = nfs_sillyrename(dvp, vp, cnp); + } else { + /* + * Purge the name cache so that the chance of a lookup for + * the name succeeding while the remove is in progress is + * minimized. Without node locking it can still happen, such + * that an I/O op returns ESTALE, but since you get this if + * another host removes the file.. + */ + cache_purge(vp); + /* + * Throw away biocache buffers. Mainly to avoid + * unnecessary delayed writes. + */ + error = nfs_vinvalbuf(vp, 0, cnp->cn_cred, cnp->cn_proc, 1); + if (error == EINTR) + return (error); + /* Do the rpc */ + nfsstats.rpccnt[NFSPROC_REMOVE]++; + nfsm_reqhead(dvp, NFSPROC_REMOVE, + NFSX_FH+NFSX_UNSIGNED+nfsm_rndup(cnp->cn_namelen)); + nfsm_fhtom(dvp); + nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_request(dvp, NFSPROC_REMOVE, cnp->cn_proc, cnp->cn_cred); + nfsm_reqdone; + FREE(cnp->cn_pnbuf, M_NAMEI); + VTONFS(dvp)->n_flag |= NMODIFIED; + VTONFS(dvp)->n_attrstamp = 0; + /* + * Kludge City: If the first reply to the remove rpc is lost.. 
+ * the reply to the retransmitted request will be ENOENT + * since the file was in fact removed + * Therefore, we cheat and return success. + */ + if (error == ENOENT) + error = 0; + } + np->n_attrstamp = 0; + vrele(dvp); + vrele(vp); + return (error); +} + +/* + * nfs file remove rpc called from nfs_inactive + */ +int +nfs_removeit(sp) + register struct sillyrename *sp; +{ + register u_long *tl; + register caddr_t cp; + register long t2; + caddr_t bpos, dpos; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + + nfsstats.rpccnt[NFSPROC_REMOVE]++; + nfsm_reqhead(sp->s_dvp, NFSPROC_REMOVE, + NFSX_FH+NFSX_UNSIGNED+nfsm_rndup(sp->s_namlen)); + nfsm_fhtom(sp->s_dvp); + nfsm_strtom(sp->s_name, sp->s_namlen, NFS_MAXNAMLEN); + nfsm_request(sp->s_dvp, NFSPROC_REMOVE, NULL, sp->s_cred); + nfsm_reqdone; + VTONFS(sp->s_dvp)->n_flag |= NMODIFIED; + VTONFS(sp->s_dvp)->n_attrstamp = 0; + return (error); +} + +/* + * nfs file rename call + */ +int +nfs_rename(ap) + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap; +{ + register struct vnode *fvp = ap->a_fvp; + register struct vnode *tvp = ap->a_tvp; + register struct vnode *fdvp = ap->a_fdvp; + register struct vnode *tdvp = ap->a_tdvp; + register struct componentname *tcnp = ap->a_tcnp; + register struct componentname *fcnp = ap->a_fcnp; + register u_long *tl; + register caddr_t cp; + register long t2; + caddr_t bpos, dpos; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + + /* Check for cross-device rename */ + if ((fvp->v_mount != tdvp->v_mount) || + (tvp && (fvp->v_mount != tvp->v_mount))) { + error = EXDEV; + goto out; + } + + + nfsstats.rpccnt[NFSPROC_RENAME]++; + nfsm_reqhead(fdvp, NFSPROC_RENAME, + (NFSX_FH+NFSX_UNSIGNED)*2+nfsm_rndup(fcnp->cn_namelen)+ + nfsm_rndup(fcnp->cn_namelen)); /* or fcnp->cn_cred?*/ + nfsm_fhtom(fdvp); + 
nfsm_strtom(fcnp->cn_nameptr, fcnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_fhtom(tdvp); + nfsm_strtom(tcnp->cn_nameptr, tcnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_request(fdvp, NFSPROC_RENAME, tcnp->cn_proc, tcnp->cn_cred); + nfsm_reqdone; + VTONFS(fdvp)->n_flag |= NMODIFIED; + VTONFS(fdvp)->n_attrstamp = 0; + VTONFS(tdvp)->n_flag |= NMODIFIED; + VTONFS(tdvp)->n_attrstamp = 0; + if (fvp->v_type == VDIR) { + if (tvp != NULL && tvp->v_type == VDIR) + cache_purge(tdvp); + cache_purge(fdvp); + } +out: + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + vrele(fdvp); + vrele(fvp); + /* + * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry. + */ + if (error == ENOENT) + error = 0; + return (error); +} + +/* + * nfs file rename rpc called from nfs_remove() above + */ +int +nfs_renameit(sdvp, scnp, sp) + struct vnode *sdvp; + struct componentname *scnp; + register struct sillyrename *sp; +{ + register u_long *tl; + register caddr_t cp; + register long t2; + caddr_t bpos, dpos; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + + nfsstats.rpccnt[NFSPROC_RENAME]++; + nfsm_reqhead(sdvp, NFSPROC_RENAME, + (NFSX_FH+NFSX_UNSIGNED)*2+nfsm_rndup(scnp->cn_namelen)+ + nfsm_rndup(sp->s_namlen)); + nfsm_fhtom(sdvp); + nfsm_strtom(scnp->cn_nameptr, scnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_fhtom(sdvp); + nfsm_strtom(sp->s_name, sp->s_namlen, NFS_MAXNAMLEN); + nfsm_request(sdvp, NFSPROC_RENAME, scnp->cn_proc, scnp->cn_cred); + nfsm_reqdone; + FREE(scnp->cn_pnbuf, M_NAMEI); + VTONFS(sdvp)->n_flag |= NMODIFIED; + VTONFS(sdvp)->n_attrstamp = 0; + return (error); +} + +/* + * nfs hard link create call + */ +int +nfs_link(ap) + struct vop_link_args /* { + struct vnode *a_vp; + struct vnode *a_tdvp; + struct componentname *a_cnp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct vnode *tdvp = ap->a_tdvp; + register struct componentname *cnp = ap->a_cnp; + register u_long *tl; + register caddr_t cp; + register long t2; + 
caddr_t bpos, dpos; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + + if (vp->v_mount != tdvp->v_mount) { + /*VOP_ABORTOP(vp, cnp);*/ + if (tdvp == vp) + vrele(vp); + else + vput(vp); + return (EXDEV); + } + + nfsstats.rpccnt[NFSPROC_LINK]++; + nfsm_reqhead(tdvp, NFSPROC_LINK, + NFSX_FH*2+NFSX_UNSIGNED+nfsm_rndup(cnp->cn_namelen)); + nfsm_fhtom(tdvp); + nfsm_fhtom(vp); + nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_request(tdvp, NFSPROC_LINK, cnp->cn_proc, cnp->cn_cred); + nfsm_reqdone; + FREE(cnp->cn_pnbuf, M_NAMEI); + VTONFS(tdvp)->n_attrstamp = 0; + VTONFS(tdvp)->n_flag |= NMODIFIED; + VTONFS(vp)->n_attrstamp = 0; + vrele(vp); + /* + * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. + */ + if (error == EEXIST) + error = 0; + return (error); +} + +/* + * nfs symbolic link create call + */ +/* start here */ +int +nfs_symlink(ap) + struct vop_symlink_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + char *a_target; + } */ *ap; +{ + register struct vnode *dvp = ap->a_dvp; + register struct vattr *vap = ap->a_vap; + register struct componentname *cnp = ap->a_cnp; + register struct nfsv2_sattr *sp; + register u_long *tl; + register caddr_t cp; + register long t2; + caddr_t bpos, dpos; + int slen, error = 0, isnq; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + + nfsstats.rpccnt[NFSPROC_SYMLINK]++; + slen = strlen(ap->a_target); + isnq = (VFSTONFS(dvp->v_mount)->nm_flag & NFSMNT_NQNFS); + nfsm_reqhead(dvp, NFSPROC_SYMLINK, NFSX_FH+2*NFSX_UNSIGNED+ + nfsm_rndup(cnp->cn_namelen)+nfsm_rndup(slen)+NFSX_SATTR(isnq)); + nfsm_fhtom(dvp); + nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_strtom(ap->a_target, slen, NFS_MAXPATHLEN); + nfsm_build(sp, struct nfsv2_sattr *, NFSX_SATTR(isnq)); + sp->sa_mode = vtonfs_mode(VLNK, vap->va_mode); + sp->sa_uid = txdr_unsigned(cnp->cn_cred->cr_uid); + sp->sa_gid = txdr_unsigned(cnp->cn_cred->cr_gid); + if 
(isnq) { + quad_t qval = -1; + + txdr_hyper(&qval, &sp->sa_nqsize); + sp->sa_nqflags = 0; + txdr_nqtime(&vap->va_atime, &sp->sa_nqatime); + txdr_nqtime(&vap->va_mtime, &sp->sa_nqmtime); + } else { + sp->sa_nfssize = -1; + txdr_nfstime(&vap->va_atime, &sp->sa_nfsatime); + txdr_nfstime(&vap->va_mtime, &sp->sa_nfsmtime); + } + nfsm_request(dvp, NFSPROC_SYMLINK, cnp->cn_proc, cnp->cn_cred); + nfsm_reqdone; + FREE(cnp->cn_pnbuf, M_NAMEI); + VTONFS(dvp)->n_flag |= NMODIFIED; + VTONFS(dvp)->n_attrstamp = 0; + vrele(dvp); + /* + * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. + */ + if (error == EEXIST) + error = 0; + return (error); +} + +/* + * nfs make dir call + */ +int +nfs_mkdir(ap) + struct vop_mkdir_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + register struct vnode *dvp = ap->a_dvp; + register struct vattr *vap = ap->a_vap; + register struct componentname *cnp = ap->a_cnp; + register struct vnode **vpp = ap->a_vpp; + register struct nfsv2_sattr *sp; + register u_long *tl; + register caddr_t cp; + register long t1, t2; + register int len; + caddr_t bpos, dpos, cp2; + int error = 0, firsttry = 1, isnq; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct vattr vattr; + + if (error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc)) { + VOP_ABORTOP(dvp, cnp); + vput(dvp); + return (error); + } + len = cnp->cn_namelen; + isnq = (VFSTONFS(dvp->v_mount)->nm_flag & NFSMNT_NQNFS); + nfsstats.rpccnt[NFSPROC_MKDIR]++; + nfsm_reqhead(dvp, NFSPROC_MKDIR, + NFSX_FH+NFSX_UNSIGNED+nfsm_rndup(len)+NFSX_SATTR(isnq)); + nfsm_fhtom(dvp); + nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN); + nfsm_build(sp, struct nfsv2_sattr *, NFSX_SATTR(isnq)); + sp->sa_mode = vtonfs_mode(VDIR, vap->va_mode); + sp->sa_uid = txdr_unsigned(cnp->cn_cred->cr_uid); + sp->sa_gid = txdr_unsigned(vattr.va_gid); + if (isnq) { + quad_t qval = -1; + + txdr_hyper(&qval, &sp->sa_nqsize); + sp->sa_nqflags = 0; 
+ txdr_nqtime(&vap->va_atime, &sp->sa_nqatime); + txdr_nqtime(&vap->va_mtime, &sp->sa_nqmtime); + } else { + sp->sa_nfssize = -1; + txdr_nfstime(&vap->va_atime, &sp->sa_nfsatime); + txdr_nfstime(&vap->va_mtime, &sp->sa_nfsmtime); + } + nfsm_request(dvp, NFSPROC_MKDIR, cnp->cn_proc, cnp->cn_cred); + nfsm_mtofh(dvp, *vpp); + nfsm_reqdone; + VTONFS(dvp)->n_flag |= NMODIFIED; + VTONFS(dvp)->n_attrstamp = 0; + /* + * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry + * if we can succeed in looking up the directory. + * "firsttry" is necessary since the macros may "goto nfsmout" which + * is above the if on errors. (Ugh) + */ + if (error == EEXIST && firsttry) { + firsttry = 0; + error = 0; + nfsstats.rpccnt[NFSPROC_LOOKUP]++; + *vpp = NULL; + nfsm_reqhead(dvp, NFSPROC_LOOKUP, + NFSX_FH+NFSX_UNSIGNED+nfsm_rndup(len)); + nfsm_fhtom(dvp); + nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN); + nfsm_request(dvp, NFSPROC_LOOKUP, cnp->cn_proc, cnp->cn_cred); + nfsm_mtofh(dvp, *vpp); + if ((*vpp)->v_type != VDIR) { + vput(*vpp); + error = EEXIST; + } + m_freem(mrep); + } + FREE(cnp->cn_pnbuf, M_NAMEI); + vrele(dvp); + return (error); +} + +/* + * nfs remove directory call + */ +int +nfs_rmdir(ap) + struct vop_rmdir_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct vnode *dvp = ap->a_dvp; + register struct componentname *cnp = ap->a_cnp; + register u_long *tl; + register caddr_t cp; + register long t2; + caddr_t bpos, dpos; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + + if (dvp == vp) { + vrele(dvp); + vrele(dvp); + FREE(cnp->cn_pnbuf, M_NAMEI); + return (EINVAL); + } + nfsstats.rpccnt[NFSPROC_RMDIR]++; + nfsm_reqhead(dvp, NFSPROC_RMDIR, + NFSX_FH+NFSX_UNSIGNED+nfsm_rndup(cnp->cn_namelen)); + nfsm_fhtom(dvp); + nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_request(dvp, NFSPROC_RMDIR, cnp->cn_proc, cnp->cn_cred); + 
nfsm_reqdone; + FREE(cnp->cn_pnbuf, M_NAMEI); + VTONFS(dvp)->n_flag |= NMODIFIED; + VTONFS(dvp)->n_attrstamp = 0; + cache_purge(dvp); + cache_purge(vp); + vrele(vp); + vrele(dvp); + /* + * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry. + */ + if (error == ENOENT) + error = 0; + return (error); +} + +/* + * nfs readdir call + * Although cookie is defined as opaque, I translate it to/from net byte + * order so that it looks more sensible. This appears consistent with the + * Ultrix implementation of NFS. + */ +int +nfs_readdir(ap) + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct nfsnode *np = VTONFS(vp); + register struct uio *uio = ap->a_uio; + int tresid, error; + struct vattr vattr; + + if (vp->v_type != VDIR) + return (EPERM); + /* + * First, check for hit on the EOF offset cache + */ + if (uio->uio_offset != 0 && uio->uio_offset == np->n_direofoffset && + (np->n_flag & NMODIFIED) == 0) { + if (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) { + if (NQNFS_CKCACHABLE(vp, NQL_READ)) { + nfsstats.direofcache_hits++; + return (0); + } + } else if (VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_procp) == 0 && + np->n_mtime == vattr.va_mtime.ts_sec) { + nfsstats.direofcache_hits++; + return (0); + } + } + + /* + * Call nfs_bioread() to do the real work. + */ + tresid = uio->uio_resid; + error = nfs_bioread(vp, uio, 0, ap->a_cred); + + if (!error && uio->uio_resid == tresid) + nfsstats.direofcache_misses++; + return (error); +} + +/* + * Readdir rpc call. + * Called from below the buffer cache by nfs_doio(). 
+ */ +int +nfs_readdirrpc(vp, uiop, cred) + register struct vnode *vp; + struct uio *uiop; + struct ucred *cred; +{ + register long len; + register struct dirent *dp; + register u_long *tl; + register caddr_t cp; + register long t1; + long tlen, lastlen; + caddr_t bpos, dpos, cp2; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct mbuf *md2; + caddr_t dpos2; + int siz; + int more_dirs = 1; + u_long off, savoff; + struct dirent *savdp; + struct nfsmount *nmp; + struct nfsnode *np = VTONFS(vp); + long tresid; + + nmp = VFSTONFS(vp->v_mount); + tresid = uiop->uio_resid; + /* + * Loop around doing readdir rpc's of size uio_resid or nm_rsize, + * whichever is smaller, truncated to a multiple of NFS_DIRBLKSIZ. + * The stopping criteria is EOF or buffer full. + */ + while (more_dirs && uiop->uio_resid >= NFS_DIRBLKSIZ) { + nfsstats.rpccnt[NFSPROC_READDIR]++; + nfsm_reqhead(vp, NFSPROC_READDIR, + NFSX_FH + 2 * NFSX_UNSIGNED); + nfsm_fhtom(vp); + nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); + off = (u_long)uiop->uio_offset; + *tl++ = txdr_unsigned(off); + *tl = txdr_unsigned(((uiop->uio_resid > nmp->nm_rsize) ? 
+ nmp->nm_rsize : uiop->uio_resid) & ~(NFS_DIRBLKSIZ-1)); + nfsm_request(vp, NFSPROC_READDIR, uiop->uio_procp, cred); + siz = 0; + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + more_dirs = fxdr_unsigned(int, *tl); + + /* Save the position so that we can do nfsm_mtouio() later */ + dpos2 = dpos; + md2 = md; + + /* loop thru the dir entries, doctoring them to 4bsd form */ +#ifdef lint + dp = (struct dirent *)0; +#endif /* lint */ + while (more_dirs && siz < uiop->uio_resid) { + savoff = off; /* Hold onto offset and dp */ + savdp = dp; + nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); + dp = (struct dirent *)tl; + dp->d_fileno = fxdr_unsigned(u_long, *tl++); + len = fxdr_unsigned(int, *tl); + if (len <= 0 || len > NFS_MAXNAMLEN) { + error = EBADRPC; + m_freem(mrep); + goto nfsmout; + } + dp->d_namlen = (u_char)len; + dp->d_type = DT_UNKNOWN; + nfsm_adv(len); /* Point past name */ + tlen = nfsm_rndup(len); + /* + * This should not be necessary, but some servers have + * broken XDR such that these bytes are not null filled. + */ + if (tlen != len) { + *dpos = '\0'; /* Null-terminate */ + nfsm_adv(tlen - len); + len = tlen; + } + nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); + off = fxdr_unsigned(u_long, *tl); + *tl++ = 0; /* Ensures null termination of name */ + more_dirs = fxdr_unsigned(int, *tl); + dp->d_reclen = len + 4 * NFSX_UNSIGNED; + siz += dp->d_reclen; + } + /* + * If at end of rpc data, get the eof boolean + */ + if (!more_dirs) { + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + more_dirs = (fxdr_unsigned(int, *tl) == 0); + + /* + * If at EOF, cache directory offset + */ + if (!more_dirs) + np->n_direofoffset = off; + } + /* + * If there is too much to fit in the data buffer, use savoff and + * savdp to trim off the last record. 
+ * --> we are not at eof + */ + if (siz > uiop->uio_resid) { + off = savoff; + siz -= dp->d_reclen; + dp = savdp; + more_dirs = 0; /* Paranoia */ + } + if (siz > 0) { + lastlen = dp->d_reclen; + md = md2; + dpos = dpos2; + nfsm_mtouio(uiop, siz); + uiop->uio_offset = (off_t)off; + } else + more_dirs = 0; /* Ugh, never happens, but in case.. */ + m_freem(mrep); + } + /* + * Fill last record, iff any, out to a multiple of NFS_DIRBLKSIZ + * by increasing d_reclen for the last record. + */ + if (uiop->uio_resid < tresid) { + len = uiop->uio_resid & (NFS_DIRBLKSIZ - 1); + if (len > 0) { + dp = (struct dirent *) + (uiop->uio_iov->iov_base - lastlen); + dp->d_reclen += len; + uiop->uio_iov->iov_base += len; + uiop->uio_iov->iov_len -= len; + uiop->uio_resid -= len; + } + } +nfsmout: + return (error); +} + +/* + * Nqnfs readdir_and_lookup RPC. Used in place of nfs_readdirrpc(). + */ +int +nfs_readdirlookrpc(vp, uiop, cred) + struct vnode *vp; + register struct uio *uiop; + struct ucred *cred; +{ + register int len; + register struct dirent *dp; + register u_long *tl; + register caddr_t cp; + register long t1; + caddr_t bpos, dpos, cp2; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct nameidata nami, *ndp = &nami; + struct componentname *cnp = &ndp->ni_cnd; + u_long off, endoff, fileno; + time_t reqtime, ltime; + struct nfsmount *nmp; + struct nfsnode *np; + struct vnode *newvp; + nfsv2fh_t *fhp; + u_quad_t frev; + int error = 0, tlen, more_dirs = 1, tresid, doit, bigenough, i; + int cachable; + + if (uiop->uio_iovcnt != 1) + panic("nfs rdirlook"); + nmp = VFSTONFS(vp->v_mount); + tresid = uiop->uio_resid; + ndp->ni_dvp = vp; + newvp = NULLVP; + /* + * Loop around doing readdir rpc's of size uio_resid or nm_rsize, + * whichever is smaller, truncated to a multiple of NFS_DIRBLKSIZ. + * The stopping criteria is EOF or buffer full. 
+ */ + while (more_dirs && uiop->uio_resid >= NFS_DIRBLKSIZ) { + nfsstats.rpccnt[NQNFSPROC_READDIRLOOK]++; + nfsm_reqhead(vp, NQNFSPROC_READDIRLOOK, + NFSX_FH + 3 * NFSX_UNSIGNED); + nfsm_fhtom(vp); + nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED); + off = (u_long)uiop->uio_offset; + *tl++ = txdr_unsigned(off); + *tl++ = txdr_unsigned(((uiop->uio_resid > nmp->nm_rsize) ? + nmp->nm_rsize : uiop->uio_resid) & ~(NFS_DIRBLKSIZ-1)); + if (nmp->nm_flag & NFSMNT_NQLOOKLEASE) + *tl = txdr_unsigned(nmp->nm_leaseterm); + else + *tl = 0; + reqtime = time.tv_sec; + nfsm_request(vp, NQNFSPROC_READDIRLOOK, uiop->uio_procp, cred); + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + more_dirs = fxdr_unsigned(int, *tl); + + /* loop thru the dir entries, doctoring them to 4bsd form */ + bigenough = 1; + while (more_dirs && bigenough) { + doit = 1; + nfsm_dissect(tl, u_long *, 4 * NFSX_UNSIGNED); + if (nmp->nm_flag & NFSMNT_NQLOOKLEASE) { + cachable = fxdr_unsigned(int, *tl++); + ltime = reqtime + fxdr_unsigned(int, *tl++); + fxdr_hyper(tl, &frev); + } + nfsm_dissect(fhp, nfsv2fh_t *, NFSX_FH); + if (!bcmp(VTONFS(vp)->n_fh.fh_bytes, (caddr_t)fhp, NFSX_FH)) { + VREF(vp); + newvp = vp; + np = VTONFS(vp); + } else { + if (error = nfs_nget(vp->v_mount, fhp, &np)) + doit = 0; + newvp = NFSTOV(np); + } + if (error = nfs_loadattrcache(&newvp, &md, &dpos, + (struct vattr *)0)) + doit = 0; + nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); + fileno = fxdr_unsigned(u_long, *tl++); + len = fxdr_unsigned(int, *tl); + if (len <= 0 || len > NFS_MAXNAMLEN) { + error = EBADRPC; + m_freem(mrep); + goto nfsmout; + } + tlen = (len + 4) & ~0x3; + if ((tlen + DIRHDSIZ) > uiop->uio_resid) + bigenough = 0; + if (bigenough && doit) { + dp = (struct dirent *)uiop->uio_iov->iov_base; + dp->d_fileno = fileno; + dp->d_namlen = len; + dp->d_reclen = tlen + DIRHDSIZ; + dp->d_type = + IFTODT(VTTOIF(np->n_vattr.va_type)); + uiop->uio_resid -= DIRHDSIZ; + uiop->uio_iov->iov_base += DIRHDSIZ; + uiop->uio_iov->iov_len -= 
DIRHDSIZ; + cnp->cn_nameptr = uiop->uio_iov->iov_base; + cnp->cn_namelen = len; + ndp->ni_vp = newvp; + nfsm_mtouio(uiop, len); + cp = uiop->uio_iov->iov_base; + tlen -= len; + for (i = 0; i < tlen; i++) + *cp++ = '\0'; + uiop->uio_iov->iov_base += tlen; + uiop->uio_iov->iov_len -= tlen; + uiop->uio_resid -= tlen; + cnp->cn_hash = 0; + for (cp = cnp->cn_nameptr, i = 1; i <= len; i++, cp++) + cnp->cn_hash += (unsigned char)*cp * i; + if ((nmp->nm_flag & NFSMNT_NQLOOKLEASE) && + ltime > time.tv_sec) + nqnfs_clientlease(nmp, np, NQL_READ, + cachable, ltime, frev); + if (cnp->cn_namelen <= NCHNAMLEN) + cache_enter(ndp->ni_dvp, ndp->ni_vp, cnp); + } else { + nfsm_adv(nfsm_rndup(len)); + } + if (newvp != NULLVP) { + vrele(newvp); + newvp = NULLVP; + } + nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); + if (bigenough) + endoff = off = fxdr_unsigned(u_long, *tl++); + else + endoff = fxdr_unsigned(u_long, *tl++); + more_dirs = fxdr_unsigned(int, *tl); + } + /* + * If at end of rpc data, get the eof boolean + */ + if (!more_dirs) { + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + more_dirs = (fxdr_unsigned(int, *tl) == 0); + + /* + * If at EOF, cache directory offset + */ + if (!more_dirs) + VTONFS(vp)->n_direofoffset = endoff; + } + if (uiop->uio_resid < tresid) + uiop->uio_offset = (off_t)off; + else + more_dirs = 0; + m_freem(mrep); + } + /* + * Fill last record, iff any, out to a multiple of NFS_DIRBLKSIZ + * by increasing d_reclen for the last record. + */ + if (uiop->uio_resid < tresid) { + len = uiop->uio_resid & (NFS_DIRBLKSIZ - 1); + if (len > 0) { + dp->d_reclen += len; + uiop->uio_iov->iov_base += len; + uiop->uio_iov->iov_len -= len; + uiop->uio_resid -= len; + } + } +nfsmout: + if (newvp != NULLVP) + vrele(newvp); + return (error); +} +static char hextoasc[] = "0123456789abcdef"; + +/* + * Silly rename. 
To make the NFS filesystem that is stateless look a little + * more like the "ufs" a remove of an active vnode is translated to a rename + * to a funny looking filename that is removed by nfs_inactive on the + * nfsnode. There is the potential for another process on a different client + * to create the same funny name between the nfs_lookitup() fails and the + * nfs_rename() completes, but... + */ +int +nfs_sillyrename(dvp, vp, cnp) + struct vnode *dvp, *vp; + struct componentname *cnp; +{ + register struct nfsnode *np; + register struct sillyrename *sp; + int error; + short pid; + + cache_purge(dvp); + np = VTONFS(vp); +#ifdef SILLYSEPARATE + MALLOC(sp, struct sillyrename *, sizeof (struct sillyrename), + M_NFSREQ, M_WAITOK); +#else + sp = &np->n_silly; +#endif + sp->s_cred = crdup(cnp->cn_cred); + sp->s_dvp = dvp; + VREF(dvp); + + /* Fudge together a funny name */ + pid = cnp->cn_proc->p_pid; + bcopy(".nfsAxxxx4.4", sp->s_name, 13); + sp->s_namlen = 12; + sp->s_name[8] = hextoasc[pid & 0xf]; + sp->s_name[7] = hextoasc[(pid >> 4) & 0xf]; + sp->s_name[6] = hextoasc[(pid >> 8) & 0xf]; + sp->s_name[5] = hextoasc[(pid >> 12) & 0xf]; + + /* Try lookitups until we get one that isn't there */ + while (nfs_lookitup(sp, (nfsv2fh_t *)0, cnp->cn_proc) == 0) { + sp->s_name[4]++; + if (sp->s_name[4] > 'z') { + error = EINVAL; + goto bad; + } + } + if (error = nfs_renameit(dvp, cnp, sp)) + goto bad; + nfs_lookitup(sp, &np->n_fh, cnp->cn_proc); + np->n_sillyrename = sp; + return (0); +bad: + vrele(sp->s_dvp); + crfree(sp->s_cred); +#ifdef SILLYSEPARATE + free((caddr_t)sp, M_NFSREQ); +#endif + return (error); +} + +/* + * Look up a file name for silly rename stuff. + * Just like nfs_lookup() except that it doesn't load returned values + * into the nfsnode table. 
+ * If fhp != NULL it copies the returned file handle out + */ +int +nfs_lookitup(sp, fhp, procp) + register struct sillyrename *sp; + nfsv2fh_t *fhp; + struct proc *procp; +{ + register struct vnode *vp = sp->s_dvp; + register u_long *tl; + register caddr_t cp; + register long t1, t2; + caddr_t bpos, dpos, cp2; + int error = 0, isnq; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + long len; + + isnq = (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS); + nfsstats.rpccnt[NFSPROC_LOOKUP]++; + len = sp->s_namlen; + nfsm_reqhead(vp, NFSPROC_LOOKUP, NFSX_FH+NFSX_UNSIGNED+nfsm_rndup(len)); + if (isnq) { + nfsm_build(tl, u_long *, NFSX_UNSIGNED); + *tl = 0; + } + nfsm_fhtom(vp); + nfsm_strtom(sp->s_name, len, NFS_MAXNAMLEN); + nfsm_request(vp, NFSPROC_LOOKUP, procp, sp->s_cred); + if (fhp != NULL) { + if (isnq) + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + nfsm_dissect(cp, caddr_t, NFSX_FH); + bcopy(cp, (caddr_t)fhp, NFSX_FH); + } + nfsm_reqdone; + return (error); +} + +/* + * Kludge City.. + * - make nfs_bmap() essentially a no-op that does no translation + * - do nfs_strategy() by faking physical I/O with nfs_readrpc/nfs_writerpc + * after mapping the physical addresses into Kernel Virtual space in the + * nfsiobuf area. + * (Maybe I could use the process's page mapping, but I was concerned that + * Kernel Write might not be enabled and also figured copyout() would do + * a lot more work than bcopy() and also it currently happens in the + * context of the swapper process (2). + */ +int +nfs_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + + if (ap->a_vpp != NULL) + *ap->a_vpp = vp; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn * btodb(vp->v_mount->mnt_stat.f_iosize); + return (0); +} + +/* + * Strategy routine. 
+ * For async requests when nfsiod(s) are running, queue the request by + * calling nfs_asyncio(), otherwise just all nfs_doio() to do the + * request. + */ +int +nfs_strategy(ap) + struct vop_strategy_args *ap; +{ + register struct buf *bp = ap->a_bp; + struct ucred *cr; + struct proc *p; + int error = 0; + + if (bp->b_flags & B_PHYS) + panic("nfs physio"); + if (bp->b_flags & B_ASYNC) + p = (struct proc *)0; + else + p = curproc; /* XXX */ + if (bp->b_flags & B_READ) + cr = bp->b_rcred; + else + cr = bp->b_wcred; + /* + * If the op is asynchronous and an i/o daemon is waiting + * queue the request, wake it up and wait for completion + * otherwise just do it ourselves. + */ + if ((bp->b_flags & B_ASYNC) == 0 || + nfs_asyncio(bp, NOCRED)) + error = nfs_doio(bp, cr, p); + return (error); +} + +/* + * Mmap a file + * + * NB Currently unsupported. + */ +/* ARGSUSED */ +int +nfs_mmap(ap) + struct vop_mmap_args /* { + struct vnode *a_vp; + int a_fflags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + return (EINVAL); +} + +/* + * Flush all the blocks associated with a vnode. + * Walk through the buffer pool and push any dirty pages + * associated with the vnode. 
+ */ +/* ARGSUSED */ +int +nfs_fsync(ap) + struct vop_fsync_args /* { + struct vnodeop_desc *a_desc; + struct vnode * a_vp; + struct ucred * a_cred; + int a_waitfor; + struct proc * a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct nfsnode *np = VTONFS(vp); + register struct buf *bp; + struct buf *nbp; + struct nfsmount *nmp; + int s, error = 0, slptimeo = 0, slpflag = 0; + + nmp = VFSTONFS(vp->v_mount); + if (nmp->nm_flag & NFSMNT_INT) + slpflag = PCATCH; +loop: + s = splbio(); + for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { + nbp = bp->b_vnbufs.le_next; + if (bp->b_flags & B_BUSY) { + if (ap->a_waitfor != MNT_WAIT) + continue; + bp->b_flags |= B_WANTED; + error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1), + "nfsfsync", slptimeo); + splx(s); + if (error) { + if (nfs_sigintr(nmp, (struct nfsreq *)0, ap->a_p)) + return (EINTR); + if (slpflag == PCATCH) { + slpflag = 0; + slptimeo = 2 * hz; + } + } + goto loop; + } + if ((bp->b_flags & B_DELWRI) == 0) + panic("nfs_fsync: not dirty"); + bremfree(bp); + bp->b_flags |= B_BUSY; + splx(s); + bp->b_flags |= B_ASYNC; + VOP_BWRITE(bp); + goto loop; + } + splx(s); + if (ap->a_waitfor == MNT_WAIT) { + while (vp->v_numoutput) { + vp->v_flag |= VBWAIT; + error = tsleep((caddr_t)&vp->v_numoutput, + slpflag | (PRIBIO + 1), "nfsfsync", slptimeo); + if (error) { + if (nfs_sigintr(nmp, (struct nfsreq *)0, ap->a_p)) + return (EINTR); + if (slpflag == PCATCH) { + slpflag = 0; + slptimeo = 2 * hz; + } + } + } + if (vp->v_dirtyblkhd.lh_first) { +#ifdef DIAGNOSTIC + vprint("nfs_fsync: dirty", vp); +#endif + goto loop; + } + } + if (np->n_flag & NWRITEERR) { + error = np->n_error; + np->n_flag &= ~NWRITEERR; + } + return (error); +} + +/* + * Return POSIX pathconf information applicable to nfs. + * + * Currently the NFS protocol does not support getting such + * information from the remote server. 
+ */ +/* ARGSUSED */ +nfs_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + + return (EINVAL); +} + +/* + * NFS advisory byte-level locks. + * Currently unsupported. + */ +int +nfs_advlock(ap) + struct vop_advlock_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap; +{ + + return (EOPNOTSUPP); +} + +/* + * Print out the contents of an nfsnode. + */ +int +nfs_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct nfsnode *np = VTONFS(vp); + + printf("tag VT_NFS, fileid %d fsid 0x%x", + np->n_vattr.va_fileid, np->n_vattr.va_fsid); +#ifdef FIFO + if (vp->v_type == VFIFO) + fifo_printinfo(vp); +#endif /* FIFO */ + printf("\n"); +} + +/* + * NFS directory offset lookup. + * Currently unsupported. + */ +int +nfs_blkatoff(ap) + struct vop_blkatoff_args /* { + struct vnode *a_vp; + off_t a_offset; + char **a_res; + struct buf **a_bpp; + } */ *ap; +{ + + return (EOPNOTSUPP); +} + +/* + * NFS flat namespace allocation. + * Currently unsupported. + */ +int +nfs_valloc(ap) + struct vop_valloc_args /* { + struct vnode *a_pvp; + int a_mode; + struct ucred *a_cred; + struct vnode **a_vpp; + } */ *ap; +{ + + return (EOPNOTSUPP); +} + +/* + * NFS flat namespace free. + * Currently unsupported. + */ +int +nfs_vfree(ap) + struct vop_vfree_args /* { + struct vnode *a_pvp; + ino_t a_ino; + int a_mode; + } */ *ap; +{ + + return (EOPNOTSUPP); +} + +/* + * NFS file truncation. + */ +int +nfs_truncate(ap) + struct vop_truncate_args /* { + struct vnode *a_vp; + off_t a_length; + int a_flags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + /* Use nfs_setattr */ + printf("nfs_truncate: need to implement!!"); + return (EOPNOTSUPP); +} + +/* + * NFS update. 
+ */ +int +nfs_update(ap) + struct vop_update_args /* { + struct vnode *a_vp; + struct timeval *a_ta; + struct timeval *a_tm; + int a_waitfor; + } */ *ap; +{ + + /* Use nfs_setattr */ + printf("nfs_update: need to implement!!"); + return (EOPNOTSUPP); +} + +/* + * nfs special file access vnode op. + * Essentially just get vattr and then imitate iaccess() since the device is + * local to the client. + */ +int +nfsspec_access(ap) + struct vop_access_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vattr *vap; + register gid_t *gp; + register struct ucred *cred = ap->a_cred; + mode_t mode = ap->a_mode; + struct vattr vattr; + register int i; + int error; + + /* + * If you're the super-user, + * you always get access. + */ + if (cred->cr_uid == 0) + return (0); + vap = &vattr; + if (error = VOP_GETATTR(ap->a_vp, vap, cred, ap->a_p)) + return (error); + /* + * Access check is based on only one of owner, group, public. + * If not owner, then check group. If not a member of the + * group, then check public access. + */ + if (cred->cr_uid != vap->va_uid) { + mode >>= 3; + gp = cred->cr_groups; + for (i = 0; i < cred->cr_ngroups; i++, gp++) + if (vap->va_gid == *gp) + goto found; + mode >>= 3; +found: + ; + } + return ((vap->va_mode & mode) == mode ? 0 : EACCES); +} + +/* + * Read wrapper for special devices. + */ +int +nfsspec_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + register struct nfsnode *np = VTONFS(ap->a_vp); + + /* + * Set access flag. + */ + np->n_flag |= NACC; + np->n_atim = time; + return (VOCALL(spec_vnodeop_p, VOFFSET(vop_read), ap)); +} + +/* + * Write wrapper for special devices. 
+ */ +int +nfsspec_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + register struct nfsnode *np = VTONFS(ap->a_vp); + + /* + * Set update flag. + */ + np->n_flag |= NUPD; + np->n_mtim = time; + return (VOCALL(spec_vnodeop_p, VOFFSET(vop_write), ap)); +} + +/* + * Close wrapper for special devices. + * + * Update the times on the nfsnode then do device close. + */ +int +nfsspec_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct nfsnode *np = VTONFS(vp); + struct vattr vattr; + + if (np->n_flag & (NACC | NUPD)) { + np->n_flag |= NCHG; + if (vp->v_usecount == 1 && + (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { + VATTR_NULL(&vattr); + if (np->n_flag & NACC) { + vattr.va_atime.ts_sec = np->n_atim.tv_sec; + vattr.va_atime.ts_nsec = + np->n_atim.tv_usec * 1000; + } + if (np->n_flag & NUPD) { + vattr.va_mtime.ts_sec = np->n_mtim.tv_sec; + vattr.va_mtime.ts_nsec = + np->n_mtim.tv_usec * 1000; + } + (void)VOP_SETATTR(vp, &vattr, ap->a_cred, ap->a_p); + } + } + return (VOCALL(spec_vnodeop_p, VOFFSET(vop_close), ap)); +} + +#ifdef FIFO +/* + * Read wrapper for fifos. + */ +int +nfsfifo_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + extern int (**fifo_vnodeop_p)(); + register struct nfsnode *np = VTONFS(ap->a_vp); + + /* + * Set access flag. + */ + np->n_flag |= NACC; + np->n_atim = time; + return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), ap)); +} + +/* + * Write wrapper for fifos. + */ +int +nfsfifo_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + extern int (**fifo_vnodeop_p)(); + register struct nfsnode *np = VTONFS(ap->a_vp); + + /* + * Set update flag. 
+ */ + np->n_flag |= NUPD; + np->n_mtim = time; + return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), ap)); +} + +/* + * Close wrapper for fifos. + * + * Update the times on the nfsnode then do fifo close. + */ +int +nfsfifo_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct nfsnode *np = VTONFS(vp); + struct vattr vattr; + extern int (**fifo_vnodeop_p)(); + + if (np->n_flag & (NACC | NUPD)) { + if (np->n_flag & NACC) + np->n_atim = time; + if (np->n_flag & NUPD) + np->n_mtim = time; + np->n_flag |= NCHG; + if (vp->v_usecount == 1 && + (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { + VATTR_NULL(&vattr); + if (np->n_flag & NACC) { + vattr.va_atime.ts_sec = np->n_atim.tv_sec; + vattr.va_atime.ts_nsec = + np->n_atim.tv_usec * 1000; + } + if (np->n_flag & NUPD) { + vattr.va_mtime.ts_sec = np->n_mtim.tv_sec; + vattr.va_mtime.ts_nsec = + np->n_mtim.tv_usec * 1000; + } + (void)VOP_SETATTR(vp, &vattr, ap->a_cred, ap->a_p); + } + } + return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), ap)); +} +#endif /* FIFO */ diff --git a/sys/nfs/nfsdiskless.h b/sys/nfs/nfsdiskless.h new file mode 100644 index 00000000000..74e6b7bca43 --- /dev/null +++ b/sys/nfs/nfsdiskless.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfsdiskless.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Structure that must be initialized for a diskless nfs client. + * This structure is used by nfs_mountroot() to set up the root and swap + * vnodes plus do a partial ifconfig(8) and route(8) so that the critical net + * interface can communicate with the server. + * The primary bootstrap is expected to fill in the appropriate fields before + * starting vmunix. Whether or not the swap area is nfs mounted is determined + * by the value in swdevt[0]. 
(equal to NODEV --> swap over nfs) + * Currently only works for AF_INET protocols. + * NB: All fields are stored in net byte order to avoid hassles with + * client/server byte ordering differences. + */ +struct nfs_diskless { + struct ifaliasreq myif; /* Default interface */ + struct sockaddr_in mygateway; /* Default gateway */ + struct nfs_args swap_args; /* Mount args for swap file */ + u_char swap_fh[NFS_FHSIZE]; /* Swap file's file handle */ + struct sockaddr_in swap_saddr; /* Address of swap server */ + char swap_hostnam[MNAMELEN]; /* Host name for mount pt */ + int swap_nblks; /* Size of server swap file */ + struct ucred swap_ucred; /* Swap credentials */ + struct nfs_args root_args; /* Mount args for root fs */ + u_char root_fh[NFS_FHSIZE]; /* File handle of root dir */ + struct sockaddr_in root_saddr; /* Address of root server */ + char root_hostnam[MNAMELEN]; /* Host name for mount pt */ + long root_time; /* Timestamp of root fs */ + char my_hostnam[MAXHOSTNAMELEN]; /* Client host name */ +}; diff --git a/sys/nfs/nfsm_subs.h b/sys/nfs/nfsm_subs.h new file mode 100644 index 00000000000..879db360057 --- /dev/null +++ b/sys/nfs/nfsm_subs.h @@ -0,0 +1,269 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfsm_subs.h 8.1 (Berkeley) 6/16/93 + */ + +/* + * These macros do strange and peculiar things to mbuf chains for + * the assistance of the nfs code. To attempt to use them for any + * other purpose will be dangerous. (they make weird assumptions) + */ + +/* + * First define what the actual subs. 
return + */ +extern struct mbuf *nfsm_reqh(); + +#define M_HASCL(m) ((m)->m_flags & M_EXT) +#define NFSMINOFF(m) \ + if (M_HASCL(m)) \ + (m)->m_data = (m)->m_ext.ext_buf; \ + else if ((m)->m_flags & M_PKTHDR) \ + (m)->m_data = (m)->m_pktdat; \ + else \ + (m)->m_data = (m)->m_dat +#define NFSMADV(m, s) (m)->m_data += (s) +#define NFSMSIZ(m) ((M_HASCL(m))?MCLBYTES: \ + (((m)->m_flags & M_PKTHDR)?MHLEN:MLEN)) + +/* + * Now for the macros that do the simple stuff and call the functions + * for the hard stuff. + * These macros use several vars. declared in nfsm_reqhead and these + * vars. must not be used elsewhere unless you are careful not to corrupt + * them. The vars. starting with pN and tN (N=1,2,3,..) are temporaries + * that may be used so long as the value is not expected to retained + * after a macro. + * I know, this is kind of dorkey, but it makes the actual op functions + * fairly clean and deals with the mess caused by the xdr discriminating + * unions. + */ + +#define nfsm_build(a,c,s) \ + { if ((s) > M_TRAILINGSPACE(mb)) { \ + MGET(mb2, M_WAIT, MT_DATA); \ + if ((s) > MLEN) \ + panic("build > MLEN"); \ + mb->m_next = mb2; \ + mb = mb2; \ + mb->m_len = 0; \ + bpos = mtod(mb, caddr_t); \ + } \ + (a) = (c)(bpos); \ + mb->m_len += (s); \ + bpos += (s); } + +#define nfsm_dissect(a,c,s) \ + { t1 = mtod(md, caddr_t)+md->m_len-dpos; \ + if (t1 >= (s)) { \ + (a) = (c)(dpos); \ + dpos += (s); \ + } else if (error = nfsm_disct(&md, &dpos, (s), t1, &cp2)) { \ + m_freem(mrep); \ + goto nfsmout; \ + } else { \ + (a) = (c)cp2; \ + } } + +#define nfsm_fhtom(v) \ + nfsm_build(cp,caddr_t,NFSX_FH); \ + bcopy((caddr_t)&(VTONFS(v)->n_fh), cp, NFSX_FH) + +#define nfsm_srvfhtom(f) \ + nfsm_build(cp,caddr_t,NFSX_FH); \ + bcopy((caddr_t)(f), cp, NFSX_FH) + +#define nfsm_mtofh(d,v) \ + { struct nfsnode *np; nfsv2fh_t *fhp; \ + nfsm_dissect(fhp,nfsv2fh_t *,NFSX_FH); \ + if (error = nfs_nget((d)->v_mount, fhp, &np)) { \ + m_freem(mrep); \ + goto nfsmout; \ + } \ + (v) = NFSTOV(np); 
\ + nfsm_loadattr(v, (struct vattr *)0); \ + } + +#define nfsm_loadattr(v,a) \ + { struct vnode *tvp = (v); \ + if (error = nfs_loadattrcache(&tvp, &md, &dpos, (a))) { \ + m_freem(mrep); \ + goto nfsmout; \ + } \ + (v) = tvp; } + +#define nfsm_strsiz(s,m) \ + { nfsm_dissect(tl,u_long *,NFSX_UNSIGNED); \ + if (((s) = fxdr_unsigned(long,*tl)) > (m)) { \ + m_freem(mrep); \ + error = EBADRPC; \ + goto nfsmout; \ + } } + +#define nfsm_srvstrsiz(s,m) \ + { nfsm_dissect(tl,u_long *,NFSX_UNSIGNED); \ + if (((s) = fxdr_unsigned(long,*tl)) > (m) || (s) <= 0) { \ + error = EBADRPC; \ + nfsm_reply(0); \ + } } + +#define nfsm_mtouio(p,s) \ + if ((s) > 0 && \ + (error = nfsm_mbuftouio(&md,(p),(s),&dpos))) { \ + m_freem(mrep); \ + goto nfsmout; \ + } + +#define nfsm_uiotom(p,s) \ + if (error = nfsm_uiotombuf((p),&mb,(s),&bpos)) { \ + m_freem(mreq); \ + goto nfsmout; \ + } + +#define nfsm_reqhead(v,a,s) \ + mb = mreq = nfsm_reqh((v),(a),(s),&bpos) + +#define nfsm_reqdone m_freem(mrep); \ + nfsmout: + +#define nfsm_rndup(a) (((a)+3)&(~0x3)) + +#define nfsm_request(v, t, p, c) \ + if (error = nfs_request((v), mreq, (t), (p), \ + (c), &mrep, &md, &dpos)) \ + goto nfsmout + +#define nfsm_strtom(a,s,m) \ + if ((s) > (m)) { \ + m_freem(mreq); \ + error = ENAMETOOLONG; \ + goto nfsmout; \ + } \ + t2 = nfsm_rndup(s)+NFSX_UNSIGNED; \ + if (t2 <= M_TRAILINGSPACE(mb)) { \ + nfsm_build(tl,u_long *,t2); \ + *tl++ = txdr_unsigned(s); \ + *(tl+((t2>>2)-2)) = 0; \ + bcopy((caddr_t)(a), (caddr_t)tl, (s)); \ + } else if (error = nfsm_strtmbuf(&mb, &bpos, (a), (s))) { \ + m_freem(mreq); \ + goto nfsmout; \ + } + +#define nfsm_srvdone \ + nfsmout: \ + return(error) + +#define nfsm_reply(s) \ + { \ + nfsd->nd_repstat = error; \ + if (error) \ + (void) nfs_rephead(0, nfsd, error, cache, &frev, \ + mrq, &mb, &bpos); \ + else \ + (void) nfs_rephead((s), nfsd, error, cache, &frev, \ + mrq, &mb, &bpos); \ + m_freem(mrep); \ + mreq = *mrq; \ + if (error) \ + return(0); \ + } + +#define nfsm_adv(s) \ + t1 = 
mtod(md, caddr_t)+md->m_len-dpos; \ + if (t1 >= (s)) { \ + dpos += (s); \ + } else if (error = nfs_adv(&md, &dpos, (s), t1)) { \ + m_freem(mrep); \ + goto nfsmout; \ + } + +#define nfsm_srvmtofh(f) \ + nfsm_dissect(tl, u_long *, NFSX_FH); \ + bcopy((caddr_t)tl, (caddr_t)f, NFSX_FH) + +#define nfsm_clget \ + if (bp >= be) { \ + if (mp == mb) \ + mp->m_len += bp-bpos; \ + MGET(mp, M_WAIT, MT_DATA); \ + MCLGET(mp, M_WAIT); \ + mp->m_len = NFSMSIZ(mp); \ + mp2->m_next = mp; \ + mp2 = mp; \ + bp = mtod(mp, caddr_t); \ + be = bp+mp->m_len; \ + } \ + tl = (u_long *)bp + +#define nfsm_srvfillattr \ + fp->fa_type = vtonfs_type(vap->va_type); \ + fp->fa_mode = vtonfs_mode(vap->va_type, vap->va_mode); \ + fp->fa_nlink = txdr_unsigned(vap->va_nlink); \ + fp->fa_uid = txdr_unsigned(vap->va_uid); \ + fp->fa_gid = txdr_unsigned(vap->va_gid); \ + if (nfsd->nd_nqlflag == NQL_NOVAL) { \ + fp->fa_nfsblocksize = txdr_unsigned(vap->va_blocksize); \ + if (vap->va_type == VFIFO) \ + fp->fa_nfsrdev = 0xffffffff; \ + else \ + fp->fa_nfsrdev = txdr_unsigned(vap->va_rdev); \ + fp->fa_nfsfsid = txdr_unsigned(vap->va_fsid); \ + fp->fa_nfsfileid = txdr_unsigned(vap->va_fileid); \ + fp->fa_nfssize = txdr_unsigned(vap->va_size); \ + fp->fa_nfsblocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); \ + txdr_nfstime(&vap->va_atime, &fp->fa_nfsatime); \ + txdr_nfstime(&vap->va_mtime, &fp->fa_nfsmtime); \ + fp->fa_nfsctime.nfs_sec = txdr_unsigned(vap->va_ctime.ts_sec); \ + fp->fa_nfsctime.nfs_usec = txdr_unsigned(vap->va_gen); \ + } else { \ + fp->fa_nqblocksize = txdr_unsigned(vap->va_blocksize); \ + if (vap->va_type == VFIFO) \ + fp->fa_nqrdev = 0xffffffff; \ + else \ + fp->fa_nqrdev = txdr_unsigned(vap->va_rdev); \ + fp->fa_nqfsid = txdr_unsigned(vap->va_fsid); \ + fp->fa_nqfileid = txdr_unsigned(vap->va_fileid); \ + txdr_hyper(&vap->va_size, &fp->fa_nqsize); \ + txdr_hyper(&vap->va_bytes, &fp->fa_nqbytes); \ + txdr_nqtime(&vap->va_atime, &fp->fa_nqatime); \ + txdr_nqtime(&vap->va_mtime, 
&fp->fa_nqmtime); \ + txdr_nqtime(&vap->va_ctime, &fp->fa_nqctime); \ + fp->fa_nqflags = txdr_unsigned(vap->va_flags); \ + fp->fa_nqgen = txdr_unsigned(vap->va_gen); \ + txdr_hyper(&vap->va_filerev, &fp->fa_nqfilerev); \ + } + diff --git a/sys/nfs/nfsmount.h b/sys/nfs/nfsmount.h new file mode 100644 index 00000000000..4d74acb38a5 --- /dev/null +++ b/sys/nfs/nfsmount.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfsmount.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Mount structure. + * One allocated on every NFS mount. + * Holds NFS specific information for mount. + */ +struct nfsmount { + int nm_flag; /* Flags for soft/hard... */ + struct mount *nm_mountp; /* Vfs structure for this filesystem */ + int nm_numgrps; /* Max. size of groupslist */ + nfsv2fh_t nm_fh; /* File handle of root dir */ + struct socket *nm_so; /* Rpc socket */ + int nm_sotype; /* Type of socket */ + int nm_soproto; /* and protocol */ + int nm_soflags; /* pr_flags for socket protocol */ + struct mbuf *nm_nam; /* Addr of server */ + int nm_timeo; /* Init timer for NFSMNT_DUMBTIMR */ + int nm_retry; /* Max retries */ + int nm_srtt[4]; /* Timers for rpcs */ + int nm_sdrtt[4]; + int nm_sent; /* Request send count */ + int nm_cwnd; /* Request send window */ + int nm_timeouts; /* Request timeouts */ + int nm_deadthresh; /* Threshold of timeouts-->dead server*/ + int nm_rsize; /* Max size of read rpc */ + int nm_wsize; /* Max size of write rpc */ + int nm_readahead; /* Num. 
of blocks to readahead */ + int nm_leaseterm; /* Term (sec) for NQNFS lease */ + struct nfsnode *nm_tnext; /* Head of lease timer queue */ + struct nfsnode *nm_tprev; + struct vnode *nm_inprog; /* Vnode in prog by nqnfs_clientd() */ + uid_t nm_authuid; /* Uid for authenticator */ + int nm_authtype; /* Authenticator type */ + int nm_authlen; /* and length */ + char *nm_authstr; /* Authenticator string */ +}; + +#ifdef KERNEL +/* + * Convert mount ptr to nfsmount ptr. + */ +#define VFSTONFS(mp) ((struct nfsmount *)((mp)->mnt_data)) +#endif /* KERNEL */ + +/* + * Prototypes for NFS mount operations + */ +int nfs_mount __P(( + struct mount *mp, + char *path, + caddr_t data, + struct nameidata *ndp, + struct proc *p)); +int nfs_start __P(( + struct mount *mp, + int flags, + struct proc *p)); +int nfs_unmount __P(( + struct mount *mp, + int mntflags, + struct proc *p)); +int nfs_root __P(( + struct mount *mp, + struct vnode **vpp)); +int nfs_quotactl __P(( + struct mount *mp, + int cmds, + uid_t uid, + caddr_t arg, + struct proc *p)); +int nfs_statfs __P(( + struct mount *mp, + struct statfs *sbp, + struct proc *p)); +int nfs_sync __P(( + struct mount *mp, + int waitfor, + struct ucred *cred, + struct proc *p)); +int nfs_fhtovp __P(( + struct mount *mp, + struct fid *fhp, + struct mbuf *nam, + struct vnode **vpp, + int *exflagsp, + struct ucred **credanonp)); +int nfs_vptofh __P(( + struct vnode *vp, + struct fid *fhp)); +int nfs_init __P(()); diff --git a/sys/nfs/nfsnode.h b/sys/nfs/nfsnode.h new file mode 100644 index 00000000000..f5fee5bf2f3 --- /dev/null +++ b/sys/nfs/nfsnode.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)nfsnode.h 8.4 (Berkeley) 2/13/94 + */ + +/* + * Silly rename structure that hangs off the nfsnode until the name + * can be removed by nfs_inactive() + */ +struct sillyrename { + struct ucred *s_cred; + struct vnode *s_dvp; + long s_namlen; + char s_name[20]; +}; + +/* + * The nfsnode is the nfs equivalent to ufs's inode. Any similarity + * is purely coincidental. + * There is a unique nfsnode allocated for each active file, + * each current directory, each mounted-on file, text file, and the root. + * An nfsnode is 'named' by its file handle. (nget/nfs_node.c) + */ + +struct nfsnode { + struct nfsnode *n_forw; /* hash, forward */ + struct nfsnode **n_back; /* hash, backward */ + nfsv2fh_t n_fh; /* NFS File Handle */ + long n_flag; /* Flag for locking.. */ + struct vnode *n_vnode; /* vnode associated with this node */ + struct vattr n_vattr; /* Vnode attribute cache */ + time_t n_attrstamp; /* Time stamp for cached attributes */ + struct sillyrename *n_sillyrename; /* Ptr to silly rename struct */ + u_quad_t n_size; /* Current size of file */ + int n_error; /* Save write error value */ + u_long n_direofoffset; /* Dir. EOF offset cache */ + time_t n_mtime; /* Prev modify time. */ + time_t n_ctime; /* Prev create time. */ + u_quad_t n_brev; /* Modify rev when cached */ + u_quad_t n_lrev; /* Modify rev for lease */ + time_t n_expiry; /* Lease expiry time */ + struct nfsnode *n_tnext; /* Nqnfs timer chain */ + struct nfsnode *n_tprev; + long spare1; /* To 8 byte boundary */ + struct sillyrename n_silly; /* Silly rename struct */ + struct timeval n_atim; /* Special file times */ + struct timeval n_mtim; +}; + +/* + * Flags for n_flag + */ +#define NFLUSHWANT 0x0001 /* Want wakeup from a flush in prog. 
*/ +#define NFLUSHINPROG 0x0002 /* Avoid multiple calls to vinvalbuf() */ +#define NMODIFIED 0x0004 /* Might have a modified buffer in bio */ +#define NWRITEERR 0x0008 /* Flag write errors so close will know */ +#define NQNFSNONCACHE 0x0020 /* Non-cachable lease */ +#define NQNFSWRITE 0x0040 /* Write lease */ +#define NQNFSEVICTED 0x0080 /* Has been evicted */ +#define NACC 0x0100 /* Special file accessed */ +#define NUPD 0x0200 /* Special file updated */ +#define NCHG 0x0400 /* Special file times changed */ + +/* + * Convert between nfsnode pointers and vnode pointers + */ +#define VTONFS(vp) ((struct nfsnode *)(vp)->v_data) +#define NFSTOV(np) ((struct vnode *)(np)->n_vnode) + +/* + * Queue head for nfsiod's + */ +TAILQ_HEAD(nfsbufs, buf) nfs_bufq; + +#ifdef KERNEL +/* + * Prototypes for NFS vnode operations + */ +int nfs_lookup __P((struct vop_lookup_args *)); +int nfs_create __P((struct vop_create_args *)); +int nfs_mknod __P((struct vop_mknod_args *)); +int nfs_open __P((struct vop_open_args *)); +int nfs_close __P((struct vop_close_args *)); +int nfsspec_close __P((struct vop_close_args *)); +#ifdef FIFO +int nfsfifo_close __P((struct vop_close_args *)); +#endif +int nfs_access __P((struct vop_access_args *)); +int nfsspec_access __P((struct vop_access_args *)); +int nfs_getattr __P((struct vop_getattr_args *)); +int nfs_setattr __P((struct vop_setattr_args *)); +int nfs_read __P((struct vop_read_args *)); +int nfs_write __P((struct vop_write_args *)); +int nfsspec_read __P((struct vop_read_args *)); +int nfsspec_write __P((struct vop_write_args *)); +#ifdef FIFO +int nfsfifo_read __P((struct vop_read_args *)); +int nfsfifo_write __P((struct vop_write_args *)); +#endif +#define nfs_ioctl ((int (*) __P((struct vop_ioctl_args *)))enoioctl) +#define nfs_select ((int (*) __P((struct vop_select_args *)))seltrue) +int nfs_mmap __P((struct vop_mmap_args *)); +int nfs_fsync __P((struct vop_fsync_args *)); +#define nfs_seek ((int (*) __P((struct vop_seek_args 
*)))nullop) +int nfs_remove __P((struct vop_remove_args *)); +int nfs_link __P((struct vop_link_args *)); +int nfs_rename __P((struct vop_rename_args *)); +int nfs_mkdir __P((struct vop_mkdir_args *)); +int nfs_rmdir __P((struct vop_rmdir_args *)); +int nfs_symlink __P((struct vop_symlink_args *)); +int nfs_readdir __P((struct vop_readdir_args *)); +int nfs_readlink __P((struct vop_readlink_args *)); +int nfs_abortop __P((struct vop_abortop_args *)); +int nfs_inactive __P((struct vop_inactive_args *)); +int nfs_reclaim __P((struct vop_reclaim_args *)); +int nfs_lock __P((struct vop_lock_args *)); +int nfs_unlock __P((struct vop_unlock_args *)); +int nfs_bmap __P((struct vop_bmap_args *)); +int nfs_strategy __P((struct vop_strategy_args *)); +int nfs_print __P((struct vop_print_args *)); +int nfs_islocked __P((struct vop_islocked_args *)); +int nfs_pathconf __P((struct vop_pathconf_args *)); +int nfs_advlock __P((struct vop_advlock_args *)); +int nfs_blkatoff __P((struct vop_blkatoff_args *)); +int nfs_vget __P((struct mount *, ino_t, struct vnode **)); +int nfs_valloc __P((struct vop_valloc_args *)); +#define nfs_reallocblks \ + ((int (*) __P((struct vop_reallocblks_args *)))eopnotsupp) +int nfs_vfree __P((struct vop_vfree_args *)); +int nfs_truncate __P((struct vop_truncate_args *)); +int nfs_update __P((struct vop_update_args *)); +int nfs_bwrite __P((struct vop_bwrite_args *)); +#endif /* KERNEL */ diff --git a/sys/nfs/nfsrtt.h b/sys/nfs/nfsrtt.h new file mode 100644 index 00000000000..0d23880019b --- /dev/null +++ b/sys/nfs/nfsrtt.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfsrtt.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Definitions for performance monitor. + * The client and server logging are turned on by setting the global + * constant "nfsrtton" to 1. + */ +#define NFSRTTLOGSIZ 128 + +/* + * Circular log of client side rpc activity. Each log entry is for one + * rpc filled in upon completion. (ie. 
in order of completion) + * The "pos" is the table index for the "next" entry, therefore the + * list goes from nfsrtt.rttl[pos] --> nfsrtt.rttl[pos - 1] in + * chronological order of completion. + */ +struct nfsrtt { + int pos; /* Position in array for next entry */ + struct rttl { + int proc; /* NFS procedure number */ + int rtt; /* Measured round trip time */ + int rto; /* Round Trip Timeout */ + int sent; /* # rpcs in progress */ + int cwnd; /* Send window */ + int srtt; /* Ave Round Trip Time */ + int sdrtt; /* Ave mean deviation of RTT */ + fsid_t fsid; /* Fsid for mount point */ + struct timeval tstamp; /* Timestamp of log entry */ + } rttl[NFSRTTLOGSIZ]; +}; + +/* + * And definitions for server side performance monitor. + * The log organization is the same as above except it is filled in at the + * time the server sends the rpc reply. + */ + +/* + * Bits for the flags field. + */ +#define DRT_NQNFS 0x01 /* Rpc used Nqnfs protocol */ +#define DRT_TCP 0x02 /* Client used TCP transport */ +#define DRT_CACHEREPLY 0x04 /* Reply was from recent request cache */ +#define DRT_CACHEDROP 0x08 /* Rpc request dropped, due to recent reply */ + +/* + * Server log structure + * NB: ipadr == INADDR_ANY indicates a client using a non IP protocol. + * (ISO perhaps?) + */ +struct nfsdrt { + int pos; /* Position of next log entry */ + struct drt { + int flag; /* Bits as defined above */ + int proc; /* NFS procedure number */ + u_long ipadr; /* IP address of client */ + int resptime; /* Response time (usec) */ + struct timeval tstamp; /* Timestamp of log entry */ + } drt[NFSRTTLOGSIZ]; +}; diff --git a/sys/nfs/nfsrvcache.h b/sys/nfs/nfsrvcache.h new file mode 100644 index 00000000000..26da2c275df --- /dev/null +++ b/sys/nfs/nfsrvcache.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)nfsrvcache.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Definitions for the server recent request cache + */ + +#define NFSRVCACHESIZ 256 + +struct nfsrvcache { + struct nfsrvcache *rc_forw; /* Hash chain links */ + struct nfsrvcache **rc_back; /* Hash chain links */ + struct nfsrvcache *rc_next; /* Lru list */ + struct nfsrvcache **rc_prev; /* Lru list */ + u_long rc_xid; /* rpc id number */ + union { + struct mbuf *ru_repmb; /* Reply mbuf list OR */ + int ru_repstat; /* Reply status */ + } rc_un; + union nethostaddr rc_haddr; /* Host address */ + short rc_proc; /* rpc proc number */ + u_char rc_state; /* Current state of request */ + u_char rc_flag; /* Flag bits */ +}; + +#define rc_reply rc_un.ru_repmb +#define rc_status rc_un.ru_repstat +#define rc_inetaddr rc_haddr.had_inetaddr +#define rc_nam rc_haddr.had_nam + +/* Cache entry states */ +#define RC_UNUSED 0 +#define RC_INPROG 1 +#define RC_DONE 2 + +/* Return values */ +#define RC_DROPIT 0 +#define RC_REPLY 1 +#define RC_DOIT 2 +#define RC_CHECKIT 3 + +/* Flag bits */ +#define RC_LOCKED 0x01 +#define RC_WANTED 0x02 +#define RC_REPSTATUS 0x04 +#define RC_REPMBUF 0x08 +#define RC_NQNFS 0x10 +#define RC_INETADDR 0x20 +#define RC_NAM 0x40 diff --git a/sys/nfs/nfsv2.h b/sys/nfs/nfsv2.h new file mode 100644 index 00000000000..e9d2985efac --- /dev/null +++ b/sys/nfs/nfsv2.h @@ -0,0 +1,260 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfsv2.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * nfs definitions as per the version 2 specs + */ + +/* + * Constants as defined in the Sun NFS Version 2 spec. 
+ * "NFS: Network File System Protocol Specification" RFC1094 + */ + +#define NFS_PORT 2049 +#define NFS_PROG 100003 +#define NFS_VER2 2 +#define NFS_MAXDGRAMDATA 8192 +#define NFS_MAXDATA 32768 +#define NFS_MAXPATHLEN 1024 +#define NFS_MAXNAMLEN 255 +#define NFS_FHSIZE 32 +#define NFS_MAXPKTHDR 404 +#define NFS_MAXPACKET (NFS_MAXPKTHDR+NFS_MAXDATA) +#define NFS_MINPACKET 20 +#define NFS_FABLKSIZE 512 /* Size in bytes of a block wrt fa_blocks */ + +/* Stat numbers for rpc returns */ +#define NFS_OK 0 +#define NFSERR_PERM 1 +#define NFSERR_NOENT 2 +#define NFSERR_IO 5 +#define NFSERR_NXIO 6 +#define NFSERR_ACCES 13 +#define NFSERR_EXIST 17 +#define NFSERR_NODEV 19 +#define NFSERR_NOTDIR 20 +#define NFSERR_ISDIR 21 +#define NFSERR_FBIG 27 +#define NFSERR_NOSPC 28 +#define NFSERR_ROFS 30 +#define NFSERR_NAMETOL 63 +#define NFSERR_NOTEMPTY 66 +#define NFSERR_DQUOT 69 +#define NFSERR_STALE 70 +#define NFSERR_WFLUSH 99 + +/* Sizes in bytes of various nfs rpc components */ +#define NFSX_FH 32 +#define NFSX_UNSIGNED 4 +#define NFSX_NFSFATTR 68 +#define NFSX_NQFATTR 92 +#define NFSX_NFSSATTR 32 +#define NFSX_NQSATTR 44 +#define NFSX_COOKIE 4 +#define NFSX_NFSSTATFS 20 +#define NFSX_NQSTATFS 28 +#define NFSX_FATTR(isnq) ((isnq) ? NFSX_NQFATTR : NFSX_NFSFATTR) +#define NFSX_SATTR(isnq) ((isnq) ? NFSX_NQSATTR : NFSX_NFSSATTR) +#define NFSX_STATFS(isnq) ((isnq) ? 
NFSX_NQSTATFS : NFSX_NFSSTATFS) + +/* nfs rpc procedure numbers */ +#define NFSPROC_NULL 0 +#define NFSPROC_GETATTR 1 +#define NFSPROC_SETATTR 2 +#define NFSPROC_NOOP 3 +#define NFSPROC_ROOT NFSPROC_NOOP /* Obsolete; shares the value of NFSPROC_NOOP */ +#define NFSPROC_LOOKUP 4 +#define NFSPROC_READLINK 5 +#define NFSPROC_READ 6 +#define NFSPROC_WRITECACHE NFSPROC_NOOP /* Obsolete; shares the value of NFSPROC_NOOP, so 7 is not defined here */ +#define NFSPROC_WRITE 8 +#define NFSPROC_CREATE 9 +#define NFSPROC_REMOVE 10 +#define NFSPROC_RENAME 11 +#define NFSPROC_LINK 12 +#define NFSPROC_SYMLINK 13 +#define NFSPROC_MKDIR 14 +#define NFSPROC_RMDIR 15 +#define NFSPROC_READDIR 16 +#define NFSPROC_STATFS 17 + +/* NQ nfs numbers */ +#define NQNFSPROC_READDIRLOOK 18 +#define NQNFSPROC_GETLEASE 19 +#define NQNFSPROC_VACATED 20 +#define NQNFSPROC_EVICTED 21 +#define NQNFSPROC_ACCESS 22 + +#define NFS_NPROCS 23 /* One more than the highest procedure number above */ +/* Conversion macros: vtonfs_mode() encodes a VFIFO using VCHR mode bits, nfstov_mode() keeps only the low 07777 bits, and nfstov_type() masks the wire value with 0x7 before indexing ntov_type[] */ +extern int vttoif_tab[]; +#define vtonfs_mode(t,m) \ + txdr_unsigned(((t) == VFIFO) ? MAKEIMODE(VCHR, (m)) : \ + MAKEIMODE((t), (m))) +#define nfstov_mode(a) (fxdr_unsigned(u_short, (a))&07777) +#define vtonfs_type(a) txdr_unsigned(nfs_type[((long)(a))]) +#define nfstov_type(a) ntov_type[fxdr_unsigned(u_long,(a))&0x7] + +/* File types */ +typedef enum { NFNON=0, NFREG=1, NFDIR=2, NFBLK=3, NFCHR=4, NFLNK=5 } nfstype; + +/* Structs for common parts of the rpc's */ +struct nfsv2_time { + u_long nfs_sec; + u_long nfs_usec; +}; + +struct nqnfs_time { + u_long nq_sec; + u_long nq_nsec; +}; + +/* + * File attributes and settable attributes. These structures cover both + * NFS version 2 and the NQNFS protocol. Note that the union is only + * used so that one pointer can refer to both variants. These structures + * go out on the wire and must be densely packed, so no quad data types + * are used. (all fields are longs or u_longs or structures of same) + * NB: You can't do sizeof(struct nfsv2_fattr), you must use the + * NFSX_FATTR(isnq) macro. 
+ */ +struct nfsv2_fattr { + u_long fa_type; + u_long fa_mode; + u_long fa_nlink; + u_long fa_uid; + u_long fa_gid; + union { + struct { + u_long nfsfa_size; + u_long nfsfa_blocksize; + u_long nfsfa_rdev; + u_long nfsfa_blocks; + u_long nfsfa_fsid; + u_long nfsfa_fileid; + struct nfsv2_time nfsfa_atime; + struct nfsv2_time nfsfa_mtime; + struct nfsv2_time nfsfa_ctime; + } fa_nfsv2; + struct { + struct { + u_long nqfa_qsize[2]; + } nqfa_size; + u_long nqfa_blocksize; + u_long nqfa_rdev; + struct { + u_long nqfa_qbytes[2]; + } nqfa_bytes; + u_long nqfa_fsid; + u_long nqfa_fileid; + struct nqnfs_time nqfa_atime; + struct nqnfs_time nqfa_mtime; + struct nqnfs_time nqfa_ctime; + u_long nqfa_flags; + u_long nqfa_gen; + struct { + u_long nqfa_qfilerev[2]; + } nqfa_filerev; + } fa_nqnfs; + } fa_un; +}; + +/* and some ugly defines for accessing union components */ +#define fa_nfssize fa_un.fa_nfsv2.nfsfa_size +#define fa_nfsblocksize fa_un.fa_nfsv2.nfsfa_blocksize +#define fa_nfsrdev fa_un.fa_nfsv2.nfsfa_rdev +#define fa_nfsblocks fa_un.fa_nfsv2.nfsfa_blocks +#define fa_nfsfsid fa_un.fa_nfsv2.nfsfa_fsid +#define fa_nfsfileid fa_un.fa_nfsv2.nfsfa_fileid +#define fa_nfsatime fa_un.fa_nfsv2.nfsfa_atime +#define fa_nfsmtime fa_un.fa_nfsv2.nfsfa_mtime +#define fa_nfsctime fa_un.fa_nfsv2.nfsfa_ctime +#define fa_nqsize fa_un.fa_nqnfs.nqfa_size +#define fa_nqblocksize fa_un.fa_nqnfs.nqfa_blocksize +#define fa_nqrdev fa_un.fa_nqnfs.nqfa_rdev +#define fa_nqbytes fa_un.fa_nqnfs.nqfa_bytes +#define fa_nqfsid fa_un.fa_nqnfs.nqfa_fsid +#define fa_nqfileid fa_un.fa_nqnfs.nqfa_fileid +#define fa_nqatime fa_un.fa_nqnfs.nqfa_atime +#define fa_nqmtime fa_un.fa_nqnfs.nqfa_mtime +#define fa_nqctime fa_un.fa_nqnfs.nqfa_ctime +#define fa_nqflags fa_un.fa_nqnfs.nqfa_flags +#define fa_nqgen fa_un.fa_nqnfs.nqfa_gen +#define fa_nqfilerev fa_un.fa_nqnfs.nqfa_filerev + +struct nfsv2_sattr { + u_long sa_mode; + u_long sa_uid; + u_long sa_gid; + union { + struct { + u_long nfssa_size; + struct nfsv2_time 
nfssa_atime; + struct nfsv2_time nfssa_mtime; + } sa_nfsv2; + struct { + struct { + u_long nqsa_qsize[2]; + } nqsa_size; + struct nqnfs_time nqsa_atime; + struct nqnfs_time nqsa_mtime; + u_long nqsa_flags; + u_long nqsa_rdev; + } sa_nqnfs; + } sa_un; +}; + +/* and some ugly defines for accessing the unions */ +#define sa_nfssize sa_un.sa_nfsv2.nfssa_size +#define sa_nfsatime sa_un.sa_nfsv2.nfssa_atime +#define sa_nfsmtime sa_un.sa_nfsv2.nfssa_mtime +#define sa_nqsize sa_un.sa_nqnfs.nqsa_size +#define sa_nqatime sa_un.sa_nqnfs.nqsa_atime +#define sa_nqmtime sa_un.sa_nqnfs.nqsa_mtime +#define sa_nqflags sa_un.sa_nqnfs.nqsa_flags +#define sa_nqrdev sa_un.sa_nqnfs.nqsa_rdev + +struct nfsv2_statfs { + u_long sf_tsize; + u_long sf_bsize; + u_long sf_blocks; + u_long sf_bfree; + u_long sf_bavail; + u_long sf_files; /* Nqnfs only */ + u_long sf_ffree; /* ditto */ +}; diff --git a/sys/nfs/nqnfs.h b/sys/nfs/nqnfs.h new file mode 100644 index 00000000000..730741a4137 --- /dev/null +++ b/sys/nfs/nqnfs.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. 
+ * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nqnfs.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Definitions for NQNFS (Not Quite NFS) cache consistency protocol. + */ + +/* Tunable constants */ +#define NQ_CLOCKSKEW 3 /* Clock skew factor (sec) */ +#define NQ_WRITESLACK 5 /* Delay for write cache flushing */ +#define NQ_MAXLEASE 60 /* Max lease duration (sec) */ +#define NQ_MINLEASE 5 /* Min lease duration (sec) */ +#define NQ_DEFLEASE 30 /* Default lease duration (sec) */ +#define NQ_RENEWAL 3 /* Time before expiry (sec) to renew */ +#define NQ_TRYLATERDEL 15 /* Initial try later delay (sec) */ +#define NQ_MAXNUMLEASE 2048 /* Upper bound on number of server leases */ +#define NQ_DEADTHRESH NQ_NEVERDEAD /* Default nm_deadthresh */ +#define NQ_NEVERDEAD 9 /* Greater than max. 
nm_timeouts */ +#define NQLCHSZ 256 /* Server hash table size */ + +#define NQNFS_PROG 300105 /* As assigned by Sun */ +#define NQNFS_VER1 1 +#define NQNFS_EVICTSIZ 156 /* Size of eviction request in bytes */ + +/* + * Definitions used for saving the "last lease expires" time in Non-volatile + * RAM on the server. The default definitions below assume that NOVRAM is not + * available. + */ +#define NQSTORENOVRAM(t) +#define NQLOADNOVRAM(t) + +/* + * Defn and structs used on the server to maintain state for current leases. + * The list of host(s) that hold the lease are kept as nqhost structures. + * The first one lives in nqlease and any others are held in a linked + * list of nqm structures hanging off of nqlease. + * + * Each nqlease structure is chained into two lists. The first is a list + * ordered by increasing expiry time for nqsrv_timer() and the second is a chain + * hashed on lc_fh. + */ +#define LC_MOREHOSTSIZ 10 + +struct nqhost { + union { + struct { + u_short udp_flag; + u_short udp_port; + union nethostaddr udp_haddr; + } un_udp; + struct { + u_short connless_flag; + u_short connless_spare; + union nethostaddr connless_haddr; + } un_connless; + struct { + u_short conn_flag; + u_short conn_spare; + struct nfssvc_sock *conn_slp; + } un_conn; + } lph_un; +}; +#define lph_flag lph_un.un_udp.udp_flag +#define lph_port lph_un.un_udp.udp_port +#define lph_haddr lph_un.un_udp.udp_haddr +#define lph_inetaddr lph_un.un_udp.udp_haddr.had_inetaddr +#define lph_claddr lph_un.un_connless.connless_haddr +#define lph_nam lph_un.un_connless.connless_haddr.had_nam +#define lph_slp lph_un.un_conn.conn_slp + +struct nqlease { + struct nqlease *lc_chain1[2]; /* Timer queue list (must be first) */ + struct nqlease *lc_fhnext; /* Fhandle hash list */ + struct nqlease **lc_fhprev; + time_t lc_expiry; /* Expiry time (sec) */ + struct nqhost lc_host; /* Host that got lease */ + struct nqm *lc_morehosts; /* Other hosts that share read lease */ + fsid_t lc_fsid; /* Fhandle */ + 
char lc_fiddata[MAXFIDSZ]; + struct vnode *lc_vp; /* Soft reference to associated vnode */ +}; +#define lc_flag lc_host.lph_un.un_udp.udp_flag + +/* lc_flag bits */ +#define LC_VALID 0x0001 /* Host address valid */ +#define LC_WRITE 0x0002 /* Write cache */ +#define LC_NONCACHABLE 0x0004 /* Non-cachable lease */ +#define LC_LOCKED 0x0008 /* Locked */ +#define LC_WANTED 0x0010 /* Lock wanted */ +#define LC_EXPIREDWANTED 0x0020 /* Want lease when expired */ +#define LC_UDP 0x0040 /* Host address for udp socket */ +#define LC_CLTP 0x0080 /* Host address for other connectionless */ +#define LC_LOCAL 0x0100 /* Host is server */ +#define LC_VACATED 0x0200 /* Host has vacated lease */ +#define LC_WRITTEN 0x0400 /* Recently wrote to the leased file */ +#define LC_SREF 0x0800 /* Holds a nfssvc_sock reference */ + +struct nqm { + struct nqm *lpm_next; + struct nqhost lpm_hosts[LC_MOREHOSTSIZ]; +}; + +/* + * Flag bits for flags argument to nqsrv_getlease. + */ +#define NQL_READ LEASE_READ /* Read Request */ +#define NQL_WRITE LEASE_WRITE /* Write Request */ +#define NQL_CHECK 0x4 /* Check for lease */ +#define NQL_NOVAL 0xffffffff /* Invalid */ + +/* + * Special value for slp for local server calls. + */ +#define NQLOCALSLP ((struct nfssvc_sock *) -1) + +/* + * Server side macros. nqsrv_getl() discards the result of nqsrv_getlease() and names nfsd, nam, cache, frev and cred directly, so those identifiers must be in scope where it is used; the flags passed are nfsd->nd_nqlflag unless that is 0 or NQL_NOVAL, in which case (l) | NQL_CHECK is passed. + */ +#define nqsrv_getl(v, l) \ + (void) nqsrv_getlease((v), &nfsd->nd_duration, \ + ((nfsd->nd_nqlflag != 0 && nfsd->nd_nqlflag != NQL_NOVAL) ? nfsd->nd_nqlflag : \ + ((l) | NQL_CHECK)), \ + nfsd, nam, &cache, &frev, cred) + +/* + * Client side macros that check for a valid lease. NQNFS_CKINVALID is true when time.tv_sec is past n_expiry while nm_timeouts is below nm_deadthresh, or when (f) is NQL_WRITE and NQNFSWRITE is clear. NQNFS_CKCACHABLE is true when time.tv_sec is not past n_expiry or nm_timeouts has reached nm_deadthresh, and NQNFSNONCACHE is clear, and (f) is NQL_READ or NQNFSWRITE is set. NQNFS_NEEDLEASE yields, past n_expiry, 0 if NQNFSEVICTED is set and nqnfs_piggy[p] otherwise; within NQ_RENEWAL seconds of n_expiry with nqnfs_piggy[p] non-zero, NQL_WRITE if NQNFSWRITE is set and nqnfs_piggy[p] otherwise; and 0 in the remaining case. These macros evaluate some of their arguments more than once. 
+ */ +#define NQNFS_CKINVALID(v, n, f) \ + ((time.tv_sec > (n)->n_expiry && \ + VFSTONFS((v)->v_mount)->nm_timeouts < VFSTONFS((v)->v_mount)->nm_deadthresh) \ + || ((f) == NQL_WRITE && ((n)->n_flag & NQNFSWRITE) == 0)) + +#define NQNFS_CKCACHABLE(v, f) \ + ((time.tv_sec <= VTONFS(v)->n_expiry || \ + VFSTONFS((v)->v_mount)->nm_timeouts >= VFSTONFS((v)->v_mount)->nm_deadthresh) \ + && (VTONFS(v)->n_flag & NQNFSNONCACHE) == 0 && \ + ((f) == NQL_READ || (VTONFS(v)->n_flag & NQNFSWRITE))) + +#define NQNFS_NEEDLEASE(v, p) \ + (time.tv_sec > VTONFS(v)->n_expiry ? \ + ((VTONFS(v)->n_flag & NQNFSEVICTED) ? 0 : nqnfs_piggy[p]) : \ + (((time.tv_sec + NQ_RENEWAL) > VTONFS(v)->n_expiry && \ + nqnfs_piggy[p]) ? \ + ((VTONFS(v)->n_flag & NQNFSWRITE) ? \ + NQL_WRITE : nqnfs_piggy[p]) : 0)) + +/* + * List head for timer queue. + */ +extern union nqsrvthead { + union nqsrvthead *th_head[2]; + struct nqlease *th_chain[2]; +} nqthead; +extern struct nqlease **nqfhead; +extern u_long nqfheadhash; + +/* + * Nqnfs return status numbers. + */ +#define NQNFS_EXPIRED 500 +#define NQNFS_TRYLATER 501 +#define NQNFS_AUTHERR 502 diff --git a/sys/nfs/rpcv2.h b/sys/nfs/rpcv2.h new file mode 100644 index 00000000000..9c793a7f875 --- /dev/null +++ b/sys/nfs/rpcv2.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)rpcv2.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Definitions for Sun RPC Version 2, from + * "RPC: Remote Procedure Call Protocol Specification" RFC1057 + */ + +/* Version # */ +#define RPC_VER2 2 + +/* Authentication */ +#define RPCAUTH_NULL 0 +#define RPCAUTH_UNIX 1 +#define RPCAUTH_SHORT 2 +#define RPCAUTH_NQNFS 300000 +#define RPCAUTH_MAXSIZ 400 +#define RPCAUTH_UNIXGIDS 16 + +/* Rpc Constants */ +#define RPC_CALL 0 +#define RPC_REPLY 1 +#define RPC_MSGACCEPTED 0 +#define RPC_MSGDENIED 1 +#define RPC_PROGUNAVAIL 1 +#define RPC_PROGMISMATCH 2 +#define RPC_PROCUNAVAIL 3 +#define RPC_GARBAGE 4 /* I like this one */ +#define RPC_MISMATCH 0 +#define RPC_AUTHERR 1 + +/* Authentication failures */ +#define AUTH_BADCRED 1 +#define AUTH_REJECTCRED 2 +#define AUTH_BADVERF 3 +#define AUTH_REJECTVERF 4 +#define AUTH_TOOWEAK 5 /* Give em wheaties */ + +/* Sizes of rpc header parts */ +#define RPC_SIZ 24 +#define RPC_REPLYSIZ 28 + +/* RPC Prog definitions */ +#define RPCPROG_MNT 100005 +#define RPCMNT_VER1 1 +#define RPCMNT_MOUNT 1 +#define RPCMNT_DUMP 2 +#define RPCMNT_UMOUNT 3 +#define RPCMNT_UMNTALL 4 +#define RPCMNT_EXPORT 5 +#define RPCMNT_NAMELEN 255 +#define RPCMNT_PATHLEN 1024 +#define RPCPROG_NFS 100003 diff --git a/sys/nfs/xdr_subs.h b/sys/nfs/xdr_subs.h new file mode 100644 index 00000000000..c2aa4f3f343 --- /dev/null +++ b/sys/nfs/xdr_subs.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)xdr_subs.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Macros used for conversion to/from xdr representation by nfs... + * These use the MACHINE DEPENDENT routines ntohl, htonl + * As defined by "XDR: External Data Representation Standard" RFC1014 + * + * To simplify the implementation, we use ntohl/htonl even on big-endian + * machines, and count on them being `#define'd away. Some of these + * might be slightly more efficient as quad_t copies on a big-endian, + * but we cannot count on their alignment anyway. 
+ */ + +#define fxdr_unsigned(t, v) ((t)ntohl((long)(v))) /* xdr to host order, cast to type t */ +#define txdr_unsigned(v) (htonl((long)(v))) /* host to xdr order */ + +#define fxdr_nfstime(f, t) { /* xdr nfsv2_time (f) to host ts_sec/ts_nsec (t); usec scaled up to nsec after ntohl */ \ + (t)->ts_sec = ntohl(((struct nfsv2_time *)(f))->nfs_sec); \ + (t)->ts_nsec = 1000 * ntohl(((struct nfsv2_time *)(f))->nfs_usec); \ +} +#define txdr_nfstime(f, t) { /* host ts_sec/ts_nsec (f) to xdr nfsv2_time (t); nsec is divided down to usec in host order and only then byte-swapped */ \ + ((struct nfsv2_time *)(t))->nfs_sec = htonl((f)->ts_sec); \ + ((struct nfsv2_time *)(t))->nfs_usec = htonl((f)->ts_nsec / 1000); \ +} + +#define fxdr_nqtime(f, t) { /* xdr nqnfs_time (f) to host ts_sec/ts_nsec (t); nsec on both sides, no scaling */ \ + (t)->ts_sec = ntohl(((struct nqnfs_time *)(f))->nq_sec); \ + (t)->ts_nsec = ntohl(((struct nqnfs_time *)(f))->nq_nsec); \ +} +#define txdr_nqtime(f, t) { /* host ts_sec/ts_nsec (f) to xdr nqnfs_time (t); nsec on both sides, no scaling */ \ + ((struct nqnfs_time *)(t))->nq_sec = htonl((f)->ts_sec); \ + ((struct nqnfs_time *)(t))->nq_nsec = htonl((f)->ts_nsec); \ +} + +#define fxdr_hyper(f, t) { /* two xdr longs at (f), high word first, to host quad word order at (t) */ \ + ((long *)(t))[_QUAD_HIGHWORD] = ntohl(((long *)(f))[0]); \ + ((long *)(t))[_QUAD_LOWWORD] = ntohl(((long *)(f))[1]); \ +} +#define txdr_hyper(f, t) { /* host quad at (f) to two xdr longs at (t), high word first */ \ + ((long *)(t))[0] = htonl(((long *)(f))[_QUAD_HIGHWORD]); \ + ((long *)(t))[1] = htonl(((long *)(f))[_QUAD_LOWWORD]); \ +} diff --git a/sys/nfsclient/nfs.h b/sys/nfsclient/nfs.h new file mode 100644 index 00000000000..261fd42657a --- /dev/null +++ b/sys/nfsclient/nfs.h @@ -0,0 +1,297 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Tunable constants for nfs + */ + +#define NFS_MAXIOVEC 34 +#define NFS_HZ 25 /* Ticks per second for NFS timeouts */ +#define NFS_TIMEO (1*NFS_HZ) /* Default timeout = 1 second */ +#define NFS_MINTIMEO (1*NFS_HZ) /* Min timeout to use */ +#define NFS_MAXTIMEO (60*NFS_HZ) /* Max timeout to backoff to */ +#define NFS_MINIDEMTIMEO (5*NFS_HZ) /* Min timeout for non-idempotent ops*/ +#define NFS_MAXREXMIT 100 /* Stop counting after this many */ +#define NFS_MAXWINDOW 1024 /* Max number of outstanding requests */ +#define NFS_RETRANS 10 /* Num of retrans for soft mounts */ +#define NFS_MAXGRPS 16 /* Max. 
size of groups list */ +#define NFS_MINATTRTIMO 5 /* Min attribute cache timeout in sec */ +#define NFS_MAXATTRTIMO 60 /* Max attribute cache timeout; upper clamp in NFS_ATTRTIMEO */ +#define NFS_WSIZE 8192 /* Def. write data size <= 8192 */ +#define NFS_RSIZE 8192 /* Def. read data size <= 8192 */ +#define NFS_DEFRAHEAD 1 /* Def. read ahead # blocks */ +#define NFS_MAXRAHEAD 4 /* Max. read ahead # blocks */ +#define NFS_MAXREADDIR NFS_MAXDATA /* Max. size of directory read */ +#define NFS_MAXUIDHASH 64 /* Max. # of hashed uid entries/mp */ +#define NFS_MAXASYNCDAEMON 20 /* Max. number async_daemons runnable */ +#define NFS_DIRBLKSIZ 1024 /* Size of an NFS directory block */ +#define NMOD(a) ((a) % nfs_asyncdaemons) + +/* + * Set the attribute timeout based on how recently the file has been modified. The value is (time.tv_sec - n_mtime) / 10 clamped to the range NFS_MINATTRTIMO..NFS_MAXATTRTIMO, and is NFS_MINATTRTIMO whenever NMODIFIED is set in n_flag; (np) is evaluated more than once. + */ +#define NFS_ATTRTIMEO(np) \ + ((((np)->n_flag & NMODIFIED) || \ + (time.tv_sec - (np)->n_mtime) / 10 < NFS_MINATTRTIMO) ? NFS_MINATTRTIMO : \ + ((time.tv_sec - (np)->n_mtime) / 10 > NFS_MAXATTRTIMO ? NFS_MAXATTRTIMO : \ + (time.tv_sec - (np)->n_mtime) / 10)) + +/* + * Structures for the nfssvc(2) syscall. Not that anyone but nfsd and mount_nfs + * should ever try and use it. + */ +struct nfsd_args { + int sock; /* Socket to serve */ + caddr_t name; /* Client address for connection based sockets */ + int namelen; /* Length of name */ +}; + +struct nfsd_srvargs { + struct nfsd *nsd_nfsd; /* Pointer to in kernel nfsd struct */ + uid_t nsd_uid; /* Effective uid mapped to cred */ + u_long nsd_haddr; /* Ip address of client */ + struct ucred nsd_cr; /* Cred. 
uid maps to */ + int nsd_authlen; /* Length of auth string (ret) */ + char *nsd_authstr; /* Auth string (ret) */ +}; + +struct nfsd_cargs { + char *ncd_dirp; /* Mount dir path */ + uid_t ncd_authuid; /* Effective uid */ + int ncd_authtype; /* Type of authenticator */ + int ncd_authlen; /* Length of authenticator string */ + char *ncd_authstr; /* Authenticator string */ +}; + +/* + * Stats structure + */ +struct nfsstats { + int attrcache_hits; + int attrcache_misses; + int lookupcache_hits; + int lookupcache_misses; + int direofcache_hits; + int direofcache_misses; + int biocache_reads; + int read_bios; + int read_physios; + int biocache_writes; + int write_bios; + int write_physios; + int biocache_readlinks; + int readlink_bios; + int biocache_readdirs; + int readdir_bios; + int rpccnt[NFS_NPROCS]; + int rpcretries; + int srvrpccnt[NFS_NPROCS]; + int srvrpc_errs; + int srv_errs; + int rpcrequests; + int rpctimeouts; + int rpcunexpected; + int rpcinvalid; + int srvcache_inproghits; + int srvcache_idemdonehits; + int srvcache_nonidemdonehits; + int srvcache_misses; + int srvnqnfs_leases; + int srvnqnfs_maxleases; + int srvnqnfs_getleases; +}; + +/* + * Flags for nfssvc() system call. + */ +#define NFSSVC_BIOD 0x002 +#define NFSSVC_NFSD 0x004 +#define NFSSVC_ADDSOCK 0x008 +#define NFSSVC_AUTHIN 0x010 +#define NFSSVC_GOTAUTH 0x040 +#define NFSSVC_AUTHINFAIL 0x080 +#define NFSSVC_MNTD 0x100 + +/* + * The set of signals the interrupt an I/O in progress for NFSMNT_INT mounts. + * What should be in this set is open to debate, but I believe that since + * I/O system calls on ufs are never interrupted by signals the set should + * be minimal. My reasoning is that many current programs that use signals + * such as SIGALRM will not expect file I/O system calls to be interrupted + * by them and break. 
+ */ +#ifdef KERNEL +#define NFSINT_SIGMASK (sigmask(SIGINT)|sigmask(SIGTERM)|sigmask(SIGKILL)| \ + sigmask(SIGHUP)|sigmask(SIGQUIT)) + +/* + * Socket errors ignored for connectionless sockets?? + * For now, ignore them all, except EINTR, ERESTART and EWOULDBLOCK, + * whenever PR_CONNREQUIRED is not set in (s). + */ +#define NFSIGNORE_SOERROR(s, e) \ + ((e) != EINTR && (e) != ERESTART && (e) != EWOULDBLOCK && \ + ((s) & PR_CONNREQUIRED) == 0) + +/* + * Nfs outstanding request list element + */ +struct nfsreq { + struct nfsreq *r_next; + struct nfsreq *r_prev; + struct mbuf *r_mreq; + struct mbuf *r_mrep; + struct mbuf *r_md; + caddr_t r_dpos; + struct nfsmount *r_nmp; + struct vnode *r_vp; + u_long r_xid; + int r_flags; /* flags on request, see below */ + int r_retry; /* max retransmission count */ + int r_rexmit; /* current retrans count */ + int r_timer; /* tick counter on reply */ + int r_procnum; /* NFS procedure number */ + int r_rtt; /* RTT for rpc */ + struct proc *r_procp; /* Proc that did I/O system call */ +}; + +/* Flag values for r_flags */ +#define R_TIMING 0x01 /* timing request (in mntp) */ +#define R_SENT 0x02 /* request has been sent */ +#define R_SOFTTERM 0x04 /* soft mnt, too many retries */ +#define R_INTR 0x08 /* intr mnt, signal pending */ +#define R_SOCKERR 0x10 /* Fatal error on socket */ +#define R_TPRINTFMSG 0x20 /* Did a tprintf msg. */ +#define R_MUSTRESEND 0x40 /* Must resend request */ +#define R_GETONEREP 0x80 /* Probe for one reply only */ + +struct nfsstats nfsstats; /* NOTE(review): a definition, not an extern declaration, in a header */ + +/* + * A list of nfssvc_sock structures is maintained with all the sockets + * that require service by the nfsd. + * The nfsuid structs hang off of the nfssvc_sock structs in both lru + * and uid hash lists.
+ */ +#define NUIDHASHSIZ 32 /* Must be a power of 2: NUIDHASH() masks with NUIDHASHSIZ - 1 */ +#define NUIDHASH(uid) ((uid) & (NUIDHASHSIZ - 1)) + +/* + * Network address hash list element + */ +union nethostaddr { + u_long had_inetaddr; + struct mbuf *had_nam; +}; + +struct nfsuid { + struct nfsuid *nu_lrunext; /* MUST be first */ + struct nfsuid *nu_lruprev; + struct nfsuid *nu_hnext; /* Next entry on the uid hash chain */ + struct nfsuid *nu_hprev; /* Previous entry on the uid hash chain */ + int nu_flag; /* Flags */ + uid_t nu_uid; /* Uid mapped by this entry */ + union nethostaddr nu_haddr; /* Host addr. for dgram sockets */ + struct ucred nu_cr; /* Cred uid mapped to */ +}; + +#define nu_inetaddr nu_haddr.had_inetaddr +#define nu_nam nu_haddr.had_nam +/* Bits for nu_flag */ +#define NU_INETADDR 0x1 + +struct nfssvc_sock { + struct nfsuid *ns_lrunext; /* MUST be first: nfssvc() casts this struct to a struct nfsuid * to use it as the lru list head */ + struct nfsuid *ns_lruprev; + struct nfssvc_sock *ns_next; + struct nfssvc_sock *ns_prev; + int ns_flag; + u_long ns_sref; + struct file *ns_fp; + struct socket *ns_so; + int ns_solock; + struct mbuf *ns_nam; + int ns_cc; + struct mbuf *ns_raw; + struct mbuf *ns_rawend; + int ns_reclen; + struct mbuf *ns_rec; + struct mbuf *ns_recend; + int ns_numuids; /* Number of nfsuid entries allocated for this socket */ + struct nfsuid *ns_uidh[NUIDHASHSIZ]; /* Uid hash chain heads, indexed by NUIDHASH(uid) */ +}; + +/* Bits for "ns_flag" */ +#define SLP_VALID 0x01 +#define SLP_DOREC 0x02 +#define SLP_NEEDQ 0x04 +#define SLP_DISCONN 0x08 +#define SLP_GETSTREAM 0x10 +#define SLP_INIT 0x20 +#define SLP_WANTINIT 0x40 + +#define SLP_ALLFLAGS 0xff + +/* + * One of these structures is allocated for each nfsd. + */ +struct nfsd { + struct nfsd *nd_next; /* Must be first */ + struct nfsd *nd_prev; + int nd_flag; /* NFSD_ flags */ + struct nfssvc_sock *nd_slp; /* Current socket */ + struct mbuf *nd_nam; /* Client addr for datagram req. */ + struct mbuf *nd_mrep; /* Req. mbuf list */ + struct mbuf *nd_md; + caddr_t nd_dpos; /* Position in list */ + int nd_procnum; /* RPC procedure number */ + u_long nd_retxid; /* RPC xid */ + int nd_repstat; /* Reply status value */ + struct ucred nd_cr; /* Credentials for req.
*/ + int nd_nqlflag; /* Leasing flag */ + int nd_duration; /* Lease duration */ + int nd_authlen; /* Authenticator len */ + u_char nd_authstr[RPCAUTH_MAXSIZ]; /* Authenticator data */ + struct proc *nd_procp; /* Proc ptr */ +}; + +#define NFSD_WAITING 0x01 +#define NFSD_CHECKSLP 0x02 +#define NFSD_REQINPROG 0x04 +#define NFSD_NEEDAUTH 0x08 +#define NFSD_AUTHFAIL 0x10 /* Set by nfssvc() when called with NFSSVC_AUTHINFAIL */ +#endif /* KERNEL */ diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c new file mode 100644 index 00000000000..177a278b631 --- /dev/null +++ b/sys/nfsclient/nfs_bio.c @@ -0,0 +1,799 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs_bio.c 8.5 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +struct buf *incore(), *nfs_getcacheblk(); +extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; +extern int nfs_numasync; + +/* + * Vnode op for read using bio + * Any similarity to readip() is purely coincidental + * Returns 0 or an errno value: EINVAL for a negative offset on anything + * but a directory, EINTR when nfs_getcacheblk() yields no buffer, and + * otherwise whatever the getattr, lease, flush, rpc or uiomove calls + * report. + */ +nfs_bioread(vp, uio, ioflag, cred) + register struct vnode *vp; + register struct uio *uio; + int ioflag; + struct ucred *cred; +{ + register struct nfsnode *np = VTONFS(vp); + register int biosize, diff; + struct buf *bp, *rabp; + struct vattr vattr; + struct proc *p; + struct nfsmount *nmp; + daddr_t lbn, bn, rabn; + caddr_t baddr; + int got_buf, nra, error = 0, n, on, not_readin; + +#ifdef lint + ioflag = ioflag; +#endif /* lint */ +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_READ) + panic("nfs_read mode"); +#endif + if (uio->uio_resid == 0) + return (0); + if (uio->uio_offset < 0 && vp->v_type != VDIR) + return (EINVAL); + nmp = VFSTONFS(vp->v_mount); + biosize = nmp->nm_rsize; + p = uio->uio_procp; + /* + * For nfs, cache consistency can only be maintained approximately. + * Although RFC1094 does not specify the criteria, the following is + * believed to be compatible with the reference port. + * For nqnfs, full cache consistency is maintained within the loop.
+ * For nfs: + * If the file's modify time on the server has changed since the + * last read rpc or you have written to the file, + * you may have lost data cache consistency with the + * server, so flush all of the file's data out of the cache. + * Then force a getattr rpc to ensure that you have up to date + * attributes. + * The mount flag NFSMNT_MYWRITE says "Assume that my writes are + * the ones changing the modify time." + * NB: This implies that cache data can be read when up to + * NFS_ATTRTIMEO seconds out of date. If you find that you need current + * attributes this could be forced by setting n_attrstamp to 0 before + * the VOP_GETATTR() call. + */ + if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && vp->v_type != VLNK) { + if (np->n_flag & NMODIFIED) { + if ((nmp->nm_flag & NFSMNT_MYWRITE) == 0 || + vp->v_type != VREG) { + if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1)) + return (error); + } + np->n_attrstamp = 0; + np->n_direofoffset = 0; + if (error = VOP_GETATTR(vp, &vattr, cred, p)) + return (error); + np->n_mtime = vattr.va_mtime.ts_sec; + } else { + if (error = VOP_GETATTR(vp, &vattr, cred, p)) + return (error); + if (np->n_mtime != vattr.va_mtime.ts_sec) { + np->n_direofoffset = 0; + if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1)) + return (error); + np->n_mtime = vattr.va_mtime.ts_sec; + } + } + } + do { + + /* + * Get a valid lease. If cached data is stale, flush it.
+ */ + if (nmp->nm_flag & NFSMNT_NQNFS) { + if (NQNFS_CKINVALID(vp, np, NQL_READ)) { + do { + error = nqnfs_getlease(vp, NQL_READ, cred, p); + } while (error == NQNFS_EXPIRED); + if (error) + return (error); + if (np->n_lrev != np->n_brev || + (np->n_flag & NQNFSNONCACHE) || + ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) { + if (vp->v_type == VDIR) { + np->n_direofoffset = 0; + cache_purge(vp); + } + if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1)) + return (error); + np->n_brev = np->n_lrev; + } + } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) { + np->n_direofoffset = 0; + cache_purge(vp); + if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1)) + return (error); + } + } + if (np->n_flag & NQNFSNONCACHE) { + switch (vp->v_type) { + case VREG: + error = nfs_readrpc(vp, uio, cred); + break; + case VLNK: + error = nfs_readlinkrpc(vp, uio, cred); + break; + case VDIR: + error = nfs_readdirrpc(vp, uio, cred); + break; + }; + return (error); + } + baddr = (caddr_t)0; + switch (vp->v_type) { + case VREG: + nfsstats.biocache_reads++; + lbn = uio->uio_offset / biosize; + on = uio->uio_offset & (biosize-1); /* NOTE(review): equals offset % biosize only if biosize is a power of 2 */ + bn = lbn * (biosize / DEV_BSIZE); + not_readin = 1; + + /* + * Start the read ahead(s), as required. + */ + if (nfs_numasync > 0 && nmp->nm_readahead > 0 && + lbn == vp->v_lastr + 1) { + for (nra = 0; nra < nmp->nm_readahead && + (lbn + 1 + nra) * biosize < np->n_size; nra++) { + rabn = (lbn + 1 + nra) * (biosize / DEV_BSIZE); + if (!incore(vp, rabn)) { + rabp = nfs_getcacheblk(vp, rabn, biosize, p); + if (!rabp) + return (EINTR); + if ((rabp->b_flags & (B_DELWRI | B_DONE)) == 0) { + rabp->b_flags |= (B_READ | B_ASYNC); + if (nfs_asyncio(rabp, cred)) { + rabp->b_flags |= B_INVAL; + brelse(rabp); + } + } + } + } + } + + /* + * If the block is in the cache and has the required data + * in a valid region, just copy it out. + * Otherwise, get the block and write back/read in, + * as required.
+ */ + if ((bp = incore(vp, bn)) && + (bp->b_flags & (B_BUSY | B_WRITEINPROG)) == + (B_BUSY | B_WRITEINPROG)) + got_buf = 0; + else { +again: + bp = nfs_getcacheblk(vp, bn, biosize, p); + if (!bp) + return (EINTR); + got_buf = 1; + if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) { + bp->b_flags |= B_READ; + not_readin = 0; + if (error = nfs_doio(bp, cred, p)) { + brelse(bp); + return (error); + } + } + } + n = min((unsigned)(biosize - on), uio->uio_resid); + diff = np->n_size - uio->uio_offset; + if (diff < n) + n = diff; + if (not_readin && n > 0) { + if (on < bp->b_validoff || (on + n) > bp->b_validend) { + if (!got_buf) { + bp = nfs_getcacheblk(vp, bn, biosize, p); + if (!bp) + return (EINTR); + got_buf = 1; + } + bp->b_flags |= B_INVAL; + if (bp->b_dirtyend > 0) { + if ((bp->b_flags & B_DELWRI) == 0) + panic("nfsbioread"); + if (VOP_BWRITE(bp) == EINTR) + return (EINTR); + } else + brelse(bp); + goto again; + } + } + vp->v_lastr = lbn; + diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on); + if (diff < n) + n = diff; + break; + case VLNK: + nfsstats.biocache_readlinks++; + bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p); + if (!bp) + return (EINTR); + if ((bp->b_flags & B_DONE) == 0) { + bp->b_flags |= B_READ; + if (error = nfs_doio(bp, cred, p)) { + brelse(bp); + return (error); + } + } + n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); + got_buf = 1; + on = 0; + break; + case VDIR: + nfsstats.biocache_readdirs++; + bn = (daddr_t)uio->uio_offset; + bp = nfs_getcacheblk(vp, bn, NFS_DIRBLKSIZ, p); + if (!bp) + return (EINTR); + if ((bp->b_flags & B_DONE) == 0) { + bp->b_flags |= B_READ; + if (error = nfs_doio(bp, cred, p)) { + brelse(bp); + return (error); + } + } + + /* + * If not eof and read aheads are enabled, start one. + * (You need the current block first, so that you have the + * directory offset cookie of the next block.)
+ */ + rabn = bp->b_blkno; + if (nfs_numasync > 0 && nmp->nm_readahead > 0 && + rabn != 0 && rabn != np->n_direofoffset && + !incore(vp, rabn)) { + rabp = nfs_getcacheblk(vp, rabn, NFS_DIRBLKSIZ, p); + if (rabp) { + if ((rabp->b_flags & (B_DONE | B_DELWRI)) == 0) { + rabp->b_flags |= (B_READ | B_ASYNC); + if (nfs_asyncio(rabp, cred)) { + rabp->b_flags |= B_INVAL; + brelse(rabp); + } + } + } + } + on = 0; + n = min(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid); + got_buf = 1; + break; + }; + + if (n > 0) { + if (!baddr) + baddr = bp->b_data; + error = uiomove(baddr + on, (int)n, uio); + } + switch (vp->v_type) { + case VREG: + if (n + on == biosize || uio->uio_offset == np->n_size) + bp->b_flags |= B_AGE; + break; + case VLNK: + n = 0; + break; + case VDIR: + uio->uio_offset = bp->b_blkno; + break; + }; + if (got_buf) + brelse(bp); + } while (error == 0 && uio->uio_resid > 0 && n > 0); + return (error); +} + +/* + * Vnode op for write using bio + * Only regular files are accepted (EIO otherwise). A write error left + * pending on the node (NWRITEERR) is returned once, and the flag cleared, + * before any new data is accepted. A write that would pass the process's + * RLIMIT_FSIZE limit posts SIGXFSZ and returns EFBIG. + */ +nfs_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + register int biosize; + register struct uio *uio = ap->a_uio; + struct proc *p = uio->uio_procp; + register struct vnode *vp = ap->a_vp; + struct nfsnode *np = VTONFS(vp); + register struct ucred *cred = ap->a_cred; + int ioflag = ap->a_ioflag; + struct buf *bp; + struct vattr vattr; + struct nfsmount *nmp; + daddr_t lbn, bn; + int n, on, error = 0; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_WRITE) + panic("nfs_write mode"); + if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) + panic("nfs_write proc"); +#endif + if (vp->v_type != VREG) + return (EIO); + if (np->n_flag & NWRITEERR) { + np->n_flag &= ~NWRITEERR; + return (np->n_error); + } + if (ioflag & (IO_APPEND | IO_SYNC)) { + if (np->n_flag & NMODIFIED) { + np->n_attrstamp = 0; + if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1)) + return (error); + } + if (ioflag & IO_APPEND) { + np->n_attrstamp = 0; + if
(error = VOP_GETATTR(vp, &vattr, cred, p)) + return (error); + uio->uio_offset = np->n_size; + } + } + nmp = VFSTONFS(vp->v_mount); + if (uio->uio_offset < 0) + return (EINVAL); + if (uio->uio_resid == 0) + return (0); + /* + * Maybe this should be above the vnode op call, but so long as + * file servers have no limits, i don't think it matters + */ + if (p && uio->uio_offset + uio->uio_resid > + p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { + psignal(p, SIGXFSZ); + return (EFBIG); + } + /* + * I use nm_rsize, not nm_wsize so that all buffer cache blocks + * will be the same size within a filesystem. nfs_writerpc will + * still use nm_wsize when sizing the rpc's. + */ + biosize = nmp->nm_rsize; + do { + + /* + * Check for a valid write lease. + * If non-cachable, just do the rpc + */ + if ((nmp->nm_flag & NFSMNT_NQNFS) && + NQNFS_CKINVALID(vp, np, NQL_WRITE)) { + do { + error = nqnfs_getlease(vp, NQL_WRITE, cred, p); + } while (error == NQNFS_EXPIRED); + if (error) + return (error); + if (np->n_lrev != np->n_brev || + (np->n_flag & NQNFSNONCACHE)) { + if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1)) + return (error); + np->n_brev = np->n_lrev; + } + } + if (np->n_flag & NQNFSNONCACHE) + return (nfs_writerpc(vp, uio, cred, ioflag)); + nfsstats.biocache_writes++; + lbn = uio->uio_offset / biosize; + on = uio->uio_offset & (biosize-1); /* NOTE(review): equals offset % biosize only if biosize is a power of 2 */ + n = min((unsigned)(biosize - on), uio->uio_resid); + bn = lbn * (biosize / DEV_BSIZE); +again: + bp = nfs_getcacheblk(vp, bn, biosize, p); + if (!bp) + return (EINTR); + if (bp->b_wcred == NOCRED) { + crhold(cred); + bp->b_wcred = cred; + } + np->n_flag |= NMODIFIED; + if (uio->uio_offset + n > np->n_size) { + np->n_size = uio->uio_offset + n; + vnode_pager_setsize(vp, (u_long)np->n_size); + } + + /* + * If the new write will leave a contiguous dirty + * area, just update the b_dirtyoff and b_dirtyend, + * otherwise force a write rpc of the old dirty area.
+ */ + if (bp->b_dirtyend > 0 && + (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { + bp->b_proc = p; + if (VOP_BWRITE(bp) == EINTR) + return (EINTR); + goto again; + } + + /* + * Check for valid write lease and get one as required. + * In case getblk() and/or bwrite() delayed us. + */ + if ((nmp->nm_flag & NFSMNT_NQNFS) && + NQNFS_CKINVALID(vp, np, NQL_WRITE)) { + do { + error = nqnfs_getlease(vp, NQL_WRITE, cred, p); + } while (error == NQNFS_EXPIRED); + if (error) { + brelse(bp); + return (error); + } + if (np->n_lrev != np->n_brev || + (np->n_flag & NQNFSNONCACHE)) { + brelse(bp); + if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1)) + return (error); + np->n_brev = np->n_lrev; + goto again; + } + } + if (error = uiomove((char *)bp->b_data + on, n, uio)) { + bp->b_flags |= B_ERROR; + brelse(bp); + return (error); + } + if (bp->b_dirtyend > 0) { + bp->b_dirtyoff = min(on, bp->b_dirtyoff); + bp->b_dirtyend = max((on + n), bp->b_dirtyend); + } else { + bp->b_dirtyoff = on; + bp->b_dirtyend = on + n; + } +#ifndef notdef + if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff || + bp->b_validoff > bp->b_dirtyend) { + bp->b_validoff = bp->b_dirtyoff; + bp->b_validend = bp->b_dirtyend; + } else { + bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); + bp->b_validend = max(bp->b_validend, bp->b_dirtyend); + } +#else + bp->b_validoff = bp->b_dirtyoff; + bp->b_validend = bp->b_dirtyend; +#endif + if (ioflag & IO_APPEND) + bp->b_flags |= B_APPENDWRITE; + + /* + * If the lease is non-cachable or IO_SYNC do bwrite(). + */ + if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) { + bp->b_proc = p; + if (error = VOP_BWRITE(bp)) + return (error); + } else if ((n + on) == biosize && + (nmp->nm_flag & NFSMNT_NQNFS) == 0) { + bp->b_proc = (struct proc *)0; + bawrite(bp); + } else + bdwrite(bp); + } while (uio->uio_resid > 0 && n > 0); + return (0); +} + +/* + * Get an nfs cache block.
+ * Allocate a new one if the block isn't currently in the cache + * and return the block marked busy. If the calling process is + * interrupted by a signal for an interruptible mount point, return + * NULL. + */ +struct buf * +nfs_getcacheblk(vp, bn, size, p) + struct vnode *vp; + daddr_t bn; + int size; + struct proc *p; +{ + register struct buf *bp; + struct nfsmount *nmp = VFSTONFS(vp->v_mount); + + if (nmp->nm_flag & NFSMNT_INT) { + bp = getblk(vp, bn, size, PCATCH, 0); + while (bp == (struct buf *)0) { + if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) + return ((struct buf *)0); + bp = getblk(vp, bn, size, 0, 2 * hz); + } + } else + bp = getblk(vp, bn, size, 0, 0); + return (bp); +} + +/* + * Flush and invalidate all dirty buffers. If another process is already + * doing the flush, just wait for completion. + */ +nfs_vinvalbuf(vp, flags, cred, p, intrflg) + struct vnode *vp; + int flags; + struct ucred *cred; + struct proc *p; + int intrflg; +{ + register struct nfsnode *np = VTONFS(vp); + struct nfsmount *nmp = VFSTONFS(vp->v_mount); + int error = 0, slpflag, slptimeo; + + if ((nmp->nm_flag & NFSMNT_INT) == 0) + intrflg = 0; + if (intrflg) { + slpflag = PCATCH; + slptimeo = 2 * hz; + } else { + slpflag = 0; + slptimeo = 0; + } + /* + * First wait for any other process doing a flush to complete. + */ + while (np->n_flag & NFLUSHINPROG) { + np->n_flag |= NFLUSHWANT; + error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", + slptimeo); + if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) + return (EINTR); + } + + /* + * Now, flush as required. 
+ */ + np->n_flag |= NFLUSHINPROG; + error = vinvalbuf(vp, flags, cred, p, slpflag, 0); + while (error) { + if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) { + np->n_flag &= ~NFLUSHINPROG; + if (np->n_flag & NFLUSHWANT) { + np->n_flag &= ~NFLUSHWANT; + wakeup((caddr_t)&np->n_flag); + } + return (EINTR); + } + error = vinvalbuf(vp, flags, cred, p, 0, slptimeo); + } + np->n_flag &= ~(NMODIFIED | NFLUSHINPROG); + if (np->n_flag & NFLUSHWANT) { + np->n_flag &= ~NFLUSHWANT; + wakeup((caddr_t)&np->n_flag); + } + return (0); +} + +/* + * Initiate asynchronous I/O. Return an error if no nfsiods are available. + * This is mainly to avoid queueing async I/O requests when the nfsiods + * are all hung on a dead server. + */ +nfs_asyncio(bp, cred) + register struct buf *bp; + struct ucred *cred; +{ + register int i; + + if (nfs_numasync == 0) + return (EIO); + for (i = 0; i < NFS_MAXASYNCDAEMON; i++) + if (nfs_iodwant[i]) { + if (bp->b_flags & B_READ) { + if (bp->b_rcred == NOCRED && cred != NOCRED) { + crhold(cred); + bp->b_rcred = cred; + } + } else { + if (bp->b_wcred == NOCRED && cred != NOCRED) { + crhold(cred); + bp->b_wcred = cred; + } + } + + TAILQ_INSERT_TAIL(&nfs_bufq, bp, b_freelist); + nfs_iodwant[i] = (struct proc *)0; + wakeup((caddr_t)&nfs_iodwant[i]); + return (0); + } + return (EIO); +} + +/* + * Do an I/O operation to/from a cache block. This may be called + * synchronously or from an nfsiod. + */ +int +nfs_doio(bp, cr, p) + register struct buf *bp; + struct cred *cr; + struct proc *p; +{ + register struct uio *uiop; + register struct vnode *vp; + struct nfsnode *np; + struct nfsmount *nmp; + int error, diff, len; + struct uio uio; + struct iovec io; + + vp = bp->b_vp; + np = VTONFS(vp); + nmp = VFSTONFS(vp->v_mount); + uiop = &uio; + uiop->uio_iov = &io; + uiop->uio_iovcnt = 1; + uiop->uio_segflg = UIO_SYSSPACE; + uiop->uio_procp = p; + + /* + * Historically, paging was done with physio, but no more. 
+ */ + if (bp->b_flags & B_PHYS) + panic("doio phys"); + if (bp->b_flags & B_READ) { + io.iov_len = uiop->uio_resid = bp->b_bcount; + io.iov_base = bp->b_data; + uiop->uio_rw = UIO_READ; + switch (vp->v_type) { + case VREG: + uiop->uio_offset = bp->b_blkno * DEV_BSIZE; + nfsstats.read_bios++; + error = nfs_readrpc(vp, uiop, cr); + if (!error) { + bp->b_validoff = 0; + if (uiop->uio_resid) { + /* + * If len > 0, there is a hole in the file and + * no writes after the hole have been pushed to + * the server yet. + * Just zero fill the rest of the valid area. + */ + diff = bp->b_bcount - uiop->uio_resid; + len = np->n_size - (bp->b_blkno * DEV_BSIZE + + diff); + if (len > 0) { + len = min(len, uiop->uio_resid); + bzero((char *)bp->b_data + diff, len); + bp->b_validend = diff + len; + } else + bp->b_validend = diff; + } else + bp->b_validend = bp->b_bcount; + } + if (p && (vp->v_flag & VTEXT) && + (((nmp->nm_flag & NFSMNT_NQNFS) && + np->n_lrev != np->n_brev) || + (!(nmp->nm_flag & NFSMNT_NQNFS) && + np->n_mtime != np->n_vattr.va_mtime.ts_sec))) { + uprintf("Process killed due to text file modification\n"); + psignal(p, SIGKILL); + p->p_flag |= P_NOSWAP; + } + break; + case VLNK: + uiop->uio_offset = 0; + nfsstats.readlink_bios++; + error = nfs_readlinkrpc(vp, uiop, cr); + break; + case VDIR: + uiop->uio_offset = bp->b_lblkno; + nfsstats.readdir_bios++; + if (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) + error = nfs_readdirlookrpc(vp, uiop, cr); + else + error = nfs_readdirrpc(vp, uiop, cr); + /* + * Save offset cookie in b_blkno. 
+ */ + bp->b_blkno = uiop->uio_offset; + break; + }; + if (error) { + bp->b_flags |= B_ERROR; + bp->b_error = error; + } + } else { + io.iov_len = uiop->uio_resid = bp->b_dirtyend + - bp->b_dirtyoff; + uiop->uio_offset = (bp->b_blkno * DEV_BSIZE) + + bp->b_dirtyoff; + io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; + uiop->uio_rw = UIO_WRITE; + nfsstats.write_bios++; + if (bp->b_flags & B_APPENDWRITE) + error = nfs_writerpc(vp, uiop, cr, IO_APPEND); + else + error = nfs_writerpc(vp, uiop, cr, 0); + bp->b_flags &= ~(B_WRITEINPROG | B_APPENDWRITE); + + /* + * For an interrupted write, the buffer is still valid and the + * write hasn't been pushed to the server yet, so we can't set + * B_ERROR and report the interruption by setting B_EINTR. For + * the B_ASYNC case, B_EINTR is not relevant, so the rpc attempt + * is essentially a noop. + */ + if (error == EINTR) { + bp->b_flags &= ~B_INVAL; + bp->b_flags |= B_DELWRI; + + /* + * Since for the B_ASYNC case, nfs_bwrite() has reassigned the + * buffer to the clean list, we have to reassign it back to the + * dirty one. Ugh. + */ + if (bp->b_flags & B_ASYNC) + reassignbuf(bp, vp); + else + bp->b_flags |= B_EINTR; + } else { + if (error) { + bp->b_flags |= B_ERROR; + bp->b_error = np->n_error = error; + np->n_flag |= NWRITEERR; + } + bp->b_dirtyoff = bp->b_dirtyend = 0; + } + } + bp->b_resid = uiop->uio_resid; + biodone(bp); + return (error); +} diff --git a/sys/nfsclient/nfs_nfsiod.c b/sys/nfsclient/nfs_nfsiod.c new file mode 100644 index 00000000000..5d86b42ee20 --- /dev/null +++ b/sys/nfsclient/nfs_nfsiod.c @@ -0,0 +1,874 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs_syscalls.c 8.3 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#ifdef ISO +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +/* Global defs. 
*/ +extern u_long nfs_prog, nfs_vers; +extern int (*nfsrv_procs[NFS_NPROCS])(); +extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; +extern int nfs_numasync; +extern time_t nqnfsstarttime; +extern struct nfsrv_req nsrvq_head; +extern struct nfsd nfsd_head; +extern int nqsrv_writeslack; +extern int nfsrtton; +struct nfssvc_sock *nfs_udpsock, *nfs_cltpsock; +int nuidhash_max = NFS_MAXUIDHASH; +static int nfs_numnfsd = 0; +int nfsd_waiting = 0; +static int notstarted = 1; +static int modify_flag = 0; +static struct nfsdrt nfsdrt; +void nfsrv_cleancache(), nfsrv_rcv(), nfsrv_wakenfsd(), nfs_sndunlock(); +static void nfsd_rt(); +void nfsrv_slpderef(), nfsrv_init(); + +#define TRUE 1 +#define FALSE 0 + +static int nfs_asyncdaemon[NFS_MAXASYNCDAEMON]; +/* + * NFS server system calls + * getfh() lives here too, but maybe should move to kern/vfs_syscalls.c + */ + +/* + * Get file handle system call + */ +struct getfh_args { + char *fname; + fhandle_t *fhp; +}; +getfh(p, uap, retval) + struct proc *p; + register struct getfh_args *uap; + int *retval; +{ + register struct vnode *vp; + fhandle_t fh; + int error; + struct nameidata nd; + + /* + * Must be super user + */ + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->fname, p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + bzero((caddr_t)&fh, sizeof(fh)); + fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; + error = VFS_VPTOFH(vp, &fh.fh_fid); + vput(vp); + if (error) + return (error); + error = copyout((caddr_t)&fh, (caddr_t)uap->fhp, sizeof (fh)); + return (error); +} + +static struct nfssvc_sock nfssvc_sockhead; + +/* + * Nfs server psuedo system call for the nfsd's + * Based on the flag value it either: + * - adds a socket to the selection list + * - remains in the kernel as an nfsd + * - remains in the kernel as an nfsiod + */ +struct nfssvc_args { + int flag; + caddr_t argp; +}; +nfssvc(p, uap, retval) + struct proc *p; + register 
struct nfssvc_args *uap; + int *retval; +{ + struct nameidata nd; + struct file *fp; + struct mbuf *nam; + struct nfsd_args nfsdarg; + struct nfsd_srvargs nfsd_srvargs, *nsd = &nfsd_srvargs; + struct nfsd_cargs ncd; + struct nfsd *nfsd; + struct nfssvc_sock *slp; + struct nfsuid *nuidp, **nuh; + struct nfsmount *nmp; + int error; + + /* + * Must be super user + */ + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + while (nfssvc_sockhead.ns_flag & SLP_INIT) { + nfssvc_sockhead.ns_flag |= SLP_WANTINIT; + (void) tsleep((caddr_t)&nfssvc_sockhead, PSOCK, "nfsd init", 0); + } + if (uap->flag & NFSSVC_BIOD) + error = nfssvc_iod(p); + else if (uap->flag & NFSSVC_MNTD) { + if (error = copyin(uap->argp, (caddr_t)&ncd, sizeof (ncd))) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + ncd.ncd_dirp, p); + if (error = namei(&nd)) + return (error); + if ((nd.ni_vp->v_flag & VROOT) == 0) + error = EINVAL; + nmp = VFSTONFS(nd.ni_vp->v_mount); + vput(nd.ni_vp); + if (error) + return (error); + if ((nmp->nm_flag & NFSMNT_MNTD) && + (uap->flag & NFSSVC_GOTAUTH) == 0) + return (0); + nmp->nm_flag |= NFSMNT_MNTD; + error = nqnfs_clientd(nmp, p->p_ucred, &ncd, uap->flag, + uap->argp, p); + } else if (uap->flag & NFSSVC_ADDSOCK) { + if (error = copyin(uap->argp, (caddr_t)&nfsdarg, + sizeof(nfsdarg))) + return (error); + if (error = getsock(p->p_fd, nfsdarg.sock, &fp)) + return (error); + /* + * Get the client address for connected sockets. + */ + if (nfsdarg.name == NULL || nfsdarg.namelen == 0) + nam = (struct mbuf *)0; + else if (error = sockargs(&nam, nfsdarg.name, nfsdarg.namelen, + MT_SONAME)) + return (error); + error = nfssvc_addsock(fp, nam); + } else { + if (error = copyin(uap->argp, (caddr_t)nsd, sizeof (*nsd))) + return (error); + if ((uap->flag & NFSSVC_AUTHIN) && (nfsd = nsd->nsd_nfsd) && + (nfsd->nd_slp->ns_flag & SLP_VALID)) { + slp = nfsd->nd_slp; + + /* + * First check to see if another nfsd has already + * added this credential. 
+ */ + nuidp = slp->ns_uidh[NUIDHASH(nsd->nsd_uid)]; + while (nuidp) { + if (nuidp->nu_uid == nsd->nsd_uid) + break; + nuidp = nuidp->nu_hnext; + } + if (!nuidp) { + /* + * Nope, so we will. + */ + if (slp->ns_numuids < nuidhash_max) { + slp->ns_numuids++; + nuidp = (struct nfsuid *) + malloc(sizeof (struct nfsuid), M_NFSUID, + M_WAITOK); + } else + nuidp = (struct nfsuid *)0; + if ((slp->ns_flag & SLP_VALID) == 0) { + if (nuidp) + free((caddr_t)nuidp, M_NFSUID); + } else { + if (nuidp == (struct nfsuid *)0) { + nuidp = slp->ns_lruprev; + remque(nuidp); + if (nuidp->nu_hprev) + nuidp->nu_hprev->nu_hnext = + nuidp->nu_hnext; + if (nuidp->nu_hnext) + nuidp->nu_hnext->nu_hprev = + nuidp->nu_hprev; + } + nuidp->nu_cr = nsd->nsd_cr; + if (nuidp->nu_cr.cr_ngroups > NGROUPS) + nuidp->nu_cr.cr_ngroups = NGROUPS; + nuidp->nu_cr.cr_ref = 1; + nuidp->nu_uid = nsd->nsd_uid; + insque(nuidp, (struct nfsuid *)slp); + nuh = &slp->ns_uidh[NUIDHASH(nsd->nsd_uid)]; + if (nuidp->nu_hnext = *nuh) + nuidp->nu_hnext->nu_hprev = nuidp; + nuidp->nu_hprev = (struct nfsuid *)0; + *nuh = nuidp; + } + } + } + if ((uap->flag & NFSSVC_AUTHINFAIL) && (nfsd = nsd->nsd_nfsd)) + nfsd->nd_flag |= NFSD_AUTHFAIL; + error = nfssvc_nfsd(nsd, uap->argp, p); + } + if (error == EINTR || error == ERESTART) + error = 0; + return (error); +} + +/* + * Adds a socket to the list for servicing by nfsds. + */ +nfssvc_addsock(fp, mynam) + struct file *fp; + struct mbuf *mynam; +{ + register struct mbuf *m; + register int siz; + register struct nfssvc_sock *slp; + register struct socket *so; + struct nfssvc_sock *tslp; + int error, s; + + so = (struct socket *)fp->f_data; + tslp = (struct nfssvc_sock *)0; + /* + * Add it to the list, as required. 
+ */ + if (so->so_proto->pr_protocol == IPPROTO_UDP) { + tslp = nfs_udpsock; + if (tslp->ns_flag & SLP_VALID) { + m_freem(mynam); + return (EPERM); + } +#ifdef ISO + } else if (so->so_proto->pr_protocol == ISOPROTO_CLTP) { + tslp = nfs_cltpsock; + if (tslp->ns_flag & SLP_VALID) { + m_freem(mynam); + return (EPERM); + } +#endif /* ISO */ + } + if (so->so_type == SOCK_STREAM) + siz = NFS_MAXPACKET + sizeof (u_long); + else + siz = NFS_MAXPACKET; + if (error = soreserve(so, siz, siz)) { + m_freem(mynam); + return (error); + } + + /* + * Set protocol specific options { for now TCP only } and + * reserve some space. For datagram sockets, this can get called + * repeatedly for the same socket, but that isn't harmful. + */ + if (so->so_type == SOCK_STREAM) { + MGET(m, M_WAIT, MT_SOOPTS); + *mtod(m, int *) = 1; + m->m_len = sizeof(int); + sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m); + } + if (so->so_proto->pr_domain->dom_family == AF_INET && + so->so_proto->pr_protocol == IPPROTO_TCP) { + MGET(m, M_WAIT, MT_SOOPTS); + *mtod(m, int *) = 1; + m->m_len = sizeof(int); + sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m); + } + so->so_rcv.sb_flags &= ~SB_NOINTR; + so->so_rcv.sb_timeo = 0; + so->so_snd.sb_flags &= ~SB_NOINTR; + so->so_snd.sb_timeo = 0; + if (tslp) + slp = tslp; + else { + slp = (struct nfssvc_sock *) + malloc(sizeof (struct nfssvc_sock), M_NFSSVC, M_WAITOK); + bzero((caddr_t)slp, sizeof (struct nfssvc_sock)); + slp->ns_prev = nfssvc_sockhead.ns_prev; + slp->ns_prev->ns_next = slp; + slp->ns_next = &nfssvc_sockhead; + nfssvc_sockhead.ns_prev = slp; + slp->ns_lrunext = slp->ns_lruprev = (struct nfsuid *)slp; + } + slp->ns_so = so; + slp->ns_nam = mynam; + fp->f_count++; + slp->ns_fp = fp; + s = splnet(); + so->so_upcallarg = (caddr_t)slp; + so->so_upcall = nfsrv_rcv; + slp->ns_flag = (SLP_VALID | SLP_NEEDQ); + nfsrv_wakenfsd(slp); + splx(s); + return (0); +} + +/* + * Called by nfssvc() for nfsds. Just loops around servicing rpc requests + * until it is killed by a signal. 
+ *
+ * nsd is the kernel copy of the nfsd's argument block, argp its user-space
+ * address, p the calling process.  A struct nfsd is allocated on first
+ * entry and remembered in nsd->nsd_nfsd.
+ * Returns ENEEDAUTH (leaving the nfsd structure allocated) after copying
+ * the authentication request out to user space; any other return goes
+ * through "done", which frees the nfsd structure.
+ */
+nfssvc_nfsd(nsd, argp, p)
+	struct nfsd_srvargs *nsd;
+	caddr_t argp;
+	struct proc *p;
+{
+	register struct mbuf *m, *nam2;
+	register int siz;
+	register struct nfssvc_sock *slp;
+	register struct socket *so;
+	register int *solockp;
+	struct nfsd *nd = nsd->nsd_nfsd;
+	struct mbuf *mreq, *nam;
+	struct timeval starttime;
+	struct nfsuid *uidp;
+	int error, cacherep, s;
+	int sotype;
+
+	s = splnet();
+	if (nd == (struct nfsd *)0) {
+		nsd->nsd_nfsd = nd = (struct nfsd *)
+			malloc(sizeof (struct nfsd), M_NFSD, M_WAITOK);
+		bzero((caddr_t)nd, sizeof (struct nfsd));
+		nd->nd_procp = p;
+		nd->nd_cr.cr_ref = 1;
+		insque(nd, &nfsd_head);
+		nd->nd_nqlflag = NQL_NOVAL;
+		nfs_numnfsd++;
+	}
+	/*
+	 * Loop getting rpc requests until SIGKILL.
+	 */
+	for (;;) {
+		if ((nd->nd_flag & NFSD_REQINPROG) == 0) {
+			/* Sleep until handed a socket or told to scan. */
+			while (nd->nd_slp == (struct nfssvc_sock *)0 &&
+				 (nfsd_head.nd_flag & NFSD_CHECKSLP) == 0) {
+				nd->nd_flag |= NFSD_WAITING;
+				nfsd_waiting++;
+				error = tsleep((caddr_t)nd, PSOCK | PCATCH, "nfsd", 0);
+				nfsd_waiting--;
+				if (error)
+					goto done;
+			}
+			/* Claim the first valid socket with work pending. */
+			if (nd->nd_slp == (struct nfssvc_sock *)0 &&
+				(nfsd_head.nd_flag & NFSD_CHECKSLP)) {
+				slp = nfssvc_sockhead.ns_next;
+				while (slp != &nfssvc_sockhead) {
+				    if ((slp->ns_flag & (SLP_VALID | SLP_DOREC))
+					== (SLP_VALID | SLP_DOREC)) {
+					    slp->ns_flag &= ~SLP_DOREC;
+					    slp->ns_sref++;
+					    nd->nd_slp = slp;
+					    break;
+				    }
+				    slp = slp->ns_next;
+				}
+				if (slp == &nfssvc_sockhead)
+					nfsd_head.nd_flag &= ~NFSD_CHECKSLP;
+			}
+			if ((slp = nd->nd_slp) == (struct nfssvc_sock *)0)
+				continue;
+			if (slp->ns_flag & SLP_VALID) {
+				if (slp->ns_flag & SLP_DISCONN)
+					nfsrv_zapsock(slp);
+				else if (slp->ns_flag & SLP_NEEDQ) {
+					slp->ns_flag &= ~SLP_NEEDQ;
+					(void) nfs_sndlock(&slp->ns_solock,
+						(struct nfsreq *)0);
+					nfsrv_rcv(slp->ns_so, (caddr_t)slp,
+						M_WAIT);
+					nfs_sndunlock(&slp->ns_solock);
+				}
+				error = nfsrv_dorec(slp, nd);
+				nd->nd_flag |= NFSD_REQINPROG;
+			}
+		} else {
+			error = 0;
+			slp = nd->nd_slp;
+		}
+		/*
+		 * NOTE(review): if SLP_VALID was clear above, error was not
+		 * assigned on this pass; the SLP_VALID test below still
+		 * takes this branch in that case.
+		 */
+		if (error || (slp->ns_flag & SLP_VALID) == 0) {
+			nd->nd_slp = (struct nfssvc_sock *)0;
+			nd->nd_flag &= ~NFSD_REQINPROG;
+			nfsrv_slpderef(slp);
+			continue;
+		}
+		splx(s);
+		so = slp->ns_so;
+		sotype = so->so_type;
+		starttime = time;
+		if (so->so_proto->pr_flags & PR_CONNREQUIRED)
+			solockp = &slp->ns_solock;
+		else
+			solockp = (int *)0;
+		/*
+		 * nam == nam2 for connectionless protocols such as UDP
+		 * nam2 == NULL for connection based protocols to disable
+		 *    recent request caching.
+		 */
+		if (nam2 = nd->nd_nam) {
+			nam = nam2;
+			cacherep = RC_CHECKIT;
+		} else {
+			nam = slp->ns_nam;
+			cacherep = RC_DOIT;
+		}
+
+		/*
+		 * Check to see if authorization is needed.
+		 */
+		if (nd->nd_flag & NFSD_NEEDAUTH) {
+			static int logauth = 0;
+
+			nd->nd_flag &= ~NFSD_NEEDAUTH;
+			/*
+			 * Check for a mapping already installed.
+			 */
+			uidp = slp->ns_uidh[NUIDHASH(nd->nd_cr.cr_uid)];
+			while (uidp) {
+				if (uidp->nu_uid == nd->nd_cr.cr_uid)
+					break;
+				uidp = uidp->nu_hnext;
+			}
+			if (!uidp) {
+			    /*
+			     * No mapping: pass the request out to user space.
+			     * If either copyout fails the request is dropped.
+			     */
+			    nsd->nsd_uid = nd->nd_cr.cr_uid;
+			    if (nam2 && logauth++ == 0)
+				log(LOG_WARNING, "Kerberized NFS using UDP\n");
+			    nsd->nsd_haddr =
+			      mtod(nam, struct sockaddr_in *)->sin_addr.s_addr;
+			    nsd->nsd_authlen = nd->nd_authlen;
+			    if (copyout(nd->nd_authstr, nsd->nsd_authstr,
+				nd->nd_authlen) == 0 &&
+				copyout((caddr_t)nsd, argp, sizeof (*nsd)) == 0)
+				return (ENEEDAUTH);
+			    cacherep = RC_DROPIT;
+			}
+		}
+		if (cacherep == RC_CHECKIT)
+			cacherep = nfsrv_getcache(nam2, nd, &mreq);
+
+		/*
+		 * Check for just starting up for NQNFS and send
+		 * fake "try again later" replies to the NQNFS clients.
+		 */
+		if (notstarted && nqnfsstarttime <= time.tv_sec) {
+			if (modify_flag) {
+				nqnfsstarttime = time.tv_sec + nqsrv_writeslack;
+				modify_flag = 0;
+			} else
+				notstarted = 0;
+		}
+		if (notstarted) {
+			if (nd->nd_nqlflag == NQL_NOVAL)
+				cacherep = RC_DROPIT;
+			else if (nd->nd_procnum != NFSPROC_WRITE) {
+				nd->nd_procnum = NFSPROC_NOOP;
+				nd->nd_repstat = NQNFS_TRYLATER;
+				cacherep = RC_DOIT;
+			} else
+				modify_flag = 1;
+		} else if (nd->nd_flag & NFSD_AUTHFAIL) {
+			nd->nd_flag &= ~NFSD_AUTHFAIL;
+			nd->nd_procnum = NFSPROC_NOOP;
+			nd->nd_repstat = NQNFS_AUTHERR;
+			cacherep = RC_DOIT;
+		}
+
+		switch (cacherep) {
+		case RC_DOIT:
+			error = (*(nfsrv_procs[nd->nd_procnum]))(nd,
+				nd->nd_mrep, nd->nd_md, nd->nd_dpos, &nd->nd_cr,
+				nam, &mreq);
+			if (nd->nd_cr.cr_ref != 1) {
+				printf("nfssvc cref=%d\n", nd->nd_cr.cr_ref);
+				panic("nfssvc cref");
+			}
+			if (error) {
+				if (nd->nd_procnum != NQNFSPROC_VACATED)
+					nfsstats.srv_errs++;
+				if (nam2) {
+					nfsrv_updatecache(nam2, nd, FALSE, mreq);
+					m_freem(nam2);
+				}
+				break;
+			}
+			nfsstats.srvrpccnt[nd->nd_procnum]++;
+			if (nam2)
+				nfsrv_updatecache(nam2, nd, TRUE, mreq);
+			nd->nd_mrep = (struct mbuf *)0;
+			/* FALLTHROUGH: send the reply in mreq */
+		case RC_REPLY:
+			/* Total the reply chain's length for the header. */
+			m = mreq;
+			siz = 0;
+			while (m) {
+				siz += m->m_len;
+				m = m->m_next;
+			}
+			if (siz <= 0 || siz > NFS_MAXPACKET) {
+				printf("mbuf siz=%d\n",siz);
+				panic("Bad nfs svc reply");
+			}
+			m = mreq;
+			m->m_pkthdr.len = siz;
+			m->m_pkthdr.rcvif = (struct ifnet *)0;
+			/*
+			 * For stream protocols, prepend a Sun RPC
+			 * Record Mark.
+			 */
+			if (sotype == SOCK_STREAM) {
+				M_PREPEND(m, NFSX_UNSIGNED, M_WAIT);
+				*mtod(m, u_long *) = htonl(0x80000000 | siz);
+			}
+			if (solockp)
+				(void) nfs_sndlock(solockp, (struct nfsreq *)0);
+			if (slp->ns_flag & SLP_VALID)
+			    error = nfs_send(so, nam2, m, (struct nfsreq *)0);
+			else {
+			    error = EPIPE;
+			    m_freem(m);
+			}
+			if (nfsrtton)
+				nfsd_rt(&starttime, sotype, nd, nam, cacherep);
+			if (nam2)
+				MFREE(nam2, m);
+			if (nd->nd_mrep)
+				m_freem(nd->nd_mrep);
+			if (error == EPIPE)
+				nfsrv_zapsock(slp);
+			if (solockp)
+				nfs_sndunlock(solockp);
+			if (error == EINTR || error == ERESTART) {
+				nfsrv_slpderef(slp);
+				/* "done" expects to run at splnet. */
+				s = splnet();
+				goto done;
+			}
+			break;
+		case RC_DROPIT:
+			if (nfsrtton)
+				nfsd_rt(&starttime, sotype, nd, nam, cacherep);
+			m_freem(nd->nd_mrep);
+			m_freem(nam2);
+			break;
+		};
+		s = splnet();
+		/* A non-zero return here releases this nfsd's socket. */
+		if (nfsrv_dorec(slp, nd)) {
+			nd->nd_flag &= ~NFSD_REQINPROG;
+			nd->nd_slp = (struct nfssvc_sock *)0;
+			nfsrv_slpderef(slp);
+		}
+	}
+done:
+	remque(nd);
+	splx(s);
+	free((caddr_t)nd, M_NFSD);
+	nsd->nsd_nfsd = (struct nfsd *)0;
+	if (--nfs_numnfsd == 0)
+		nfsrv_init(TRUE);	/* Reinitialize everything */
+	return (error);
+}
+
+/*
+ * Asynchronous I/O daemons for client nfs.
+ * They do read-ahead and write-behind operations on the block I/O cache.
+ * Never returns unless it fails or gets killed.
+ *
+ * p is the calling process.  Returns EBUSY when all NFS_MAXASYNCDAEMON
+ * slots are taken, otherwise the error that interrupted the sleep.
+ */
+nfssvc_iod(p)
+	struct proc *p;
+{
+	register struct buf *bp;
+	register int i, myiod;
+	int error = 0;
+
+	/*
+	 * Assign my position or return error if too many already running
+	 */
+	myiod = -1;
+	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
+		if (nfs_asyncdaemon[i] == 0) {
+			nfs_asyncdaemon[i]++;
+			myiod = i;
+			break;
+		}
+	if (myiod == -1)
+		return (EBUSY);
+	nfs_numasync++;
+	/*
+	 * Just loop around doin our stuff until SIGKILL
+	 */
+	for (;;) {
+		/* Advertise this daemon as idle and wait for work. */
+		while (nfs_bufq.tqh_first == NULL && error == 0) {
+			nfs_iodwant[myiod] = p;
+			error = tsleep((caddr_t)&nfs_iodwant[myiod],
+				PWAIT | PCATCH, "nfsidl", 0);
+		}
+		/* Drain whatever is queued, even if the sleep failed. */
+		while ((bp = nfs_bufq.tqh_first) != NULL) {
+			/* Take one off the front of the list */
+			TAILQ_REMOVE(&nfs_bufq, bp, b_freelist);
+			if (bp->b_flags & B_READ)
+			    (void) nfs_doio(bp, bp->b_rcred, (struct proc *)0);
+			else
+			    (void) nfs_doio(bp, bp->b_wcred, (struct proc *)0);
+		}
+		if (error) {
+			/*
+			 * Give the slot back.  Also withdraw the idle
+			 * advertisement: an interrupted sleep leaves
+			 * nfs_iodwant[myiod] pointing at this process, and
+			 * nothing else in this function clears it.
+			 * NOTE(review): the consumer of nfs_iodwant[] is not
+			 * in this file -- presumably it treats a non-null
+			 * slot as a sleeping daemon; confirm.
+			 */
+			nfs_asyncdaemon[myiod] = 0;
+			nfs_iodwant[myiod] = (struct proc *)0;
+			nfs_numasync--;
+			return (error);
+		}
+	}
+}
+
+/*
+ * Shut down a socket associated with an nfssvc_sock structure.
+ * Should be called with the send lock set, if required.
+ * The trick here is to increment the sref at the start, so that the nfsds
+ * will stop using it and clear ns_flag at the end so that it will not be
+ * reassigned during cleanup.
+ */ +nfsrv_zapsock(slp) + register struct nfssvc_sock *slp; +{ + register struct nfsuid *nuidp, *onuidp; + register int i; + struct socket *so; + struct file *fp; + struct mbuf *m; + + slp->ns_flag &= ~SLP_ALLFLAGS; + if (fp = slp->ns_fp) { + slp->ns_fp = (struct file *)0; + so = slp->ns_so; + so->so_upcall = NULL; + soshutdown(so, 2); + closef(fp, (struct proc *)0); + if (slp->ns_nam) + MFREE(slp->ns_nam, m); + m_freem(slp->ns_raw); + m_freem(slp->ns_rec); + nuidp = slp->ns_lrunext; + while (nuidp != (struct nfsuid *)slp) { + onuidp = nuidp; + nuidp = nuidp->nu_lrunext; + free((caddr_t)onuidp, M_NFSUID); + } + slp->ns_lrunext = slp->ns_lruprev = (struct nfsuid *)slp; + for (i = 0; i < NUIDHASHSIZ; i++) + slp->ns_uidh[i] = (struct nfsuid *)0; + } +} + +/* + * Get an authorization string for the uid by having the mount_nfs sitting + * on this mount point porpous out of the kernel and do it. + */ +nfs_getauth(nmp, rep, cred, auth_type, auth_str, auth_len) + register struct nfsmount *nmp; + struct nfsreq *rep; + struct ucred *cred; + int *auth_type; + char **auth_str; + int *auth_len; +{ + int error = 0; + + while ((nmp->nm_flag & NFSMNT_WAITAUTH) == 0) { + nmp->nm_flag |= NFSMNT_WANTAUTH; + (void) tsleep((caddr_t)&nmp->nm_authtype, PSOCK, + "nfsauth1", 2 * hz); + if (error = nfs_sigintr(nmp, rep, rep->r_procp)) { + nmp->nm_flag &= ~NFSMNT_WANTAUTH; + return (error); + } + } + nmp->nm_flag &= ~(NFSMNT_WAITAUTH | NFSMNT_WANTAUTH); + nmp->nm_authstr = *auth_str = (char *)malloc(RPCAUTH_MAXSIZ, M_TEMP, M_WAITOK); + nmp->nm_authuid = cred->cr_uid; + wakeup((caddr_t)&nmp->nm_authstr); + + /* + * And wait for mount_nfs to do its stuff. 
+ */ + while ((nmp->nm_flag & NFSMNT_HASAUTH) == 0 && error == 0) { + (void) tsleep((caddr_t)&nmp->nm_authlen, PSOCK, + "nfsauth2", 2 * hz); + error = nfs_sigintr(nmp, rep, rep->r_procp); + } + if (nmp->nm_flag & NFSMNT_AUTHERR) { + nmp->nm_flag &= ~NFSMNT_AUTHERR; + error = EAUTH; + } + if (error) + free((caddr_t)*auth_str, M_TEMP); + else { + *auth_type = nmp->nm_authtype; + *auth_len = nmp->nm_authlen; + } + nmp->nm_flag &= ~NFSMNT_HASAUTH; + nmp->nm_flag |= NFSMNT_WAITAUTH; + if (nmp->nm_flag & NFSMNT_WANTAUTH) { + nmp->nm_flag &= ~NFSMNT_WANTAUTH; + wakeup((caddr_t)&nmp->nm_authtype); + } + return (error); +} + +/* + * Derefence a server socket structure. If it has no more references and + * is no longer valid, you can throw it away. + */ +void +nfsrv_slpderef(slp) + register struct nfssvc_sock *slp; +{ + if (--(slp->ns_sref) == 0 && (slp->ns_flag & SLP_VALID) == 0) { + slp->ns_prev->ns_next = slp->ns_next; + slp->ns_next->ns_prev = slp->ns_prev; + free((caddr_t)slp, M_NFSSVC); + } +} + +/* + * Initialize the data structures for the server. + * Handshake with any new nfsds starting up to avoid any chance of + * corruption. 
+ */ +void +nfsrv_init(terminating) + int terminating; +{ + register struct nfssvc_sock *slp; + struct nfssvc_sock *oslp; + + if (nfssvc_sockhead.ns_flag & SLP_INIT) + panic("nfsd init"); + nfssvc_sockhead.ns_flag |= SLP_INIT; + if (terminating) { + slp = nfssvc_sockhead.ns_next; + while (slp != &nfssvc_sockhead) { + if (slp->ns_flag & SLP_VALID) + nfsrv_zapsock(slp); + slp->ns_next->ns_prev = slp->ns_prev; + slp->ns_prev->ns_next = slp->ns_next; + oslp = slp; + slp = slp->ns_next; + free((caddr_t)oslp, M_NFSSVC); + } + nfsrv_cleancache(); /* And clear out server cache */ + } + nfs_udpsock = (struct nfssvc_sock *) + malloc(sizeof (struct nfssvc_sock), M_NFSSVC, M_WAITOK); + bzero((caddr_t)nfs_udpsock, sizeof (struct nfssvc_sock)); + nfs_cltpsock = (struct nfssvc_sock *) + malloc(sizeof (struct nfssvc_sock), M_NFSSVC, M_WAITOK); + bzero((caddr_t)nfs_cltpsock, sizeof (struct nfssvc_sock)); + nfssvc_sockhead.ns_next = nfs_udpsock; + nfs_udpsock->ns_next = nfs_cltpsock; + nfs_cltpsock->ns_next = &nfssvc_sockhead; + nfssvc_sockhead.ns_prev = nfs_cltpsock; + nfs_cltpsock->ns_prev = nfs_udpsock; + nfs_udpsock->ns_prev = &nfssvc_sockhead; + nfs_udpsock->ns_lrunext = nfs_udpsock->ns_lruprev = + (struct nfsuid *)nfs_udpsock; + nfs_cltpsock->ns_lrunext = nfs_cltpsock->ns_lruprev = + (struct nfsuid *)nfs_cltpsock; + nfsd_head.nd_next = nfsd_head.nd_prev = &nfsd_head; + nfsd_head.nd_flag = 0; + nfssvc_sockhead.ns_flag &= ~SLP_INIT; + if (nfssvc_sockhead.ns_flag & SLP_WANTINIT) { + nfssvc_sockhead.ns_flag &= ~SLP_WANTINIT; + wakeup((caddr_t)&nfssvc_sockhead); + } +} + +/* + * Add entries to the server monitor log. 
+ *
+ * Fills the slot at nfsdrt.pos with the flags, procedure number, client
+ * address, response time (in microseconds since *startp) and timestamp of
+ * one request, then advances nfsdrt.pos modulo NFSRTTLOGSIZ.
+ *
+ * startp   - time the request was picked up
+ * sotype   - socket type; SOCK_STREAM sets DRT_TCP
+ * nd       - the nfsd servicing the request
+ * nam      - mbuf holding the client's socket address
+ * cacherep - RC_DOIT, RC_REPLY, or anything else (logged as a cache drop)
+ */
+static void
+nfsd_rt(startp, sotype, nd, nam, cacherep)
+	struct timeval *startp;
+	int sotype;
+	register struct nfsd *nd;
+	struct mbuf *nam;
+	int cacherep;
+{
+	register struct drt *rt;
+
+	rt = &nfsdrt.drt[nfsdrt.pos];
+	if (cacherep == RC_DOIT)
+		rt->flag = 0;
+	else if (cacherep == RC_REPLY)
+		rt->flag = DRT_CACHEREPLY;
+	else
+		rt->flag = DRT_CACHEDROP;
+	if (sotype == SOCK_STREAM)
+		rt->flag |= DRT_TCP;
+	if (nd->nd_nqlflag != NQL_NOVAL)
+		rt->flag |= DRT_NQNFS;
+	rt->proc = nd->nd_procnum;
+	/* NOTE(review): nam is dereferenced unchecked -- confirm non-null. */
+	if (mtod(nam, struct sockaddr *)->sa_family == AF_INET)
+		rt->ipadr = mtod(nam, struct sockaddr_in *)->sin_addr.s_addr;
+	else
+		rt->ipadr = INADDR_ANY;
+	rt->resptime = ((time.tv_sec - startp->tv_sec) * 1000000) +
+		(time.tv_usec - startp->tv_usec);
+	rt->tstamp = time;
+	nfsdrt.pos = (nfsdrt.pos + 1) % NFSRTTLOGSIZ;
+}
diff --git a/sys/nfsclient/nfs_node.c b/sys/nfsclient/nfs_node.c
new file mode 100644
index 00000000000..032bdef0d5a
--- /dev/null
+++ b/sys/nfsclient/nfs_node.c
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Rick Macklem at The University of Guelph.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4.
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs_node.c 8.2 (Berkeley) 12/30/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +struct nfsnode **nheadhashtbl; +u_long nheadhash; +#define NFSNOHASH(fhsum) ((fhsum)&nheadhash) + +#define TRUE 1 +#define FALSE 0 + +/* + * Initialize hash links for nfsnodes + * and build nfsnode free list. 
+ */ +nfs_nhinit() +{ + +#ifndef lint + if ((sizeof(struct nfsnode) - 1) & sizeof(struct nfsnode)) + printf("nfs_nhinit: bad size %d\n", sizeof(struct nfsnode)); +#endif /* not lint */ + nheadhashtbl = hashinit(desiredvnodes, M_NFSNODE, &nheadhash); +} + +/* + * Compute an entry in the NFS hash table structure + */ +struct nfsnode ** +nfs_hash(fhp) + register nfsv2fh_t *fhp; +{ + register u_char *fhpp; + register u_long fhsum; + int i; + + fhpp = &fhp->fh_bytes[0]; + fhsum = 0; + for (i = 0; i < NFSX_FH; i++) + fhsum += *fhpp++; + return (&nheadhashtbl[NFSNOHASH(fhsum)]); +} + +/* + * Look up a vnode/nfsnode by file handle. + * Callers must check for mount points!! + * In all cases, a pointer to a + * nfsnode structure is returned. + */ +nfs_nget(mntp, fhp, npp) + struct mount *mntp; + register nfsv2fh_t *fhp; + struct nfsnode **npp; +{ + register struct nfsnode *np, *nq, **nhpp; + register struct vnode *vp; + extern int (**nfsv2_vnodeop_p)(); + struct vnode *nvp; + int error; + + nhpp = nfs_hash(fhp); +loop: + for (np = *nhpp; np; np = np->n_forw) { + if (mntp != NFSTOV(np)->v_mount || + bcmp((caddr_t)fhp, (caddr_t)&np->n_fh, NFSX_FH)) + continue; + vp = NFSTOV(np); + if (vget(vp, 1)) + goto loop; + *npp = np; + return(0); + } + if (error = getnewvnode(VT_NFS, mntp, nfsv2_vnodeop_p, &nvp)) { + *npp = 0; + return (error); + } + vp = nvp; + MALLOC(np, struct nfsnode *, sizeof *np, M_NFSNODE, M_WAITOK); + vp->v_data = np; + np->n_vnode = vp; + /* + * Insert the nfsnode in the hash queue for its new file handle + */ + np->n_flag = 0; + if (nq = *nhpp) + nq->n_back = &np->n_forw; + np->n_forw = nq; + np->n_back = nhpp; + *nhpp = np; + bcopy((caddr_t)fhp, (caddr_t)&np->n_fh, NFSX_FH); + np->n_attrstamp = 0; + np->n_direofoffset = 0; + np->n_sillyrename = (struct sillyrename *)0; + np->n_size = 0; + np->n_mtime = 0; + if (VFSTONFS(mntp)->nm_flag & NFSMNT_NQNFS) { + np->n_brev = 0; + np->n_lrev = 0; + np->n_expiry = (time_t)0; + np->n_tnext = (struct nfsnode *)0; + } + 
*npp = np; + return (0); +} + +nfs_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct nfsnode *np; + register struct sillyrename *sp; + struct proc *p = curproc; /* XXX */ + extern int prtactive; + + np = VTONFS(ap->a_vp); + if (prtactive && ap->a_vp->v_usecount != 0) + vprint("nfs_inactive: pushing active", ap->a_vp); + sp = np->n_sillyrename; + np->n_sillyrename = (struct sillyrename *)0; + if (sp) { + /* + * Remove the silly file that was rename'd earlier + */ + (void) nfs_vinvalbuf(ap->a_vp, 0, sp->s_cred, p, 1); + nfs_removeit(sp); + crfree(sp->s_cred); + vrele(sp->s_dvp); +#ifdef SILLYSEPARATE + free((caddr_t)sp, M_NFSREQ); +#endif + } + np->n_flag &= (NMODIFIED | NFLUSHINPROG | NFLUSHWANT | NQNFSEVICTED | + NQNFSNONCACHE | NQNFSWRITE); + return (0); +} + +/* + * Reclaim an nfsnode so that it can be used for other purposes. + */ +nfs_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct nfsnode *np = VTONFS(vp); + register struct nfsmount *nmp = VFSTONFS(vp->v_mount); + register struct nfsnode *nq; + extern int prtactive; + + if (prtactive && vp->v_usecount != 0) + vprint("nfs_reclaim: pushing active", vp); + /* + * Remove the nfsnode from its hash chain. + */ + if (nq = np->n_forw) + nq->n_back = np->n_back; + *np->n_back = nq; + + /* + * For nqnfs, take it off the timer queue as required. 
+ */ + if ((nmp->nm_flag & NFSMNT_NQNFS) && np->n_tnext) { + if (np->n_tnext == (struct nfsnode *)nmp) + nmp->nm_tprev = np->n_tprev; + else + np->n_tnext->n_tprev = np->n_tprev; + if (np->n_tprev == (struct nfsnode *)nmp) + nmp->nm_tnext = np->n_tnext; + else + np->n_tprev->n_tnext = np->n_tnext; + } + cache_purge(vp); + FREE(vp->v_data, M_NFSNODE); + vp->v_data = (void *)0; + return (0); +} + +/* + * Lock an nfsnode + */ +nfs_lock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + + /* + * Ugh, another place where interruptible mounts will get hung. + * If you make this sleep interruptible, then you have to fix all + * the VOP_LOCK() calls to expect interruptibility. + */ + while (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + sleep((caddr_t)vp, PINOD); + } + if (vp->v_tag == VT_NON) + return (ENOENT); + return (0); +} + +/* + * Unlock an nfsnode + */ +nfs_unlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + return (0); +} + +/* + * Check for a locked nfsnode + */ +nfs_islocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + return (0); +} + +/* + * Nfs abort op, called after namei() when a CREATE/DELETE isn't actually + * done. Currently nothing to do. + */ +/* ARGSUSED */ +int +nfs_abortop(ap) + struct vop_abortop_args /* { + struct vnode *a_dvp; + struct componentname *a_cnp; + } */ *ap; +{ + + if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) + FREE(ap->a_cnp->cn_pnbuf, M_NAMEI); + return (0); +} diff --git a/sys/nfsclient/nfs_socket.c b/sys/nfsclient/nfs_socket.c new file mode 100644 index 00000000000..cf88ed33d92 --- /dev/null +++ b/sys/nfsclient/nfs_socket.c @@ -0,0 +1,1990 @@ +/* + * Copyright (c) 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)nfs_socket.c 8.3 (Berkeley) 1/12/94 + */ + +/* + * Socket operations for use by nfs + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TRUE 1 +#define FALSE 0 + +/* + * Estimate rto for an nfs rpc sent via. an unreliable datagram. + * Use the mean and mean deviation of rtt for the appropriate type of rpc + * for the frequent rpcs and a default for the others. + * The justification for doing "other" this way is that these rpcs + * happen so infrequently that timer est. would probably be stale. + * Also, since many of these rpcs are + * non-idempotent, a conservative timeout is desired. + * getattr, lookup - A+2D + * read, write - A+4D + * other - nm_timeo + */ +#define NFS_RTO(n, t) \ + ((t) == 0 ? (n)->nm_timeo : \ + ((t) < 3 ? \ + (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \ + ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1))) +#define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1] +#define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1] +/* + * External data, mostly RPC constants in XDR form + */ +extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix, + rpc_msgaccepted, rpc_call, rpc_autherr, rpc_rejectedcred, + rpc_auth_kerb; +extern u_long nfs_prog, nfs_vers, nqnfs_prog, nqnfs_vers; +extern time_t nqnfsstarttime; +extern int nonidempotent[NFS_NPROCS]; + +/* + * Maps errno values to nfs error numbers. + * Use NFSERR_IO as the catch all for ones not specifically defined in + * RFC 1094. 
+ *
+ * NOTE(review): the entries appear to be ordered by errno starting at
+ * errno 1 (so entry i would stand for errno i + 1) -- confirm against the
+ * code that indexes this table, which is not visible here.
+ */
+static int nfsrv_errmap[ELAST] = {
+  NFSERR_PERM,	NFSERR_NOENT,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,
+  NFSERR_NXIO,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,
+  NFSERR_IO,	NFSERR_IO,	NFSERR_ACCES,	NFSERR_IO,	NFSERR_IO,
+  NFSERR_IO,	NFSERR_EXIST,	NFSERR_IO,	NFSERR_NODEV,	NFSERR_NOTDIR,
+  NFSERR_ISDIR,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,
+  NFSERR_IO,	NFSERR_FBIG,	NFSERR_NOSPC,	NFSERR_IO,	NFSERR_ROFS,
+  NFSERR_IO,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,
+  NFSERR_IO,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,
+  NFSERR_IO,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,
+  NFSERR_IO,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,
+  NFSERR_IO,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,
+  NFSERR_IO,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,
+  NFSERR_IO,	NFSERR_IO,	NFSERR_NAMETOL,	NFSERR_IO,	NFSERR_IO,
+  NFSERR_NOTEMPTY, NFSERR_IO,	NFSERR_IO,	NFSERR_DQUOT,	NFSERR_STALE,
+  NFSERR_IO,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,
+  NFSERR_IO,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,	NFSERR_IO,
+  NFSERR_IO,
+};
+
+/*
+ * Defines which timer to use for the procnum.
+ * 0 - default
+ * 1 - getattr
+ * 2 - lookup
+ * 3 - read
+ * 4 - write
+ *
+ * Indexed by RPC procedure number (see NFS_SRTT/NFS_SDRTT); a non-zero
+ * value minus one selects the slot in nm_srtt[] / nm_sdrtt[].
+ */
+static int proct[NFS_NPROCS] = {
+	0, 1, 0, 0, 2, 3, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 0, 0, 0,
+};
+
+/*
+ * There is a congestion window for outstanding rpcs maintained per mount
+ * point. The cwnd size is adjusted in roughly the way that:
+ * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
+ * SIGCOMM '88". ACM, August 1988.
+ * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
+ * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
+ * of rpcs is in progress.
+ * (The sent count and cwnd are scaled for integer arith.)
+ * Variants of "slow start" were tried and were found to be too much of a
+ * performance hit (ave. rtt 3 times larger),
+ * I suspect due to the large rtt that nfs rpcs have.
+ */ +#define NFS_CWNDSCALE 256 +#define NFS_MAXCWND (NFS_CWNDSCALE * 32) +static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, }; +int nfs_sbwait(); +void nfs_disconnect(), nfs_realign(), nfsrv_wakenfsd(), nfs_sndunlock(); +void nfs_rcvunlock(), nqnfs_serverd(), nqnfs_clientlease(); +struct mbuf *nfsm_rpchead(); +int nfsrtton = 0; +struct nfsrtt nfsrtt; +struct nfsd nfsd_head; + +int nfsrv_null(), + nfsrv_getattr(), + nfsrv_setattr(), + nfsrv_lookup(), + nfsrv_readlink(), + nfsrv_read(), + nfsrv_write(), + nfsrv_create(), + nfsrv_remove(), + nfsrv_rename(), + nfsrv_link(), + nfsrv_symlink(), + nfsrv_mkdir(), + nfsrv_rmdir(), + nfsrv_readdir(), + nfsrv_statfs(), + nfsrv_noop(), + nqnfsrv_readdirlook(), + nqnfsrv_getlease(), + nqnfsrv_vacated(), + nqnfsrv_access(); + +int (*nfsrv_procs[NFS_NPROCS])() = { + nfsrv_null, + nfsrv_getattr, + nfsrv_setattr, + nfsrv_noop, + nfsrv_lookup, + nfsrv_readlink, + nfsrv_read, + nfsrv_noop, + nfsrv_write, + nfsrv_create, + nfsrv_remove, + nfsrv_rename, + nfsrv_link, + nfsrv_symlink, + nfsrv_mkdir, + nfsrv_rmdir, + nfsrv_readdir, + nfsrv_statfs, + nqnfsrv_readdirlook, + nqnfsrv_getlease, + nqnfsrv_vacated, + nfsrv_noop, + nqnfsrv_access, +}; + +struct nfsreq nfsreqh; + +/* + * Initialize sockets and congestion for a new NFS connection. + * We do not free the sockaddr if error. + */ +nfs_connect(nmp, rep) + register struct nfsmount *nmp; + struct nfsreq *rep; +{ + register struct socket *so; + int s, error, rcvreserve, sndreserve; + struct sockaddr *saddr; + struct sockaddr_in *sin; + struct mbuf *m; + u_short tport; + + nmp->nm_so = (struct socket *)0; + saddr = mtod(nmp->nm_nam, struct sockaddr *); + if (error = socreate(saddr->sa_family, + &nmp->nm_so, nmp->nm_sotype, nmp->nm_soproto)) + goto bad; + so = nmp->nm_so; + nmp->nm_soflags = so->so_proto->pr_flags; + + /* + * Some servers require that the client port be a reserved port number. 
+ */ + if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) { + MGET(m, M_WAIT, MT_SONAME); + sin = mtod(m, struct sockaddr_in *); + sin->sin_len = m->m_len = sizeof (struct sockaddr_in); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = INADDR_ANY; + tport = IPPORT_RESERVED - 1; + sin->sin_port = htons(tport); + while ((error = sobind(so, m)) == EADDRINUSE && + --tport > IPPORT_RESERVED / 2) + sin->sin_port = htons(tport); + m_freem(m); + if (error) + goto bad; + } + + /* + * Protocols that do not require connections may be optionally left + * unconnected for servers that reply from a port other than NFS_PORT. + */ + if (nmp->nm_flag & NFSMNT_NOCONN) { + if (nmp->nm_soflags & PR_CONNREQUIRED) { + error = ENOTCONN; + goto bad; + } + } else { + if (error = soconnect(so, nmp->nm_nam)) + goto bad; + + /* + * Wait for the connection to complete. Cribbed from the + * connect system call but with the wait timing out so + * that interruptible mounts don't hang here for a long time. 
+ */ + s = splnet(); + while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { + (void) tsleep((caddr_t)&so->so_timeo, PSOCK, + "nfscon", 2 * hz); + if ((so->so_state & SS_ISCONNECTING) && + so->so_error == 0 && rep && + (error = nfs_sigintr(nmp, rep, rep->r_procp))) { + so->so_state &= ~SS_ISCONNECTING; + splx(s); + goto bad; + } + } + if (so->so_error) { + error = so->so_error; + so->so_error = 0; + splx(s); + goto bad; + } + splx(s); + } + if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) { + so->so_rcv.sb_timeo = (5 * hz); + so->so_snd.sb_timeo = (5 * hz); + } else { + so->so_rcv.sb_timeo = 0; + so->so_snd.sb_timeo = 0; + } + if (nmp->nm_sotype == SOCK_DGRAM) { + sndreserve = nmp->nm_wsize + NFS_MAXPKTHDR; + rcvreserve = nmp->nm_rsize + NFS_MAXPKTHDR; + } else if (nmp->nm_sotype == SOCK_SEQPACKET) { + sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 2; + rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) * 2; + } else { + if (nmp->nm_sotype != SOCK_STREAM) + panic("nfscon sotype"); + if (so->so_proto->pr_flags & PR_CONNREQUIRED) { + MGET(m, M_WAIT, MT_SOOPTS); + *mtod(m, int *) = 1; + m->m_len = sizeof(int); + sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m); + } + if (so->so_proto->pr_protocol == IPPROTO_TCP) { + MGET(m, M_WAIT, MT_SOOPTS); + *mtod(m, int *) = 1; + m->m_len = sizeof(int); + sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m); + } + sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) + * 2; + rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) + * 2; + } + if (error = soreserve(so, sndreserve, rcvreserve)) + goto bad; + so->so_rcv.sb_flags |= SB_NOINTR; + so->so_snd.sb_flags |= SB_NOINTR; + + /* Initialize other non-zero congestion variables */ + nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] = nmp->nm_srtt[3] = + nmp->nm_srtt[4] = (NFS_TIMEO << 3); + nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] = + nmp->nm_sdrtt[3] = nmp->nm_sdrtt[4] = 0; + nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */ + nmp->nm_sent = 0; + 
nmp->nm_timeouts = 0; + return (0); + +bad: + nfs_disconnect(nmp); + return (error); +} + +/* + * Reconnect routine: + * Called when a connection is broken on a reliable protocol. + * - clean up the old socket + * - nfs_connect() again + * - set R_MUSTRESEND for all outstanding requests on mount point + * If this fails the mount point is DEAD! + * nb: Must be called with the nfs_sndlock() set on the mount point. + */ +nfs_reconnect(rep) + register struct nfsreq *rep; +{ + register struct nfsreq *rp; + register struct nfsmount *nmp = rep->r_nmp; + int error; + + nfs_disconnect(nmp); + while (error = nfs_connect(nmp, rep)) { + if (error == EINTR || error == ERESTART) + return (EINTR); + (void) tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0); + } + + /* + * Loop through outstanding request list and fix up all requests + * on old socket. + */ + rp = nfsreqh.r_next; + while (rp != &nfsreqh) { + if (rp->r_nmp == nmp) + rp->r_flags |= R_MUSTRESEND; + rp = rp->r_next; + } + return (0); +} + +/* + * NFS disconnect. Clean up and unlink. + */ +void +nfs_disconnect(nmp) + register struct nfsmount *nmp; +{ + register struct socket *so; + + if (nmp->nm_so) { + so = nmp->nm_so; + nmp->nm_so = (struct socket *)0; + soshutdown(so, 2); + soclose(so); + } +} + +/* + * This is the nfs send routine. For connection based socket types, it + * must be called with an nfs_sndlock() on the socket. + * "rep == NULL" indicates that it has been called from a server. + * For the client side: + * - return EINTR if the RPC is terminated, 0 otherwise + * - set R_MUSTRESEND if the send fails for any reason + * - do any cleanup required by recoverable socket errors (???) + * For the server side: + * - return EINTR or ERESTART if interrupted by a signal + * - return EPIPE if a connection is lost for connection based sockets (TCP...) + * - do any cleanup required by recoverable socket errors (???) 
+ */ +nfs_send(so, nam, top, rep) + register struct socket *so; + struct mbuf *nam; + register struct mbuf *top; + struct nfsreq *rep; +{ + struct mbuf *sendnam; + int error, soflags, flags; + + if (rep) { + if (rep->r_flags & R_SOFTTERM) { + m_freem(top); + return (EINTR); + } + if ((so = rep->r_nmp->nm_so) == NULL) { + rep->r_flags |= R_MUSTRESEND; + m_freem(top); + return (0); + } + rep->r_flags &= ~R_MUSTRESEND; + soflags = rep->r_nmp->nm_soflags; + } else + soflags = so->so_proto->pr_flags; + if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED)) + sendnam = (struct mbuf *)0; + else + sendnam = nam; + if (so->so_type == SOCK_SEQPACKET) + flags = MSG_EOR; + else + flags = 0; + + error = sosend(so, sendnam, (struct uio *)0, top, + (struct mbuf *)0, flags); + if (error) { + if (rep) { + log(LOG_INFO, "nfs send error %d for server %s\n",error, + rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + /* + * Deal with errors for the client side. + */ + if (rep->r_flags & R_SOFTTERM) + error = EINTR; + else + rep->r_flags |= R_MUSTRESEND; + } else + log(LOG_INFO, "nfsd send error %d\n", error); + + /* + * Handle any recoverable (soft) socket errors here. (???) + */ + if (error != EINTR && error != ERESTART && + error != EWOULDBLOCK && error != EPIPE) + error = 0; + } + return (error); +} + +/* + * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all + * done by soreceive(), but for SOCK_STREAM we must deal with the Record + * Mark and consolidate the data into a new mbuf list. + * nb: Sometimes TCP passes the data up to soreceive() in long lists of + * small mbufs. + * For SOCK_STREAM we must be very careful to read an entire record once + * we have read any of it, even if the system call has been interrupted. 
+ */ +nfs_receive(rep, aname, mp) + register struct nfsreq *rep; + struct mbuf **aname; + struct mbuf **mp; +{ + register struct socket *so; + struct uio auio; + struct iovec aio; + register struct mbuf *m; + struct mbuf *control; + u_long len; + struct mbuf **getnam; + int error, sotype, rcvflg; + struct proc *p = curproc; /* XXX */ + + /* + * Set up arguments for soreceive() + */ + *mp = (struct mbuf *)0; + *aname = (struct mbuf *)0; + sotype = rep->r_nmp->nm_sotype; + + /* + * For reliable protocols, lock against other senders/receivers + * in case a reconnect is necessary. + * For SOCK_STREAM, first get the Record Mark to find out how much + * more there is to get. + * We must lock the socket against other receivers + * until we have an entire rpc request/reply. + */ + if (sotype != SOCK_DGRAM) { + if (error = nfs_sndlock(&rep->r_nmp->nm_flag, rep)) + return (error); +tryagain: + /* + * Check for fatal errors and resending request. + */ + /* + * Ugh: If a reconnect attempt just happened, nm_so + * would have changed. NULL indicates a failed + * attempt that has essentially shut down this + * mount point. 
+ */ + if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) { + nfs_sndunlock(&rep->r_nmp->nm_flag); + return (EINTR); + } + if ((so = rep->r_nmp->nm_so) == NULL) { + if (error = nfs_reconnect(rep)) { + nfs_sndunlock(&rep->r_nmp->nm_flag); + return (error); + } + goto tryagain; + } + while (rep->r_flags & R_MUSTRESEND) { + m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT); + nfsstats.rpcretries++; + if (error = nfs_send(so, rep->r_nmp->nm_nam, m, rep)) { + if (error == EINTR || error == ERESTART || + (error = nfs_reconnect(rep))) { + nfs_sndunlock(&rep->r_nmp->nm_flag); + return (error); + } + goto tryagain; + } + } + nfs_sndunlock(&rep->r_nmp->nm_flag); + if (sotype == SOCK_STREAM) { + aio.iov_base = (caddr_t) &len; + aio.iov_len = sizeof(u_long); + auio.uio_iov = &aio; + auio.uio_iovcnt = 1; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_offset = 0; + auio.uio_resid = sizeof(u_long); + auio.uio_procp = p; + do { + rcvflg = MSG_WAITALL; + error = soreceive(so, (struct mbuf **)0, &auio, + (struct mbuf **)0, (struct mbuf **)0, &rcvflg); + if (error == EWOULDBLOCK && rep) { + if (rep->r_flags & R_SOFTTERM) + return (EINTR); + } + } while (error == EWOULDBLOCK); + if (!error && auio.uio_resid > 0) { + log(LOG_INFO, + "short receive (%d/%d) from nfs server %s\n", + sizeof(u_long) - auio.uio_resid, + sizeof(u_long), + rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + error = EPIPE; + } + if (error) + goto errout; + len = ntohl(len) & ~0x80000000; + /* + * This is SERIOUS! We are out of sync with the sender + * and forcing a disconnect/reconnect is all I can do. 
+ */ + if (len > NFS_MAXPACKET) { + log(LOG_ERR, "%s (%d) from nfs server %s\n", + "impossible packet length", + len, + rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + error = EFBIG; + goto errout; + } + auio.uio_resid = len; + do { + rcvflg = MSG_WAITALL; + error = soreceive(so, (struct mbuf **)0, + &auio, mp, (struct mbuf **)0, &rcvflg); + } while (error == EWOULDBLOCK || error == EINTR || + error == ERESTART); + if (!error && auio.uio_resid > 0) { + log(LOG_INFO, + "short receive (%d/%d) from nfs server %s\n", + len - auio.uio_resid, len, + rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + error = EPIPE; + } + } else { + /* + * NB: Since uio_resid is big, MSG_WAITALL is ignored + * and soreceive() will return when it has either a + * control msg or a data msg. + * We have no use for control msg., but must grab them + * and then throw them away so we know what is going + * on. + */ + auio.uio_resid = len = 100000000; /* Anything Big */ + auio.uio_procp = p; + do { + rcvflg = 0; + error = soreceive(so, (struct mbuf **)0, + &auio, mp, &control, &rcvflg); + if (control) + m_freem(control); + if (error == EWOULDBLOCK && rep) { + if (rep->r_flags & R_SOFTTERM) + return (EINTR); + } + } while (error == EWOULDBLOCK || + (!error && *mp == NULL && control)); + if ((rcvflg & MSG_EOR) == 0) + printf("Egad!!\n"); + if (!error && *mp == NULL) + error = EPIPE; + len -= auio.uio_resid; + } +errout: + if (error && error != EINTR && error != ERESTART) { + m_freem(*mp); + *mp = (struct mbuf *)0; + if (error != EPIPE) + log(LOG_INFO, + "receive error %d from nfs server %s\n", + error, + rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + error = nfs_sndlock(&rep->r_nmp->nm_flag, rep); + if (!error) + error = nfs_reconnect(rep); + if (!error) + goto tryagain; + } + } else { + if ((so = rep->r_nmp->nm_so) == NULL) + return (EACCES); + if (so->so_state & SS_ISCONNECTED) + getnam = (struct mbuf **)0; + else + getnam = aname; + auio.uio_resid = len = 1000000; + auio.uio_procp = p; + do 
{ + rcvflg = 0; + error = soreceive(so, getnam, &auio, mp, + (struct mbuf **)0, &rcvflg); + if (error == EWOULDBLOCK && + (rep->r_flags & R_SOFTTERM)) + return (EINTR); + } while (error == EWOULDBLOCK); + len -= auio.uio_resid; + } + if (error) { + m_freem(*mp); + *mp = (struct mbuf *)0; + } + /* + * Search for any mbufs that are not a multiple of 4 bytes long + * or with m_data not longword aligned. + * These could cause pointer alignment problems, so copy them to + * well aligned mbufs. + */ + nfs_realign(*mp, 5 * NFSX_UNSIGNED); + return (error); +} + +/* + * Implement receipt of reply on a socket. + * We must search through the list of received datagrams matching them + * with outstanding requests using the xid, until ours is found. + */ +/* ARGSUSED */ +nfs_reply(myrep) + struct nfsreq *myrep; +{ + register struct nfsreq *rep; + register struct nfsmount *nmp = myrep->r_nmp; + register long t1; + struct mbuf *mrep, *nam, *md; + u_long rxid, *tl; + caddr_t dpos, cp2; + int error; + + /* + * Loop around until we get our own reply + */ + for (;;) { + /* + * Lock against other receivers so that I don't get stuck in + * sbwait() after someone else has received my reply for me. + * Also necessary for connection based protocols to avoid + * race conditions during a reconnect. + */ + if (error = nfs_rcvlock(myrep)) + return (error); + /* Already received, bye bye */ + if (myrep->r_mrep != NULL) { + nfs_rcvunlock(&nmp->nm_flag); + return (0); + } + /* + * Get the next Rpc reply off the socket + */ + error = nfs_receive(myrep, &nam, &mrep); + nfs_rcvunlock(&nmp->nm_flag); + if (error) { + + /* + * Ignore routing errors on connectionless protocols?? 
+ */ + if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) { + nmp->nm_so->so_error = 0; + if (myrep->r_flags & R_GETONEREP) + return (0); + continue; + } + return (error); + } + if (nam) + m_freem(nam); + + /* + * Get the xid and check that it is an rpc reply + */ + md = mrep; + dpos = mtod(md, caddr_t); + nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED); + rxid = *tl++; + if (*tl != rpc_reply) { + if (nmp->nm_flag & NFSMNT_NQNFS) { + if (nqnfs_callback(nmp, mrep, md, dpos)) + nfsstats.rpcinvalid++; + } else { + nfsstats.rpcinvalid++; + m_freem(mrep); + } +nfsmout: + if (myrep->r_flags & R_GETONEREP) + return (0); + continue; + } + + /* + * Loop through the request list to match up the reply + * Iff no match, just drop the datagram + */ + rep = nfsreqh.r_next; + while (rep != &nfsreqh) { + if (rep->r_mrep == NULL && rxid == rep->r_xid) { + /* Found it.. */ + rep->r_mrep = mrep; + rep->r_md = md; + rep->r_dpos = dpos; + if (nfsrtton) { + struct rttl *rt; + + rt = &nfsrtt.rttl[nfsrtt.pos]; + rt->proc = rep->r_procnum; + rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]); + rt->sent = nmp->nm_sent; + rt->cwnd = nmp->nm_cwnd; + rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1]; + rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1]; + rt->fsid = nmp->nm_mountp->mnt_stat.f_fsid; + rt->tstamp = time; + if (rep->r_flags & R_TIMING) + rt->rtt = rep->r_rtt; + else + rt->rtt = 1000000; + nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ; + } + /* + * Update congestion window. + * Do the additive increase of + * one rpc/rtt. + */ + if (nmp->nm_cwnd <= nmp->nm_sent) { + nmp->nm_cwnd += + (NFS_CWNDSCALE * NFS_CWNDSCALE + + (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd; + if (nmp->nm_cwnd > NFS_MAXCWND) + nmp->nm_cwnd = NFS_MAXCWND; + } + rep->r_flags &= ~R_SENT; + nmp->nm_sent -= NFS_CWNDSCALE; + /* + * Update rtt using a gain of 0.125 on the mean + * and a gain of 0.25 on the deviation. 
+ */ + if (rep->r_flags & R_TIMING) { + /* + * Since the timer resolution of + * NFS_HZ is so course, it can often + * result in r_rtt == 0. Since + * r_rtt == N means that the actual + * rtt is between N+dt and N+2-dt ticks, + * add 1. + */ + t1 = rep->r_rtt + 1; + t1 -= (NFS_SRTT(rep) >> 3); + NFS_SRTT(rep) += t1; + if (t1 < 0) + t1 = -t1; + t1 -= (NFS_SDRTT(rep) >> 2); + NFS_SDRTT(rep) += t1; + } + nmp->nm_timeouts = 0; + break; + } + rep = rep->r_next; + } + /* + * If not matched to a request, drop it. + * If it's mine, get out. + */ + if (rep == &nfsreqh) { + nfsstats.rpcunexpected++; + m_freem(mrep); + } else if (rep == myrep) { + if (rep->r_mrep == NULL) + panic("nfsreply nil"); + return (0); + } + if (myrep->r_flags & R_GETONEREP) + return (0); + } +} + +/* + * nfs_request - goes something like this + * - fill in request struct + * - links it into list + * - calls nfs_send() for first transmit + * - calls nfs_receive() to get reply + * - break down rpc header and return with nfs reply pointed to + * by mrep or error + * nb: always frees up mreq mbuf list + */ +nfs_request(vp, mrest, procnum, procp, cred, mrp, mdp, dposp) + struct vnode *vp; + struct mbuf *mrest; + int procnum; + struct proc *procp; + struct ucred *cred; + struct mbuf **mrp; + struct mbuf **mdp; + caddr_t *dposp; +{ + register struct mbuf *m, *mrep; + register struct nfsreq *rep; + register u_long *tl; + register int i; + struct nfsmount *nmp; + struct mbuf *md, *mheadend; + struct nfsreq *reph; + struct nfsnode *np; + time_t reqtime, waituntil; + caddr_t dpos, cp2; + int t1, nqlflag, cachable, s, error = 0, mrest_len, auth_len, auth_type; + int trylater_delay = NQ_TRYLATERDEL, trylater_cnt = 0, failed_auth = 0; + u_long xid; + u_quad_t frev; + char *auth_str; + + nmp = VFSTONFS(vp->v_mount); + MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK); + rep->r_nmp = nmp; + rep->r_vp = vp; + rep->r_procp = procp; + rep->r_procnum = procnum; + i = 0; + m = mrest; + while (m) { + 
i += m->m_len; + m = m->m_next; + } + mrest_len = i; + + /* + * Get the RPC header with authorization. + */ +kerbauth: + auth_str = (char *)0; + if (nmp->nm_flag & NFSMNT_KERB) { + if (failed_auth) { + error = nfs_getauth(nmp, rep, cred, &auth_type, + &auth_str, &auth_len); + if (error) { + free((caddr_t)rep, M_NFSREQ); + m_freem(mrest); + return (error); + } + } else { + auth_type = RPCAUTH_UNIX; + auth_len = 5 * NFSX_UNSIGNED; + } + } else { + auth_type = RPCAUTH_UNIX; + if (cred->cr_ngroups < 1) + panic("nfsreq nogrps"); + auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ? + nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) + + 5 * NFSX_UNSIGNED; + } + m = nfsm_rpchead(cred, (nmp->nm_flag & NFSMNT_NQNFS), procnum, + auth_type, auth_len, auth_str, mrest, mrest_len, &mheadend, &xid); + if (auth_str) + free(auth_str, M_TEMP); + + /* + * For stream protocols, insert a Sun RPC Record Mark. + */ + if (nmp->nm_sotype == SOCK_STREAM) { + M_PREPEND(m, NFSX_UNSIGNED, M_WAIT); + *mtod(m, u_long *) = htonl(0x80000000 | + (m->m_pkthdr.len - NFSX_UNSIGNED)); + } + rep->r_mreq = m; + rep->r_xid = xid; +tryagain: + if (nmp->nm_flag & NFSMNT_SOFT) + rep->r_retry = nmp->nm_retry; + else + rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */ + rep->r_rtt = rep->r_rexmit = 0; + if (proct[procnum] > 0) + rep->r_flags = R_TIMING; + else + rep->r_flags = 0; + rep->r_mrep = NULL; + + /* + * Do the client side RPC. + */ + nfsstats.rpcrequests++; + /* + * Chain request into list of outstanding requests. Be sure + * to put it LAST so timer finds oldest requests first. + */ + s = splsoftclock(); + reph = &nfsreqh; + reph->r_prev->r_next = rep; + rep->r_prev = reph->r_prev; + reph->r_prev = rep; + rep->r_next = reph; + + /* Get send time for nqnfs */ + reqtime = time.tv_sec; + + /* + * If backing off another request or avoiding congestion, don't + * send this one now but let timer do it. If not timing a request, + * do it now. 
+ */ + if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM || + (nmp->nm_flag & NFSMNT_DUMBTIMR) || + nmp->nm_sent < nmp->nm_cwnd)) { + splx(s); + if (nmp->nm_soflags & PR_CONNREQUIRED) + error = nfs_sndlock(&nmp->nm_flag, rep); + if (!error) { + m = m_copym(m, 0, M_COPYALL, M_WAIT); + error = nfs_send(nmp->nm_so, nmp->nm_nam, m, rep); + if (nmp->nm_soflags & PR_CONNREQUIRED) + nfs_sndunlock(&nmp->nm_flag); + } + if (!error && (rep->r_flags & R_MUSTRESEND) == 0) { + nmp->nm_sent += NFS_CWNDSCALE; + rep->r_flags |= R_SENT; + } + } else { + splx(s); + rep->r_rtt = -1; + } + + /* + * Wait for the reply from our send or the timer's. + */ + if (!error || error == EPIPE) + error = nfs_reply(rep); + + /* + * RPC done, unlink the request. + */ + s = splsoftclock(); + rep->r_prev->r_next = rep->r_next; + rep->r_next->r_prev = rep->r_prev; + splx(s); + + /* + * Decrement the outstanding request count. + */ + if (rep->r_flags & R_SENT) { + rep->r_flags &= ~R_SENT; /* paranoia */ + nmp->nm_sent -= NFS_CWNDSCALE; + } + + /* + * If there was a successful reply and a tprintf msg. + * tprintf a response. 
+ */ + if (!error && (rep->r_flags & R_TPRINTFMSG)) + nfs_msg(rep->r_procp, nmp->nm_mountp->mnt_stat.f_mntfromname, + "is alive again"); + mrep = rep->r_mrep; + md = rep->r_md; + dpos = rep->r_dpos; + if (error) { + m_freem(rep->r_mreq); + free((caddr_t)rep, M_NFSREQ); + return (error); + } + + /* + * break down the rpc header and check if ok + */ + nfsm_dissect(tl, u_long *, 3*NFSX_UNSIGNED); + if (*tl++ == rpc_msgdenied) { + if (*tl == rpc_mismatch) + error = EOPNOTSUPP; + else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) { + if (*tl == rpc_rejectedcred && failed_auth == 0) { + failed_auth++; + mheadend->m_next = (struct mbuf *)0; + m_freem(mrep); + m_freem(rep->r_mreq); + goto kerbauth; + } else + error = EAUTH; + } else + error = EACCES; + m_freem(mrep); + m_freem(rep->r_mreq); + free((caddr_t)rep, M_NFSREQ); + return (error); + } + + /* + * skip over the auth_verf, someday we may want to cache auth_short's + * for nfs_reqhead(), but for now just dump it + */ + if (*++tl != 0) { + i = nfsm_rndup(fxdr_unsigned(long, *tl)); + nfsm_adv(i); + } + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + /* 0 == ok */ + if (*tl == 0) { + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + if (*tl != 0) { + error = fxdr_unsigned(int, *tl); + m_freem(mrep); + if ((nmp->nm_flag & NFSMNT_NQNFS) && + error == NQNFS_TRYLATER) { + error = 0; + waituntil = time.tv_sec + trylater_delay; + while (time.tv_sec < waituntil) + (void) tsleep((caddr_t)&lbolt, + PSOCK, "nqnfstry", 0); + trylater_delay *= nfs_backoff[trylater_cnt]; + if (trylater_cnt < 7) + trylater_cnt++; + goto tryagain; + } + + /* + * If the File Handle was stale, invalidate the + * lookup cache, just in case. 
+ */ + if (error == ESTALE) + cache_purge(vp); + m_freem(rep->r_mreq); + free((caddr_t)rep, M_NFSREQ); + return (error); + } + + /* + * For nqnfs, get any lease in reply + */ + if (nmp->nm_flag & NFSMNT_NQNFS) { + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + if (*tl) { + np = VTONFS(vp); + nqlflag = fxdr_unsigned(int, *tl); + nfsm_dissect(tl, u_long *, 4*NFSX_UNSIGNED); + cachable = fxdr_unsigned(int, *tl++); + reqtime += fxdr_unsigned(int, *tl++); + if (reqtime > time.tv_sec) { + fxdr_hyper(tl, &frev); + nqnfs_clientlease(nmp, np, nqlflag, + cachable, reqtime, frev); + } + } + } + *mrp = mrep; + *mdp = md; + *dposp = dpos; + m_freem(rep->r_mreq); + FREE((caddr_t)rep, M_NFSREQ); + return (0); + } + m_freem(mrep); + m_freem(rep->r_mreq); + free((caddr_t)rep, M_NFSREQ); + error = EPROTONOSUPPORT; +nfsmout: + return (error); +} + +/* + * Generate the rpc reply header + * siz arg. is used to decide if adding a cluster is worthwhile + */ +nfs_rephead(siz, nd, err, cache, frev, mrq, mbp, bposp) + int siz; + struct nfsd *nd; + int err; + int cache; + u_quad_t *frev; + struct mbuf **mrq; + struct mbuf **mbp; + caddr_t *bposp; +{ + register u_long *tl; + register struct mbuf *mreq; + caddr_t bpos; + struct mbuf *mb, *mb2; + + MGETHDR(mreq, M_WAIT, MT_DATA); + mb = mreq; + /* + * If this is a big reply, use a cluster else + * try and leave leading space for the lower level headers. 
+ */ + siz += RPC_REPLYSIZ; + if (siz >= MINCLSIZE) { + MCLGET(mreq, M_WAIT); + } else + mreq->m_data += max_hdr; + tl = mtod(mreq, u_long *); + mreq->m_len = 6*NFSX_UNSIGNED; + bpos = ((caddr_t)tl)+mreq->m_len; + *tl++ = nd->nd_retxid; + *tl++ = rpc_reply; + if (err == ERPCMISMATCH || err == NQNFS_AUTHERR) { + *tl++ = rpc_msgdenied; + if (err == NQNFS_AUTHERR) { + *tl++ = rpc_autherr; + *tl = rpc_rejectedcred; + mreq->m_len -= NFSX_UNSIGNED; + bpos -= NFSX_UNSIGNED; + } else { + *tl++ = rpc_mismatch; + *tl++ = txdr_unsigned(2); + *tl = txdr_unsigned(2); + } + } else { + *tl++ = rpc_msgaccepted; + *tl++ = 0; + *tl++ = 0; + switch (err) { + case EPROGUNAVAIL: + *tl = txdr_unsigned(RPC_PROGUNAVAIL); + break; + case EPROGMISMATCH: + *tl = txdr_unsigned(RPC_PROGMISMATCH); + nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); + *tl++ = txdr_unsigned(2); + *tl = txdr_unsigned(2); /* someday 3 */ + break; + case EPROCUNAVAIL: + *tl = txdr_unsigned(RPC_PROCUNAVAIL); + break; + default: + *tl = 0; + if (err != VNOVAL) { + nfsm_build(tl, u_long *, NFSX_UNSIGNED); + if (err) + *tl = txdr_unsigned(nfsrv_errmap[err - 1]); + else + *tl = 0; + } + break; + }; + } + + /* + * For nqnfs, piggyback lease as requested. + */ + if (nd->nd_nqlflag != NQL_NOVAL && err == 0) { + if (nd->nd_nqlflag) { + nfsm_build(tl, u_long *, 5*NFSX_UNSIGNED); + *tl++ = txdr_unsigned(nd->nd_nqlflag); + *tl++ = txdr_unsigned(cache); + *tl++ = txdr_unsigned(nd->nd_duration); + txdr_hyper(frev, tl); + } else { + if (nd->nd_nqlflag != 0) + panic("nqreph"); + nfsm_build(tl, u_long *, NFSX_UNSIGNED); + *tl = 0; + } + } + *mrq = mreq; + *mbp = mb; + *bposp = bpos; + if (err != 0 && err != VNOVAL) + nfsstats.srvrpc_errs++; + return (0); +} + +/* + * Nfs timer routine + * Scan the nfsreq list and retranmit any requests that have timed out + * To avoid retransmission attempts on STREAM sockets (in the future) make + * sure to set the r_retry field to 0 (implies nm_retry == 0). 
+ */ +void +nfs_timer(arg) + void *arg; +{ + register struct nfsreq *rep; + register struct mbuf *m; + register struct socket *so; + register struct nfsmount *nmp; + register int timeo; + static long lasttime = 0; + int s, error; + + s = splnet(); + for (rep = nfsreqh.r_next; rep != &nfsreqh; rep = rep->r_next) { + nmp = rep->r_nmp; + if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) + continue; + if (nfs_sigintr(nmp, rep, rep->r_procp)) { + rep->r_flags |= R_SOFTTERM; + continue; + } + if (rep->r_rtt >= 0) { + rep->r_rtt++; + if (nmp->nm_flag & NFSMNT_DUMBTIMR) + timeo = nmp->nm_timeo; + else + timeo = NFS_RTO(nmp, proct[rep->r_procnum]); + if (nmp->nm_timeouts > 0) + timeo *= nfs_backoff[nmp->nm_timeouts - 1]; + if (rep->r_rtt <= timeo) + continue; + if (nmp->nm_timeouts < 8) + nmp->nm_timeouts++; + } + /* + * Check for server not responding + */ + if ((rep->r_flags & R_TPRINTFMSG) == 0 && + rep->r_rexmit > nmp->nm_deadthresh) { + nfs_msg(rep->r_procp, + nmp->nm_mountp->mnt_stat.f_mntfromname, + "not responding"); + rep->r_flags |= R_TPRINTFMSG; + } + if (rep->r_rexmit >= rep->r_retry) { /* too many */ + nfsstats.rpctimeouts++; + rep->r_flags |= R_SOFTTERM; + continue; + } + if (nmp->nm_sotype != SOCK_DGRAM) { + if (++rep->r_rexmit > NFS_MAXREXMIT) + rep->r_rexmit = NFS_MAXREXMIT; + continue; + } + if ((so = nmp->nm_so) == NULL) + continue; + + /* + * If there is enough space and the window allows.. + * Resend it + * Set r_rtt to -1 in case we fail to send it now. 
+ */ + rep->r_rtt = -1; + if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len && + ((nmp->nm_flag & NFSMNT_DUMBTIMR) || + (rep->r_flags & R_SENT) || + nmp->nm_sent < nmp->nm_cwnd) && + (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){ + if ((nmp->nm_flag & NFSMNT_NOCONN) == 0) + error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, m, + (struct mbuf *)0, (struct mbuf *)0); + else + error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, m, + nmp->nm_nam, (struct mbuf *)0); + if (error) { + if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) + so->so_error = 0; + } else { + /* + * Iff first send, start timing + * else turn timing off, backoff timer + * and divide congestion window by 2. + */ + if (rep->r_flags & R_SENT) { + rep->r_flags &= ~R_TIMING; + if (++rep->r_rexmit > NFS_MAXREXMIT) + rep->r_rexmit = NFS_MAXREXMIT; + nmp->nm_cwnd >>= 1; + if (nmp->nm_cwnd < NFS_CWNDSCALE) + nmp->nm_cwnd = NFS_CWNDSCALE; + nfsstats.rpcretries++; + } else { + rep->r_flags |= R_SENT; + nmp->nm_sent += NFS_CWNDSCALE; + } + rep->r_rtt = 0; + } + } + } + + /* + * Call the nqnfs server timer once a second to handle leases. + */ + if (lasttime != time.tv_sec) { + lasttime = time.tv_sec; + nqnfs_serverd(); + } + splx(s); + timeout(nfs_timer, (void *)0, hz / NFS_HZ); +} + +/* + * Test for a termination condition pending on the process. + * This is used for NFSMNT_INT mounts. + */ +nfs_sigintr(nmp, rep, p) + struct nfsmount *nmp; + struct nfsreq *rep; + register struct proc *p; +{ + + if (rep && (rep->r_flags & R_SOFTTERM)) + return (EINTR); + if (!(nmp->nm_flag & NFSMNT_INT)) + return (0); + if (p && p->p_siglist && + (((p->p_siglist & ~p->p_sigmask) & ~p->p_sigignore) & + NFSINT_SIGMASK)) + return (EINTR); + return (0); +} + +/* + * Lock a socket against others. + * Necessary for STREAM sockets to ensure you get an entire rpc request/reply + * and also to avoid race conditions between the processes with nfs requests + * in progress when a reconnect is necessary. 
+ */ +nfs_sndlock(flagp, rep) + register int *flagp; + struct nfsreq *rep; +{ + struct proc *p; + int slpflag = 0, slptimeo = 0; + + if (rep) { + p = rep->r_procp; + if (rep->r_nmp->nm_flag & NFSMNT_INT) + slpflag = PCATCH; + } else + p = (struct proc *)0; + while (*flagp & NFSMNT_SNDLOCK) { + if (nfs_sigintr(rep->r_nmp, rep, p)) + return (EINTR); + *flagp |= NFSMNT_WANTSND; + (void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsndlck", + slptimeo); + if (slpflag == PCATCH) { + slpflag = 0; + slptimeo = 2 * hz; + } + } + *flagp |= NFSMNT_SNDLOCK; + return (0); +} + +/* + * Unlock the stream socket for others. + */ +void +nfs_sndunlock(flagp) + register int *flagp; +{ + + if ((*flagp & NFSMNT_SNDLOCK) == 0) + panic("nfs sndunlock"); + *flagp &= ~NFSMNT_SNDLOCK; + if (*flagp & NFSMNT_WANTSND) { + *flagp &= ~NFSMNT_WANTSND; + wakeup((caddr_t)flagp); + } +} + +nfs_rcvlock(rep) + register struct nfsreq *rep; +{ + register int *flagp = &rep->r_nmp->nm_flag; + int slpflag, slptimeo = 0; + + if (*flagp & NFSMNT_INT) + slpflag = PCATCH; + else + slpflag = 0; + while (*flagp & NFSMNT_RCVLOCK) { + if (nfs_sigintr(rep->r_nmp, rep, rep->r_procp)) + return (EINTR); + *flagp |= NFSMNT_WANTRCV; + (void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsrcvlk", + slptimeo); + if (slpflag == PCATCH) { + slpflag = 0; + slptimeo = 2 * hz; + } + } + *flagp |= NFSMNT_RCVLOCK; + return (0); +} + +/* + * Unlock the stream socket for others. + */ +void +nfs_rcvunlock(flagp) + register int *flagp; +{ + + if ((*flagp & NFSMNT_RCVLOCK) == 0) + panic("nfs rcvunlock"); + *flagp &= ~NFSMNT_RCVLOCK; + if (*flagp & NFSMNT_WANTRCV) { + *flagp &= ~NFSMNT_WANTRCV; + wakeup((caddr_t)flagp); + } +} + +/* + * Check for badly aligned mbuf data areas and + * realign data in an mbuf list by copying the data areas up, as required. 
+ */ +void +nfs_realign(m, hsiz) + register struct mbuf *m; + int hsiz; +{ + register struct mbuf *m2; + register int siz, mlen, olen; + register caddr_t tcp, fcp; + struct mbuf *mnew; + + while (m) { + /* + * This never happens for UDP, rarely happens for TCP + * but frequently happens for iso transport. + */ + if ((m->m_len & 0x3) || (mtod(m, int) & 0x3)) { + olen = m->m_len; + fcp = mtod(m, caddr_t); + if ((int)fcp & 0x3) { + m->m_flags &= ~M_PKTHDR; + if (m->m_flags & M_EXT) + m->m_data = m->m_ext.ext_buf + + ((m->m_ext.ext_size - olen) & ~0x3); + else + m->m_data = m->m_dat; + } + m->m_len = 0; + tcp = mtod(m, caddr_t); + mnew = m; + m2 = m->m_next; + + /* + * If possible, only put the first invariant part + * of the RPC header in the first mbuf. + */ + mlen = M_TRAILINGSPACE(m); + if (olen <= hsiz && mlen > hsiz) + mlen = hsiz; + + /* + * Loop through the mbuf list consolidating data. + */ + while (m) { + while (olen > 0) { + if (mlen == 0) { + m2->m_flags &= ~M_PKTHDR; + if (m2->m_flags & M_EXT) + m2->m_data = m2->m_ext.ext_buf; + else + m2->m_data = m2->m_dat; + m2->m_len = 0; + mlen = M_TRAILINGSPACE(m2); + tcp = mtod(m2, caddr_t); + mnew = m2; + m2 = m2->m_next; + } + siz = min(mlen, olen); + if (tcp != fcp) + bcopy(fcp, tcp, siz); + mnew->m_len += siz; + mlen -= siz; + olen -= siz; + tcp += siz; + fcp += siz; + } + m = m->m_next; + if (m) { + olen = m->m_len; + fcp = mtod(m, caddr_t); + } + } + + /* + * Finally, set m_len == 0 for any trailing mbufs that have + * been copied out of. + */ + while (m2) { + m2->m_len = 0; + m2 = m2->m_next; + } + return; + } + m = m->m_next; + } +} + +/* + * Socket upcall routine for the nfsd sockets. + * The caddr_t arg is a pointer to the "struct nfssvc_sock". + * Essentially do as much as possible non-blocking, else punt and it will + * be called with M_WAIT from an nfsd. 
+ */ +void +nfsrv_rcv(so, arg, waitflag) + struct socket *so; + caddr_t arg; + int waitflag; +{ + register struct nfssvc_sock *slp = (struct nfssvc_sock *)arg; + register struct mbuf *m; + struct mbuf *mp, *nam; + struct uio auio; + int flags, error; + + if ((slp->ns_flag & SLP_VALID) == 0) + return; +#ifdef notdef + /* + * Define this to test for nfsds handling this under heavy load. + */ + if (waitflag == M_DONTWAIT) { + slp->ns_flag |= SLP_NEEDQ; goto dorecs; + } +#endif + auio.uio_procp = NULL; + if (so->so_type == SOCK_STREAM) { + /* + * If there are already records on the queue, defer soreceive() + * to an nfsd so that there is feedback to the TCP layer that + * the nfs servers are heavily loaded. + */ + if (slp->ns_rec && waitflag == M_DONTWAIT) { + slp->ns_flag |= SLP_NEEDQ; + goto dorecs; + } + + /* + * Do soreceive(). + */ + auio.uio_resid = 1000000000; + flags = MSG_DONTWAIT; + error = soreceive(so, &nam, &auio, &mp, (struct mbuf **)0, &flags); + if (error || mp == (struct mbuf *)0) { + if (error == EWOULDBLOCK) + slp->ns_flag |= SLP_NEEDQ; + else + slp->ns_flag |= SLP_DISCONN; + goto dorecs; + } + m = mp; + if (slp->ns_rawend) { + slp->ns_rawend->m_next = m; + slp->ns_cc += 1000000000 - auio.uio_resid; + } else { + slp->ns_raw = m; + slp->ns_cc = 1000000000 - auio.uio_resid; + } + while (m->m_next) + m = m->m_next; + slp->ns_rawend = m; + + /* + * Now try and parse record(s) out of the raw stream data. 
+ */ + if (error = nfsrv_getstream(slp, waitflag)) { + if (error == EPERM) + slp->ns_flag |= SLP_DISCONN; + else + slp->ns_flag |= SLP_NEEDQ; + } + } else { + do { + auio.uio_resid = 1000000000; + flags = MSG_DONTWAIT; + error = soreceive(so, &nam, &auio, &mp, + (struct mbuf **)0, &flags); + if (mp) { + nfs_realign(mp, 10 * NFSX_UNSIGNED); + if (nam) { + m = nam; + m->m_next = mp; + } else + m = mp; + if (slp->ns_recend) + slp->ns_recend->m_nextpkt = m; + else + slp->ns_rec = m; + slp->ns_recend = m; + m->m_nextpkt = (struct mbuf *)0; + } + if (error) { + if ((so->so_proto->pr_flags & PR_CONNREQUIRED) + && error != EWOULDBLOCK) { + slp->ns_flag |= SLP_DISCONN; + goto dorecs; + } + } + } while (mp); + } + + /* + * Now try and process the request records, non-blocking. + */ +dorecs: + if (waitflag == M_DONTWAIT && + (slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)))) + nfsrv_wakenfsd(slp); +} + +/* + * Try and extract an RPC request from the mbuf data list received on a + * stream socket. The "waitflag" argument indicates whether or not it + * can sleep. 
+ */ +nfsrv_getstream(slp, waitflag) + register struct nfssvc_sock *slp; + int waitflag; +{ + register struct mbuf *m; + register char *cp1, *cp2; + register int len; + struct mbuf *om, *m2, *recm; + u_long recmark; + + if (slp->ns_flag & SLP_GETSTREAM) + panic("nfs getstream"); + slp->ns_flag |= SLP_GETSTREAM; + for (;;) { + if (slp->ns_reclen == 0) { + if (slp->ns_cc < NFSX_UNSIGNED) { + slp->ns_flag &= ~SLP_GETSTREAM; + return (0); + } + m = slp->ns_raw; + if (m->m_len >= NFSX_UNSIGNED) { + bcopy(mtod(m, caddr_t), (caddr_t)&recmark, NFSX_UNSIGNED); + m->m_data += NFSX_UNSIGNED; + m->m_len -= NFSX_UNSIGNED; + } else { + cp1 = (caddr_t)&recmark; + cp2 = mtod(m, caddr_t); + while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) { + while (m->m_len == 0) { + m = m->m_next; + cp2 = mtod(m, caddr_t); + } + *cp1++ = *cp2++; + m->m_data++; + m->m_len--; + } + } + slp->ns_cc -= NFSX_UNSIGNED; + slp->ns_reclen = ntohl(recmark) & ~0x80000000; + if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) { + slp->ns_flag &= ~SLP_GETSTREAM; + return (EPERM); + } + } + + /* + * Now get the record part. 
+ */ + if (slp->ns_cc == slp->ns_reclen) { + recm = slp->ns_raw; + slp->ns_raw = slp->ns_rawend = (struct mbuf *)0; + slp->ns_cc = slp->ns_reclen = 0; + } else if (slp->ns_cc > slp->ns_reclen) { + len = 0; + m = slp->ns_raw; + om = (struct mbuf *)0; + while (len < slp->ns_reclen) { + if ((len + m->m_len) > slp->ns_reclen) { + m2 = m_copym(m, 0, slp->ns_reclen - len, + waitflag); + if (m2) { + if (om) { + om->m_next = m2; + recm = slp->ns_raw; + } else + recm = m2; + m->m_data += slp->ns_reclen - len; + m->m_len -= slp->ns_reclen - len; + len = slp->ns_reclen; + } else { + slp->ns_flag &= ~SLP_GETSTREAM; + return (EWOULDBLOCK); + } + } else if ((len + m->m_len) == slp->ns_reclen) { + om = m; + len += m->m_len; + m = m->m_next; + recm = slp->ns_raw; + om->m_next = (struct mbuf *)0; + } else { + om = m; + len += m->m_len; + m = m->m_next; + } + } + slp->ns_raw = m; + slp->ns_cc -= len; + slp->ns_reclen = 0; + } else { + slp->ns_flag &= ~SLP_GETSTREAM; + return (0); + } + nfs_realign(recm, 10 * NFSX_UNSIGNED); + if (slp->ns_recend) + slp->ns_recend->m_nextpkt = recm; + else + slp->ns_rec = recm; + slp->ns_recend = recm; + } +} + +/* + * Parse an RPC header. + */ +nfsrv_dorec(slp, nd) + register struct nfssvc_sock *slp; + register struct nfsd *nd; +{ + register struct mbuf *m; + int error; + + if ((slp->ns_flag & SLP_VALID) == 0 || + (m = slp->ns_rec) == (struct mbuf *)0) + return (ENOBUFS); + if (slp->ns_rec = m->m_nextpkt) + m->m_nextpkt = (struct mbuf *)0; + else + slp->ns_recend = (struct mbuf *)0; + if (m->m_type == MT_SONAME) { + nd->nd_nam = m; + nd->nd_md = nd->nd_mrep = m->m_next; + m->m_next = (struct mbuf *)0; + } else { + nd->nd_nam = (struct mbuf *)0; + nd->nd_md = nd->nd_mrep = m; + } + nd->nd_dpos = mtod(nd->nd_md, caddr_t); + if (error = nfs_getreq(nd, TRUE)) { + m_freem(nd->nd_nam); + return (error); + } + return (0); +} + +/* + * Parse an RPC request + * - verify it + * - fill in the cred struct. 
+ */ +nfs_getreq(nd, has_header) + register struct nfsd *nd; + int has_header; +{ + register int len, i; + register u_long *tl; + register long t1; + struct uio uio; + struct iovec iov; + caddr_t dpos, cp2; + u_long nfsvers, auth_type; + int error = 0, nqnfs = 0; + struct mbuf *mrep, *md; + + mrep = nd->nd_mrep; + md = nd->nd_md; + dpos = nd->nd_dpos; + if (has_header) { + nfsm_dissect(tl, u_long *, 10*NFSX_UNSIGNED); + nd->nd_retxid = *tl++; + if (*tl++ != rpc_call) { + m_freem(mrep); + return (EBADRPC); + } + } else { + nfsm_dissect(tl, u_long *, 8*NFSX_UNSIGNED); + } + nd->nd_repstat = 0; + if (*tl++ != rpc_vers) { + nd->nd_repstat = ERPCMISMATCH; + nd->nd_procnum = NFSPROC_NOOP; + return (0); + } + nfsvers = nfs_vers; + if (*tl != nfs_prog) { + if (*tl == nqnfs_prog) { + nqnfs++; + nfsvers = nqnfs_vers; + } else { + nd->nd_repstat = EPROGUNAVAIL; + nd->nd_procnum = NFSPROC_NOOP; + return (0); + } + } + tl++; + if (*tl++ != nfsvers) { + nd->nd_repstat = EPROGMISMATCH; + nd->nd_procnum = NFSPROC_NOOP; + return (0); + } + nd->nd_procnum = fxdr_unsigned(u_long, *tl++); + if (nd->nd_procnum == NFSPROC_NULL) + return (0); + if (nd->nd_procnum >= NFS_NPROCS || + (!nqnfs && nd->nd_procnum > NFSPROC_STATFS) || + (*tl != rpc_auth_unix && *tl != rpc_auth_kerb)) { + nd->nd_repstat = EPROCUNAVAIL; + nd->nd_procnum = NFSPROC_NOOP; + return (0); + } + auth_type = *tl++; + len = fxdr_unsigned(int, *tl++); + if (len < 0 || len > RPCAUTH_MAXSIZ) { + m_freem(mrep); + return (EBADRPC); + } + + /* + * Handle auth_unix or auth_kerb. 
+ */ + if (auth_type == rpc_auth_unix) { + len = fxdr_unsigned(int, *++tl); + if (len < 0 || len > NFS_MAXNAMLEN) { + m_freem(mrep); + return (EBADRPC); + } + nfsm_adv(nfsm_rndup(len)); + nfsm_dissect(tl, u_long *, 3*NFSX_UNSIGNED); + nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++); + nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++); + len = fxdr_unsigned(int, *tl); + if (len < 0 || len > RPCAUTH_UNIXGIDS) { + m_freem(mrep); + return (EBADRPC); + } + nfsm_dissect(tl, u_long *, (len + 2)*NFSX_UNSIGNED); + for (i = 1; i <= len; i++) + if (i < NGROUPS) + nd->nd_cr.cr_groups[i] = fxdr_unsigned(gid_t, *tl++); + else + tl++; + nd->nd_cr.cr_ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1); + } else if (auth_type == rpc_auth_kerb) { + nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++); + nd->nd_authlen = fxdr_unsigned(int, *tl); + uio.uio_resid = nfsm_rndup(nd->nd_authlen); + if (uio.uio_resid > (len - 2 * NFSX_UNSIGNED)) { + m_freem(mrep); + return (EBADRPC); + } + uio.uio_offset = 0; + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_segflg = UIO_SYSSPACE; + iov.iov_base = (caddr_t)nd->nd_authstr; + iov.iov_len = RPCAUTH_MAXSIZ; + nfsm_mtouio(&uio, uio.uio_resid); + nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); + nd->nd_flag |= NFSD_NEEDAUTH; + } + + /* + * Do we have any use for the verifier. + * According to the "Remote Procedure Call Protocol Spec." it + * should be AUTH_NULL, but some clients make it AUTH_UNIX? + * For now, just skip over it + */ + len = fxdr_unsigned(int, *++tl); + if (len < 0 || len > RPCAUTH_MAXSIZ) { + m_freem(mrep); + return (EBADRPC); + } + if (len > 0) { + nfsm_adv(nfsm_rndup(len)); + } + + /* + * For nqnfs, get piggybacked lease request. 
+ */ + if (nqnfs && nd->nd_procnum != NQNFSPROC_EVICTED) { + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + nd->nd_nqlflag = fxdr_unsigned(int, *tl); + if (nd->nd_nqlflag) { + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + nd->nd_duration = fxdr_unsigned(int, *tl); + } else + nd->nd_duration = NQ_MINLEASE; + } else { + nd->nd_nqlflag = NQL_NOVAL; + nd->nd_duration = NQ_MINLEASE; + } + nd->nd_md = md; + nd->nd_dpos = dpos; + return (0); +nfsmout: + return (error); +} + +/* + * Search for a sleeping nfsd and wake it up. + * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the + * running nfsds will go look for the work in the nfssvc_sock list. + */ +void +nfsrv_wakenfsd(slp) + struct nfssvc_sock *slp; +{ + register struct nfsd *nd = nfsd_head.nd_next; + + if ((slp->ns_flag & SLP_VALID) == 0) + return; + while (nd != (struct nfsd *)&nfsd_head) { + if (nd->nd_flag & NFSD_WAITING) { + nd->nd_flag &= ~NFSD_WAITING; + if (nd->nd_slp) + panic("nfsd wakeup"); + slp->ns_sref++; + nd->nd_slp = slp; + wakeup((caddr_t)nd); + return; + } + nd = nd->nd_next; + } + slp->ns_flag |= SLP_DOREC; + nfsd_head.nd_flag |= NFSD_CHECKSLP; +} + +nfs_msg(p, server, msg) + struct proc *p; + char *server, *msg; +{ + tpr_t tpr; + + if (p) + tpr = tprintf_open(p); + else + tpr = NULL; + tprintf(tpr, "nfs server %s: %s\n", server, msg); + tprintf_close(tpr); +} diff --git a/sys/nfsclient/nfs_subs.c b/sys/nfsclient/nfs_subs.c new file mode 100644 index 00000000000..5778f7d7f01 --- /dev/null +++ b/sys/nfsclient/nfs_subs.c @@ -0,0 +1,1130 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs_subs.c 8.3 (Berkeley) 1/4/94 + */ + +/* + * These functions support the macros and help fiddle mbuf chains for + * the nfs op functions. They do things like create the rpc header and + * copy data between mbuf chains and uio lists. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#ifdef ISO +#include +#endif + +#define TRUE 1 +#define FALSE 0 + +/* + * Data items converted to xdr at startup, since they are constant + * This is kinda hokey, but may save a little time doing byte swaps + */ +u_long nfs_procids[NFS_NPROCS]; +u_long nfs_xdrneg1; +u_long rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr, + rpc_mismatch, rpc_auth_unix, rpc_msgaccepted, rpc_rejectedcred, + rpc_auth_kerb; +u_long nfs_vers, nfs_prog, nfs_true, nfs_false; + +/* And other global data */ +static u_long nfs_xid = 0; +enum vtype ntov_type[7] = { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON }; +extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; +extern struct nfsreq nfsreqh; +extern int nqnfs_piggy[NFS_NPROCS]; +extern struct nfsrtt nfsrtt; +extern time_t nqnfsstarttime; +extern u_long nqnfs_prog, nqnfs_vers; +extern int nqsrv_clockskew; +extern int nqsrv_writeslack; +extern int nqsrv_maxlease; + +/* + * Create the header for an rpc request packet + * The hsiz is the size of the rest of the nfs request header. + * (just used to decide if a cluster is a good idea) + */ +struct mbuf * +nfsm_reqh(vp, procid, hsiz, bposp) + struct vnode *vp; + u_long procid; + int hsiz; + caddr_t *bposp; +{ + register struct mbuf *mb; + register u_long *tl; + register caddr_t bpos; + struct mbuf *mb2; + struct nfsmount *nmp; + int nqflag; + + MGET(mb, M_WAIT, MT_DATA); + if (hsiz >= MINCLSIZE) + MCLGET(mb, M_WAIT); + mb->m_len = 0; + bpos = mtod(mb, caddr_t); + + /* + * For NQNFS, add lease request. 
+ */ + if (vp) { + nmp = VFSTONFS(vp->v_mount); + if (nmp->nm_flag & NFSMNT_NQNFS) { + nqflag = NQNFS_NEEDLEASE(vp, procid); + if (nqflag) { + nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); + *tl++ = txdr_unsigned(nqflag); + *tl = txdr_unsigned(nmp->nm_leaseterm); + } else { + nfsm_build(tl, u_long *, NFSX_UNSIGNED); + *tl = 0; + } + } + } + /* Finally, return values */ + *bposp = bpos; + return (mb); +} + +/* + * Build the RPC header and fill in the authorization info. + * The authorization string argument is only used when the credentials + * come from outside of the kernel. + * Returns the head of the mbuf list. + */ +struct mbuf * +nfsm_rpchead(cr, nqnfs, procid, auth_type, auth_len, auth_str, mrest, + mrest_len, mbp, xidp) + register struct ucred *cr; + int nqnfs; + int procid; + int auth_type; + int auth_len; + char *auth_str; + struct mbuf *mrest; + int mrest_len; + struct mbuf **mbp; + u_long *xidp; +{ + register struct mbuf *mb; + register u_long *tl; + register caddr_t bpos; + register int i; + struct mbuf *mreq, *mb2; + int siz, grpsiz, authsiz; + + authsiz = nfsm_rndup(auth_len); + if (auth_type == RPCAUTH_NQNFS) + authsiz += 2 * NFSX_UNSIGNED; + MGETHDR(mb, M_WAIT, MT_DATA); + if ((authsiz + 10*NFSX_UNSIGNED) >= MINCLSIZE) { + MCLGET(mb, M_WAIT); + } else if ((authsiz + 10*NFSX_UNSIGNED) < MHLEN) { + MH_ALIGN(mb, authsiz + 10*NFSX_UNSIGNED); + } else { + MH_ALIGN(mb, 8*NFSX_UNSIGNED); + } + mb->m_len = 0; + mreq = mb; + bpos = mtod(mb, caddr_t); + + /* + * First the RPC header. + */ + nfsm_build(tl, u_long *, 8*NFSX_UNSIGNED); + if (++nfs_xid == 0) + nfs_xid++; + *tl++ = *xidp = txdr_unsigned(nfs_xid); + *tl++ = rpc_call; + *tl++ = rpc_vers; + if (nqnfs) { + *tl++ = txdr_unsigned(NQNFS_PROG); + *tl++ = txdr_unsigned(NQNFS_VER1); + } else { + *tl++ = txdr_unsigned(NFS_PROG); + *tl++ = txdr_unsigned(NFS_VER2); + } + *tl++ = txdr_unsigned(procid); + + /* + * And then the authorization cred. 
+ */ + *tl++ = txdr_unsigned(auth_type); + *tl = txdr_unsigned(authsiz); + switch (auth_type) { + case RPCAUTH_UNIX: + nfsm_build(tl, u_long *, auth_len); + *tl++ = 0; /* stamp ?? */ + *tl++ = 0; /* NULL hostname */ + *tl++ = txdr_unsigned(cr->cr_uid); + *tl++ = txdr_unsigned(cr->cr_groups[0]); + grpsiz = (auth_len >> 2) - 5; + *tl++ = txdr_unsigned(grpsiz); + for (i = 1; i <= grpsiz; i++) + *tl++ = txdr_unsigned(cr->cr_groups[i]); + break; + case RPCAUTH_NQNFS: + nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); + *tl++ = txdr_unsigned(cr->cr_uid); + *tl = txdr_unsigned(auth_len); + siz = auth_len; + while (siz > 0) { + if (M_TRAILINGSPACE(mb) == 0) { + MGET(mb2, M_WAIT, MT_DATA); + if (siz >= MINCLSIZE) + MCLGET(mb2, M_WAIT); + mb->m_next = mb2; + mb = mb2; + mb->m_len = 0; + bpos = mtod(mb, caddr_t); + } + i = min(siz, M_TRAILINGSPACE(mb)); + bcopy(auth_str, bpos, i); + mb->m_len += i; + auth_str += i; + bpos += i; + siz -= i; + } + if ((siz = (nfsm_rndup(auth_len) - auth_len)) > 0) { + for (i = 0; i < siz; i++) + *bpos++ = '\0'; + mb->m_len += siz; + } + break; + }; + nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); + *tl++ = txdr_unsigned(RPCAUTH_NULL); + *tl = 0; + mb->m_next = mrest; + mreq->m_pkthdr.len = authsiz + 10*NFSX_UNSIGNED + mrest_len; + mreq->m_pkthdr.rcvif = (struct ifnet *)0; + *mbp = mb; + return (mreq); +} + +/* + * copies mbuf chain to the uio scatter/gather list + */ +nfsm_mbuftouio(mrep, uiop, siz, dpos) + struct mbuf **mrep; + register struct uio *uiop; + int siz; + caddr_t *dpos; +{ + register char *mbufcp, *uiocp; + register int xfer, left, len; + register struct mbuf *mp; + long uiosiz, rem; + int error = 0; + + mp = *mrep; + mbufcp = *dpos; + len = mtod(mp, caddr_t)+mp->m_len-mbufcp; + rem = nfsm_rndup(siz)-siz; + while (siz > 0) { + if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) + return (EFBIG); + left = uiop->uio_iov->iov_len; + uiocp = uiop->uio_iov->iov_base; + if (left > siz) + left = siz; + uiosiz = left; + while (left > 0) { + while 
(len == 0) { + mp = mp->m_next; + if (mp == NULL) + return (EBADRPC); + mbufcp = mtod(mp, caddr_t); + len = mp->m_len; + } + xfer = (left > len) ? len : left; +#ifdef notdef + /* Not Yet.. */ + if (uiop->uio_iov->iov_op != NULL) + (*(uiop->uio_iov->iov_op)) + (mbufcp, uiocp, xfer); + else +#endif + if (uiop->uio_segflg == UIO_SYSSPACE) + bcopy(mbufcp, uiocp, xfer); + else + copyout(mbufcp, uiocp, xfer); + left -= xfer; + len -= xfer; + mbufcp += xfer; + uiocp += xfer; + uiop->uio_offset += xfer; + uiop->uio_resid -= xfer; + } + if (uiop->uio_iov->iov_len <= siz) { + uiop->uio_iovcnt--; + uiop->uio_iov++; + } else { + uiop->uio_iov->iov_base += uiosiz; + uiop->uio_iov->iov_len -= uiosiz; + } + siz -= uiosiz; + } + *dpos = mbufcp; + *mrep = mp; + if (rem > 0) { + if (len < rem) + error = nfs_adv(mrep, dpos, rem, len); + else + *dpos += rem; + } + return (error); +} + +/* + * copies a uio scatter/gather list to an mbuf chain... + */ +nfsm_uiotombuf(uiop, mq, siz, bpos) + register struct uio *uiop; + struct mbuf **mq; + int siz; + caddr_t *bpos; +{ + register char *uiocp; + register struct mbuf *mp, *mp2; + register int xfer, left, mlen; + int uiosiz, clflg, rem; + char *cp; + + if (siz > MLEN) /* or should it >= MCLBYTES ?? */ + clflg = 1; + else + clflg = 0; + rem = nfsm_rndup(siz)-siz; + mp = mp2 = *mq; + while (siz > 0) { + if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) + return (EINVAL); + left = uiop->uio_iov->iov_len; + uiocp = uiop->uio_iov->iov_base; + if (left > siz) + left = siz; + uiosiz = left; + while (left > 0) { + mlen = M_TRAILINGSPACE(mp); + if (mlen == 0) { + MGET(mp, M_WAIT, MT_DATA); + if (clflg) + MCLGET(mp, M_WAIT); + mp->m_len = 0; + mp2->m_next = mp; + mp2 = mp; + mlen = M_TRAILINGSPACE(mp); + } + xfer = (left > mlen) ? mlen : left; +#ifdef notdef + /* Not Yet.. 
*/ + if (uiop->uio_iov->iov_op != NULL) + (*(uiop->uio_iov->iov_op)) + (uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); + else +#endif + if (uiop->uio_segflg == UIO_SYSSPACE) + bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); + else + copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); + mp->m_len += xfer; + left -= xfer; + uiocp += xfer; + uiop->uio_offset += xfer; + uiop->uio_resid -= xfer; + } + if (uiop->uio_iov->iov_len <= siz) { + uiop->uio_iovcnt--; + uiop->uio_iov++; + } else { + uiop->uio_iov->iov_base += uiosiz; + uiop->uio_iov->iov_len -= uiosiz; + } + siz -= uiosiz; + } + if (rem > 0) { + if (rem > M_TRAILINGSPACE(mp)) { + MGET(mp, M_WAIT, MT_DATA); + mp->m_len = 0; + mp2->m_next = mp; + } + cp = mtod(mp, caddr_t)+mp->m_len; + for (left = 0; left < rem; left++) + *cp++ = '\0'; + mp->m_len += rem; + *bpos = cp; + } else + *bpos = mtod(mp, caddr_t)+mp->m_len; + *mq = mp; + return (0); +} + +/* + * Help break down an mbuf chain by setting the first siz bytes contiguous + * pointed to by returned val. + * This is used by the macros nfsm_dissect and nfsm_dissecton for tough + * cases. (The macros use the vars. 
dpos and dpos2) + */ +nfsm_disct(mdp, dposp, siz, left, cp2) + struct mbuf **mdp; + caddr_t *dposp; + int siz; + int left; + caddr_t *cp2; +{ + register struct mbuf *mp, *mp2; + register int siz2, xfer; + register caddr_t p; + + mp = *mdp; + while (left == 0) { + *mdp = mp = mp->m_next; + if (mp == NULL) + return (EBADRPC); + left = mp->m_len; + *dposp = mtod(mp, caddr_t); + } + if (left >= siz) { + *cp2 = *dposp; + *dposp += siz; + } else if (mp->m_next == NULL) { + return (EBADRPC); + } else if (siz > MHLEN) { + panic("nfs S too big"); + } else { + MGET(mp2, M_WAIT, MT_DATA); + mp2->m_next = mp->m_next; + mp->m_next = mp2; + mp->m_len -= left; + mp = mp2; + *cp2 = p = mtod(mp, caddr_t); + bcopy(*dposp, p, left); /* Copy what was left */ + siz2 = siz-left; + p += left; + mp2 = mp->m_next; + /* Loop around copying up the siz2 bytes */ + while (siz2 > 0) { + if (mp2 == NULL) + return (EBADRPC); + xfer = (siz2 > mp2->m_len) ? mp2->m_len : siz2; + if (xfer > 0) { + bcopy(mtod(mp2, caddr_t), p, xfer); + NFSMADV(mp2, xfer); + mp2->m_len -= xfer; + p += xfer; + siz2 -= xfer; + } + if (siz2 > 0) + mp2 = mp2->m_next; + } + mp->m_len = siz; + *mdp = mp2; + *dposp = mtod(mp2, caddr_t); + } + return (0); +} + +/* + * Advance the position in the mbuf chain. + */ +nfs_adv(mdp, dposp, offs, left) + struct mbuf **mdp; + caddr_t *dposp; + int offs; + int left; +{ + register struct mbuf *m; + register int s; + + m = *mdp; + s = left; + while (s < offs) { + offs -= s; + m = m->m_next; + if (m == NULL) + return (EBADRPC); + s = m->m_len; + } + *mdp = m; + *dposp = mtod(m, caddr_t)+offs; + return (0); +} + +/* + * Copy a string into mbufs for the hard cases... 
+ */ +nfsm_strtmbuf(mb, bpos, cp, siz) + struct mbuf **mb; + char **bpos; + char *cp; + long siz; +{ + register struct mbuf *m1, *m2; + long left, xfer, len, tlen; + u_long *tl; + int putsize; + + putsize = 1; + m2 = *mb; + left = M_TRAILINGSPACE(m2); + if (left > 0) { + tl = ((u_long *)(*bpos)); + *tl++ = txdr_unsigned(siz); + putsize = 0; + left -= NFSX_UNSIGNED; + m2->m_len += NFSX_UNSIGNED; + if (left > 0) { + bcopy(cp, (caddr_t) tl, left); + siz -= left; + cp += left; + m2->m_len += left; + left = 0; + } + } + /* Loop around adding mbufs */ + while (siz > 0) { + MGET(m1, M_WAIT, MT_DATA); + if (siz > MLEN) + MCLGET(m1, M_WAIT); + m1->m_len = NFSMSIZ(m1); + m2->m_next = m1; + m2 = m1; + tl = mtod(m1, u_long *); + tlen = 0; + if (putsize) { + *tl++ = txdr_unsigned(siz); + m1->m_len -= NFSX_UNSIGNED; + tlen = NFSX_UNSIGNED; + putsize = 0; + } + if (siz < m1->m_len) { + len = nfsm_rndup(siz); + xfer = siz; + if (xfer < len) + *(tl+(xfer>>2)) = 0; + } else { + xfer = len = m1->m_len; + } + bcopy(cp, (caddr_t) tl, xfer); + m1->m_len = len+tlen; + siz -= xfer; + cp += xfer; + } + *mb = m1; + *bpos = mtod(m1, caddr_t)+m1->m_len; + return (0); +} + +/* + * Called once to initialize data structures... 
+ */ +nfs_init() +{ + register int i; + + nfsrtt.pos = 0; + rpc_vers = txdr_unsigned(RPC_VER2); + rpc_call = txdr_unsigned(RPC_CALL); + rpc_reply = txdr_unsigned(RPC_REPLY); + rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED); + rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED); + rpc_mismatch = txdr_unsigned(RPC_MISMATCH); + rpc_autherr = txdr_unsigned(RPC_AUTHERR); + rpc_rejectedcred = txdr_unsigned(AUTH_REJECTCRED); + rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX); + rpc_auth_kerb = txdr_unsigned(RPCAUTH_NQNFS); + nfs_vers = txdr_unsigned(NFS_VER2); + nfs_prog = txdr_unsigned(NFS_PROG); + nfs_true = txdr_unsigned(TRUE); + nfs_false = txdr_unsigned(FALSE); + /* Loop thru nfs procids */ + for (i = 0; i < NFS_NPROCS; i++) + nfs_procids[i] = txdr_unsigned(i); + /* Ensure async daemons disabled */ + for (i = 0; i < NFS_MAXASYNCDAEMON; i++) + nfs_iodwant[i] = (struct proc *)0; + TAILQ_INIT(&nfs_bufq); + nfs_xdrneg1 = txdr_unsigned(-1); + nfs_nhinit(); /* Init the nfsnode table */ + nfsrv_init(0); /* Init server data structures */ + nfsrv_initcache(); /* Init the server request cache */ + + /* + * Initialize the nqnfs server stuff. + */ + if (nqnfsstarttime == 0) { + nqnfsstarttime = boottime.tv_sec + nqsrv_maxlease + + nqsrv_clockskew + nqsrv_writeslack; + NQLOADNOVRAM(nqnfsstarttime); + nqnfs_prog = txdr_unsigned(NQNFS_PROG); + nqnfs_vers = txdr_unsigned(NQNFS_VER1); + nqthead.th_head[0] = &nqthead; + nqthead.th_head[1] = &nqthead; + nqfhead = hashinit(NQLCHSZ, M_NQLEASE, &nqfheadhash); + } + + /* + * Initialize reply list and start timer + */ + nfsreqh.r_prev = nfsreqh.r_next = &nfsreqh; + nfs_timer(); +} + +/* + * Attribute cache routines. 
+ * nfs_loadattrcache() - loads or updates the cache contents from attributes + * that are on the mbuf list + * nfs_getattrcache() - returns valid attributes if found in cache, returns + * error otherwise + */ + +/* + * Load the attribute cache (that lives in the nfsnode entry) with + * the values on the mbuf list and + * Iff vap not NULL + * copy the attributes to *vaper + */ +nfs_loadattrcache(vpp, mdp, dposp, vaper) + struct vnode **vpp; + struct mbuf **mdp; + caddr_t *dposp; + struct vattr *vaper; +{ + register struct vnode *vp = *vpp; + register struct vattr *vap; + register struct nfsv2_fattr *fp; + extern int (**spec_nfsv2nodeop_p)(); + register struct nfsnode *np, *nq, **nhpp; + register long t1; + caddr_t dpos, cp2; + int error = 0, isnq; + struct mbuf *md; + enum vtype vtyp; + u_short vmode; + long rdev; + struct timespec mtime; + struct vnode *nvp; + + md = *mdp; + dpos = *dposp; + t1 = (mtod(md, caddr_t) + md->m_len) - dpos; + isnq = (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS); + if (error = nfsm_disct(&md, &dpos, NFSX_FATTR(isnq), t1, &cp2)) + return (error); + fp = (struct nfsv2_fattr *)cp2; + vtyp = nfstov_type(fp->fa_type); + vmode = fxdr_unsigned(u_short, fp->fa_mode); + if (vtyp == VNON || vtyp == VREG) + vtyp = IFTOVT(vmode); + if (isnq) { + rdev = fxdr_unsigned(long, fp->fa_nqrdev); + fxdr_nqtime(&fp->fa_nqmtime, &mtime); + } else { + rdev = fxdr_unsigned(long, fp->fa_nfsrdev); + fxdr_nfstime(&fp->fa_nfsmtime, &mtime); + } + /* + * If v_type == VNON it is a new node, so fill in the v_type, + * n_mtime fields. Check to see if it represents a special + * device, and if so, check for a possible alias. Once the + * correct vnode has been obtained, fill in the rest of the + * information. 
+ */ + np = VTONFS(vp); + if (vp->v_type == VNON) { + if (vtyp == VCHR && rdev == 0xffffffff) + vp->v_type = vtyp = VFIFO; + else + vp->v_type = vtyp; + if (vp->v_type == VFIFO) { +#ifdef FIFO + extern int (**fifo_nfsv2nodeop_p)(); + vp->v_op = fifo_nfsv2nodeop_p; +#else + return (EOPNOTSUPP); +#endif /* FIFO */ + } + if (vp->v_type == VCHR || vp->v_type == VBLK) { + vp->v_op = spec_nfsv2nodeop_p; + if (nvp = checkalias(vp, (dev_t)rdev, vp->v_mount)) { + /* + * Discard unneeded vnode, but save its nfsnode. + */ + if (nq = np->n_forw) + nq->n_back = np->n_back; + *np->n_back = nq; + nvp->v_data = vp->v_data; + vp->v_data = NULL; + vp->v_op = spec_vnodeop_p; + vrele(vp); + vgone(vp); + /* + * Reinitialize aliased node. + */ + np->n_vnode = nvp; + nhpp = (struct nfsnode **)nfs_hash(&np->n_fh); + if (nq = *nhpp) + nq->n_back = &np->n_forw; + np->n_forw = nq; + np->n_back = nhpp; + *nhpp = np; + *vpp = vp = nvp; + } + } + np->n_mtime = mtime.ts_sec; + } + vap = &np->n_vattr; + vap->va_type = vtyp; + vap->va_mode = (vmode & 07777); + vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); + vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); + vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); + vap->va_rdev = (dev_t)rdev; + vap->va_mtime = mtime; + vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; + if (isnq) { + fxdr_hyper(&fp->fa_nqsize, &vap->va_size); + vap->va_blocksize = fxdr_unsigned(long, fp->fa_nqblocksize); + fxdr_hyper(&fp->fa_nqbytes, &vap->va_bytes); + vap->va_fileid = fxdr_unsigned(long, fp->fa_nqfileid); + fxdr_nqtime(&fp->fa_nqatime, &vap->va_atime); + vap->va_flags = fxdr_unsigned(u_long, fp->fa_nqflags); + fxdr_nqtime(&fp->fa_nqctime, &vap->va_ctime); + vap->va_gen = fxdr_unsigned(u_long, fp->fa_nqgen); + fxdr_hyper(&fp->fa_nqfilerev, &vap->va_filerev); + } else { + vap->va_size = fxdr_unsigned(u_long, fp->fa_nfssize); + vap->va_blocksize = fxdr_unsigned(long, fp->fa_nfsblocksize); + vap->va_bytes = fxdr_unsigned(long, fp->fa_nfsblocks) * NFS_FABLKSIZE; + 
vap->va_fileid = fxdr_unsigned(long, fp->fa_nfsfileid); + fxdr_nfstime(&fp->fa_nfsatime, &vap->va_atime); + vap->va_flags = 0; + vap->va_ctime.ts_sec = fxdr_unsigned(long, fp->fa_nfsctime.nfs_sec); + vap->va_ctime.ts_nsec = 0; + vap->va_gen = fxdr_unsigned(u_long, fp->fa_nfsctime.nfs_usec); + vap->va_filerev = 0; + } + if (vap->va_size != np->n_size) { + if (vap->va_type == VREG) { + if (np->n_flag & NMODIFIED) { + if (vap->va_size < np->n_size) + vap->va_size = np->n_size; + else + np->n_size = vap->va_size; + } else + np->n_size = vap->va_size; + vnode_pager_setsize(vp, (u_long)np->n_size); + } else + np->n_size = vap->va_size; + } + np->n_attrstamp = time.tv_sec; + *dposp = dpos; + *mdp = md; + if (vaper != NULL) { + bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap)); +#ifdef notdef + if ((np->n_flag & NMODIFIED) && np->n_size > vap->va_size) + if (np->n_size > vap->va_size) + vaper->va_size = np->n_size; +#endif + if (np->n_flag & NCHG) { + if (np->n_flag & NACC) { + vaper->va_atime.ts_sec = np->n_atim.tv_sec; + vaper->va_atime.ts_nsec = + np->n_atim.tv_usec * 1000; + } + if (np->n_flag & NUPD) { + vaper->va_mtime.ts_sec = np->n_mtim.tv_sec; + vaper->va_mtime.ts_nsec = + np->n_mtim.tv_usec * 1000; + } + } + } + return (0); +} + +/* + * Check the time stamp + * If the cache is valid, copy contents to *vap and return 0 + * otherwise return an error + */ +nfs_getattrcache(vp, vaper) + register struct vnode *vp; + struct vattr *vaper; +{ + register struct nfsnode *np = VTONFS(vp); + register struct vattr *vap; + + if (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQLOOKLEASE) { + if (!NQNFS_CKCACHABLE(vp, NQL_READ) || np->n_attrstamp == 0) { + nfsstats.attrcache_misses++; + return (ENOENT); + } + } else if ((time.tv_sec - np->n_attrstamp) >= NFS_ATTRTIMEO(np)) { + nfsstats.attrcache_misses++; + return (ENOENT); + } + nfsstats.attrcache_hits++; + vap = &np->n_vattr; + if (vap->va_size != np->n_size) { + if (vap->va_type == VREG) { + if (np->n_flag & NMODIFIED) { + if 
(vap->va_size < np->n_size) + vap->va_size = np->n_size; + else + np->n_size = vap->va_size; + } else + np->n_size = vap->va_size; + vnode_pager_setsize(vp, (u_long)np->n_size); + } else + np->n_size = vap->va_size; + } + bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); +#ifdef notdef + if ((np->n_flag & NMODIFIED) == 0) { + np->n_size = vaper->va_size; + vnode_pager_setsize(vp, (u_long)np->n_size); + } else if (np->n_size > vaper->va_size) + if (np->n_size > vaper->va_size) + vaper->va_size = np->n_size; +#endif + if (np->n_flag & NCHG) { + if (np->n_flag & NACC) { + vaper->va_atime.ts_sec = np->n_atim.tv_sec; + vaper->va_atime.ts_nsec = np->n_atim.tv_usec * 1000; + } + if (np->n_flag & NUPD) { + vaper->va_mtime.ts_sec = np->n_mtim.tv_sec; + vaper->va_mtime.ts_nsec = np->n_mtim.tv_usec * 1000; + } + } + return (0); +} + +/* + * Set up nameidata for a lookup() call and do it + */ +nfs_namei(ndp, fhp, len, slp, nam, mdp, dposp, p) + register struct nameidata *ndp; + fhandle_t *fhp; + int len; + struct nfssvc_sock *slp; + struct mbuf *nam; + struct mbuf **mdp; + caddr_t *dposp; + struct proc *p; +{ + register int i, rem; + register struct mbuf *md; + register char *fromcp, *tocp; + struct vnode *dp; + int error, rdonly; + struct componentname *cnp = &ndp->ni_cnd; + + MALLOC(cnp->cn_pnbuf, char *, len + 1, M_NAMEI, M_WAITOK); + /* + * Copy the name from the mbuf list to ndp->ni_pnbuf + * and set the various ndp fields appropriately. 
+ */ + fromcp = *dposp; + tocp = cnp->cn_pnbuf; + md = *mdp; + rem = mtod(md, caddr_t) + md->m_len - fromcp; + cnp->cn_hash = 0; + for (i = 0; i < len; i++) { + while (rem == 0) { + md = md->m_next; + if (md == NULL) { + error = EBADRPC; + goto out; + } + fromcp = mtod(md, caddr_t); + rem = md->m_len; + } + if (*fromcp == '\0' || *fromcp == '/') { + error = EINVAL; + goto out; + } + cnp->cn_hash += (unsigned char)*fromcp; + *tocp++ = *fromcp++; + rem--; + } + *tocp = '\0'; + *mdp = md; + *dposp = fromcp; + len = nfsm_rndup(len)-len; + if (len > 0) { + if (rem >= len) + *dposp += len; + else if (error = nfs_adv(mdp, dposp, len, rem)) + goto out; + } + ndp->ni_pathlen = tocp - cnp->cn_pnbuf; + cnp->cn_nameptr = cnp->cn_pnbuf; + /* + * Extract and set starting directory. + */ + if (error = nfsrv_fhtovp(fhp, FALSE, &dp, ndp->ni_cnd.cn_cred, slp, + nam, &rdonly)) + goto out; + if (dp->v_type != VDIR) { + vrele(dp); + error = ENOTDIR; + goto out; + } + ndp->ni_startdir = dp; + if (rdonly) + cnp->cn_flags |= (NOCROSSMOUNT | RDONLY); + else + cnp->cn_flags |= NOCROSSMOUNT; + /* + * And call lookup() to do the real work + */ + cnp->cn_proc = p; + if (error = lookup(ndp)) + goto out; + /* + * Check for encountering a symbolic link + */ + if (cnp->cn_flags & ISSYMLINK) { + if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) + vput(ndp->ni_dvp); + else + vrele(ndp->ni_dvp); + vput(ndp->ni_vp); + ndp->ni_vp = NULL; + error = EINVAL; + goto out; + } + /* + * Check for saved name request + */ + if (cnp->cn_flags & (SAVENAME | SAVESTART)) { + cnp->cn_flags |= HASBUF; + return (0); + } +out: + FREE(cnp->cn_pnbuf, M_NAMEI); + return (error); +} + +/* + * A fiddled version of m_adj() that ensures null fill to a long + * boundary and only trims off the back end + */ +void +nfsm_adj(mp, len, nul) + struct mbuf *mp; + register int len; + int nul; +{ + register struct mbuf *m; + register int count, i; + register char *cp; + + /* + * Trim from tail. 
Scan the mbuf chain, + * calculating its length and finding the last mbuf. + * If the adjustment only affects this mbuf, then just + * adjust and return. Otherwise, rescan and truncate + * after the remaining size. + */ + count = 0; + m = mp; + for (;;) { + count += m->m_len; + if (m->m_next == (struct mbuf *)0) + break; + m = m->m_next; + } + if (m->m_len > len) { + m->m_len -= len; + if (nul > 0) { + cp = mtod(m, caddr_t)+m->m_len-nul; + for (i = 0; i < nul; i++) + *cp++ = '\0'; + } + return; + } + count -= len; + if (count < 0) + count = 0; + /* + * Correct length for chain is "count". + * Find the mbuf with last data, adjust its length, + * and toss data from remaining mbufs on chain. + */ + for (m = mp; m; m = m->m_next) { + if (m->m_len >= count) { + m->m_len = count; + if (nul > 0) { + cp = mtod(m, caddr_t)+m->m_len-nul; + for (i = 0; i < nul; i++) + *cp++ = '\0'; + } + break; + } + count -= m->m_len; + } + while (m = m->m_next) + m->m_len = 0; +} + +/* + * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) + * - look up fsid in mount list (if not found ret error) + * - get vp and export rights by calling VFS_FHTOVP() + * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon + * - if not lockflag unlock it with VOP_UNLOCK() + */ +nfsrv_fhtovp(fhp, lockflag, vpp, cred, slp, nam, rdonlyp) + fhandle_t *fhp; + int lockflag; + struct vnode **vpp; + struct ucred *cred; + struct nfssvc_sock *slp; + struct mbuf *nam; + int *rdonlyp; +{ + register struct mount *mp; + register struct nfsuid *uidp; + register int i; + struct ucred *credanon; + int error, exflags; + + *vpp = (struct vnode *)0; + if ((mp = getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if (error = VFS_FHTOVP(mp, &fhp->fh_fid, nam, vpp, &exflags, &credanon)) + return (error); + /* + * Check/setup credentials. 
+ */ + if (exflags & MNT_EXKERB) { + uidp = slp->ns_uidh[NUIDHASH(cred->cr_uid)]; + while (uidp) { + if (uidp->nu_uid == cred->cr_uid) + break; + uidp = uidp->nu_hnext; + } + if (uidp) { + cred->cr_uid = uidp->nu_cr.cr_uid; + for (i = 0; i < uidp->nu_cr.cr_ngroups; i++) + cred->cr_groups[i] = uidp->nu_cr.cr_groups[i]; + } else { + vput(*vpp); + return (NQNFS_AUTHERR); + } + } else if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { + cred->cr_uid = credanon->cr_uid; + for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++) + cred->cr_groups[i] = credanon->cr_groups[i]; + } + if (exflags & MNT_EXRDONLY) + *rdonlyp = 1; + else + *rdonlyp = 0; + if (!lockflag) + VOP_UNLOCK(*vpp); + return (0); +} + +/* + * This function compares two net addresses by family and returns TRUE + * if they are the same host. + * If there is any doubt, return FALSE. + * The AF_INET family is handled as a special case so that address mbufs + * don't need to be saved to store "struct in_addr", which is only 4 bytes. 
+ */ +netaddr_match(family, haddr, nam) + int family; + union nethostaddr *haddr; + struct mbuf *nam; +{ + register struct sockaddr_in *inetaddr; + + switch (family) { + case AF_INET: + inetaddr = mtod(nam, struct sockaddr_in *); + if (inetaddr->sin_family == AF_INET && + inetaddr->sin_addr.s_addr == haddr->had_inetaddr) + return (1); + break; +#ifdef ISO + case AF_ISO: + { + register struct sockaddr_iso *isoaddr1, *isoaddr2; + + isoaddr1 = mtod(nam, struct sockaddr_iso *); + isoaddr2 = mtod(haddr->had_nam, struct sockaddr_iso *); + if (isoaddr1->siso_family == AF_ISO && + isoaddr1->siso_nlen > 0 && + isoaddr1->siso_nlen == isoaddr2->siso_nlen && + SAME_ISOADDR(isoaddr1, isoaddr2)) + return (1); + break; + } +#endif /* ISO */ + default: + break; + }; + return (0); +} diff --git a/sys/nfsclient/nfs_vfsops.c b/sys/nfsclient/nfs_vfsops.c new file mode 100644 index 00000000000..1f186760689 --- /dev/null +++ b/sys/nfsclient/nfs_vfsops.c @@ -0,0 +1,740 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs_vfsops.c 8.3 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * nfs vfs operations. + */ +struct vfsops nfs_vfsops = { + nfs_mount, + nfs_start, + nfs_unmount, + nfs_root, + nfs_quotactl, + nfs_statfs, + nfs_sync, + nfs_vget, + nfs_fhtovp, + nfs_vptofh, + nfs_init, +}; + +/* + * This structure must be filled in by a primary bootstrap or bootstrap + * server for a diskless/dataless machine. It is initialized below just + * to ensure that it is allocated to initialized data (.data not .bss). 
+ */ +struct nfs_diskless nfs_diskless = { 0 }; + +extern u_long nfs_procids[NFS_NPROCS]; +extern u_long nfs_prog, nfs_vers; +void nfs_disconnect __P((struct nfsmount *)); +void nfsargs_ntoh __P((struct nfs_args *)); +static struct mount *nfs_mountdiskless __P((char *, char *, int, + struct sockaddr_in *, struct nfs_args *, register struct vnode **)); + +#define TRUE 1 +#define FALSE 0 + +/* + * nfs statfs call + */ +int +nfs_statfs(mp, sbp, p) + struct mount *mp; + register struct statfs *sbp; + struct proc *p; +{ + register struct vnode *vp; + register struct nfsv2_statfs *sfp; + register caddr_t cp; + register long t1; + caddr_t bpos, dpos, cp2; + int error = 0, isnq; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct nfsmount *nmp; + struct ucred *cred; + struct nfsnode *np; + + nmp = VFSTONFS(mp); + isnq = (nmp->nm_flag & NFSMNT_NQNFS); + if (error = nfs_nget(mp, &nmp->nm_fh, &np)) + return (error); + vp = NFSTOV(np); + nfsstats.rpccnt[NFSPROC_STATFS]++; + cred = crget(); + cred->cr_ngroups = 1; + nfsm_reqhead(vp, NFSPROC_STATFS, NFSX_FH); + nfsm_fhtom(vp); + nfsm_request(vp, NFSPROC_STATFS, p, cred); + nfsm_dissect(sfp, struct nfsv2_statfs *, NFSX_STATFS(isnq)); + sbp->f_type = MOUNT_NFS; + sbp->f_flags = nmp->nm_flag; + sbp->f_iosize = NFS_MAXDGRAMDATA; + sbp->f_bsize = fxdr_unsigned(long, sfp->sf_bsize); + sbp->f_blocks = fxdr_unsigned(long, sfp->sf_blocks); + sbp->f_bfree = fxdr_unsigned(long, sfp->sf_bfree); + sbp->f_bavail = fxdr_unsigned(long, sfp->sf_bavail); + if (isnq) { + sbp->f_files = fxdr_unsigned(long, sfp->sf_files); + sbp->f_ffree = fxdr_unsigned(long, sfp->sf_ffree); + } else { + sbp->f_files = 0; + sbp->f_ffree = 0; + } + if (sbp != &mp->mnt_stat) { + bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); + bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); + } + nfsm_reqdone; + vrele(vp); + crfree(cred); + return (error); +} + +/* + * Mount a remote root fs via. nfs. 
This depends on the info in the + * nfs_diskless structure that has been filled in properly by some primary + * bootstrap. + * It goes something like this: + * - do enough of "ifconfig" by calling ifioctl() so that the system + * can talk to the server + * - If nfs_diskless.mygateway is filled in, use that address as + * a default gateway. + * - hand craft the swap nfs vnode hanging off a fake mount point + * if swdevt[0].sw_dev == NODEV + * - build the rootfs mount point and call mountnfs() to do the rest. + */ +int +nfs_mountroot() +{ + register struct mount *mp; + register struct nfs_diskless *nd = &nfs_diskless; + struct socket *so; + struct vnode *vp; + struct proc *p = curproc; /* XXX */ + int error, i; + + /* + * XXX time must be non-zero when we init the interface or else + * the arp code will wedge... + */ + if (time.tv_sec == 0) + time.tv_sec = 1; + +#ifdef notyet + /* Set up swap credentials. */ + proc0.p_ucred->cr_uid = ntohl(nd->swap_ucred.cr_uid); + proc0.p_ucred->cr_gid = ntohl(nd->swap_ucred.cr_gid); + if ((proc0.p_ucred->cr_ngroups = ntohs(nd->swap_ucred.cr_ngroups)) > + NGROUPS) + proc0.p_ucred->cr_ngroups = NGROUPS; + for (i = 0; i < proc0.p_ucred->cr_ngroups; i++) + proc0.p_ucred->cr_groups[i] = ntohl(nd->swap_ucred.cr_groups[i]); +#endif + + /* + * Do enough of ifconfig(8) so that the critical net interface can + * talk to the server. + */ + if (error = socreate(nd->myif.ifra_addr.sa_family, &so, SOCK_DGRAM, 0)) + panic("nfs_mountroot: socreate: %d", error); + if (error = ifioctl(so, SIOCAIFADDR, (caddr_t)&nd->myif, p)) + panic("nfs_mountroot: SIOCAIFADDR: %d", error); + soclose(so); + + /* + * If the gateway field is filled in, set it as the default route. 
+ */ + if (nd->mygateway.sin_len != 0) { + struct sockaddr_in mask, sin; + + bzero((caddr_t)&mask, sizeof(mask)); + sin = mask; + sin.sin_family = AF_INET; + sin.sin_len = sizeof(sin); + if (error = rtrequest(RTM_ADD, (struct sockaddr *)&sin, + (struct sockaddr *)&nd->mygateway, + (struct sockaddr *)&mask, + RTF_UP | RTF_GATEWAY, (struct rtentry **)0)) + panic("nfs_mountroot: RTM_ADD: %d", error); + } + + /* + * If swapping to an nfs node (indicated by swdevt[0].sw_dev == NODEV): + * Create a fake mount point just for the swap vnode so that the + * swap file can be on a different server from the rootfs. + */ + if (swdevt[0].sw_dev == NODEV) { + nd->swap_args.fh = (nfsv2fh_t *)nd->swap_fh; + (void) nfs_mountdiskless(nd->swap_hostnam, "/swap", 0, + &nd->swap_saddr, &nd->swap_args, &vp); + + /* + * Since the swap file is not the root dir of a file system, + * hack it to a regular file. + */ + vp->v_type = VREG; + vp->v_flag = 0; + swapdev_vp = vp; + VREF(vp); + swdevt[0].sw_vp = vp; + swdevt[0].sw_nblks = ntohl(nd->swap_nblks); + } else if (bdevvp(swapdev, &swapdev_vp)) + panic("nfs_mountroot: can't setup swapdev_vp"); + + /* + * Create the rootfs mount point. + */ + nd->root_args.fh = (nfsv2fh_t *)nd->root_fh; + mp = nfs_mountdiskless(nd->root_hostnam, "/", MNT_RDONLY, + &nd->root_saddr, &nd->root_args, &vp); + + if (vfs_lock(mp)) + panic("nfs_mountroot: vfs_lock"); + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mp->mnt_flag |= MNT_ROOTFS; + mp->mnt_vnodecovered = NULLVP; + vfs_unlock(mp); + rootvp = vp; + + /* + * This is not really an nfs issue, but it is much easier to + * set hostname here and then let the "/etc/rc.xxx" files + * mount the right /var based upon its preset value. 
+ */ + bcopy(nd->my_hostnam, hostname, MAXHOSTNAMELEN); + hostname[MAXHOSTNAMELEN - 1] = '\0'; + for (i = 0; i < MAXHOSTNAMELEN; i++) + if (hostname[i] == '\0') + break; + hostnamelen = i; + inittodr(ntohl(nd->root_time)); + return (0); +} + +/* + * Internal version of mount system call for diskless setup. + */ +static struct mount * +nfs_mountdiskless(path, which, mountflag, sin, args, vpp) + char *path; + char *which; + int mountflag; + struct sockaddr_in *sin; + struct nfs_args *args; + register struct vnode **vpp; +{ + register struct mount *mp; + register struct mbuf *m; + register int error; + + mp = (struct mount *)malloc((u_long)sizeof(struct mount), + M_MOUNT, M_NOWAIT); + if (mp == NULL) + panic("nfs_mountroot: %s mount malloc", which); + bzero((char *)mp, (u_long)sizeof(struct mount)); + mp->mnt_op = &nfs_vfsops; + mp->mnt_flag = mountflag; + + MGET(m, MT_SONAME, M_DONTWAIT); + if (m == NULL) + panic("nfs_mountroot: %s mount mbuf", which); + bcopy((caddr_t)sin, mtod(m, caddr_t), sin->sin_len); + m->m_len = sin->sin_len; + nfsargs_ntoh(args); + if (error = mountnfs(args, mp, m, which, path, vpp)) + panic("nfs_mountroot: mount %s on %s: %d", path, which, error); + + return (mp); +} + +/* + * Convert the integer fields of the nfs_args structure from net byte order + * to host byte order. Called by nfs_mountroot() above. + */ +void +nfsargs_ntoh(nfsp) + register struct nfs_args *nfsp; +{ + + NTOHL(nfsp->sotype); + NTOHL(nfsp->proto); + NTOHL(nfsp->flags); + NTOHL(nfsp->wsize); + NTOHL(nfsp->rsize); + NTOHL(nfsp->timeo); + NTOHL(nfsp->retrans); + NTOHL(nfsp->maxgrouplist); + NTOHL(nfsp->readahead); + NTOHL(nfsp->leaseterm); + NTOHL(nfsp->deadthresh); +} + +/* + * VFS Operations. 
+ * + * mount system call + * It seems a bit dumb to copyinstr() the host and path here and then + * bcopy() them in mountnfs(), but I wanted to detect errors before + * doing the sockargs() call because sockargs() allocates an mbuf and + * an error after that means that I have to release the mbuf. + */ +/* ARGSUSED */ +int +nfs_mount(mp, path, data, ndp, p) + struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + int error; + struct nfs_args args; + struct mbuf *nam; + struct vnode *vp; + char pth[MNAMELEN], hst[MNAMELEN]; + u_int len; + nfsv2fh_t nfh; + + if (error = copyin(data, (caddr_t)&args, sizeof (struct nfs_args))) + return (error); + if (error = copyin((caddr_t)args.fh, (caddr_t)&nfh, sizeof (nfsv2fh_t))) + return (error); + if (error = copyinstr(path, pth, MNAMELEN-1, &len)) + return (error); + bzero(&pth[len], MNAMELEN - len); + if (error = copyinstr(args.hostname, hst, MNAMELEN-1, &len)) + return (error); + bzero(&hst[len], MNAMELEN - len); + /* sockargs() call must be after above copyin() calls */ + if (error = sockargs(&nam, (caddr_t)args.addr, + args.addrlen, MT_SONAME)) + return (error); + args.fh = &nfh; + error = mountnfs(&args, mp, nam, pth, hst, &vp); + return (error); +} + +/* + * Common code for mount and mountroot + */ +int +mountnfs(argp, mp, nam, pth, hst, vpp) + register struct nfs_args *argp; + register struct mount *mp; + struct mbuf *nam; + char *pth, *hst; + struct vnode **vpp; +{ + register struct nfsmount *nmp; + struct nfsnode *np; + int error; + + if (mp->mnt_flag & MNT_UPDATE) { + nmp = VFSTONFS(mp); + /* update paths, file handles, etc, here XXX */ + m_freem(nam); + return (0); + } else { + MALLOC(nmp, struct nfsmount *, sizeof (struct nfsmount), + M_NFSMNT, M_WAITOK); + bzero((caddr_t)nmp, sizeof (struct nfsmount)); + mp->mnt_data = (qaddr_t)nmp; + } + getnewfsid(mp, MOUNT_NFS); + nmp->nm_mountp = mp; + nmp->nm_flag = argp->flags; + if ((nmp->nm_flag & (NFSMNT_NQNFS | NFSMNT_MYWRITE)) == 
+ (NFSMNT_NQNFS | NFSMNT_MYWRITE)) { + error = EPERM; + goto bad; + } + if (nmp->nm_flag & NFSMNT_NQNFS) + /* + * We have to set mnt_maxsymlink to a non-zero value so + * that COMPAT_43 routines will know that we are setting + * the d_type field in directories (and can zero it for + * unsuspecting binaries). + */ + mp->mnt_maxsymlinklen = 1; + nmp->nm_timeo = NFS_TIMEO; + nmp->nm_retry = NFS_RETRANS; + nmp->nm_wsize = NFS_WSIZE; + nmp->nm_rsize = NFS_RSIZE; + nmp->nm_numgrps = NFS_MAXGRPS; + nmp->nm_readahead = NFS_DEFRAHEAD; + nmp->nm_leaseterm = NQ_DEFLEASE; + nmp->nm_deadthresh = NQ_DEADTHRESH; + nmp->nm_tnext = (struct nfsnode *)nmp; + nmp->nm_tprev = (struct nfsnode *)nmp; + nmp->nm_inprog = NULLVP; + bcopy((caddr_t)argp->fh, (caddr_t)&nmp->nm_fh, sizeof(nfsv2fh_t)); + mp->mnt_stat.f_type = MOUNT_NFS; + bcopy(hst, mp->mnt_stat.f_mntfromname, MNAMELEN); + bcopy(pth, mp->mnt_stat.f_mntonname, MNAMELEN); + nmp->nm_nam = nam; + + if ((argp->flags & NFSMNT_TIMEO) && argp->timeo > 0) { + nmp->nm_timeo = (argp->timeo * NFS_HZ + 5) / 10; + if (nmp->nm_timeo < NFS_MINTIMEO) + nmp->nm_timeo = NFS_MINTIMEO; + else if (nmp->nm_timeo > NFS_MAXTIMEO) + nmp->nm_timeo = NFS_MAXTIMEO; + } + + if ((argp->flags & NFSMNT_RETRANS) && argp->retrans > 1) { + nmp->nm_retry = argp->retrans; + if (nmp->nm_retry > NFS_MAXREXMIT) + nmp->nm_retry = NFS_MAXREXMIT; + } + + if ((argp->flags & NFSMNT_WSIZE) && argp->wsize > 0) { + nmp->nm_wsize = argp->wsize; + /* Round down to multiple of blocksize */ + nmp->nm_wsize &= ~0x1ff; + if (nmp->nm_wsize <= 0) + nmp->nm_wsize = 512; + else if (nmp->nm_wsize > NFS_MAXDATA) + nmp->nm_wsize = NFS_MAXDATA; + } + if (nmp->nm_wsize > MAXBSIZE) + nmp->nm_wsize = MAXBSIZE; + + if ((argp->flags & NFSMNT_RSIZE) && argp->rsize > 0) { + nmp->nm_rsize = argp->rsize; + /* Round down to multiple of blocksize */ + nmp->nm_rsize &= ~0x1ff; + if (nmp->nm_rsize <= 0) + nmp->nm_rsize = 512; + else if (nmp->nm_rsize > NFS_MAXDATA) + nmp->nm_rsize = NFS_MAXDATA; + } + 
if (nmp->nm_rsize > MAXBSIZE) + nmp->nm_rsize = MAXBSIZE; + if ((argp->flags & NFSMNT_MAXGRPS) && argp->maxgrouplist >= 0 && + argp->maxgrouplist <= NFS_MAXGRPS) + nmp->nm_numgrps = argp->maxgrouplist; + if ((argp->flags & NFSMNT_READAHEAD) && argp->readahead >= 0 && + argp->readahead <= NFS_MAXRAHEAD) + nmp->nm_readahead = argp->readahead; + if ((argp->flags & NFSMNT_LEASETERM) && argp->leaseterm >= 2 && + argp->leaseterm <= NQ_MAXLEASE) + nmp->nm_leaseterm = argp->leaseterm; + if ((argp->flags & NFSMNT_DEADTHRESH) && argp->deadthresh >= 1 && + argp->deadthresh <= NQ_NEVERDEAD) + nmp->nm_deadthresh = argp->deadthresh; + /* Set up the sockets and per-host congestion */ + nmp->nm_sotype = argp->sotype; + nmp->nm_soproto = argp->proto; + + /* + * For Connection based sockets (TCP,...) defer the connect until + * the first request, in case the server is not responding. + */ + if (nmp->nm_sotype == SOCK_DGRAM && + (error = nfs_connect(nmp, (struct nfsreq *)0))) + goto bad; + + /* + * This is silly, but it has to be set so that vinifod() works. + * We do not want to do an nfs_statfs() here since we can get + * stuck on a dead server and we are holding a lock on the mount + * point. + */ + mp->mnt_stat.f_iosize = NFS_MAXDGRAMDATA; + /* + * A reference count is needed on the nfsnode representing the + * remote root. If this object is not persistent, then backward + * traversals of the mount point (i.e. "..") will not work if + * the nfsnode gets flushed out of the cache. Ufs does not have + * this problem, because one can identify root inodes by their + * number == ROOTINO (2). 
+ */ + if (error = nfs_nget(mp, &nmp->nm_fh, &np)) + goto bad; + *vpp = NFSTOV(np); + + return (0); +bad: + nfs_disconnect(nmp); + free((caddr_t)nmp, M_NFSMNT); + m_freem(nam); + return (error); +} + +/* + * unmount system call + */ +int +nfs_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + register struct nfsmount *nmp; + struct nfsnode *np; + struct vnode *vp; + int error, flags = 0; + extern int doforce; + + if (mntflags & MNT_FORCE) { + if (!doforce || (mp->mnt_flag & MNT_ROOTFS)) + return (EINVAL); + flags |= FORCECLOSE; + } + nmp = VFSTONFS(mp); + /* + * Goes something like this.. + * - Check for activity on the root vnode (other than ourselves). + * - Call vflush() to clear out vnodes for this file system, + * except for the root vnode. + * - Decrement reference on the vnode representing remote root. + * - Close the socket + * - Free up the data structures + */ + /* + * We need to decrement the ref. count on the nfsnode representing + * the remote root. See comment in mountnfs(). The VFS unmount() + * has done vput on this vnode, otherwise we would get deadlock! + */ + if (error = nfs_nget(mp, &nmp->nm_fh, &np)) + return(error); + vp = NFSTOV(np); + if (vp->v_usecount > 2) { + vput(vp); + return (EBUSY); + } + + /* + * Must handshake with nqnfs_clientd() if it is active. + */ + nmp->nm_flag |= NFSMNT_DISMINPROG; + while (nmp->nm_inprog != NULLVP) + (void) tsleep((caddr_t)&lbolt, PSOCK, "nfsdism", 0); + if (error = vflush(mp, vp, flags)) { + vput(vp); + nmp->nm_flag &= ~NFSMNT_DISMINPROG; + return (error); + } + + /* + * We are now committed to the unmount. + * For NQNFS, let the server daemon free the nfsmount structure. + */ + if (nmp->nm_flag & (NFSMNT_NQNFS | NFSMNT_KERB)) + nmp->nm_flag |= NFSMNT_DISMNT; + + /* + * There are two reference counts to get rid of here. 
+ */ + vrele(vp); + vrele(vp); + vgone(vp); + nfs_disconnect(nmp); + m_freem(nmp->nm_nam); + + if ((nmp->nm_flag & (NFSMNT_NQNFS | NFSMNT_KERB)) == 0) + free((caddr_t)nmp, M_NFSMNT); + return (0); +} + +/* + * Return root of a filesystem + */ +int +nfs_root(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + register struct vnode *vp; + struct nfsmount *nmp; + struct nfsnode *np; + int error; + + nmp = VFSTONFS(mp); + if (error = nfs_nget(mp, &nmp->nm_fh, &np)) + return (error); + vp = NFSTOV(np); + vp->v_type = VDIR; + vp->v_flag = VROOT; + *vpp = vp; + return (0); +} + +extern int syncprt; + +/* + * Flush out the buffer cache + */ +/* ARGSUSED */ +int +nfs_sync(mp, waitfor, cred, p) + struct mount *mp; + int waitfor; + struct ucred *cred; + struct proc *p; +{ + register struct vnode *vp; + int error, allerror = 0; + + /* + * Force stale buffer cache information to be flushed. + */ +loop: + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = vp->v_mntvnodes.le_next) { + /* + * If the vnode that we are about to sync is no longer + * associated with this mount point, start over. + */ + if (vp->v_mount != mp) + goto loop; + if (VOP_ISLOCKED(vp) || vp->v_dirtyblkhd.lh_first == NULL) + continue; + if (vget(vp, 1)) + goto loop; + if (error = VOP_FSYNC(vp, cred, waitfor, p)) + allerror = error; + vput(vp); + } + return (allerror); +} + +/* + * NFS flat namespace lookup. + * Currently unsupported. 
+ */ +/* ARGSUSED */ +int +nfs_vget(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + + return (EOPNOTSUPP); +} + +/* + * At this point, this should never happen + */ +/* ARGSUSED */ +int +nfs_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) + register struct mount *mp; + struct fid *fhp; + struct mbuf *nam; + struct vnode **vpp; + int *exflagsp; + struct ucred **credanonp; +{ + + return (EINVAL); +} + +/* + * Vnode pointer to File handle, should never happen either + */ +/* ARGSUSED */ +int +nfs_vptofh(vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + + return (EINVAL); +} + +/* + * Vfs start routine, a no-op. + */ +/* ARGSUSED */ +int +nfs_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + + return (0); +} + +/* + * Do operations associated with quotas, not supported + */ +/* ARGSUSED */ +int +nfs_quotactl(mp, cmd, uid, arg, p) + struct mount *mp; + int cmd; + uid_t uid; + caddr_t arg; + struct proc *p; +{ + + return (EOPNOTSUPP); +} diff --git a/sys/nfsclient/nfs_vnops.c b/sys/nfsclient/nfs_vnops.c new file mode 100644 index 00000000000..a909b48dc67 --- /dev/null +++ b/sys/nfsclient/nfs_vnops.c @@ -0,0 +1,2539 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)nfs_vnops.c 8.5 (Berkeley) 2/13/94 + */ + +/* + * vnode op calls for sun nfs version 2 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Defs */ +#define TRUE 1 +#define FALSE 0 + +/* + * Global vfs data structures for nfs + */ +int (**nfsv2_vnodeop_p)(); +struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, nfs_lookup }, /* lookup */ + { &vop_create_desc, nfs_create }, /* create */ + { &vop_mknod_desc, nfs_mknod }, /* mknod */ + { &vop_open_desc, nfs_open }, /* open */ + { &vop_close_desc, nfs_close }, /* close */ + { &vop_access_desc, nfs_access }, /* access */ + { &vop_getattr_desc, nfs_getattr }, /* getattr */ + { &vop_setattr_desc, nfs_setattr }, /* setattr */ + { &vop_read_desc, nfs_read }, /* read */ + { &vop_write_desc, nfs_write }, /* write */ + { &vop_ioctl_desc, nfs_ioctl }, /* ioctl */ + { &vop_select_desc, nfs_select }, /* select */ + { &vop_mmap_desc, nfs_mmap }, /* mmap */ + { &vop_fsync_desc, nfs_fsync }, /* fsync */ + { &vop_seek_desc, nfs_seek }, /* seek */ + { &vop_remove_desc, nfs_remove }, /* remove */ + { &vop_link_desc, nfs_link }, /* link */ + { &vop_rename_desc, nfs_rename }, /* rename */ + { &vop_mkdir_desc, nfs_mkdir }, /* mkdir */ + { &vop_rmdir_desc, nfs_rmdir }, /* rmdir */ + { &vop_symlink_desc, nfs_symlink }, /* symlink */ + { &vop_readdir_desc, nfs_readdir }, /* readdir */ + { &vop_readlink_desc, nfs_readlink }, /* readlink */ + { &vop_abortop_desc, nfs_abortop }, /* abortop */ + { &vop_inactive_desc, nfs_inactive }, /* inactive */ + { &vop_reclaim_desc, nfs_reclaim }, /* reclaim */ + { &vop_lock_desc, nfs_lock }, /* lock */ + { &vop_unlock_desc, nfs_unlock }, /* unlock */ + { &vop_bmap_desc, nfs_bmap }, /* bmap */ + { &vop_strategy_desc, 
nfs_strategy }, /* strategy */ + { &vop_print_desc, nfs_print }, /* print */ + { &vop_islocked_desc, nfs_islocked }, /* islocked */ + { &vop_pathconf_desc, nfs_pathconf }, /* pathconf */ + { &vop_advlock_desc, nfs_advlock }, /* advlock */ + { &vop_blkatoff_desc, nfs_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, nfs_valloc }, /* valloc */ + { &vop_reallocblks_desc, nfs_reallocblks }, /* reallocblks */ + { &vop_vfree_desc, nfs_vfree }, /* vfree */ + { &vop_truncate_desc, nfs_truncate }, /* truncate */ + { &vop_update_desc, nfs_update }, /* update */ + { &vop_bwrite_desc, vn_bwrite }, + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc nfsv2_vnodeop_opv_desc = + { &nfsv2_vnodeop_p, nfsv2_vnodeop_entries }; + +/* + * Special device vnode ops + */ +int (**spec_nfsv2nodeop_p)(); +struct vnodeopv_entry_desc spec_nfsv2nodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, spec_lookup }, /* lookup */ + { &vop_create_desc, spec_create }, /* create */ + { &vop_mknod_desc, spec_mknod }, /* mknod */ + { &vop_open_desc, spec_open }, /* open */ + { &vop_close_desc, nfsspec_close }, /* close */ + { &vop_access_desc, nfsspec_access }, /* access */ + { &vop_getattr_desc, nfs_getattr }, /* getattr */ + { &vop_setattr_desc, nfs_setattr }, /* setattr */ + { &vop_read_desc, nfsspec_read }, /* read */ + { &vop_write_desc, nfsspec_write }, /* write */ + { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ + { &vop_select_desc, spec_select }, /* select */ + { &vop_mmap_desc, spec_mmap }, /* mmap */ + { &vop_fsync_desc, nfs_fsync }, /* fsync */ + { &vop_seek_desc, spec_seek }, /* seek */ + { &vop_remove_desc, spec_remove }, /* remove */ + { &vop_link_desc, spec_link }, /* link */ + { &vop_rename_desc, spec_rename }, /* rename */ + { &vop_mkdir_desc, spec_mkdir }, /* mkdir */ + { &vop_rmdir_desc, spec_rmdir }, /* rmdir */ + { &vop_symlink_desc, spec_symlink }, /* symlink */ + { &vop_readdir_desc, spec_readdir }, /* readdir */ + { 
&vop_readlink_desc, spec_readlink }, /* readlink */ + { &vop_abortop_desc, spec_abortop }, /* abortop */ + { &vop_inactive_desc, nfs_inactive }, /* inactive */ + { &vop_reclaim_desc, nfs_reclaim }, /* reclaim */ + { &vop_lock_desc, nfs_lock }, /* lock */ + { &vop_unlock_desc, nfs_unlock }, /* unlock */ + { &vop_bmap_desc, spec_bmap }, /* bmap */ + { &vop_strategy_desc, spec_strategy }, /* strategy */ + { &vop_print_desc, nfs_print }, /* print */ + { &vop_islocked_desc, nfs_islocked }, /* islocked */ + { &vop_pathconf_desc, spec_pathconf }, /* pathconf */ + { &vop_advlock_desc, spec_advlock }, /* advlock */ + { &vop_blkatoff_desc, spec_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, spec_valloc }, /* valloc */ + { &vop_reallocblks_desc, spec_reallocblks }, /* reallocblks */ + { &vop_vfree_desc, spec_vfree }, /* vfree */ + { &vop_truncate_desc, spec_truncate }, /* truncate */ + { &vop_update_desc, nfs_update }, /* update */ + { &vop_bwrite_desc, vn_bwrite }, + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc spec_nfsv2nodeop_opv_desc = + { &spec_nfsv2nodeop_p, spec_nfsv2nodeop_entries }; + +#ifdef FIFO +int (**fifo_nfsv2nodeop_p)(); +struct vnodeopv_entry_desc fifo_nfsv2nodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, fifo_lookup }, /* lookup */ + { &vop_create_desc, fifo_create }, /* create */ + { &vop_mknod_desc, fifo_mknod }, /* mknod */ + { &vop_open_desc, fifo_open }, /* open */ + { &vop_close_desc, nfsfifo_close }, /* close */ + { &vop_access_desc, nfsspec_access }, /* access */ + { &vop_getattr_desc, nfs_getattr }, /* getattr */ + { &vop_setattr_desc, nfs_setattr }, /* setattr */ + { &vop_read_desc, nfsfifo_read }, /* read */ + { &vop_write_desc, nfsfifo_write }, /* write */ + { &vop_ioctl_desc, fifo_ioctl }, /* ioctl */ + { &vop_select_desc, fifo_select }, /* select */ + { &vop_mmap_desc, fifo_mmap }, /* mmap */ + { &vop_fsync_desc, nfs_fsync }, /* fsync */ + { &vop_seek_desc, fifo_seek }, /* seek 
*/ + { &vop_remove_desc, fifo_remove }, /* remove */ + { &vop_link_desc, fifo_link }, /* link */ + { &vop_rename_desc, fifo_rename }, /* rename */ + { &vop_mkdir_desc, fifo_mkdir }, /* mkdir */ + { &vop_rmdir_desc, fifo_rmdir }, /* rmdir */ + { &vop_symlink_desc, fifo_symlink }, /* symlink */ + { &vop_readdir_desc, fifo_readdir }, /* readdir */ + { &vop_readlink_desc, fifo_readlink }, /* readlink */ + { &vop_abortop_desc, fifo_abortop }, /* abortop */ + { &vop_inactive_desc, nfs_inactive }, /* inactive */ + { &vop_reclaim_desc, nfs_reclaim }, /* reclaim */ + { &vop_lock_desc, nfs_lock }, /* lock */ + { &vop_unlock_desc, nfs_unlock }, /* unlock */ + { &vop_bmap_desc, fifo_bmap }, /* bmap */ + { &vop_strategy_desc, fifo_badop }, /* strategy */ + { &vop_print_desc, nfs_print }, /* print */ + { &vop_islocked_desc, nfs_islocked }, /* islocked */ + { &vop_pathconf_desc, fifo_pathconf }, /* pathconf */ + { &vop_advlock_desc, fifo_advlock }, /* advlock */ + { &vop_blkatoff_desc, fifo_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, fifo_valloc }, /* valloc */ + { &vop_reallocblks_desc, fifo_reallocblks }, /* reallocblks */ + { &vop_vfree_desc, fifo_vfree }, /* vfree */ + { &vop_truncate_desc, fifo_truncate }, /* truncate */ + { &vop_update_desc, nfs_update }, /* update */ + { &vop_bwrite_desc, vn_bwrite }, + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc = + { &fifo_nfsv2nodeop_p, fifo_nfsv2nodeop_entries }; +#endif /* FIFO */ + +void nqnfs_clientlease(); + +/* + * Global variables + */ +extern u_long nfs_procids[NFS_NPROCS]; +extern u_long nfs_prog, nfs_vers, nfs_true, nfs_false; +extern char nfsiobuf[MAXPHYS+NBPG]; +struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; +int nfs_numasync = 0; +#define DIRHDSIZ (sizeof (struct dirent) - (MAXNAMLEN + 1)) + +/* + * nfs null call from vfs. 
+ */ +int +nfs_null(vp, cred, procp) + struct vnode *vp; + struct ucred *cred; + struct proc *procp; +{ + caddr_t bpos, dpos; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb; + + nfsm_reqhead(vp, NFSPROC_NULL, 0); + nfsm_request(vp, NFSPROC_NULL, procp, cred); + nfsm_reqdone; + return (error); +} + +/* + * nfs access vnode op. + * For nfs, just return ok. File accesses may fail later. + * For nqnfs, use the access rpc to check accessibility. If file modes are + * changed on the server, accesses might still fail later. + */ +int +nfs_access(ap) + struct vop_access_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register u_long *tl; + register caddr_t cp; + caddr_t bpos, dpos; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + + /* + * For nqnfs, do an access rpc, otherwise you are stuck emulating + * ufs_access() locally using the vattr. This may not be correct, + * since the server may apply other access criteria such as + * client uid-->server uid mapping that we do not know about, but + * this is better than just returning anything that is lying about + * in the cache. + */ + if (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) { + nfsstats.rpccnt[NQNFSPROC_ACCESS]++; + nfsm_reqhead(vp, NQNFSPROC_ACCESS, NFSX_FH + 3 * NFSX_UNSIGNED); + nfsm_fhtom(vp); + nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED); + if (ap->a_mode & VREAD) + *tl++ = nfs_true; + else + *tl++ = nfs_false; + if (ap->a_mode & VWRITE) + *tl++ = nfs_true; + else + *tl++ = nfs_false; + if (ap->a_mode & VEXEC) + *tl = nfs_true; + else + *tl = nfs_false; + nfsm_request(vp, NQNFSPROC_ACCESS, ap->a_p, ap->a_cred); + nfsm_reqdone; + return (error); + } else + return (nfsspec_access(ap)); +} + +/* + * nfs open vnode op + * Check to see if the type is ok + * and that deletion is not in progress. + * For paged in text files, you will need to flush the page cache + * if consistency is lost. 
+ */ +/* ARGSUSED */ +int +nfs_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + struct nfsnode *np = VTONFS(vp); + struct nfsmount *nmp = VFSTONFS(vp->v_mount); + struct vattr vattr; + int error; + + if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) + return (EACCES); + if (vp->v_flag & VTEXT) { + /* + * Get a valid lease. If cached data is stale, flush it. + */ + if (nmp->nm_flag & NFSMNT_NQNFS) { + if (NQNFS_CKINVALID(vp, np, NQL_READ)) { + do { + error = nqnfs_getlease(vp, NQL_READ, ap->a_cred, ap->a_p); + } while (error == NQNFS_EXPIRED); + if (error) + return (error); + if (np->n_lrev != np->n_brev || + (np->n_flag & NQNFSNONCACHE)) { + if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, + ap->a_p, 1)) == EINTR) + return (error); + (void) vnode_pager_uncache(vp); + np->n_brev = np->n_lrev; + } + } + } else { + if (np->n_flag & NMODIFIED) { + if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, + ap->a_p, 1)) == EINTR) + return (error); + (void) vnode_pager_uncache(vp); + np->n_attrstamp = 0; + np->n_direofoffset = 0; + if (error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_p)) + return (error); + np->n_mtime = vattr.va_mtime.ts_sec; + } else { + if (error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_p)) + return (error); + if (np->n_mtime != vattr.va_mtime.ts_sec) { + np->n_direofoffset = 0; + if ((error = nfs_vinvalbuf(vp, V_SAVE, + ap->a_cred, ap->a_p, 1)) == EINTR) + return (error); + (void) vnode_pager_uncache(vp); + np->n_mtime = vattr.va_mtime.ts_sec; + } + } + } + } else if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) + np->n_attrstamp = 0; /* For Open/Close consistency */ + return (0); +} + +/* + * nfs close vnode op + * For reg files, invalidate any buffer cache entries. 
+ */ +/* ARGSUSED */ +int +nfs_close(ap) + struct vop_close_args /* { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct nfsnode *np = VTONFS(vp); + int error = 0; + + if (vp->v_type == VREG) { + if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) == 0 && + (np->n_flag & NMODIFIED)) { + error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1); + np->n_attrstamp = 0; + } + if (np->n_flag & NWRITEERR) { + np->n_flag &= ~NWRITEERR; + error = np->n_error; + } + } + return (error); +} + +/* + * nfs getattr call from vfs. + */ +int +nfs_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct nfsnode *np = VTONFS(vp); + register caddr_t cp; + caddr_t bpos, dpos; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + + /* + * Update local times for special files. + */ + if (np->n_flag & (NACC | NUPD)) + np->n_flag |= NCHG; + /* + * First look in the cache. + */ + if (nfs_getattrcache(vp, ap->a_vap) == 0) + return (0); + nfsstats.rpccnt[NFSPROC_GETATTR]++; + nfsm_reqhead(vp, NFSPROC_GETATTR, NFSX_FH); + nfsm_fhtom(vp); + nfsm_request(vp, NFSPROC_GETATTR, ap->a_p, ap->a_cred); + nfsm_loadattr(vp, ap->a_vap); + nfsm_reqdone; + return (error); +} + +/* + * nfs setattr call. 
+ */ +int +nfs_setattr(ap) + struct vop_setattr_args /* { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct nfsv2_sattr *sp; + register caddr_t cp; + register long t1; + caddr_t bpos, dpos, cp2; + u_long *tl; + int error = 0, isnq; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + register struct vnode *vp = ap->a_vp; + register struct nfsnode *np = VTONFS(vp); + register struct vattr *vap = ap->a_vap; + u_quad_t frev, tsize; + + if (vap->va_size != VNOVAL || vap->va_mtime.ts_sec != VNOVAL || + vap->va_atime.ts_sec != VNOVAL) { + if (vap->va_size != VNOVAL) { + if (np->n_flag & NMODIFIED) { + if (vap->va_size == 0) + error = nfs_vinvalbuf(vp, 0, ap->a_cred, + ap->a_p, 1); + else + error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, + ap->a_p, 1); + if (error) + return (error); + } + tsize = np->n_size; + np->n_size = np->n_vattr.va_size = vap->va_size; + vnode_pager_setsize(vp, (u_long)np->n_size); + } else if ((np->n_flag & NMODIFIED) && + (error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, + ap->a_p, 1)) == EINTR) + return (error); + } + nfsstats.rpccnt[NFSPROC_SETATTR]++; + isnq = (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS); + nfsm_reqhead(vp, NFSPROC_SETATTR, NFSX_FH+NFSX_SATTR(isnq)); + nfsm_fhtom(vp); + nfsm_build(sp, struct nfsv2_sattr *, NFSX_SATTR(isnq)); + if (vap->va_mode == (u_short)-1) + sp->sa_mode = VNOVAL; + else + sp->sa_mode = vtonfs_mode(vp->v_type, vap->va_mode); + if (vap->va_uid == (uid_t)-1) + sp->sa_uid = VNOVAL; + else + sp->sa_uid = txdr_unsigned(vap->va_uid); + if (vap->va_gid == (gid_t)-1) + sp->sa_gid = VNOVAL; + else + sp->sa_gid = txdr_unsigned(vap->va_gid); + if (isnq) { + txdr_hyper(&vap->va_size, &sp->sa_nqsize); + txdr_nqtime(&vap->va_atime, &sp->sa_nqatime); + txdr_nqtime(&vap->va_mtime, &sp->sa_nqmtime); + sp->sa_nqflags = txdr_unsigned(vap->va_flags); + sp->sa_nqrdev = VNOVAL; + } else { + sp->sa_nfssize = txdr_unsigned(vap->va_size); + 
txdr_nfstime(&vap->va_atime, &sp->sa_nfsatime); + txdr_nfstime(&vap->va_mtime, &sp->sa_nfsmtime); + } + nfsm_request(vp, NFSPROC_SETATTR, ap->a_p, ap->a_cred); + nfsm_loadattr(vp, (struct vattr *)0); + if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) && + NQNFS_CKCACHABLE(vp, NQL_WRITE)) { + nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED); + fxdr_hyper(tl, &frev); + if (frev > np->n_brev) + np->n_brev = frev; + } + nfsm_reqdone; + if (error) { + np->n_size = np->n_vattr.va_size = tsize; + vnode_pager_setsize(vp, (u_long)np->n_size); + } + return (error); +} + +/* + * nfs lookup call, one step at a time... + * First look in cache + * If not found, unlock the directory nfsnode and do the rpc + */ +int +nfs_lookup(ap) + struct vop_lookup_args /* { + struct vnodeop_desc *a_desc; + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + register struct componentname *cnp = ap->a_cnp; + register struct vnode *dvp = ap->a_dvp; + register struct vnode **vpp = ap->a_vpp; + register int flags = cnp->cn_flags; + register struct vnode *vdp; + register u_long *tl; + register caddr_t cp; + register long t1, t2; + struct nfsmount *nmp; + caddr_t bpos, dpos, cp2; + time_t reqtime; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct vnode *newvp; + long len; + nfsv2fh_t *fhp; + struct nfsnode *np; + int lockparent, wantparent, error = 0; + int nqlflag, cachable; + u_quad_t frev; + + *vpp = NULL; + if (dvp->v_type != VDIR) + return (ENOTDIR); + lockparent = flags & LOCKPARENT; + wantparent = flags & (LOCKPARENT|WANTPARENT); + nmp = VFSTONFS(dvp->v_mount); + np = VTONFS(dvp); + if ((error = cache_lookup(dvp, vpp, cnp)) && error != ENOENT) { + struct vattr vattr; + int vpid; + + vdp = *vpp; + vpid = vdp->v_id; + /* + * See the comment starting `Step through' in ufs/ufs_lookup.c + * for an explanation of the locking protocol + */ + if (dvp == vdp) { + VREF(vdp); + error = 0; + } else + error = vget(vdp, 1); + if (!error) { + if (vpid == vdp->v_id) { 
+ if (nmp->nm_flag & NFSMNT_NQNFS) { + if ((nmp->nm_flag & NFSMNT_NQLOOKLEASE) == 0) { + nfsstats.lookupcache_hits++; + if (cnp->cn_nameiop != LOOKUP && + (flags & ISLASTCN)) + cnp->cn_flags |= SAVENAME; + return (0); + } else if (NQNFS_CKCACHABLE(dvp, NQL_READ)) { + if (np->n_lrev != np->n_brev || + (np->n_flag & NMODIFIED)) { + np->n_direofoffset = 0; + cache_purge(dvp); + error = nfs_vinvalbuf(dvp, 0, + cnp->cn_cred, cnp->cn_proc, + 1); + if (error == EINTR) + return (error); + np->n_brev = np->n_lrev; + } else { + nfsstats.lookupcache_hits++; + if (cnp->cn_nameiop != LOOKUP && + (flags & ISLASTCN)) + cnp->cn_flags |= SAVENAME; + return (0); + } + } + } else if (!VOP_GETATTR(vdp, &vattr, cnp->cn_cred, cnp->cn_proc) && + vattr.va_ctime.ts_sec == VTONFS(vdp)->n_ctime) { + nfsstats.lookupcache_hits++; + if (cnp->cn_nameiop != LOOKUP && + (flags & ISLASTCN)) + cnp->cn_flags |= SAVENAME; + return (0); + } + cache_purge(vdp); + } + vrele(vdp); + } + *vpp = NULLVP; + } + error = 0; + nfsstats.lookupcache_misses++; + nfsstats.rpccnt[NFSPROC_LOOKUP]++; + len = cnp->cn_namelen; + nfsm_reqhead(dvp, NFSPROC_LOOKUP, NFSX_FH+NFSX_UNSIGNED+nfsm_rndup(len)); + + /* + * For nqnfs optionally piggyback a getlease request for the name + * being looked up. 
+ */ + if (nmp->nm_flag & NFSMNT_NQNFS) { + nfsm_build(tl, u_long *, NFSX_UNSIGNED); + if ((nmp->nm_flag & NFSMNT_NQLOOKLEASE) && + ((cnp->cn_flags & MAKEENTRY) && + (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN)))) + *tl = txdr_unsigned(nmp->nm_leaseterm); + else + *tl = 0; + } + nfsm_fhtom(dvp); + nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN); + reqtime = time.tv_sec; + nfsm_request(dvp, NFSPROC_LOOKUP, cnp->cn_proc, cnp->cn_cred); +nfsmout: + if (error) { + if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) && + (flags & ISLASTCN) && error == ENOENT) + error = EJUSTRETURN; + if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) + cnp->cn_flags |= SAVENAME; + return (error); + } + if (nmp->nm_flag & NFSMNT_NQNFS) { + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + if (*tl) { + nqlflag = fxdr_unsigned(int, *tl); + nfsm_dissect(tl, u_long *, 4*NFSX_UNSIGNED); + cachable = fxdr_unsigned(int, *tl++); + reqtime += fxdr_unsigned(int, *tl++); + fxdr_hyper(tl, &frev); + } else + nqlflag = 0; + } + nfsm_dissect(fhp, nfsv2fh_t *, NFSX_FH); + + /* + * Handle RENAME case... 
+ */ + if (cnp->cn_nameiop == RENAME && wantparent && (flags & ISLASTCN)) { + if (!bcmp(np->n_fh.fh_bytes, (caddr_t)fhp, NFSX_FH)) { + m_freem(mrep); + return (EISDIR); + } + if (error = nfs_nget(dvp->v_mount, fhp, &np)) { + m_freem(mrep); + return (error); + } + newvp = NFSTOV(np); + if (error = + nfs_loadattrcache(&newvp, &md, &dpos, (struct vattr *)0)) { + vrele(newvp); + m_freem(mrep); + return (error); + } + *vpp = newvp; + m_freem(mrep); + cnp->cn_flags |= SAVENAME; + return (0); + } + + if (!bcmp(np->n_fh.fh_bytes, (caddr_t)fhp, NFSX_FH)) { + VREF(dvp); + newvp = dvp; + } else { + if (error = nfs_nget(dvp->v_mount, fhp, &np)) { + m_freem(mrep); + return (error); + } + newvp = NFSTOV(np); + } + if (error = nfs_loadattrcache(&newvp, &md, &dpos, (struct vattr *)0)) { + vrele(newvp); + m_freem(mrep); + return (error); + } + m_freem(mrep); + *vpp = newvp; + if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN)) + cnp->cn_flags |= SAVENAME; + if ((cnp->cn_flags & MAKEENTRY) && + (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN))) { + if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) + np->n_ctime = np->n_vattr.va_ctime.ts_sec; + else if (nqlflag && reqtime > time.tv_sec) + nqnfs_clientlease(nmp, np, nqlflag, cachable, reqtime, + frev); + cache_enter(dvp, *vpp, cnp); + } + return (0); +} + +/* + * nfs read call. + * Just call nfs_bioread() to do the work. 
+ */ +int +nfs_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + + if (vp->v_type != VREG) + return (EPERM); + return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred)); +} + +/* + * nfs readlink call + */ +int +nfs_readlink(ap) + struct vop_readlink_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + + if (vp->v_type != VLNK) + return (EPERM); + return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred)); +} + +/* + * Do a readlink rpc. + * Called by nfs_doio() from below the buffer cache. + */ +int +nfs_readlinkrpc(vp, uiop, cred) + register struct vnode *vp; + struct uio *uiop; + struct ucred *cred; +{ + register u_long *tl; + register caddr_t cp; + register long t1; + caddr_t bpos, dpos, cp2; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + long len; + + nfsstats.rpccnt[NFSPROC_READLINK]++; + nfsm_reqhead(vp, NFSPROC_READLINK, NFSX_FH); + nfsm_fhtom(vp); + nfsm_request(vp, NFSPROC_READLINK, uiop->uio_procp, cred); + nfsm_strsiz(len, NFS_MAXPATHLEN); + nfsm_mtouio(uiop, len); + nfsm_reqdone; + return (error); +} + +/* + * nfs read rpc call + * Ditto above + */ +int +nfs_readrpc(vp, uiop, cred) + register struct vnode *vp; + struct uio *uiop; + struct ucred *cred; +{ + register u_long *tl; + register caddr_t cp; + register long t1; + caddr_t bpos, dpos, cp2; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct nfsmount *nmp; + long len, retlen, tsiz; + + nmp = VFSTONFS(vp->v_mount); + tsiz = uiop->uio_resid; + if (uiop->uio_offset + tsiz > 0xffffffff && + (nmp->nm_flag & NFSMNT_NQNFS) == 0) + return (EFBIG); + while (tsiz > 0) { + nfsstats.rpccnt[NFSPROC_READ]++; + len = (tsiz > nmp->nm_rsize) ? 
nmp->nm_rsize : tsiz; + nfsm_reqhead(vp, NFSPROC_READ, NFSX_FH+NFSX_UNSIGNED*3); + nfsm_fhtom(vp); + nfsm_build(tl, u_long *, NFSX_UNSIGNED*3); + if (nmp->nm_flag & NFSMNT_NQNFS) { + txdr_hyper(&uiop->uio_offset, tl); + *(tl + 2) = txdr_unsigned(len); + } else { + *tl++ = txdr_unsigned(uiop->uio_offset); + *tl++ = txdr_unsigned(len); + *tl = 0; + } + nfsm_request(vp, NFSPROC_READ, uiop->uio_procp, cred); + nfsm_loadattr(vp, (struct vattr *)0); + nfsm_strsiz(retlen, nmp->nm_rsize); + nfsm_mtouio(uiop, retlen); + m_freem(mrep); + if (retlen < len) + tsiz = 0; + else + tsiz -= len; + } +nfsmout: + return (error); +} + +/* + * nfs write call + */ +int +nfs_writerpc(vp, uiop, cred, ioflags) + register struct vnode *vp; + struct uio *uiop; + struct ucred *cred; + int ioflags; +{ + register u_long *tl; + register caddr_t cp; + register long t1; + caddr_t bpos, dpos, cp2; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct nfsmount *nmp; + struct nfsnode *np = VTONFS(vp); + u_quad_t frev; + long len, tsiz; + + nmp = VFSTONFS(vp->v_mount); + tsiz = uiop->uio_resid; + if (uiop->uio_offset + tsiz > 0xffffffff && + (nmp->nm_flag & NFSMNT_NQNFS) == 0) + return (EFBIG); + while (tsiz > 0) { + nfsstats.rpccnt[NFSPROC_WRITE]++; + len = (tsiz > nmp->nm_wsize) ? 
nmp->nm_wsize : tsiz; + nfsm_reqhead(vp, NFSPROC_WRITE, + NFSX_FH+NFSX_UNSIGNED*4+nfsm_rndup(len)); + nfsm_fhtom(vp); + nfsm_build(tl, u_long *, NFSX_UNSIGNED * 4); + if (nmp->nm_flag & NFSMNT_NQNFS) { + txdr_hyper(&uiop->uio_offset, tl); + tl += 2; + if (ioflags & IO_APPEND) + *tl++ = txdr_unsigned(1); + else + *tl++ = 0; + } else { + *++tl = txdr_unsigned(uiop->uio_offset); + tl += 2; + } + *tl = txdr_unsigned(len); + nfsm_uiotom(uiop, len); + nfsm_request(vp, NFSPROC_WRITE, uiop->uio_procp, cred); + nfsm_loadattr(vp, (struct vattr *)0); + if (nmp->nm_flag & NFSMNT_MYWRITE) + VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime.ts_sec; + else if ((nmp->nm_flag & NFSMNT_NQNFS) && + NQNFS_CKCACHABLE(vp, NQL_WRITE)) { + nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED); + fxdr_hyper(tl, &frev); + if (frev > np->n_brev) + np->n_brev = frev; + } + m_freem(mrep); + tsiz -= len; + } +nfsmout: + if (error) + uiop->uio_resid = tsiz; + return (error); +} + +/* + * nfs mknod call + * This is a kludge. Use a create rpc but with the IFMT bits of the mode + * set to specify the file type and the size field for rdev. 
+ */ +/* ARGSUSED */ +int +nfs_mknod(ap) + struct vop_mknod_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + register struct vnode *dvp = ap->a_dvp; + register struct vattr *vap = ap->a_vap; + register struct componentname *cnp = ap->a_cnp; + register struct nfsv2_sattr *sp; + register u_long *tl; + register caddr_t cp; + register long t1, t2; + struct vnode *newvp; + struct vattr vattr; + char *cp2; + caddr_t bpos, dpos; + int error = 0, isnq; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + u_long rdev; + + isnq = (VFSTONFS(dvp->v_mount)->nm_flag & NFSMNT_NQNFS); + if (vap->va_type == VCHR || vap->va_type == VBLK) + rdev = txdr_unsigned(vap->va_rdev); +#ifdef FIFO + else if (vap->va_type == VFIFO) + rdev = 0xffffffff; +#endif /* FIFO */ + else { + VOP_ABORTOP(dvp, cnp); + vput(dvp); + return (EOPNOTSUPP); + } + if (error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc)) { + VOP_ABORTOP(dvp, cnp); + vput(dvp); + return (error); + } + nfsstats.rpccnt[NFSPROC_CREATE]++; + nfsm_reqhead(dvp, NFSPROC_CREATE, + NFSX_FH+NFSX_UNSIGNED+nfsm_rndup(cnp->cn_namelen)+NFSX_SATTR(isnq)); + nfsm_fhtom(dvp); + nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_build(sp, struct nfsv2_sattr *, NFSX_SATTR(isnq)); + sp->sa_mode = vtonfs_mode(vap->va_type, vap->va_mode); + sp->sa_uid = txdr_unsigned(cnp->cn_cred->cr_uid); + sp->sa_gid = txdr_unsigned(vattr.va_gid); + if (isnq) { + sp->sa_nqrdev = rdev; + sp->sa_nqflags = 0; + txdr_nqtime(&vap->va_atime, &sp->sa_nqatime); + txdr_nqtime(&vap->va_mtime, &sp->sa_nqmtime); + } else { + sp->sa_nfssize = rdev; + txdr_nfstime(&vap->va_atime, &sp->sa_nfsatime); + txdr_nfstime(&vap->va_mtime, &sp->sa_nfsmtime); + } + nfsm_request(dvp, NFSPROC_CREATE, cnp->cn_proc, cnp->cn_cred); + nfsm_mtofh(dvp, newvp); + nfsm_reqdone; + if (!error && (cnp->cn_flags & MAKEENTRY)) + cache_enter(dvp, newvp, cnp); + FREE(cnp->cn_pnbuf, M_NAMEI); + 
VTONFS(dvp)->n_flag |= NMODIFIED; + VTONFS(dvp)->n_attrstamp = 0; + vrele(dvp); + return (error); +} + +/* + * nfs file create call + */ +int +nfs_create(ap) + struct vop_create_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + register struct vnode *dvp = ap->a_dvp; + register struct vattr *vap = ap->a_vap; + register struct componentname *cnp = ap->a_cnp; + register struct nfsv2_sattr *sp; + register u_long *tl; + register caddr_t cp; + register long t1, t2; + caddr_t bpos, dpos, cp2; + int error = 0, isnq; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct vattr vattr; + + if (error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc)) { + VOP_ABORTOP(dvp, cnp); + vput(dvp); + return (error); + } + nfsstats.rpccnt[NFSPROC_CREATE]++; + isnq = (VFSTONFS(dvp->v_mount)->nm_flag & NFSMNT_NQNFS); + nfsm_reqhead(dvp, NFSPROC_CREATE, + NFSX_FH+NFSX_UNSIGNED+nfsm_rndup(cnp->cn_namelen)+NFSX_SATTR(isnq)); + nfsm_fhtom(dvp); + nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_build(sp, struct nfsv2_sattr *, NFSX_SATTR(isnq)); + sp->sa_mode = vtonfs_mode(vap->va_type, vap->va_mode); + sp->sa_uid = txdr_unsigned(cnp->cn_cred->cr_uid); + sp->sa_gid = txdr_unsigned(vattr.va_gid); + if (isnq) { + u_quad_t qval = 0; + + txdr_hyper(&qval, &sp->sa_nqsize); + sp->sa_nqflags = 0; + sp->sa_nqrdev = -1; + txdr_nqtime(&vap->va_atime, &sp->sa_nqatime); + txdr_nqtime(&vap->va_mtime, &sp->sa_nqmtime); + } else { + sp->sa_nfssize = 0; + txdr_nfstime(&vap->va_atime, &sp->sa_nfsatime); + txdr_nfstime(&vap->va_mtime, &sp->sa_nfsmtime); + } + nfsm_request(dvp, NFSPROC_CREATE, cnp->cn_proc, cnp->cn_cred); + nfsm_mtofh(dvp, *ap->a_vpp); + nfsm_reqdone; + if (!error && (cnp->cn_flags & MAKEENTRY)) + cache_enter(dvp, *ap->a_vpp, cnp); + FREE(cnp->cn_pnbuf, M_NAMEI); + VTONFS(dvp)->n_flag |= NMODIFIED; + VTONFS(dvp)->n_attrstamp = 0; + vrele(dvp); + return (error); +} + +/* + * nfs file remove call 
+ * To try and make nfs semantics closer to ufs semantics, a file that has + * other processes using the vnode is renamed instead of removed and then + * removed later on the last close. + * - If v_usecount > 1 + * If a rename is not already in the works + * call nfs_sillyrename() to set it up + * else + * do the remove rpc + */ +int +nfs_remove(ap) + struct vop_remove_args /* { + struct vnodeop_desc *a_desc; + struct vnode * a_dvp; + struct vnode * a_vp; + struct componentname * a_cnp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct vnode *dvp = ap->a_dvp; + register struct componentname *cnp = ap->a_cnp; + register struct nfsnode *np = VTONFS(vp); + register u_long *tl; + register caddr_t cp; + register long t2; + caddr_t bpos, dpos; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + + if (vp->v_usecount > 1) { + if (!np->n_sillyrename) + error = nfs_sillyrename(dvp, vp, cnp); + } else { + /* + * Purge the name cache so that the chance of a lookup for + * the name succeeding while the remove is in progress is + * minimized. Without node locking it can still happen, such + * that an I/O op returns ESTALE, but since you get this if + * another host removes the file.. + */ + cache_purge(vp); + /* + * Throw away biocache buffers. Mainly to avoid + * unnecessary delayed writes. + */ + error = nfs_vinvalbuf(vp, 0, cnp->cn_cred, cnp->cn_proc, 1); + if (error == EINTR) + return (error); + /* Do the rpc */ + nfsstats.rpccnt[NFSPROC_REMOVE]++; + nfsm_reqhead(dvp, NFSPROC_REMOVE, + NFSX_FH+NFSX_UNSIGNED+nfsm_rndup(cnp->cn_namelen)); + nfsm_fhtom(dvp); + nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_request(dvp, NFSPROC_REMOVE, cnp->cn_proc, cnp->cn_cred); + nfsm_reqdone; + FREE(cnp->cn_pnbuf, M_NAMEI); + VTONFS(dvp)->n_flag |= NMODIFIED; + VTONFS(dvp)->n_attrstamp = 0; + /* + * Kludge City: If the first reply to the remove rpc is lost.. 
+ * the reply to the retransmitted request will be ENOENT + * since the file was in fact removed + * Therefore, we cheat and return success. + */ + if (error == ENOENT) + error = 0; + } + np->n_attrstamp = 0; + vrele(dvp); + vrele(vp); + return (error); +} + +/* + * nfs file remove rpc called from nfs_inactive + */ +int +nfs_removeit(sp) + register struct sillyrename *sp; +{ + register u_long *tl; + register caddr_t cp; + register long t2; + caddr_t bpos, dpos; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + + nfsstats.rpccnt[NFSPROC_REMOVE]++; + nfsm_reqhead(sp->s_dvp, NFSPROC_REMOVE, + NFSX_FH+NFSX_UNSIGNED+nfsm_rndup(sp->s_namlen)); + nfsm_fhtom(sp->s_dvp); + nfsm_strtom(sp->s_name, sp->s_namlen, NFS_MAXNAMLEN); + nfsm_request(sp->s_dvp, NFSPROC_REMOVE, NULL, sp->s_cred); + nfsm_reqdone; + VTONFS(sp->s_dvp)->n_flag |= NMODIFIED; + VTONFS(sp->s_dvp)->n_attrstamp = 0; + return (error); +} + +/* + * nfs file rename call + */ +int +nfs_rename(ap) + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap; +{ + register struct vnode *fvp = ap->a_fvp; + register struct vnode *tvp = ap->a_tvp; + register struct vnode *fdvp = ap->a_fdvp; + register struct vnode *tdvp = ap->a_tdvp; + register struct componentname *tcnp = ap->a_tcnp; + register struct componentname *fcnp = ap->a_fcnp; + register u_long *tl; + register caddr_t cp; + register long t2; + caddr_t bpos, dpos; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + + /* Check for cross-device rename */ + if ((fvp->v_mount != tdvp->v_mount) || + (tvp && (fvp->v_mount != tvp->v_mount))) { + error = EXDEV; + goto out; + } + + + nfsstats.rpccnt[NFSPROC_RENAME]++; + nfsm_reqhead(fdvp, NFSPROC_RENAME, + (NFSX_FH+NFSX_UNSIGNED)*2+nfsm_rndup(fcnp->cn_namelen)+ + nfsm_rndup(fcnp->cn_namelen)); /* or fcnp->cn_cred?*/ + nfsm_fhtom(fdvp); + 
nfsm_strtom(fcnp->cn_nameptr, fcnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_fhtom(tdvp); + nfsm_strtom(tcnp->cn_nameptr, tcnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_request(fdvp, NFSPROC_RENAME, tcnp->cn_proc, tcnp->cn_cred); + nfsm_reqdone; + VTONFS(fdvp)->n_flag |= NMODIFIED; + VTONFS(fdvp)->n_attrstamp = 0; + VTONFS(tdvp)->n_flag |= NMODIFIED; + VTONFS(tdvp)->n_attrstamp = 0; + if (fvp->v_type == VDIR) { + if (tvp != NULL && tvp->v_type == VDIR) + cache_purge(tdvp); + cache_purge(fdvp); + } +out: + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + vrele(fdvp); + vrele(fvp); + /* + * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry. + */ + if (error == ENOENT) + error = 0; + return (error); +} + +/* + * nfs file rename rpc called from nfs_remove() above + */ +int +nfs_renameit(sdvp, scnp, sp) + struct vnode *sdvp; + struct componentname *scnp; + register struct sillyrename *sp; +{ + register u_long *tl; + register caddr_t cp; + register long t2; + caddr_t bpos, dpos; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + + nfsstats.rpccnt[NFSPROC_RENAME]++; + nfsm_reqhead(sdvp, NFSPROC_RENAME, + (NFSX_FH+NFSX_UNSIGNED)*2+nfsm_rndup(scnp->cn_namelen)+ + nfsm_rndup(sp->s_namlen)); + nfsm_fhtom(sdvp); + nfsm_strtom(scnp->cn_nameptr, scnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_fhtom(sdvp); + nfsm_strtom(sp->s_name, sp->s_namlen, NFS_MAXNAMLEN); + nfsm_request(sdvp, NFSPROC_RENAME, scnp->cn_proc, scnp->cn_cred); + nfsm_reqdone; + FREE(scnp->cn_pnbuf, M_NAMEI); + VTONFS(sdvp)->n_flag |= NMODIFIED; + VTONFS(sdvp)->n_attrstamp = 0; + return (error); +} + +/* + * nfs hard link create call + */ +int +nfs_link(ap) + struct vop_link_args /* { + struct vnode *a_vp; + struct vnode *a_tdvp; + struct componentname *a_cnp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct vnode *tdvp = ap->a_tdvp; + register struct componentname *cnp = ap->a_cnp; + register u_long *tl; + register caddr_t cp; + register long t2; + 
caddr_t bpos, dpos; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + + if (vp->v_mount != tdvp->v_mount) { + /*VOP_ABORTOP(vp, cnp);*/ + if (tdvp == vp) + vrele(vp); + else + vput(vp); + return (EXDEV); + } + + nfsstats.rpccnt[NFSPROC_LINK]++; + nfsm_reqhead(tdvp, NFSPROC_LINK, + NFSX_FH*2+NFSX_UNSIGNED+nfsm_rndup(cnp->cn_namelen)); + nfsm_fhtom(tdvp); + nfsm_fhtom(vp); + nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_request(tdvp, NFSPROC_LINK, cnp->cn_proc, cnp->cn_cred); + nfsm_reqdone; + FREE(cnp->cn_pnbuf, M_NAMEI); + VTONFS(tdvp)->n_attrstamp = 0; + VTONFS(tdvp)->n_flag |= NMODIFIED; + VTONFS(vp)->n_attrstamp = 0; + vrele(vp); + /* + * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. + */ + if (error == EEXIST) + error = 0; + return (error); +} + +/* + * nfs symbolic link create call + */ +/* start here */ +int +nfs_symlink(ap) + struct vop_symlink_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + char *a_target; + } */ *ap; +{ + register struct vnode *dvp = ap->a_dvp; + register struct vattr *vap = ap->a_vap; + register struct componentname *cnp = ap->a_cnp; + register struct nfsv2_sattr *sp; + register u_long *tl; + register caddr_t cp; + register long t2; + caddr_t bpos, dpos; + int slen, error = 0, isnq; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + + nfsstats.rpccnt[NFSPROC_SYMLINK]++; + slen = strlen(ap->a_target); + isnq = (VFSTONFS(dvp->v_mount)->nm_flag & NFSMNT_NQNFS); + nfsm_reqhead(dvp, NFSPROC_SYMLINK, NFSX_FH+2*NFSX_UNSIGNED+ + nfsm_rndup(cnp->cn_namelen)+nfsm_rndup(slen)+NFSX_SATTR(isnq)); + nfsm_fhtom(dvp); + nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_strtom(ap->a_target, slen, NFS_MAXPATHLEN); + nfsm_build(sp, struct nfsv2_sattr *, NFSX_SATTR(isnq)); + sp->sa_mode = vtonfs_mode(VLNK, vap->va_mode); + sp->sa_uid = txdr_unsigned(cnp->cn_cred->cr_uid); + sp->sa_gid = txdr_unsigned(cnp->cn_cred->cr_gid); + if 
(isnq) { + quad_t qval = -1; + + txdr_hyper(&qval, &sp->sa_nqsize); + sp->sa_nqflags = 0; + txdr_nqtime(&vap->va_atime, &sp->sa_nqatime); + txdr_nqtime(&vap->va_mtime, &sp->sa_nqmtime); + } else { + sp->sa_nfssize = -1; + txdr_nfstime(&vap->va_atime, &sp->sa_nfsatime); + txdr_nfstime(&vap->va_mtime, &sp->sa_nfsmtime); + } + nfsm_request(dvp, NFSPROC_SYMLINK, cnp->cn_proc, cnp->cn_cred); + nfsm_reqdone; + FREE(cnp->cn_pnbuf, M_NAMEI); + VTONFS(dvp)->n_flag |= NMODIFIED; + VTONFS(dvp)->n_attrstamp = 0; + vrele(dvp); + /* + * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. + */ + if (error == EEXIST) + error = 0; + return (error); +} + +/* + * nfs make dir call + */ +int +nfs_mkdir(ap) + struct vop_mkdir_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + register struct vnode *dvp = ap->a_dvp; + register struct vattr *vap = ap->a_vap; + register struct componentname *cnp = ap->a_cnp; + register struct vnode **vpp = ap->a_vpp; + register struct nfsv2_sattr *sp; + register u_long *tl; + register caddr_t cp; + register long t1, t2; + register int len; + caddr_t bpos, dpos, cp2; + int error = 0, firsttry = 1, isnq; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct vattr vattr; + + if (error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred, cnp->cn_proc)) { + VOP_ABORTOP(dvp, cnp); + vput(dvp); + return (error); + } + len = cnp->cn_namelen; + isnq = (VFSTONFS(dvp->v_mount)->nm_flag & NFSMNT_NQNFS); + nfsstats.rpccnt[NFSPROC_MKDIR]++; + nfsm_reqhead(dvp, NFSPROC_MKDIR, + NFSX_FH+NFSX_UNSIGNED+nfsm_rndup(len)+NFSX_SATTR(isnq)); + nfsm_fhtom(dvp); + nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN); + nfsm_build(sp, struct nfsv2_sattr *, NFSX_SATTR(isnq)); + sp->sa_mode = vtonfs_mode(VDIR, vap->va_mode); + sp->sa_uid = txdr_unsigned(cnp->cn_cred->cr_uid); + sp->sa_gid = txdr_unsigned(vattr.va_gid); + if (isnq) { + quad_t qval = -1; + + txdr_hyper(&qval, &sp->sa_nqsize); + sp->sa_nqflags = 0; 
+ txdr_nqtime(&vap->va_atime, &sp->sa_nqatime); + txdr_nqtime(&vap->va_mtime, &sp->sa_nqmtime); + } else { + sp->sa_nfssize = -1; + txdr_nfstime(&vap->va_atime, &sp->sa_nfsatime); + txdr_nfstime(&vap->va_mtime, &sp->sa_nfsmtime); + } + nfsm_request(dvp, NFSPROC_MKDIR, cnp->cn_proc, cnp->cn_cred); + nfsm_mtofh(dvp, *vpp); + nfsm_reqdone; + VTONFS(dvp)->n_flag |= NMODIFIED; + VTONFS(dvp)->n_attrstamp = 0; + /* + * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry + * if we can succeed in looking up the directory. + * "firsttry" is necessary since the macros may "goto nfsmout" which + * is above the if on errors. (Ugh) + */ + if (error == EEXIST && firsttry) { + firsttry = 0; + error = 0; + nfsstats.rpccnt[NFSPROC_LOOKUP]++; + *vpp = NULL; + nfsm_reqhead(dvp, NFSPROC_LOOKUP, + NFSX_FH+NFSX_UNSIGNED+nfsm_rndup(len)); + nfsm_fhtom(dvp); + nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN); + nfsm_request(dvp, NFSPROC_LOOKUP, cnp->cn_proc, cnp->cn_cred); + nfsm_mtofh(dvp, *vpp); + if ((*vpp)->v_type != VDIR) { + vput(*vpp); + error = EEXIST; + } + m_freem(mrep); + } + FREE(cnp->cn_pnbuf, M_NAMEI); + vrele(dvp); + return (error); +} + +/* + * nfs remove directory call + */ +int +nfs_rmdir(ap) + struct vop_rmdir_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct vnode *dvp = ap->a_dvp; + register struct componentname *cnp = ap->a_cnp; + register u_long *tl; + register caddr_t cp; + register long t2; + caddr_t bpos, dpos; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + + if (dvp == vp) { + vrele(dvp); + vrele(dvp); + FREE(cnp->cn_pnbuf, M_NAMEI); + return (EINVAL); + } + nfsstats.rpccnt[NFSPROC_RMDIR]++; + nfsm_reqhead(dvp, NFSPROC_RMDIR, + NFSX_FH+NFSX_UNSIGNED+nfsm_rndup(cnp->cn_namelen)); + nfsm_fhtom(dvp); + nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN); + nfsm_request(dvp, NFSPROC_RMDIR, cnp->cn_proc, cnp->cn_cred); + 
nfsm_reqdone; + FREE(cnp->cn_pnbuf, M_NAMEI); + VTONFS(dvp)->n_flag |= NMODIFIED; + VTONFS(dvp)->n_attrstamp = 0; + cache_purge(dvp); + cache_purge(vp); + vrele(vp); + vrele(dvp); + /* + * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry. + */ + if (error == ENOENT) + error = 0; + return (error); +} + +/* + * nfs readdir call + * Although cookie is defined as opaque, I translate it to/from net byte + * order so that it looks more sensible. This appears consistent with the + * Ultrix implementation of NFS. + */ +int +nfs_readdir(ap) + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct nfsnode *np = VTONFS(vp); + register struct uio *uio = ap->a_uio; + int tresid, error; + struct vattr vattr; + + if (vp->v_type != VDIR) + return (EPERM); + /* + * First, check for hit on the EOF offset cache + */ + if (uio->uio_offset != 0 && uio->uio_offset == np->n_direofoffset && + (np->n_flag & NMODIFIED) == 0) { + if (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) { + if (NQNFS_CKCACHABLE(vp, NQL_READ)) { + nfsstats.direofcache_hits++; + return (0); + } + } else if (VOP_GETATTR(vp, &vattr, ap->a_cred, uio->uio_procp) == 0 && + np->n_mtime == vattr.va_mtime.ts_sec) { + nfsstats.direofcache_hits++; + return (0); + } + } + + /* + * Call nfs_bioread() to do the real work. + */ + tresid = uio->uio_resid; + error = nfs_bioread(vp, uio, 0, ap->a_cred); + + if (!error && uio->uio_resid == tresid) + nfsstats.direofcache_misses++; + return (error); +} + +/* + * Readdir rpc call. + * Called from below the buffer cache by nfs_doio(). 
+ */ +int +nfs_readdirrpc(vp, uiop, cred) + register struct vnode *vp; + struct uio *uiop; + struct ucred *cred; +{ + register long len; + register struct dirent *dp; + register u_long *tl; + register caddr_t cp; + register long t1; + long tlen, lastlen; + caddr_t bpos, dpos, cp2; + int error = 0; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct mbuf *md2; + caddr_t dpos2; + int siz; + int more_dirs = 1; + u_long off, savoff; + struct dirent *savdp; + struct nfsmount *nmp; + struct nfsnode *np = VTONFS(vp); + long tresid; + + nmp = VFSTONFS(vp->v_mount); + tresid = uiop->uio_resid; + /* + * Loop around doing readdir rpc's of size uio_resid or nm_rsize, + * whichever is smaller, truncated to a multiple of NFS_DIRBLKSIZ. + * The stopping criteria is EOF or buffer full. + */ + while (more_dirs && uiop->uio_resid >= NFS_DIRBLKSIZ) { + nfsstats.rpccnt[NFSPROC_READDIR]++; + nfsm_reqhead(vp, NFSPROC_READDIR, + NFSX_FH + 2 * NFSX_UNSIGNED); + nfsm_fhtom(vp); + nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); + off = (u_long)uiop->uio_offset; + *tl++ = txdr_unsigned(off); + *tl = txdr_unsigned(((uiop->uio_resid > nmp->nm_rsize) ? 
+ nmp->nm_rsize : uiop->uio_resid) & ~(NFS_DIRBLKSIZ-1)); + nfsm_request(vp, NFSPROC_READDIR, uiop->uio_procp, cred); + siz = 0; + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + more_dirs = fxdr_unsigned(int, *tl); + + /* Save the position so that we can do nfsm_mtouio() later */ + dpos2 = dpos; + md2 = md; + + /* loop thru the dir entries, doctoring them to 4bsd form */ +#ifdef lint + dp = (struct dirent *)0; +#endif /* lint */ + while (more_dirs && siz < uiop->uio_resid) { + savoff = off; /* Hold onto offset and dp */ + savdp = dp; + nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); + dp = (struct dirent *)tl; + dp->d_fileno = fxdr_unsigned(u_long, *tl++); + len = fxdr_unsigned(int, *tl); + if (len <= 0 || len > NFS_MAXNAMLEN) { + error = EBADRPC; + m_freem(mrep); + goto nfsmout; + } + dp->d_namlen = (u_char)len; + dp->d_type = DT_UNKNOWN; + nfsm_adv(len); /* Point past name */ + tlen = nfsm_rndup(len); + /* + * This should not be necessary, but some servers have + * broken XDR such that these bytes are not null filled. + */ + if (tlen != len) { + *dpos = '\0'; /* Null-terminate */ + nfsm_adv(tlen - len); + len = tlen; + } + nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); + off = fxdr_unsigned(u_long, *tl); + *tl++ = 0; /* Ensures null termination of name */ + more_dirs = fxdr_unsigned(int, *tl); + dp->d_reclen = len + 4 * NFSX_UNSIGNED; + siz += dp->d_reclen; + } + /* + * If at end of rpc data, get the eof boolean + */ + if (!more_dirs) { + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + more_dirs = (fxdr_unsigned(int, *tl) == 0); + + /* + * If at EOF, cache directory offset + */ + if (!more_dirs) + np->n_direofoffset = off; + } + /* + * If there is too much to fit in the data buffer, use savoff and + * savdp to trim off the last record. 
+ * --> we are not at eof + */ + if (siz > uiop->uio_resid) { + off = savoff; + siz -= dp->d_reclen; + dp = savdp; + more_dirs = 0; /* Paranoia */ + } + if (siz > 0) { + lastlen = dp->d_reclen; + md = md2; + dpos = dpos2; + nfsm_mtouio(uiop, siz); + uiop->uio_offset = (off_t)off; + } else + more_dirs = 0; /* Ugh, never happens, but in case.. */ + m_freem(mrep); + } + /* + * Fill last record, iff any, out to a multiple of NFS_DIRBLKSIZ + * by increasing d_reclen for the last record. + */ + if (uiop->uio_resid < tresid) { + len = uiop->uio_resid & (NFS_DIRBLKSIZ - 1); + if (len > 0) { + dp = (struct dirent *) + (uiop->uio_iov->iov_base - lastlen); + dp->d_reclen += len; + uiop->uio_iov->iov_base += len; + uiop->uio_iov->iov_len -= len; + uiop->uio_resid -= len; + } + } +nfsmout: + return (error); +} + +/* + * Nqnfs readdir_and_lookup RPC. Used in place of nfs_readdirrpc(). + */ +int +nfs_readdirlookrpc(vp, uiop, cred) + struct vnode *vp; + register struct uio *uiop; + struct ucred *cred; +{ + register int len; + register struct dirent *dp; + register u_long *tl; + register caddr_t cp; + register long t1; + caddr_t bpos, dpos, cp2; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + struct nameidata nami, *ndp = &nami; + struct componentname *cnp = &ndp->ni_cnd; + u_long off, endoff, fileno; + time_t reqtime, ltime; + struct nfsmount *nmp; + struct nfsnode *np; + struct vnode *newvp; + nfsv2fh_t *fhp; + u_quad_t frev; + int error = 0, tlen, more_dirs = 1, tresid, doit, bigenough, i; + int cachable; + + if (uiop->uio_iovcnt != 1) + panic("nfs rdirlook"); + nmp = VFSTONFS(vp->v_mount); + tresid = uiop->uio_resid; + ndp->ni_dvp = vp; + newvp = NULLVP; + /* + * Loop around doing readdir rpc's of size uio_resid or nm_rsize, + * whichever is smaller, truncated to a multiple of NFS_DIRBLKSIZ. + * The stopping criteria is EOF or buffer full. 
+ */ + while (more_dirs && uiop->uio_resid >= NFS_DIRBLKSIZ) { + nfsstats.rpccnt[NQNFSPROC_READDIRLOOK]++; + nfsm_reqhead(vp, NQNFSPROC_READDIRLOOK, + NFSX_FH + 3 * NFSX_UNSIGNED); + nfsm_fhtom(vp); + nfsm_build(tl, u_long *, 3 * NFSX_UNSIGNED); + off = (u_long)uiop->uio_offset; + *tl++ = txdr_unsigned(off); + *tl++ = txdr_unsigned(((uiop->uio_resid > nmp->nm_rsize) ? + nmp->nm_rsize : uiop->uio_resid) & ~(NFS_DIRBLKSIZ-1)); + if (nmp->nm_flag & NFSMNT_NQLOOKLEASE) + *tl = txdr_unsigned(nmp->nm_leaseterm); + else + *tl = 0; + reqtime = time.tv_sec; + nfsm_request(vp, NQNFSPROC_READDIRLOOK, uiop->uio_procp, cred); + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + more_dirs = fxdr_unsigned(int, *tl); + + /* loop thru the dir entries, doctoring them to 4bsd form */ + bigenough = 1; + while (more_dirs && bigenough) { + doit = 1; + nfsm_dissect(tl, u_long *, 4 * NFSX_UNSIGNED); + if (nmp->nm_flag & NFSMNT_NQLOOKLEASE) { + cachable = fxdr_unsigned(int, *tl++); + ltime = reqtime + fxdr_unsigned(int, *tl++); + fxdr_hyper(tl, &frev); + } + nfsm_dissect(fhp, nfsv2fh_t *, NFSX_FH); + if (!bcmp(VTONFS(vp)->n_fh.fh_bytes, (caddr_t)fhp, NFSX_FH)) { + VREF(vp); + newvp = vp; + np = VTONFS(vp); + } else { + if (error = nfs_nget(vp->v_mount, fhp, &np)) + doit = 0; + newvp = NFSTOV(np); + } + if (error = nfs_loadattrcache(&newvp, &md, &dpos, + (struct vattr *)0)) + doit = 0; + nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); + fileno = fxdr_unsigned(u_long, *tl++); + len = fxdr_unsigned(int, *tl); + if (len <= 0 || len > NFS_MAXNAMLEN) { + error = EBADRPC; + m_freem(mrep); + goto nfsmout; + } + tlen = (len + 4) & ~0x3; + if ((tlen + DIRHDSIZ) > uiop->uio_resid) + bigenough = 0; + if (bigenough && doit) { + dp = (struct dirent *)uiop->uio_iov->iov_base; + dp->d_fileno = fileno; + dp->d_namlen = len; + dp->d_reclen = tlen + DIRHDSIZ; + dp->d_type = + IFTODT(VTTOIF(np->n_vattr.va_type)); + uiop->uio_resid -= DIRHDSIZ; + uiop->uio_iov->iov_base += DIRHDSIZ; + uiop->uio_iov->iov_len -= 
DIRHDSIZ; + cnp->cn_nameptr = uiop->uio_iov->iov_base; + cnp->cn_namelen = len; + ndp->ni_vp = newvp; + nfsm_mtouio(uiop, len); + cp = uiop->uio_iov->iov_base; + tlen -= len; + for (i = 0; i < tlen; i++) + *cp++ = '\0'; + uiop->uio_iov->iov_base += tlen; + uiop->uio_iov->iov_len -= tlen; + uiop->uio_resid -= tlen; + cnp->cn_hash = 0; + for (cp = cnp->cn_nameptr, i = 1; i <= len; i++, cp++) + cnp->cn_hash += (unsigned char)*cp * i; + if ((nmp->nm_flag & NFSMNT_NQLOOKLEASE) && + ltime > time.tv_sec) + nqnfs_clientlease(nmp, np, NQL_READ, + cachable, ltime, frev); + if (cnp->cn_namelen <= NCHNAMLEN) + cache_enter(ndp->ni_dvp, ndp->ni_vp, cnp); + } else { + nfsm_adv(nfsm_rndup(len)); + } + if (newvp != NULLVP) { + vrele(newvp); + newvp = NULLVP; + } + nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); + if (bigenough) + endoff = off = fxdr_unsigned(u_long, *tl++); + else + endoff = fxdr_unsigned(u_long, *tl++); + more_dirs = fxdr_unsigned(int, *tl); + } + /* + * If at end of rpc data, get the eof boolean + */ + if (!more_dirs) { + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + more_dirs = (fxdr_unsigned(int, *tl) == 0); + + /* + * If at EOF, cache directory offset + */ + if (!more_dirs) + VTONFS(vp)->n_direofoffset = endoff; + } + if (uiop->uio_resid < tresid) + uiop->uio_offset = (off_t)off; + else + more_dirs = 0; + m_freem(mrep); + } + /* + * Fill last record, iff any, out to a multiple of NFS_DIRBLKSIZ + * by increasing d_reclen for the last record. + */ + if (uiop->uio_resid < tresid) { + len = uiop->uio_resid & (NFS_DIRBLKSIZ - 1); + if (len > 0) { + dp->d_reclen += len; + uiop->uio_iov->iov_base += len; + uiop->uio_iov->iov_len -= len; + uiop->uio_resid -= len; + } + } +nfsmout: + if (newvp != NULLVP) + vrele(newvp); + return (error); +} +static char hextoasc[] = "0123456789abcdef"; + +/* + * Silly rename. 
To make the NFS filesystem that is stateless look a little + * more like the "ufs" a remove of an active vnode is translated to a rename + * to a funny looking filename that is removed by nfs_inactive on the + * nfsnode. There is the potential for another process on a different client + * to create the same funny name between the nfs_lookitup() fails and the + * nfs_rename() completes, but... + */ +int +nfs_sillyrename(dvp, vp, cnp) + struct vnode *dvp, *vp; + struct componentname *cnp; +{ + register struct nfsnode *np; + register struct sillyrename *sp; + int error; + short pid; + + cache_purge(dvp); + np = VTONFS(vp); +#ifdef SILLYSEPARATE + MALLOC(sp, struct sillyrename *, sizeof (struct sillyrename), + M_NFSREQ, M_WAITOK); +#else + sp = &np->n_silly; +#endif + sp->s_cred = crdup(cnp->cn_cred); + sp->s_dvp = dvp; + VREF(dvp); + + /* Fudge together a funny name */ + pid = cnp->cn_proc->p_pid; + bcopy(".nfsAxxxx4.4", sp->s_name, 13); + sp->s_namlen = 12; + sp->s_name[8] = hextoasc[pid & 0xf]; + sp->s_name[7] = hextoasc[(pid >> 4) & 0xf]; + sp->s_name[6] = hextoasc[(pid >> 8) & 0xf]; + sp->s_name[5] = hextoasc[(pid >> 12) & 0xf]; + + /* Try lookitups until we get one that isn't there */ + while (nfs_lookitup(sp, (nfsv2fh_t *)0, cnp->cn_proc) == 0) { + sp->s_name[4]++; + if (sp->s_name[4] > 'z') { + error = EINVAL; + goto bad; + } + } + if (error = nfs_renameit(dvp, cnp, sp)) + goto bad; + nfs_lookitup(sp, &np->n_fh, cnp->cn_proc); + np->n_sillyrename = sp; + return (0); +bad: + vrele(sp->s_dvp); + crfree(sp->s_cred); +#ifdef SILLYSEPARATE + free((caddr_t)sp, M_NFSREQ); +#endif + return (error); +} + +/* + * Look up a file name for silly rename stuff. + * Just like nfs_lookup() except that it doesn't load returned values + * into the nfsnode table. 
+ * If fhp != NULL it copies the returned file handle out + */ +int +nfs_lookitup(sp, fhp, procp) + register struct sillyrename *sp; + nfsv2fh_t *fhp; + struct proc *procp; +{ + register struct vnode *vp = sp->s_dvp; + register u_long *tl; + register caddr_t cp; + register long t1, t2; + caddr_t bpos, dpos, cp2; + int error = 0, isnq; + struct mbuf *mreq, *mrep, *md, *mb, *mb2; + long len; + + isnq = (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS); + nfsstats.rpccnt[NFSPROC_LOOKUP]++; + len = sp->s_namlen; + nfsm_reqhead(vp, NFSPROC_LOOKUP, NFSX_FH+NFSX_UNSIGNED+nfsm_rndup(len)); + if (isnq) { + nfsm_build(tl, u_long *, NFSX_UNSIGNED); + *tl = 0; + } + nfsm_fhtom(vp); + nfsm_strtom(sp->s_name, len, NFS_MAXNAMLEN); + nfsm_request(vp, NFSPROC_LOOKUP, procp, sp->s_cred); + if (fhp != NULL) { + if (isnq) + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + nfsm_dissect(cp, caddr_t, NFSX_FH); + bcopy(cp, (caddr_t)fhp, NFSX_FH); + } + nfsm_reqdone; + return (error); +} + +/* + * Kludge City.. + * - make nfs_bmap() essentially a no-op that does no translation + * - do nfs_strategy() by faking physical I/O with nfs_readrpc/nfs_writerpc + * after mapping the physical addresses into Kernel Virtual space in the + * nfsiobuf area. + * (Maybe I could use the process's page mapping, but I was concerned that + * Kernel Write might not be enabled and also figured copyout() would do + * a lot more work than bcopy() and also it currently happens in the + * context of the swapper process (2). + */ +int +nfs_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + + if (ap->a_vpp != NULL) + *ap->a_vpp = vp; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn * btodb(vp->v_mount->mnt_stat.f_iosize); + return (0); +} + +/* + * Strategy routine. 
+ * For async requests when nfsiod(s) are running, queue the request by + * calling nfs_asyncio(), otherwise just all nfs_doio() to do the + * request. + */ +int +nfs_strategy(ap) + struct vop_strategy_args *ap; +{ + register struct buf *bp = ap->a_bp; + struct ucred *cr; + struct proc *p; + int error = 0; + + if (bp->b_flags & B_PHYS) + panic("nfs physio"); + if (bp->b_flags & B_ASYNC) + p = (struct proc *)0; + else + p = curproc; /* XXX */ + if (bp->b_flags & B_READ) + cr = bp->b_rcred; + else + cr = bp->b_wcred; + /* + * If the op is asynchronous and an i/o daemon is waiting + * queue the request, wake it up and wait for completion + * otherwise just do it ourselves. + */ + if ((bp->b_flags & B_ASYNC) == 0 || + nfs_asyncio(bp, NOCRED)) + error = nfs_doio(bp, cr, p); + return (error); +} + +/* + * Mmap a file + * + * NB Currently unsupported. + */ +/* ARGSUSED */ +int +nfs_mmap(ap) + struct vop_mmap_args /* { + struct vnode *a_vp; + int a_fflags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + return (EINVAL); +} + +/* + * Flush all the blocks associated with a vnode. + * Walk through the buffer pool and push any dirty pages + * associated with the vnode. 
+ */ +/* ARGSUSED */ +int +nfs_fsync(ap) + struct vop_fsync_args /* { + struct vnodeop_desc *a_desc; + struct vnode * a_vp; + struct ucred * a_cred; + int a_waitfor; + struct proc * a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct nfsnode *np = VTONFS(vp); + register struct buf *bp; + struct buf *nbp; + struct nfsmount *nmp; + int s, error = 0, slptimeo = 0, slpflag = 0; + + nmp = VFSTONFS(vp->v_mount); + if (nmp->nm_flag & NFSMNT_INT) + slpflag = PCATCH; +loop: + s = splbio(); + for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { + nbp = bp->b_vnbufs.le_next; + if (bp->b_flags & B_BUSY) { + if (ap->a_waitfor != MNT_WAIT) + continue; + bp->b_flags |= B_WANTED; + error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1), + "nfsfsync", slptimeo); + splx(s); + if (error) { + if (nfs_sigintr(nmp, (struct nfsreq *)0, ap->a_p)) + return (EINTR); + if (slpflag == PCATCH) { + slpflag = 0; + slptimeo = 2 * hz; + } + } + goto loop; + } + if ((bp->b_flags & B_DELWRI) == 0) + panic("nfs_fsync: not dirty"); + bremfree(bp); + bp->b_flags |= B_BUSY; + splx(s); + bp->b_flags |= B_ASYNC; + VOP_BWRITE(bp); + goto loop; + } + splx(s); + if (ap->a_waitfor == MNT_WAIT) { + while (vp->v_numoutput) { + vp->v_flag |= VBWAIT; + error = tsleep((caddr_t)&vp->v_numoutput, + slpflag | (PRIBIO + 1), "nfsfsync", slptimeo); + if (error) { + if (nfs_sigintr(nmp, (struct nfsreq *)0, ap->a_p)) + return (EINTR); + if (slpflag == PCATCH) { + slpflag = 0; + slptimeo = 2 * hz; + } + } + } + if (vp->v_dirtyblkhd.lh_first) { +#ifdef DIAGNOSTIC + vprint("nfs_fsync: dirty", vp); +#endif + goto loop; + } + } + if (np->n_flag & NWRITEERR) { + error = np->n_error; + np->n_flag &= ~NWRITEERR; + } + return (error); +} + +/* + * Return POSIX pathconf information applicable to nfs. + * + * Currently the NFS protocol does not support getting such + * information from the remote server. 
+ */ +/* ARGSUSED */ +nfs_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + + return (EINVAL); +} + +/* + * NFS advisory byte-level locks. + * Currently unsupported. + */ +int +nfs_advlock(ap) + struct vop_advlock_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap; +{ + + return (EOPNOTSUPP); +} + +/* + * Print out the contents of an nfsnode. + */ +int +nfs_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct nfsnode *np = VTONFS(vp); + + printf("tag VT_NFS, fileid %d fsid 0x%x", + np->n_vattr.va_fileid, np->n_vattr.va_fsid); +#ifdef FIFO + if (vp->v_type == VFIFO) + fifo_printinfo(vp); +#endif /* FIFO */ + printf("\n"); +} + +/* + * NFS directory offset lookup. + * Currently unsupported. + */ +int +nfs_blkatoff(ap) + struct vop_blkatoff_args /* { + struct vnode *a_vp; + off_t a_offset; + char **a_res; + struct buf **a_bpp; + } */ *ap; +{ + + return (EOPNOTSUPP); +} + +/* + * NFS flat namespace allocation. + * Currently unsupported. + */ +int +nfs_valloc(ap) + struct vop_valloc_args /* { + struct vnode *a_pvp; + int a_mode; + struct ucred *a_cred; + struct vnode **a_vpp; + } */ *ap; +{ + + return (EOPNOTSUPP); +} + +/* + * NFS flat namespace free. + * Currently unsupported. + */ +int +nfs_vfree(ap) + struct vop_vfree_args /* { + struct vnode *a_pvp; + ino_t a_ino; + int a_mode; + } */ *ap; +{ + + return (EOPNOTSUPP); +} + +/* + * NFS file truncation. + */ +int +nfs_truncate(ap) + struct vop_truncate_args /* { + struct vnode *a_vp; + off_t a_length; + int a_flags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + /* Use nfs_setattr */ + printf("nfs_truncate: need to implement!!"); + return (EOPNOTSUPP); +} + +/* + * NFS update. 
+ */ +int +nfs_update(ap) + struct vop_update_args /* { + struct vnode *a_vp; + struct timeval *a_ta; + struct timeval *a_tm; + int a_waitfor; + } */ *ap; +{ + + /* Use nfs_setattr */ + printf("nfs_update: need to implement!!"); + return (EOPNOTSUPP); +} + +/* + * nfs special file access vnode op. + * Essentially just get vattr and then imitate iaccess() since the device is + * local to the client. + */ +int +nfsspec_access(ap) + struct vop_access_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vattr *vap; + register gid_t *gp; + register struct ucred *cred = ap->a_cred; + mode_t mode = ap->a_mode; + struct vattr vattr; + register int i; + int error; + + /* + * If you're the super-user, + * you always get access. + */ + if (cred->cr_uid == 0) + return (0); + vap = &vattr; + if (error = VOP_GETATTR(ap->a_vp, vap, cred, ap->a_p)) + return (error); + /* + * Access check is based on only one of owner, group, public. + * If not owner, then check group. If not a member of the + * group, then check public access. + */ + if (cred->cr_uid != vap->va_uid) { + mode >>= 3; + gp = cred->cr_groups; + for (i = 0; i < cred->cr_ngroups; i++, gp++) + if (vap->va_gid == *gp) + goto found; + mode >>= 3; +found: + ; + } + return ((vap->va_mode & mode) == mode ? 0 : EACCES); +} + +/* + * Read wrapper for special devices. + */ +int +nfsspec_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + register struct nfsnode *np = VTONFS(ap->a_vp); + + /* + * Set access flag. + */ + np->n_flag |= NACC; + np->n_atim = time; + return (VOCALL(spec_vnodeop_p, VOFFSET(vop_read), ap)); +} + +/* + * Write wrapper for special devices. 
+ */ +int +nfsspec_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + register struct nfsnode *np = VTONFS(ap->a_vp); + + /* + * Set update flag. + */ + np->n_flag |= NUPD; + np->n_mtim = time; + return (VOCALL(spec_vnodeop_p, VOFFSET(vop_write), ap)); +} + +/* + * Close wrapper for special devices. + * + * Update the times on the nfsnode then do device close. + */ +int +nfsspec_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct nfsnode *np = VTONFS(vp); + struct vattr vattr; + + if (np->n_flag & (NACC | NUPD)) { + np->n_flag |= NCHG; + if (vp->v_usecount == 1 && + (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { + VATTR_NULL(&vattr); + if (np->n_flag & NACC) { + vattr.va_atime.ts_sec = np->n_atim.tv_sec; + vattr.va_atime.ts_nsec = + np->n_atim.tv_usec * 1000; + } + if (np->n_flag & NUPD) { + vattr.va_mtime.ts_sec = np->n_mtim.tv_sec; + vattr.va_mtime.ts_nsec = + np->n_mtim.tv_usec * 1000; + } + (void)VOP_SETATTR(vp, &vattr, ap->a_cred, ap->a_p); + } + } + return (VOCALL(spec_vnodeop_p, VOFFSET(vop_close), ap)); +} + +#ifdef FIFO +/* + * Read wrapper for fifos. + */ +int +nfsfifo_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + extern int (**fifo_vnodeop_p)(); + register struct nfsnode *np = VTONFS(ap->a_vp); + + /* + * Set access flag. + */ + np->n_flag |= NACC; + np->n_atim = time; + return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), ap)); +} + +/* + * Write wrapper for fifos. + */ +int +nfsfifo_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + extern int (**fifo_vnodeop_p)(); + register struct nfsnode *np = VTONFS(ap->a_vp); + + /* + * Set update flag. 
+ */ + np->n_flag |= NUPD; + np->n_mtim = time; + return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), ap)); +} + +/* + * Close wrapper for fifos. + * + * Update the times on the nfsnode then do fifo close. + */ +int +nfsfifo_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct nfsnode *np = VTONFS(vp); + struct vattr vattr; + extern int (**fifo_vnodeop_p)(); + + if (np->n_flag & (NACC | NUPD)) { + if (np->n_flag & NACC) + np->n_atim = time; + if (np->n_flag & NUPD) + np->n_mtim = time; + np->n_flag |= NCHG; + if (vp->v_usecount == 1 && + (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { + VATTR_NULL(&vattr); + if (np->n_flag & NACC) { + vattr.va_atime.ts_sec = np->n_atim.tv_sec; + vattr.va_atime.ts_nsec = + np->n_atim.tv_usec * 1000; + } + if (np->n_flag & NUPD) { + vattr.va_mtime.ts_sec = np->n_mtim.tv_sec; + vattr.va_mtime.ts_nsec = + np->n_mtim.tv_usec * 1000; + } + (void)VOP_SETATTR(vp, &vattr, ap->a_cred, ap->a_p); + } + } + return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), ap)); +} +#endif /* FIFO */ diff --git a/sys/nfsclient/nfsargs.h b/sys/nfsclient/nfsargs.h new file mode 100644 index 00000000000..261fd42657a --- /dev/null +++ b/sys/nfsclient/nfsargs.h @@ -0,0 +1,297 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)nfs.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Tunable constants for nfs + */ + +#define NFS_MAXIOVEC 34 +#define NFS_HZ 25 /* Ticks per second for NFS timeouts */ +#define NFS_TIMEO (1*NFS_HZ) /* Default timeout = 1 second */ +#define NFS_MINTIMEO (1*NFS_HZ) /* Min timeout to use */ +#define NFS_MAXTIMEO (60*NFS_HZ) /* Max timeout to backoff to */ +#define NFS_MINIDEMTIMEO (5*NFS_HZ) /* Min timeout for non-idempotent ops*/ +#define NFS_MAXREXMIT 100 /* Stop counting after this many */ +#define NFS_MAXWINDOW 1024 /* Max number of outstanding requests */ +#define NFS_RETRANS 10 /* Num of retrans for soft mounts */ +#define NFS_MAXGRPS 16 /* Max. size of groups list */ +#define NFS_MINATTRTIMO 5 /* Attribute cache timeout in sec */ +#define NFS_MAXATTRTIMO 60 +#define NFS_WSIZE 8192 /* Def. write data size <= 8192 */ +#define NFS_RSIZE 8192 /* Def. read data size <= 8192 */ +#define NFS_DEFRAHEAD 1 /* Def. read ahead # blocks */ +#define NFS_MAXRAHEAD 4 /* Max. read ahead # blocks */ +#define NFS_MAXREADDIR NFS_MAXDATA /* Max. size of directory read */ +#define NFS_MAXUIDHASH 64 /* Max. # of hashed uid entries/mp */ +#define NFS_MAXASYNCDAEMON 20 /* Max. number async_daemons runable */ +#define NFS_DIRBLKSIZ 1024 /* Size of an NFS directory block */ +#define NMOD(a) ((a) % nfs_asyncdaemons) + +/* + * Set the attribute timeout based on how recently the file has been modified. + */ +#define NFS_ATTRTIMEO(np) \ + ((((np)->n_flag & NMODIFIED) || \ + (time.tv_sec - (np)->n_mtime) / 10 < NFS_MINATTRTIMO) ? NFS_MINATTRTIMO : \ + ((time.tv_sec - (np)->n_mtime) / 10 > NFS_MAXATTRTIMO ? NFS_MAXATTRTIMO : \ + (time.tv_sec - (np)->n_mtime) / 10)) + +/* + * Structures for the nfssvc(2) syscall. Not that anyone but nfsd and mount_nfs + * should ever try and use it. 
+ */ +struct nfsd_args { + int sock; /* Socket to serve */ + caddr_t name; /* Client address for connection based sockets */ + int namelen; /* Length of name */ +}; + +struct nfsd_srvargs { + struct nfsd *nsd_nfsd; /* Pointer to in kernel nfsd struct */ + uid_t nsd_uid; /* Effective uid mapped to cred */ + u_long nsd_haddr; /* Ip address of client */ + struct ucred nsd_cr; /* Cred. uid maps to */ + int nsd_authlen; /* Length of auth string (ret) */ + char *nsd_authstr; /* Auth string (ret) */ +}; + +struct nfsd_cargs { + char *ncd_dirp; /* Mount dir path */ + uid_t ncd_authuid; /* Effective uid */ + int ncd_authtype; /* Type of authenticator */ + int ncd_authlen; /* Length of authenticator string */ + char *ncd_authstr; /* Authenticator string */ +}; + +/* + * Stats structure + */ +struct nfsstats { + int attrcache_hits; + int attrcache_misses; + int lookupcache_hits; + int lookupcache_misses; + int direofcache_hits; + int direofcache_misses; + int biocache_reads; + int read_bios; + int read_physios; + int biocache_writes; + int write_bios; + int write_physios; + int biocache_readlinks; + int readlink_bios; + int biocache_readdirs; + int readdir_bios; + int rpccnt[NFS_NPROCS]; + int rpcretries; + int srvrpccnt[NFS_NPROCS]; + int srvrpc_errs; + int srv_errs; + int rpcrequests; + int rpctimeouts; + int rpcunexpected; + int rpcinvalid; + int srvcache_inproghits; + int srvcache_idemdonehits; + int srvcache_nonidemdonehits; + int srvcache_misses; + int srvnqnfs_leases; + int srvnqnfs_maxleases; + int srvnqnfs_getleases; +}; + +/* + * Flags for nfssvc() system call. + */ +#define NFSSVC_BIOD 0x002 +#define NFSSVC_NFSD 0x004 +#define NFSSVC_ADDSOCK 0x008 +#define NFSSVC_AUTHIN 0x010 +#define NFSSVC_GOTAUTH 0x040 +#define NFSSVC_AUTHINFAIL 0x080 +#define NFSSVC_MNTD 0x100 + +/* + * The set of signals the interrupt an I/O in progress for NFSMNT_INT mounts. 
+ * What should be in this set is open to debate, but I believe that since
+ * I/O system calls on ufs are never interrupted by signals the set should
+ * be minimal. My reasoning is that many current programs that use signals
+ * such as SIGALRM will not expect file I/O system calls to be interrupted
+ * by them and will break.
+ */
+#ifdef KERNEL
+#define	NFSINT_SIGMASK	(sigmask(SIGINT)|sigmask(SIGTERM)|sigmask(SIGKILL)| \
+			 sigmask(SIGHUP)|sigmask(SIGQUIT))
+
+/*
+ * Socket errors ignored for connectionless sockets??
+ * For now, ignore them all (except EINTR, ERESTART and EWOULDBLOCK)
+ */
+#define	NFSIGNORE_SOERROR(s, e) /* s: protocol flags word, e: error number */ \
+		((e) != EINTR && (e) != ERESTART && (e) != EWOULDBLOCK && \
+		((s) & PR_CONNREQUIRED) == 0)	/* only when PR_CONNREQUIRED is clear in s */
+
+/*
+ * Nfs outstanding request list element
+ */
+struct nfsreq {
+	struct nfsreq	*r_next;
+	struct nfsreq	*r_prev;
+	struct mbuf	*r_mreq;
+	struct mbuf	*r_mrep;
+	struct mbuf	*r_md;
+	caddr_t		r_dpos;
+	struct nfsmount *r_nmp;
+	struct vnode	*r_vp;
+	u_long		r_xid;
+	int		r_flags;	/* flags on request, see below */
+	int		r_retry;	/* max retransmission count */
+	int		r_rexmit;	/* current retrans count */
+	int		r_timer;	/* tick counter on reply */
+	int		r_procnum;	/* NFS procedure number */
+	int		r_rtt;		/* RTT for rpc */
+	struct proc	*r_procp;	/* Proc that did I/O system call */
+};
+
+/* Flag values for r_flags */
+#define	R_TIMING	0x01		/* timing request (in mntp) */
+#define	R_SENT		0x02		/* request has been sent */
+#define	R_SOFTTERM	0x04		/* soft mnt, too many retries */
+#define	R_INTR		0x08		/* intr mnt, signal pending */
+#define	R_SOCKERR	0x10		/* Fatal error on socket */
+#define	R_TPRINTFMSG	0x20		/* Did a tprintf msg. */
+#define	R_MUSTRESEND	0x40		/* Must resend request */
+#define	R_GETONEREP	0x80		/* Probe for one reply only */
+
+struct nfsstats nfsstats;	/* NOTE(review): object defined in a header; every includer gets a tentative definition */
+
+/*
+ * A list of nfssvc_sock structures is maintained with all the sockets
+ * that require service by the nfsd.
+ * The nfsuid structs hang off of the nfssvc_sock structs in both lru
+ * and uid hash lists.
+ */
+#define	NUIDHASHSIZ	32
+#define	NUIDHASH(uid)	((uid) & (NUIDHASHSIZ - 1))	/* low bits of uid; works because NUIDHASHSIZ is a power of 2 */
+
+/*
+ * Network address hash list element
+ */
+union nethostaddr {
+	u_long had_inetaddr;
+	struct mbuf *had_nam;
+};
+
+struct nfsuid {
+	struct nfsuid	*nu_lrunext;	/* MUST be first */
+	struct nfsuid	*nu_lruprev;
+	struct nfsuid	*nu_hnext;
+	struct nfsuid	*nu_hprev;
+	int		nu_flag;	/* Flags (NU_ bits below) */
+	uid_t		nu_uid;		/* Uid mapped by this entry */
+	union nethostaddr nu_haddr;	/* Host addr. for dgram sockets */
+	struct ucred	nu_cr;		/* Cred uid mapped to */
+};
+
+#define	nu_inetaddr	nu_haddr.had_inetaddr
+#define	nu_nam		nu_haddr.had_nam
+/* Bits for nu_flag */
+#define	NU_INETADDR	0x1	/* presumably: nu_haddr holds had_inetaddr -- confirm */
+
+struct nfssvc_sock {
+	struct nfsuid	*ns_lrunext;	/* MUST be first */
+	struct nfsuid	*ns_lruprev;	/* same leading two members as struct nfsuid */
+	struct nfssvc_sock *ns_next;
+	struct nfssvc_sock *ns_prev;
+	int		ns_flag;	/* SLP_ bits below */
+	u_long		ns_sref;
+	struct file	*ns_fp;
+	struct socket	*ns_so;
+	int		ns_solock;
+	struct mbuf	*ns_nam;
+	int		ns_cc;
+	struct mbuf	*ns_raw;
+	struct mbuf	*ns_rawend;
+	int		ns_reclen;
+	struct mbuf	*ns_rec;
+	struct mbuf	*ns_recend;
+	int		ns_numuids;
+	struct nfsuid	*ns_uidh[NUIDHASHSIZ];	/* NUIDHASHSIZ buckets; presumably indexed by NUIDHASH() */
+};
+
+/* Bits for "ns_flag" */
+#define	SLP_VALID	0x01
+#define	SLP_DOREC	0x02
+#define	SLP_NEEDQ	0x04
+#define	SLP_DISCONN	0x08
+#define	SLP_GETSTREAM	0x10
+#define	SLP_INIT	0x20
+#define	SLP_WANTINIT	0x40
+
+#define	SLP_ALLFLAGS	0xff	/* covers every SLP_ bit above plus the unassigned 0x80 */
+
+/*
+ * One of these structures is allocated for each nfsd.
+ */
+struct nfsd {
+	struct nfsd	*nd_next;	/* Must be first */
+	struct nfsd	*nd_prev;
+	int		nd_flag;	/* NFSD_ flags */
+	struct nfssvc_sock *nd_slp;	/* Current socket */
+	struct mbuf	*nd_nam;	/* Client addr for datagram req. */
+	struct mbuf	*nd_mrep;	/* Req. mbuf list */
+	struct mbuf	*nd_md;
+	caddr_t		nd_dpos;	/* Position in list */
+	int		nd_procnum;	/* RPC procedure number */
+	u_long		nd_retxid;	/* RPC xid */
+	int		nd_repstat;	/* Reply status value */
+	struct ucred	nd_cr;		/* Credentials for req. */
+	int		nd_nqlflag;	/* Leasing flag */
+	int		nd_duration;	/* Lease duration */
+	int		nd_authlen;	/* Authenticator len */
+	u_char		nd_authstr[RPCAUTH_MAXSIZ]; /* Authenticator data */
+	struct proc	*nd_procp;	/* Proc ptr */
+};
+
+#define	NFSD_WAITING	0x01	/* NFSD_ bits for nd_flag */
+#define	NFSD_CHECKSLP	0x02
+#define	NFSD_REQINPROG	0x04
+#define	NFSD_NEEDAUTH	0x08
+#define	NFSD_AUTHFAIL	0x10
+#endif /* KERNEL */
diff --git a/sys/nfsclient/nfsdiskless.h b/sys/nfsclient/nfsdiskless.h
new file mode 100644
index 00000000000..74e6b7bca43
--- /dev/null
+++ b/sys/nfsclient/nfsdiskless.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Rick Macklem at The University of Guelph.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfsdiskless.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Structure that must be initialized for a diskless nfs client. + * This structure is used by nfs_mountroot() to set up the root and swap + * vnodes plus do a partial ifconfig(8) and route(8) so that the critical net + * interface can communicate with the server. + * The primary bootstrap is expected to fill in the appropriate fields before + * starting vmunix. Whether or not the swap area is nfs mounted is determined + * by the value in swdevt[0]. (equal to NODEV --> swap over nfs) + * Currently only works for AF_INET protocols. + * NB: All fields are stored in net byte order to avoid hassles with + * client/server byte ordering differences. 
+ */
+struct nfs_diskless {
+	struct ifaliasreq myif;			/* Default interface */
+	struct sockaddr_in mygateway;		/* Default gateway */
+	struct nfs_args	swap_args;		/* Mount args for swap file */
+	u_char		swap_fh[NFS_FHSIZE];	/* Swap file's file handle */
+	struct sockaddr_in swap_saddr;		/* Address of swap server */
+	char		swap_hostnam[MNAMELEN];	/* Host name for mount pt */
+	int		swap_nblks;		/* Size of server swap file (in blocks, judging by the name -- confirm) */
+	struct ucred	swap_ucred;		/* Swap credentials */
+	struct nfs_args	root_args;		/* Mount args for root fs */
+	u_char		root_fh[NFS_FHSIZE];	/* File handle of root dir */
+	struct sockaddr_in root_saddr;		/* Address of root server */
+	char		root_hostnam[MNAMELEN];	/* Host name for mount pt */
+	long		root_time;		/* Timestamp of root fs */
+	char		my_hostnam[MAXHOSTNAMELEN]; /* Client host name */
+};
diff --git a/sys/nfsclient/nfsm_subs.h b/sys/nfsclient/nfsm_subs.h
new file mode 100644
index 00000000000..879db360057
--- /dev/null
+++ b/sys/nfsclient/nfsm_subs.h
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Rick Macklem at The University of Guelph.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4.
Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)nfsm_subs.h	8.1 (Berkeley) 6/16/93
+ */
+
+/*
+ * These macros do strange and peculiar things to mbuf chains for
+ * the assistance of the nfs code. To attempt to use them for any
+ * other purpose will be dangerous. (they make weird assumptions)
+ */
+
+/*
+ * First define what the actual subs. return
+ */
+extern struct mbuf *nfsm_reqh();	/* old-style declaration: no parameter types given */
+
+#define	M_HASCL(m)	((m)->m_flags & M_EXT)	/* nonzero when M_EXT is set in m_flags */
+#define	NFSMINOFF(m) /* point m_data at ext_buf, m_pktdat or m_dat; a bare if statement, the caller adds the ';' */ \
+		if (M_HASCL(m)) \
+			(m)->m_data = (m)->m_ext.ext_buf; \
+		else if ((m)->m_flags & M_PKTHDR) \
+			(m)->m_data = (m)->m_pktdat; \
+		else \
+			(m)->m_data = (m)->m_dat
+#define	NFSMADV(m, s)	(m)->m_data += (s)	/* advance m_data by s */
+#define	NFSMSIZ(m)	((M_HASCL(m))?MCLBYTES: /* M_EXT set */ \
+				(((m)->m_flags & M_PKTHDR)?MHLEN:MLEN))	/* else MHLEN or MLEN */
+
+/*
+ * Now for the macros that do the simple stuff and call the functions
+ * for the hard stuff.
+ * These macros use several vars. declared in nfsm_reqhead and these
+ * vars. must not be used elsewhere unless you are careful not to corrupt
+ * them. The vars. starting with pN and tN (N=1,2,3,..)
are temporaries
+ * that may be used so long as the value is not expected to be retained
+ * after a macro.
+ * I know, this is kind of dorky, but it makes the actual op functions
+ * fairly clean and deals with the mess caused by the xdr discriminating
+ * unions.
+ */
+
+#define	nfsm_build(a,c,s) /* a = (c) pointer to s bytes reserved at bpos in mb */ \
+		{ if ((s) > M_TRAILINGSPACE(mb)) { /* no room left: chain a new mbuf */ \
+			MGET(mb2, M_WAIT, MT_DATA); \
+			if ((s) > MLEN) \
+				panic("build > MLEN"); \
+			mb->m_next = mb2; \
+			mb = mb2; \
+			mb->m_len = 0; \
+			bpos = mtod(mb, caddr_t); \
+		} \
+		(a) = (c)(bpos); \
+		mb->m_len += (s); \
+		bpos += (s); }
+
+#define	nfsm_dissect(a,c,s) /* a = (c) pointer to the next s reply bytes; on error frees mrep, goto nfsmout */ \
+		{ t1 = mtod(md, caddr_t)+md->m_len-dpos; /* bytes left in md after dpos */ \
+		if (t1 >= (s)) { \
+			(a) = (c)(dpos); \
+			dpos += (s); \
+		} else if (error = nfsm_disct(&md, &dpos, (s), t1, &cp2)) { \
+			m_freem(mrep); \
+			goto nfsmout; \
+		} else { \
+			(a) = (c)cp2; /* nfsm_disct() returned the data pointer in cp2 */ \
+		} }
+
+#define nfsm_fhtom(v) /* append the NFSX_FH-byte file handle of vnode v; two statements, caller adds the ';' */ \
+		nfsm_build(cp,caddr_t,NFSX_FH); \
+		bcopy((caddr_t)&(VTONFS(v)->n_fh), cp, NFSX_FH)
+
+#define nfsm_srvfhtom(f) /* append the NFSX_FH bytes found at f */ \
+		nfsm_build(cp,caddr_t,NFSX_FH); \
+		bcopy((caddr_t)(f), cp, NFSX_FH)
+
+#define nfsm_mtofh(d,v) /* dissect a file handle, nfs_nget() it on d's mount, set v and load its attributes */ \
+		{ struct nfsnode *np; nfsv2fh_t *fhp; \
+		nfsm_dissect(fhp,nfsv2fh_t *,NFSX_FH); \
+		if (error = nfs_nget((d)->v_mount, fhp, &np)) { \
+			m_freem(mrep); \
+			goto nfsmout; \
+		} \
+		(v) = NFSTOV(np); \
+		nfsm_loadattr(v, (struct vattr *)0); \
+		}
+
+#define	nfsm_loadattr(v,a) /* nfs_loadattrcache() on v; v is reassigned from the vnode pointer handed back */ \
+		{ struct vnode *tvp = (v); \
+		if (error = nfs_loadattrcache(&tvp, &md, &dpos, (a))) { \
+			m_freem(mrep); \
+			goto nfsmout; \
+		} \
+		(v) = tvp; }
+
+#define	nfsm_strsiz(s,m) /* dissect a length into s; EBADRPC when s > m (s <= 0 is not rejected here) */ \
+		{ nfsm_dissect(tl,u_long *,NFSX_UNSIGNED); \
+		if (((s) = fxdr_unsigned(long,*tl)) > (m)) { \
+			m_freem(mrep); \
+			error = EBADRPC; \
+			goto nfsmout; \
+		} }
+
+#define	nfsm_srvstrsiz(s,m) /* server variant: also rejects s <= 0 and answers through nfsm_reply(0) */ \
+		{ nfsm_dissect(tl,u_long *,NFSX_UNSIGNED); \
+		if (((s) = fxdr_unsigned(long,*tl)) > (m) || (s) <= 0) { \
+			error = EBADRPC; \
+			nfsm_reply(0); \
+		} }
+
+#define nfsm_mtouio(p,s) /* when s > 0, call nfsm_mbuftouio() for s bytes into p; a bare if statement */ \
+		if ((s) > 0 && \
+		   (error = nfsm_mbuftouio(&md,(p),(s),&dpos))) { \
+			m_freem(mrep); \
+			goto nfsmout; \
+		}
+
+#define nfsm_uiotom(p,s) /* call nfsm_uiotombuf() for s bytes from p; on error frees mreq, goto nfsmout */ \
+		if (error = nfsm_uiotombuf((p),&mb,(s),&bpos)) { \
+			m_freem(mreq); \
+			goto nfsmout; \
+		}
+
+#define	nfsm_reqhead(v,a,s) /* start a request: mb and mreq both get nfsm_reqh()'s result, bpos is passed by address */ \
+		mb = mreq = nfsm_reqh((v),(a),(s),&bpos)
+
+#define nfsm_reqdone	m_freem(mrep); /* free the reply, then plant the nfsmout label */ \
+		nfsmout: 
+
+#define nfsm_rndup(a)	(((a)+3)&(~0x3))	/* round a up to a multiple of 4 */
+
+#define	nfsm_request(v, t, p, c) /* nfs_request(); a nonzero result jumps to nfsmout */ \
+		if (error = nfs_request((v), mreq, (t), (p), \
+		   (c), &mrep, &md, &dpos)) \
+			goto nfsmout
+
+#define	nfsm_strtom(a,s,m) /* append a counted string of s bytes from a; ENAMETOOLONG when s > m */ \
+		if ((s) > (m)) { \
+			m_freem(mreq); \
+			error = ENAMETOOLONG; \
+			goto nfsmout; \
+		} \
+		t2 = nfsm_rndup(s)+NFSX_UNSIGNED; /* length word plus 4-byte-rounded data */ \
+		if (t2 <= M_TRAILINGSPACE(mb)) { \
+			nfsm_build(tl,u_long *,t2); \
+			*tl++ = txdr_unsigned(s); \
+			*(tl+((t2>>2)-2)) = 0; /* zero the last word first so the pad bytes are 0 */ \
+			bcopy((caddr_t)(a), (caddr_t)tl, (s)); \
+		} else if (error = nfsm_strtmbuf(&mb, &bpos, (a), (s))) { \
+			m_freem(mreq); \
+			goto nfsmout; \
+		}
+
+#define	nfsm_srvdone /* plant the nfsmout label and return error */ \
+		nfsmout: \
+		return(error)
+
+#define	nfsm_reply(s) /* record error in nd_repstat, build the reply header, free mrep; returns 0 from the caller when error is set */ \
+		{ \
+		nfsd->nd_repstat = error; \
+		if (error) /* error set: nfs_rephead() is called with 0 in place of s */ \
+		   (void) nfs_rephead(0, nfsd, error, cache, &frev, \
+			mrq, &mb, &bpos); \
+		else \
+		   (void) nfs_rephead((s), nfsd, error, cache, &frev, \
+			mrq, &mb, &bpos); \
+		m_freem(mrep); \
+		mreq = *mrq; \
+		if (error) \
+			return(0); \
+		}
+
+#define	nfsm_adv(s) /* skip s bytes of the reply; assignment plus if, caller adds the ';' */ \
+		t1 = mtod(md, caddr_t)+md->m_len-dpos; /* bytes left in md after dpos */ \
+		if (t1 >= (s)) { \
+			dpos += (s); \
+		} else if (error = nfs_adv(&md, &dpos, (s), t1)) { \
+			m_freem(mrep); \
+			goto nfsmout; \
+		}
+
+#define nfsm_srvmtofh(f) /* dissect NFSX_FH bytes and copy them to f (f parenthesized, as in nfsm_srvfhtom) */ \
+		nfsm_dissect(tl, u_long *, NFSX_FH); \
+		bcopy((caddr_t)tl, (caddr_t)(f), NFSX_FH)
+
+#define	nfsm_clget /* when bp has reached be, chain a fresh cluster mbuf after mp2; then tl = bp */ \
+		if (bp >= be) { \
+			if (mp == mb) /* still in the build mbuf: count the bytes written since bpos */ \
+				mp->m_len += bp-bpos; \
+			MGET(mp, M_WAIT, MT_DATA); \
+			MCLGET(mp, M_WAIT); \
+			mp->m_len = NFSMSIZ(mp); \
+			mp2->m_next = mp; \
+			mp2 = mp; \
+			bp = mtod(mp, caddr_t); \
+			be = bp+mp->m_len; \
+		} \
+		tl = (u_long *)bp
+
+#define	nfsm_srvfillattr /* fill *fp from *vap; field set chosen by nfsd->nd_nqlflag */ \
+	fp->fa_type = vtonfs_type(vap->va_type); \
+	fp->fa_mode = vtonfs_mode(vap->va_type, vap->va_mode); \
+	fp->fa_nlink = txdr_unsigned(vap->va_nlink); \
+	fp->fa_uid = txdr_unsigned(vap->va_uid); \
+	fp->fa_gid = txdr_unsigned(vap->va_gid); \
+	if (nfsd->nd_nqlflag == NQL_NOVAL) { /* fa_nfs* fields */ \
+		fp->fa_nfsblocksize = txdr_unsigned(vap->va_blocksize); \
+		if (vap->va_type == VFIFO) \
+			fp->fa_nfsrdev = 0xffffffff; \
+		else \
+			fp->fa_nfsrdev = txdr_unsigned(vap->va_rdev); \
+		fp->fa_nfsfsid = txdr_unsigned(vap->va_fsid); \
+		fp->fa_nfsfileid = txdr_unsigned(vap->va_fileid); \
+		fp->fa_nfssize = txdr_unsigned(vap->va_size); \
+		fp->fa_nfsblocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); \
+		txdr_nfstime(&vap->va_atime, &fp->fa_nfsatime); \
+		txdr_nfstime(&vap->va_mtime, &fp->fa_nfsmtime); \
+		fp->fa_nfsctime.nfs_sec = txdr_unsigned(vap->va_ctime.ts_sec); \
+		fp->fa_nfsctime.nfs_usec = txdr_unsigned(vap->va_gen); /* va_gen, not a time, goes in the usec slot */ \
+	} else { /* fa_nq* fields */ \
+		fp->fa_nqblocksize = txdr_unsigned(vap->va_blocksize); \
+		if (vap->va_type == VFIFO) \
+			fp->fa_nqrdev = 0xffffffff; \
+		else \
+			fp->fa_nqrdev = txdr_unsigned(vap->va_rdev); \
+		fp->fa_nqfsid = txdr_unsigned(vap->va_fsid); \
+		fp->fa_nqfileid = txdr_unsigned(vap->va_fileid); \
+		txdr_hyper(&vap->va_size, &fp->fa_nqsize); \
+		txdr_hyper(&vap->va_bytes, &fp->fa_nqbytes); \
+		txdr_nqtime(&vap->va_atime, &fp->fa_nqatime); \
+		txdr_nqtime(&vap->va_mtime, &fp->fa_nqmtime); \
+		txdr_nqtime(&vap->va_ctime, &fp->fa_nqctime); \
+		fp->fa_nqflags = txdr_unsigned(vap->va_flags); \
+		fp->fa_nqgen = txdr_unsigned(vap->va_gen); \
+		txdr_hyper(&vap->va_filerev, &fp->fa_nqfilerev); \
+	}
+
diff --git a/sys/nfsclient/nfsmount.h b/sys/nfsclient/nfsmount.h
new file mode 100644
index 00000000000..4d74acb38a5
--- /dev/null
+++ b/sys/nfsclient/nfsmount.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Rick Macklem at The University of Guelph.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)nfsmount.h	8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * Mount structure.
+ * One allocated on every NFS mount.
+ * Holds NFS specific information for mount.
+ */
+struct	nfsmount {
+	int	nm_flag;		/* Flags for soft/hard... */
+	struct	mount *nm_mountp;	/* Vfs structure for this filesystem */
+	int	nm_numgrps;		/* Max. size of groupslist */
+	nfsv2fh_t nm_fh;		/* File handle of root dir */
+	struct	socket *nm_so;		/* Rpc socket */
+	int	nm_sotype;		/* Type of socket */
+	int	nm_soproto;		/* and protocol */
+	int	nm_soflags;		/* pr_flags for socket protocol */
+	struct	mbuf *nm_nam;		/* Addr of server */
+	int	nm_timeo;		/* Init timer for NFSMNT_DUMBTIMR */
+	int	nm_retry;		/* Max retries */
+	int	nm_srtt[4];		/* Timers for rpcs */
+	int	nm_sdrtt[4];		/* NOTE(review): undocumented; same shape as nm_srtt */
+	int	nm_sent;		/* Request send count */
+	int	nm_cwnd;		/* Request send window */
+	int	nm_timeouts;		/* Request timeouts */
+	int	nm_deadthresh;		/* Threshold of timeouts-->dead server*/
+	int	nm_rsize;		/* Max size of read rpc */
+	int	nm_wsize;		/* Max size of write rpc */
+	int	nm_readahead;		/* Num. of blocks to readahead */
+	int	nm_leaseterm;		/* Term (sec) for NQNFS lease */
+	struct nfsnode *nm_tnext;	/* Head of lease timer queue */
+	struct nfsnode *nm_tprev;	/* presumably the other end of that queue -- confirm */
+	struct vnode *nm_inprog;	/* Vnode in prog by nqnfs_clientd() */
+	uid_t	nm_authuid;		/* Uid for authenticator */
+	int	nm_authtype;		/* Authenticator type */
+	int	nm_authlen;		/* and length */
+	char	*nm_authstr;		/* Authenticator string */
+};
+
+#ifdef KERNEL
+/*
+ * Convert mount ptr to nfsmount ptr.
+ */
+#define VFSTONFS(mp)	((struct nfsmount *)((mp)->mnt_data))	/* casts the mount's mnt_data pointer */
+#endif /* KERNEL */
+
+/*
+ * Prototypes for NFS mount operations (declared outside the KERNEL guard above)
+ */
+int	nfs_mount __P((
+		struct mount *mp,
+		char *path,
+		caddr_t data,
+		struct nameidata *ndp,
+		struct proc *p));
+int	nfs_start __P((
+		struct mount *mp,
+		int flags,
+		struct proc *p));
+int	nfs_unmount __P((
+		struct mount *mp,
+		int mntflags,
+		struct proc *p));
+int	nfs_root __P((
+		struct mount *mp,
+		struct vnode **vpp));
+int	nfs_quotactl __P((
+		struct mount *mp,
+		int cmds,
+		uid_t uid,
+		caddr_t arg,
+		struct proc *p));
+int	nfs_statfs __P((
+		struct mount *mp,
+		struct statfs *sbp,
+		struct proc *p));
+int	nfs_sync __P((
+		struct mount *mp,
+		int waitfor,
+		struct ucred *cred,
+		struct proc *p));
+int	nfs_fhtovp __P((
+		struct mount *mp,
+		struct fid *fhp,
+		struct mbuf *nam,
+		struct vnode **vpp,
+		int *exflagsp,
+		struct ucred **credanonp));
+int	nfs_vptofh __P((
+		struct vnode *vp,
+		struct fid *fhp));
+int	nfs_init __P(());	/* empty parameter list inside __P */
diff --git a/sys/nfsclient/nfsnode.h b/sys/nfsclient/nfsnode.h
new file mode 100644
index 00000000000..f5fee5bf2f3
--- /dev/null
+++ b/sys/nfsclient/nfsnode.h
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Rick Macklem at The University of Guelph.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3.
All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)nfsnode.h	8.4 (Berkeley) 2/13/94
+ */
+
+/*
+ * Silly rename structure that hangs off the nfsnode until the name
+ * can be removed by nfs_inactive()
+ */
+struct sillyrename {
+	struct	ucred *s_cred;
+	struct	vnode *s_dvp;
+	long	s_namlen;
+	char	s_name[20];	/* fixed 20-byte name buffer */
+};
+
+/*
+ * The nfsnode is the nfs equivalent to ufs's inode. Any similarity
+ * is purely coincidental.
+ * There is a unique nfsnode allocated for each active file,
+ * each current directory, each mounted-on file, text file, and the root.
+ * An nfsnode is 'named' by its file handle. (nget/nfs_node.c)
+ */
+
+struct nfsnode {
+	struct	nfsnode *n_forw;	/* hash, forward */
+	struct	nfsnode **n_back;	/* hash, backward */
+	nfsv2fh_t n_fh;			/* NFS File Handle */
+	long	n_flag;			/* Flag for locking.. */
+	struct	vnode *n_vnode;		/* vnode associated with this node */
+	struct	vattr n_vattr;		/* Vnode attribute cache */
+	time_t	n_attrstamp;		/* Time stamp for cached attributes */
+	struct	sillyrename *n_sillyrename; /* Ptr to silly rename struct */
+	u_quad_t n_size;		/* Current size of file */
+	int	n_error;		/* Save write error value */
+	u_long	n_direofoffset;		/* Dir. EOF offset cache */
+	time_t	n_mtime;		/* Prev modify time. */
+	time_t	n_ctime;		/* Prev create time. */
+	u_quad_t n_brev;		/* Modify rev when cached */
+	u_quad_t n_lrev;		/* Modify rev for lease */
+	time_t	n_expiry;		/* Lease expiry time */
+	struct	nfsnode *n_tnext;	/* Nqnfs timer chain */
+	struct	nfsnode *n_tprev;
+	long	spare1;			/* To 8 byte boundary */
+	struct	sillyrename n_silly;	/* Silly rename struct */
+	struct	timeval n_atim;		/* Special file times */
+	struct	timeval n_mtim;
+};
+
+/*
+ * Flags for n_flag (0x0010 is not assigned here)
+ */
+#define	NFLUSHWANT	0x0001	/* Want wakeup from a flush in prog. */
+#define	NFLUSHINPROG	0x0002	/* Avoid multiple calls to vinvalbuf() */
+#define	NMODIFIED	0x0004	/* Might have a modified buffer in bio */
+#define	NWRITEERR	0x0008	/* Flag write errors so close will know */
+#define	NQNFSNONCACHE	0x0020	/* Non-cachable lease */
+#define	NQNFSWRITE	0x0040	/* Write lease */
+#define	NQNFSEVICTED	0x0080	/* Has been evicted */
+#define	NACC		0x0100	/* Special file accessed */
+#define	NUPD		0x0200	/* Special file updated */
+#define	NCHG		0x0400	/* Special file times changed */
+
+/*
+ * Convert between nfsnode pointers and vnode pointers
+ */
+#define VTONFS(vp)	((struct nfsnode *)(vp)->v_data)
+#define NFSTOV(np)	((struct vnode *)(np)->n_vnode)
+
+/*
+ * Queue head for nfsiod's
+ */
+TAILQ_HEAD(nfsbufs, buf) nfs_bufq;	/* NOTE(review): object defined in a header, and outside the KERNEL guard below */
+
+#ifdef KERNEL
+/*
+ * Prototypes for NFS vnode operations
+ */
+int	nfs_lookup __P((struct vop_lookup_args *));
+int	nfs_create __P((struct vop_create_args *));
+int	nfs_mknod __P((struct vop_mknod_args *));
+int	nfs_open __P((struct vop_open_args *));
+int	nfs_close __P((struct vop_close_args *));
+int	nfsspec_close __P((struct vop_close_args *));
+#ifdef FIFO
+int	nfsfifo_close __P((struct vop_close_args *));
+#endif
+int	nfs_access __P((struct vop_access_args *));
+int	nfsspec_access __P((struct vop_access_args *));
+int	nfs_getattr __P((struct vop_getattr_args *));
+int	nfs_setattr __P((struct vop_setattr_args *));
+int	nfs_read __P((struct vop_read_args *));
+int	nfs_write __P((struct vop_write_args *));
+int	nfsspec_read __P((struct vop_read_args *));
+int	nfsspec_write __P((struct vop_write_args *));
+#ifdef FIFO
+int	nfsfifo_read __P((struct vop_read_args *));
+int	nfsfifo_write __P((struct vop_write_args *));
+#endif
+#define nfs_ioctl ((int (*) __P((struct  vop_ioctl_args *)))enoioctl)	/* not a function: enoioctl cast to the vop type */
+#define nfs_select ((int (*) __P((struct  vop_select_args *)))seltrue)	/* likewise, seltrue */
+int	nfs_mmap __P((struct vop_mmap_args *));
+int	nfs_fsync __P((struct vop_fsync_args *));
+#define nfs_seek ((int (*) __P((struct  vop_seek_args *)))nullop)	/* likewise, nullop */
+int	nfs_remove __P((struct vop_remove_args *));
+int	nfs_link __P((struct vop_link_args *));
+int	nfs_rename __P((struct vop_rename_args *));
+int	nfs_mkdir __P((struct vop_mkdir_args *));
+int	nfs_rmdir __P((struct vop_rmdir_args *));
+int	nfs_symlink __P((struct vop_symlink_args *));
+int	nfs_readdir __P((struct vop_readdir_args *));
+int	nfs_readlink __P((struct vop_readlink_args *));
+int	nfs_abortop __P((struct vop_abortop_args *));
+int	nfs_inactive __P((struct vop_inactive_args *));
+int	nfs_reclaim __P((struct vop_reclaim_args *));
+int	nfs_lock __P((struct vop_lock_args *));
+int	nfs_unlock __P((struct vop_unlock_args *));
+int	nfs_bmap __P((struct vop_bmap_args *));
+int	nfs_strategy __P((struct vop_strategy_args *));
+int	nfs_print __P((struct vop_print_args *));
+int	nfs_islocked __P((struct vop_islocked_args *));
+int	nfs_pathconf __P((struct vop_pathconf_args *));
+int	nfs_advlock __P((struct vop_advlock_args *));
+int	nfs_blkatoff __P((struct vop_blkatoff_args *));
+int	nfs_vget __P((struct mount *, ino_t, struct vnode **));
+int	nfs_valloc __P((struct vop_valloc_args *));
+#define nfs_reallocblks /* likewise, eopnotsupp */ \
+	((int (*) __P((struct  vop_reallocblks_args *)))eopnotsupp)
+int	nfs_vfree __P((struct vop_vfree_args *));
+int	nfs_truncate __P((struct vop_truncate_args *));
+int	nfs_update __P((struct vop_update_args *));
+int	nfs_bwrite __P((struct vop_bwrite_args *));
+#endif /* KERNEL */
diff --git a/sys/nfsclient/nfsstats.h b/sys/nfsclient/nfsstats.h
new file mode 100644
index 00000000000..261fd42657a
--- /dev/null
+++ b/sys/nfsclient/nfsstats.h
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Rick Macklem at The University of Guelph.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)nfs.h	8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * Tunable constants for nfs
+ */
+
+#define	NFS_MAXIOVEC	34
+#define	NFS_HZ		25	/* Ticks per second for NFS timeouts */
+#define	NFS_TIMEO	(1*NFS_HZ)	/* Default timeout = 1 second */
+#define	NFS_MINTIMEO	(1*NFS_HZ)	/* Min timeout to use */
+#define	NFS_MAXTIMEO	(60*NFS_HZ)	/* Max timeout to backoff to */
+#define	NFS_MINIDEMTIMEO (5*NFS_HZ)	/* Min timeout for non-idempotent ops*/
+#define	NFS_MAXREXMIT	100		/* Stop counting after this many */
+#define	NFS_MAXWINDOW	1024		/* Max number of outstanding requests */
+#define	NFS_RETRANS	10		/* Num of retrans for soft mounts */
+#define	NFS_MAXGRPS	16		/* Max. size of groups list */
+#define	NFS_MINATTRTIMO 5		/* Min attribute cache timeout in sec */
+#define	NFS_MAXATTRTIMO 60		/* Max attribute cache timeout in sec */
+#define	NFS_WSIZE	8192		/* Def. write data size <= 8192 */
+#define	NFS_RSIZE	8192		/* Def. read data size <= 8192 */
+#define	NFS_DEFRAHEAD	1		/* Def. read ahead # blocks */
+#define	NFS_MAXRAHEAD	4		/* Max. read ahead # blocks */
+#define	NFS_MAXREADDIR	NFS_MAXDATA	/* Max. size of directory read */
+#define	NFS_MAXUIDHASH	64		/* Max. # of hashed uid entries/mp */
+#define	NFS_MAXASYNCDAEMON 20	/* Max. number async_daemons runnable */
+#define	NFS_DIRBLKSIZ	1024		/* Size of an NFS directory block */
+#define	NMOD(a)		((a) % nfs_asyncdaemons)	/* a modulo nfs_asyncdaemons (declared elsewhere) */
+
+/*
+ * Set the attribute timeout based on how recently the file has been modified.
+ */ +#define NFS_ATTRTIMEO(np) \ + ((((np)->n_flag & NMODIFIED) || \ + (time.tv_sec - (np)->n_mtime) / 10 < NFS_MINATTRTIMO) ? NFS_MINATTRTIMO : \ + ((time.tv_sec - (np)->n_mtime) / 10 > NFS_MAXATTRTIMO ? NFS_MAXATTRTIMO : \ + (time.tv_sec - (np)->n_mtime) / 10)) + +/* + * Structures for the nfssvc(2) syscall. Not that anyone but nfsd and mount_nfs + * should ever try and use it. + */ +struct nfsd_args { + int sock; /* Socket to serve */ + caddr_t name; /* Client address for connection based sockets */ + int namelen; /* Length of name */ +}; + +struct nfsd_srvargs { + struct nfsd *nsd_nfsd; /* Pointer to in kernel nfsd struct */ + uid_t nsd_uid; /* Effective uid mapped to cred */ + u_long nsd_haddr; /* Ip address of client */ + struct ucred nsd_cr; /* Cred. uid maps to */ + int nsd_authlen; /* Length of auth string (ret) */ + char *nsd_authstr; /* Auth string (ret) */ +}; + +struct nfsd_cargs { + char *ncd_dirp; /* Mount dir path */ + uid_t ncd_authuid; /* Effective uid */ + int ncd_authtype; /* Type of authenticator */ + int ncd_authlen; /* Length of authenticator string */ + char *ncd_authstr; /* Authenticator string */ +}; + +/* + * Stats structure + */ +struct nfsstats { + int attrcache_hits; + int attrcache_misses; + int lookupcache_hits; + int lookupcache_misses; + int direofcache_hits; + int direofcache_misses; + int biocache_reads; + int read_bios; + int read_physios; + int biocache_writes; + int write_bios; + int write_physios; + int biocache_readlinks; + int readlink_bios; + int biocache_readdirs; + int readdir_bios; + int rpccnt[NFS_NPROCS]; + int rpcretries; + int srvrpccnt[NFS_NPROCS]; + int srvrpc_errs; + int srv_errs; + int rpcrequests; + int rpctimeouts; + int rpcunexpected; + int rpcinvalid; + int srvcache_inproghits; + int srvcache_idemdonehits; + int srvcache_nonidemdonehits; + int srvcache_misses; + int srvnqnfs_leases; + int srvnqnfs_maxleases; + int srvnqnfs_getleases; +}; + +/* + * Flags for nfssvc() system call. 
+ */ +#define NFSSVC_BIOD 0x002 +#define NFSSVC_NFSD 0x004 +#define NFSSVC_ADDSOCK 0x008 +#define NFSSVC_AUTHIN 0x010 +#define NFSSVC_GOTAUTH 0x040 +#define NFSSVC_AUTHINFAIL 0x080 +#define NFSSVC_MNTD 0x100 + +/* + * The set of signals that interrupt an I/O in progress for NFSMNT_INT mounts. + * What should be in this set is open to debate, but I believe that since + * I/O system calls on ufs are never interrupted by signals the set should + * be minimal. My reasoning is that many current programs that use signals + * such as SIGALRM will not expect file I/O system calls to be interrupted + * by them and break. + */ +#ifdef KERNEL +#define NFSINT_SIGMASK (sigmask(SIGINT)|sigmask(SIGTERM)|sigmask(SIGKILL)| \ + sigmask(SIGHUP)|sigmask(SIGQUIT)) + +/* + * Socket errors ignored for connectionless sockets?? + * For now, ignore them all except EINTR, ERESTART and EWOULDBLOCK, provided PR_CONNREQUIRED is clear in (s) + */ +#define NFSIGNORE_SOERROR(s, e) \ + ((e) != EINTR && (e) != ERESTART && (e) != EWOULDBLOCK && \ + ((s) & PR_CONNREQUIRED) == 0) + +/* + * Nfs outstanding request list element + */ +struct nfsreq { + struct nfsreq *r_next; + struct nfsreq *r_prev; + struct mbuf *r_mreq; + struct mbuf *r_mrep; + struct mbuf *r_md; + caddr_t r_dpos; + struct nfsmount *r_nmp; + struct vnode *r_vp; + u_long r_xid; + int r_flags; /* flags on request, see below */ + int r_retry; /* max retransmission count */ + int r_rexmit; /* current retrans count */ + int r_timer; /* tick counter on reply */ + int r_procnum; /* NFS procedure number */ + int r_rtt; /* RTT for rpc */ + struct proc *r_procp; /* Proc that did I/O system call */ +}; + +/* Flag values for r_flags */ +#define R_TIMING 0x01 /* timing request (in mntp) */ +#define R_SENT 0x02 /* request has been sent */ +#define R_SOFTTERM 0x04 /* soft mnt, too many retries */ +#define R_INTR 0x08 /* intr mnt, signal pending */ +#define R_SOCKERR 0x10 /* Fatal error on socket */ +#define R_TPRINTFMSG 0x20 /* Did a tprintf msg.
*/ +#define R_MUSTRESEND 0x40 /* Must resend request */ +#define R_GETONEREP 0x80 /* Probe for one reply only */ + +struct nfsstats nfsstats; + +/* + * A list of nfssvc_sock structures is maintained with all the sockets + * that require service by the nfsd. + * The nfsuid structs hang off of the nfssvc_sock structs in both lru + * and uid hash lists. + */ +#define NUIDHASHSIZ 32 +#define NUIDHASH(uid) ((uid) & (NUIDHASHSIZ - 1)) + +/* + * Network address hash list element + */ +union nethostaddr { + u_long had_inetaddr; + struct mbuf *had_nam; +}; + +struct nfsuid { + struct nfsuid *nu_lrunext; /* MUST be first */ + struct nfsuid *nu_lruprev; + struct nfsuid *nu_hnext; + struct nfsuid *nu_hprev; + int nu_flag; /* Flags */ + uid_t nu_uid; /* Uid mapped by this entry */ + union nethostaddr nu_haddr; /* Host addr. for dgram sockets */ + struct ucred nu_cr; /* Cred uid mapped to */ +}; + +#define nu_inetaddr nu_haddr.had_inetaddr +#define nu_nam nu_haddr.had_nam +/* Bits for nu_flag */ +#define NU_INETADDR 0x1 + +struct nfssvc_sock { + struct nfsuid *ns_lrunext; /* MUST be first */ + struct nfsuid *ns_lruprev; + struct nfssvc_sock *ns_next; + struct nfssvc_sock *ns_prev; + int ns_flag; + u_long ns_sref; + struct file *ns_fp; + struct socket *ns_so; + int ns_solock; + struct mbuf *ns_nam; + int ns_cc; + struct mbuf *ns_raw; + struct mbuf *ns_rawend; + int ns_reclen; + struct mbuf *ns_rec; + struct mbuf *ns_recend; + int ns_numuids; + struct nfsuid *ns_uidh[NUIDHASHSIZ]; +}; + +/* Bits for "ns_flag" */ +#define SLP_VALID 0x01 +#define SLP_DOREC 0x02 +#define SLP_NEEDQ 0x04 +#define SLP_DISCONN 0x08 +#define SLP_GETSTREAM 0x10 +#define SLP_INIT 0x20 +#define SLP_WANTINIT 0x40 + +#define SLP_ALLFLAGS 0xff + +/* + * One of these structures is allocated for each nfsd. 
+ */ +struct nfsd { + struct nfsd *nd_next; /* Must be first */ + struct nfsd *nd_prev; + int nd_flag; /* NFSD_ flags */ + struct nfssvc_sock *nd_slp; /* Current socket */ + struct mbuf *nd_nam; /* Client addr for datagram req. */ + struct mbuf *nd_mrep; /* Req. mbuf list */ + struct mbuf *nd_md; + caddr_t nd_dpos; /* Position in list */ + int nd_procnum; /* RPC procedure number */ + u_long nd_retxid; /* RPC xid */ + int nd_repstat; /* Reply status value */ + struct ucred nd_cr; /* Credentials for req. */ + int nd_nqlflag; /* Leasing flag */ + int nd_duration; /* Lease duration */ + int nd_authlen; /* Authenticator len */ + u_char nd_authstr[RPCAUTH_MAXSIZ]; /* Authenticator data */ + struct proc *nd_procp; /* Proc ptr */ +}; + +#define NFSD_WAITING 0x01 +#define NFSD_CHECKSLP 0x02 +#define NFSD_REQINPROG 0x04 +#define NFSD_NEEDAUTH 0x08 +#define NFSD_AUTHFAIL 0x10 +#endif /* KERNEL */ diff --git a/sys/nfsserver/nfs.h b/sys/nfsserver/nfs.h new file mode 100644 index 00000000000..261fd42657a --- /dev/null +++ b/sys/nfsserver/nfs.h @@ -0,0 +1,297 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Tunable constants for nfs + */ + +#define NFS_MAXIOVEC 34 +#define NFS_HZ 25 /* Ticks per second for NFS timeouts */ +#define NFS_TIMEO (1*NFS_HZ) /* Default timeout = 1 second */ +#define NFS_MINTIMEO (1*NFS_HZ) /* Min timeout to use */ +#define NFS_MAXTIMEO (60*NFS_HZ) /* Max timeout to backoff to */ +#define NFS_MINIDEMTIMEO (5*NFS_HZ) /* Min timeout for non-idempotent ops*/ +#define NFS_MAXREXMIT 100 /* Stop counting after this many */ +#define NFS_MAXWINDOW 1024 /* Max number of outstanding requests */ +#define NFS_RETRANS 10 /* Num of retrans for soft mounts */ +#define NFS_MAXGRPS 16 /* Max. 
size of groups list */ +#define NFS_MINATTRTIMO 5 /* Attribute cache timeout in sec */ +#define NFS_MAXATTRTIMO 60 +#define NFS_WSIZE 8192 /* Def. write data size <= 8192 */ +#define NFS_RSIZE 8192 /* Def. read data size <= 8192 */ +#define NFS_DEFRAHEAD 1 /* Def. read ahead # blocks */ +#define NFS_MAXRAHEAD 4 /* Max. read ahead # blocks */ +#define NFS_MAXREADDIR NFS_MAXDATA /* Max. size of directory read */ +#define NFS_MAXUIDHASH 64 /* Max. # of hashed uid entries/mp */ +#define NFS_MAXASYNCDAEMON 20 /* Max. number async_daemons runable */ +#define NFS_DIRBLKSIZ 1024 /* Size of an NFS directory block */ +#define NMOD(a) ((a) % nfs_asyncdaemons) + +/* + * Set the attribute timeout based on how recently the file has been modified. + */ +#define NFS_ATTRTIMEO(np) \ + ((((np)->n_flag & NMODIFIED) || \ + (time.tv_sec - (np)->n_mtime) / 10 < NFS_MINATTRTIMO) ? NFS_MINATTRTIMO : \ + ((time.tv_sec - (np)->n_mtime) / 10 > NFS_MAXATTRTIMO ? NFS_MAXATTRTIMO : \ + (time.tv_sec - (np)->n_mtime) / 10)) + +/* + * Structures for the nfssvc(2) syscall. Not that anyone but nfsd and mount_nfs + * should ever try and use it. + */ +struct nfsd_args { + int sock; /* Socket to serve */ + caddr_t name; /* Client address for connection based sockets */ + int namelen; /* Length of name */ +}; + +struct nfsd_srvargs { + struct nfsd *nsd_nfsd; /* Pointer to in kernel nfsd struct */ + uid_t nsd_uid; /* Effective uid mapped to cred */ + u_long nsd_haddr; /* Ip address of client */ + struct ucred nsd_cr; /* Cred. 
uid maps to */ + int nsd_authlen; /* Length of auth string (ret) */ + char *nsd_authstr; /* Auth string (ret) */ +}; + +struct nfsd_cargs { + char *ncd_dirp; /* Mount dir path */ + uid_t ncd_authuid; /* Effective uid */ + int ncd_authtype; /* Type of authenticator */ + int ncd_authlen; /* Length of authenticator string */ + char *ncd_authstr; /* Authenticator string */ +}; + +/* + * Stats structure + */ +struct nfsstats { + int attrcache_hits; + int attrcache_misses; + int lookupcache_hits; + int lookupcache_misses; + int direofcache_hits; + int direofcache_misses; + int biocache_reads; + int read_bios; + int read_physios; + int biocache_writes; + int write_bios; + int write_physios; + int biocache_readlinks; + int readlink_bios; + int biocache_readdirs; + int readdir_bios; + int rpccnt[NFS_NPROCS]; + int rpcretries; + int srvrpccnt[NFS_NPROCS]; + int srvrpc_errs; + int srv_errs; + int rpcrequests; + int rpctimeouts; + int rpcunexpected; + int rpcinvalid; + int srvcache_inproghits; + int srvcache_idemdonehits; + int srvcache_nonidemdonehits; + int srvcache_misses; + int srvnqnfs_leases; + int srvnqnfs_maxleases; + int srvnqnfs_getleases; +}; + +/* + * Flags for nfssvc() system call. + */ +#define NFSSVC_BIOD 0x002 +#define NFSSVC_NFSD 0x004 +#define NFSSVC_ADDSOCK 0x008 +#define NFSSVC_AUTHIN 0x010 +#define NFSSVC_GOTAUTH 0x040 +#define NFSSVC_AUTHINFAIL 0x080 +#define NFSSVC_MNTD 0x100 + +/* + * The set of signals that interrupt an I/O in progress for NFSMNT_INT mounts. + * What should be in this set is open to debate, but I believe that since + * I/O system calls on ufs are never interrupted by signals the set should + * be minimal. My reasoning is that many current programs that use signals + * such as SIGALRM will not expect file I/O system calls to be interrupted + * by them and break.
+ */ +#ifdef KERNEL +#define NFSINT_SIGMASK (sigmask(SIGINT)|sigmask(SIGTERM)|sigmask(SIGKILL)| \ + sigmask(SIGHUP)|sigmask(SIGQUIT)) + +/* + * Socket errors ignored for connectionless sockets?? + * For now, ignore them all + */ +#define NFSIGNORE_SOERROR(s, e) \ + ((e) != EINTR && (e) != ERESTART && (e) != EWOULDBLOCK && \ + ((s) & PR_CONNREQUIRED) == 0) + +/* + * Nfs outstanding request list element + */ +struct nfsreq { + struct nfsreq *r_next; + struct nfsreq *r_prev; + struct mbuf *r_mreq; + struct mbuf *r_mrep; + struct mbuf *r_md; + caddr_t r_dpos; + struct nfsmount *r_nmp; + struct vnode *r_vp; + u_long r_xid; + int r_flags; /* flags on request, see below */ + int r_retry; /* max retransmission count */ + int r_rexmit; /* current retrans count */ + int r_timer; /* tick counter on reply */ + int r_procnum; /* NFS procedure number */ + int r_rtt; /* RTT for rpc */ + struct proc *r_procp; /* Proc that did I/O system call */ +}; + +/* Flag values for r_flags */ +#define R_TIMING 0x01 /* timing request (in mntp) */ +#define R_SENT 0x02 /* request has been sent */ +#define R_SOFTTERM 0x04 /* soft mnt, too many retries */ +#define R_INTR 0x08 /* intr mnt, signal pending */ +#define R_SOCKERR 0x10 /* Fatal error on socket */ +#define R_TPRINTFMSG 0x20 /* Did a tprintf msg. */ +#define R_MUSTRESEND 0x40 /* Must resend request */ +#define R_GETONEREP 0x80 /* Probe for one reply only */ + +struct nfsstats nfsstats; + +/* + * A list of nfssvc_sock structures is maintained with all the sockets + * that require service by the nfsd. + * The nfsuid structs hang off of the nfssvc_sock structs in both lru + * and uid hash lists. 
+ */ +#define NUIDHASHSIZ 32 +#define NUIDHASH(uid) ((uid) & (NUIDHASHSIZ - 1)) + +/* + * Network address hash list element + */ +union nethostaddr { + u_long had_inetaddr; + struct mbuf *had_nam; +}; + +struct nfsuid { + struct nfsuid *nu_lrunext; /* MUST be first */ + struct nfsuid *nu_lruprev; + struct nfsuid *nu_hnext; + struct nfsuid *nu_hprev; + int nu_flag; /* Flags */ + uid_t nu_uid; /* Uid mapped by this entry */ + union nethostaddr nu_haddr; /* Host addr. for dgram sockets */ + struct ucred nu_cr; /* Cred uid mapped to */ +}; + +#define nu_inetaddr nu_haddr.had_inetaddr +#define nu_nam nu_haddr.had_nam +/* Bits for nu_flag */ +#define NU_INETADDR 0x1 + +struct nfssvc_sock { + struct nfsuid *ns_lrunext; /* MUST be first */ + struct nfsuid *ns_lruprev; + struct nfssvc_sock *ns_next; + struct nfssvc_sock *ns_prev; + int ns_flag; + u_long ns_sref; + struct file *ns_fp; + struct socket *ns_so; + int ns_solock; + struct mbuf *ns_nam; + int ns_cc; + struct mbuf *ns_raw; + struct mbuf *ns_rawend; + int ns_reclen; + struct mbuf *ns_rec; + struct mbuf *ns_recend; + int ns_numuids; + struct nfsuid *ns_uidh[NUIDHASHSIZ]; +}; + +/* Bits for "ns_flag" */ +#define SLP_VALID 0x01 +#define SLP_DOREC 0x02 +#define SLP_NEEDQ 0x04 +#define SLP_DISCONN 0x08 +#define SLP_GETSTREAM 0x10 +#define SLP_INIT 0x20 +#define SLP_WANTINIT 0x40 + +#define SLP_ALLFLAGS 0xff + +/* + * One of these structures is allocated for each nfsd. + */ +struct nfsd { + struct nfsd *nd_next; /* Must be first */ + struct nfsd *nd_prev; + int nd_flag; /* NFSD_ flags */ + struct nfssvc_sock *nd_slp; /* Current socket */ + struct mbuf *nd_nam; /* Client addr for datagram req. */ + struct mbuf *nd_mrep; /* Req. mbuf list */ + struct mbuf *nd_md; + caddr_t nd_dpos; /* Position in list */ + int nd_procnum; /* RPC procedure number */ + u_long nd_retxid; /* RPC xid */ + int nd_repstat; /* Reply status value */ + struct ucred nd_cr; /* Credentials for req. 
*/ + int nd_nqlflag; /* Leasing flag */ + int nd_duration; /* Lease duration */ + int nd_authlen; /* Authenticator len */ + u_char nd_authstr[RPCAUTH_MAXSIZ]; /* Authenticator data */ + struct proc *nd_procp; /* Proc ptr */ +}; + +#define NFSD_WAITING 0x01 +#define NFSD_CHECKSLP 0x02 +#define NFSD_REQINPROG 0x04 +#define NFSD_NEEDAUTH 0x08 +#define NFSD_AUTHFAIL 0x10 +#endif /* KERNEL */ diff --git a/sys/nfsserver/nfs_serv.c b/sys/nfsserver/nfs_serv.c new file mode 100644 index 00000000000..f31b96e02ed --- /dev/null +++ b/sys/nfsserver/nfs_serv.c @@ -0,0 +1,1908 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs_serv.c 8.3 (Berkeley) 1/12/94 + */ + +/* + * nfs version 2 server calls to vnode ops + * - these routines generally have 3 phases + * 1 - break down and validate rpc request in mbuf list + * 2 - do the vnode ops for the request + * (surprisingly ?? many are very similar to syscalls in vfs_syscalls.c) + * 3 - build the rpc reply in an mbuf list + * nb: + * - do not mix the phases, since the nfsm_?? macros can return failures + * on a bad rpc or similar and do not do any vrele() or vput()'s + * + * - the nfsm_reply() macro generates an nfs rpc reply with the nfs + * error number iff error != 0 whereas + * returning an error from the server function implies a fatal error + * such as a badly constructed rpc request that should be dropped without + * a reply. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +/* Defs */ +#define TRUE 1 +#define FALSE 0 + +/* Global vars */ +extern u_long nfs_procids[NFS_NPROCS]; +extern u_long nfs_xdrneg1; +extern u_long nfs_false, nfs_true; +nfstype nfs_type[9] = { NFNON, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, NFNON, + NFCHR, NFNON }; + +/* + * nqnfs access service: the three XDR words following the file handle select VREAD, VWRITE and VEXEC (in that order) when equal to nfs_true; the combined mode is checked with nfsrv_access() and nfsm_reply(0) is issued with that result in error. + */ +nqnfsrv_access(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, rdonly, cache, mode = 0; + char *cp2; + struct mbuf *mb, *mreq; + u_quad_t frev; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_dissect(tl, u_long *, 3 * NFSX_UNSIGNED); + if (error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) + nfsm_reply(0); + if (*tl++ == nfs_true) + mode |= VREAD; + if (*tl++ == nfs_true) + mode |= VWRITE; + if (*tl == nfs_true) + mode |= VEXEC; + error = nfsrv_access(vp, mode, cred, rdonly, nfsd->nd_procp); + vput(vp); + nfsm_reply(0); + nfsm_srvdone; +} + +/* + * nfs getattr service: maps the file handle to a vnode, fetches its attributes with VOP_GETATTR() and returns them through nfsm_srvfillattr. + */ +nfsrv_getattr(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register struct nfsv2_fattr *fp; + struct vattr va; + register struct vattr *vap = &va; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, rdonly, cache; + char *cp2; + struct mbuf *mb, *mb2, *mreq; + u_quad_t frev; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + if (error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) + nfsm_reply(0); + nqsrv_getl(vp, NQL_READ); + error = VOP_GETATTR(vp, vap,
cred, nfsd->nd_procp); + vput(vp); + nfsm_reply(NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_build(fp, struct nfsv2_fattr *, NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_srvfillattr; + nfsm_srvdone; +} + +/* + * nfs setattr service + */ +nfsrv_setattr(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + struct vattr va; + register struct vattr *vap = &va; + register struct nfsv2_sattr *sp; + register struct nfsv2_fattr *fp; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, rdonly, cache; + char *cp2; + struct mbuf *mb, *mb2, *mreq; + u_quad_t frev, frev2; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_dissect(sp, struct nfsv2_sattr *, NFSX_SATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + if (error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) + nfsm_reply(0); + nqsrv_getl(vp, NQL_WRITE); + VATTR_NULL(vap); + /* + * Nah nah nah nah na nah + * There is a bug in the Sun client that puts 0xffff in the mode + * field of sattr when it should put in 0xffffffff. The u_short + * doesn't sign extend. 
+ * --> check the low order 2 bytes for 0xffff + */ + if ((fxdr_unsigned(int, sp->sa_mode) & 0xffff) != 0xffff) + vap->va_mode = nfstov_mode(sp->sa_mode); + if (sp->sa_uid != nfs_xdrneg1) + vap->va_uid = fxdr_unsigned(uid_t, sp->sa_uid); + if (sp->sa_gid != nfs_xdrneg1) + vap->va_gid = fxdr_unsigned(gid_t, sp->sa_gid); + if (nfsd->nd_nqlflag == NQL_NOVAL) { + if (sp->sa_nfssize != nfs_xdrneg1) + vap->va_size = fxdr_unsigned(u_quad_t, sp->sa_nfssize); + if (sp->sa_nfsatime.nfs_sec != nfs_xdrneg1) { +#ifdef notyet + fxdr_nfstime(&sp->sa_nfsatime, &vap->va_atime); +#else + vap->va_atime.ts_sec = + fxdr_unsigned(long, sp->sa_nfsatime.nfs_sec); + vap->va_atime.ts_nsec = 0; +#endif + } + if (sp->sa_nfsmtime.nfs_sec != nfs_xdrneg1) + fxdr_nfstime(&sp->sa_nfsmtime, &vap->va_mtime); + } else { + fxdr_hyper(&sp->sa_nqsize, &vap->va_size); + fxdr_nqtime(&sp->sa_nqatime, &vap->va_atime); + fxdr_nqtime(&sp->sa_nqmtime, &vap->va_mtime); + vap->va_flags = fxdr_unsigned(u_long, sp->sa_nqflags); + } + + /* + * If the size is being changed write acces is required, otherwise + * just check for a read only file system. 
+ */ + if (vap->va_size == ((u_quad_t)((quad_t) -1))) { + if (rdonly || (vp->v_mount->mnt_flag & MNT_RDONLY)) { + error = EROFS; + goto out; + } + } else { + if (vp->v_type == VDIR) { + error = EISDIR; + goto out; + } else if (error = nfsrv_access(vp, VWRITE, cred, rdonly, + nfsd->nd_procp)) + goto out; + } + if (error = VOP_SETATTR(vp, vap, cred, nfsd->nd_procp)) { + vput(vp); + nfsm_reply(0); + } + error = VOP_GETATTR(vp, vap, cred, nfsd->nd_procp); +out: + vput(vp); + nfsm_reply(NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL) + 2*NFSX_UNSIGNED); + nfsm_build(fp, struct nfsv2_fattr *, NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_srvfillattr; + if (nfsd->nd_nqlflag != NQL_NOVAL) { + nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); + txdr_hyper(&frev2, tl); + } + nfsm_srvdone; +} + +/* + * nfs lookup rpc + */ +nfsrv_lookup(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register struct nfsv2_fattr *fp; + struct nameidata nd; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + register caddr_t cp; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, cache, duration2, cache2, len; + char *cp2; + struct mbuf *mb, *mb2, *mreq; + struct vattr va, *vap = &va; + u_quad_t frev, frev2; + + fhp = &nfh.fh_generic; + duration2 = 0; + if (nfsd->nd_nqlflag != NQL_NOVAL) { + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + duration2 = fxdr_unsigned(int, *tl); + } + nfsm_srvmtofh(fhp); + nfsm_srvstrsiz(len, NFS_MAXNAMLEN); + nd.ni_cnd.cn_cred = cred; + nd.ni_cnd.cn_nameiop = LOOKUP; + nd.ni_cnd.cn_flags = LOCKLEAF | SAVESTART; + if (error = nfs_namei(&nd, fhp, len, nfsd->nd_slp, nam, &md, &dpos, + nfsd->nd_procp)) + nfsm_reply(0); + nqsrv_getl(nd.ni_startdir, NQL_READ); + vrele(nd.ni_startdir); + FREE(nd.ni_cnd.cn_pnbuf, M_NAMEI); + vp = nd.ni_vp; + bzero((caddr_t)fhp, sizeof(nfh)); + fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid; + if (error = VFS_VPTOFH(vp, 
&fhp->fh_fid)) { + vput(vp); + nfsm_reply(0); + } + if (duration2) /* non-zero only when an nqnfs request supplied a lease duration */ + (void) nqsrv_getlease(vp, &duration2, NQL_READ, nfsd, + nam, &cache2, &frev2, cred); + error = VOP_GETATTR(vp, vap, cred, nfsd->nd_procp); + vput(vp); + nfsm_reply(NFSX_FH + NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL) + 5*NFSX_UNSIGNED); + if (nfsd->nd_nqlflag != NQL_NOVAL) { + if (duration2) { + nfsm_build(tl, u_long *, 5*NFSX_UNSIGNED); + *tl++ = txdr_unsigned(NQL_READ); + *tl++ = txdr_unsigned(cache2); + *tl++ = txdr_unsigned(duration2); + txdr_hyper(&frev2, tl); + } else { + nfsm_build(tl, u_long *, NFSX_UNSIGNED); + *tl = 0; + } + } + nfsm_srvfhtom(fhp); /* file handle filled in by VFS_VPTOFH() above */ + nfsm_build(fp, struct nfsv2_fattr *, NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_srvfillattr; + nfsm_srvdone; +} + +/* + * nfs readlink service: preallocates an mbuf chain covering NFS_MAXPATHLEN bytes, points a uio at it, reads the link with VOP_READLINK() and links the chain after the reply header once nfsm_adj() has been applied for a short read. + */ +nfsrv_readlink(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + struct iovec iv[(NFS_MAXPATHLEN+MLEN-1)/MLEN]; + register struct iovec *ivp = iv; + register struct mbuf *mp; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, rdonly, cache, i, tlen, len; + char *cp2; + struct mbuf *mb, *mb2, *mp2, *mp3, *mreq; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + struct uio io, *uiop = &io; + u_quad_t frev; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + len = 0; + i = 0; + while (len < NFS_MAXPATHLEN) { /* one iovec per mbuf; mp3 is the chain head, mp2 its tail */ + MGET(mp, M_WAIT, MT_DATA); + MCLGET(mp, M_WAIT); + mp->m_len = NFSMSIZ(mp); + if (len == 0) + mp3 = mp2 = mp; + else { + mp2->m_next = mp; + mp2 = mp; + } + if ((len+mp->m_len) > NFS_MAXPATHLEN) { + mp->m_len = NFS_MAXPATHLEN-len; + len = NFS_MAXPATHLEN; + } else + len += mp->m_len; + ivp->iov_base = mtod(mp, caddr_t); + ivp->iov_len = mp->m_len; + i++; + ivp++; + } + uiop->uio_iov = iv; + uiop->uio_iovcnt = i; + uiop->uio_offset = 0; + uiop->uio_resid = len; + uiop->uio_rw = UIO_READ; + uiop->uio_segflg = UIO_SYSSPACE; + uiop->uio_procp = (struct proc
*)0; + if (error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) { + m_freem(mp3); /* release the preallocated data chain */ + nfsm_reply(0); + } + if (vp->v_type != VLNK) { + error = EINVAL; + goto out; + } + nqsrv_getl(vp, NQL_READ); + error = VOP_READLINK(vp, uiop, cred); +out: + vput(vp); + if (error) + m_freem(mp3); + nfsm_reply(NFSX_UNSIGNED); + if (uiop->uio_resid > 0) { /* fewer bytes than requested were read: shrink len and adjust the chain via nfsm_adj() */ + len -= uiop->uio_resid; + tlen = nfsm_rndup(len); + nfsm_adj(mp3, NFS_MAXPATHLEN-tlen, tlen-len); + } + nfsm_build(tl, u_long *, NFSX_UNSIGNED); + *tl = txdr_unsigned(len); + mb->m_next = mp3; /* link the data chain after mb */ + nfsm_srvdone; +} + +/* + * nfs read service: the offset is a single XDR word when nd_nqlflag == NQL_NOVAL and a hyper otherwise; only VREG vnodes are read (EISDIR for directories, EACCES for any other type). + */ +nfsrv_read(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register struct iovec *iv; + struct iovec *iv2; + register struct mbuf *m; + register struct nfsv2_fattr *fp; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, rdonly, cache, i, cnt, len, left, siz, tlen; + char *cp2; + struct mbuf *mb, *mb2, *mreq; + struct mbuf *m2; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + struct uio io, *uiop = &io; + struct vattr va, *vap = &va; + off_t off; + u_quad_t frev; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + if (nfsd->nd_nqlflag == NQL_NOVAL) { + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + off = (off_t)fxdr_unsigned(u_long, *tl); + } else { + nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); + fxdr_hyper(tl, &off); + } + nfsm_srvstrsiz(cnt, NFS_MAXDATA); + if (error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) + nfsm_reply(0); + if (vp->v_type != VREG) { + error = (vp->v_type == VDIR) ?
EISDIR : EACCES; + vput(vp); + nfsm_reply(0); + } + nqsrv_getl(vp, NQL_READ); + if ((error = nfsrv_access(vp, VREAD, cred, rdonly, nfsd->nd_procp)) && + (error = nfsrv_access(vp, VEXEC, cred, rdonly, nfsd->nd_procp))) { /* refused only when both the read and the execute check fail */ + vput(vp); + nfsm_reply(0); + } + if (error = VOP_GETATTR(vp, vap, cred, nfsd->nd_procp)) { + vput(vp); + nfsm_reply(0); + } + if (off >= vap->va_size) + cnt = 0; /* offset at or past the file size: nothing to read */ + else if ((off + cnt) > vap->va_size) + cnt = nfsm_rndup(vap->va_size - off); + nfsm_reply(NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)+NFSX_UNSIGNED+nfsm_rndup(cnt)); + nfsm_build(fp, struct nfsv2_fattr *, NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_build(tl, u_long *, NFSX_UNSIGNED); + len = left = cnt; + if (cnt > 0) { + /* + * Generate the mbuf list with the uio_iov ref. to it. NOTE(review): uio_procp is never assigned in this function, unlike nfsrv_readlink and nfsrv_write -- confirm VOP_READ() does not read it. + */ + i = 0; + m = m2 = mb; + MALLOC(iv, struct iovec *, + ((NFS_MAXDATA+MLEN-1)/MLEN) * sizeof (struct iovec), + M_TEMP, M_WAITOK); + iv2 = iv; + while (left > 0) { + siz = min(M_TRAILINGSPACE(m), left); + if (siz > 0) { + m->m_len += siz; + iv->iov_base = bpos; + iv->iov_len = siz; + iv++; + i++; + left -= siz; + } + if (left > 0) { + MGET(m, M_WAIT, MT_DATA); + MCLGET(m, M_WAIT); + m->m_len = 0; + m2->m_next = m; + m2 = m; + bpos = mtod(m, caddr_t); + } + } + uiop->uio_iov = iv2; + uiop->uio_iovcnt = i; + uiop->uio_offset = off; + uiop->uio_resid = cnt; + uiop->uio_rw = UIO_READ; + uiop->uio_segflg = UIO_SYSSPACE; + error = VOP_READ(vp, uiop, IO_NODELOCKED, cred); + off = uiop->uio_offset; + FREE((caddr_t)iv2, M_TEMP); + if (error || (error = VOP_GETATTR(vp, vap, cred, nfsd->nd_procp))) { + m_freem(mreq); + vput(vp); + nfsm_reply(0); + } + } else + uiop->uio_resid = 0; + vput(vp); + nfsm_srvfillattr; + len -= uiop->uio_resid; /* bytes actually read */ + tlen = nfsm_rndup(len); + if (cnt != tlen || tlen != len) + nfsm_adj(mb, cnt-tlen, tlen-len); + *tl = txdr_unsigned(len); + nfsm_srvdone; +} + +/* + * nfs write service: writes the request's data mbufs to a VREG vnode with IO_SYNC | IO_NODELOCKED (plus IO_APPEND when the nqnfs append word is non-zero) and replies with the attributes read back by VOP_GETATTR(). + */ +nfsrv_write(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; +
caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register struct iovec *ivp; + register struct mbuf *mp; + register struct nfsv2_fattr *fp; + struct iovec iv[NFS_MAXIOVEC]; + struct vattr va; + register struct vattr *vap = &va; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, rdonly, cache, siz, len, xfer; + int ioflags = IO_SYNC | IO_NODELOCKED; + char *cp2; + struct mbuf *mb, *mb2, *mreq; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + struct uio io, *uiop = &io; + off_t off; + u_quad_t frev; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_dissect(tl, u_long *, 4 * NFSX_UNSIGNED); + if (nfsd->nd_nqlflag == NQL_NOVAL) { + off = (off_t)fxdr_unsigned(u_long, *++tl); + tl += 2; + } else { + fxdr_hyper(tl, &off); + tl += 2; + if (fxdr_unsigned(u_long, *tl++)) + ioflags |= IO_APPEND; + } + len = fxdr_unsigned(long, *tl); + if (len > NFS_MAXDATA || len <= 0) { /* reject empty or oversized writes */ + error = EBADRPC; + nfsm_reply(0); + } + if (dpos == (mtod(md, caddr_t)+md->m_len)) { /* dissect position is at the end of md: data starts in the next mbuf */ + mp = md->m_next; + if (mp == NULL) { + error = EBADRPC; + nfsm_reply(0); + } + } else { /* data starts inside md: drop the siz bytes already dissected */ + mp = md; + siz = dpos-mtod(mp, caddr_t); + mp->m_len -= siz; + NFSMADV(mp, siz); + } + if (error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) + nfsm_reply(0); + if (vp->v_type != VREG) { + error = (vp->v_type == VDIR) ? EISDIR : EACCES; + vput(vp); + nfsm_reply(0); + } + nqsrv_getl(vp, NQL_WRITE); + if (error = nfsrv_access(vp, VWRITE, cred, rdonly, nfsd->nd_procp)) { + vput(vp); + nfsm_reply(0); + } + uiop->uio_resid = 0; + uiop->uio_rw = UIO_WRITE; + uiop->uio_segflg = UIO_SYSSPACE; + uiop->uio_procp = (struct proc *)0; + /* + * Do up to NFS_MAXIOVEC mbufs of write each iteration of the + * loop until done.
+ */ + while (len > 0 && uiop->uio_resid == 0) { + ivp = iv; + siz = 0; + uiop->uio_iov = ivp; + uiop->uio_iovcnt = 0; + uiop->uio_offset = off; + while (len > 0 && uiop->uio_iovcnt < NFS_MAXIOVEC && mp != NULL) { + ivp->iov_base = mtod(mp, caddr_t); + if (len < mp->m_len) + ivp->iov_len = xfer = len; + else + ivp->iov_len = xfer = mp->m_len; +#ifdef notdef + /* Not Yet .. */ + if (M_HASCL(mp) && (((u_long)ivp->iov_base) & CLOFSET) == 0) + ivp->iov_op = NULL; /* what should it be ?? */ + else + ivp->iov_op = NULL; +#endif + uiop->uio_iovcnt++; + ivp++; + len -= xfer; + siz += xfer; + mp = mp->m_next; + } + if (len > 0 && mp == NULL) { /* request claims more data than the mbuf chain holds */ + error = EBADRPC; + vput(vp); + nfsm_reply(0); + } + uiop->uio_resid = siz; + if (error = VOP_WRITE(vp, uiop, ioflags, cred)) { + vput(vp); + nfsm_reply(0); + } + off = uiop->uio_offset; /* the next batch starts where this write ended */ + } + error = VOP_GETATTR(vp, vap, cred, nfsd->nd_procp); + vput(vp); + nfsm_reply(NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_build(fp, struct nfsv2_fattr *, NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_srvfillattr; + if (nfsd->nd_nqlflag != NQL_NOVAL) { + nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); + txdr_hyper(&vap->va_filerev, tl); + } + nfsm_srvdone; +} + +/* + * nfs create service + * now does a truncate to 0 length via.
setattr if it already exists + */ +nfsrv_create(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register struct nfsv2_fattr *fp; + struct vattr va; + register struct vattr *vap = &va; + register struct nfsv2_sattr *sp; + register u_long *tl; + struct nameidata nd; + register caddr_t cp; + register long t1; + caddr_t bpos; + int error = 0, rdev, cache, len, tsize; + char *cp2; + struct mbuf *mb, *mb2, *mreq; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + u_quad_t frev; + + nd.ni_cnd.cn_nameiop = 0; + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_srvstrsiz(len, NFS_MAXNAMLEN); + nd.ni_cnd.cn_cred = cred; + nd.ni_cnd.cn_nameiop = CREATE; + nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | SAVESTART; + if (error = nfs_namei(&nd, fhp, len, nfsd->nd_slp, nam, &md, &dpos, + nfsd->nd_procp)) + nfsm_reply(0); + VATTR_NULL(vap); + nfsm_dissect(sp, struct nfsv2_sattr *, NFSX_SATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + /* + * Iff doesn't exist, create it + * otherwise just truncate to 0 length + * should I set the mode too ?? 
+ */ + if (nd.ni_vp == NULL) { + vap->va_type = IFTOVT(fxdr_unsigned(u_long, sp->sa_mode)); + if (vap->va_type == VNON) + vap->va_type = VREG; + vap->va_mode = nfstov_mode(sp->sa_mode); + if (nfsd->nd_nqlflag == NQL_NOVAL) + rdev = fxdr_unsigned(long, sp->sa_nfssize); + else + rdev = fxdr_unsigned(long, sp->sa_nqrdev); + if (vap->va_type == VREG || vap->va_type == VSOCK) { + vrele(nd.ni_startdir); + nqsrv_getl(nd.ni_dvp, NQL_WRITE); + if (error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap)) + nfsm_reply(0); + FREE(nd.ni_cnd.cn_pnbuf, M_NAMEI); + } else if (vap->va_type == VCHR || vap->va_type == VBLK || + vap->va_type == VFIFO) { + if (vap->va_type == VCHR && rdev == 0xffffffff) + vap->va_type = VFIFO; + if (vap->va_type == VFIFO) { +#ifndef FIFO + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + error = ENXIO; + goto out; +#endif /* FIFO */ + } else if (error = suser(cred, (u_short *)0)) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + goto out; + } else + vap->va_rdev = (dev_t)rdev; + nqsrv_getl(nd.ni_dvp, NQL_WRITE); + if (error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap)) { + vrele(nd.ni_startdir); + nfsm_reply(0); + } + nd.ni_cnd.cn_nameiop = LOOKUP; + nd.ni_cnd.cn_flags &= ~(LOCKPARENT | SAVESTART); + nd.ni_cnd.cn_proc = nfsd->nd_procp; + nd.ni_cnd.cn_cred = nfsd->nd_procp->p_ucred; + if (error = lookup(&nd)) { + free(nd.ni_cnd.cn_pnbuf, M_NAMEI); + nfsm_reply(0); + } + FREE(nd.ni_cnd.cn_pnbuf, M_NAMEI); + if (nd.ni_cnd.cn_flags & ISSYMLINK) { + vrele(nd.ni_dvp); + vput(nd.ni_vp); + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + error = EINVAL; + nfsm_reply(0); + } + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + error = ENXIO; + goto out; + } + vp = nd.ni_vp; + } else { + vrele(nd.ni_startdir); + free(nd.ni_cnd.cn_pnbuf, M_NAMEI); + vp = nd.ni_vp; + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nfsd->nd_nqlflag == NQL_NOVAL) { + tsize = 
fxdr_unsigned(long, sp->sa_nfssize); + if (tsize != -1) + vap->va_size = (u_quad_t)tsize; + else + vap->va_size = -1; + } else + fxdr_hyper(&sp->sa_nqsize, &vap->va_size); + if (vap->va_size != -1) { + if (error = nfsrv_access(vp, VWRITE, cred, + (nd.ni_cnd.cn_flags & RDONLY), nfsd->nd_procp)) { + vput(vp); + nfsm_reply(0); + } + nqsrv_getl(vp, NQL_WRITE); + if (error = VOP_SETATTR(vp, vap, cred, nfsd->nd_procp)) { + vput(vp); + nfsm_reply(0); + } + } + } + bzero((caddr_t)fhp, sizeof(nfh)); + fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid; + if (error = VFS_VPTOFH(vp, &fhp->fh_fid)) { + vput(vp); + nfsm_reply(0); + } + error = VOP_GETATTR(vp, vap, cred, nfsd->nd_procp); + vput(vp); + nfsm_reply(NFSX_FH+NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_srvfhtom(fhp); + nfsm_build(fp, struct nfsv2_fattr *, NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_srvfillattr; + return (error); +nfsmout: + if (nd.ni_cnd.cn_nameiop || nd.ni_cnd.cn_flags) + vrele(nd.ni_startdir); + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vput(nd.ni_vp); + return (error); + +out: + vrele(nd.ni_startdir); + free(nd.ni_cnd.cn_pnbuf, M_NAMEI); + nfsm_reply(0); +} + +/* + * nfs remove service + */ +nfsrv_remove(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + struct nameidata nd; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, cache, len; + char *cp2; + struct mbuf *mb, *mreq; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + u_quad_t frev; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_srvstrsiz(len, NFS_MAXNAMLEN); + nd.ni_cnd.cn_cred = cred; + nd.ni_cnd.cn_nameiop = DELETE; + nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; + if (error = nfs_namei(&nd, fhp, len, nfsd->nd_slp, nam, &md, &dpos, + nfsd->nd_procp)) + nfsm_reply(0); + vp = nd.ni_vp; + if (vp->v_type == 
VDIR && + (error = suser(cred, (u_short *)0))) + goto out; + /* + * The root of a mounted filesystem cannot be deleted. + */ + if (vp->v_flag & VROOT) { + error = EBUSY; + goto out; + } + if (vp->v_flag & VTEXT) + (void) vnode_pager_uncache(vp); +out: + if (!error) { + nqsrv_getl(nd.ni_dvp, NQL_WRITE); + nqsrv_getl(vp, NQL_WRITE); + error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vput(vp); + } + nfsm_reply(0); + nfsm_srvdone; +} + +/* + * nfs rename service + */ +nfsrv_rename(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, cache, len, len2; + char *cp2; + struct mbuf *mb, *mreq; + struct nameidata fromnd, tond; + struct vnode *fvp, *tvp, *tdvp; + nfsv2fh_t fnfh, tnfh; + fhandle_t *ffhp, *tfhp; + u_quad_t frev; + uid_t saved_uid; + + ffhp = &fnfh.fh_generic; + tfhp = &tnfh.fh_generic; + fromnd.ni_cnd.cn_nameiop = 0; + tond.ni_cnd.cn_nameiop = 0; + nfsm_srvmtofh(ffhp); + nfsm_srvstrsiz(len, NFS_MAXNAMLEN); + /* + * Remember our original uid so that we can reset cr_uid before + * the second nfs_namei() call, in case it is remapped. 
+ */ + saved_uid = cred->cr_uid; + fromnd.ni_cnd.cn_cred = cred; + fromnd.ni_cnd.cn_nameiop = DELETE; + fromnd.ni_cnd.cn_flags = WANTPARENT | SAVESTART; + if (error = nfs_namei(&fromnd, ffhp, len, nfsd->nd_slp, nam, &md, + &dpos, nfsd->nd_procp)) + nfsm_reply(0); + fvp = fromnd.ni_vp; + nfsm_srvmtofh(tfhp); + nfsm_strsiz(len2, NFS_MAXNAMLEN); + cred->cr_uid = saved_uid; + tond.ni_cnd.cn_cred = cred; + tond.ni_cnd.cn_nameiop = RENAME; + tond.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART; + if (error = nfs_namei(&tond, tfhp, len2, nfsd->nd_slp, nam, &md, + &dpos, nfsd->nd_procp)) { + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + tdvp = tond.ni_dvp; + tvp = tond.ni_vp; + if (tvp != NULL) { + if (fvp->v_type == VDIR && tvp->v_type != VDIR) { + error = EISDIR; + goto out; + } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { + error = ENOTDIR; + goto out; + } + if (tvp->v_type == VDIR && tvp->v_mountedhere) { + error = EXDEV; + goto out; + } + } + if (fvp->v_type == VDIR && fvp->v_mountedhere) { + error = EBUSY; + goto out; + } + if (fvp->v_mount != tdvp->v_mount) { + error = EXDEV; + goto out; + } + if (fvp == tdvp) + error = EINVAL; + /* + * If source is the same as the destination (that is the + * same vnode with the same name in the same directory), + * then there is nothing to do. 
+ */ + if (fvp == tvp && fromnd.ni_dvp == tdvp && + fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && + !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, + fromnd.ni_cnd.cn_namelen)) + error = -1; +out: + if (!error) { + nqsrv_getl(fromnd.ni_dvp, NQL_WRITE); + nqsrv_getl(tdvp, NQL_WRITE); + if (tvp) + nqsrv_getl(tvp, NQL_WRITE); + error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, + tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); + } else { + VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + } + vrele(tond.ni_startdir); + FREE(tond.ni_cnd.cn_pnbuf, M_NAMEI); +out1: + vrele(fromnd.ni_startdir); + FREE(fromnd.ni_cnd.cn_pnbuf, M_NAMEI); + nfsm_reply(0); + return (error); + +nfsmout: + if (tond.ni_cnd.cn_nameiop || tond.ni_cnd.cn_flags) { + vrele(tond.ni_startdir); + FREE(tond.ni_cnd.cn_pnbuf, M_NAMEI); + } + if (fromnd.ni_cnd.cn_nameiop || fromnd.ni_cnd.cn_flags) { + vrele(fromnd.ni_startdir); + FREE(fromnd.ni_cnd.cn_pnbuf, M_NAMEI); + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + } + return (error); +} + +/* + * nfs link service + */ +nfsrv_link(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + struct nameidata nd; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, rdonly, cache, len; + char *cp2; + struct mbuf *mb, *mreq; + struct vnode *vp, *xp; + nfsv2fh_t nfh, dnfh; + fhandle_t *fhp, *dfhp; + u_quad_t frev; + + fhp = &nfh.fh_generic; + dfhp = &dnfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_srvmtofh(dfhp); + nfsm_srvstrsiz(len, NFS_MAXNAMLEN); + if (error = nfsrv_fhtovp(fhp, FALSE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) + nfsm_reply(0); + if (vp->v_type == VDIR && (error = suser(cred, (u_short *)0))) + goto out1; + nd.ni_cnd.cn_cred 
= cred; + nd.ni_cnd.cn_nameiop = CREATE; + nd.ni_cnd.cn_flags = LOCKPARENT; + if (error = nfs_namei(&nd, dfhp, len, nfsd->nd_slp, nam, &md, &dpos, + nfsd->nd_procp)) + goto out1; + xp = nd.ni_vp; + if (xp != NULL) { + error = EEXIST; + goto out; + } + xp = nd.ni_dvp; + if (vp->v_mount != xp->v_mount) + error = EXDEV; +out: + if (!error) { + nqsrv_getl(vp, NQL_WRITE); + nqsrv_getl(xp, NQL_WRITE); + error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vrele(nd.ni_vp); + } +out1: + vrele(vp); + nfsm_reply(0); + nfsm_srvdone; +} + +/* + * nfs symbolic link service + */ +nfsrv_symlink(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + struct vattr va; + struct nameidata nd; + register struct vattr *vap = &va; + register u_long *tl; + register long t1; + struct nfsv2_sattr *sp; + caddr_t bpos; + struct uio io; + struct iovec iv; + int error = 0, cache, len, len2; + char *pathcp, *cp2; + struct mbuf *mb, *mreq; + nfsv2fh_t nfh; + fhandle_t *fhp; + u_quad_t frev; + + pathcp = (char *)0; + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_srvstrsiz(len, NFS_MAXNAMLEN); + nd.ni_cnd.cn_cred = cred; + nd.ni_cnd.cn_nameiop = CREATE; + nd.ni_cnd.cn_flags = LOCKPARENT; + if (error = nfs_namei(&nd, fhp, len, nfsd->nd_slp, nam, &md, &dpos, + nfsd->nd_procp)) + goto out; + nfsm_strsiz(len2, NFS_MAXPATHLEN); + MALLOC(pathcp, caddr_t, len2 + 1, M_TEMP, M_WAITOK); + iv.iov_base = pathcp; + iv.iov_len = len2; + io.uio_resid = len2; + io.uio_offset = 0; + io.uio_iov = &iv; + io.uio_iovcnt = 1; + io.uio_segflg = UIO_SYSSPACE; + io.uio_rw = UIO_READ; + io.uio_procp = (struct proc *)0; + nfsm_mtouio(&io, len2); + nfsm_dissect(sp, struct nfsv2_sattr *, NFSX_SATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + *(pathcp + len2) = '\0'; + if (nd.ni_vp) { + 
VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + error = EEXIST; + goto out; + } + VATTR_NULL(vap); + vap->va_mode = fxdr_unsigned(u_short, sp->sa_mode); + nqsrv_getl(nd.ni_dvp, NQL_WRITE); + error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap, pathcp); +out: + if (pathcp) + FREE(pathcp, M_TEMP); + nfsm_reply(0); + return (error); +nfsmout: + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vrele(nd.ni_vp); + if (pathcp) + FREE(pathcp, M_TEMP); + return (error); +} + +/* + * nfs mkdir service + */ +nfsrv_mkdir(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + struct vattr va; + register struct vattr *vap = &va; + register struct nfsv2_fattr *fp; + struct nameidata nd; + register caddr_t cp; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, cache, len; + char *cp2; + struct mbuf *mb, *mb2, *mreq; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + u_quad_t frev; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_srvstrsiz(len, NFS_MAXNAMLEN); + nd.ni_cnd.cn_cred = cred; + nd.ni_cnd.cn_nameiop = CREATE; + nd.ni_cnd.cn_flags = LOCKPARENT; + if (error = nfs_namei(&nd, fhp, len, nfsd->nd_slp, nam, &md, &dpos, + nfsd->nd_procp)) + nfsm_reply(0); + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + VATTR_NULL(vap); + vap->va_type = VDIR; + vap->va_mode = nfstov_mode(*tl++); + vp = nd.ni_vp; + if (vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(vp); + error = EEXIST; + nfsm_reply(0); + } + nqsrv_getl(nd.ni_dvp, NQL_WRITE); + if (error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, vap)) + nfsm_reply(0); + vp = nd.ni_vp; + bzero((caddr_t)fhp, sizeof(nfh)); + fhp->fh_fsid = 
vp->v_mount->mnt_stat.f_fsid; + if (error = VFS_VPTOFH(vp, &fhp->fh_fid)) { + vput(vp); + nfsm_reply(0); + } + error = VOP_GETATTR(vp, vap, cred, nfsd->nd_procp); + vput(vp); + nfsm_reply(NFSX_FH+NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_srvfhtom(fhp); + nfsm_build(fp, struct nfsv2_fattr *, NFSX_FATTR(nfsd->nd_nqlflag != NQL_NOVAL)); + nfsm_srvfillattr; + return (error); +nfsmout: + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vrele(nd.ni_vp); + return (error); +} + +/* + * nfs rmdir service + */ +nfsrv_rmdir(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, cache, len; + char *cp2; + struct mbuf *mb, *mreq; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + struct nameidata nd; + u_quad_t frev; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_srvstrsiz(len, NFS_MAXNAMLEN); + nd.ni_cnd.cn_cred = cred; + nd.ni_cnd.cn_nameiop = DELETE; + nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; + if (error = nfs_namei(&nd, fhp, len, nfsd->nd_slp, nam, &md, &dpos, + nfsd->nd_procp)) + nfsm_reply(0); + vp = nd.ni_vp; + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + /* + * No rmdir "." please. + */ + if (nd.ni_dvp == vp) { + error = EINVAL; + goto out; + } + /* + * The root of a mounted filesystem cannot be deleted. 
+ */ + if (vp->v_flag & VROOT) + error = EBUSY; +out: + if (!error) { + nqsrv_getl(nd.ni_dvp, NQL_WRITE); + nqsrv_getl(vp, NQL_WRITE); + error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vput(vp); + } + nfsm_reply(0); + nfsm_srvdone; +} + +/* + * nfs readdir service + * - mallocs what it thinks is enough to read + * count rounded up to a multiple of NFS_DIRBLKSIZ <= NFS_MAXREADDIR + * - calls VOP_READDIR() + * - loops around building the reply + * if the output generated exceeds count break out of loop + * The nfsm_clget macro is used here so that the reply will be packed + * tightly in mbuf clusters. + * - it only knows that it has encountered eof when the VOP_READDIR() + * reads nothing + * - as such one readdir rpc will return eof false although you are there + * and then the next will return eof + * - it trims out records with d_fileno == 0 + * this doesn't matter for Unix clients, but they might confuse clients + * for other os'. + * NB: It is tempting to set eof to true if the VOP_READDIR() reads less + * than requested, but this may not apply to all filesystems. For + * example, client NFS does not { although it is never remote mounted + * anyhow } + * The alternate call nqnfsrv_readdirlook() does lookups as well. + * PS: The NFS protocol spec. does not clarify what the "count" byte + * argument is a count of.. just name strings and file id's or the + * entire reply rpc or ... + * I tried just file name and id sizes and it confused the Sun client, + * so I am using the full rpc size now. The "paranoia.." comment refers + * to including the status longwords that are not a part of the dir. + * "entry" structures, but are in the rpc. 
+ */ +struct flrep { + u_long fl_cachable; + u_long fl_duration; + u_long fl_frev[2]; + nfsv2fh_t fl_nfh; + u_long fl_fattr[NFSX_NQFATTR / sizeof (u_long)]; +}; + +nfsrv_readdir(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register char *bp, *be; + register struct mbuf *mp; + register struct dirent *dp; + register caddr_t cp; + register u_long *tl; + register long t1; + caddr_t bpos; + struct mbuf *mb, *mb2, *mreq, *mp2; + char *cpos, *cend, *cp2, *rbuf; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + struct uio io; + struct iovec iv; + int len, nlen, rem, xfer, tsiz, i, error = 0; + int siz, cnt, fullsiz, eofflag, rdonly, cache; + u_quad_t frev; + u_long on, off, toff; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED); + toff = fxdr_unsigned(u_long, *tl++); + off = (toff & ~(NFS_DIRBLKSIZ-1)); + on = (toff & (NFS_DIRBLKSIZ-1)); + cnt = fxdr_unsigned(int, *tl); + siz = ((cnt+NFS_DIRBLKSIZ-1) & ~(NFS_DIRBLKSIZ-1)); + if (cnt > NFS_MAXREADDIR) + siz = NFS_MAXREADDIR; + fullsiz = siz; + if (error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) + nfsm_reply(0); + nqsrv_getl(vp, NQL_READ); + if (error = nfsrv_access(vp, VEXEC, cred, rdonly, nfsd->nd_procp)) { + vput(vp); + nfsm_reply(0); + } + VOP_UNLOCK(vp); + MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK); +again: + iv.iov_base = rbuf; + iv.iov_len = fullsiz; + io.uio_iov = &iv; + io.uio_iovcnt = 1; + io.uio_offset = (off_t)off; + io.uio_resid = fullsiz; + io.uio_segflg = UIO_SYSSPACE; + io.uio_rw = UIO_READ; + io.uio_procp = (struct proc *)0; + error = VOP_READDIR(vp, &io, cred); + off = (off_t)io.uio_offset; + if (error) { + vrele(vp); + free((caddr_t)rbuf, M_TEMP); + nfsm_reply(0); + } + if (io.uio_resid < fullsiz) + eofflag = 0; + else + eofflag = 1; + if (io.uio_resid) { + siz -= io.uio_resid; + + /* + * If nothing read, return eof + * 
rpc reply + */ + if (siz == 0) { + vrele(vp); + nfsm_reply(2*NFSX_UNSIGNED); + nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); + *tl++ = nfs_false; + *tl = nfs_true; + FREE((caddr_t)rbuf, M_TEMP); + return (0); + } + } + + /* + * Check for degenerate cases of nothing useful read. + * If so go try again + */ + cpos = rbuf + on; + cend = rbuf + siz; + dp = (struct dirent *)cpos; + while (cpos < cend && dp->d_fileno == 0) { + cpos += dp->d_reclen; + dp = (struct dirent *)cpos; + } + if (cpos >= cend) { + toff = off; + siz = fullsiz; + on = 0; + goto again; + } + + cpos = rbuf + on; + cend = rbuf + siz; + dp = (struct dirent *)cpos; + len = 3*NFSX_UNSIGNED; /* paranoia, probably can be 0 */ + nfsm_reply(siz); + mp = mp2 = mb; + bp = bpos; + be = bp + M_TRAILINGSPACE(mp); + + /* Loop through the records and build reply */ + while (cpos < cend) { + if (dp->d_fileno != 0) { + nlen = dp->d_namlen; + rem = nfsm_rndup(nlen)-nlen; + len += (4*NFSX_UNSIGNED + nlen + rem); + if (len > cnt) { + eofflag = 0; + break; + } + /* + * Build the directory record xdr from + * the dirent entry. 
+ */ + nfsm_clget; + *tl = nfs_true; + bp += NFSX_UNSIGNED; + nfsm_clget; + *tl = txdr_unsigned(dp->d_fileno); + bp += NFSX_UNSIGNED; + nfsm_clget; + *tl = txdr_unsigned(nlen); + bp += NFSX_UNSIGNED; + + /* And loop around copying the name */ + xfer = nlen; + cp = dp->d_name; + while (xfer > 0) { + nfsm_clget; + if ((bp+xfer) > be) + tsiz = be-bp; + else + tsiz = xfer; + bcopy(cp, bp, tsiz); + bp += tsiz; + xfer -= tsiz; + if (xfer > 0) + cp += tsiz; + } + /* And null pad to a long boundary */ + for (i = 0; i < rem; i++) + *bp++ = '\0'; + nfsm_clget; + + /* Finish off the record */ + toff += dp->d_reclen; + *tl = txdr_unsigned(toff); + bp += NFSX_UNSIGNED; + } else + toff += dp->d_reclen; + cpos += dp->d_reclen; + dp = (struct dirent *)cpos; + } + vrele(vp); + nfsm_clget; + *tl = nfs_false; + bp += NFSX_UNSIGNED; + nfsm_clget; + if (eofflag) + *tl = nfs_true; + else + *tl = nfs_false; + bp += NFSX_UNSIGNED; + if (mp != mb) { + if (bp < be) + mp->m_len = bp - mtod(mp, caddr_t); + } else + mp->m_len += bp - bpos; + FREE(rbuf, M_TEMP); + nfsm_srvdone; +} + +nqnfsrv_readdirlook(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register char *bp, *be; + register struct mbuf *mp; + register struct dirent *dp; + register caddr_t cp; + register u_long *tl; + register long t1; + caddr_t bpos; + struct mbuf *mb, *mb2, *mreq, *mp2; + char *cpos, *cend, *cp2, *rbuf; + struct vnode *vp, *nvp; + struct flrep fl; + nfsv2fh_t nfh; + fhandle_t *fhp; + struct uio io; + struct iovec iv; + struct vattr va, *vap = &va; + struct nfsv2_fattr *fp; + int len, nlen, rem, xfer, tsiz, i, error = 0, duration2, cache2; + int siz, cnt, fullsiz, eofflag, rdonly, cache; + u_quad_t frev, frev2; + u_long on, off, toff; + + fhp = &nfh.fh_generic; + nfsm_srvmtofh(fhp); + nfsm_dissect(tl, u_long *, 3*NFSX_UNSIGNED); + toff = fxdr_unsigned(u_long, *tl++); + off = (toff & ~(NFS_DIRBLKSIZ-1)); + on = 
(toff & (NFS_DIRBLKSIZ-1)); + cnt = fxdr_unsigned(int, *tl++); + duration2 = fxdr_unsigned(int, *tl); + siz = ((cnt+NFS_DIRBLKSIZ-1) & ~(NFS_DIRBLKSIZ-1)); + if (cnt > NFS_MAXREADDIR) + siz = NFS_MAXREADDIR; + fullsiz = siz; + if (error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) + nfsm_reply(0); + nqsrv_getl(vp, NQL_READ); + if (error = nfsrv_access(vp, VEXEC, cred, rdonly, nfsd->nd_procp)) { + vput(vp); + nfsm_reply(0); + } + VOP_UNLOCK(vp); + MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK); +again: + iv.iov_base = rbuf; + iv.iov_len = fullsiz; + io.uio_iov = &iv; + io.uio_iovcnt = 1; + io.uio_offset = (off_t)off; + io.uio_resid = fullsiz; + io.uio_segflg = UIO_SYSSPACE; + io.uio_rw = UIO_READ; + io.uio_procp = (struct proc *)0; + error = VOP_READDIR(vp, &io, cred); + off = (u_long)io.uio_offset; + if (error) { + vrele(vp); + free((caddr_t)rbuf, M_TEMP); + nfsm_reply(0); + } + if (io.uio_resid < fullsiz) + eofflag = 0; + else + eofflag = 1; + if (io.uio_resid) { + siz -= io.uio_resid; + + /* + * If nothing read, return eof + * rpc reply + */ + if (siz == 0) { + vrele(vp); + nfsm_reply(2 * NFSX_UNSIGNED); + nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); + *tl++ = nfs_false; + *tl = nfs_true; + FREE((caddr_t)rbuf, M_TEMP); + return (0); + } + } + + /* + * Check for degenerate cases of nothing useful read. 
+ * If so go try again + */ + cpos = rbuf + on; + cend = rbuf + siz; + dp = (struct dirent *)cpos; + while (cpos < cend && dp->d_fileno == 0) { + cpos += dp->d_reclen; + dp = (struct dirent *)cpos; + } + if (cpos >= cend) { + toff = off; + siz = fullsiz; + on = 0; + goto again; + } + + cpos = rbuf + on; + cend = rbuf + siz; + dp = (struct dirent *)cpos; + len = 3 * NFSX_UNSIGNED; /* paranoia, probably can be 0 */ + nfsm_reply(siz); + mp = mp2 = mb; + bp = bpos; + be = bp + M_TRAILINGSPACE(mp); + + /* Loop through the records and build reply */ + while (cpos < cend) { + if (dp->d_fileno != 0) { + nlen = dp->d_namlen; + rem = nfsm_rndup(nlen)-nlen; + + /* + * For readdir_and_lookup get the vnode using + * the file number. + */ + if (VFS_VGET(vp->v_mount, dp->d_fileno, &nvp)) + goto invalid; + bzero((caddr_t)&fl.fl_nfh, sizeof (nfsv2fh_t)); + fl.fl_nfh.fh_generic.fh_fsid = + nvp->v_mount->mnt_stat.f_fsid; + if (VFS_VPTOFH(nvp, &fl.fl_nfh.fh_generic.fh_fid)) { + vput(nvp); + goto invalid; + } + if (duration2) { + (void) nqsrv_getlease(nvp, &duration2, NQL_READ, + nfsd, nam, &cache2, &frev2, cred); + fl.fl_duration = txdr_unsigned(duration2); + fl.fl_cachable = txdr_unsigned(cache2); + txdr_hyper(&frev2, fl.fl_frev); + } else + fl.fl_duration = 0; + if (VOP_GETATTR(nvp, vap, cred, nfsd->nd_procp)) { + vput(nvp); + goto invalid; + } + vput(nvp); + fp = (struct nfsv2_fattr *)&fl.fl_fattr; + nfsm_srvfillattr; + len += (4*NFSX_UNSIGNED + nlen + rem + NFSX_FH + + NFSX_NQFATTR); + if (len > cnt) { + eofflag = 0; + break; + } + /* + * Build the directory record xdr from + * the dirent entry. + */ + nfsm_clget; + *tl = nfs_true; + bp += NFSX_UNSIGNED; + + /* + * For readdir_and_lookup copy the stuff out. 
+ */ + xfer = sizeof (struct flrep); + cp = (caddr_t)&fl; + while (xfer > 0) { + nfsm_clget; + if ((bp+xfer) > be) + tsiz = be-bp; + else + tsiz = xfer; + bcopy(cp, bp, tsiz); + bp += tsiz; + xfer -= tsiz; + if (xfer > 0) + cp += tsiz; + } + nfsm_clget; + *tl = txdr_unsigned(dp->d_fileno); + bp += NFSX_UNSIGNED; + nfsm_clget; + *tl = txdr_unsigned(nlen); + bp += NFSX_UNSIGNED; + + /* And loop around copying the name */ + xfer = nlen; + cp = dp->d_name; + while (xfer > 0) { + nfsm_clget; + if ((bp+xfer) > be) + tsiz = be-bp; + else + tsiz = xfer; + bcopy(cp, bp, tsiz); + bp += tsiz; + xfer -= tsiz; + if (xfer > 0) + cp += tsiz; + } + /* And null pad to a long boundary */ + for (i = 0; i < rem; i++) + *bp++ = '\0'; + nfsm_clget; + + /* Finish off the record */ + toff += dp->d_reclen; + *tl = txdr_unsigned(toff); + bp += NFSX_UNSIGNED; + } else +invalid: + toff += dp->d_reclen; + cpos += dp->d_reclen; + dp = (struct dirent *)cpos; + } + vrele(vp); + nfsm_clget; + *tl = nfs_false; + bp += NFSX_UNSIGNED; + nfsm_clget; + if (eofflag) + *tl = nfs_true; + else + *tl = nfs_false; + bp += NFSX_UNSIGNED; + if (mp != mb) { + if (bp < be) + mp->m_len = bp - mtod(mp, caddr_t); + } else + mp->m_len += bp - bpos; + FREE(rbuf, M_TEMP); + nfsm_srvdone; +} + +/* + * nfs statfs service + */ +nfsrv_statfs(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + register struct statfs *sf; + register struct nfsv2_statfs *sfp; + register u_long *tl; + register long t1; + caddr_t bpos; + int error = 0, rdonly, cache, isnq; + char *cp2; + struct mbuf *mb, *mb2, *mreq; + struct vnode *vp; + nfsv2fh_t nfh; + fhandle_t *fhp; + struct statfs statfs; + u_quad_t frev; + + fhp = &nfh.fh_generic; + isnq = (nfsd->nd_nqlflag != NQL_NOVAL); + nfsm_srvmtofh(fhp); + if (error = nfsrv_fhtovp(fhp, TRUE, &vp, cred, nfsd->nd_slp, nam, &rdonly)) + nfsm_reply(0); + sf = &statfs; + error = VFS_STATFS(vp->v_mount, 
sf, nfsd->nd_procp); + vput(vp); + nfsm_reply(NFSX_STATFS(isnq)); + nfsm_build(sfp, struct nfsv2_statfs *, NFSX_STATFS(isnq)); + sfp->sf_tsize = txdr_unsigned(NFS_MAXDGRAMDATA); + sfp->sf_bsize = txdr_unsigned(sf->f_bsize); + sfp->sf_blocks = txdr_unsigned(sf->f_blocks); + sfp->sf_bfree = txdr_unsigned(sf->f_bfree); + sfp->sf_bavail = txdr_unsigned(sf->f_bavail); + if (isnq) { + sfp->sf_files = txdr_unsigned(sf->f_files); + sfp->sf_ffree = txdr_unsigned(sf->f_ffree); + } + nfsm_srvdone; +} + +/* + * Null operation, used by clients to ping server + */ +/* ARGSUSED */ +nfsrv_null(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + caddr_t bpos; + int error = VNOVAL, cache; + struct mbuf *mb, *mreq; + u_quad_t frev; + + nfsm_reply(0); + return (error); +} + +/* + * No operation, used for obsolete procedures + */ +/* ARGSUSED */ +nfsrv_noop(nfsd, mrep, md, dpos, cred, nam, mrq) + struct nfsd *nfsd; + struct mbuf *mrep, *md; + caddr_t dpos; + struct ucred *cred; + struct mbuf *nam, **mrq; +{ + caddr_t bpos; + int error, cache; + struct mbuf *mb, *mreq; + u_quad_t frev; + + if (nfsd->nd_repstat) + error = nfsd->nd_repstat; + else + error = EPROCUNAVAIL; + nfsm_reply(0); + return (error); +} + +/* + * Perform access checking for vnodes obtained from file handles that would + * refer to files already opened by a Unix client. You cannot just use + * vn_writechk() and VOP_ACCESS() for two reasons. + * 1 - You must check for exported rdonly as well as MNT_RDONLY for the write case + * 2 - The owner is to be given access irrespective of mode bits so that + * processes that chmod after opening a file don't break. I don't like + * this because it opens a security hole, but since the nfs server opens + * a security hole the size of a barn door anyhow, what the heck. 
+ */ +nfsrv_access(vp, flags, cred, rdonly, p) + register struct vnode *vp; + int flags; + register struct ucred *cred; + int rdonly; + struct proc *p; +{ + struct vattr vattr; + int error; + if (flags & VWRITE) { + /* Just vn_writechk() changed to check rdonly */ + /* + * Disallow write attempts on read-only file systems; + * unless the file is a socket or a block or character + * device resident on the file system. + */ + if (rdonly || (vp->v_mount->mnt_flag & MNT_RDONLY)) { + switch (vp->v_type) { + case VREG: case VDIR: case VLNK: + return (EROFS); + } + } + /* + * If there's shared text associated with + * the inode, try to free it up once. If + * we fail, we can't allow writing. + */ + if ((vp->v_flag & VTEXT) && !vnode_pager_uncache(vp)) + return (ETXTBSY); + } + if (error = VOP_GETATTR(vp, &vattr, cred, p)) + return (error); + if ((error = VOP_ACCESS(vp, flags, cred, p)) && + cred->cr_uid != vattr.va_uid) + return (error); + return (0); +} diff --git a/sys/nfsserver/nfs_srvcache.c b/sys/nfsserver/nfs_srvcache.c new file mode 100644 index 00000000000..63d8bb72d82 --- /dev/null +++ b/sys/nfsserver/nfs_srvcache.c @@ -0,0 +1,348 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs_srvcache.c 8.1 (Berkeley) 6/10/93 + */ + +/* + * Reference: Chet Juszczak, "Improving the Performance and Correctness + * of an NFS Server", in Proc. Winter 1989 USENIX Conference, + * pages 53-63. San Diego, February 1989. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#ifdef ISO +#include +#endif +#include +#include +#include +#include +#include +#include + +long numnfsrvcache, desirednfsrvcache = NFSRVCACHESIZ; + +#define NFSRCHASH(xid) (((xid) + ((xid) >> 24)) & rheadhash) +static struct nfsrvcache *nfsrvlruhead, **nfsrvlrutail = &nfsrvlruhead; +static struct nfsrvcache **rheadhtbl; +static u_long rheadhash; + +#define TRUE 1 +#define FALSE 0 + +#define NETFAMILY(rp) \ + (((rp)->rc_flag & RC_INETADDR) ? AF_INET : AF_ISO) + +/* + * Static array that defines which nfs rpc's are nonidempotent + */ +int nonidempotent[NFS_NPROCS] = { + FALSE, + FALSE, + TRUE, + FALSE, + FALSE, + FALSE, + FALSE, + FALSE, + TRUE, + TRUE, + TRUE, + TRUE, + TRUE, + TRUE, + TRUE, + TRUE, + FALSE, + FALSE, + FALSE, + FALSE, + FALSE, + FALSE, + FALSE, +}; + +/* True iff the rpc reply is an nfs status ONLY! */ +static int repliesstatus[NFS_NPROCS] = { + FALSE, + FALSE, + FALSE, + FALSE, + FALSE, + FALSE, + FALSE, + FALSE, + FALSE, + FALSE, + TRUE, + TRUE, + TRUE, + TRUE, + FALSE, + TRUE, + FALSE, + FALSE, + FALSE, + FALSE, + FALSE, + FALSE, + TRUE, +}; + +/* + * Initialize the server request cache list + */ +nfsrv_initcache() +{ + + rheadhtbl = hashinit(desirednfsrvcache, M_NFSD, &rheadhash); +} + +/* + * Look for the request in the cache + * If found then + * return action and optionally reply + * else + * insert it in the cache + * + * The rules are as follows: + * - if in progress, return DROP request + * - if completed within DELAY of the current time, return DROP it + * - if completed a longer time ago return REPLY if the reply was cached or + * return DOIT + * Update/add new request at end of lru list + */ +nfsrv_getcache(nam, nd, repp) + struct mbuf *nam; + register struct nfsd *nd; + struct mbuf **repp; +{ + register struct nfsrvcache *rp, *rq, **rpp; + struct mbuf *mb; + struct sockaddr_in *saddr; + caddr_t bpos; + int ret; + + if 
(nd->nd_nqlflag != NQL_NOVAL) + return (RC_DOIT); + rpp = &rheadhtbl[NFSRCHASH(nd->nd_retxid)]; +loop: + for (rp = *rpp; rp; rp = rp->rc_forw) { + if (nd->nd_retxid == rp->rc_xid && nd->nd_procnum == rp->rc_proc && + netaddr_match(NETFAMILY(rp), &rp->rc_haddr, nam)) { + if ((rp->rc_flag & RC_LOCKED) != 0) { + rp->rc_flag |= RC_WANTED; + (void) tsleep((caddr_t)rp, PZERO-1, "nfsrc", 0); + goto loop; + } + rp->rc_flag |= RC_LOCKED; + /* If not at end of LRU chain, move it there */ + if (rp->rc_next) { + /* remove from LRU chain */ + *rp->rc_prev = rp->rc_next; + rp->rc_next->rc_prev = rp->rc_prev; + /* and replace at end of it */ + rp->rc_next = NULL; + rp->rc_prev = nfsrvlrutail; + *nfsrvlrutail = rp; + nfsrvlrutail = &rp->rc_next; + } + if (rp->rc_state == RC_UNUSED) + panic("nfsrv cache"); + if (rp->rc_state == RC_INPROG) { + nfsstats.srvcache_inproghits++; + ret = RC_DROPIT; + } else if (rp->rc_flag & RC_REPSTATUS) { + nfsstats.srvcache_nonidemdonehits++; + nfs_rephead(0, nd, rp->rc_status, + 0, (u_quad_t *)0, repp, &mb, &bpos); + ret = RC_REPLY; + } else if (rp->rc_flag & RC_REPMBUF) { + nfsstats.srvcache_nonidemdonehits++; + *repp = m_copym(rp->rc_reply, 0, M_COPYALL, + M_WAIT); + ret = RC_REPLY; + } else { + nfsstats.srvcache_idemdonehits++; + rp->rc_state = RC_INPROG; + ret = RC_DOIT; + } + rp->rc_flag &= ~RC_LOCKED; + if (rp->rc_flag & RC_WANTED) { + rp->rc_flag &= ~RC_WANTED; + wakeup((caddr_t)rp); + } + return (ret); + } + } + nfsstats.srvcache_misses++; + if (numnfsrvcache < desirednfsrvcache) { + rp = (struct nfsrvcache *)malloc((u_long)sizeof *rp, + M_NFSD, M_WAITOK); + bzero((char *)rp, sizeof *rp); + numnfsrvcache++; + rp->rc_flag = RC_LOCKED; + } else { + rp = nfsrvlruhead; + while ((rp->rc_flag & RC_LOCKED) != 0) { + rp->rc_flag |= RC_WANTED; + (void) tsleep((caddr_t)rp, PZERO-1, "nfsrc", 0); + rp = nfsrvlruhead; + } + rp->rc_flag |= RC_LOCKED; + /* remove from hash chain */ + if (rq = rp->rc_forw) + rq->rc_back = rp->rc_back; + *rp->rc_back = rq; + 
/* remove from LRU chain */ + *rp->rc_prev = rp->rc_next; + rp->rc_next->rc_prev = rp->rc_prev; + if (rp->rc_flag & RC_REPMBUF) + m_freem(rp->rc_reply); + if (rp->rc_flag & RC_NAM) + MFREE(rp->rc_nam, mb); + rp->rc_flag &= (RC_LOCKED | RC_WANTED); + } + /* place at end of LRU list */ + rp->rc_next = NULL; + rp->rc_prev = nfsrvlrutail; + *nfsrvlrutail = rp; + nfsrvlrutail = &rp->rc_next; + rp->rc_state = RC_INPROG; + rp->rc_xid = nd->nd_retxid; + saddr = mtod(nam, struct sockaddr_in *); + switch (saddr->sin_family) { + case AF_INET: + rp->rc_flag |= RC_INETADDR; + rp->rc_inetaddr = saddr->sin_addr.s_addr; + break; + case AF_ISO: + default: + rp->rc_flag |= RC_NAM; + rp->rc_nam = m_copym(nam, 0, M_COPYALL, M_WAIT); + break; + }; + rp->rc_proc = nd->nd_procnum; + /* insert into hash chain */ + if (rq = *rpp) + rq->rc_back = &rp->rc_forw; + rp->rc_forw = rq; + rp->rc_back = rpp; + *rpp = rp; + rp->rc_flag &= ~RC_LOCKED; + if (rp->rc_flag & RC_WANTED) { + rp->rc_flag &= ~RC_WANTED; + wakeup((caddr_t)rp); + } + return (RC_DOIT); +} + +/* + * Update a request cache entry after the rpc has been done + */ +void +nfsrv_updatecache(nam, nd, repvalid, repmbuf) + struct mbuf *nam; + register struct nfsd *nd; + int repvalid; + struct mbuf *repmbuf; +{ + register struct nfsrvcache *rp; + + if (nd->nd_nqlflag != NQL_NOVAL) + return; +loop: + for (rp = rheadhtbl[NFSRCHASH(nd->nd_retxid)]; rp; rp = rp->rc_forw) { + if (nd->nd_retxid == rp->rc_xid && nd->nd_procnum == rp->rc_proc && + netaddr_match(NETFAMILY(rp), &rp->rc_haddr, nam)) { + if ((rp->rc_flag & RC_LOCKED) != 0) { + rp->rc_flag |= RC_WANTED; + (void) tsleep((caddr_t)rp, PZERO-1, "nfsrc", 0); + goto loop; + } + rp->rc_flag |= RC_LOCKED; + rp->rc_state = RC_DONE; + /* + * If we have a valid reply update status and save + * the reply for non-idempotent rpc's. 
+ */ + if (repvalid && nonidempotent[nd->nd_procnum]) { + if (repliesstatus[nd->nd_procnum]) { + rp->rc_status = nd->nd_repstat; + rp->rc_flag |= RC_REPSTATUS; + } else { + rp->rc_reply = m_copym(repmbuf, + 0, M_COPYALL, M_WAIT); + rp->rc_flag |= RC_REPMBUF; + } + } + rp->rc_flag &= ~RC_LOCKED; + if (rp->rc_flag & RC_WANTED) { + rp->rc_flag &= ~RC_WANTED; + wakeup((caddr_t)rp); + } + return; + } + } +} + +/* + * Clean out the cache. Called when the last nfsd terminates. + */ +void +nfsrv_cleancache() +{ + register struct nfsrvcache *rp, *nextrp; + + for (rp = nfsrvlruhead; rp; rp = nextrp) { + nextrp = rp->rc_next; + free(rp, M_NFSD); + } + bzero((char *)rheadhtbl, (rheadhash + 1) * sizeof(void *)); + nfsrvlruhead = NULL; + nfsrvlrutail = &nfsrvlruhead; + numnfsrvcache = 0; +} diff --git a/sys/nfsserver/nfs_srvsock.c b/sys/nfsserver/nfs_srvsock.c new file mode 100644 index 00000000000..cf88ed33d92 --- /dev/null +++ b/sys/nfsserver/nfs_srvsock.c @@ -0,0 +1,1990 @@ +/* + * Copyright (c) 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs_socket.c 8.3 (Berkeley) 1/12/94 + */ + +/* + * Socket operations for use by nfs + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TRUE 1 +#define FALSE 0 + +/* + * Estimate rto for an nfs rpc sent via. an unreliable datagram. + * Use the mean and mean deviation of rtt for the appropriate type of rpc + * for the frequent rpcs and a default for the others. + * The justification for doing "other" this way is that these rpcs + * happen so infrequently that timer est. would probably be stale. + * Also, since many of these rpcs are + * non-idempotent, a conservative timeout is desired. + * getattr, lookup - A+2D + * read, write - A+4D + * other - nm_timeo + */ +#define NFS_RTO(n, t) \ + ((t) == 0 ? (n)->nm_timeo : \ + ((t) < 3 ? 
\ + (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \ + ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1))) +#define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1] +#define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1] +/* + * External data, mostly RPC constants in XDR form + */ +extern u_long rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, rpc_auth_unix, + rpc_msgaccepted, rpc_call, rpc_autherr, rpc_rejectedcred, + rpc_auth_kerb; +extern u_long nfs_prog, nfs_vers, nqnfs_prog, nqnfs_vers; +extern time_t nqnfsstarttime; +extern int nonidempotent[NFS_NPROCS]; + +/* + * Maps errno values to nfs error numbers. + * Use NFSERR_IO as the catch all for ones not specifically defined in + * RFC 1094. + */ +static int nfsrv_errmap[ELAST] = { + NFSERR_PERM, NFSERR_NOENT, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_NXIO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_EXIST, NFSERR_IO, NFSERR_NODEV, NFSERR_NOTDIR, + NFSERR_ISDIR, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_IO, NFSERR_ROFS, + NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_IO, NFSERR_NAMETOL, NFSERR_IO, NFSERR_IO, + NFSERR_NOTEMPTY, NFSERR_IO, NFSERR_IO, NFSERR_DQUOT, NFSERR_STALE, + NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, + NFSERR_IO, +}; + +/* + * Defines which timer to use for the procnum. 
+ * 0 - default + * 1 - getattr + * 2 - lookup + * 3 - read + * 4 - write + */ +static int proct[NFS_NPROCS] = { + 0, 1, 0, 0, 2, 3, 3, 0, 4, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 0, 0, 0, +}; + +/* + * There is a congestion window for outstanding rpcs maintained per mount + * point. The cwnd size is adjusted in roughly the way that: + * Van Jacobson, Congestion avoidance and Control, In "Proceedings of + * SIGCOMM '88". ACM, August 1988. + * describes for TCP. The cwnd size is chopped in half on a retransmit timeout + * and incremented by 1/cwnd when each rpc reply is received and a full cwnd + * of rpcs is in progress. + * (The sent count and cwnd are scaled for integer arith.) + * Variants of "slow start" were tried and were found to be too much of a + * performance hit (ave. rtt 3 times larger), + * I suspect due to the large rtt that nfs rpcs have. + */ +#define NFS_CWNDSCALE 256 +#define NFS_MAXCWND (NFS_CWNDSCALE * 32) +static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, }; +int nfs_sbwait(); +void nfs_disconnect(), nfs_realign(), nfsrv_wakenfsd(), nfs_sndunlock(); +void nfs_rcvunlock(), nqnfs_serverd(), nqnfs_clientlease(); +struct mbuf *nfsm_rpchead(); +int nfsrtton = 0; +struct nfsrtt nfsrtt; +struct nfsd nfsd_head; + +int nfsrv_null(), + nfsrv_getattr(), + nfsrv_setattr(), + nfsrv_lookup(), + nfsrv_readlink(), + nfsrv_read(), + nfsrv_write(), + nfsrv_create(), + nfsrv_remove(), + nfsrv_rename(), + nfsrv_link(), + nfsrv_symlink(), + nfsrv_mkdir(), + nfsrv_rmdir(), + nfsrv_readdir(), + nfsrv_statfs(), + nfsrv_noop(), + nqnfsrv_readdirlook(), + nqnfsrv_getlease(), + nqnfsrv_vacated(), + nqnfsrv_access(); + +int (*nfsrv_procs[NFS_NPROCS])() = { + nfsrv_null, + nfsrv_getattr, + nfsrv_setattr, + nfsrv_noop, + nfsrv_lookup, + nfsrv_readlink, + nfsrv_read, + nfsrv_noop, + nfsrv_write, + nfsrv_create, + nfsrv_remove, + nfsrv_rename, + nfsrv_link, + nfsrv_symlink, + nfsrv_mkdir, + nfsrv_rmdir, + nfsrv_readdir, + nfsrv_statfs, + nqnfsrv_readdirlook, + 
nqnfsrv_getlease, + nqnfsrv_vacated, + nfsrv_noop, + nqnfsrv_access, +}; + +struct nfsreq nfsreqh; + +/* + * Initialize sockets and congestion for a new NFS connection. + * We do not free the sockaddr if error. + */ +nfs_connect(nmp, rep) + register struct nfsmount *nmp; + struct nfsreq *rep; +{ + register struct socket *so; + int s, error, rcvreserve, sndreserve; + struct sockaddr *saddr; + struct sockaddr_in *sin; + struct mbuf *m; + u_short tport; + + nmp->nm_so = (struct socket *)0; + saddr = mtod(nmp->nm_nam, struct sockaddr *); + if (error = socreate(saddr->sa_family, + &nmp->nm_so, nmp->nm_sotype, nmp->nm_soproto)) + goto bad; + so = nmp->nm_so; + nmp->nm_soflags = so->so_proto->pr_flags; + + /* + * Some servers require that the client port be a reserved port number. + */ + if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) { + MGET(m, M_WAIT, MT_SONAME); + sin = mtod(m, struct sockaddr_in *); + sin->sin_len = m->m_len = sizeof (struct sockaddr_in); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = INADDR_ANY; + tport = IPPORT_RESERVED - 1; + sin->sin_port = htons(tport); + while ((error = sobind(so, m)) == EADDRINUSE && + --tport > IPPORT_RESERVED / 2) + sin->sin_port = htons(tport); + m_freem(m); + if (error) + goto bad; + } + + /* + * Protocols that do not require connections may be optionally left + * unconnected for servers that reply from a port other than NFS_PORT. + */ + if (nmp->nm_flag & NFSMNT_NOCONN) { + if (nmp->nm_soflags & PR_CONNREQUIRED) { + error = ENOTCONN; + goto bad; + } + } else { + if (error = soconnect(so, nmp->nm_nam)) + goto bad; + + /* + * Wait for the connection to complete. Cribbed from the + * connect system call but with the wait timing out so + * that interruptible mounts don't hang here for a long time. 
+ */ + s = splnet(); + while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { + (void) tsleep((caddr_t)&so->so_timeo, PSOCK, + "nfscon", 2 * hz); + if ((so->so_state & SS_ISCONNECTING) && + so->so_error == 0 && rep && + (error = nfs_sigintr(nmp, rep, rep->r_procp))) { + so->so_state &= ~SS_ISCONNECTING; + splx(s); + goto bad; + } + } + if (so->so_error) { + error = so->so_error; + so->so_error = 0; + splx(s); + goto bad; + } + splx(s); + } + if (nmp->nm_flag & (NFSMNT_SOFT | NFSMNT_INT)) { + so->so_rcv.sb_timeo = (5 * hz); + so->so_snd.sb_timeo = (5 * hz); + } else { + so->so_rcv.sb_timeo = 0; + so->so_snd.sb_timeo = 0; + } + if (nmp->nm_sotype == SOCK_DGRAM) { + sndreserve = nmp->nm_wsize + NFS_MAXPKTHDR; + rcvreserve = nmp->nm_rsize + NFS_MAXPKTHDR; + } else if (nmp->nm_sotype == SOCK_SEQPACKET) { + sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * 2; + rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR) * 2; + } else { + if (nmp->nm_sotype != SOCK_STREAM) + panic("nfscon sotype"); + if (so->so_proto->pr_flags & PR_CONNREQUIRED) { + MGET(m, M_WAIT, MT_SOOPTS); + *mtod(m, int *) = 1; + m->m_len = sizeof(int); + sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m); + } + if (so->so_proto->pr_protocol == IPPROTO_TCP) { + MGET(m, M_WAIT, MT_SOOPTS); + *mtod(m, int *) = 1; + m->m_len = sizeof(int); + sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m); + } + sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR + sizeof (u_long)) + * 2; + rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR + sizeof (u_long)) + * 2; + } + if (error = soreserve(so, sndreserve, rcvreserve)) + goto bad; + so->so_rcv.sb_flags |= SB_NOINTR; + so->so_snd.sb_flags |= SB_NOINTR; + + /* Initialize other non-zero congestion variables */ + nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] = nmp->nm_srtt[3] = + nmp->nm_srtt[4] = (NFS_TIMEO << 3); + nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] = + nmp->nm_sdrtt[3] = nmp->nm_sdrtt[4] = 0; + nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */ + nmp->nm_sent = 0; + 
nmp->nm_timeouts = 0; + return (0); + +bad: + nfs_disconnect(nmp); + return (error); +} + +/* + * Reconnect routine: + * Called when a connection is broken on a reliable protocol. + * - clean up the old socket + * - nfs_connect() again + * - set R_MUSTRESEND for all outstanding requests on mount point + * If this fails the mount point is DEAD! + * nb: Must be called with the nfs_sndlock() set on the mount point. + */ +nfs_reconnect(rep) + register struct nfsreq *rep; +{ + register struct nfsreq *rp; + register struct nfsmount *nmp = rep->r_nmp; + int error; + + nfs_disconnect(nmp); + while (error = nfs_connect(nmp, rep)) { + if (error == EINTR || error == ERESTART) + return (EINTR); + (void) tsleep((caddr_t)&lbolt, PSOCK, "nfscon", 0); + } + + /* + * Loop through outstanding request list and fix up all requests + * on old socket. + */ + rp = nfsreqh.r_next; + while (rp != &nfsreqh) { + if (rp->r_nmp == nmp) + rp->r_flags |= R_MUSTRESEND; + rp = rp->r_next; + } + return (0); +} + +/* + * NFS disconnect. Clean up and unlink. + */ +void +nfs_disconnect(nmp) + register struct nfsmount *nmp; +{ + register struct socket *so; + + if (nmp->nm_so) { + so = nmp->nm_so; + nmp->nm_so = (struct socket *)0; + soshutdown(so, 2); + soclose(so); + } +} + +/* + * This is the nfs send routine. For connection based socket types, it + * must be called with an nfs_sndlock() on the socket. + * "rep == NULL" indicates that it has been called from a server. + * For the client side: + * - return EINTR if the RPC is terminated, 0 otherwise + * - set R_MUSTRESEND if the send fails for any reason + * - do any cleanup required by recoverable socket errors (???) + * For the server side: + * - return EINTR or ERESTART if interrupted by a signal + * - return EPIPE if a connection is lost for connection based sockets (TCP...) + * - do any cleanup required by recoverable socket errors (???) 
+ */ +nfs_send(so, nam, top, rep) + register struct socket *so; + struct mbuf *nam; + register struct mbuf *top; + struct nfsreq *rep; +{ + struct mbuf *sendnam; + int error, soflags, flags; + + if (rep) { + if (rep->r_flags & R_SOFTTERM) { + m_freem(top); + return (EINTR); + } + if ((so = rep->r_nmp->nm_so) == NULL) { + rep->r_flags |= R_MUSTRESEND; + m_freem(top); + return (0); + } + rep->r_flags &= ~R_MUSTRESEND; + soflags = rep->r_nmp->nm_soflags; + } else + soflags = so->so_proto->pr_flags; + if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED)) + sendnam = (struct mbuf *)0; + else + sendnam = nam; + if (so->so_type == SOCK_SEQPACKET) + flags = MSG_EOR; + else + flags = 0; + + error = sosend(so, sendnam, (struct uio *)0, top, + (struct mbuf *)0, flags); + if (error) { + if (rep) { + log(LOG_INFO, "nfs send error %d for server %s\n",error, + rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + /* + * Deal with errors for the client side. + */ + if (rep->r_flags & R_SOFTTERM) + error = EINTR; + else + rep->r_flags |= R_MUSTRESEND; + } else + log(LOG_INFO, "nfsd send error %d\n", error); + + /* + * Handle any recoverable (soft) socket errors here. (???) + */ + if (error != EINTR && error != ERESTART && + error != EWOULDBLOCK && error != EPIPE) + error = 0; + } + return (error); +} + +/* + * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all + * done by soreceive(), but for SOCK_STREAM we must deal with the Record + * Mark and consolidate the data into a new mbuf list. + * nb: Sometimes TCP passes the data up to soreceive() in long lists of + * small mbufs. + * For SOCK_STREAM we must be very careful to read an entire record once + * we have read any of it, even if the system call has been interrupted. 
+ */ +nfs_receive(rep, aname, mp) + register struct nfsreq *rep; + struct mbuf **aname; + struct mbuf **mp; +{ + register struct socket *so; + struct uio auio; + struct iovec aio; + register struct mbuf *m; + struct mbuf *control; + u_long len; + struct mbuf **getnam; + int error, sotype, rcvflg; + struct proc *p = curproc; /* XXX */ + + /* + * Set up arguments for soreceive() + */ + *mp = (struct mbuf *)0; + *aname = (struct mbuf *)0; + sotype = rep->r_nmp->nm_sotype; + + /* + * For reliable protocols, lock against other senders/receivers + * in case a reconnect is necessary. + * For SOCK_STREAM, first get the Record Mark to find out how much + * more there is to get. + * We must lock the socket against other receivers + * until we have an entire rpc request/reply. + */ + if (sotype != SOCK_DGRAM) { + if (error = nfs_sndlock(&rep->r_nmp->nm_flag, rep)) + return (error); +tryagain: + /* + * Check for fatal errors and resending request. + */ + /* + * Ugh: If a reconnect attempt just happened, nm_so + * would have changed. NULL indicates a failed + * attempt that has essentially shut down this + * mount point. 
+ */ + if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) { + nfs_sndunlock(&rep->r_nmp->nm_flag); + return (EINTR); + } + if ((so = rep->r_nmp->nm_so) == NULL) { + if (error = nfs_reconnect(rep)) { + nfs_sndunlock(&rep->r_nmp->nm_flag); + return (error); + } + goto tryagain; + } + while (rep->r_flags & R_MUSTRESEND) { + m = m_copym(rep->r_mreq, 0, M_COPYALL, M_WAIT); + nfsstats.rpcretries++; + if (error = nfs_send(so, rep->r_nmp->nm_nam, m, rep)) { + if (error == EINTR || error == ERESTART || + (error = nfs_reconnect(rep))) { + nfs_sndunlock(&rep->r_nmp->nm_flag); + return (error); + } + goto tryagain; + } + } + nfs_sndunlock(&rep->r_nmp->nm_flag); + if (sotype == SOCK_STREAM) { + aio.iov_base = (caddr_t) &len; + aio.iov_len = sizeof(u_long); + auio.uio_iov = &aio; + auio.uio_iovcnt = 1; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_offset = 0; + auio.uio_resid = sizeof(u_long); + auio.uio_procp = p; + do { + rcvflg = MSG_WAITALL; + error = soreceive(so, (struct mbuf **)0, &auio, + (struct mbuf **)0, (struct mbuf **)0, &rcvflg); + if (error == EWOULDBLOCK && rep) { + if (rep->r_flags & R_SOFTTERM) + return (EINTR); + } + } while (error == EWOULDBLOCK); + if (!error && auio.uio_resid > 0) { + log(LOG_INFO, + "short receive (%d/%d) from nfs server %s\n", + sizeof(u_long) - auio.uio_resid, + sizeof(u_long), + rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + error = EPIPE; + } + if (error) + goto errout; + len = ntohl(len) & ~0x80000000; + /* + * This is SERIOUS! We are out of sync with the sender + * and forcing a disconnect/reconnect is all I can do. 
+ */ + if (len > NFS_MAXPACKET) { + log(LOG_ERR, "%s (%d) from nfs server %s\n", + "impossible packet length", + len, + rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + error = EFBIG; + goto errout; + } + auio.uio_resid = len; + do { + rcvflg = MSG_WAITALL; + error = soreceive(so, (struct mbuf **)0, + &auio, mp, (struct mbuf **)0, &rcvflg); + } while (error == EWOULDBLOCK || error == EINTR || + error == ERESTART); + if (!error && auio.uio_resid > 0) { + log(LOG_INFO, + "short receive (%d/%d) from nfs server %s\n", + len - auio.uio_resid, len, + rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + error = EPIPE; + } + } else { + /* + * NB: Since uio_resid is big, MSG_WAITALL is ignored + * and soreceive() will return when it has either a + * control msg or a data msg. + * We have no use for control msg., but must grab them + * and then throw them away so we know what is going + * on. + */ + auio.uio_resid = len = 100000000; /* Anything Big */ + auio.uio_procp = p; + do { + rcvflg = 0; + error = soreceive(so, (struct mbuf **)0, + &auio, mp, &control, &rcvflg); + if (control) + m_freem(control); + if (error == EWOULDBLOCK && rep) { + if (rep->r_flags & R_SOFTTERM) + return (EINTR); + } + } while (error == EWOULDBLOCK || + (!error && *mp == NULL && control)); + if ((rcvflg & MSG_EOR) == 0) + printf("Egad!!\n"); + if (!error && *mp == NULL) + error = EPIPE; + len -= auio.uio_resid; + } +errout: + if (error && error != EINTR && error != ERESTART) { + m_freem(*mp); + *mp = (struct mbuf *)0; + if (error != EPIPE) + log(LOG_INFO, + "receive error %d from nfs server %s\n", + error, + rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname); + error = nfs_sndlock(&rep->r_nmp->nm_flag, rep); + if (!error) + error = nfs_reconnect(rep); + if (!error) + goto tryagain; + } + } else { + if ((so = rep->r_nmp->nm_so) == NULL) + return (EACCES); + if (so->so_state & SS_ISCONNECTED) + getnam = (struct mbuf **)0; + else + getnam = aname; + auio.uio_resid = len = 1000000; + auio.uio_procp = p; + do 
{ + rcvflg = 0; + error = soreceive(so, getnam, &auio, mp, + (struct mbuf **)0, &rcvflg); + if (error == EWOULDBLOCK && + (rep->r_flags & R_SOFTTERM)) + return (EINTR); + } while (error == EWOULDBLOCK); + len -= auio.uio_resid; + } + if (error) { + m_freem(*mp); + *mp = (struct mbuf *)0; + } + /* + * Search for any mbufs that are not a multiple of 4 bytes long + * or with m_data not longword aligned. + * These could cause pointer alignment problems, so copy them to + * well aligned mbufs. + */ + nfs_realign(*mp, 5 * NFSX_UNSIGNED); + return (error); +} + +/* + * Implement receipt of reply on a socket. + * We must search through the list of received datagrams matching them + * with outstanding requests using the xid, until ours is found. + */ +/* ARGSUSED */ +nfs_reply(myrep) + struct nfsreq *myrep; +{ + register struct nfsreq *rep; + register struct nfsmount *nmp = myrep->r_nmp; + register long t1; + struct mbuf *mrep, *nam, *md; + u_long rxid, *tl; + caddr_t dpos, cp2; + int error; + + /* + * Loop around until we get our own reply + */ + for (;;) { + /* + * Lock against other receivers so that I don't get stuck in + * sbwait() after someone else has received my reply for me. + * Also necessary for connection based protocols to avoid + * race conditions during a reconnect. + */ + if (error = nfs_rcvlock(myrep)) + return (error); + /* Already received, bye bye */ + if (myrep->r_mrep != NULL) { + nfs_rcvunlock(&nmp->nm_flag); + return (0); + } + /* + * Get the next Rpc reply off the socket + */ + error = nfs_receive(myrep, &nam, &mrep); + nfs_rcvunlock(&nmp->nm_flag); + if (error) { + + /* + * Ignore routing errors on connectionless protocols?? 
+ */ + if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) { + nmp->nm_so->so_error = 0; + if (myrep->r_flags & R_GETONEREP) + return (0); + continue; + } + return (error); + } + if (nam) + m_freem(nam); + + /* + * Get the xid and check that it is an rpc reply + */ + md = mrep; + dpos = mtod(md, caddr_t); + nfsm_dissect(tl, u_long *, 2*NFSX_UNSIGNED); + rxid = *tl++; + if (*tl != rpc_reply) { + if (nmp->nm_flag & NFSMNT_NQNFS) { + if (nqnfs_callback(nmp, mrep, md, dpos)) + nfsstats.rpcinvalid++; + } else { + nfsstats.rpcinvalid++; + m_freem(mrep); + } +nfsmout: + if (myrep->r_flags & R_GETONEREP) + return (0); + continue; + } + + /* + * Loop through the request list to match up the reply + * Iff no match, just drop the datagram + */ + rep = nfsreqh.r_next; + while (rep != &nfsreqh) { + if (rep->r_mrep == NULL && rxid == rep->r_xid) { + /* Found it.. */ + rep->r_mrep = mrep; + rep->r_md = md; + rep->r_dpos = dpos; + if (nfsrtton) { + struct rttl *rt; + + rt = &nfsrtt.rttl[nfsrtt.pos]; + rt->proc = rep->r_procnum; + rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]); + rt->sent = nmp->nm_sent; + rt->cwnd = nmp->nm_cwnd; + rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1]; + rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1]; + rt->fsid = nmp->nm_mountp->mnt_stat.f_fsid; + rt->tstamp = time; + if (rep->r_flags & R_TIMING) + rt->rtt = rep->r_rtt; + else + rt->rtt = 1000000; + nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ; + } + /* + * Update congestion window. + * Do the additive increase of + * one rpc/rtt. + */ + if (nmp->nm_cwnd <= nmp->nm_sent) { + nmp->nm_cwnd += + (NFS_CWNDSCALE * NFS_CWNDSCALE + + (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd; + if (nmp->nm_cwnd > NFS_MAXCWND) + nmp->nm_cwnd = NFS_MAXCWND; + } + rep->r_flags &= ~R_SENT; + nmp->nm_sent -= NFS_CWNDSCALE; + /* + * Update rtt using a gain of 0.125 on the mean + * and a gain of 0.25 on the deviation. 
+ */ + if (rep->r_flags & R_TIMING) { + /* + * Since the timer resolution of + * NFS_HZ is so course, it can often + * result in r_rtt == 0. Since + * r_rtt == N means that the actual + * rtt is between N+dt and N+2-dt ticks, + * add 1. + */ + t1 = rep->r_rtt + 1; + t1 -= (NFS_SRTT(rep) >> 3); + NFS_SRTT(rep) += t1; + if (t1 < 0) + t1 = -t1; + t1 -= (NFS_SDRTT(rep) >> 2); + NFS_SDRTT(rep) += t1; + } + nmp->nm_timeouts = 0; + break; + } + rep = rep->r_next; + } + /* + * If not matched to a request, drop it. + * If it's mine, get out. + */ + if (rep == &nfsreqh) { + nfsstats.rpcunexpected++; + m_freem(mrep); + } else if (rep == myrep) { + if (rep->r_mrep == NULL) + panic("nfsreply nil"); + return (0); + } + if (myrep->r_flags & R_GETONEREP) + return (0); + } +} + +/* + * nfs_request - goes something like this + * - fill in request struct + * - links it into list + * - calls nfs_send() for first transmit + * - calls nfs_receive() to get reply + * - break down rpc header and return with nfs reply pointed to + * by mrep or error + * nb: always frees up mreq mbuf list + */ +nfs_request(vp, mrest, procnum, procp, cred, mrp, mdp, dposp) + struct vnode *vp; + struct mbuf *mrest; + int procnum; + struct proc *procp; + struct ucred *cred; + struct mbuf **mrp; + struct mbuf **mdp; + caddr_t *dposp; +{ + register struct mbuf *m, *mrep; + register struct nfsreq *rep; + register u_long *tl; + register int i; + struct nfsmount *nmp; + struct mbuf *md, *mheadend; + struct nfsreq *reph; + struct nfsnode *np; + time_t reqtime, waituntil; + caddr_t dpos, cp2; + int t1, nqlflag, cachable, s, error = 0, mrest_len, auth_len, auth_type; + int trylater_delay = NQ_TRYLATERDEL, trylater_cnt = 0, failed_auth = 0; + u_long xid; + u_quad_t frev; + char *auth_str; + + nmp = VFSTONFS(vp->v_mount); + MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK); + rep->r_nmp = nmp; + rep->r_vp = vp; + rep->r_procp = procp; + rep->r_procnum = procnum; + i = 0; + m = mrest; + while (m) { + 
i += m->m_len; + m = m->m_next; + } + mrest_len = i; + + /* + * Get the RPC header with authorization. + */ +kerbauth: + auth_str = (char *)0; + if (nmp->nm_flag & NFSMNT_KERB) { + if (failed_auth) { + error = nfs_getauth(nmp, rep, cred, &auth_type, + &auth_str, &auth_len); + if (error) { + free((caddr_t)rep, M_NFSREQ); + m_freem(mrest); + return (error); + } + } else { + auth_type = RPCAUTH_UNIX; + auth_len = 5 * NFSX_UNSIGNED; + } + } else { + auth_type = RPCAUTH_UNIX; + if (cred->cr_ngroups < 1) + panic("nfsreq nogrps"); + auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ? + nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) + + 5 * NFSX_UNSIGNED; + } + m = nfsm_rpchead(cred, (nmp->nm_flag & NFSMNT_NQNFS), procnum, + auth_type, auth_len, auth_str, mrest, mrest_len, &mheadend, &xid); + if (auth_str) + free(auth_str, M_TEMP); + + /* + * For stream protocols, insert a Sun RPC Record Mark. + */ + if (nmp->nm_sotype == SOCK_STREAM) { + M_PREPEND(m, NFSX_UNSIGNED, M_WAIT); + *mtod(m, u_long *) = htonl(0x80000000 | + (m->m_pkthdr.len - NFSX_UNSIGNED)); + } + rep->r_mreq = m; + rep->r_xid = xid; +tryagain: + if (nmp->nm_flag & NFSMNT_SOFT) + rep->r_retry = nmp->nm_retry; + else + rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */ + rep->r_rtt = rep->r_rexmit = 0; + if (proct[procnum] > 0) + rep->r_flags = R_TIMING; + else + rep->r_flags = 0; + rep->r_mrep = NULL; + + /* + * Do the client side RPC. + */ + nfsstats.rpcrequests++; + /* + * Chain request into list of outstanding requests. Be sure + * to put it LAST so timer finds oldest requests first. + */ + s = splsoftclock(); + reph = &nfsreqh; + reph->r_prev->r_next = rep; + rep->r_prev = reph->r_prev; + reph->r_prev = rep; + rep->r_next = reph; + + /* Get send time for nqnfs */ + reqtime = time.tv_sec; + + /* + * If backing off another request or avoiding congestion, don't + * send this one now but let timer do it. If not timing a request, + * do it now. 
+ */ + if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM || + (nmp->nm_flag & NFSMNT_DUMBTIMR) || + nmp->nm_sent < nmp->nm_cwnd)) { + splx(s); + if (nmp->nm_soflags & PR_CONNREQUIRED) + error = nfs_sndlock(&nmp->nm_flag, rep); + if (!error) { + m = m_copym(m, 0, M_COPYALL, M_WAIT); + error = nfs_send(nmp->nm_so, nmp->nm_nam, m, rep); + if (nmp->nm_soflags & PR_CONNREQUIRED) + nfs_sndunlock(&nmp->nm_flag); + } + if (!error && (rep->r_flags & R_MUSTRESEND) == 0) { + nmp->nm_sent += NFS_CWNDSCALE; + rep->r_flags |= R_SENT; + } + } else { + splx(s); + rep->r_rtt = -1; + } + + /* + * Wait for the reply from our send or the timer's. + */ + if (!error || error == EPIPE) + error = nfs_reply(rep); + + /* + * RPC done, unlink the request. + */ + s = splsoftclock(); + rep->r_prev->r_next = rep->r_next; + rep->r_next->r_prev = rep->r_prev; + splx(s); + + /* + * Decrement the outstanding request count. + */ + if (rep->r_flags & R_SENT) { + rep->r_flags &= ~R_SENT; /* paranoia */ + nmp->nm_sent -= NFS_CWNDSCALE; + } + + /* + * If there was a successful reply and a tprintf msg. + * tprintf a response. 
+ */ + if (!error && (rep->r_flags & R_TPRINTFMSG)) + nfs_msg(rep->r_procp, nmp->nm_mountp->mnt_stat.f_mntfromname, + "is alive again"); + mrep = rep->r_mrep; + md = rep->r_md; + dpos = rep->r_dpos; + if (error) { + m_freem(rep->r_mreq); + free((caddr_t)rep, M_NFSREQ); + return (error); + } + + /* + * break down the rpc header and check if ok + */ + nfsm_dissect(tl, u_long *, 3*NFSX_UNSIGNED); + if (*tl++ == rpc_msgdenied) { + if (*tl == rpc_mismatch) + error = EOPNOTSUPP; + else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) { + if (*tl == rpc_rejectedcred && failed_auth == 0) { + failed_auth++; + mheadend->m_next = (struct mbuf *)0; + m_freem(mrep); + m_freem(rep->r_mreq); + goto kerbauth; + } else + error = EAUTH; + } else + error = EACCES; + m_freem(mrep); + m_freem(rep->r_mreq); + free((caddr_t)rep, M_NFSREQ); + return (error); + } + + /* + * skip over the auth_verf, someday we may want to cache auth_short's + * for nfs_reqhead(), but for now just dump it + */ + if (*++tl != 0) { + i = nfsm_rndup(fxdr_unsigned(long, *tl)); + nfsm_adv(i); + } + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + /* 0 == ok */ + if (*tl == 0) { + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + if (*tl != 0) { + error = fxdr_unsigned(int, *tl); + m_freem(mrep); + if ((nmp->nm_flag & NFSMNT_NQNFS) && + error == NQNFS_TRYLATER) { + error = 0; + waituntil = time.tv_sec + trylater_delay; + while (time.tv_sec < waituntil) + (void) tsleep((caddr_t)&lbolt, + PSOCK, "nqnfstry", 0); + trylater_delay *= nfs_backoff[trylater_cnt]; + if (trylater_cnt < 7) + trylater_cnt++; + goto tryagain; + } + + /* + * If the File Handle was stale, invalidate the + * lookup cache, just in case. 
+ */ + if (error == ESTALE) + cache_purge(vp); + m_freem(rep->r_mreq); + free((caddr_t)rep, M_NFSREQ); + return (error); + } + + /* + * For nqnfs, get any lease in reply + */ + if (nmp->nm_flag & NFSMNT_NQNFS) { + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + if (*tl) { + np = VTONFS(vp); + nqlflag = fxdr_unsigned(int, *tl); + nfsm_dissect(tl, u_long *, 4*NFSX_UNSIGNED); + cachable = fxdr_unsigned(int, *tl++); + reqtime += fxdr_unsigned(int, *tl++); + if (reqtime > time.tv_sec) { + fxdr_hyper(tl, &frev); + nqnfs_clientlease(nmp, np, nqlflag, + cachable, reqtime, frev); + } + } + } + *mrp = mrep; + *mdp = md; + *dposp = dpos; + m_freem(rep->r_mreq); + FREE((caddr_t)rep, M_NFSREQ); + return (0); + } + m_freem(mrep); + m_freem(rep->r_mreq); + free((caddr_t)rep, M_NFSREQ); + error = EPROTONOSUPPORT; +nfsmout: + return (error); +} + +/* + * Generate the rpc reply header + * siz arg. is used to decide if adding a cluster is worthwhile + */ +nfs_rephead(siz, nd, err, cache, frev, mrq, mbp, bposp) + int siz; + struct nfsd *nd; + int err; + int cache; + u_quad_t *frev; + struct mbuf **mrq; + struct mbuf **mbp; + caddr_t *bposp; +{ + register u_long *tl; + register struct mbuf *mreq; + caddr_t bpos; + struct mbuf *mb, *mb2; + + MGETHDR(mreq, M_WAIT, MT_DATA); + mb = mreq; + /* + * If this is a big reply, use a cluster else + * try and leave leading space for the lower level headers. 
+ */ + siz += RPC_REPLYSIZ; + if (siz >= MINCLSIZE) { + MCLGET(mreq, M_WAIT); + } else + mreq->m_data += max_hdr; + tl = mtod(mreq, u_long *); + mreq->m_len = 6*NFSX_UNSIGNED; + bpos = ((caddr_t)tl)+mreq->m_len; + *tl++ = nd->nd_retxid; + *tl++ = rpc_reply; + if (err == ERPCMISMATCH || err == NQNFS_AUTHERR) { + *tl++ = rpc_msgdenied; + if (err == NQNFS_AUTHERR) { + *tl++ = rpc_autherr; + *tl = rpc_rejectedcred; + mreq->m_len -= NFSX_UNSIGNED; + bpos -= NFSX_UNSIGNED; + } else { + *tl++ = rpc_mismatch; + *tl++ = txdr_unsigned(2); + *tl = txdr_unsigned(2); + } + } else { + *tl++ = rpc_msgaccepted; + *tl++ = 0; + *tl++ = 0; + switch (err) { + case EPROGUNAVAIL: + *tl = txdr_unsigned(RPC_PROGUNAVAIL); + break; + case EPROGMISMATCH: + *tl = txdr_unsigned(RPC_PROGMISMATCH); + nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); + *tl++ = txdr_unsigned(2); + *tl = txdr_unsigned(2); /* someday 3 */ + break; + case EPROCUNAVAIL: + *tl = txdr_unsigned(RPC_PROCUNAVAIL); + break; + default: + *tl = 0; + if (err != VNOVAL) { + nfsm_build(tl, u_long *, NFSX_UNSIGNED); + if (err) + *tl = txdr_unsigned(nfsrv_errmap[err - 1]); + else + *tl = 0; + } + break; + }; + } + + /* + * For nqnfs, piggyback lease as requested. + */ + if (nd->nd_nqlflag != NQL_NOVAL && err == 0) { + if (nd->nd_nqlflag) { + nfsm_build(tl, u_long *, 5*NFSX_UNSIGNED); + *tl++ = txdr_unsigned(nd->nd_nqlflag); + *tl++ = txdr_unsigned(cache); + *tl++ = txdr_unsigned(nd->nd_duration); + txdr_hyper(frev, tl); + } else { + if (nd->nd_nqlflag != 0) + panic("nqreph"); + nfsm_build(tl, u_long *, NFSX_UNSIGNED); + *tl = 0; + } + } + *mrq = mreq; + *mbp = mb; + *bposp = bpos; + if (err != 0 && err != VNOVAL) + nfsstats.srvrpc_errs++; + return (0); +} + +/* + * Nfs timer routine + * Scan the nfsreq list and retranmit any requests that have timed out + * To avoid retransmission attempts on STREAM sockets (in the future) make + * sure to set the r_retry field to 0 (implies nm_retry == 0). 
+ */
+void
+nfs_timer(arg)		/* reschedules itself through timeout() at the end */
+	void *arg;	/* not referenced in the body */
+{
+	register struct nfsreq *rep;
+	register struct mbuf *m;
+	register struct socket *so;
+	register struct nfsmount *nmp;
+	register int timeo;
+	static long lasttime = 0;	/* tv_sec of the last nqnfs_serverd() call */
+	int s, error;
+
+	s = splnet();	/* restored by splx(s) before rescheduling */
+	for (rep = nfsreqh.r_next; rep != &nfsreqh; rep = rep->r_next) {
+		nmp = rep->r_nmp;
+		/* skip requests that already have a reply or were terminated */
+		if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
+			continue;
+		if (nfs_sigintr(nmp, rep, rep->r_procp)) {
+			rep->r_flags |= R_SOFTTERM;
+			continue;
+		}
+		/* r_rtt < 0 means "not sent yet" (set to -1 below) */
+		if (rep->r_rtt >= 0) {
+			rep->r_rtt++;
+			if (nmp->nm_flag & NFSMNT_DUMBTIMR)
+				timeo = nmp->nm_timeo;
+			else
+				timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
+			if (nmp->nm_timeouts > 0)
+				timeo *= nfs_backoff[nmp->nm_timeouts - 1];
+			if (rep->r_rtt <= timeo)
+				continue;	/* has not timed out yet */
+			if (nmp->nm_timeouts < 8)
+				nmp->nm_timeouts++;
+		}
+		/*
+		 * Check for server not responding
+		 * (message is printed once per request: R_TPRINTFMSG)
+		 */
+		if ((rep->r_flags & R_TPRINTFMSG) == 0 &&
+		     rep->r_rexmit > nmp->nm_deadthresh) {
+			nfs_msg(rep->r_procp,
+			    nmp->nm_mountp->mnt_stat.f_mntfromname,
+			    "not responding");
+			rep->r_flags |= R_TPRINTFMSG;
+		}
+		if (rep->r_rexmit >= rep->r_retry) {	/* too many */
+			nfsstats.rpctimeouts++;
+			rep->r_flags |= R_SOFTTERM;
+			continue;
+		}
+		/* non-datagram sockets: only count the timeout, never resend */
+		if (nmp->nm_sotype != SOCK_DGRAM) {
+			if (++rep->r_rexmit > NFS_MAXREXMIT)
+				rep->r_rexmit = NFS_MAXREXMIT;
+			continue;
+		}
+		if ((so = nmp->nm_so) == NULL)
+			continue;
+
+		/*
+		 * If there is enough space and the window allows..
+		 *	Resend it
+		 * Set r_rtt to -1 in case we fail to send it now.
+		 */
+		rep->r_rtt = -1;
+		if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
+		   ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
+		    (rep->r_flags & R_SENT) ||
+		    nmp->nm_sent < nmp->nm_cwnd) &&
+		   (m = m_copym(rep->r_mreq, 0, M_COPYALL, M_DONTWAIT))){
+			/* the destination address is passed only for NFSMNT_NOCONN */
+			if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
+			    error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, m,
+			    (struct mbuf *)0, (struct mbuf *)0);
+			else
+			    error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, m,
+			    nmp->nm_nam, (struct mbuf *)0);
+			if (error) {
+				if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
+					so->so_error = 0;
+			} else {
+				/*
+				 * Iff first send, start timing
+				 * else turn timing off, backoff timer
+				 * and divide congestion window by 2.
+				 */
+				if (rep->r_flags & R_SENT) {
+					rep->r_flags &= ~R_TIMING;
+					if (++rep->r_rexmit > NFS_MAXREXMIT)
+						rep->r_rexmit = NFS_MAXREXMIT;
+					nmp->nm_cwnd >>= 1;
+					if (nmp->nm_cwnd < NFS_CWNDSCALE)
+						nmp->nm_cwnd = NFS_CWNDSCALE;
+					nfsstats.rpcretries++;
+				} else {
+					rep->r_flags |= R_SENT;
+					nmp->nm_sent += NFS_CWNDSCALE;
+				}
+				rep->r_rtt = 0;
+			}
+		}
+	}
+
+	/*
+	 * Call the nqnfs server timer once a second to handle leases.
+	 */
+	if (lasttime != time.tv_sec) {
+		lasttime = time.tv_sec;
+		nqnfs_serverd();
+	}
+	splx(s);
+	timeout(nfs_timer, (void *)0, hz / NFS_HZ);	/* re-arm for the next tick */
+}
+
+/*
+ * Test for a termination condition pending on the process.
+ * This is used for NFSMNT_INT mounts.
+ *
+ * Returns EINTR when:
+ *	- rep is non-null and already flagged R_SOFTTERM, or
+ *	- the mount has NFSMNT_INT set and p has a pending signal from
+ *	  NFSINT_SIGMASK that is neither masked nor ignored.
+ * Returns 0 otherwise.  rep and p may be null; nmp may not.
+ */
+nfs_sigintr(nmp, rep, p)
+	struct nfsmount *nmp;
+	struct nfsreq *rep;
+	register struct proc *p;
+{
+
+	if (rep && (rep->r_flags & R_SOFTTERM))
+		return (EINTR);
+	if (!(nmp->nm_flag & NFSMNT_INT))
+		return (0);
+	if (p && p->p_siglist &&
+	    (((p->p_siglist & ~p->p_sigmask) & ~p->p_sigignore) &
+	    NFSINT_SIGMASK))
+		return (EINTR);
+	return (0);
+}
+
+/*
+ * Lock a socket against others.
+ * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
+ * and also to avoid race conditions between the processes with nfs requests
+ * in progress when a reconnect is necessary.
+ */ +nfs_sndlock(flagp, rep) + register int *flagp; + struct nfsreq *rep; +{ + struct proc *p; + int slpflag = 0, slptimeo = 0; + + if (rep) { + p = rep->r_procp; + if (rep->r_nmp->nm_flag & NFSMNT_INT) + slpflag = PCATCH; + } else + p = (struct proc *)0; + while (*flagp & NFSMNT_SNDLOCK) { + if (nfs_sigintr(rep->r_nmp, rep, p)) + return (EINTR); + *flagp |= NFSMNT_WANTSND; + (void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsndlck", + slptimeo); + if (slpflag == PCATCH) { + slpflag = 0; + slptimeo = 2 * hz; + } + } + *flagp |= NFSMNT_SNDLOCK; + return (0); +} + +/* + * Unlock the stream socket for others. + */ +void +nfs_sndunlock(flagp) + register int *flagp; +{ + + if ((*flagp & NFSMNT_SNDLOCK) == 0) + panic("nfs sndunlock"); + *flagp &= ~NFSMNT_SNDLOCK; + if (*flagp & NFSMNT_WANTSND) { + *flagp &= ~NFSMNT_WANTSND; + wakeup((caddr_t)flagp); + } +} + +nfs_rcvlock(rep) + register struct nfsreq *rep; +{ + register int *flagp = &rep->r_nmp->nm_flag; + int slpflag, slptimeo = 0; + + if (*flagp & NFSMNT_INT) + slpflag = PCATCH; + else + slpflag = 0; + while (*flagp & NFSMNT_RCVLOCK) { + if (nfs_sigintr(rep->r_nmp, rep, rep->r_procp)) + return (EINTR); + *flagp |= NFSMNT_WANTRCV; + (void) tsleep((caddr_t)flagp, slpflag | (PZERO - 1), "nfsrcvlk", + slptimeo); + if (slpflag == PCATCH) { + slpflag = 0; + slptimeo = 2 * hz; + } + } + *flagp |= NFSMNT_RCVLOCK; + return (0); +} + +/* + * Unlock the stream socket for others. + */ +void +nfs_rcvunlock(flagp) + register int *flagp; +{ + + if ((*flagp & NFSMNT_RCVLOCK) == 0) + panic("nfs rcvunlock"); + *flagp &= ~NFSMNT_RCVLOCK; + if (*flagp & NFSMNT_WANTRCV) { + *flagp &= ~NFSMNT_WANTRCV; + wakeup((caddr_t)flagp); + } +} + +/* + * Check for badly aligned mbuf data areas and + * realign data in an mbuf list by copying the data areas up, as required. 
+ */
+void
+nfs_realign(m, hsiz)
+	register struct mbuf *m;	/* head of the chain to check */
+	int hsiz;	/* preferred byte count for the first rewritten mbuf */
+{
+	register struct mbuf *m2;	/* next mbuf available as a copy destination */
+	register int siz, mlen, olen;	/* chunk size, space left in dest, bytes left in source */
+	register caddr_t tcp, fcp;	/* "to" and "from" copy pointers */
+	struct mbuf *mnew;	/* mbuf currently being filled */
+
+	while (m) {
+	    /*
+	     * This never happens for UDP, rarely happens for TCP
+	     * but frequently happens for iso transport.
+	     *
+	     * The first mbuf whose length or data address is not a
+	     * multiple of 4 triggers a rewrite of the rest of the chain;
+	     * the function returns once that rewrite is done.
+	     */
+	    if ((m->m_len & 0x3) || (mtod(m, int) & 0x3)) {
+		olen = m->m_len;
+		fcp = mtod(m, caddr_t);
+		if ((int)fcp & 0x3) {
+			/* move the data pointer to a 4-byte aligned place */
+			m->m_flags &= ~M_PKTHDR;
+			if (m->m_flags & M_EXT)
+				m->m_data = m->m_ext.ext_buf +
+					((m->m_ext.ext_size - olen) & ~0x3);
+			else
+				m->m_data = m->m_dat;
+		}
+		m->m_len = 0;
+		tcp = mtod(m, caddr_t);
+		mnew = m;
+		m2 = m->m_next;
+
+		/*
+		 * If possible, only put the first invariant part
+		 * of the RPC header in the first mbuf.
+		 */
+		mlen = M_TRAILINGSPACE(m);
+		if (olen <= hsiz && mlen > hsiz)
+			mlen = hsiz;
+
+		/*
+		 * Loop through the mbuf list consolidating data.
+		 * Source (m/fcp/olen) walks the old contents; destination
+		 * (mnew/tcp/mlen) refills the same mbufs from the front.
+		 */
+		while (m) {
+			while (olen > 0) {
+				if (mlen == 0) {
+					/* destination full: start filling the next mbuf */
+					m2->m_flags &= ~M_PKTHDR;
+					if (m2->m_flags & M_EXT)
+						m2->m_data = m2->m_ext.ext_buf;
+					else
+						m2->m_data = m2->m_dat;
+					m2->m_len = 0;
+					mlen = M_TRAILINGSPACE(m2);
+					tcp = mtod(m2, caddr_t);
+					mnew = m2;
+					m2 = m2->m_next;
+				}
+				siz = min(mlen, olen);
+				if (tcp != fcp)
+					bcopy(fcp, tcp, siz);
+				mnew->m_len += siz;
+				mlen -= siz;
+				olen -= siz;
+				tcp += siz;
+				fcp += siz;
+			}
+			m = m->m_next;
+			if (m) {
+				olen = m->m_len;
+				fcp = mtod(m, caddr_t);
+			}
+		}
+
+		/*
+		 * Finally, set m_len == 0 for any trailing mbufs that have
+		 * been copied out of.
+		 */
+		while (m2) {
+			m2->m_len = 0;
+			m2 = m2->m_next;
+		}
+		return;
+	    }
+	    m = m->m_next;
+	}
+}
+
+/*
+ * Socket upcall routine for the nfsd sockets.
+ * The caddr_t arg is a pointer to the "struct nfssvc_sock".
+ * Essentially do as much as possible non-blocking, else punt and it will
+ * be called with M_WAIT from an nfsd.
+ */
+void
+nfsrv_rcv(so, arg, waitflag)
+	struct socket *so;
+	caddr_t arg;
+	int waitflag;	/* M_DONTWAIT or M_WAIT, see above */
+{
+	register struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
+	register struct mbuf *m;
+	struct mbuf *mp, *nam;
+	struct uio auio;
+	int flags, error;
+
+	if ((slp->ns_flag & SLP_VALID) == 0)
+		return;
+#ifdef notdef
+	/*
+	 * Define this to test for nfsds handling this under heavy load.
+	 */
+	if (waitflag == M_DONTWAIT) {
+		slp->ns_flag |= SLP_NEEDQ; goto dorecs;
+	}
+#endif
+	auio.uio_procp = NULL;
+	if (so->so_type == SOCK_STREAM) {
+		/*
+		 * If there are already records on the queue, defer soreceive()
+		 * to an nfsd so that there is feedback to the TCP layer that
+		 * the nfs servers are heavily loaded.
+		 */
+		if (slp->ns_rec && waitflag == M_DONTWAIT) {
+			slp->ns_flag |= SLP_NEEDQ;
+			goto dorecs;
+		}
+
+		/*
+		 * Do soreceive().
+		 * uio_resid is set to a very large count; the number of
+		 * bytes actually received is recovered below as the
+		 * difference from that count.
+		 */
+		auio.uio_resid = 1000000000;
+		flags = MSG_DONTWAIT;
+		error = soreceive(so, &nam, &auio, &mp, (struct mbuf **)0, &flags);
+		if (error || mp == (struct mbuf *)0) {
+			if (error == EWOULDBLOCK)
+				slp->ns_flag |= SLP_NEEDQ;
+			else
+				slp->ns_flag |= SLP_DISCONN;
+			goto dorecs;
+		}
+		/* append the new data to the raw stream list and count it */
+		m = mp;
+		if (slp->ns_rawend) {
+			slp->ns_rawend->m_next = m;
+			slp->ns_cc += 1000000000 - auio.uio_resid;
+		} else {
+			slp->ns_raw = m;
+			slp->ns_cc = 1000000000 - auio.uio_resid;
+		}
+		while (m->m_next)
+			m = m->m_next;
+		slp->ns_rawend = m;
+
+		/*
+		 * Now try and parse record(s) out of the raw stream data.
+		 * EPERM from nfsrv_getstream() marks the socket for
+		 * disconnect; any other error asks for a retry from an nfsd.
+		 */
+		if (error = nfsrv_getstream(slp, waitflag)) {
+			if (error == EPERM)
+				slp->ns_flag |= SLP_DISCONN;
+			else
+				slp->ns_flag |= SLP_NEEDQ;
+		}
+	} else {
+		/* non-stream socket: each soreceive() is queued as one record */
+		do {
+			auio.uio_resid = 1000000000;
+			flags = MSG_DONTWAIT;
+			error = soreceive(so, &nam, &auio, &mp,
+						(struct mbuf **)0, &flags);
+			if (mp) {
+				nfs_realign(mp, 10 * NFSX_UNSIGNED);
+				/* when a sender address came back, it leads the record */
+				if (nam) {
+					m = nam;
+					m->m_next = mp;
+				} else
+					m = mp;
+				if (slp->ns_recend)
+					slp->ns_recend->m_nextpkt = m;
+				else
+					slp->ns_rec = m;
+				slp->ns_recend = m;
+				m->m_nextpkt = (struct mbuf *)0;
+			}
+			if (error) {
+				if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
+					&& error != EWOULDBLOCK) {
+					slp->ns_flag |= SLP_DISCONN;
+					goto dorecs;
+				}
+			}
+		} while (mp);
+	}
+
+	/*
+	 * Now try and process the request records, non-blocking.
+	 * Only the M_DONTWAIT (upcall) path wakes an nfsd here.
+	 */
+dorecs:
+	if (waitflag == M_DONTWAIT &&
+		(slp->ns_rec || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN))))
+		nfsrv_wakenfsd(slp);
+}
+
+/*
+ * Try and extract an RPC request from the mbuf data list received on a
+ * stream socket. The "waitflag" argument indicates whether or not it
+ * can sleep.
+ */
+nfsrv_getstream(slp, waitflag)
+	register struct nfssvc_sock *slp;
+	int waitflag;	/* passed to m_copym() when a record must be split */
+{
+	register struct mbuf *m;
+	register char *cp1, *cp2;
+	register int len;
+	struct mbuf *om, *m2, *recm;
+	u_long recmark;
+
+	/* SLP_GETSTREAM guards against re-entry on the same socket */
+	if (slp->ns_flag & SLP_GETSTREAM)
+		panic("nfs getstream");
+	slp->ns_flag |= SLP_GETSTREAM;
+	for (;;) {
+	    /* ns_reclen == 0: the next thing in the stream is a record mark */
+	    if (slp->ns_reclen == 0) {
+		if (slp->ns_cc < NFSX_UNSIGNED) {
+			slp->ns_flag &= ~SLP_GETSTREAM;
+			return (0);
+		}
+		m = slp->ns_raw;
+		if (m->m_len >= NFSX_UNSIGNED) {
+			bcopy(mtod(m, caddr_t), (caddr_t)&recmark, NFSX_UNSIGNED);
+			m->m_data += NFSX_UNSIGNED;
+			m->m_len -= NFSX_UNSIGNED;
+		} else {
+			/* record mark straddles mbufs: gather it a byte at a time */
+			cp1 = (caddr_t)&recmark;
+			cp2 = mtod(m, caddr_t);
+			while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
+				while (m->m_len == 0) {
+					m = m->m_next;
+					cp2 = mtod(m, caddr_t);
+				}
+				*cp1++ = *cp2++;
+				m->m_data++;
+				m->m_len--;
+			}
+		}
+		slp->ns_cc -= NFSX_UNSIGNED;
+		slp->ns_reclen = ntohl(recmark) & ~0x80000000;	/* top bit is masked off */
+		if (slp->ns_reclen < NFS_MINPACKET || slp->ns_reclen > NFS_MAXPACKET) {
+			slp->ns_flag &= ~SLP_GETSTREAM;
+			return (EPERM);
+		}
+	    }
+
+	    /*
+	     * Now get the record part.
+	     * Three cases: the buffered data is exactly one record, more
+	     * than one record (split the chain), or less than one record
+	     * (return and wait for more data).
+	     */
+	    if (slp->ns_cc == slp->ns_reclen) {
+		recm = slp->ns_raw;
+		slp->ns_raw = slp->ns_rawend = (struct mbuf *)0;
+		slp->ns_cc = slp->ns_reclen = 0;
+	    } else if (slp->ns_cc > slp->ns_reclen) {
+		len = 0;
+		m = slp->ns_raw;
+		om = (struct mbuf *)0;
+		while (len < slp->ns_reclen) {
+			if ((len + m->m_len) > slp->ns_reclen) {
+				/* record ends inside m: copy the head, trim m */
+				m2 = m_copym(m, 0, slp->ns_reclen - len,
+					waitflag);
+				if (m2) {
+					if (om) {
+						om->m_next = m2;
+						recm = slp->ns_raw;
+					} else
+						recm = m2;
+					m->m_data += slp->ns_reclen - len;
+					m->m_len -= slp->ns_reclen - len;
+					len = slp->ns_reclen;
+				} else {
+					slp->ns_flag &= ~SLP_GETSTREAM;
+					return (EWOULDBLOCK);
+				}
+			} else if ((len + m->m_len) == slp->ns_reclen) {
+				/* record ends exactly at the end of m: cut the chain */
+				om = m;
+				len += m->m_len;
+				m = m->m_next;
+				recm = slp->ns_raw;
+				om->m_next = (struct mbuf *)0;
+			} else {
+				om = m;
+				len += m->m_len;
+				m = m->m_next;
+			}
+		}
+		slp->ns_raw = m;
+		slp->ns_cc -= len;
+		slp->ns_reclen = 0;
+	    } else {
+		slp->ns_flag &= ~SLP_GETSTREAM;
+		return (0);
+	    }
+	    /* queue the completed record at the tail of the record list */
+	    nfs_realign(recm, 10 * NFSX_UNSIGNED);
+	    if (slp->ns_recend)
+		slp->ns_recend->m_nextpkt = recm;
+	    else
+		slp->ns_rec = recm;
+	    slp->ns_recend = recm;
+	}
+}
+
+/*
+ * Parse an RPC header.
+ * Dequeues the first record from slp->ns_rec, splits off a leading
+ * MT_SONAME mbuf (if any) into nd->nd_nam, and hands the rest to
+ * nfs_getreq().  Returns ENOBUFS when the socket is not valid or no
+ * record is queued, the nfs_getreq() error on parse failure (after
+ * freeing nd->nd_nam), else 0.
+ */
+nfsrv_dorec(slp, nd)
+	register struct nfssvc_sock *slp;
+	register struct nfsd *nd;
+{
+	register struct mbuf *m;
+	int error;
+
+	if ((slp->ns_flag & SLP_VALID) == 0 ||
+	    (m = slp->ns_rec) == (struct mbuf *)0)
+		return (ENOBUFS);
+	if (slp->ns_rec = m->m_nextpkt)
+		m->m_nextpkt = (struct mbuf *)0;
+	else
+		slp->ns_recend = (struct mbuf *)0;	/* queue is now empty */
+	if (m->m_type == MT_SONAME) {
+		nd->nd_nam = m;
+		nd->nd_md = nd->nd_mrep = m->m_next;
+		m->m_next = (struct mbuf *)0;
+	} else {
+		nd->nd_nam = (struct mbuf *)0;
+		nd->nd_md = nd->nd_mrep = m;
+	}
+	nd->nd_dpos = mtod(nd->nd_md, caddr_t);
+	if (error = nfs_getreq(nd, TRUE)) {
+		m_freem(nd->nd_nam);
+		return (error);
+	}
+	return (0);
+}
+
+/*
+ * Parse an RPC request
+ * - verify it
+ * - fill in the cred struct.
+ */ +nfs_getreq(nd, has_header) + register struct nfsd *nd; + int has_header; +{ + register int len, i; + register u_long *tl; + register long t1; + struct uio uio; + struct iovec iov; + caddr_t dpos, cp2; + u_long nfsvers, auth_type; + int error = 0, nqnfs = 0; + struct mbuf *mrep, *md; + + mrep = nd->nd_mrep; + md = nd->nd_md; + dpos = nd->nd_dpos; + if (has_header) { + nfsm_dissect(tl, u_long *, 10*NFSX_UNSIGNED); + nd->nd_retxid = *tl++; + if (*tl++ != rpc_call) { + m_freem(mrep); + return (EBADRPC); + } + } else { + nfsm_dissect(tl, u_long *, 8*NFSX_UNSIGNED); + } + nd->nd_repstat = 0; + if (*tl++ != rpc_vers) { + nd->nd_repstat = ERPCMISMATCH; + nd->nd_procnum = NFSPROC_NOOP; + return (0); + } + nfsvers = nfs_vers; + if (*tl != nfs_prog) { + if (*tl == nqnfs_prog) { + nqnfs++; + nfsvers = nqnfs_vers; + } else { + nd->nd_repstat = EPROGUNAVAIL; + nd->nd_procnum = NFSPROC_NOOP; + return (0); + } + } + tl++; + if (*tl++ != nfsvers) { + nd->nd_repstat = EPROGMISMATCH; + nd->nd_procnum = NFSPROC_NOOP; + return (0); + } + nd->nd_procnum = fxdr_unsigned(u_long, *tl++); + if (nd->nd_procnum == NFSPROC_NULL) + return (0); + if (nd->nd_procnum >= NFS_NPROCS || + (!nqnfs && nd->nd_procnum > NFSPROC_STATFS) || + (*tl != rpc_auth_unix && *tl != rpc_auth_kerb)) { + nd->nd_repstat = EPROCUNAVAIL; + nd->nd_procnum = NFSPROC_NOOP; + return (0); + } + auth_type = *tl++; + len = fxdr_unsigned(int, *tl++); + if (len < 0 || len > RPCAUTH_MAXSIZ) { + m_freem(mrep); + return (EBADRPC); + } + + /* + * Handle auth_unix or auth_kerb. 
+ */ + if (auth_type == rpc_auth_unix) { + len = fxdr_unsigned(int, *++tl); + if (len < 0 || len > NFS_MAXNAMLEN) { + m_freem(mrep); + return (EBADRPC); + } + nfsm_adv(nfsm_rndup(len)); + nfsm_dissect(tl, u_long *, 3*NFSX_UNSIGNED); + nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++); + nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++); + len = fxdr_unsigned(int, *tl); + if (len < 0 || len > RPCAUTH_UNIXGIDS) { + m_freem(mrep); + return (EBADRPC); + } + nfsm_dissect(tl, u_long *, (len + 2)*NFSX_UNSIGNED); + for (i = 1; i <= len; i++) + if (i < NGROUPS) + nd->nd_cr.cr_groups[i] = fxdr_unsigned(gid_t, *tl++); + else + tl++; + nd->nd_cr.cr_ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1); + } else if (auth_type == rpc_auth_kerb) { + nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++); + nd->nd_authlen = fxdr_unsigned(int, *tl); + uio.uio_resid = nfsm_rndup(nd->nd_authlen); + if (uio.uio_resid > (len - 2 * NFSX_UNSIGNED)) { + m_freem(mrep); + return (EBADRPC); + } + uio.uio_offset = 0; + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_segflg = UIO_SYSSPACE; + iov.iov_base = (caddr_t)nd->nd_authstr; + iov.iov_len = RPCAUTH_MAXSIZ; + nfsm_mtouio(&uio, uio.uio_resid); + nfsm_dissect(tl, u_long *, 2 * NFSX_UNSIGNED); + nd->nd_flag |= NFSD_NEEDAUTH; + } + + /* + * Do we have any use for the verifier. + * According to the "Remote Procedure Call Protocol Spec." it + * should be AUTH_NULL, but some clients make it AUTH_UNIX? + * For now, just skip over it + */ + len = fxdr_unsigned(int, *++tl); + if (len < 0 || len > RPCAUTH_MAXSIZ) { + m_freem(mrep); + return (EBADRPC); + } + if (len > 0) { + nfsm_adv(nfsm_rndup(len)); + } + + /* + * For nqnfs, get piggybacked lease request. 
+ */ + if (nqnfs && nd->nd_procnum != NQNFSPROC_EVICTED) { + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + nd->nd_nqlflag = fxdr_unsigned(int, *tl); + if (nd->nd_nqlflag) { + nfsm_dissect(tl, u_long *, NFSX_UNSIGNED); + nd->nd_duration = fxdr_unsigned(int, *tl); + } else + nd->nd_duration = NQ_MINLEASE; + } else { + nd->nd_nqlflag = NQL_NOVAL; + nd->nd_duration = NQ_MINLEASE; + } + nd->nd_md = md; + nd->nd_dpos = dpos; + return (0); +nfsmout: + return (error); +} + +/* + * Search for a sleeping nfsd and wake it up. + * SIDE EFFECT: If none found, set NFSD_CHECKSLP flag, so that one of the + * running nfsds will go look for the work in the nfssvc_sock list. + */ +void +nfsrv_wakenfsd(slp) + struct nfssvc_sock *slp; +{ + register struct nfsd *nd = nfsd_head.nd_next; + + if ((slp->ns_flag & SLP_VALID) == 0) + return; + while (nd != (struct nfsd *)&nfsd_head) { + if (nd->nd_flag & NFSD_WAITING) { + nd->nd_flag &= ~NFSD_WAITING; + if (nd->nd_slp) + panic("nfsd wakeup"); + slp->ns_sref++; + nd->nd_slp = slp; + wakeup((caddr_t)nd); + return; + } + nd = nd->nd_next; + } + slp->ns_flag |= SLP_DOREC; + nfsd_head.nd_flag |= NFSD_CHECKSLP; +} + +nfs_msg(p, server, msg) + struct proc *p; + char *server, *msg; +{ + tpr_t tpr; + + if (p) + tpr = tprintf_open(p); + else + tpr = NULL; + tprintf(tpr, "nfs server %s: %s\n", server, msg); + tprintf_close(tpr); +} diff --git a/sys/nfsserver/nfs_srvsubs.c b/sys/nfsserver/nfs_srvsubs.c new file mode 100644 index 00000000000..5778f7d7f01 --- /dev/null +++ b/sys/nfsserver/nfs_srvsubs.c @@ -0,0 +1,1130 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs_subs.c 8.3 (Berkeley) 1/4/94 + */ + +/* + * These functions support the macros and help fiddle mbuf chains for + * the nfs op functions. They do things like create the rpc header and + * copy data between mbuf chains and uio lists. 
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#ifdef ISO
+#include
+#endif
+
+#define TRUE	1
+#define	FALSE	0
+
+/*
+ * Data items converted to xdr at startup, since they are constant
+ * This is kinda hokey, but may save a little time doing byte swaps
+ */
+u_long nfs_procids[NFS_NPROCS];
+u_long nfs_xdrneg1;
+u_long rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr,
+	rpc_mismatch, rpc_auth_unix, rpc_msgaccepted, rpc_rejectedcred,
+	rpc_auth_kerb;
+u_long nfs_vers, nfs_prog, nfs_true, nfs_false;
+
+/* And other global data */
+static u_long nfs_xid = 0;	/* last transaction id handed out by nfsm_rpchead() */
+enum vtype ntov_type[7] = { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON };
+extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
+extern struct nfsreq nfsreqh;
+extern int nqnfs_piggy[NFS_NPROCS];
+extern struct nfsrtt nfsrtt;
+extern time_t nqnfsstarttime;
+extern u_long nqnfs_prog, nqnfs_vers;
+extern int nqsrv_clockskew;
+extern int nqsrv_writeslack;
+extern int nqsrv_maxlease;
+
+/*
+ * Create the header for an rpc request packet
+ * The hsiz is the size of the rest of the nfs request header.
+ * (just used to decide if a cluster is a good idea)
+ *
+ * Returns a new mbuf; *bposp is set to the next free byte in it.
+ * When vp is non-null and its mount has NFSMNT_NQNFS set, the mbuf
+ * already contains the lease request words.
+ */
+struct mbuf *
+nfsm_reqh(vp, procid, hsiz, bposp)
+	struct vnode *vp;
+	u_long procid;
+	int hsiz;
+	caddr_t *bposp;
+{
+	register struct mbuf *mb;
+	register u_long *tl;
+	register caddr_t bpos;
+	struct mbuf *mb2;	/* no direct use here; presumably needed by nfsm_build() -- confirm */
+	struct nfsmount *nmp;
+	int nqflag;
+
+	MGET(mb, M_WAIT, MT_DATA);
+	if (hsiz >= MINCLSIZE)
+		MCLGET(mb, M_WAIT);
+	mb->m_len = 0;
+	bpos = mtod(mb, caddr_t);
+
+	/*
+	 * For NQNFS, add lease request.
+	 * Either (flag, lease term) or a single zero word.
+	 */
+	if (vp) {
+		nmp = VFSTONFS(vp->v_mount);
+		if (nmp->nm_flag & NFSMNT_NQNFS) {
+			nqflag = NQNFS_NEEDLEASE(vp, procid);
+			if (nqflag) {
+				nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED);
+				*tl++ = txdr_unsigned(nqflag);
+				*tl = txdr_unsigned(nmp->nm_leaseterm);
+			} else {
+				nfsm_build(tl, u_long *, NFSX_UNSIGNED);
+				*tl = 0;
+			}
+		}
+	}
+	/* Finally, return values */
+	*bposp = bpos;
+	return (mb);
+}
+
+/*
+ * Build the RPC header and fill in the authorization info.
+ * The authorization string argument is only used when the credentials
+ * come from outside of the kernel.
+ * Returns the head of the mbuf list.
+ *
+ * Also stores the new transaction id in *xidp and the last header mbuf
+ * (the one mrest is linked after) in *mbp.  mrest_len is only used to
+ * compute the packet header length.
+ */
+struct mbuf *
+nfsm_rpchead(cr, nqnfs, procid, auth_type, auth_len, auth_str, mrest,
+	mrest_len, mbp, xidp)
+	register struct ucred *cr;
+	int nqnfs;
+	int procid;
+	int auth_type;
+	int auth_len;
+	char *auth_str;
+	struct mbuf *mrest;
+	int mrest_len;
+	struct mbuf **mbp;
+	u_long *xidp;
+{
+	register struct mbuf *mb;
+	register u_long *tl;
+	register caddr_t bpos;
+	register int i;
+	struct mbuf *mreq, *mb2;
+	int siz, grpsiz, authsiz;
+
+	authsiz = nfsm_rndup(auth_len);
+	if (auth_type == RPCAUTH_NQNFS)
+		authsiz += 2 * NFSX_UNSIGNED;	/* uid and length words precede the string */
+	MGETHDR(mb, M_WAIT, MT_DATA);
+	if ((authsiz + 10*NFSX_UNSIGNED) >= MINCLSIZE) {
+		MCLGET(mb, M_WAIT);
+	} else if ((authsiz + 10*NFSX_UNSIGNED) < MHLEN) {
+		MH_ALIGN(mb, authsiz + 10*NFSX_UNSIGNED);
+	} else {
+		MH_ALIGN(mb, 8*NFSX_UNSIGNED);
+	}
+	mb->m_len = 0;
+	mreq = mb;
+	bpos = mtod(mb, caddr_t);
+
+	/*
+	 * First the RPC header.
+	 * The transaction id is a simple counter that skips 0.
+	 */
+	nfsm_build(tl, u_long *, 8*NFSX_UNSIGNED);
+	if (++nfs_xid == 0)
+		nfs_xid++;
+	*tl++ = *xidp = txdr_unsigned(nfs_xid);
+	*tl++ = rpc_call;
+	*tl++ = rpc_vers;
+	if (nqnfs) {
+		*tl++ = txdr_unsigned(NQNFS_PROG);
+		*tl++ = txdr_unsigned(NQNFS_VER1);
+	} else {
+		*tl++ = txdr_unsigned(NFS_PROG);
+		*tl++ = txdr_unsigned(NFS_VER2);
+	}
+	*tl++ = txdr_unsigned(procid);
+
+	/*
+	 * And then the authorization cred.
+	 */
+	*tl++ = txdr_unsigned(auth_type);
+	*tl = txdr_unsigned(authsiz);
+	switch (auth_type) {
+	case RPCAUTH_UNIX:
+		nfsm_build(tl, u_long *, auth_len);
+		*tl++ = 0;		/* stamp ?? */
+		*tl++ = 0;		/* NULL hostname */
+		*tl++ = txdr_unsigned(cr->cr_uid);
+		*tl++ = txdr_unsigned(cr->cr_groups[0]);
+		grpsiz = (auth_len >> 2) - 5;	/* words left after the 5 fixed ones */
+		*tl++ = txdr_unsigned(grpsiz);
+		for (i = 1; i <= grpsiz; i++)
+			*tl++ = txdr_unsigned(cr->cr_groups[i]);
+		break;
+	case RPCAUTH_NQNFS:
+		nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED);
+		*tl++ = txdr_unsigned(cr->cr_uid);
+		*tl = txdr_unsigned(auth_len);
+		/* copy auth_str, growing the chain whenever the current mbuf is full */
+		siz = auth_len;
+		while (siz > 0) {
+			if (M_TRAILINGSPACE(mb) == 0) {
+				MGET(mb2, M_WAIT, MT_DATA);
+				if (siz >= MINCLSIZE)
+					MCLGET(mb2, M_WAIT);
+				mb->m_next = mb2;
+				mb = mb2;
+				mb->m_len = 0;
+				bpos = mtod(mb, caddr_t);
+			}
+			i = min(siz, M_TRAILINGSPACE(mb));
+			bcopy(auth_str, bpos, i);
+			mb->m_len += i;
+			auth_str += i;
+			bpos += i;
+			siz -= i;
+		}
+		/* zero-pad the string to a multiple of 4 bytes */
+		if ((siz = (nfsm_rndup(auth_len) - auth_len)) > 0) {
+			for (i = 0; i < siz; i++)
+				*bpos++ = '\0';
+			mb->m_len += siz;
+		}
+		break;
+	};
+	/* the verifier is always RPCAUTH_NULL with zero length */
+	nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED);
+	*tl++ = txdr_unsigned(RPCAUTH_NULL);
+	*tl = 0;
+	mb->m_next = mrest;
+	mreq->m_pkthdr.len = authsiz + 10*NFSX_UNSIGNED + mrest_len;
+	mreq->m_pkthdr.rcvif = (struct ifnet *)0;
+	*mbp = mb;
+	return (mreq);
+}
+
+/*
+ * copies mbuf chain to the uio scatter/gather list
+ *
+ * Copies siz bytes starting at *dpos in mbuf *mrep into the iovecs of
+ * uiop, then skips the padding that rounds siz up to a multiple of 4.
+ * *mrep and *dpos are advanced past what was consumed.
+ * Returns EFBIG when the iovecs run out, EBADRPC when the mbuf chain
+ * runs out, otherwise the result of skipping the padding (0 when none).
+ */
+nfsm_mbuftouio(mrep, uiop, siz, dpos)
+	struct mbuf **mrep;
+	register struct uio *uiop;
+	int siz;
+	caddr_t *dpos;
+{
+	register char *mbufcp, *uiocp;
+	register int xfer, left, len;
+	register struct mbuf *mp;
+	long uiosiz, rem;
+	int error = 0;
+
+	mp = *mrep;
+	mbufcp = *dpos;
+	len = mtod(mp, caddr_t)+mp->m_len-mbufcp;	/* bytes left in the current mbuf */
+	rem = nfsm_rndup(siz)-siz;			/* padding bytes after the data */
+	while (siz > 0) {
+		if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL)
+			return (EFBIG);
+		left = uiop->uio_iov->iov_len;
+		uiocp = uiop->uio_iov->iov_base;
+		if (left > siz)
+			left = siz;
+		uiosiz = left;
+		while (left > 0) {
+			while (len == 0) {
+				mp = mp->m_next;
+				if (mp == NULL)
+					return (EBADRPC);
+				mbufcp = mtod(mp, caddr_t);
+				len = mp->m_len;
+			}
+			xfer = (left > len) ? len : left;
+#ifdef notdef
+			/* Not Yet.. */
+			if (uiop->uio_iov->iov_op != NULL)
+				(*(uiop->uio_iov->iov_op))
+				(mbufcp, uiocp, xfer);
+			else
+#endif
+			if (uiop->uio_segflg == UIO_SYSSPACE)
+				bcopy(mbufcp, uiocp, xfer);
+			else
+				copyout(mbufcp, uiocp, xfer);	/* NOTE(review): return value is not checked */
+			left -= xfer;
+			len -= xfer;
+			mbufcp += xfer;
+			uiocp += xfer;
+			uiop->uio_offset += xfer;
+			uiop->uio_resid -= xfer;
+		}
+		/* iovec fully used: step to the next one, else advance within it */
+		if (uiop->uio_iov->iov_len <= siz) {
+			uiop->uio_iovcnt--;
+			uiop->uio_iov++;
+		} else {
+			uiop->uio_iov->iov_base += uiosiz;
+			uiop->uio_iov->iov_len -= uiosiz;
+		}
+		siz -= uiosiz;
+	}
+	*dpos = mbufcp;
+	*mrep = mp;
+	if (rem > 0) {
+		if (len < rem)
+			error = nfs_adv(mrep, dpos, rem, len);
+		else
+			*dpos += rem;
+	}
+	return (error);
+}
+
+/*
+ * copies a uio scatter/gather list to an mbuf chain...
+ *
+ * Appends siz bytes from uiop to the chain whose current tail is *mq,
+ * allocating further mbufs as needed, then zero-pads to a multiple of 4.
+ * On return *mq is the mbuf last written and *bpos the next free byte
+ * in it.  Returns EINVAL when the iovecs run out, else 0.
+ */
+nfsm_uiotombuf(uiop, mq, siz, bpos)
+	register struct uio *uiop;
+	struct mbuf **mq;
+	int siz;
+	caddr_t *bpos;
+{
+	register char *uiocp;
+	register struct mbuf *mp, *mp2;
+	register int xfer, left, mlen;
+	int uiosiz, clflg, rem;
+	char *cp;
+
+	if (siz > MLEN)		/* or should it >= MCLBYTES ?? */
+		clflg = 1;
+	else
+		clflg = 0;
+	rem = nfsm_rndup(siz)-siz;	/* padding bytes after the data */
+	mp = mp2 = *mq;
+	while (siz > 0) {
+		if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL)
+			return (EINVAL);
+		left = uiop->uio_iov->iov_len;
+		uiocp = uiop->uio_iov->iov_base;
+		if (left > siz)
+			left = siz;
+		uiosiz = left;
+		while (left > 0) {
+			mlen = M_TRAILINGSPACE(mp);
+			if (mlen == 0) {
+				/* current mbuf is full: link a new one after it */
+				MGET(mp, M_WAIT, MT_DATA);
+				if (clflg)
+					MCLGET(mp, M_WAIT);
+				mp->m_len = 0;
+				mp2->m_next = mp;
+				mp2 = mp;
+				mlen = M_TRAILINGSPACE(mp);
+			}
+			xfer = (left > mlen) ? mlen : left;
+#ifdef notdef
+			/* Not Yet.. */
+			if (uiop->uio_iov->iov_op != NULL)
+				(*(uiop->uio_iov->iov_op))
+				(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer);
+			else
+#endif
+			if (uiop->uio_segflg == UIO_SYSSPACE)
+				bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer);
+			else
+				copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer);	/* NOTE(review): return value is not checked */
+			mp->m_len += xfer;
+			left -= xfer;
+			uiocp += xfer;
+			uiop->uio_offset += xfer;
+			uiop->uio_resid -= xfer;
+		}
+		if (uiop->uio_iov->iov_len <= siz) {
+			uiop->uio_iovcnt--;
+			uiop->uio_iov++;
+		} else {
+			uiop->uio_iov->iov_base += uiosiz;
+			uiop->uio_iov->iov_len -= uiosiz;
+		}
+		siz -= uiosiz;
+	}
+	if (rem > 0) {
+		if (rem > M_TRAILINGSPACE(mp)) {
+			MGET(mp, M_WAIT, MT_DATA);
+			mp->m_len = 0;
+			mp2->m_next = mp;
+		}
+		cp = mtod(mp, caddr_t)+mp->m_len;
+		for (left = 0; left < rem; left++)
+			*cp++ = '\0';
+		mp->m_len += rem;
+		*bpos = cp;
+	} else
+		*bpos = mtod(mp, caddr_t)+mp->m_len;
+	*mq = mp;
+	return (0);
+}
+
+/*
+ * Help break down an mbuf chain by setting the first siz bytes contiguous
+ * pointed to by returned val.
+ * This is used by the macros nfsm_dissect and nfsm_dissecton for tough
+ * cases. (The macros use the vars.
dpos and dpos2)
+ */
+nfsm_disct(mdp, dposp, siz, left, cp2)
+	struct mbuf **mdp;	/* in/out: mbuf currently being parsed */
+	caddr_t *dposp;		/* in/out: parse position within *mdp */
+	int siz;		/* number of bytes that must be contiguous */
+	int left;		/* bytes remaining in *mdp at *dposp */
+	caddr_t *cp2;		/* out: start of the contiguous siz bytes */
+{
+	register struct mbuf *mp, *mp2;
+	register int siz2, xfer;
+	register caddr_t p;
+
+	mp = *mdp;
+	while (left == 0) {	/* step over exhausted mbufs */
+		*mdp = mp = mp->m_next;
+		if (mp == NULL)
+			return (EBADRPC);
+		left = mp->m_len;
+		*dposp = mtod(mp, caddr_t);
+	}
+	if (left >= siz) {	/* already contiguous: hand back in place */
+		*cp2 = *dposp;
+		*dposp += siz;
+	} else if (mp->m_next == NULL) {	/* chain holds fewer than siz bytes */
+		return (EBADRPC);
+	} else if (siz > MHLEN) {
+		panic("nfs S too big");
+	} else {
+		MGET(mp2, M_WAIT, MT_DATA);	/* new mbuf, linked in right after mp */
+		mp2->m_next = mp->m_next;
+		mp->m_next = mp2;
+		mp->m_len -= left;	/* the tail of mp moves into the new mbuf */
+		mp = mp2;
+		*cp2 = p = mtod(mp, caddr_t);
+		bcopy(*dposp, p, left);		/* Copy what was left */
+		siz2 = siz-left;	/* bytes still to be pulled from later mbufs */
+		p += left;
+		mp2 = mp->m_next;
+		/* Loop around copying up the siz2 bytes */
+		while (siz2 > 0) {
+			if (mp2 == NULL)
+				return (EBADRPC);
+			xfer = (siz2 > mp2->m_len) ? mp2->m_len : siz2;
+			if (xfer > 0) {
+				bcopy(mtod(mp2, caddr_t), p, xfer);
+				NFSMADV(mp2, xfer);	/* presumably advances mp2's data pointer by xfer; macro not shown here */
+				mp2->m_len -= xfer;
+				p += xfer;
+				siz2 -= xfer;
+			}
+			if (siz2 > 0)
+				mp2 = mp2->m_next;
+		}
+		mp->m_len = siz;
+		*mdp = mp2;	/* parsing resumes in the mbuf last copied from */
+		*dposp = mtod(mp2, caddr_t);
+	}
+	return (0);
+}
+
+/*
+ * Advance the position in the mbuf chain.
+ *
+ * mdp/dposp are the current mbuf and position (in/out), offs is the
+ * number of bytes to skip and left is the number of bytes remaining in
+ * *mdp.  Returns 0, or EBADRPC when the chain ends before offs bytes
+ * have been skipped.
+ *
+ * Note that the final position is computed from the start of the data of
+ * the mbuf that is reached, so when left >= offs on entry (the loop does
+ * not run) *dposp is set to the start of *mdp's data plus offs rather
+ * than being advanced from its current value.  The callers in this file
+ * (nfsm_mbuftouio, nfs_namei) only call here when left < offs.
+ */
+nfs_adv(mdp, dposp, offs, left)
+	struct mbuf **mdp;
+	caddr_t *dposp;
+	int offs;
+	int left;
+{
+	register struct mbuf *m;
+	register int s;		/* bytes available in the current mbuf */
+
+	m = *mdp;
+	s = left;
+	while (s < offs) {
+		offs -= s;
+		m = m->m_next;
+		if (m == NULL)
+			return (EBADRPC);
+		s = m->m_len;
+	}
+	*mdp = m;
+	*dposp = mtod(m, caddr_t)+offs;
+	return (0);
+}
+
+/*
+ * Copy a string into mbufs for the hard cases...
+ */
+nfsm_strtmbuf(mb, bpos, cp, siz)
+	struct mbuf **mb;	/* in: mbuf to append to; out: last mbuf written */
+	char **bpos;		/* in: write position in *mb; out: first byte past the data */
+	char *cp;		/* bytes to copy */
+	long siz;		/* byte count; also emitted first as an XDR length word */
+{
+	register struct mbuf *m1, *m2;
+	long left, xfer, len, tlen;
+	u_long *tl;
+	int putsize;		/* nonzero until the length word has been written */
+
+	putsize = 1;
+	m2 = *mb;
+	left = M_TRAILINGSPACE(m2);
+	if (left > 0) {		/* use up whatever room the current mbuf has */
+		tl = ((u_long *)(*bpos));
+		*tl++ = txdr_unsigned(siz);
+		putsize = 0;
+		left -= NFSX_UNSIGNED;
+		m2->m_len += NFSX_UNSIGNED;
+		if (left > 0) {
+			bcopy(cp, (caddr_t) tl, left);
+			siz -= left;
+			cp += left;
+			m2->m_len += left;
+			left = 0;
+		}
+	}
+	/* Loop around adding mbufs */
+	while (siz > 0) {
+		MGET(m1, M_WAIT, MT_DATA);
+		if (siz > MLEN)
+			MCLGET(m1, M_WAIT);
+		m1->m_len = NFSMSIZ(m1);	/* temporarily holds capacity, fixed up below */
+		m2->m_next = m1;
+		m2 = m1;
+		tl = mtod(m1, u_long *);
+		tlen = 0;
+		if (putsize) {		/* length word did not fit in the first mbuf */
+			*tl++ = txdr_unsigned(siz);
+			m1->m_len -= NFSX_UNSIGNED;
+			tlen = NFSX_UNSIGNED;
+			putsize = 0;
+		}
+		if (siz < m1->m_len) {	/* last piece: round up to a 4 byte boundary */
+			len = nfsm_rndup(siz);
+			xfer = siz;
+			if (xfer < len)
+				*(tl+(xfer>>2)) = 0;	/* pre-zero the final word so the pad bytes are null */
+		} else {
+			xfer = len = m1->m_len;
+		}
+		bcopy(cp, (caddr_t) tl, xfer);
+		m1->m_len = len+tlen;
+		siz -= xfer;
+		cp += xfer;
+	}
+	*mb = m1;	/* NOTE(review): m1 is only assigned inside the loop above; assumes the loop ran at least once - confirm callers guarantee it */
+	*bpos = mtod(m1, caddr_t)+m1->m_len;
+	return (0);
+}
+
+/*
+ * Called once to initialize data structures...
+ */
+nfs_init()
+{
+	register int i;
+
+	nfsrtt.pos = 0;
+	rpc_vers = txdr_unsigned(RPC_VER2);	/* cache the XDR forms of the constants below */
+	rpc_call = txdr_unsigned(RPC_CALL);
+	rpc_reply = txdr_unsigned(RPC_REPLY);
+	rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED);
+	rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED);
+	rpc_mismatch = txdr_unsigned(RPC_MISMATCH);
+	rpc_autherr = txdr_unsigned(RPC_AUTHERR);
+	rpc_rejectedcred = txdr_unsigned(AUTH_REJECTCRED);
+	rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX);
+	rpc_auth_kerb = txdr_unsigned(RPCAUTH_NQNFS);
+	nfs_vers = txdr_unsigned(NFS_VER2);
+	nfs_prog = txdr_unsigned(NFS_PROG);
+	nfs_true = txdr_unsigned(TRUE);
+	nfs_false = txdr_unsigned(FALSE);
+	/* Fill nfs_procids[] with the XDR form of every procedure number */
+	for (i = 0; i < NFS_NPROCS; i++)
+		nfs_procids[i] = txdr_unsigned(i);
+	/* Ensure async daemons disabled */
+	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
+		nfs_iodwant[i] = (struct proc *)0;
+	TAILQ_INIT(&nfs_bufq);
+	nfs_xdrneg1 = txdr_unsigned(-1);
+	nfs_nhinit();			/* Init the nfsnode table */
+	nfsrv_init(0);			/* Init server data structures */
+	nfsrv_initcache();		/* Init the server request cache */
+
+	/*
+	 * Initialize the nqnfs server stuff.
+	 * This is done only while nqnfsstarttime is still zero.
+	 */
+	if (nqnfsstarttime == 0) {
+		nqnfsstarttime = boottime.tv_sec + nqsrv_maxlease
+			+ nqsrv_clockskew + nqsrv_writeslack;
+		NQLOADNOVRAM(nqnfsstarttime);
+		nqnfs_prog = txdr_unsigned(NQNFS_PROG);
+		nqnfs_vers = txdr_unsigned(NQNFS_VER1);
+		nqthead.th_head[0] = &nqthead;	/* both links point back at the head itself */
+		nqthead.th_head[1] = &nqthead;
+		nqfhead = hashinit(NQLCHSZ, M_NQLEASE, &nqfheadhash);
+	}
+
+	/*
+	 * Initialize reply list and start timer
+	 */
+	nfsreqh.r_prev = nfsreqh.r_next = &nfsreqh;
+	nfs_timer();
+}
+
+/*
+ * Attribute cache routines.
+ * nfs_loadattrcache() - loads or updates the cache contents from attributes + * that are on the mbuf list + * nfs_getattrcache() - returns valid attributes if found in cache, returns + * error otherwise + */ + +/* + * Load the attribute cache (that lives in the nfsnode entry) with + * the values on the mbuf list and + * Iff vap not NULL + * copy the attributes to *vaper + */ +nfs_loadattrcache(vpp, mdp, dposp, vaper) + struct vnode **vpp; + struct mbuf **mdp; + caddr_t *dposp; + struct vattr *vaper; +{ + register struct vnode *vp = *vpp; + register struct vattr *vap; + register struct nfsv2_fattr *fp; + extern int (**spec_nfsv2nodeop_p)(); + register struct nfsnode *np, *nq, **nhpp; + register long t1; + caddr_t dpos, cp2; + int error = 0, isnq; + struct mbuf *md; + enum vtype vtyp; + u_short vmode; + long rdev; + struct timespec mtime; + struct vnode *nvp; + + md = *mdp; + dpos = *dposp; + t1 = (mtod(md, caddr_t) + md->m_len) - dpos; + isnq = (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS); + if (error = nfsm_disct(&md, &dpos, NFSX_FATTR(isnq), t1, &cp2)) + return (error); + fp = (struct nfsv2_fattr *)cp2; + vtyp = nfstov_type(fp->fa_type); + vmode = fxdr_unsigned(u_short, fp->fa_mode); + if (vtyp == VNON || vtyp == VREG) + vtyp = IFTOVT(vmode); + if (isnq) { + rdev = fxdr_unsigned(long, fp->fa_nqrdev); + fxdr_nqtime(&fp->fa_nqmtime, &mtime); + } else { + rdev = fxdr_unsigned(long, fp->fa_nfsrdev); + fxdr_nfstime(&fp->fa_nfsmtime, &mtime); + } + /* + * If v_type == VNON it is a new node, so fill in the v_type, + * n_mtime fields. Check to see if it represents a special + * device, and if so, check for a possible alias. Once the + * correct vnode has been obtained, fill in the rest of the + * information. 
+ */ + np = VTONFS(vp); + if (vp->v_type == VNON) { + if (vtyp == VCHR && rdev == 0xffffffff) + vp->v_type = vtyp = VFIFO; + else + vp->v_type = vtyp; + if (vp->v_type == VFIFO) { +#ifdef FIFO + extern int (**fifo_nfsv2nodeop_p)(); + vp->v_op = fifo_nfsv2nodeop_p; +#else + return (EOPNOTSUPP); +#endif /* FIFO */ + } + if (vp->v_type == VCHR || vp->v_type == VBLK) { + vp->v_op = spec_nfsv2nodeop_p; + if (nvp = checkalias(vp, (dev_t)rdev, vp->v_mount)) { + /* + * Discard unneeded vnode, but save its nfsnode. + */ + if (nq = np->n_forw) + nq->n_back = np->n_back; + *np->n_back = nq; + nvp->v_data = vp->v_data; + vp->v_data = NULL; + vp->v_op = spec_vnodeop_p; + vrele(vp); + vgone(vp); + /* + * Reinitialize aliased node. + */ + np->n_vnode = nvp; + nhpp = (struct nfsnode **)nfs_hash(&np->n_fh); + if (nq = *nhpp) + nq->n_back = &np->n_forw; + np->n_forw = nq; + np->n_back = nhpp; + *nhpp = np; + *vpp = vp = nvp; + } + } + np->n_mtime = mtime.ts_sec; + } + vap = &np->n_vattr; + vap->va_type = vtyp; + vap->va_mode = (vmode & 07777); + vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); + vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); + vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); + vap->va_rdev = (dev_t)rdev; + vap->va_mtime = mtime; + vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; + if (isnq) { + fxdr_hyper(&fp->fa_nqsize, &vap->va_size); + vap->va_blocksize = fxdr_unsigned(long, fp->fa_nqblocksize); + fxdr_hyper(&fp->fa_nqbytes, &vap->va_bytes); + vap->va_fileid = fxdr_unsigned(long, fp->fa_nqfileid); + fxdr_nqtime(&fp->fa_nqatime, &vap->va_atime); + vap->va_flags = fxdr_unsigned(u_long, fp->fa_nqflags); + fxdr_nqtime(&fp->fa_nqctime, &vap->va_ctime); + vap->va_gen = fxdr_unsigned(u_long, fp->fa_nqgen); + fxdr_hyper(&fp->fa_nqfilerev, &vap->va_filerev); + } else { + vap->va_size = fxdr_unsigned(u_long, fp->fa_nfssize); + vap->va_blocksize = fxdr_unsigned(long, fp->fa_nfsblocksize); + vap->va_bytes = fxdr_unsigned(long, fp->fa_nfsblocks) * NFS_FABLKSIZE; + 
vap->va_fileid = fxdr_unsigned(long, fp->fa_nfsfileid); + fxdr_nfstime(&fp->fa_nfsatime, &vap->va_atime); + vap->va_flags = 0; + vap->va_ctime.ts_sec = fxdr_unsigned(long, fp->fa_nfsctime.nfs_sec); + vap->va_ctime.ts_nsec = 0; + vap->va_gen = fxdr_unsigned(u_long, fp->fa_nfsctime.nfs_usec); + vap->va_filerev = 0; + } + if (vap->va_size != np->n_size) { + if (vap->va_type == VREG) { + if (np->n_flag & NMODIFIED) { + if (vap->va_size < np->n_size) + vap->va_size = np->n_size; + else + np->n_size = vap->va_size; + } else + np->n_size = vap->va_size; + vnode_pager_setsize(vp, (u_long)np->n_size); + } else + np->n_size = vap->va_size; + } + np->n_attrstamp = time.tv_sec; + *dposp = dpos; + *mdp = md; + if (vaper != NULL) { + bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap)); +#ifdef notdef + if ((np->n_flag & NMODIFIED) && np->n_size > vap->va_size) + if (np->n_size > vap->va_size) + vaper->va_size = np->n_size; +#endif + if (np->n_flag & NCHG) { + if (np->n_flag & NACC) { + vaper->va_atime.ts_sec = np->n_atim.tv_sec; + vaper->va_atime.ts_nsec = + np->n_atim.tv_usec * 1000; + } + if (np->n_flag & NUPD) { + vaper->va_mtime.ts_sec = np->n_mtim.tv_sec; + vaper->va_mtime.ts_nsec = + np->n_mtim.tv_usec * 1000; + } + } + } + return (0); +} + +/* + * Check the time stamp + * If the cache is valid, copy contents to *vap and return 0 + * otherwise return an error + */ +nfs_getattrcache(vp, vaper) + register struct vnode *vp; + struct vattr *vaper; +{ + register struct nfsnode *np = VTONFS(vp); + register struct vattr *vap; + + if (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQLOOKLEASE) { + if (!NQNFS_CKCACHABLE(vp, NQL_READ) || np->n_attrstamp == 0) { + nfsstats.attrcache_misses++; + return (ENOENT); + } + } else if ((time.tv_sec - np->n_attrstamp) >= NFS_ATTRTIMEO(np)) { + nfsstats.attrcache_misses++; + return (ENOENT); + } + nfsstats.attrcache_hits++; + vap = &np->n_vattr; + if (vap->va_size != np->n_size) { + if (vap->va_type == VREG) { + if (np->n_flag & NMODIFIED) { + if 
(vap->va_size < np->n_size) + vap->va_size = np->n_size; + else + np->n_size = vap->va_size; + } else + np->n_size = vap->va_size; + vnode_pager_setsize(vp, (u_long)np->n_size); + } else + np->n_size = vap->va_size; + } + bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); +#ifdef notdef + if ((np->n_flag & NMODIFIED) == 0) { + np->n_size = vaper->va_size; + vnode_pager_setsize(vp, (u_long)np->n_size); + } else if (np->n_size > vaper->va_size) + if (np->n_size > vaper->va_size) + vaper->va_size = np->n_size; +#endif + if (np->n_flag & NCHG) { + if (np->n_flag & NACC) { + vaper->va_atime.ts_sec = np->n_atim.tv_sec; + vaper->va_atime.ts_nsec = np->n_atim.tv_usec * 1000; + } + if (np->n_flag & NUPD) { + vaper->va_mtime.ts_sec = np->n_mtim.tv_sec; + vaper->va_mtime.ts_nsec = np->n_mtim.tv_usec * 1000; + } + } + return (0); +} + +/* + * Set up nameidata for a lookup() call and do it + */ +nfs_namei(ndp, fhp, len, slp, nam, mdp, dposp, p) + register struct nameidata *ndp; + fhandle_t *fhp; + int len; + struct nfssvc_sock *slp; + struct mbuf *nam; + struct mbuf **mdp; + caddr_t *dposp; + struct proc *p; +{ + register int i, rem; + register struct mbuf *md; + register char *fromcp, *tocp; + struct vnode *dp; + int error, rdonly; + struct componentname *cnp = &ndp->ni_cnd; + + MALLOC(cnp->cn_pnbuf, char *, len + 1, M_NAMEI, M_WAITOK); + /* + * Copy the name from the mbuf list to ndp->ni_pnbuf + * and set the various ndp fields appropriately. 
+ */ + fromcp = *dposp; + tocp = cnp->cn_pnbuf; + md = *mdp; + rem = mtod(md, caddr_t) + md->m_len - fromcp; + cnp->cn_hash = 0; + for (i = 0; i < len; i++) { + while (rem == 0) { + md = md->m_next; + if (md == NULL) { + error = EBADRPC; + goto out; + } + fromcp = mtod(md, caddr_t); + rem = md->m_len; + } + if (*fromcp == '\0' || *fromcp == '/') { + error = EINVAL; + goto out; + } + cnp->cn_hash += (unsigned char)*fromcp; + *tocp++ = *fromcp++; + rem--; + } + *tocp = '\0'; + *mdp = md; + *dposp = fromcp; + len = nfsm_rndup(len)-len; + if (len > 0) { + if (rem >= len) + *dposp += len; + else if (error = nfs_adv(mdp, dposp, len, rem)) + goto out; + } + ndp->ni_pathlen = tocp - cnp->cn_pnbuf; + cnp->cn_nameptr = cnp->cn_pnbuf; + /* + * Extract and set starting directory. + */ + if (error = nfsrv_fhtovp(fhp, FALSE, &dp, ndp->ni_cnd.cn_cred, slp, + nam, &rdonly)) + goto out; + if (dp->v_type != VDIR) { + vrele(dp); + error = ENOTDIR; + goto out; + } + ndp->ni_startdir = dp; + if (rdonly) + cnp->cn_flags |= (NOCROSSMOUNT | RDONLY); + else + cnp->cn_flags |= NOCROSSMOUNT; + /* + * And call lookup() to do the real work + */ + cnp->cn_proc = p; + if (error = lookup(ndp)) + goto out; + /* + * Check for encountering a symbolic link + */ + if (cnp->cn_flags & ISSYMLINK) { + if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) + vput(ndp->ni_dvp); + else + vrele(ndp->ni_dvp); + vput(ndp->ni_vp); + ndp->ni_vp = NULL; + error = EINVAL; + goto out; + } + /* + * Check for saved name request + */ + if (cnp->cn_flags & (SAVENAME | SAVESTART)) { + cnp->cn_flags |= HASBUF; + return (0); + } +out: + FREE(cnp->cn_pnbuf, M_NAMEI); + return (error); +} + +/* + * A fiddled version of m_adj() that ensures null fill to a long + * boundary and only trims off the back end + */ +void +nfsm_adj(mp, len, nul) + struct mbuf *mp; + register int len; + int nul; +{ + register struct mbuf *m; + register int count, i; + register char *cp; + + /* + * Trim from tail. 
Scan the mbuf chain, + * calculating its length and finding the last mbuf. + * If the adjustment only affects this mbuf, then just + * adjust and return. Otherwise, rescan and truncate + * after the remaining size. + */ + count = 0; + m = mp; + for (;;) { + count += m->m_len; + if (m->m_next == (struct mbuf *)0) + break; + m = m->m_next; + } + if (m->m_len > len) { + m->m_len -= len; + if (nul > 0) { + cp = mtod(m, caddr_t)+m->m_len-nul; + for (i = 0; i < nul; i++) + *cp++ = '\0'; + } + return; + } + count -= len; + if (count < 0) + count = 0; + /* + * Correct length for chain is "count". + * Find the mbuf with last data, adjust its length, + * and toss data from remaining mbufs on chain. + */ + for (m = mp; m; m = m->m_next) { + if (m->m_len >= count) { + m->m_len = count; + if (nul > 0) { + cp = mtod(m, caddr_t)+m->m_len-nul; + for (i = 0; i < nul; i++) + *cp++ = '\0'; + } + break; + } + count -= m->m_len; + } + while (m = m->m_next) + m->m_len = 0; +} + +/* + * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) + * - look up fsid in mount list (if not found ret error) + * - get vp and export rights by calling VFS_FHTOVP() + * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon + * - if not lockflag unlock it with VOP_UNLOCK() + */ +nfsrv_fhtovp(fhp, lockflag, vpp, cred, slp, nam, rdonlyp) + fhandle_t *fhp; + int lockflag; + struct vnode **vpp; + struct ucred *cred; + struct nfssvc_sock *slp; + struct mbuf *nam; + int *rdonlyp; +{ + register struct mount *mp; + register struct nfsuid *uidp; + register int i; + struct ucred *credanon; + int error, exflags; + + *vpp = (struct vnode *)0; + if ((mp = getvfs(&fhp->fh_fsid)) == NULL) + return (ESTALE); + if (error = VFS_FHTOVP(mp, &fhp->fh_fid, nam, vpp, &exflags, &credanon)) + return (error); + /* + * Check/setup credentials. 
+ */ + if (exflags & MNT_EXKERB) { + uidp = slp->ns_uidh[NUIDHASH(cred->cr_uid)]; + while (uidp) { + if (uidp->nu_uid == cred->cr_uid) + break; + uidp = uidp->nu_hnext; + } + if (uidp) { + cred->cr_uid = uidp->nu_cr.cr_uid; + for (i = 0; i < uidp->nu_cr.cr_ngroups; i++) + cred->cr_groups[i] = uidp->nu_cr.cr_groups[i]; + } else { + vput(*vpp); + return (NQNFS_AUTHERR); + } + } else if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { + cred->cr_uid = credanon->cr_uid; + for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++) + cred->cr_groups[i] = credanon->cr_groups[i]; + } + if (exflags & MNT_EXRDONLY) + *rdonlyp = 1; + else + *rdonlyp = 0; + if (!lockflag) + VOP_UNLOCK(*vpp); + return (0); +} + +/* + * This function compares two net addresses by family and returns TRUE + * if they are the same host. + * If there is any doubt, return FALSE. + * The AF_INET family is handled as a special case so that address mbufs + * don't need to be saved to store "struct in_addr", which is only 4 bytes. 
+ */
+netaddr_match(family, haddr, nam)
+	int family;			/* address family selecting the comparison */
+	union nethostaddr *haddr;	/* stored host address to compare against */
+	struct mbuf *nam;		/* mbuf holding the sockaddr being tested */
+{
+	register struct sockaddr_in *inetaddr;
+
+	switch (family) {
+	case AF_INET:
+		inetaddr = mtod(nam, struct sockaddr_in *);
+		if (inetaddr->sin_family == AF_INET &&
+		    inetaddr->sin_addr.s_addr == haddr->had_inetaddr)
+			return (1);
+		break;
+#ifdef ISO
+	case AF_ISO:
+	    {
+		register struct sockaddr_iso *isoaddr1, *isoaddr2;
+
+		isoaddr1 = mtod(nam, struct sockaddr_iso *);
+		isoaddr2 = mtod(haddr->had_nam, struct sockaddr_iso *);	/* stored address lives in an mbuf here */
+		if (isoaddr1->siso_family == AF_ISO &&
+		    isoaddr1->siso_nlen > 0 &&
+		    isoaddr1->siso_nlen == isoaddr2->siso_nlen &&
+		    SAME_ISOADDR(isoaddr1, isoaddr2))
+			return (1);
+		break;
+	    }
+#endif	/* ISO */
+	default:	/* any other family never matches */
+		break;
+	};
+	return (0);
+}
diff --git a/sys/nfsserver/nfs_syscalls.c b/sys/nfsserver/nfs_syscalls.c
new file mode 100644
index 00000000000..5d86b42ee20
--- /dev/null
+++ b/sys/nfsserver/nfs_syscalls.c
@@ -0,0 +1,874 @@
+/*
+ * Copyright (c) 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Rick Macklem at The University of Guelph.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4.
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfs_syscalls.c 8.3 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#ifdef ISO +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +/* Global defs. 
*/ +extern u_long nfs_prog, nfs_vers; +extern int (*nfsrv_procs[NFS_NPROCS])(); +extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; +extern int nfs_numasync; +extern time_t nqnfsstarttime; +extern struct nfsrv_req nsrvq_head; +extern struct nfsd nfsd_head; +extern int nqsrv_writeslack; +extern int nfsrtton; +struct nfssvc_sock *nfs_udpsock, *nfs_cltpsock; +int nuidhash_max = NFS_MAXUIDHASH; +static int nfs_numnfsd = 0; +int nfsd_waiting = 0; +static int notstarted = 1; +static int modify_flag = 0; +static struct nfsdrt nfsdrt; +void nfsrv_cleancache(), nfsrv_rcv(), nfsrv_wakenfsd(), nfs_sndunlock(); +static void nfsd_rt(); +void nfsrv_slpderef(), nfsrv_init(); + +#define TRUE 1 +#define FALSE 0 + +static int nfs_asyncdaemon[NFS_MAXASYNCDAEMON]; +/* + * NFS server system calls + * getfh() lives here too, but maybe should move to kern/vfs_syscalls.c + */ + +/* + * Get file handle system call + */ +struct getfh_args { + char *fname; + fhandle_t *fhp; +}; +getfh(p, uap, retval) + struct proc *p; + register struct getfh_args *uap; + int *retval; +{ + register struct vnode *vp; + fhandle_t fh; + int error; + struct nameidata nd; + + /* + * Must be super user + */ + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->fname, p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + bzero((caddr_t)&fh, sizeof(fh)); + fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; + error = VFS_VPTOFH(vp, &fh.fh_fid); + vput(vp); + if (error) + return (error); + error = copyout((caddr_t)&fh, (caddr_t)uap->fhp, sizeof (fh)); + return (error); +} + +static struct nfssvc_sock nfssvc_sockhead; + +/* + * Nfs server psuedo system call for the nfsd's + * Based on the flag value it either: + * - adds a socket to the selection list + * - remains in the kernel as an nfsd + * - remains in the kernel as an nfsiod + */ +struct nfssvc_args { + int flag; + caddr_t argp; +}; +nfssvc(p, uap, retval) + struct proc *p; + register 
struct nfssvc_args *uap; + int *retval; +{ + struct nameidata nd; + struct file *fp; + struct mbuf *nam; + struct nfsd_args nfsdarg; + struct nfsd_srvargs nfsd_srvargs, *nsd = &nfsd_srvargs; + struct nfsd_cargs ncd; + struct nfsd *nfsd; + struct nfssvc_sock *slp; + struct nfsuid *nuidp, **nuh; + struct nfsmount *nmp; + int error; + + /* + * Must be super user + */ + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + while (nfssvc_sockhead.ns_flag & SLP_INIT) { + nfssvc_sockhead.ns_flag |= SLP_WANTINIT; + (void) tsleep((caddr_t)&nfssvc_sockhead, PSOCK, "nfsd init", 0); + } + if (uap->flag & NFSSVC_BIOD) + error = nfssvc_iod(p); + else if (uap->flag & NFSSVC_MNTD) { + if (error = copyin(uap->argp, (caddr_t)&ncd, sizeof (ncd))) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + ncd.ncd_dirp, p); + if (error = namei(&nd)) + return (error); + if ((nd.ni_vp->v_flag & VROOT) == 0) + error = EINVAL; + nmp = VFSTONFS(nd.ni_vp->v_mount); + vput(nd.ni_vp); + if (error) + return (error); + if ((nmp->nm_flag & NFSMNT_MNTD) && + (uap->flag & NFSSVC_GOTAUTH) == 0) + return (0); + nmp->nm_flag |= NFSMNT_MNTD; + error = nqnfs_clientd(nmp, p->p_ucred, &ncd, uap->flag, + uap->argp, p); + } else if (uap->flag & NFSSVC_ADDSOCK) { + if (error = copyin(uap->argp, (caddr_t)&nfsdarg, + sizeof(nfsdarg))) + return (error); + if (error = getsock(p->p_fd, nfsdarg.sock, &fp)) + return (error); + /* + * Get the client address for connected sockets. + */ + if (nfsdarg.name == NULL || nfsdarg.namelen == 0) + nam = (struct mbuf *)0; + else if (error = sockargs(&nam, nfsdarg.name, nfsdarg.namelen, + MT_SONAME)) + return (error); + error = nfssvc_addsock(fp, nam); + } else { + if (error = copyin(uap->argp, (caddr_t)nsd, sizeof (*nsd))) + return (error); + if ((uap->flag & NFSSVC_AUTHIN) && (nfsd = nsd->nsd_nfsd) && + (nfsd->nd_slp->ns_flag & SLP_VALID)) { + slp = nfsd->nd_slp; + + /* + * First check to see if another nfsd has already + * added this credential. 
+ */ + nuidp = slp->ns_uidh[NUIDHASH(nsd->nsd_uid)]; + while (nuidp) { + if (nuidp->nu_uid == nsd->nsd_uid) + break; + nuidp = nuidp->nu_hnext; + } + if (!nuidp) { + /* + * Nope, so we will. + */ + if (slp->ns_numuids < nuidhash_max) { + slp->ns_numuids++; + nuidp = (struct nfsuid *) + malloc(sizeof (struct nfsuid), M_NFSUID, + M_WAITOK); + } else + nuidp = (struct nfsuid *)0; + if ((slp->ns_flag & SLP_VALID) == 0) { + if (nuidp) + free((caddr_t)nuidp, M_NFSUID); + } else { + if (nuidp == (struct nfsuid *)0) { + nuidp = slp->ns_lruprev; + remque(nuidp); + if (nuidp->nu_hprev) + nuidp->nu_hprev->nu_hnext = + nuidp->nu_hnext; + if (nuidp->nu_hnext) + nuidp->nu_hnext->nu_hprev = + nuidp->nu_hprev; + } + nuidp->nu_cr = nsd->nsd_cr; + if (nuidp->nu_cr.cr_ngroups > NGROUPS) + nuidp->nu_cr.cr_ngroups = NGROUPS; + nuidp->nu_cr.cr_ref = 1; + nuidp->nu_uid = nsd->nsd_uid; + insque(nuidp, (struct nfsuid *)slp); + nuh = &slp->ns_uidh[NUIDHASH(nsd->nsd_uid)]; + if (nuidp->nu_hnext = *nuh) + nuidp->nu_hnext->nu_hprev = nuidp; + nuidp->nu_hprev = (struct nfsuid *)0; + *nuh = nuidp; + } + } + } + if ((uap->flag & NFSSVC_AUTHINFAIL) && (nfsd = nsd->nsd_nfsd)) + nfsd->nd_flag |= NFSD_AUTHFAIL; + error = nfssvc_nfsd(nsd, uap->argp, p); + } + if (error == EINTR || error == ERESTART) + error = 0; + return (error); +} + +/* + * Adds a socket to the list for servicing by nfsds. + */ +nfssvc_addsock(fp, mynam) + struct file *fp; + struct mbuf *mynam; +{ + register struct mbuf *m; + register int siz; + register struct nfssvc_sock *slp; + register struct socket *so; + struct nfssvc_sock *tslp; + int error, s; + + so = (struct socket *)fp->f_data; + tslp = (struct nfssvc_sock *)0; + /* + * Add it to the list, as required. 
+ */ + if (so->so_proto->pr_protocol == IPPROTO_UDP) { + tslp = nfs_udpsock; + if (tslp->ns_flag & SLP_VALID) { + m_freem(mynam); + return (EPERM); + } +#ifdef ISO + } else if (so->so_proto->pr_protocol == ISOPROTO_CLTP) { + tslp = nfs_cltpsock; + if (tslp->ns_flag & SLP_VALID) { + m_freem(mynam); + return (EPERM); + } +#endif /* ISO */ + } + if (so->so_type == SOCK_STREAM) + siz = NFS_MAXPACKET + sizeof (u_long); + else + siz = NFS_MAXPACKET; + if (error = soreserve(so, siz, siz)) { + m_freem(mynam); + return (error); + } + + /* + * Set protocol specific options { for now TCP only } and + * reserve some space. For datagram sockets, this can get called + * repeatedly for the same socket, but that isn't harmful. + */ + if (so->so_type == SOCK_STREAM) { + MGET(m, M_WAIT, MT_SOOPTS); + *mtod(m, int *) = 1; + m->m_len = sizeof(int); + sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m); + } + if (so->so_proto->pr_domain->dom_family == AF_INET && + so->so_proto->pr_protocol == IPPROTO_TCP) { + MGET(m, M_WAIT, MT_SOOPTS); + *mtod(m, int *) = 1; + m->m_len = sizeof(int); + sosetopt(so, IPPROTO_TCP, TCP_NODELAY, m); + } + so->so_rcv.sb_flags &= ~SB_NOINTR; + so->so_rcv.sb_timeo = 0; + so->so_snd.sb_flags &= ~SB_NOINTR; + so->so_snd.sb_timeo = 0; + if (tslp) + slp = tslp; + else { + slp = (struct nfssvc_sock *) + malloc(sizeof (struct nfssvc_sock), M_NFSSVC, M_WAITOK); + bzero((caddr_t)slp, sizeof (struct nfssvc_sock)); + slp->ns_prev = nfssvc_sockhead.ns_prev; + slp->ns_prev->ns_next = slp; + slp->ns_next = &nfssvc_sockhead; + nfssvc_sockhead.ns_prev = slp; + slp->ns_lrunext = slp->ns_lruprev = (struct nfsuid *)slp; + } + slp->ns_so = so; + slp->ns_nam = mynam; + fp->f_count++; + slp->ns_fp = fp; + s = splnet(); + so->so_upcallarg = (caddr_t)slp; + so->so_upcall = nfsrv_rcv; + slp->ns_flag = (SLP_VALID | SLP_NEEDQ); + nfsrv_wakenfsd(slp); + splx(s); + return (0); +} + +/* + * Called by nfssvc() for nfsds. Just loops around servicing rpc requests + * until it is killed by a signal. 
+ */ +nfssvc_nfsd(nsd, argp, p) + struct nfsd_srvargs *nsd; + caddr_t argp; + struct proc *p; +{ + register struct mbuf *m, *nam2; + register int siz; + register struct nfssvc_sock *slp; + register struct socket *so; + register int *solockp; + struct nfsd *nd = nsd->nsd_nfsd; + struct mbuf *mreq, *nam; + struct timeval starttime; + struct nfsuid *uidp; + int error, cacherep, s; + int sotype; + + s = splnet(); + if (nd == (struct nfsd *)0) { + nsd->nsd_nfsd = nd = (struct nfsd *) + malloc(sizeof (struct nfsd), M_NFSD, M_WAITOK); + bzero((caddr_t)nd, sizeof (struct nfsd)); + nd->nd_procp = p; + nd->nd_cr.cr_ref = 1; + insque(nd, &nfsd_head); + nd->nd_nqlflag = NQL_NOVAL; + nfs_numnfsd++; + } + /* + * Loop getting rpc requests until SIGKILL. + */ + for (;;) { + if ((nd->nd_flag & NFSD_REQINPROG) == 0) { + while (nd->nd_slp == (struct nfssvc_sock *)0 && + (nfsd_head.nd_flag & NFSD_CHECKSLP) == 0) { + nd->nd_flag |= NFSD_WAITING; + nfsd_waiting++; + error = tsleep((caddr_t)nd, PSOCK | PCATCH, "nfsd", 0); + nfsd_waiting--; + if (error) + goto done; + } + if (nd->nd_slp == (struct nfssvc_sock *)0 && + (nfsd_head.nd_flag & NFSD_CHECKSLP)) { + slp = nfssvc_sockhead.ns_next; + while (slp != &nfssvc_sockhead) { + if ((slp->ns_flag & (SLP_VALID | SLP_DOREC)) + == (SLP_VALID | SLP_DOREC)) { + slp->ns_flag &= ~SLP_DOREC; + slp->ns_sref++; + nd->nd_slp = slp; + break; + } + slp = slp->ns_next; + } + if (slp == &nfssvc_sockhead) + nfsd_head.nd_flag &= ~NFSD_CHECKSLP; + } + if ((slp = nd->nd_slp) == (struct nfssvc_sock *)0) + continue; + if (slp->ns_flag & SLP_VALID) { + if (slp->ns_flag & SLP_DISCONN) + nfsrv_zapsock(slp); + else if (slp->ns_flag & SLP_NEEDQ) { + slp->ns_flag &= ~SLP_NEEDQ; + (void) nfs_sndlock(&slp->ns_solock, + (struct nfsreq *)0); + nfsrv_rcv(slp->ns_so, (caddr_t)slp, + M_WAIT); + nfs_sndunlock(&slp->ns_solock); + } + error = nfsrv_dorec(slp, nd); + nd->nd_flag |= NFSD_REQINPROG; + } + } else { + error = 0; + slp = nd->nd_slp; + } + if (error || (slp->ns_flag & 
SLP_VALID) == 0) { + nd->nd_slp = (struct nfssvc_sock *)0; + nd->nd_flag &= ~NFSD_REQINPROG; + nfsrv_slpderef(slp); + continue; + } + splx(s); + so = slp->ns_so; + sotype = so->so_type; + starttime = time; + if (so->so_proto->pr_flags & PR_CONNREQUIRED) + solockp = &slp->ns_solock; + else + solockp = (int *)0; + /* + * nam == nam2 for connectionless protocols such as UDP + * nam2 == NULL for connection based protocols to disable + * recent request caching. + */ + if (nam2 = nd->nd_nam) { + nam = nam2; + cacherep = RC_CHECKIT; + } else { + nam = slp->ns_nam; + cacherep = RC_DOIT; + } + + /* + * Check to see if authorization is needed. + */ + if (nd->nd_flag & NFSD_NEEDAUTH) { + static int logauth = 0; + + nd->nd_flag &= ~NFSD_NEEDAUTH; + /* + * Check for a mapping already installed. + */ + uidp = slp->ns_uidh[NUIDHASH(nd->nd_cr.cr_uid)]; + while (uidp) { + if (uidp->nu_uid == nd->nd_cr.cr_uid) + break; + uidp = uidp->nu_hnext; + } + if (!uidp) { + nsd->nsd_uid = nd->nd_cr.cr_uid; + if (nam2 && logauth++ == 0) + log(LOG_WARNING, "Kerberized NFS using UDP\n"); + nsd->nsd_haddr = + mtod(nam, struct sockaddr_in *)->sin_addr.s_addr; + nsd->nsd_authlen = nd->nd_authlen; + if (copyout(nd->nd_authstr, nsd->nsd_authstr, + nd->nd_authlen) == 0 && + copyout((caddr_t)nsd, argp, sizeof (*nsd)) == 0) + return (ENEEDAUTH); + cacherep = RC_DROPIT; + } + } + if (cacherep == RC_CHECKIT) + cacherep = nfsrv_getcache(nam2, nd, &mreq); + + /* + * Check for just starting up for NQNFS and send + * fake "try again later" replies to the NQNFS clients. 
+ */ + if (notstarted && nqnfsstarttime <= time.tv_sec) { + if (modify_flag) { + nqnfsstarttime = time.tv_sec + nqsrv_writeslack; + modify_flag = 0; + } else + notstarted = 0; + } + if (notstarted) { + if (nd->nd_nqlflag == NQL_NOVAL) + cacherep = RC_DROPIT; + else if (nd->nd_procnum != NFSPROC_WRITE) { + nd->nd_procnum = NFSPROC_NOOP; + nd->nd_repstat = NQNFS_TRYLATER; + cacherep = RC_DOIT; + } else + modify_flag = 1; + } else if (nd->nd_flag & NFSD_AUTHFAIL) { + nd->nd_flag &= ~NFSD_AUTHFAIL; + nd->nd_procnum = NFSPROC_NOOP; + nd->nd_repstat = NQNFS_AUTHERR; + cacherep = RC_DOIT; + } + + switch (cacherep) { + case RC_DOIT: + error = (*(nfsrv_procs[nd->nd_procnum]))(nd, + nd->nd_mrep, nd->nd_md, nd->nd_dpos, &nd->nd_cr, + nam, &mreq); + if (nd->nd_cr.cr_ref != 1) { + printf("nfssvc cref=%d\n", nd->nd_cr.cr_ref); + panic("nfssvc cref"); + } + if (error) { + if (nd->nd_procnum != NQNFSPROC_VACATED) + nfsstats.srv_errs++; + if (nam2) { + nfsrv_updatecache(nam2, nd, FALSE, mreq); + m_freem(nam2); + } + break; + } + nfsstats.srvrpccnt[nd->nd_procnum]++; + if (nam2) + nfsrv_updatecache(nam2, nd, TRUE, mreq); + nd->nd_mrep = (struct mbuf *)0; + case RC_REPLY: + m = mreq; + siz = 0; + while (m) { + siz += m->m_len; + m = m->m_next; + } + if (siz <= 0 || siz > NFS_MAXPACKET) { + printf("mbuf siz=%d\n",siz); + panic("Bad nfs svc reply"); + } + m = mreq; + m->m_pkthdr.len = siz; + m->m_pkthdr.rcvif = (struct ifnet *)0; + /* + * For stream protocols, prepend a Sun RPC + * Record Mark. 
+ */ + if (sotype == SOCK_STREAM) { + M_PREPEND(m, NFSX_UNSIGNED, M_WAIT); + *mtod(m, u_long *) = htonl(0x80000000 | siz); + } + if (solockp) + (void) nfs_sndlock(solockp, (struct nfsreq *)0); + if (slp->ns_flag & SLP_VALID) + error = nfs_send(so, nam2, m, (struct nfsreq *)0); + else { + error = EPIPE; + m_freem(m); + } + if (nfsrtton) + nfsd_rt(&starttime, sotype, nd, nam, cacherep); + if (nam2) + MFREE(nam2, m); + if (nd->nd_mrep) + m_freem(nd->nd_mrep); + if (error == EPIPE) + nfsrv_zapsock(slp); + if (solockp) + nfs_sndunlock(solockp); + if (error == EINTR || error == ERESTART) { + nfsrv_slpderef(slp); + s = splnet(); + goto done; + } + break; + case RC_DROPIT: + if (nfsrtton) + nfsd_rt(&starttime, sotype, nd, nam, cacherep); + m_freem(nd->nd_mrep); + m_freem(nam2); + break; + }; + s = splnet(); + if (nfsrv_dorec(slp, nd)) { + nd->nd_flag &= ~NFSD_REQINPROG; + nd->nd_slp = (struct nfssvc_sock *)0; + nfsrv_slpderef(slp); + } + } +done: + remque(nd); + splx(s); + free((caddr_t)nd, M_NFSD); + nsd->nsd_nfsd = (struct nfsd *)0; + if (--nfs_numnfsd == 0) + nfsrv_init(TRUE); /* Reinitialize everything */ + return (error); +} + +/* + * Asynchronous I/O daemons for client nfs. + * They do read-ahead and write-behind operations on the block I/O cache. + * Never returns unless it fails or gets killed. 
+ */
+nfssvc_iod(p)
+	struct proc *p;
+{
+	register struct buf *bp;
+	register int i, myiod;
+	int error = 0;
+
+	/*
+	 * Claims a free slot in nfs_asyncdaemon[] (EBUSY when all
+	 * NFS_MAXASYNCDAEMON slots are taken), then alternates between
+	 * sleeping on its nfs_iodwant[] slot and draining nfs_bufq through
+	 * nfs_doio().  Returns the tsleep() error once the queue has been
+	 * drained one last time.
+	 */
+
+	/*
+	 * Assign my position or return error if too many already running
+	 */
+	myiod = -1;
+	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
+		if (nfs_asyncdaemon[i] == 0) {
+			nfs_asyncdaemon[i]++;
+			myiod = i;
+			break;
+		}
+	if (myiod == -1)
+		return (EBUSY);
+	nfs_numasync++;
+	/*
+	 * Just loop around doin our stuff until SIGKILL
+	 */
+	for (;;) {
+		/* Advertise this daemon as idle and wait for work. */
+		while (nfs_bufq.tqh_first == NULL && error == 0) {
+			nfs_iodwant[myiod] = p;
+			error = tsleep((caddr_t)&nfs_iodwant[myiod],
+				PWAIT | PCATCH, "nfsidl", 0);
+		}
+		while ((bp = nfs_bufq.tqh_first) != NULL) {
+			/* Take one off the front of the list */
+			TAILQ_REMOVE(&nfs_bufq, bp, b_freelist);
+			if (bp->b_flags & B_READ)
+			    (void) nfs_doio(bp, bp->b_rcred, (struct proc *)0);
+			else
+			    (void) nfs_doio(bp, bp->b_wcred, (struct proc *)0);
+		}
+		if (error) {
+			/*
+			 * Leaving: release the slot and withdraw the idle
+			 * advertisement, so nfs_iodwant[] never keeps pointing
+			 * at a daemon that is gone.  (Presumably whoever queues
+			 * buffers treats a non-null slot as a sleeping daemon
+			 * to wake -- verify against the consumer.)
+			 */
+			nfs_asyncdaemon[myiod] = 0;
+			nfs_iodwant[myiod] = (struct proc *)0;
+			nfs_numasync--;
+			return (error);
+		}
+	}
+}
+
+/*
+ * Shut down a socket associated with an nfssvc_sock structure.
+ * Should be called with the send lock set, if required.
+ * The trick here is to increment the sref at the start, so that the nfsds
+ * will stop using it and clear ns_flag at the end so that it will not be
+ * reassigned during cleanup.
+ * NOTE(review): the function body that follows clears the ns_flag bits
+ * first and does not touch ns_sref, so the ordering described here does
+ * not match the code -- confirm which is intended.
+ */ +nfsrv_zapsock(slp) + register struct nfssvc_sock *slp; +{ + register struct nfsuid *nuidp, *onuidp; + register int i; + struct socket *so; + struct file *fp; + struct mbuf *m; + + slp->ns_flag &= ~SLP_ALLFLAGS; /* clear every bit covered by SLP_ALLFLAGS */ + if (fp = slp->ns_fp) { + slp->ns_fp = (struct file *)0; + so = slp->ns_so; + so->so_upcall = NULL; + soshutdown(so, 2); + closef(fp, (struct proc *)0); + if (slp->ns_nam) + MFREE(slp->ns_nam, m); + m_freem(slp->ns_raw); + m_freem(slp->ns_rec); + nuidp = slp->ns_lrunext; /* free every nfsuid on the LRU ring; the ring head is slp itself */ + while (nuidp != (struct nfsuid *)slp) { + onuidp = nuidp; + nuidp = nuidp->nu_lrunext; + free((caddr_t)onuidp, M_NFSUID); + } + slp->ns_lrunext = slp->ns_lruprev = (struct nfsuid *)slp; + for (i = 0; i < NUIDHASHSIZ; i++) + slp->ns_uidh[i] = (struct nfsuid *)0; + } +} + +/* + * Get an authorization string for the uid by having the mount_nfs sitting + * on this mount point porpoise out of the kernel and do it. + * On success *auth_type and *auth_len are filled in from the mount point and + * *auth_str is the RPCAUTH_MAXSIZ buffer allocated here from M_TEMP + * (presumably released by the caller -- TODO confirm). Returns 0 or an error. + */ +nfs_getauth(nmp, rep, cred, auth_type, auth_str, auth_len) + register struct nfsmount *nmp; + struct nfsreq *rep; + struct ucred *cred; + int *auth_type; + char **auth_str; + int *auth_len; +{ + int error = 0; + + while ((nmp->nm_flag & NFSMNT_WAITAUTH) == 0) { + nmp->nm_flag |= NFSMNT_WANTAUTH; + (void) tsleep((caddr_t)&nmp->nm_authtype, PSOCK, + "nfsauth1", 2 * hz); + if (error = nfs_sigintr(nmp, rep, rep->r_procp)) { + nmp->nm_flag &= ~NFSMNT_WANTAUTH; + return (error); + } + } + nmp->nm_flag &= ~(NFSMNT_WAITAUTH | NFSMNT_WANTAUTH); + nmp->nm_authstr = *auth_str = (char *)malloc(RPCAUTH_MAXSIZ, M_TEMP, M_WAITOK); + nmp->nm_authuid = cred->cr_uid; + wakeup((caddr_t)&nmp->nm_authstr); + + /* + * And wait for mount_nfs to do its stuff. 
+ */ + while ((nmp->nm_flag & NFSMNT_HASAUTH) == 0 && error == 0) { + (void) tsleep((caddr_t)&nmp->nm_authlen, PSOCK, + "nfsauth2", 2 * hz); /* timed sleep so nfs_sigintr() is polled each time it expires */ + error = nfs_sigintr(nmp, rep, rep->r_procp); + } + if (nmp->nm_flag & NFSMNT_AUTHERR) { + nmp->nm_flag &= ~NFSMNT_AUTHERR; + error = EAUTH; + } + if (error) + free((caddr_t)*auth_str, M_TEMP); /* NOTE(review): nm_authstr and *auth_str still hold the freed address */ + else { + *auth_type = nmp->nm_authtype; + *auth_len = nmp->nm_authlen; + } + nmp->nm_flag &= ~NFSMNT_HASAUTH; + nmp->nm_flag |= NFSMNT_WAITAUTH; + if (nmp->nm_flag & NFSMNT_WANTAUTH) { + nmp->nm_flag &= ~NFSMNT_WANTAUTH; + wakeup((caddr_t)&nmp->nm_authtype); + } + return (error); +} + +/* + * Dereference a server socket structure. If it has no more references and + * is no longer valid, you can throw it away. + */ +void +nfsrv_slpderef(slp) + register struct nfssvc_sock *slp; +{ + if (--(slp->ns_sref) == 0 && (slp->ns_flag & SLP_VALID) == 0) { + slp->ns_prev->ns_next = slp->ns_next; + slp->ns_next->ns_prev = slp->ns_prev; + free((caddr_t)slp, M_NFSSVC); + } +} + +/* + * Initialize the data structures for the server. + * Handshake with any new nfsds starting up to avoid any chance of + * corruption. 
+ */
+void
+nfsrv_init(terminating)
+	int terminating;
+{
+	register struct nfssvc_sock *slp;
+	struct nfssvc_sock *nextslp;
+
+	/* Only one initialization may be in progress at a time. */
+	if ((nfssvc_sockhead.ns_flag & SLP_INIT) != 0)
+		panic("nfsd init");
+	nfssvc_sockhead.ns_flag |= SLP_INIT;
+
+	if (terminating) {
+		/* Tear down every socket on the list, then the reply cache. */
+		for (slp = nfssvc_sockhead.ns_next;
+		    slp != &nfssvc_sockhead; slp = nextslp) {
+			if (slp->ns_flag & SLP_VALID)
+				nfsrv_zapsock(slp);
+			nextslp = slp->ns_next;
+			nextslp->ns_prev = slp->ns_prev;
+			slp->ns_prev->ns_next = nextslp;
+			free((caddr_t)slp, M_NFSSVC);
+		}
+		nfsrv_cleancache();	/* And clear out server cache */
+	}
+
+	/* Fresh, zeroed placeholder entries for the UDP and CLTP sockets. */
+	nfs_udpsock = (struct nfssvc_sock *)
+	    malloc(sizeof (struct nfssvc_sock), M_NFSSVC, M_WAITOK);
+	bzero((caddr_t)nfs_udpsock, sizeof (struct nfssvc_sock));
+	nfs_cltpsock = (struct nfssvc_sock *)
+	    malloc(sizeof (struct nfssvc_sock), M_NFSSVC, M_WAITOK);
+	bzero((caddr_t)nfs_cltpsock, sizeof (struct nfssvc_sock));
+
+	/* Socket ring: head -> udp -> cltp -> head. */
+	nfssvc_sockhead.ns_next = nfs_udpsock;
+	nfssvc_sockhead.ns_prev = nfs_cltpsock;
+	nfs_udpsock->ns_prev = &nfssvc_sockhead;
+	nfs_udpsock->ns_next = nfs_cltpsock;
+	nfs_cltpsock->ns_prev = nfs_udpsock;
+	nfs_cltpsock->ns_next = &nfssvc_sockhead;
+
+	/* Each socket starts with an empty uid LRU ring headed by itself. */
+	nfs_udpsock->ns_lruprev = (struct nfsuid *)nfs_udpsock;
+	nfs_udpsock->ns_lrunext = (struct nfsuid *)nfs_udpsock;
+	nfs_cltpsock->ns_lruprev = (struct nfsuid *)nfs_cltpsock;
+	nfs_cltpsock->ns_lrunext = (struct nfsuid *)nfs_cltpsock;
+
+	/* Empty nfsd queue. */
+	nfsd_head.nd_prev = &nfsd_head;
+	nfsd_head.nd_next = &nfsd_head;
+	nfsd_head.nd_flag = 0;
+
+	/* Done: let anybody who was waiting for the init proceed. */
+	nfssvc_sockhead.ns_flag &= ~SLP_INIT;
+	if ((nfssvc_sockhead.ns_flag & SLP_WANTINIT) == 0)
+		return;
+	nfssvc_sockhead.ns_flag &= ~SLP_WANTINIT;
+	wakeup((caddr_t)&nfssvc_sockhead);
+}
+
+/*
+ * Add entries to the server monitor log.
+ */ +static void +nfsd_rt(startp, sotype, nd, nam, cacherep) + struct timeval *startp; + int sotype; + register struct nfsd *nd; + struct mbuf *nam; + int cacherep; +{ + register struct drt *rt; + + rt = &nfsdrt.drt[nfsdrt.pos]; + if (cacherep == RC_DOIT) + rt->flag = 0; + else if (cacherep == RC_REPLY) + rt->flag = DRT_CACHEREPLY; + else + rt->flag = DRT_CACHEDROP; + if (sotype == SOCK_STREAM) + rt->flag |= DRT_TCP; + if (nd->nd_nqlflag != NQL_NOVAL) + rt->flag |= DRT_NQNFS; + rt->proc = nd->nd_procnum; + if (mtod(nam, struct sockaddr *)->sa_family == AF_INET) + rt->ipadr = mtod(nam, struct sockaddr_in *)->sin_addr.s_addr; + else + rt->ipadr = INADDR_ANY; + rt->resptime = ((time.tv_sec - startp->tv_sec) * 1000000) + + (time.tv_usec - startp->tv_usec); + rt->tstamp = time; + nfsdrt.pos = (nfsdrt.pos + 1) % NFSRTTLOGSIZ; +} diff --git a/sys/nfsserver/nfsm_subs.h b/sys/nfsserver/nfsm_subs.h new file mode 100644 index 00000000000..879db360057 --- /dev/null +++ b/sys/nfsserver/nfsm_subs.h @@ -0,0 +1,269 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)nfsm_subs.h 8.1 (Berkeley) 6/16/93 + */ + +/* + * These macros do strange and peculiar things to mbuf chains for + * the assistance of the nfs code. To attempt to use them for any + * other purpose will be dangerous. (they make weird assumptions) + */ + +/* + * First define what the actual subs. return + */ +extern struct mbuf *nfsm_reqh(); + +#define M_HASCL(m) ((m)->m_flags & M_EXT) +#define NFSMINOFF(m) \ + if (M_HASCL(m)) \ + (m)->m_data = (m)->m_ext.ext_buf; \ + else if ((m)->m_flags & M_PKTHDR) \ + (m)->m_data = (m)->m_pktdat; \ + else \ + (m)->m_data = (m)->m_dat +#define NFSMADV(m, s) (m)->m_data += (s) +#define NFSMSIZ(m) ((M_HASCL(m))?MCLBYTES: \ + (((m)->m_flags & M_PKTHDR)?MHLEN:MLEN)) + +/* + * Now for the macros that do the simple stuff and call the functions + * for the hard stuff. + * These macros use several vars. declared in nfsm_reqhead and these + * vars. must not be used elsewhere unless you are careful not to corrupt + * them. The vars. starting with pN and tN (N=1,2,3,..) 
are temporaries + * that may be used so long as the value is not expected to be retained + * after a macro. + * I know, this is kind of dorky, but it makes the actual op functions + * fairly clean and deals with the mess caused by the xdr discriminating + * unions. + */ + +#define nfsm_build(a,c,s) \ + { if ((s) > M_TRAILINGSPACE(mb)) { \ + MGET(mb2, M_WAIT, MT_DATA); \ + if ((s) > MLEN) \ + panic("build > MLEN"); \ + mb->m_next = mb2; \ + mb = mb2; \ + mb->m_len = 0; \ + bpos = mtod(mb, caddr_t); \ + } \ + (a) = (c)(bpos); \ + mb->m_len += (s); \ + bpos += (s); } + +#define nfsm_dissect(a,c,s) \ + { t1 = mtod(md, caddr_t)+md->m_len-dpos; \ + if (t1 >= (s)) { \ + (a) = (c)(dpos); \ + dpos += (s); \ + } else if (error = nfsm_disct(&md, &dpos, (s), t1, &cp2)) { \ + m_freem(mrep); \ + goto nfsmout; \ + } else { \ + (a) = (c)cp2; \ + } } + +#define nfsm_fhtom(v) \ + nfsm_build(cp,caddr_t,NFSX_FH); \ + bcopy((caddr_t)&(VTONFS(v)->n_fh), cp, NFSX_FH) + +#define nfsm_srvfhtom(f) \ + nfsm_build(cp,caddr_t,NFSX_FH); \ + bcopy((caddr_t)(f), cp, NFSX_FH) + +#define nfsm_mtofh(d,v) \ + { struct nfsnode *np; nfsv2fh_t *fhp; \ + nfsm_dissect(fhp,nfsv2fh_t *,NFSX_FH); \ + if (error = nfs_nget((d)->v_mount, fhp, &np)) { \ + m_freem(mrep); \ + goto nfsmout; \ + } \ + (v) = NFSTOV(np); \ + nfsm_loadattr(v, (struct vattr *)0); \ + } + +#define nfsm_loadattr(v,a) \ + { struct vnode *tvp = (v); \ + if (error = nfs_loadattrcache(&tvp, &md, &dpos, (a))) { \ + m_freem(mrep); \ + goto nfsmout; \ + } \ + (v) = tvp; } + +#define nfsm_strsiz(s,m) \ + { nfsm_dissect(tl,u_long *,NFSX_UNSIGNED); \ + if (((s) = fxdr_unsigned(long,*tl)) > (m)) { \ + m_freem(mrep); \ + error = EBADRPC; \ + goto nfsmout; \ + } } + +#define nfsm_srvstrsiz(s,m) \ + { nfsm_dissect(tl,u_long *,NFSX_UNSIGNED); \ + if (((s) = fxdr_unsigned(long,*tl)) > (m) || (s) <= 0) { \ + error = EBADRPC; \ + nfsm_reply(0); \ + } } + +#define nfsm_mtouio(p,s) \ + if ((s) > 0 && \ + (error = nfsm_mbuftouio(&md,(p),(s),&dpos))) { \ + 
m_freem(mrep); \ + goto nfsmout; \ + } + +#define nfsm_uiotom(p,s) \ + if (error = nfsm_uiotombuf((p),&mb,(s),&bpos)) { \ + m_freem(mreq); \ + goto nfsmout; \ + } + +#define nfsm_reqhead(v,a,s) \ + mb = mreq = nfsm_reqh((v),(a),(s),&bpos) + +#define nfsm_reqdone m_freem(mrep); \ + nfsmout: + +#define nfsm_rndup(a) (((a)+3)&(~0x3)) + +#define nfsm_request(v, t, p, c) \ + if (error = nfs_request((v), mreq, (t), (p), \ + (c), &mrep, &md, &dpos)) \ + goto nfsmout + +#define nfsm_strtom(a,s,m) \ + if ((s) > (m)) { \ + m_freem(mreq); \ + error = ENAMETOOLONG; \ + goto nfsmout; \ + } \ + t2 = nfsm_rndup(s)+NFSX_UNSIGNED; \ + if (t2 <= M_TRAILINGSPACE(mb)) { \ + nfsm_build(tl,u_long *,t2); \ + *tl++ = txdr_unsigned(s); \ + *(tl+((t2>>2)-2)) = 0; \ + bcopy((caddr_t)(a), (caddr_t)tl, (s)); \ + } else if (error = nfsm_strtmbuf(&mb, &bpos, (a), (s))) { \ + m_freem(mreq); \ + goto nfsmout; \ + } + +#define nfsm_srvdone \ + nfsmout: \ + return(error) + +#define nfsm_reply(s) \ + { \ + nfsd->nd_repstat = error; \ + if (error) \ + (void) nfs_rephead(0, nfsd, error, cache, &frev, \ + mrq, &mb, &bpos); \ + else \ + (void) nfs_rephead((s), nfsd, error, cache, &frev, \ + mrq, &mb, &bpos); \ + m_freem(mrep); \ + mreq = *mrq; \ + if (error) \ + return(0); \ + } + +#define nfsm_adv(s) \ + t1 = mtod(md, caddr_t)+md->m_len-dpos; \ + if (t1 >= (s)) { \ + dpos += (s); \ + } else if (error = nfs_adv(&md, &dpos, (s), t1)) { \ + m_freem(mrep); \ + goto nfsmout; \ + } + +#define nfsm_srvmtofh(f) \ + nfsm_dissect(tl, u_long *, NFSX_FH); \ + bcopy((caddr_t)tl, (caddr_t)f, NFSX_FH) + +#define nfsm_clget \ + if (bp >= be) { \ + if (mp == mb) \ + mp->m_len += bp-bpos; \ + MGET(mp, M_WAIT, MT_DATA); \ + MCLGET(mp, M_WAIT); \ + mp->m_len = NFSMSIZ(mp); \ + mp2->m_next = mp; \ + mp2 = mp; \ + bp = mtod(mp, caddr_t); \ + be = bp+mp->m_len; \ + } \ + tl = (u_long *)bp + +#define nfsm_srvfillattr \ + fp->fa_type = vtonfs_type(vap->va_type); \ + fp->fa_mode = vtonfs_mode(vap->va_type, vap->va_mode); \ + 
fp->fa_nlink = txdr_unsigned(vap->va_nlink); \ + fp->fa_uid = txdr_unsigned(vap->va_uid); \ + fp->fa_gid = txdr_unsigned(vap->va_gid); \ + if (nfsd->nd_nqlflag == NQL_NOVAL) { \ + fp->fa_nfsblocksize = txdr_unsigned(vap->va_blocksize); \ + if (vap->va_type == VFIFO) \ + fp->fa_nfsrdev = 0xffffffff; \ + else \ + fp->fa_nfsrdev = txdr_unsigned(vap->va_rdev); \ + fp->fa_nfsfsid = txdr_unsigned(vap->va_fsid); \ + fp->fa_nfsfileid = txdr_unsigned(vap->va_fileid); \ + fp->fa_nfssize = txdr_unsigned(vap->va_size); \ + fp->fa_nfsblocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); \ + txdr_nfstime(&vap->va_atime, &fp->fa_nfsatime); \ + txdr_nfstime(&vap->va_mtime, &fp->fa_nfsmtime); \ + fp->fa_nfsctime.nfs_sec = txdr_unsigned(vap->va_ctime.ts_sec); \ + fp->fa_nfsctime.nfs_usec = txdr_unsigned(vap->va_gen); \ + } else { \ + fp->fa_nqblocksize = txdr_unsigned(vap->va_blocksize); \ + if (vap->va_type == VFIFO) \ + fp->fa_nqrdev = 0xffffffff; \ + else \ + fp->fa_nqrdev = txdr_unsigned(vap->va_rdev); \ + fp->fa_nqfsid = txdr_unsigned(vap->va_fsid); \ + fp->fa_nqfileid = txdr_unsigned(vap->va_fileid); \ + txdr_hyper(&vap->va_size, &fp->fa_nqsize); \ + txdr_hyper(&vap->va_bytes, &fp->fa_nqbytes); \ + txdr_nqtime(&vap->va_atime, &fp->fa_nqatime); \ + txdr_nqtime(&vap->va_mtime, &fp->fa_nqmtime); \ + txdr_nqtime(&vap->va_ctime, &fp->fa_nqctime); \ + fp->fa_nqflags = txdr_unsigned(vap->va_flags); \ + fp->fa_nqgen = txdr_unsigned(vap->va_gen); \ + txdr_hyper(&vap->va_filerev, &fp->fa_nqfilerev); \ + } + diff --git a/sys/nfsserver/nfsrvcache.h b/sys/nfsserver/nfsrvcache.h new file mode 100644 index 00000000000..26da2c275df --- /dev/null +++ b/sys/nfsserver/nfsrvcache.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)nfsrvcache.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Definitions for the server recent request cache + */ + +#define NFSRVCACHESIZ 256 + +struct nfsrvcache { + struct nfsrvcache *rc_forw; /* Hash chain links */ + struct nfsrvcache **rc_back; /* Hash chain links */ + struct nfsrvcache *rc_next; /* Lru list */ + struct nfsrvcache **rc_prev; /* Lru list */ + u_long rc_xid; /* rpc id number */ + union { + struct mbuf *ru_repmb; /* Reply mbuf list OR */ + int ru_repstat; /* Reply status */ + } rc_un; + union nethostaddr rc_haddr; /* Host address */ + short rc_proc; /* rpc proc number */ + u_char rc_state; /* Current state of request */ + u_char rc_flag; /* Flag bits */ +}; + +#define rc_reply rc_un.ru_repmb +#define rc_status rc_un.ru_repstat +#define rc_inetaddr rc_haddr.had_inetaddr +#define rc_nam rc_haddr.had_nam + +/* Cache entry states */ +#define RC_UNUSED 0 +#define RC_INPROG 1 +#define RC_DONE 2 + +/* Return values */ +#define RC_DROPIT 0 +#define RC_REPLY 1 +#define RC_DOIT 2 +#define RC_CHECKIT 3 + +/* Flag bits */ +#define RC_LOCKED 0x01 +#define RC_WANTED 0x02 +#define RC_REPSTATUS 0x04 +#define RC_REPMBUF 0x08 +#define RC_NQNFS 0x10 +#define RC_INETADDR 0x20 +#define RC_NAM 0x40 diff --git a/sys/nfsserver/nfsrvstats.h b/sys/nfsserver/nfsrvstats.h new file mode 100644 index 00000000000..261fd42657a --- /dev/null +++ b/sys/nfsserver/nfsrvstats.h @@ -0,0 +1,297 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)nfs.h 8.1 (Berkeley) 6/10/93 + */ + +/* + * Tunable constants for nfs + */ + +#define NFS_MAXIOVEC 34 +#define NFS_HZ 25 /* Ticks per second for NFS timeouts */ +#define NFS_TIMEO (1*NFS_HZ) /* Default timeout = 1 second */ +#define NFS_MINTIMEO (1*NFS_HZ) /* Min timeout to use */ +#define NFS_MAXTIMEO (60*NFS_HZ) /* Max timeout to backoff to */ +#define NFS_MINIDEMTIMEO (5*NFS_HZ) /* Min timeout for non-idempotent ops*/ +#define NFS_MAXREXMIT 100 /* Stop counting after this many */ +#define NFS_MAXWINDOW 1024 /* Max number of outstanding requests */ +#define NFS_RETRANS 10 /* Num of retrans for soft mounts */ +#define NFS_MAXGRPS 16 /* Max. size of groups list */ +#define NFS_MINATTRTIMO 5 /* Attribute cache timeout in sec */ +#define NFS_MAXATTRTIMO 60 +#define NFS_WSIZE 8192 /* Def. write data size <= 8192 */ +#define NFS_RSIZE 8192 /* Def. read data size <= 8192 */ +#define NFS_DEFRAHEAD 1 /* Def. read ahead # blocks */ +#define NFS_MAXRAHEAD 4 /* Max. read ahead # blocks */ +#define NFS_MAXREADDIR NFS_MAXDATA /* Max. size of directory read */ +#define NFS_MAXUIDHASH 64 /* Max. # of hashed uid entries/mp */ +#define NFS_MAXASYNCDAEMON 20 /* Max. number async_daemons runable */ +#define NFS_DIRBLKSIZ 1024 /* Size of an NFS directory block */ +#define NMOD(a) ((a) % nfs_asyncdaemons) + +/* + * Set the attribute timeout based on how recently the file has been modified. + */ +#define NFS_ATTRTIMEO(np) \ + ((((np)->n_flag & NMODIFIED) || \ + (time.tv_sec - (np)->n_mtime) / 10 < NFS_MINATTRTIMO) ? NFS_MINATTRTIMO : \ + ((time.tv_sec - (np)->n_mtime) / 10 > NFS_MAXATTRTIMO ? NFS_MAXATTRTIMO : \ + (time.tv_sec - (np)->n_mtime) / 10)) + +/* + * Structures for the nfssvc(2) syscall. Not that anyone but nfsd and mount_nfs + * should ever try and use it. 
+ */ +struct nfsd_args { + int sock; /* Socket to serve */ + caddr_t name; /* Client address for connection based sockets */ + int namelen; /* Length of name */ +}; + +struct nfsd_srvargs { + struct nfsd *nsd_nfsd; /* Pointer to in kernel nfsd struct */ + uid_t nsd_uid; /* Effective uid mapped to cred */ + u_long nsd_haddr; /* IP address of client */ + struct ucred nsd_cr; /* Cred. uid maps to */ + int nsd_authlen; /* Length of auth string (ret) */ + char *nsd_authstr; /* Auth string (ret) */ +}; + +struct nfsd_cargs { + char *ncd_dirp; /* Mount dir path */ + uid_t ncd_authuid; /* Effective uid */ + int ncd_authtype; /* Type of authenticator */ + int ncd_authlen; /* Length of authenticator string */ + char *ncd_authstr; /* Authenticator string */ +}; + +/* + * Stats structure + */ +struct nfsstats { + int attrcache_hits; + int attrcache_misses; + int lookupcache_hits; + int lookupcache_misses; + int direofcache_hits; + int direofcache_misses; + int biocache_reads; + int read_bios; + int read_physios; + int biocache_writes; + int write_bios; + int write_physios; + int biocache_readlinks; + int readlink_bios; + int biocache_readdirs; + int readdir_bios; + int rpccnt[NFS_NPROCS]; + int rpcretries; + int srvrpccnt[NFS_NPROCS]; + int srvrpc_errs; + int srv_errs; + int rpcrequests; + int rpctimeouts; + int rpcunexpected; + int rpcinvalid; + int srvcache_inproghits; + int srvcache_idemdonehits; + int srvcache_nonidemdonehits; + int srvcache_misses; + int srvnqnfs_leases; + int srvnqnfs_maxleases; + int srvnqnfs_getleases; +}; + +/* + * Flags for nfssvc() system call. + */ +#define NFSSVC_BIOD 0x002 +#define NFSSVC_NFSD 0x004 +#define NFSSVC_ADDSOCK 0x008 +#define NFSSVC_AUTHIN 0x010 +#define NFSSVC_GOTAUTH 0x040 +#define NFSSVC_AUTHINFAIL 0x080 +#define NFSSVC_MNTD 0x100 + +/* + * The set of signals that interrupt an I/O in progress for NFSMNT_INT mounts. 
+ * What should be in this set is open to debate, but I believe that since + * I/O system calls on ufs are never interrupted by signals the set should + * be minimal. My reasoning is that many current programs that use signals + * such as SIGALRM will not expect file I/O system calls to be interrupted + * by them and break. + */ +#ifdef KERNEL +#define NFSINT_SIGMASK (sigmask(SIGINT)|sigmask(SIGTERM)|sigmask(SIGKILL)| \ + sigmask(SIGHUP)|sigmask(SIGQUIT)) + +/* + * Socket errors ignored for connectionless sockets?? + * For now, ignore them all + */ +#define NFSIGNORE_SOERROR(s, e) \ + ((e) != EINTR && (e) != ERESTART && (e) != EWOULDBLOCK && \ + ((s) & PR_CONNREQUIRED) == 0) + +/* + * Nfs outstanding request list element + */ +struct nfsreq { + struct nfsreq *r_next; + struct nfsreq *r_prev; + struct mbuf *r_mreq; + struct mbuf *r_mrep; + struct mbuf *r_md; + caddr_t r_dpos; + struct nfsmount *r_nmp; + struct vnode *r_vp; + u_long r_xid; + int r_flags; /* flags on request, see below */ + int r_retry; /* max retransmission count */ + int r_rexmit; /* current retrans count */ + int r_timer; /* tick counter on reply */ + int r_procnum; /* NFS procedure number */ + int r_rtt; /* RTT for rpc */ + struct proc *r_procp; /* Proc that did I/O system call */ +}; + +/* Flag values for r_flags */ +#define R_TIMING 0x01 /* timing request (in mntp) */ +#define R_SENT 0x02 /* request has been sent */ +#define R_SOFTTERM 0x04 /* soft mnt, too many retries */ +#define R_INTR 0x08 /* intr mnt, signal pending */ +#define R_SOCKERR 0x10 /* Fatal error on socket */ +#define R_TPRINTFMSG 0x20 /* Did a tprintf msg. */ +#define R_MUSTRESEND 0x40 /* Must resend request */ +#define R_GETONEREP 0x80 /* Probe for one reply only */ + +struct nfsstats nfsstats; + +/* + * A list of nfssvc_sock structures is maintained with all the sockets + * that require service by the nfsd. + * The nfsuid structs hang off of the nfssvc_sock structs in both lru + * and uid hash lists. 
+ */ +#define NUIDHASHSIZ 32 +#define NUIDHASH(uid) ((uid) & (NUIDHASHSIZ - 1)) + +/* + * Network address hash list element + */ +union nethostaddr { + u_long had_inetaddr; + struct mbuf *had_nam; +}; + +struct nfsuid { + struct nfsuid *nu_lrunext; /* MUST be first */ + struct nfsuid *nu_lruprev; + struct nfsuid *nu_hnext; + struct nfsuid *nu_hprev; + int nu_flag; /* Flags */ + uid_t nu_uid; /* Uid mapped by this entry */ + union nethostaddr nu_haddr; /* Host addr. for dgram sockets */ + struct ucred nu_cr; /* Cred uid mapped to */ +}; + +#define nu_inetaddr nu_haddr.had_inetaddr +#define nu_nam nu_haddr.had_nam +/* Bits for nu_flag */ +#define NU_INETADDR 0x1 + +struct nfssvc_sock { + struct nfsuid *ns_lrunext; /* MUST be first */ + struct nfsuid *ns_lruprev; + struct nfssvc_sock *ns_next; + struct nfssvc_sock *ns_prev; + int ns_flag; + u_long ns_sref; + struct file *ns_fp; + struct socket *ns_so; + int ns_solock; + struct mbuf *ns_nam; + int ns_cc; + struct mbuf *ns_raw; + struct mbuf *ns_rawend; + int ns_reclen; + struct mbuf *ns_rec; + struct mbuf *ns_recend; + int ns_numuids; + struct nfsuid *ns_uidh[NUIDHASHSIZ]; +}; + +/* Bits for "ns_flag" */ +#define SLP_VALID 0x01 +#define SLP_DOREC 0x02 +#define SLP_NEEDQ 0x04 +#define SLP_DISCONN 0x08 +#define SLP_GETSTREAM 0x10 +#define SLP_INIT 0x20 +#define SLP_WANTINIT 0x40 + +#define SLP_ALLFLAGS 0xff + +/* + * One of these structures is allocated for each nfsd. + */ +struct nfsd { + struct nfsd *nd_next; /* Must be first */ + struct nfsd *nd_prev; + int nd_flag; /* NFSD_ flags */ + struct nfssvc_sock *nd_slp; /* Current socket */ + struct mbuf *nd_nam; /* Client addr for datagram req. */ + struct mbuf *nd_mrep; /* Req. mbuf list */ + struct mbuf *nd_md; + caddr_t nd_dpos; /* Position in list */ + int nd_procnum; /* RPC procedure number */ + u_long nd_retxid; /* RPC xid */ + int nd_repstat; /* Reply status value */ + struct ucred nd_cr; /* Credentials for req. 
*/ + int nd_nqlflag; /* Leasing flag */ + int nd_duration; /* Lease duration */ + int nd_authlen; /* Authenticator len */ + u_char nd_authstr[RPCAUTH_MAXSIZ]; /* Authenticator data */ + struct proc *nd_procp; /* Proc ptr */ +}; + +#define NFSD_WAITING 0x01 +#define NFSD_CHECKSLP 0x02 +#define NFSD_REQINPROG 0x04 +#define NFSD_NEEDAUTH 0x08 +#define NFSD_AUTHFAIL 0x10 +#endif /* KERNEL */ diff --git a/sys/sys/_sigset.h b/sys/sys/_sigset.h new file mode 100644 index 00000000000..8ccded41c3b --- /dev/null +++ b/sys/sys/_sigset.h @@ -0,0 +1,194 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)signal.h 8.2 (Berkeley) 1/21/94 + */ + +#ifndef _SYS_SIGNAL_H_ +#define _SYS_SIGNAL_H_ + +#define NSIG 32 /* counting 0; could be 33 (mask is 1-32) */ + +#ifndef _ANSI_SOURCE +#include /* sigcontext; codes for SIGILL, SIGFPE */ +#endif + +#define SIGHUP 1 /* hangup */ +#define SIGINT 2 /* interrupt */ +#define SIGQUIT 3 /* quit */ +#define SIGILL 4 /* illegal instruction (not reset when caught) */ +#ifndef _POSIX_SOURCE +#define SIGTRAP 5 /* trace trap (not reset when caught) */ +#endif +#define SIGABRT 6 /* abort() */ +#ifndef _POSIX_SOURCE +#define SIGIOT SIGABRT /* compatibility */ +#define SIGEMT 7 /* EMT instruction */ +#endif +#define SIGFPE 8 /* floating point exception */ +#define SIGKILL 9 /* kill (cannot be caught or ignored) */ +#ifndef _POSIX_SOURCE +#define SIGBUS 10 /* bus error */ +#endif +#define SIGSEGV 11 /* segmentation violation */ +#ifndef _POSIX_SOURCE +#define SIGSYS 12 /* bad argument to system call */ +#endif +#define SIGPIPE 13 /* write on a pipe with no one to read it */ +#define SIGALRM 14 /* alarm clock */ +#define SIGTERM 15 /* software termination signal from kill */ +#ifndef _POSIX_SOURCE +#define SIGURG 16 /* urgent condition on IO channel */ 
+#endif +#define SIGSTOP 17 /* sendable stop signal not from tty */ +#define SIGTSTP 18 /* stop signal from tty */ +#define SIGCONT 19 /* continue a stopped process */ +#define SIGCHLD 20 /* to parent on child stop or exit */ +#define SIGTTIN 21 /* to readers pgrp upon background tty read */ +#define SIGTTOU 22 /* like TTIN for output if (tp->t_local&LTOSTOP) */ +#ifndef _POSIX_SOURCE +#define SIGIO 23 /* input/output possible signal */ +#define SIGXCPU 24 /* exceeded CPU time limit */ +#define SIGXFSZ 25 /* exceeded file size limit */ +#define SIGVTALRM 26 /* virtual time alarm */ +#define SIGPROF 27 /* profiling time alarm */ +#define SIGWINCH 28 /* window size changes */ +#define SIGINFO 29 /* information request */ +#endif +#define SIGUSR1 30 /* user defined signal 1 */ +#define SIGUSR2 31 /* user defined signal 2 */ + +#if defined(_ANSI_SOURCE) || defined(__cplusplus) +/* + * Language spec sez we must list exactly one parameter, even though we + * actually supply three. Ugh! + */ +#define SIG_DFL (void (*)(int))0 +#define SIG_IGN (void (*)(int))1 +#define SIG_ERR (void (*)(int))-1 +#else +#define SIG_DFL (void (*)())0 +#define SIG_IGN (void (*)())1 +#define SIG_ERR (void (*)())-1 +#endif + +#ifndef _ANSI_SOURCE +typedef unsigned int sigset_t; + +/* + * Signal vector "template" used in sigaction call.
+ */ +struct sigaction { + void (*sa_handler)(); /* signal handler */ + sigset_t sa_mask; /* signal mask to apply */ + int sa_flags; /* see signal options below */ +}; +#ifndef _POSIX_SOURCE +#define SA_ONSTACK 0x0001 /* take signal on signal stack */ +#define SA_RESTART 0x0002 /* restart system on signal return */ +#define SA_DISABLE 0x0004 /* disable taking signals on alternate stack */ +#ifdef COMPAT_SUNOS +#define SA_USERTRAMP 0x0100 /* do not bounce off kernel's sigtramp */ +#endif +#endif +#define SA_NOCLDSTOP 0x0008 /* do not generate SIGCHLD on child stop */ + +/* + * Flags for sigprocmask: + */ +#define SIG_BLOCK 1 /* block specified signal set */ +#define SIG_UNBLOCK 2 /* unblock specified signal set */ +#define SIG_SETMASK 3 /* set specified signal set */ + +#ifndef _POSIX_SOURCE +#ifndef KERNEL +#include <sys/cdefs.h> +#endif +typedef void (*sig_t) __P((int)); /* type of signal function */ + +/* + * Structure used in sigaltstack call. + */ +struct sigaltstack { + char *ss_base; /* signal stack base */ + int ss_size; /* signal stack length */ + int ss_flags; /* SA_DISABLE and/or SA_ONSTACK */ +}; +#define MINSIGSTKSZ 8192 /* minimum allowable stack */ +#define SIGSTKSZ (MINSIGSTKSZ + 32768) /* recommended stack size */ + +/* + * 4.3 compatibility: + * Signal vector "template" used in sigvec call. + */ +struct sigvec { + void (*sv_handler)(); /* signal handler */ + int sv_mask; /* signal mask to apply */ + int sv_flags; /* see signal options below */ +}; + +#define SV_ONSTACK SA_ONSTACK +#define SV_INTERRUPT SA_RESTART /* same bit, opposite sense */ +#define sv_onstack sv_flags /* isn't compatibility wonderful! */ + +/* + * Structure used in sigstack call. + */ +struct sigstack { + char *ss_sp; /* signal stack pointer */ + int ss_onstack; /* current status */ +}; + +/* + * Macro for converting signal number to a mask suitable for + * sigblock().
+ */ +#define sigmask(m) (1 << ((m)-1)) + +#define BADSIG SIG_ERR + +#endif /* !_POSIX_SOURCE */ +#endif /* !_ANSI_SOURCE */ + +/* + * For historical reasons; programs expect signal's return value to be + * defined by <sys/signal.h>. + */ +__BEGIN_DECLS +void (*signal __P((int, void (*) __P((int))))) __P((int)); +__END_DECLS +#endif /* !_SYS_SIGNAL_H_ */ diff --git a/sys/sys/acct.h b/sys/sys/acct.h new file mode 100644 index 00000000000..edc5bdbd563 --- /dev/null +++ b/sys/sys/acct.h @@ -0,0 +1,75 @@ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)acct.h 8.2 (Berkeley) 1/21/94 + */ + +/* + * Accounting structures; these use a comp_t type which is a 3 bits base 8 + * exponent, 13 bit fraction ``floating point'' number. Units are 1/AHZ + * seconds. + */ +typedef u_short comp_t; + +struct acct { + char ac_comm[10]; /* command name */ + comp_t ac_utime; /* user time */ + comp_t ac_stime; /* system time */ + comp_t ac_etime; /* elapsed time */ + time_t ac_btime; /* starting time */ + uid_t ac_uid; /* user id */ + gid_t ac_gid; /* group id */ + short ac_mem; /* average memory usage */ + comp_t ac_io; /* count of IO blocks */ + dev_t ac_tty; /* controlling tty */ +#define AFORK 0x01 /* forked but not execed */ +#define ASU 0x02 /* used super-user permissions */ +#define ACOMPAT 0x04 /* used compatibility mode */ +#define ACORE 0x08 /* dumped core */ +#define AXSIG 0x10 /* killed by a signal */ + char ac_flag; /* accounting flags */ +}; + +/* + * 1/AHZ is the granularity of the data encoded in the comp_t fields. + * This is not necessarily equal to hz. 
+ */ +#define AHZ 64 + +#ifdef KERNEL +struct vnode *acctp; +#endif diff --git a/sys/sys/bio.h b/sys/sys/bio.h new file mode 100644 index 00000000000..e6c329f239d --- /dev/null +++ b/sys/sys/bio.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)buf.h 8.7 (Berkeley) 1/21/94 + */ + +#ifndef _SYS_BUF_H_ +#define _SYS_BUF_H_ +#include <sys/queue.h> + +#define NOLIST ((struct buf *)0x87654321) + +/* + * The buffer header describes an I/O operation in the kernel. + */ +struct buf { + LIST_ENTRY(buf) b_hash; /* Hash chain. */ + LIST_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */ + TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */ + struct buf *b_actf, **b_actb; /* Device driver queue when active. */ + struct proc *b_proc; /* Associated proc; NULL if kernel. */ + volatile long b_flags; /* B_* flags. */ + int b_error; /* Errno value. */ + long b_bufsize; /* Allocated buffer size. */ + long b_bcount; /* Valid bytes in buffer. */ + long b_resid; /* Remaining I/O. */ + dev_t b_dev; /* Device associated with buffer. */ + struct { + caddr_t b_addr; /* Memory, superblocks, indirect etc. */ + } b_un; + void *b_saveaddr; /* Original b_addr for physio. */ + daddr_t b_lblkno; /* Logical block number. */ + daddr_t b_blkno; /* Underlying physical block number. */ + /* Function to call upon completion. */ + void (*b_iodone) __P((struct buf *)); + struct vnode *b_vp; /* Device vnode. */ + int b_pfcent; /* Center page when swapping cluster. */ + int b_dirtyoff; /* Offset in buffer of dirty region. */ + int b_dirtyend; /* Offset of end of dirty region. */ + struct ucred *b_rcred; /* Read credentials reference. */ + struct ucred *b_wcred; /* Write credentials reference.
*/ + int b_validoff; /* Offset in buffer of valid region. */ + int b_validend; /* Offset of end of valid region. */ +}; + +/* Device driver compatibility definitions. */ +#define b_active b_bcount /* Driver queue head: drive active. */ +#define b_data b_un.b_addr /* b_un.b_addr is not changeable. */ +#define b_errcnt b_resid /* Retry count while I/O in progress. */ +#define iodone biodone /* Old name for biodone. */ +#define iowait biowait /* Old name for biowait. */ + +/* + * These flags are kept in b_flags. + */ +#define B_AGE 0x00000001 /* Move to age queue when I/O done. */ +#define B_APPENDWRITE 0x00000002 /* Append-write in progress. */ +#define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ +#define B_BAD 0x00000008 /* Bad block revectoring in progress. */ +#define B_BUSY 0x00000010 /* I/O in progress. */ +#define B_CACHE 0x00000020 /* Bread found us in the cache. */ +#define B_CALL 0x00000040 /* Call b_iodone from biodone. */ +#define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ +#define B_DIRTY 0x00000100 /* Dirty page to be pushed out async. */ +#define B_DONE 0x00000200 /* I/O completed. */ +#define B_EINTR 0x00000400 /* I/O was interrupted */ +#define B_ERROR 0x00000800 /* I/O error occurred. */ +#define B_GATHERED 0x00001000 /* LFS: already in a segment. */ +#define B_INVAL 0x00002000 /* Does not contain valid info. */ +#define B_LOCKED 0x00004000 /* Locked in core (not reusable). */ +#define B_NOCACHE 0x00008000 /* Do not cache block after use. */ +#define B_PAGET 0x00010000 /* Page in/out of page table space. */ +#define B_PGIN 0x00020000 /* Pagein op, so swap() can count it. */ +#define B_PHYS 0x00040000 /* I/O to user memory. */ +#define B_RAW 0x00080000 /* Set by physio for raw transfers. */ +#define B_READ 0x00100000 /* Read buffer. */ +#define B_TAPE 0x00200000 /* Magnetic tape I/O. */ +#define B_UAREA 0x00400000 /* Buffer describes Uarea I/O. */ +#define B_WANTED 0x00800000 /* Process wants this buffer. 
*/ +#define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ +#define B_WRITEINPROG 0x01000000 /* Write in progress. */ +#define B_XXX 0x02000000 /* Debugging flag. */ + +/* + * This structure describes a clustered I/O. It is stored in the b_saveaddr + * field of the buffer on which I/O is done. At I/O completion, cluster + * callback uses the structure to parcel I/O's to individual buffers, and + * then free's this structure. + */ +struct cluster_save { + long bs_bcount; /* Saved b_bcount. */ + long bs_bufsize; /* Saved b_bufsize. */ + void *bs_saveaddr; /* Saved b_addr. */ + int bs_nchildren; /* Number of associated buffers. */ + struct buf **bs_children; /* List of associated buffers. */ +}; + +/* + * Zero out the buffer's data area. + */ +#define clrbuf(bp) { \ + blkclr((bp)->b_data, (u_int)(bp)->b_bcount); \ + (bp)->b_resid = 0; \ +} + +/* Flags to low-level allocation routines. */ +#define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ +#define B_SYNC 0x02 /* Do all allocations synchronously. */ + +#ifdef KERNEL +int nbuf; /* The number of buffer headers */ +struct buf *buf; /* The buffer headers. */ +char *buffers; /* The buffer contents. */ +int bufpages; /* Number of memory pages in the buffer pool. */ +struct buf *swbuf; /* Swap I/O buffer headers. */ +int nswbuf; /* Number of swap I/O buffer headers. */ +struct buf bswlist; /* Head of swap I/O buffer headers free list. */ +struct buf *bclnlist; /* Head of cleaned page list. 
*/ + +__BEGIN_DECLS +int allocbuf __P((struct buf *, int)); +int bawrite __P((struct buf *)); +int bdwrite __P((struct buf *)); +void biodone __P((struct buf *)); +int biowait __P((struct buf *)); +int bread __P((struct vnode *, daddr_t, int, + struct ucred *, struct buf **)); +int breadn __P((struct vnode *, daddr_t, int, daddr_t *, int *, int, + struct ucred *, struct buf **)); +int brelse __P((struct buf *)); +void bufinit __P((void)); +int bwrite __P((struct buf *)); +void cluster_callback __P((struct buf *)); +int cluster_read __P((struct vnode *, u_quad_t, daddr_t, long, + struct ucred *, struct buf **)); +void cluster_write __P((struct buf *, u_quad_t)); +struct buf *getblk __P((struct vnode *, daddr_t, int, int, int)); +struct buf *geteblk __P((int)); +struct buf *getnewbuf __P((int slpflag, int slptimeo)); +struct buf *incore __P((struct vnode *, daddr_t)); +u_int minphys __P((struct buf *bp)); +__END_DECLS +#endif +#endif /* !_SYS_BUF_H_ */ diff --git a/sys/sys/buf.h b/sys/sys/buf.h new file mode 100644 index 00000000000..e6c329f239d --- /dev/null +++ b/sys/sys/buf.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)buf.h 8.7 (Berkeley) 1/21/94 + */ + +#ifndef _SYS_BUF_H_ +#define _SYS_BUF_H_ +#include <sys/queue.h> + +#define NOLIST ((struct buf *)0x87654321) + +/* + * The buffer header describes an I/O operation in the kernel. + */ +struct buf { + LIST_ENTRY(buf) b_hash; /* Hash chain. */ + LIST_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */ + TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */ + struct buf *b_actf, **b_actb; /* Device driver queue when active. */ + struct proc *b_proc; /* Associated proc; NULL if kernel.
*/ + volatile long b_flags; /* B_* flags. */ + int b_error; /* Errno value. */ + long b_bufsize; /* Allocated buffer size. */ + long b_bcount; /* Valid bytes in buffer. */ + long b_resid; /* Remaining I/O. */ + dev_t b_dev; /* Device associated with buffer. */ + struct { + caddr_t b_addr; /* Memory, superblocks, indirect etc. */ + } b_un; + void *b_saveaddr; /* Original b_addr for physio. */ + daddr_t b_lblkno; /* Logical block number. */ + daddr_t b_blkno; /* Underlying physical block number. */ + /* Function to call upon completion. */ + void (*b_iodone) __P((struct buf *)); + struct vnode *b_vp; /* Device vnode. */ + int b_pfcent; /* Center page when swapping cluster. */ + int b_dirtyoff; /* Offset in buffer of dirty region. */ + int b_dirtyend; /* Offset of end of dirty region. */ + struct ucred *b_rcred; /* Read credentials reference. */ + struct ucred *b_wcred; /* Write credentials reference. */ + int b_validoff; /* Offset in buffer of valid region. */ + int b_validend; /* Offset of end of valid region. */ +}; + +/* Device driver compatibility definitions. */ +#define b_active b_bcount /* Driver queue head: drive active. */ +#define b_data b_un.b_addr /* b_un.b_addr is not changeable. */ +#define b_errcnt b_resid /* Retry count while I/O in progress. */ +#define iodone biodone /* Old name for biodone. */ +#define iowait biowait /* Old name for biowait. */ + +/* + * These flags are kept in b_flags. + */ +#define B_AGE 0x00000001 /* Move to age queue when I/O done. */ +#define B_APPENDWRITE 0x00000002 /* Append-write in progress. */ +#define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ +#define B_BAD 0x00000008 /* Bad block revectoring in progress. */ +#define B_BUSY 0x00000010 /* I/O in progress. */ +#define B_CACHE 0x00000020 /* Bread found us in the cache. */ +#define B_CALL 0x00000040 /* Call b_iodone from biodone. */ +#define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ +#define B_DIRTY 0x00000100 /* Dirty page to be pushed out async. 
*/ +#define B_DONE 0x00000200 /* I/O completed. */ +#define B_EINTR 0x00000400 /* I/O was interrupted */ +#define B_ERROR 0x00000800 /* I/O error occurred. */ +#define B_GATHERED 0x00001000 /* LFS: already in a segment. */ +#define B_INVAL 0x00002000 /* Does not contain valid info. */ +#define B_LOCKED 0x00004000 /* Locked in core (not reusable). */ +#define B_NOCACHE 0x00008000 /* Do not cache block after use. */ +#define B_PAGET 0x00010000 /* Page in/out of page table space. */ +#define B_PGIN 0x00020000 /* Pagein op, so swap() can count it. */ +#define B_PHYS 0x00040000 /* I/O to user memory. */ +#define B_RAW 0x00080000 /* Set by physio for raw transfers. */ +#define B_READ 0x00100000 /* Read buffer. */ +#define B_TAPE 0x00200000 /* Magnetic tape I/O. */ +#define B_UAREA 0x00400000 /* Buffer describes Uarea I/O. */ +#define B_WANTED 0x00800000 /* Process wants this buffer. */ +#define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ +#define B_WRITEINPROG 0x01000000 /* Write in progress. */ +#define B_XXX 0x02000000 /* Debugging flag. */ + +/* + * This structure describes a clustered I/O. It is stored in the b_saveaddr + * field of the buffer on which I/O is done. At I/O completion, cluster + * callback uses the structure to parcel I/O's to individual buffers, and + * then free's this structure. + */ +struct cluster_save { + long bs_bcount; /* Saved b_bcount. */ + long bs_bufsize; /* Saved b_bufsize. */ + void *bs_saveaddr; /* Saved b_addr. */ + int bs_nchildren; /* Number of associated buffers. */ + struct buf **bs_children; /* List of associated buffers. */ +}; + +/* + * Zero out the buffer's data area. + */ +#define clrbuf(bp) { \ + blkclr((bp)->b_data, (u_int)(bp)->b_bcount); \ + (bp)->b_resid = 0; \ +} + +/* Flags to low-level allocation routines. */ +#define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ +#define B_SYNC 0x02 /* Do all allocations synchronously. 
*/ + +#ifdef KERNEL +int nbuf; /* The number of buffer headers */ +struct buf *buf; /* The buffer headers. */ +char *buffers; /* The buffer contents. */ +int bufpages; /* Number of memory pages in the buffer pool. */ +struct buf *swbuf; /* Swap I/O buffer headers. */ +int nswbuf; /* Number of swap I/O buffer headers. */ +struct buf bswlist; /* Head of swap I/O buffer headers free list. */ +struct buf *bclnlist; /* Head of cleaned page list. */ + +__BEGIN_DECLS +int allocbuf __P((struct buf *, int)); +int bawrite __P((struct buf *)); +int bdwrite __P((struct buf *)); +void biodone __P((struct buf *)); +int biowait __P((struct buf *)); +int bread __P((struct vnode *, daddr_t, int, + struct ucred *, struct buf **)); +int breadn __P((struct vnode *, daddr_t, int, daddr_t *, int *, int, + struct ucred *, struct buf **)); +int brelse __P((struct buf *)); +void bufinit __P((void)); +int bwrite __P((struct buf *)); +void cluster_callback __P((struct buf *)); +int cluster_read __P((struct vnode *, u_quad_t, daddr_t, long, + struct ucred *, struct buf **)); +void cluster_write __P((struct buf *, u_quad_t)); +struct buf *getblk __P((struct vnode *, daddr_t, int, int, int)); +struct buf *geteblk __P((int)); +struct buf *getnewbuf __P((int slpflag, int slptimeo)); +struct buf *incore __P((struct vnode *, daddr_t)); +u_int minphys __P((struct buf *bp)); +__END_DECLS +#endif +#endif /* !_SYS_BUF_H_ */ diff --git a/sys/sys/callout.h b/sys/sys/callout.h new file mode 100644 index 00000000000..d685e56d3f0 --- /dev/null +++ b/sys/sys/callout.h @@ -0,0 +1,51 @@ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)callout.h 8.2 (Berkeley) 1/21/94 + */ + +struct callout { + struct callout *c_next; /* next callout in queue */ + void *c_arg; /* function argument */ + void (*c_func) __P((void *)); /* function to call */ + int c_time; /* ticks to the event */ +}; + +#ifdef KERNEL +struct callout *callfree, *callout, calltodo; +int ncallout; +#endif diff --git a/sys/sys/cdefs.h b/sys/sys/cdefs.h new file mode 100644 index 00000000000..c104b9e964d --- /dev/null +++ b/sys/sys/cdefs.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Berkeley Software Design, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)cdefs.h 8.7 (Berkeley) 1/21/94 + */ + +#ifndef _CDEFS_H_ +#define _CDEFS_H_ + +#if defined(__cplusplus) +#define __BEGIN_DECLS extern "C" { +#define __END_DECLS } +#else +#define __BEGIN_DECLS +#define __END_DECLS +#endif + +/* + * The __CONCAT macro is used to concatenate parts of symbol names, e.g. + * with "#define OLD(foo) __CONCAT(old,foo)", OLD(foo) produces oldfoo. + * The __CONCAT macro is a bit tricky -- make sure you don't put spaces + * in between its arguments. __CONCAT can also concatenate double-quoted + * strings produced by the __STRING macro, but this only works with ANSI C.
+ */ +#if defined(__STDC__) || defined(__cplusplus) +#define __P(protos) protos /* full-blown ANSI C */ +#define __CONCAT(x,y) x ## y /* ANSI token pasting */ +#define __STRING(x) #x /* ANSI stringizing */ + +#define __const const /* define reserved names to standard */ +#define __signed signed +#define __volatile volatile +#if defined(__cplusplus) +#define __inline inline /* convert to C++ keyword */ +#else +#ifndef __GNUC__ +#define __inline /* delete GCC keyword */ +#endif /* !__GNUC__ */ +#endif /* !__cplusplus */ + +#else /* !(__STDC__ || __cplusplus) */ +#define __P(protos) () /* traditional C preprocessor */ +#define __CONCAT(x,y) x/**/y /* pre-ANSI idiom: the empty comment is relied on to glue x and y together */ +#define __STRING(x) "x" /* pre-ANSI idiom: relies on x being replaced inside the quotes */ + +#ifndef __GNUC__ +#define __const /* delete pseudo-ANSI C keywords */ +#define __inline +#define __signed +#define __volatile +/* + * In non-ANSI C environments, new programs will want ANSI-only C keywords + * deleted from the program and old programs will want them left alone. + * When using a compiler other than gcc, programs using the ANSI C keywords + * const, inline etc. as normal identifiers should define -DNO_ANSI_KEYWORDS. + * When using "gcc -traditional", we assume that this is the intent; if + * __GNUC__ is defined but __STDC__ is not, we leave the new keywords alone. + */ +#ifndef NO_ANSI_KEYWORDS +#define const /* delete ANSI C keywords */ +#define inline +#define signed +#define volatile +#endif +#endif /* !__GNUC__ */ +#endif /* !(__STDC__ || __cplusplus) */ + +/* + * GCC1 and some versions of GCC2 declare dead (non-returning) and + * pure (no side effects) functions using "volatile" and "const"; + * unfortunately, these then cause warnings under "-ansi -pedantic". + * GCC2 uses a new, peculiar __attribute__((attrs)) style. All of + * these work for GNU C++ (modulo a slight glitch in the C++ grammar + * in the distribution version of 2.5.5).
+ */ +#if !defined(__GNUC__) || __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 5) /* the minor number is compared only for major 2, so gcc 3.0 and later are never mistaken for pre-2.5 */ +#define __attribute__(x) /* delete __attribute__ if non-gcc or gcc older than 2.5 */ +#if defined(__GNUC__) && !defined(__STRICT_ANSI__) +#define __dead __volatile /* old gcc: mark dead/pure functions with volatile/const instead */ +#define __pure __const +#endif +#endif + +/* Delete pseudo-keywords wherever they are not available or needed. */ +#ifndef __dead /* not set above: both markers expand to nothing */ +#define __dead +#define __pure +#endif + +#endif /* !_CDEFS_H_ */ diff --git a/sys/sys/clist.h b/sys/sys/clist.h new file mode 100644 index 00000000000..bad26477015 --- /dev/null +++ b/sys/sys/clist.h @@ -0,0 +1,45 @@ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)clist.h 8.1 (Berkeley) 6/4/93 + */ + +struct cblock { /* one queue element holding two fixed-size char arrays; CBQSIZE and CBSIZE are not defined in this header and must come from an earlier include */ + struct cblock *c_next; /* next cblock in queue */ + char c_quote[CBQSIZE]; /* quoted characters */ + char c_info[CBSIZE]; /* characters */ +}; + +#ifdef KERNEL /* declarations only; the objects are defined elsewhere */ +extern struct cblock *cfree, *cfreelist; +extern int cfreecount, nclist; +#endif diff --git a/sys/sys/conf.h b/sys/sys/conf.h new file mode 100644 index 00000000000..58cb6fa8339 --- /dev/null +++ b/sys/sys/conf.h @@ -0,0 +1,123 @@ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)conf.h 8.3 (Berkeley) 1/21/94 + */ + +/* + * Definitions of device driver entry switches + */ + +struct buf; +struct proc; +struct tty; +struct uio; +struct vnode; /* forward declarations: this header only forms pointers to these types */ + +struct bdevsw { /* function-pointer table (open, close, strategy, ioctl, dump, psize) plus a flags word */ + int (*d_open) __P((dev_t dev, int oflags, int devtype, + struct proc *p)); + int (*d_close) __P((dev_t dev, int fflag, int devtype, + struct proc *p)); + int (*d_strategy) __P((struct buf *bp)); + int (*d_ioctl) __P((dev_t dev, int cmd, caddr_t data, + int fflag, struct proc *p)); + int (*d_dump) (); /* parameters vary by architecture */ + int (*d_psize) __P((dev_t dev)); + int d_flags; +}; + +#ifdef KERNEL +extern struct bdevsw bdevsw[]; +#endif + +struct cdevsw { /* function-pointer table (open, close, read, write, ioctl, stop, reset, select, mmap, strategy) plus the d_ttys pointer */ + int (*d_open) __P((dev_t dev, int oflags, int devtype, + struct proc *p)); + int (*d_close) __P((dev_t dev, int fflag, int devtype, + struct proc *)); + int (*d_read) __P((dev_t dev, struct uio *uio, int ioflag)); + int (*d_write) __P((dev_t dev, struct uio *uio, int ioflag)); + int (*d_ioctl) __P((dev_t dev, int cmd, caddr_t data, + int fflag, struct proc *p)); + int (*d_stop) __P((struct tty *tp, int rw)); + int (*d_reset) __P((int uban)); /* XXX */ + struct tty *d_ttys; + int (*d_select) __P((dev_t dev, int which, struct proc *p)); + int (*d_mmap) __P(()); /* parameter list left unspecified */ + int (*d_strategy) __P((struct buf *bp)); +}; + +#ifdef KERNEL +extern struct cdevsw cdevsw[]; + +/* symbolic sleep message strings */ +extern char devopn[], devio[], devwait[], devin[], devout[]; +extern char devioc[], devcls[]; +#endif + +struct linesw { /* eight hooks, each handed a struct tty pointer */ + int (*l_open) __P((dev_t dev, struct tty *tp)); + int (*l_close) __P((struct tty *tp, int flag)); + int (*l_read) __P((struct tty *tp, struct uio *uio, + int flag)); + int (*l_write) __P((struct tty *tp, struct uio *uio, + int flag)); + int (*l_ioctl) __P((struct tty *tp, int cmd, caddr_t data, + int flag, struct proc *p)); + int (*l_rint) __P((int c, struct tty *tp)); + int (*l_start) __P((struct tty *tp)); + int (*l_modem) __P((struct tty *tp, int flag)); +}; + +#ifdef KERNEL +extern struct linesw
linesw[]; +#endif + +struct swdevt { + dev_t sw_dev; + int sw_flags; + int sw_nblks; + struct vnode *sw_vp; +}; +#define SW_FREED 0x01 /* NOTE(review): the SW_* constants look like bit values for sw_flags -- confirm */ +#define SW_SEQUENTIAL 0x02 +#define sw_freed sw_flags /* XXX compat */ + +#ifdef KERNEL +extern struct swdevt swdevt[]; /* array defined elsewhere; its length is not visible here */ +#endif diff --git a/sys/sys/device.h b/sys/sys/device.h new file mode 100644 index 00000000000..0a233ed3e5c --- /dev/null +++ b/sys/sys/device.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This software was developed by the Computer Systems Engineering group + * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and + * contributed to Berkeley. + * + * All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Lawrence Berkeley Laboratory. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)device.h 8.2 (Berkeley) 2/17/94 + */ + +#ifndef _SYS_DEVICE_H_ +#define _SYS_DEVICE_H_ + +/* + * Minimal device structures. + * Note that all ``system'' device types are listed here. + */ +enum devclass { + DV_DULL, /* generic, no special info */ + DV_CPU, /* CPU (carries resource utilization) */ + DV_DISK, /* disk drive (label, etc) */ + DV_IFNET, /* network interface */ + DV_TAPE, /* tape device */ + DV_TTY /* serial line interface (???) */ +}; + +struct device { /* common per-device record; struct cfdata is only pointed to here and is defined further down in this header */ + enum devclass dv_class; /* this device's classification */ + struct device *dv_next; /* next in list of all */ + struct cfdata *dv_cfdata; /* config data that found us */ + int dv_unit; /* device unit number */ + char dv_xname[16]; /* external name (name + unit); fixed 16-byte buffer */ + struct device *dv_parent; /* pointer to parent device */ +}; + +/* `event' counters (use zero or more per device instance, as needed) */ +struct evcnt { + struct evcnt *ev_next; /* linked list */ + struct device *ev_dev; /* associated device */ + int ev_count; /* how many have occurred */ + char ev_name[8]; /* what to call them (systat display); fixed 8-byte buffer */ +}; + +/* + * Configuration data (i.e., data placed in ioconf.c).
+ */ +struct cfdata { + struct cfdriver *cf_driver; /* config driver */ + short cf_unit; /* unit number */ + short cf_fstate; /* finding state (below) */ + int *cf_loc; /* locators (machine dependent) */ + int cf_flags; /* flags from config */ + short *cf_parents; /* potential parents */ + void (**cf_ivstubs)(); /* config-generated vectors, if any */ +}; +#define FSTATE_NOTFOUND 0 /* cf_fstate value: has not been found */ +#define FSTATE_FOUND 1 /* cf_fstate value: has been found */ +#define FSTATE_STAR 2 /* cf_fstate value: duplicable */ + +typedef int (*cfmatch_t) __P((struct device *, struct cfdata *, void *)); /* type of cfdriver.cd_match */ + +/* + * `configuration' driver (what the machine-independent autoconf uses). + * As devices are found, they are applied against all the potential matches. + * The one with the best match is taken, and a device structure (plus any + * other data desired) is allocated. Pointers to these are placed into + * an array of pointers. The array itself must be dynamic since devices + * can be found long after the machine is up and running. + */ +struct cfdriver { + void **cd_devs; /* devices found */ + char *cd_name; /* device name */ + cfmatch_t cd_match; /* returns a match level */ + void (*cd_attach) __P((struct device *, struct device *, void *)); /* attach hook; the argument meanings are not stated in this header */ + enum devclass cd_class; /* device classification */ + size_t cd_devsize; /* size of dev data (for malloc); size_t must already be declared by the includer, this header includes nothing itself */ + void *cd_aux; /* additional driver, if any */ + int cd_ndevs; /* size of cd_devs array */ +}; + +/* + * Configuration printing functions, and their return codes. The second + * argument is NULL if the device was configured; otherwise it is the name + * of the parent device. The return value is ignored if the device was + * configured, so most functions can return UNCONF unconditionally. + */ +typedef int (*cfprint_t) __P((void *, char *)); +#define QUIET 0 /* print nothing */ +#define UNCONF 1 /* print " not configured\n" */ +#define UNSUPP 2 /* print " not supported\n" */ + +/* + * Pseudo-device attach information (function + number of pseudo-devs).
+ */ +struct pdevinit { + void (*pdev_attach) __P((int)); /* the attach function */ + int pdev_count; /* the number of pseudo-devs */ +}; + +struct device *alldevs; /* head of list of all devices; NOTE(review): defined here without "extern", so each includer gets a tentative definition -- confirm this is intended */ +struct evcnt *allevents; /* head of list of all events; same remark as alldevs */ + +struct cfdata *config_search __P((cfmatch_t, struct device *, void *)); /* prototypes only from here on; the definitions are not part of this header */ +struct cfdata *config_rootsearch __P((cfmatch_t, char *, void *)); +int config_found __P((struct device *, void *, cfprint_t)); +int config_rootfound __P((char *, void *)); +void config_attach __P((struct device *, struct cfdata *, void *, cfprint_t)); +void evcnt_attach __P((struct device *, const char *, struct evcnt *)); +#endif /* !_SYS_DEVICE_H_ */ diff --git a/sys/sys/dir.h b/sys/sys/dir.h new file mode 100644 index 00000000000..0c4cd679cee --- /dev/null +++ b/sys/sys/dir.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)dir.h 8.2 (Berkeley) 1/4/94 + */ + +/* + * The information in this file should be obtained from <dirent.h> + * and is provided solely (and temporarily) for backward compatibility. + */ + +#ifndef _SYS_DIR_H_ +#define _SYS_DIR_H_ + +#include <dirent.h> /* supplies struct dirent, d_namlen and MAXNAMLEN, all used below */ + +/* + * Backwards compatibility. + */ +#define direct dirent + +/* + * The DIRSIZ macro gives the minimum record length which will hold + * the directory entry. This requires the amount of space in struct direct + * without the d_name field, plus enough space for the name with a terminating + * null byte (dp->d_namlen+1), rounded up to a 4 byte boundary. + */ +#undef DIRSIZ +#define DIRSIZ(dp) \ + ((sizeof (struct direct) - (MAXNAMLEN+1)) + (((dp)->d_namlen+1 + 3) &~ 3)) + +#endif /* !_SYS_DIR_H_ */ diff --git a/sys/sys/dirent.h b/sys/sys/dirent.h new file mode 100644 index 00000000000..1c4b96aa29c --- /dev/null +++ b/sys/sys/dirent.h @@ -0,0 +1,76 @@ +/*- + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)dirent.h 8.1 (Berkeley) 6/2/93 + */ + +/* + * The dirent structure defines the format of directory entries returned by + * the getdirentries(2) system call. + * + * A directory entry has a struct dirent at the front of it, containing its + * inode number, the length of the entry, and the length of the name + * contained in the entry. These are followed by the name padded to a 4 + * byte boundary with null bytes. 
All names are guaranteed null terminated. + * The maximum length of a name in a directory is MAXNAMLEN. + */ + +struct dirent { + unsigned long d_fileno; /* file number of entry */ + unsigned short d_reclen; /* length of this record */ + unsigned char d_type; /* file type, see below */ + unsigned char d_namlen; /* length of string in d_name */ +#ifdef _POSIX_SOURCE /* MAXNAMLEN is not defined in this branch; the array size is the same 255 + 1 either way */ + char d_name[255 + 1]; /* name must be no longer than this */ +#else +#define MAXNAMLEN 255 + char d_name[MAXNAMLEN + 1]; /* name must be no longer than this */ +#endif +}; + +/* + * File types + */ +#define DT_UNKNOWN 0 +#define DT_FIFO 1 +#define DT_CHR 2 +#define DT_DIR 4 +#define DT_BLK 6 +#define DT_REG 8 +#define DT_LNK 10 +#define DT_SOCK 12 + +/* + * Convert between stat structure types and directory types. + */ +#define IFTODT(mode) (((mode) & 0170000) >> 12) /* keep the bits under mask 0170000 and shift them down 12 places, e.g. 0040000 -> 4 (DT_DIR) */ +#define DTTOIF(dirtype) ((dirtype) << 12) /* the reverse shift, e.g. 4 -> 0040000 */ diff --git a/sys/sys/disk.h b/sys/sys/disk.h new file mode 100644 index 00000000000..352ecf00274 --- /dev/null +++ b/sys/sys/disk.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This software was developed by the Computer Systems Engineering group + * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and + * contributed to Berkeley. + * + * All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Lawrence Berkeley Laboratory. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)disk.h 8.1 (Berkeley) 6/2/93 + * + * from: $Header: disk.h,v 1.5 92/11/19 04:33:03 torek Exp $ (LBL) + */ + +/* + * Disk device structures. + * + * Note that this is only a preliminary outline. The final disk structures + * may be somewhat different. 
+ */ +struct buf; + +struct dkdevice { /* embeds struct device and struct disklabel by value, so both types must be fully defined before this header is included; it includes nothing itself */ + struct device dk_dev; /* base device */ + struct dkdevice *dk_next; /* list of disks; not yet used */ + int dk_bps; /* xfer rate: bytes per second */ + int dk_bopenmask; /* block devices open */ + int dk_copenmask; /* character devices open */ + int dk_openmask; /* composite (bopen|copen) */ + int dk_state; /* label state ### */ + int dk_blkshift; /* shift to convert DEV_BSIZE to blks */ + int dk_byteshift; /* shift to convert bytes to blks */ + struct dkdriver *dk_driver; /* pointer to driver */ + daddr_t dk_labelsector; /* sector containing label */ + struct disklabel dk_label; /* label */ +}; + +struct dkdriver { /* d_strategy is the only member unless "notyet" is defined */ + void (*d_strategy) __P((struct buf *)); +#ifdef notyet + int (*d_open) __P((dev_t dev, int ifmt, int, struct proc *)); + int (*d_close) __P((dev_t dev, int, int ifmt, struct proc *)); + int (*d_ioctl) __P((dev_t dev, int cmd, caddr_t data, int fflag, + struct proc *)); + int (*d_dump) __P((dev_t)); + void (*d_start) __P((struct buf *, daddr_t)); + int (*d_mklabel) __P((struct dkdevice *)); +#endif +}; + +/* states (NOTE(review): presumably the values stored in dk_state -- confirm) */ +#define DK_CLOSED 0 /* drive is closed */ +#define DK_WANTOPEN 1 /* drive being opened */ +#define DK_WANTOPENRAW 2 /* drive being opened */ +#define DK_RDLABEL 3 /* label being read */ +#define DK_OPEN 4 /* label read, drive open */ +#define DK_OPENRAW 5 /* open without label */ + +#ifdef DISKSORT_STATS +/* + * Stats from disksort().
+ */ +struct disksort_stats { /* seven long counters, one per event named in the member comments */ + long ds_newhead; /* # new queue heads created */ + long ds_newtail; /* # new queue tails created */ + long ds_midfirst; /* # insertions into sort list */ + long ds_endfirst; /* # insertions at end of sort list */ + long ds_newsecond; /* # inversions (2nd lists) created */ + long ds_midsecond; /* # insertions into 2nd list */ + long ds_endsecond; /* # insertions at end of 2nd list */ +}; +#endif + +#ifdef KERNEL /* prototypes only; the parameter meanings are not stated in this header */ +void disksort __P((struct buf *, struct buf *)); +char *readdisklabel __P((struct dkdevice *, int)); +int setdisklabel __P((struct dkdevice *, struct disklabel *)); +int writedisklabel __P((struct dkdevice *, int)); +int diskerr __P((struct dkdevice *, struct buf *, char *, int, int)); +#endif diff --git a/sys/sys/disklabel.h b/sys/sys/disklabel.h new file mode 100644 index 00000000000..a25ee29e363 --- /dev/null +++ b/sys/sys/disklabel.h @@ -0,0 +1,332 @@ +/* + * Copyright (c) 1987, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)disklabel.h 8.1 (Berkeley) 6/2/93 + */ + +/* + * Disk description table, see disktab(5) + */ +#define _PATH_DISKTAB "/etc/disktab" +#define DISKTAB "/etc/disktab" /* deprecated */ + +/* + * Each disk has a label which includes information about the hardware + * disk geometry, filesystem partitions, and drive specific information. + * The label is in block 0 or 1, possibly offset from the beginning + * to leave room for a bootstrap, etc. + */ + +/* XXX these should be defined per controller (or drive) elsewhere, not here! */ +#ifdef i386 +#define LABELSECTOR 1 /* sector containing label */ +#define LABELOFFSET 0 /* offset of label in sector */ +#endif + +#ifndef LABELSECTOR +#define LABELSECTOR 0 /* sector containing label */ +#endif + +#ifndef LABELOFFSET +#define LABELOFFSET 64 /* offset of label in sector */ +#endif + +#define DISKMAGIC ((u_long) 0x82564557) /* The disk magic number */ +#ifndef MAXPARTITIONS +#define MAXPARTITIONS 8 +#endif + + +#ifndef LOCORE +struct disklabel { + u_long d_magic; /* the magic number */ + short d_type; /* drive type */ + short d_subtype; /* controller/d_type specific */ + char d_typename[16]; /* type name, e.g. 
"eagle" */ + /* + * d_packname contains the pack identifier and is returned when + * the disklabel is read off the disk or in-core copy. + * d_boot0 and d_boot1 are the (optional) names of the + * primary (block 0) and secondary (block 1-15) bootstraps + * as found in /usr/mdec. These are returned when using + * getdiskbyname(3) to retrieve the values from /etc/disktab. + */ +#if defined(KERNEL) || defined(STANDALONE) + char d_packname[16]; /* pack identifier */ +#else + union { + char un_d_packname[16]; /* pack identifier */ + struct { + char *un_d_boot0; /* primary bootstrap name */ + char *un_d_boot1; /* secondary bootstrap name */ + } un_b; + } d_un; +#define d_packname d_un.un_d_packname +#define d_boot0 d_un.un_b.un_d_boot0 +#define d_boot1 d_un.un_b.un_d_boot1 +#endif /* ! KERNEL or STANDALONE */ + /* disk geometry: */ + u_long d_secsize; /* # of bytes per sector */ + u_long d_nsectors; /* # of data sectors per track */ + u_long d_ntracks; /* # of tracks per cylinder */ + u_long d_ncylinders; /* # of data cylinders per unit */ + u_long d_secpercyl; /* # of data sectors per cylinder */ + u_long d_secperunit; /* # of data sectors per unit */ + /* + * Spares (bad sector replacements) below + * are not counted in d_nsectors or d_secpercyl. + * Spare sectors are assumed to be physical sectors + * which occupy space at the end of each track and/or cylinder. + */ + u_short d_sparespertrack; /* # of spare sectors per track */ + u_short d_sparespercyl; /* # of spare sectors per cylinder */ + /* + * Alternate cylinders include maintenance, replacement, + * configuration description areas, etc. + */ + u_long d_acylinders; /* # of alt. cylinders per unit */ + + /* hardware characteristics: */ + /* + * d_interleave, d_trackskew and d_cylskew describe perturbations + * in the media format used to compensate for a slow controller. + * Interleave is physical sector interleave, set up by the formatter + * or controller when formatting. 
When interleaving is in use, + * logically adjacent sectors are not physically contiguous, + * but instead are separated by some number of sectors. + * It is specified as the ratio of physical sectors traversed + * per logical sector. Thus an interleave of 1:1 implies contiguous + * layout, while 2:1 implies that logical sector 0 is separated + * by one sector from logical sector 1. + * d_trackskew is the offset of sector 0 on track N + * relative to sector 0 on track N-1 on the same cylinder. + * Finally, d_cylskew is the offset of sector 0 on cylinder N + * relative to sector 0 on cylinder N-1. + */ + u_short d_rpm; /* rotational speed */ + u_short d_interleave; /* hardware sector interleave */ + u_short d_trackskew; /* sector 0 skew, per track */ + u_short d_cylskew; /* sector 0 skew, per cylinder */ + u_long d_headswitch; /* head switch time, usec */ + u_long d_trkseek; /* track-to-track seek, usec */ + u_long d_flags; /* generic flags */ +#define NDDATA 5 + u_long d_drivedata[NDDATA]; /* drive-type specific information */ +#define NSPARE 5 + u_long d_spare[NSPARE]; /* reserved for future use */ + u_long d_magic2; /* the magic number (again) */ + u_short d_checksum; /* xor of data incl. 
partitions */ + + /* filesystem and partition information: */ + u_short d_npartitions; /* number of partitions in following */ + u_long d_bbsize; /* size of boot area at sn0, bytes */ + u_long d_sbsize; /* max size of fs superblock, bytes */ + struct partition { /* the partition table */ + u_long p_size; /* number of sectors in partition */ + u_long p_offset; /* starting sector */ + u_long p_fsize; /* filesystem basic fragment size */ + u_char p_fstype; /* filesystem type, see below */ + u_char p_frag; /* filesystem fragments per block */ + union { + u_short cpg; /* UFS: FS cylinders per group */ + u_short sgs; /* LFS: FS segment shift */ + } __partition_u1; +#define p_cpg __partition_u1.cpg +#define p_sgs __partition_u1.sgs + } d_partitions[MAXPARTITIONS]; /* actually may be more */ +}; +#else /* LOCORE */ + /* + * offsets for asm boot files: byte offsets of the same-named C members above, assuming 4-byte u_long. + */ + .set d_secsize,40 + .set d_nsectors,44 + .set d_ntracks,48 + .set d_ncylinders,52 + .set d_secpercyl,56 + .set d_secperunit,60 + .set d_end_,276 /* size of disk label (assumes MAXPARTITIONS is 8) */ +#endif /* LOCORE */ + +/* d_type values (9 is not assigned): */ +#define DTYPE_SMD 1 /* SMD, XSMD; VAX hp/up */ +#define DTYPE_MSCP 2 /* MSCP */ +#define DTYPE_DEC 3 /* other DEC (rk, rl) */ +#define DTYPE_SCSI 4 /* SCSI */ +#define DTYPE_ESDI 5 /* ESDI interface */ +#define DTYPE_ST506 6 /* ST506 etc. */ +#define DTYPE_HPIB 7 /* CS/80 on HP-IB */ +#define DTYPE_HPFL 8 /* HP Fiber-link */ +#define DTYPE_FLOPPY 10 /* floppy */ + +#ifdef DKTYPENAMES /* opt-in: each includer that defines this gets its own static copy */ +static char *dktypenames[] = { /* indexed by d_type value */ + "unknown", + "SMD", + "MSCP", + "old DEC", + "SCSI", + "ESDI", + "ST506", + "HP-IB", + "HP-FL", + "type 9", /* placeholder: no DTYPE_ value 9 */ + "floppy", + 0 /* terminator, not counted by DKMAXTYPES */ +}; +#define DKMAXTYPES (sizeof(dktypenames) / sizeof(dktypenames[0]) - 1) /* number of names, excluding the terminator */ +#endif + +/* + * Filesystem type and version. + * Used to interpret other filesystem-specific + * per-partition information.
+ */ +#define FS_UNUSED 0 /* unused */ +#define FS_SWAP 1 /* swap */ +#define FS_V6 2 /* Sixth Edition */ +#define FS_V7 3 /* Seventh Edition */ +#define FS_SYSV 4 /* System V */ +#define FS_V71K 5 /* V7 with 1K blocks (4.1, 2.9) */ +#define FS_V8 6 /* Eighth Edition, 4K blocks */ +#define FS_BSDFFS 7 /* 4.2BSD fast file system */ +#define FS_MSDOS 8 /* MSDOS file system */ +#define FS_BSDLFS 9 /* 4.4BSD log-structured file system */ +#define FS_OTHER 10 /* in use, but unknown/unsupported */ +#define FS_HPFS 11 /* OS/2 high-performance file system */ +#define FS_ISO9660 12 /* ISO 9660, normally CD-ROM */ +#define FS_BOOT 13 /* partition contains bootstrap */ + +#ifdef DKTYPENAMES /* same opt-in switch as dktypenames[] */ +static char *fstypenames[] = { /* indexed by FS_* value */ + "unused", + "swap", + "Version 6", + "Version 7", + "System V", + "4.1BSD", + "Eighth Edition", + "4.2BSD", + "MSDOS", + "4.4LFS", + "unknown", + "HPFS", + "ISO9660", + "boot", + 0 /* terminator, not counted by FSMAXTYPES */ +}; +#define FSMAXTYPES (sizeof(fstypenames) / sizeof(fstypenames[0]) - 1) /* number of names, excluding the terminator */ +#endif + +/* + * flags shared by various drives (presumably bits of d_flags -- TODO confirm): + */ +#define D_REMOVABLE 0x01 /* removable media */ +#define D_ECC 0x02 /* supports ECC */ +#define D_BADSECT 0x04 /* supports bad sector forw. */ +#define D_RAMDISK 0x08 /* disk emulator */ +#define D_CHAIN 0x10 /* can do back-back transfers */ + +/* + * Drive data for SMD. + */ +#define d_smdflags d_drivedata[0] /* slot 0 is also d_precompcyl and d_blind */ +#define D_SSE 0x1 /* supports skip sectoring */ +#define d_mindist d_drivedata[1] +#define d_maxdist d_drivedata[2] +#define d_sdist d_drivedata[3] + +/* + * Drive data for ST506. + */ +#define d_precompcyl d_drivedata[0] /* same slot as d_smdflags and d_blind */ +#define d_gap3 d_drivedata[1] /* used only when formatting */ + +/* + * Drive data for SCSI. + */ +#define d_blind d_drivedata[0] /* same slot as d_smdflags and d_precompcyl */ + +#ifndef LOCORE +/* + * Structure used to perform a format + * or other raw operation, returning data + * and/or register values. + * Register identification and format + * are device- and driver-dependent.
+ */ +struct format_op { /* argument of DIOCRFORMAT and DIOCWFORMAT */ + char *df_buf; + int df_count; /* value-result */ + daddr_t df_startblk; + int df_reg[8]; /* result */ +}; + +/* + * Structure used internally to retrieve + * information about a partition on a disk. + */ +struct partinfo { /* argument of DIOCGPART */ + struct disklabel *disklab; + struct partition *part; +}; + +/* + * Disk-specific ioctls. + */ + /* get and set disklabel; DIOCGPART used internally */ +#define DIOCGDINFO _IOR('d', 101, struct disklabel)/* get */ +#define DIOCSDINFO _IOW('d', 102, struct disklabel)/* set */ +#define DIOCWDINFO _IOW('d', 103, struct disklabel)/* set, update disk */ +#define DIOCGPART _IOW('d', 104, struct partinfo) /* get partition; NOTE(review): a "get" encoded with _IOW -- confirm before changing */ + +/* do format operation, read or write */ +#define DIOCRFORMAT _IOWR('d', 105, struct format_op) +#define DIOCWFORMAT _IOWR('d', 106, struct format_op) + +#define DIOCSSTEP _IOW('d', 107, int) /* set step rate */ +#define DIOCSRETRIES _IOW('d', 108, int) /* set # of retries */ +#define DIOCWLABEL _IOW('d', 109, int) /* write en/disable label */ + +#define DIOCSBAD _IOW('d', 110, struct dkbad) /* set kernel dkbad */ + +#endif /* LOCORE */ + +#if !defined(KERNEL) && !defined(LOCORE) + +#include <sys/cdefs.h> /* __BEGIN_DECLS, __END_DECLS, __P */ + +__BEGIN_DECLS +struct disklabel *getdiskbyname __P((const char *)); +__END_DECLS + +#endif diff --git a/sys/sys/diskmbr.h b/sys/sys/diskmbr.h new file mode 100644 index 00000000000..a25ee29e363 --- /dev/null +++ b/sys/sys/diskmbr.h @@ -0,0 +1,332 @@ +/* + * Copyright (c) 1987, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)disklabel.h 8.1 (Berkeley) 6/2/93 + */ + +/* + * Disk description table, see disktab(5) + */ +#define _PATH_DISKTAB "/etc/disktab" +#define DISKTAB "/etc/disktab" /* deprecated */ + +/* + * Each disk has a label which includes information about the hardware + * disk geometry, filesystem partitions, and drive specific information. + * The label is in block 0 or 1, possibly offset from the beginning + * to leave room for a bootstrap, etc. + */ + +/* XXX these should be defined per controller (or drive) elsewhere, not here! 
*/ +#ifdef i386 /* i386: label starts at byte 0 of sector 1 */ +#define LABELSECTOR 1 /* sector containing label */ +#define LABELOFFSET 0 /* offset of label in sector */ +#endif + +#ifndef LABELSECTOR /* fallback when not defined above */ +#define LABELSECTOR 0 /* sector containing label */ +#endif + +#ifndef LABELOFFSET /* fallback when not defined above */ +#define LABELOFFSET 64 /* offset of label in sector */ +#endif + +#define DISKMAGIC ((u_long) 0x82564557) /* The disk magic number */ +#ifndef MAXPARTITIONS +#define MAXPARTITIONS 8 /* the d_end_ constant in the LOCORE branch assumes this default */ +#endif + + +#ifndef LOCORE +struct disklabel { /* member offsets are repeated by hand in the LOCORE branch below */ + u_long d_magic; /* the magic number */ + short d_type; /* drive type */ + short d_subtype; /* controller/d_type specific */ + char d_typename[16]; /* type name, e.g. "eagle" */ + /* + * d_packname contains the pack identifier and is returned when + * the disklabel is read off the disk or in-core copy. + * d_boot0 and d_boot1 are the (optional) names of the + * primary (block 0) and secondary (block 1-15) bootstraps + * as found in /usr/mdec. These are returned when using + * getdiskbyname(3) to retrieve the values from /etc/disktab. + */ +#if defined(KERNEL) || defined(STANDALONE) + char d_packname[16]; /* pack identifier */ +#else + union { /* userland: the pack name shares storage with the two bootstrap-name pointers */ + char un_d_packname[16]; /* pack identifier */ + struct { + char *un_d_boot0; /* primary bootstrap name */ + char *un_d_boot1; /* secondary bootstrap name */ + } un_b; + } d_un; +#define d_packname d_un.un_d_packname +#define d_boot0 d_un.un_b.un_d_boot0 +#define d_boot1 d_un.un_b.un_d_boot1 +#endif /* ! KERNEL or STANDALONE */ + /* disk geometry: */ + u_long d_secsize; /* # of bytes per sector */ + u_long d_nsectors; /* # of data sectors per track */ + u_long d_ntracks; /* # of tracks per cylinder */ + u_long d_ncylinders; /* # of data cylinders per unit */ + u_long d_secpercyl; /* # of data sectors per cylinder */ + u_long d_secperunit; /* # of data sectors per unit */ + /* + * Spares (bad sector replacements) below + * are not counted in d_nsectors or d_secpercyl.
+ * Spare sectors are assumed to be physical sectors + * which occupy space at the end of each track and/or cylinder. + */ + u_short d_sparespertrack; /* # of spare sectors per track */ + u_short d_sparespercyl; /* # of spare sectors per cylinder */ + /* + * Alternate cylinders include maintenance, replacement, + * configuration description areas, etc. + */ + u_long d_acylinders; /* # of alt. cylinders per unit */ + + /* hardware characteristics: */ + /* + * d_interleave, d_trackskew and d_cylskew describe perturbations + * in the media format used to compensate for a slow controller. + * Interleave is physical sector interleave, set up by the formatter + * or controller when formatting. When interleaving is in use, + * logically adjacent sectors are not physically contiguous, + * but instead are separated by some number of sectors. + * It is specified as the ratio of physical sectors traversed + * per logical sector. Thus an interleave of 1:1 implies contiguous + * layout, while 2:1 implies that logical sector 0 is separated + * by one sector from logical sector 1. + * d_trackskew is the offset of sector 0 on track N + * relative to sector 0 on track N-1 on the same cylinder. + * Finally, d_cylskew is the offset of sector 0 on cylinder N + * relative to sector 0 on cylinder N-1. + */ + u_short d_rpm; /* rotational speed */ + u_short d_interleave; /* hardware sector interleave */ + u_short d_trackskew; /* sector 0 skew, per track */ + u_short d_cylskew; /* sector 0 skew, per cylinder */ + u_long d_headswitch; /* head switch time, usec */ + u_long d_trkseek; /* track-to-track seek, usec */ + u_long d_flags; /* generic flags */ +#define NDDATA 5 /* slots are named per drive type by the d_smdflags, d_precompcyl and d_blind macro groups */ + u_long d_drivedata[NDDATA]; /* drive-type specific information */ +#define NSPARE 5 + u_long d_spare[NSPARE]; /* reserved for future use */ + u_long d_magic2; /* the magic number (again) */ + u_short d_checksum; /* xor of data incl.
partitions */ + + /* filesystem and partition information: */ + u_short d_npartitions; /* number of partitions in following */ + u_long d_bbsize; /* size of boot area at sn0, bytes */ + u_long d_sbsize; /* max size of fs superblock, bytes */ + struct partition { /* the partition table */ + u_long p_size; /* number of sectors in partition */ + u_long p_offset; /* starting sector */ + u_long p_fsize; /* filesystem basic fragment size */ + u_char p_fstype; /* filesystem type, see below */ + u_char p_frag; /* filesystem fragments per block */ + union { /* one u_short whose meaning depends on the filesystem type */ + u_short cpg; /* UFS: FS cylinders per group */ + u_short sgs; /* LFS: FS segment shift */ + } __partition_u1; +#define p_cpg __partition_u1.cpg +#define p_sgs __partition_u1.sgs + } d_partitions[MAXPARTITIONS]; /* actually may be more */ +}; +#else /* LOCORE */ + /* + * offsets for asm boot files: byte offsets of the same-named C members above, assuming 4-byte u_long. + */ + .set d_secsize,40 + .set d_nsectors,44 + .set d_ntracks,48 + .set d_ncylinders,52 + .set d_secpercyl,56 + .set d_secperunit,60 + .set d_end_,276 /* size of disk label (assumes MAXPARTITIONS is 8) */ +#endif /* LOCORE */ + +/* d_type values (9 is not assigned): */ +#define DTYPE_SMD 1 /* SMD, XSMD; VAX hp/up */ +#define DTYPE_MSCP 2 /* MSCP */ +#define DTYPE_DEC 3 /* other DEC (rk, rl) */ +#define DTYPE_SCSI 4 /* SCSI */ +#define DTYPE_ESDI 5 /* ESDI interface */ +#define DTYPE_ST506 6 /* ST506 etc. */ +#define DTYPE_HPIB 7 /* CS/80 on HP-IB */ +#define DTYPE_HPFL 8 /* HP Fiber-link */ +#define DTYPE_FLOPPY 10 /* floppy */ + +#ifdef DKTYPENAMES /* opt-in: each includer that defines this gets its own static copy */ +static char *dktypenames[] = { /* indexed by d_type value */ + "unknown", + "SMD", + "MSCP", + "old DEC", + "SCSI", + "ESDI", + "ST506", + "HP-IB", + "HP-FL", + "type 9", /* placeholder: no DTYPE_ value 9 */ + "floppy", + 0 /* terminator, not counted by DKMAXTYPES */ +}; +#define DKMAXTYPES (sizeof(dktypenames) / sizeof(dktypenames[0]) - 1) /* number of names, excluding the terminator */ +#endif + +/* + * Filesystem type and version. + * Used to interpret other filesystem-specific + * per-partition information.
+ */ +#define FS_UNUSED 0 /* unused */ +#define FS_SWAP 1 /* swap */ +#define FS_V6 2 /* Sixth Edition */ +#define FS_V7 3 /* Seventh Edition */ +#define FS_SYSV 4 /* System V */ +#define FS_V71K 5 /* V7 with 1K blocks (4.1, 2.9) */ +#define FS_V8 6 /* Eighth Edition, 4K blocks */ +#define FS_BSDFFS 7 /* 4.2BSD fast file system */ +#define FS_MSDOS 8 /* MSDOS file system */ +#define FS_BSDLFS 9 /* 4.4BSD log-structured file system */ +#define FS_OTHER 10 /* in use, but unknown/unsupported */ +#define FS_HPFS 11 /* OS/2 high-performance file system */ +#define FS_ISO9660 12 /* ISO 9660, normally CD-ROM */ +#define FS_BOOT 13 /* partition contains bootstrap */ + +#ifdef DKTYPENAMES /* same opt-in switch as dktypenames[] */ +static char *fstypenames[] = { /* indexed by FS_* value */ + "unused", + "swap", + "Version 6", + "Version 7", + "System V", + "4.1BSD", + "Eighth Edition", + "4.2BSD", + "MSDOS", + "4.4LFS", + "unknown", + "HPFS", + "ISO9660", + "boot", + 0 /* terminator, not counted by FSMAXTYPES */ +}; +#define FSMAXTYPES (sizeof(fstypenames) / sizeof(fstypenames[0]) - 1) /* number of names, excluding the terminator */ +#endif + +/* + * flags shared by various drives (presumably bits of d_flags -- TODO confirm): + */ +#define D_REMOVABLE 0x01 /* removable media */ +#define D_ECC 0x02 /* supports ECC */ +#define D_BADSECT 0x04 /* supports bad sector forw. */ +#define D_RAMDISK 0x08 /* disk emulator */ +#define D_CHAIN 0x10 /* can do back-back transfers */ + +/* + * Drive data for SMD. + */ +#define d_smdflags d_drivedata[0] /* slot 0 is also d_precompcyl and d_blind */ +#define D_SSE 0x1 /* supports skip sectoring */ +#define d_mindist d_drivedata[1] +#define d_maxdist d_drivedata[2] +#define d_sdist d_drivedata[3] + +/* + * Drive data for ST506. + */ +#define d_precompcyl d_drivedata[0] /* same slot as d_smdflags and d_blind */ +#define d_gap3 d_drivedata[1] /* used only when formatting */ + +/* + * Drive data for SCSI. + */ +#define d_blind d_drivedata[0] /* same slot as d_smdflags and d_precompcyl */ + +#ifndef LOCORE +/* + * Structure used to perform a format + * or other raw operation, returning data + * and/or register values. + * Register identification and format + * are device- and driver-dependent.
+ */ +struct format_op { /* argument of DIOCRFORMAT and DIOCWFORMAT */ + char *df_buf; + int df_count; /* value-result */ + daddr_t df_startblk; + int df_reg[8]; /* result */ +}; + +/* + * Structure used internally to retrieve + * information about a partition on a disk. + */ +struct partinfo { /* argument of DIOCGPART */ + struct disklabel *disklab; + struct partition *part; +}; + +/* + * Disk-specific ioctls. + */ + /* get and set disklabel; DIOCGPART used internally */ +#define DIOCGDINFO _IOR('d', 101, struct disklabel)/* get */ +#define DIOCSDINFO _IOW('d', 102, struct disklabel)/* set */ +#define DIOCWDINFO _IOW('d', 103, struct disklabel)/* set, update disk */ +#define DIOCGPART _IOW('d', 104, struct partinfo) /* get partition; NOTE(review): a "get" encoded with _IOW -- confirm before changing */ + +/* do format operation, read or write */ +#define DIOCRFORMAT _IOWR('d', 105, struct format_op) +#define DIOCWFORMAT _IOWR('d', 106, struct format_op) + +#define DIOCSSTEP _IOW('d', 107, int) /* set step rate */ +#define DIOCSRETRIES _IOW('d', 108, int) /* set # of retries */ +#define DIOCWLABEL _IOW('d', 109, int) /* write en/disable label */ + +#define DIOCSBAD _IOW('d', 110, struct dkbad) /* set kernel dkbad */ + +#endif /* LOCORE */ + +#if !defined(KERNEL) && !defined(LOCORE) + +#include <sys/cdefs.h> /* __BEGIN_DECLS, __END_DECLS, __P */ + +__BEGIN_DECLS +struct disklabel *getdiskbyname __P((const char *)); +__END_DECLS + +#endif diff --git a/sys/sys/diskpc98.h b/sys/sys/diskpc98.h new file mode 100644 index 00000000000..a25ee29e363 --- /dev/null +++ b/sys/sys/diskpc98.h @@ -0,0 +1,332 @@ +/* + * Copyright (c) 1987, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)disklabel.h 8.1 (Berkeley) 6/2/93 + */ + +/* + * Disk description table, see disktab(5) + */ +#define _PATH_DISKTAB "/etc/disktab" +#define DISKTAB "/etc/disktab" /* deprecated */ + +/* + * Each disk has a label which includes information about the hardware + * disk geometry, filesystem partitions, and drive specific information. + * The label is in block 0 or 1, possibly offset from the beginning + * to leave room for a bootstrap, etc. + */ + +/* XXX these should be defined per controller (or drive) elsewhere, not here! 
*/ +#ifdef i386 /* i386: label starts at byte 0 of sector 1 */ +#define LABELSECTOR 1 /* sector containing label */ +#define LABELOFFSET 0 /* offset of label in sector */ +#endif + +#ifndef LABELSECTOR /* fallback when not defined above */ +#define LABELSECTOR 0 /* sector containing label */ +#endif + +#ifndef LABELOFFSET /* fallback when not defined above */ +#define LABELOFFSET 64 /* offset of label in sector */ +#endif + +#define DISKMAGIC ((u_long) 0x82564557) /* The disk magic number */ +#ifndef MAXPARTITIONS +#define MAXPARTITIONS 8 /* the d_end_ constant in the LOCORE branch assumes this default */ +#endif + + +#ifndef LOCORE +struct disklabel { /* member offsets are repeated by hand in the LOCORE branch below */ + u_long d_magic; /* the magic number */ + short d_type; /* drive type */ + short d_subtype; /* controller/d_type specific */ + char d_typename[16]; /* type name, e.g. "eagle" */ + /* + * d_packname contains the pack identifier and is returned when + * the disklabel is read off the disk or in-core copy. + * d_boot0 and d_boot1 are the (optional) names of the + * primary (block 0) and secondary (block 1-15) bootstraps + * as found in /usr/mdec. These are returned when using + * getdiskbyname(3) to retrieve the values from /etc/disktab. + */ +#if defined(KERNEL) || defined(STANDALONE) + char d_packname[16]; /* pack identifier */ +#else + union { /* userland: the pack name shares storage with the two bootstrap-name pointers */ + char un_d_packname[16]; /* pack identifier */ + struct { + char *un_d_boot0; /* primary bootstrap name */ + char *un_d_boot1; /* secondary bootstrap name */ + } un_b; + } d_un; +#define d_packname d_un.un_d_packname +#define d_boot0 d_un.un_b.un_d_boot0 +#define d_boot1 d_un.un_b.un_d_boot1 +#endif /* ! KERNEL or STANDALONE */ + /* disk geometry: */ + u_long d_secsize; /* # of bytes per sector */ + u_long d_nsectors; /* # of data sectors per track */ + u_long d_ntracks; /* # of tracks per cylinder */ + u_long d_ncylinders; /* # of data cylinders per unit */ + u_long d_secpercyl; /* # of data sectors per cylinder */ + u_long d_secperunit; /* # of data sectors per unit */ + /* + * Spares (bad sector replacements) below + * are not counted in d_nsectors or d_secpercyl.
+ * Spare sectors are assumed to be physical sectors + * which occupy space at the end of each track and/or cylinder. + */ + u_short d_sparespertrack; /* # of spare sectors per track */ + u_short d_sparespercyl; /* # of spare sectors per cylinder */ + /* + * Alternate cylinders include maintenance, replacement, + * configuration description areas, etc. + */ + u_long d_acylinders; /* # of alt. cylinders per unit */ + + /* hardware characteristics: */ + /* + * d_interleave, d_trackskew and d_cylskew describe perturbations + * in the media format used to compensate for a slow controller. + * Interleave is physical sector interleave, set up by the formatter + * or controller when formatting. When interleaving is in use, + * logically adjacent sectors are not physically contiguous, + * but instead are separated by some number of sectors. + * It is specified as the ratio of physical sectors traversed + * per logical sector. Thus an interleave of 1:1 implies contiguous + * layout, while 2:1 implies that logical sector 0 is separated + * by one sector from logical sector 1. + * d_trackskew is the offset of sector 0 on track N + * relative to sector 0 on track N-1 on the same cylinder. + * Finally, d_cylskew is the offset of sector 0 on cylinder N + * relative to sector 0 on cylinder N-1. + */ + u_short d_rpm; /* rotational speed */ + u_short d_interleave; /* hardware sector interleave */ + u_short d_trackskew; /* sector 0 skew, per track */ + u_short d_cylskew; /* sector 0 skew, per cylinder */ + u_long d_headswitch; /* head switch time, usec */ + u_long d_trkseek; /* track-to-track seek, usec */ + u_long d_flags; /* generic flags */ +#define NDDATA 5 /* slots are named per drive type by the d_smdflags, d_precompcyl and d_blind macro groups */ + u_long d_drivedata[NDDATA]; /* drive-type specific information */ +#define NSPARE 5 + u_long d_spare[NSPARE]; /* reserved for future use */ + u_long d_magic2; /* the magic number (again) */ + u_short d_checksum; /* xor of data incl.
partitions */ + + /* filesystem and partition information: */ + u_short d_npartitions; /* number of partitions in following */ + u_long d_bbsize; /* size of boot area at sn0, bytes */ + u_long d_sbsize; /* max size of fs superblock, bytes */ + struct partition { /* the partition table */ + u_long p_size; /* number of sectors in partition */ + u_long p_offset; /* starting sector */ + u_long p_fsize; /* filesystem basic fragment size */ + u_char p_fstype; /* filesystem type, see below */ + u_char p_frag; /* filesystem fragments per block */ + union { /* one u_short whose meaning depends on the filesystem type */ + u_short cpg; /* UFS: FS cylinders per group */ + u_short sgs; /* LFS: FS segment shift */ + } __partition_u1; +#define p_cpg __partition_u1.cpg +#define p_sgs __partition_u1.sgs + } d_partitions[MAXPARTITIONS]; /* actually may be more */ +}; +#else /* LOCORE */ + /* + * offsets for asm boot files: byte offsets of the same-named C members above, assuming 4-byte u_long. + */ + .set d_secsize,40 + .set d_nsectors,44 + .set d_ntracks,48 + .set d_ncylinders,52 + .set d_secpercyl,56 + .set d_secperunit,60 + .set d_end_,276 /* size of disk label (assumes MAXPARTITIONS is 8) */ +#endif /* LOCORE */ + +/* d_type values (9 is not assigned): */ +#define DTYPE_SMD 1 /* SMD, XSMD; VAX hp/up */ +#define DTYPE_MSCP 2 /* MSCP */ +#define DTYPE_DEC 3 /* other DEC (rk, rl) */ +#define DTYPE_SCSI 4 /* SCSI */ +#define DTYPE_ESDI 5 /* ESDI interface */ +#define DTYPE_ST506 6 /* ST506 etc. */ +#define DTYPE_HPIB 7 /* CS/80 on HP-IB */ +#define DTYPE_HPFL 8 /* HP Fiber-link */ +#define DTYPE_FLOPPY 10 /* floppy */ + +#ifdef DKTYPENAMES /* opt-in: each includer that defines this gets its own static copy */ +static char *dktypenames[] = { /* indexed by d_type value */ + "unknown", + "SMD", + "MSCP", + "old DEC", + "SCSI", + "ESDI", + "ST506", + "HP-IB", + "HP-FL", + "type 9", /* placeholder: no DTYPE_ value 9 */ + "floppy", + 0 /* terminator, not counted by DKMAXTYPES */ +}; +#define DKMAXTYPES (sizeof(dktypenames) / sizeof(dktypenames[0]) - 1) /* number of names, excluding the terminator */ +#endif + +/* + * Filesystem type and version. + * Used to interpret other filesystem-specific + * per-partition information.
+ */ +#define FS_UNUSED 0 /* unused */ +#define FS_SWAP 1 /* swap */ +#define FS_V6 2 /* Sixth Edition */ +#define FS_V7 3 /* Seventh Edition */ +#define FS_SYSV 4 /* System V */ +#define FS_V71K 5 /* V7 with 1K blocks (4.1, 2.9) */ +#define FS_V8 6 /* Eighth Edition, 4K blocks */ +#define FS_BSDFFS 7 /* 4.2BSD fast file system */ +#define FS_MSDOS 8 /* MSDOS file system */ +#define FS_BSDLFS 9 /* 4.4BSD log-structured file system */ +#define FS_OTHER 10 /* in use, but unknown/unsupported */ +#define FS_HPFS 11 /* OS/2 high-performance file system */ +#define FS_ISO9660 12 /* ISO 9660, normally CD-ROM */ +#define FS_BOOT 13 /* partition contains bootstrap */ + +#ifdef DKTYPENAMES /* same opt-in switch as dktypenames[] */ +static char *fstypenames[] = { /* indexed by FS_* value */ + "unused", + "swap", + "Version 6", + "Version 7", + "System V", + "4.1BSD", + "Eighth Edition", + "4.2BSD", + "MSDOS", + "4.4LFS", + "unknown", + "HPFS", + "ISO9660", + "boot", + 0 /* terminator, not counted by FSMAXTYPES */ +}; +#define FSMAXTYPES (sizeof(fstypenames) / sizeof(fstypenames[0]) - 1) /* number of names, excluding the terminator */ +#endif + +/* + * flags shared by various drives (presumably bits of d_flags -- TODO confirm): + */ +#define D_REMOVABLE 0x01 /* removable media */ +#define D_ECC 0x02 /* supports ECC */ +#define D_BADSECT 0x04 /* supports bad sector forw. */ +#define D_RAMDISK 0x08 /* disk emulator */ +#define D_CHAIN 0x10 /* can do back-back transfers */ + +/* + * Drive data for SMD. + */ +#define d_smdflags d_drivedata[0] /* slot 0 is also d_precompcyl and d_blind */ +#define D_SSE 0x1 /* supports skip sectoring */ +#define d_mindist d_drivedata[1] +#define d_maxdist d_drivedata[2] +#define d_sdist d_drivedata[3] + +/* + * Drive data for ST506. + */ +#define d_precompcyl d_drivedata[0] /* same slot as d_smdflags and d_blind */ +#define d_gap3 d_drivedata[1] /* used only when formatting */ + +/* + * Drive data for SCSI. + */ +#define d_blind d_drivedata[0] /* same slot as d_smdflags and d_precompcyl */ + +#ifndef LOCORE +/* + * Structure used to perform a format + * or other raw operation, returning data + * and/or register values. + * Register identification and format + * are device- and driver-dependent.
+ */ +struct format_op { /* argument of DIOCRFORMAT and DIOCWFORMAT */ + char *df_buf; + int df_count; /* value-result */ + daddr_t df_startblk; + int df_reg[8]; /* result */ +}; + +/* + * Structure used internally to retrieve + * information about a partition on a disk. + */ +struct partinfo { /* argument of DIOCGPART */ + struct disklabel *disklab; + struct partition *part; +}; + +/* + * Disk-specific ioctls. + */ + /* get and set disklabel; DIOCGPART used internally */ +#define DIOCGDINFO _IOR('d', 101, struct disklabel)/* get */ +#define DIOCSDINFO _IOW('d', 102, struct disklabel)/* set */ +#define DIOCWDINFO _IOW('d', 103, struct disklabel)/* set, update disk */ +#define DIOCGPART _IOW('d', 104, struct partinfo) /* get partition; NOTE(review): a "get" encoded with _IOW -- confirm before changing */ + +/* do format operation, read or write */ +#define DIOCRFORMAT _IOWR('d', 105, struct format_op) +#define DIOCWFORMAT _IOWR('d', 106, struct format_op) + +#define DIOCSSTEP _IOW('d', 107, int) /* set step rate */ +#define DIOCSRETRIES _IOW('d', 108, int) /* set # of retries */ +#define DIOCWLABEL _IOW('d', 109, int) /* write en/disable label */ + +#define DIOCSBAD _IOW('d', 110, struct dkbad) /* set kernel dkbad */ + +#endif /* LOCORE */ + +#if !defined(KERNEL) && !defined(LOCORE) + +#include <sys/cdefs.h> /* __BEGIN_DECLS, __END_DECLS, __P */ + +__BEGIN_DECLS +struct disklabel *getdiskbyname __P((const char *)); +__END_DECLS + +#endif diff --git a/sys/sys/dkbad.h b/sys/sys/dkbad.h new file mode 100644 index 00000000000..c574000aaf8 --- /dev/null +++ b/sys/sys/dkbad.h @@ -0,0 +1,68 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)dkbad.h 8.1 (Berkeley) 6/2/93 + */ + +/* + * Definitions needed to perform bad sector revectoring ala DEC STD 144. + * + * The bad sector information is located in the first 5 even numbered + * sectors of the last track of the disk pack. There are five identical + * copies of the information, described by the dkbad structure. + * + * Replacement sectors are allocated starting with the first sector before + * the bad sector information and working backwards towards the beginning of + * the disk. A maximum of 126 bad sectors are supported. 
The position of + * the bad sector in the bad sector table determines which replacement sector + * it corresponds to. + * + * The bad sector information and replacement sectors are conventionally + * only accessible through the 'c' file system partition of the disk. If + * that partition is used for a file system, the user is responsible for + * making sure that it does not overlap the bad sector information or any + * replacement sectors. + */ +struct dkbad { /* one copy of the bad sector information described above */ + long bt_csn; /* cartridge serial number */ + u_short bt_mbz; /* unused; should be 0 */ + u_short bt_flag; /* -1 => alignment cartridge (all bits set, as the field is unsigned) */ + struct bt_bad { + u_short bt_cyl; /* cylinder number of bad sector */ + u_short bt_trksec; /* track and sector number */ + } bt_bad[126]; /* slot index selects the replacement sector; 126 is the stated maximum */ +}; + +#define ECC 0 /* NOTE(review): ECC, SSE, BSE and CONT are not explained in this header -- confirm their meaning with the code that uses them */ +#define SSE 1 +#define BSE 2 +#define CONT 3 diff --git a/sys/sys/dkstat.h b/sys/sys/dkstat.h new file mode 100644 index 00000000000..f7f5f1594a2 --- /dev/null +++ b/sys/sys/dkstat.h @@ -0,0 +1,64 @@ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)dkstat.h 8.2 (Berkeley) 1/21/94 + */ + +#define CP_USER 0 +#define CP_NICE 1 +#define CP_SYS 2 +#define CP_INTR 3 +#define CP_IDLE 4 +#define CPUSTATES 5 + +#define DK_NDRIVE 8 +#ifdef KERNEL +long cp_time[CPUSTATES]; +long dk_seek[DK_NDRIVE]; +long dk_time[DK_NDRIVE]; +long dk_wds[DK_NDRIVE]; +long dk_wpms[DK_NDRIVE]; +long dk_xfer[DK_NDRIVE]; + +int dk_busy; +int dk_ndrive; + +long tk_cancc; +long tk_nin; +long tk_nout; +long tk_rawcc; +#endif diff --git a/sys/sys/dmap.h b/sys/sys/dmap.h new file mode 100644 index 00000000000..2a6f538259e --- /dev/null +++ b/sys/sys/dmap.h @@ -0,0 +1,60 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)dmap.h 8.2 (Berkeley) 1/4/94 + */ + +#ifndef _SYS_DMAP_H_ +#define _SYS_DMAP_H_ + +/* + * Definitions for the mapping of virtual swap space to the physical swap + * area - the disk map. 
+ */ +#define NDMAP 38 /* size of the swap area map */ + +struct dmap { + swblk_t dm_size; /* current size used by process */ + swblk_t dm_alloc; /* amount of physical swap space allocated */ + swblk_t dm_map[NDMAP]; /* first disk block number in each chunk */ +}; +#ifdef KERNEL +struct dmap zdmap; +int dmmin, dmmax, dmtext; +#endif + +/* The following structure is that ``returned'' from a call to vstodb(). */ +struct dblock { + swblk_t db_base; /* base of physical contig drum block */ + swblk_t db_size; /* size of block */ +}; +#endif /* !_SYS_DMAP_H_ */ diff --git a/sys/sys/domain.h b/sys/sys/domain.h new file mode 100644 index 00000000000..b056347d539 --- /dev/null +++ b/sys/sys/domain.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)domain.h 8.1 (Berkeley) 6/2/93 + */ + +/* + * Structure per communications domain. + */ + +/* + * Forward structure declarations for function prototypes [sic]. + */ +struct mbuf; + +struct domain { + int dom_family; /* AF_xxx */ + char *dom_name; + void (*dom_init) /* initialize domain data structures */ + __P((void)); + int (*dom_externalize) /* externalize access rights */ + __P((struct mbuf *)); + int (*dom_dispose) /* dispose of internalized rights */ + __P((struct mbuf *)); + struct protosw *dom_protosw, *dom_protoswNPROTOSW; + struct domain *dom_next; + int (*dom_rtattach) /* initialize routing table */ + __P((void **, int)); + int dom_rtoffset; /* an arg to rtattach, in bits */ + int dom_maxrtkey; /* for routing layer */ +}; + +#ifdef KERNEL +struct domain *domains; +#endif diff --git a/sys/sys/errno.h b/sys/sys/errno.h new file mode 100644 index 00000000000..a4e4ea6eb69 --- /dev/null +++ b/sys/sys/errno.h @@ -0,0 +1,163 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. 
+ * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)errno.h 8.5 (Berkeley) 1/21/94 + */ + +#ifndef KERNEL +extern int errno; /* global error number */ +#endif + +#define EPERM 1 /* Operation not permitted */ +#define ENOENT 2 /* No such file or directory */ +#define ESRCH 3 /* No such process */ +#define EINTR 4 /* Interrupted system call */ +#define EIO 5 /* Input/output error */ +#define ENXIO 6 /* Device not configured */ +#define E2BIG 7 /* Argument list too long */ +#define ENOEXEC 8 /* Exec format error */ +#define EBADF 9 /* Bad file descriptor */ +#define ECHILD 10 /* No child processes */ +#define EDEADLK 11 /* Resource deadlock avoided */ + /* 11 was EAGAIN */ +#define ENOMEM 12 /* Cannot allocate memory */ +#define EACCES 13 /* Permission denied */ +#define EFAULT 14 /* Bad address */ +#ifndef _POSIX_SOURCE +#define ENOTBLK 15 /* Block device required */ +#endif +#define EBUSY 16 /* Device busy */ +#define EEXIST 17 /* File exists */ +#define EXDEV 18 /* Cross-device link */ +#define ENODEV 19 /* Operation not supported by device */ +#define ENOTDIR 20 /* Not a directory */ +#define EISDIR 21 /* Is a directory */ +#define EINVAL 22 /* Invalid argument */ +#define ENFILE 23 /* Too many open files in system */ +#define EMFILE 24 /* Too many open files */ +#define ENOTTY 25 /* Inappropriate ioctl for device */ +#ifndef _POSIX_SOURCE +#define ETXTBSY 26 /* Text file busy */ +#endif +#define EFBIG 27 /* File too large */ +#define ENOSPC 28 /* No space left on device */ 
+#define ESPIPE 29 /* Illegal seek */ +#define EROFS 30 /* Read-only file system */ +#define EMLINK 31 /* Too many links */ +#define EPIPE 32 /* Broken pipe */ + +/* math software */ +#define EDOM 33 /* Numerical argument out of domain */ +#define ERANGE 34 /* Result too large */ + +/* non-blocking and interrupt i/o */ +#define EAGAIN 35 /* Resource temporarily unavailable */ +#ifndef _POSIX_SOURCE +#define EWOULDBLOCK EAGAIN /* Operation would block */ +#define EINPROGRESS 36 /* Operation now in progress */ +#define EALREADY 37 /* Operation already in progress */ + +/* ipc/network software -- argument errors */ +#define ENOTSOCK 38 /* Socket operation on non-socket */ +#define EDESTADDRREQ 39 /* Destination address required */ +#define EMSGSIZE 40 /* Message too long */ +#define EPROTOTYPE 41 /* Protocol wrong type for socket */ +#define ENOPROTOOPT 42 /* Protocol not available */ +#define EPROTONOSUPPORT 43 /* Protocol not supported */ +#define ESOCKTNOSUPPORT 44 /* Socket type not supported */ +#define EOPNOTSUPP 45 /* Operation not supported */ +#define EPFNOSUPPORT 46 /* Protocol family not supported */ +#define EAFNOSUPPORT 47 /* Address family not supported by protocol family */ +#define EADDRINUSE 48 /* Address already in use */ +#define EADDRNOTAVAIL 49 /* Can't assign requested address */ + +/* ipc/network software -- operational errors */ +#define ENETDOWN 50 /* Network is down */ +#define ENETUNREACH 51 /* Network is unreachable */ +#define ENETRESET 52 /* Network dropped connection on reset */ +#define ECONNABORTED 53 /* Software caused connection abort */ +#define ECONNRESET 54 /* Connection reset by peer */ +#define ENOBUFS 55 /* No buffer space available */ +#define EISCONN 56 /* Socket is already connected */ +#define ENOTCONN 57 /* Socket is not connected */ +#define ESHUTDOWN 58 /* Can't send after socket shutdown */ +#define ETOOMANYREFS 59 /* Too many references: can't splice */ +#define ETIMEDOUT 60 /* Operation timed out */ +#define 
ECONNREFUSED 61 /* Connection refused */ + +#define ELOOP 62 /* Too many levels of symbolic links */ +#endif /* _POSIX_SOURCE */ +#define ENAMETOOLONG 63 /* File name too long */ + +/* should be rearranged */ +#ifndef _POSIX_SOURCE +#define EHOSTDOWN 64 /* Host is down */ +#define EHOSTUNREACH 65 /* No route to host */ +#endif /* _POSIX_SOURCE */ +#define ENOTEMPTY 66 /* Directory not empty */ + +/* quotas & mush */ +#ifndef _POSIX_SOURCE +#define EPROCLIM 67 /* Too many processes */ +#define EUSERS 68 /* Too many users */ +#define EDQUOT 69 /* Disc quota exceeded */ + +/* Network File System */ +#define ESTALE 70 /* Stale NFS file handle */ +#define EREMOTE 71 /* Too many levels of remote in path */ +#define EBADRPC 72 /* RPC struct is bad */ +#define ERPCMISMATCH 73 /* RPC version wrong */ +#define EPROGUNAVAIL 74 /* RPC prog. not avail */ +#define EPROGMISMATCH 75 /* Program version wrong */ +#define EPROCUNAVAIL 76 /* Bad procedure for program */ +#endif /* _POSIX_SOURCE */ + +#define ENOLCK 77 /* No locks available */ +#define ENOSYS 78 /* Function not implemented */ + +#ifndef _POSIX_SOURCE +#define EFTYPE 79 /* Inappropriate file type or format */ +#define EAUTH 80 /* Authentication error */ +#define ENEEDAUTH 81 /* Need authenticator */ +#define ELAST 81 /* Must be equal largest errno */ +#endif /* _POSIX_SOURCE */ + +#ifdef KERNEL +/* pseudo-errors returned inside kernel to modify return to process */ +#define ERESTART -1 /* restart syscall */ +#define EJUSTRETURN -2 /* don't modify regs, just return */ +#endif diff --git a/sys/sys/exec.h b/sys/sys/exec.h new file mode 100644 index 00000000000..443e1443414 --- /dev/null +++ b/sys/sys/exec.h @@ -0,0 +1,71 @@ +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. 
or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)exec.h 8.3 (Berkeley) 1/21/94 + */ + +#include + +/* + * The following structure is found at the top of the user stack of each + * user process. The ps program uses it to locate argv and environment + * strings. Programs that wish ps to display other information may modify + * it; normally ps_argvstr points to the text for argv[0], and ps_nargvstr + * is the same as the program's argc. The fields ps_envstr and ps_nenvstr + * are the equivalent for the environment. + */ +struct ps_strings { + char *ps_argvstr; /* first of 0 or more argument strings */ + int ps_nargvstr; /* the number of argument strings */ + char *ps_envstr; /* first of 0 or more environment strings */ + int ps_nenvstr; /* the number of environment strings */ +}; + +/* + * Address of ps_strings structure (in user space). + */ +#define PS_STRINGS \ + ((struct ps_strings *)(USRSTACK - sizeof(struct ps_strings))) + +/* + * Arguments to the exec system call. + */ +struct execve_args { + char *fname; + char **argp; + char **envp; +}; diff --git a/sys/sys/fbio.h b/sys/sys/fbio.h new file mode 100644 index 00000000000..63371b77ed8 --- /dev/null +++ b/sys/sys/fbio.h @@ -0,0 +1,186 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software developed by the Computer Systems + * Engineering group at Lawrence Berkeley Laboratory under DARPA + * contract BG 91-66 and contributed to Berkeley. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)fbio.h 8.2 (Berkeley) 10/30/93 + * + * from: $Header: fbio.h,v 1.6 93/10/31 06:01:56 torek Exp $ (LBL) + */ + +/* + * Frame buffer ioctls (from Sprite, trimmed to essentials for X11). + */ + +/* + * Frame buffer type codes. 
+ */ +#define FBTYPE_SUN1BW 0 /* multibus mono */ +#define FBTYPE_SUN1COLOR 1 /* multibus color */ +#define FBTYPE_SUN2BW 2 /* memory mono */ +#define FBTYPE_SUN2COLOR 3 /* color w/rasterop chips */ +#define FBTYPE_SUN2GP 4 /* GP1/GP2 */ +#define FBTYPE_SUN5COLOR 5 /* RoadRunner accelerator */ +#define FBTYPE_SUN3COLOR 6 /* memory color */ +#define FBTYPE_MEMCOLOR 7 /* memory 24-bit */ +#define FBTYPE_SUN4COLOR 8 /* memory color w/overlay */ + +#define FBTYPE_NOTSUN1 9 /* reserved for customer */ +#define FBTYPE_NOTSUN2 10 /* reserved for customer */ +#define FBTYPE_NOTSUN3 11 /* reserved for customer */ + +#define FBTYPE_SUNFAST_COLOR 12 /* accelerated 8bit */ +#define FBTYPE_SUNROP_COLOR 13 /* MEMCOLOR with rop h/w */ +#define FBTYPE_SUNFB_VIDEO 14 /* Simple video mixing */ +#define FBTYPE_RESERVED5 15 /* reserved, do not use */ +#define FBTYPE_RESERVED4 16 /* reserved, do not use */ +#define FBTYPE_RESERVED3 17 /* reserved, do not use */ +#define FBTYPE_RESERVED2 18 /* reserved, do not use */ +#define FBTYPE_RESERVED1 19 /* reserved, do not use */ + +#define FBTYPE_LASTPLUSONE 20 /* max number of fbs (change as add) */ + +/* + * Frame buffer descriptor as returned by FBIOGTYPE. + */ +struct fbtype { + int fb_type; /* as defined above */ + int fb_height; /* in pixels */ + int fb_width; /* in pixels */ + int fb_depth; /* bits per pixel */ + int fb_cmsize; /* size of color map (entries) */ + int fb_size; /* total size in bytes */ +}; +#define FBIOGTYPE _IOR('F', 0, struct fbtype) + +#ifdef notdef +/* + * General purpose structure for passing info in and out of frame buffers + * (used for gp1) -- unsupported. 
+ */ +struct fbinfo { + int fb_physaddr; /* physical frame buffer address */ + int fb_hwwidth; /* fb board width */ + int fb_hwheight; /* fb board height */ + int fb_addrdelta; /* phys addr diff between boards */ + u_char *fb_ropaddr; /* fb virtual addr */ + int fb_unit; /* minor devnum of fb */ +}; +#define FBIOGINFO _IOR('F', 2, struct fbinfo) +#endif + +/* + * Color map I/O. + */ +struct fbcmap { + int index; /* first element (0 origin) */ + int count; /* number of elements */ + u_char *red; /* red color map elements */ + u_char *green; /* green color map elements */ + u_char *blue; /* blue color map elements */ +}; +#define FBIOPUTCMAP _IOW('F', 3, struct fbcmap) +#define FBIOGETCMAP _IOW('F', 4, struct fbcmap) + +/* + * Set/get attributes. + */ +#define FB_ATTR_NDEVSPECIFIC 8 /* no. of device specific values */ +#define FB_ATTR_NEMUTYPES 4 /* no. of emulation types */ + +struct fbsattr { + int flags; /* flags; see below */ + int emu_type; /* emulation type (-1 if unused) */ + int dev_specific[FB_ATTR_NDEVSPECIFIC]; /* catchall */ +}; +#define FB_ATTR_AUTOINIT 1 /* emulation auto init flag */ +#define FB_ATTR_DEVSPECIFIC 2 /* dev. specific stuff valid flag */ + +struct fbgattr { + int real_type; /* real device type */ + int owner; /* PID of owner, 0 if myself */ + struct fbtype fbtype; /* fbtype info for real device */ + struct fbsattr sattr; /* see above */ + int emu_types[FB_ATTR_NEMUTYPES]; /* possible emulations */ + /* (-1 if unused) */ +}; +/* FBIOSATTR _IOW('F', 5, struct fbsattr) -- unsupported */ +#define FBIOGATTR _IOR('F', 6, struct fbgattr) + +/* + * Video control. + */ +#define FBVIDEO_OFF 0 +#define FBVIDEO_ON 1 + +#define FBIOSVIDEO _IOW('F', 7, int) +#define FBIOGVIDEO _IOR('F', 8, int) + +/* + * Hardware cursor control (for, e.g., CG6). A rather complex and icky + * interface that smells like VMS, but there it is.... 
+ */ +struct fbcurpos { + short x; + short y; +}; + +struct fbcursor { + short set; /* flags; see below */ + short enable; /* nonzero => cursor on, 0 => cursor off */ + struct fbcurpos pos; /* position on display */ + struct fbcurpos hot; /* hot-spot within cursor */ + struct fbcmap cmap; /* cursor color map */ + struct fbcurpos size; /* number of valid bits in image & mask */ + caddr_t image; /* cursor image bits */ + caddr_t mask; /* cursor mask bits */ +}; +#define FB_CUR_SETCUR 0x01 /* set on/off (i.e., obey fbcursor.enable) */ +#define FB_CUR_SETPOS 0x02 /* set position */ +#define FB_CUR_SETHOT 0x04 /* set hot-spot */ +#define FB_CUR_SETCMAP 0x08 /* set cursor color map */ +#define FB_CUR_SETSHAPE 0x10 /* set size & bits */ +#define FB_CUR_SETALL (FB_CUR_SETCUR | FB_CUR_SETPOS | FB_CUR_SETHOT | \ + FB_CUR_SETCMAP | FB_CUR_SETSHAPE) + +/* controls for cursor attributes & shape (including position) */ +#define FBIOSCURSOR _IOW('F', 24, struct fbcursor) +#define FBIOGCURSOR _IOWR('F', 25, struct fbcursor) + +/* controls for cursor position only */ +#define FBIOSCURPOS _IOW('F', 26, struct fbcurpos) +#define FBIOGCURPOS _IOW('F', 27, struct fbcurpos) + +/* get maximum cursor size */ +#define FBIOGCURMAX _IOR('F', 28, struct fbcurpos) diff --git a/sys/sys/fcntl.h b/sys/sys/fcntl.h new file mode 100644 index 00000000000..62762f3498a --- /dev/null +++ b/sys/sys/fcntl.h @@ -0,0 +1,190 @@ +/*- + * Copyright (c) 1983, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)fcntl.h 8.3 (Berkeley) 1/21/94 + */ + +#ifndef _SYS_FCNTL_H_ +#define _SYS_FCNTL_H_ + +/* + * This file includes the definitions for open and fcntl + * described by POSIX for <fcntl.h>; it also includes + * related kernel definitions. + */ + +#ifndef KERNEL +#include +#endif + +/* + * File status flags: these are used by open(2), fcntl(2). 
+ * They are also used (indirectly) in the kernel file structure f_flags, + * which is a superset of the open/fcntl flags. Open flags and f_flags + * are inter-convertible using OFLAGS(fflags) and FFLAGS(oflags). + * Open/fcntl flags begin with O_; kernel-internal flags begin with F. + */ +/* open-only flags */ +#define O_RDONLY 0x0000 /* open for reading only */ +#define O_WRONLY 0x0001 /* open for writing only */ +#define O_RDWR 0x0002 /* open for reading and writing */ +#define O_ACCMODE 0x0003 /* mask for above modes */ + +/* + * Kernel encoding of open mode; separate read and write bits that are + * independently testable: 1 greater than the above. + * + * XXX + * FREAD and FWRITE are excluded from the #ifdef KERNEL so that TIOCFLUSH, + * which was documented to use FREAD/FWRITE, continues to work. + */ +#ifndef _POSIX_SOURCE +#define FREAD 0x0001 +#define FWRITE 0x0002 +#endif +#define O_NONBLOCK 0x0004 /* no delay */ +#define O_APPEND 0x0008 /* set append mode */ +#ifndef _POSIX_SOURCE +#define O_SHLOCK 0x0010 /* open with shared file lock */ +#define O_EXLOCK 0x0020 /* open with exclusive file lock */ +#define O_ASYNC 0x0040 /* signal pgrp when data ready */ +#define O_FSYNC 0x0080 /* synchronous writes */ +#endif +#define O_CREAT 0x0200 /* create if nonexistent */ +#define O_TRUNC 0x0400 /* truncate to zero length */ +#define O_EXCL 0x0800 /* error if already exists */ +#ifdef KERNEL +#define FMARK 0x1000 /* mark during gc() */ +#define FDEFER 0x2000 /* defer for next gc pass */ +#define FHASLOCK 0x4000 /* descriptor holds advisory lock */ +#endif + +/* defined by POSIX 1003.1; BSD default, so no bit required */ +#define O_NOCTTY 0 /* don't assign controlling terminal */ + +#ifdef KERNEL +/* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */ +#define FFLAGS(oflags) ((oflags) + 1) +#define OFLAGS(fflags) ((fflags) - 1) + +/* bits to save after open */ +#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK) +/* bits settable 
by fcntl(F_SETFL, ...) */ +#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK) +#endif + +/* + * The O_* flags used to have only F* names, which were used in the kernel + * and by fcntl. We retain the F* names for the kernel f_flags field + * and for backward compatibility for fcntl. + */ +#ifndef _POSIX_SOURCE +#define FAPPEND O_APPEND /* kernel/compat */ +#define FASYNC O_ASYNC /* kernel/compat */ +#define FFSYNC O_FSYNC /* kernel */ +#define FNONBLOCK O_NONBLOCK /* kernel */ +#define FNDELAY O_NONBLOCK /* compat */ +#define O_NDELAY O_NONBLOCK /* compat */ +#endif + +/* + * Constants used for fcntl(2) + */ + +/* command values */ +#define F_DUPFD 0 /* duplicate file descriptor */ +#define F_GETFD 1 /* get file descriptor flags */ +#define F_SETFD 2 /* set file descriptor flags */ +#define F_GETFL 3 /* get file status flags */ +#define F_SETFL 4 /* set file status flags */ +#ifndef _POSIX_SOURCE +#define F_GETOWN 5 /* get SIGIO/SIGURG proc/pgrp */ +#define F_SETOWN 6 /* set SIGIO/SIGURG proc/pgrp */ +#endif +#define F_GETLK 7 /* get record locking information */ +#define F_SETLK 8 /* set record locking information */ +#define F_SETLKW 9 /* F_SETLK; wait if blocked */ + +/* file descriptor flags (F_GETFD, F_SETFD) */ +#define FD_CLOEXEC 1 /* close-on-exec flag */ + +/* record locking flags (F_GETLK, F_SETLK, F_SETLKW) */ +#define F_RDLCK 1 /* shared or read lock */ +#define F_UNLCK 2 /* unlock */ +#define F_WRLCK 3 /* exclusive or write lock */ +#ifdef KERNEL +#define F_WAIT 0x010 /* Wait until lock is granted */ +#define F_FLOCK 0x020 /* Use flock(2) semantics for lock */ +#define F_POSIX 0x040 /* Use POSIX semantics for lock */ +#endif + +/* + * Advisory file segment locking data type - + * information passed to system by user + */ +struct flock { + off_t l_start; /* starting offset */ + off_t l_len; /* len = 0 means until end of file */ + pid_t l_pid; /* lock owner */ + short l_type; /* lock type: read/write, etc. 
*/ + short l_whence; /* type of l_start */ +}; + + +#ifndef _POSIX_SOURCE +/* lock operations for flock(2) */ +#define LOCK_SH 0x01 /* shared file lock */ +#define LOCK_EX 0x02 /* exclusive file lock */ +#define LOCK_NB 0x04 /* don't block when locking */ +#define LOCK_UN 0x08 /* unlock file */ +#endif + + +#ifndef KERNEL +#include + +__BEGIN_DECLS +int open __P((const char *, int, ...)); +int creat __P((const char *, mode_t)); +int fcntl __P((int, int, ...)); +#ifndef _POSIX_SOURCE +int flock __P((int, int)); +#endif /* !_POSIX_SOURCE */ +__END_DECLS +#endif + +#endif /* !_SYS_FCNTL_H_ */ diff --git a/sys/sys/file.h b/sys/sys/file.h new file mode 100644 index 00000000000..3d82190669a --- /dev/null +++ b/sys/sys/file.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)file.h 8.1 (Berkeley) 6/2/93 + */ + +#include <sys/fcntl.h> +#include <sys/unistd.h> + +#ifdef KERNEL +struct proc; +struct uio; + +/* + * Kernel descriptor table. + * One entry for each open kernel vnode and socket. + */ +struct file { + struct file *f_filef; /* list of active files */ + struct file **f_fileb; /* list of active files */ + short f_flag; /* see fcntl.h */ +#define DTYPE_VNODE 1 /* file */ +#define DTYPE_SOCKET 2 /* communications endpoint */ + short f_type; /* descriptor type */ + short f_count; /* reference count */ + short f_msgcount; /* references from message queue */ + struct ucred *f_cred; /* credentials associated with descriptor */ + struct fileops { + int (*fo_read) __P((struct file *fp, struct uio *uio, + struct ucred *cred)); + int (*fo_write) __P((struct file *fp, struct uio *uio, + struct ucred *cred)); + int (*fo_ioctl) __P((struct file *fp, int com, + caddr_t data, struct proc *p)); + int (*fo_select) __P((struct file *fp, int which, + struct proc *p)); + int (*fo_close) __P((struct file *fp, struct proc *p)); + } *f_ops; + off_t f_offset; + caddr_t f_data; /* vnode or socket */ +}; + +extern struct file *filehead; /* head of list of open files */ +extern int maxfiles; /* kernel 
limit on number of open files */ +extern int nfiles; /* actual number of open files */ + +#endif /* KERNEL */ diff --git a/sys/sys/filedesc.h b/sys/sys/filedesc.h new file mode 100644 index 00000000000..1071bc10597 --- /dev/null +++ b/sys/sys/filedesc.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)filedesc.h 8.1 (Berkeley) 6/2/93 + */ + +/* + * This structure is used for the management of descriptors. It may be + * shared by multiple processes. + * + * A process is initially started out with NDFILE descriptors stored within + * this structure, selected to be enough for typical applications based on + * the historical limit of 20 open files (and the usage of descriptors by + * shells). If these descriptors are exhausted, a larger descriptor table + * may be allocated, up to a process' resource limit; the internal arrays + * are then unused. The initial expansion is set to NDEXTENT; each time + * it runs out, it is doubled until the resource limit is reached. NDEXTENT + * should be selected to be the biggest multiple of OFILESIZE (see below) + * that will fit in a power-of-two sized piece of memory. + */ +#define NDFILE 20 +#define NDEXTENT 50 /* 250 bytes in 256-byte alloc. */ + +struct filedesc { + struct file **fd_ofiles; /* file structures for open files */ + char *fd_ofileflags; /* per-process open file flags */ + struct vnode *fd_cdir; /* current directory */ + struct vnode *fd_rdir; /* root directory */ + int fd_nfiles; /* number of open files allocated */ + u_short fd_lastfile; /* high-water mark of fd_ofiles */ + u_short fd_freefile; /* approx. 
next free file */ + u_short fd_cmask; /* mask for file creation */ + u_short fd_refcnt; /* reference count */ +}; + +/* + * Basic allocation of descriptors: + * one of the above, plus arrays for NDFILE descriptors. + */ +struct filedesc0 { + struct filedesc fd_fd; + /* + * These arrays are used when the number of open files is + * <= NDFILE, and are then pointed to by the pointers above. + */ + struct file *fd_dfiles[NDFILE]; + char fd_dfileflags[NDFILE]; +}; + +/* + * Per-process open flags. + */ +#define UF_EXCLOSE 0x01 /* auto-close on exec */ +#define UF_MAPPED 0x02 /* mapped from device */ + +/* + * Storage required per open file descriptor. + */ +#define OFILESIZE (sizeof(struct file *) + sizeof(char)) + +#ifdef KERNEL +/* + * Kernel global variables and routines. + */ +int fdalloc __P((struct proc *p, int want, int *result)); +int fdavail __P((struct proc *p, int n)); +int falloc __P((struct proc *p, struct file **resultfp, int *resultfd)); +struct filedesc *fdcopy __P((struct proc *p)); +void fdfree __P((struct proc *p)); +#endif diff --git a/sys/sys/filio.h b/sys/sys/filio.h new file mode 100644 index 00000000000..5c8789b882b --- /dev/null +++ b/sys/sys/filio.h @@ -0,0 +1,55 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)filio.h 8.1 (Berkeley) 3/28/94 + */ + +#ifndef _SYS_FILIO_H_ +#define _SYS_FILIO_H_ + +#include <sys/ioccom.h> + +/* Generic file-descriptor ioctl's. 
*/ +#define FIOCLEX _IO('f', 1) /* set close on exec on fd */ +#define FIONCLEX _IO('f', 2) /* remove close on exec */ +#define FIONREAD _IOR('f', 127, int) /* get # bytes to read */ +#define FIONBIO _IOW('f', 126, int) /* set/clear non-blocking i/o */ +#define FIOASYNC _IOW('f', 125, int) /* set/clear async i/o */ +#define FIOSETOWN _IOW('f', 124, int) /* set owner */ +#define FIOGETOWN _IOR('f', 123, int) /* get owner */ + +#endif /* !_SYS_FILIO_H_ */ diff --git a/sys/sys/gmon.h b/sys/sys/gmon.h new file mode 100644 index 00000000000..b103df80a8a --- /dev/null +++ b/sys/sys/gmon.h @@ -0,0 +1,159 @@ +/*- + * Copyright (c) 1982, 1986, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)gmon.h 8.2 (Berkeley) 1/4/94 + */ + +#ifndef _SYS_GMON_H_ +#define _SYS_GMON_H_ + +#include <machine/profile.h> + +/* + * Structure prepended to gmon.out profiling data file. + */ +struct gmonhdr { + u_long lpc; /* base pc address of sample buffer */ + u_long hpc; /* max pc address of sampled buffer */ + int ncnt; /* size of sample buffer (plus this header) */ + int version; /* version number */ + int profrate; /* profiling clock rate */ + int spare[3]; /* reserved */ +}; +#define GMONVERSION 0x00051879 + +/* + * histogram counters are unsigned shorts (according to the kernel). + */ +#define HISTCOUNTER unsigned short + +/* + * fraction of text space to allocate for histogram counters here, 1/2 + */ +#define HISTFRACTION 2 + +/* + * Fraction of text space to allocate for from hash buckets. + * The value of HASHFRACTION is based on the minimum number of bytes + * of separation between two subroutine call points in the object code. 
+ * Given MIN_SUBR_SEPARATION bytes of separation the value of + * HASHFRACTION is calculated as: + * + * HASHFRACTION = MIN_SUBR_SEPARATION / (2 * sizeof(short) - 1); + * + * For example, on the VAX, the shortest two call sequence is: + * + * calls $0,(r0) + * calls $0,(r0) + * + * which is separated by only three bytes, thus HASHFRACTION is + * calculated as: + * + * HASHFRACTION = 3 / (2 * 2 - 1) = 1 + * + * Note that the division above rounds down, thus if MIN_SUBR_SEPARATION + * is less than three, this algorithm will not work! + * + * In practice, however, call instructions are rarely at a minimal + * distance. Hence, we will define HASHFRACTION to be 2 across all + * architectures. This saves a reasonable amount of space for + * profiling data structures without (in practice) sacrificing + * any granularity. + */ +#define HASHFRACTION 2 + +/* + * percent of text space to allocate for tostructs with a minimum. + */ +#define ARCDENSITY 2 +#define MINARCS 50 +#define MAXARCS ((1 << (8 * sizeof(HISTCOUNTER))) - 2) + +struct tostruct { + u_long selfpc; + long count; + u_short link; + u_short pad; +}; + +/* + * a raw arc, with pointers to the calling site and + * the called site and a count. + */ +struct rawarc { + u_long raw_frompc; + u_long raw_selfpc; + long raw_count; +}; + +/* + * general rounding functions. + */ +#define ROUNDDOWN(x,y) (((x)/(y))*(y)) +#define ROUNDUP(x,y) ((((x)+(y)-1)/(y))*(y)) + +/* + * The profiling data structures are housed in this structure. + */ +struct gmonparam { + int state; + u_short *kcount; + u_long kcountsize; + u_short *froms; + u_long fromssize; + struct tostruct *tos; + u_long tossize; + long tolimit; + u_long lowpc; + u_long highpc; + u_long textsize; + u_long hashfraction; +}; +extern struct gmonparam _gmonparam; + +/* + * Possible states of profiling. 
+ */ +#define GMON_PROF_ON 0 +#define GMON_PROF_BUSY 1 +#define GMON_PROF_ERROR 2 +#define GMON_PROF_OFF 3 + +/* + * Sysctl definitions for extracting profiling information from the kernel. + */ +#define GPROF_STATE 0 /* int: profiling enabling variable */ +#define GPROF_COUNT 1 /* struct: profile tick count buffer */ +#define GPROF_FROMS 2 /* struct: from location hash bucket */ +#define GPROF_TOS 3 /* struct: destination/count structure */ +#define GPROF_GMONPARAM 4 /* struct: profiling parameters (see above) */ +#endif /* !_SYS_GMON_H_ */ diff --git a/sys/sys/ioccom.h b/sys/sys/ioccom.h new file mode 100644 index 00000000000..5bc11b328bd --- /dev/null +++ b/sys/sys/ioccom.h @@ -0,0 +1,64 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ioccom.h 8.2 (Berkeley) 3/28/94 + */ + +#ifndef _SYS_IOCCOM_H_ +#define _SYS_IOCCOM_H_ + +/* + * Ioctl's have the command encoded in the lower word, and the size of + * any in or out parameters in the upper word. The high 3 bits of the + * upper word are used to encode the in/out status of the parameter. + */ +#define IOCPARM_MASK 0x1fff /* parameter length, at most 13 bits */ +#define IOCPARM_LEN(x) (((x) >> 16) & IOCPARM_MASK) +#define IOCBASECMD(x) ((x) & ~(IOCPARM_MASK << 16)) +#define IOCGROUP(x) (((x) >> 8) & 0xff) + +#define IOCPARM_MAX NBPG /* max size of ioctl, mult. 
of NBPG */ +#define IOC_VOID 0x20000000 /* no parameters */ +#define IOC_OUT 0x40000000 /* copy out parameters */ +#define IOC_IN 0x80000000 /* copy in parameters */ +#define IOC_INOUT (IOC_IN|IOC_OUT) +#define IOC_DIRMASK 0xe0000000 /* mask for IN/OUT/VOID */ + +#define _IOC(inout,group,num,len) \ + ((inout) | (((len) & IOCPARM_MASK) << 16) | ((group) << 8) | (num)) +#define _IO(g,n) _IOC(IOC_VOID, (g), (n), 0) +#define _IOR(g,n,t) _IOC(IOC_OUT, (g), (n), sizeof(t)) +#define _IOW(g,n,t) _IOC(IOC_IN, (g), (n), sizeof(t)) +/* this should be _IORW, but stdio got there first */ +#define _IOWR(g,n,t) _IOC(IOC_INOUT, (g), (n), sizeof(t)) + +#endif /* !_SYS_IOCCOM_H_ */ diff --git a/sys/sys/ioctl.h b/sys/sys/ioctl.h new file mode 100644 index 00000000000..d04394fd181 --- /dev/null +++ b/sys/sys/ioctl.h @@ -0,0 +1,84 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ioctl.h 8.6 (Berkeley) 3/28/94 + */ + +#ifndef _SYS_IOCTL_H_ +#define _SYS_IOCTL_H_ + +#include <sys/ttycom.h> + +/* + * Pun for SunOS prior to 3.2. SunOS 3.2 and later support TIOCGWINSZ + * and TIOCSWINSZ (yes, even 3.2-3.5, the fact that it wasn't documented + * notwithstanding). 
+ */ +struct ttysize { + unsigned short ts_lines; + unsigned short ts_cols; + unsigned short ts_xxx; + unsigned short ts_yyy; +}; +#define TIOCGSIZE TIOCGWINSZ +#define TIOCSSIZE TIOCSWINSZ + +#include <sys/ioccom.h> + +#include <sys/filio.h> +#include <sys/sockio.h> + +#ifndef KERNEL + +#include <sys/cdefs.h> + +__BEGIN_DECLS +int ioctl __P((int, unsigned long, ...)); +__END_DECLS +#endif /* !KERNEL */ +#endif /* !_SYS_IOCTL_H_ */ + +/* + * Keep outside _SYS_IOCTL_H_ + * Compatibility with old terminal driver + * + * Source level -> #define USE_OLD_TTY + * Kernel level -> options COMPAT_43 or COMPAT_SUNOS + */ +#if defined(USE_OLD_TTY) || defined(COMPAT_43) || defined(COMPAT_SUNOS) +#include <sys/ioctl_compat.h> +#endif diff --git a/sys/sys/ioctl_compat.h b/sys/sys/ioctl_compat.h new file mode 100644 index 00000000000..fd87b514cf8 --- /dev/null +++ b/sys/sys/ioctl_compat.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ioctl_compat.h 8.4 (Berkeley) 1/21/94 + */ + +#ifndef _SYS_IOCTL_COMPAT_H_ +#define _SYS_IOCTL_COMPAT_H_ + +#include <sys/ttychars.h> +#include <sys/ttydev.h> + +struct tchars { + char t_intrc; /* interrupt */ + char t_quitc; /* quit */ + char t_startc; /* start output */ + char t_stopc; /* stop output */ + char t_eofc; /* end-of-file */ + char t_brkc; /* input delimiter (like nl) */ +}; + +struct ltchars { + char t_suspc; /* stop process signal */ + char t_dsuspc; /* delayed stop process signal */ + char t_rprntc; /* reprint line */ + char t_flushc; /* flush output (toggles) */ + char t_werasc; /* word erase */ + char t_lnextc; /* literal next character */ +}; + +/* + * Structure for TIOCGETP and TIOCSETP ioctls. 
+ */ +#ifndef _SGTTYB_ +#define _SGTTYB_ +struct sgttyb { + char sg_ispeed; /* input speed */ + char sg_ospeed; /* output speed */ + char sg_erase; /* erase character */ + char sg_kill; /* kill character */ + short sg_flags; /* mode flags */ +}; +#endif + +#ifdef USE_OLD_TTY +# undef TIOCGETD +# define TIOCGETD _IOR('t', 0, int) /* get line discipline */ +# undef TIOCSETD +# define TIOCSETD _IOW('t', 1, int) /* set line discipline */ +#else +# define OTIOCGETD _IOR('t', 0, int) /* get line discipline */ +# define OTIOCSETD _IOW('t', 1, int) /* set line discipline */ +#endif +#define TIOCHPCL _IO('t', 2) /* hang up on last close */ +#define TIOCGETP _IOR('t', 8,struct sgttyb)/* get parameters -- gtty */ +#define TIOCSETP _IOW('t', 9,struct sgttyb)/* set parameters -- stty */ +#define TIOCSETN _IOW('t',10,struct sgttyb)/* as above, but no flushtty*/ +#define TIOCSETC _IOW('t',17,struct tchars)/* set special characters */ +#define TIOCGETC _IOR('t',18,struct tchars)/* get special characters */ +#define TANDEM 0x00000001 /* send stopc on out q full */ +#define CBREAK 0x00000002 /* half-cooked mode */ +#define LCASE 0x00000004 /* simulate lower case */ +#define ECHO 0x00000008 /* echo input */ +#define CRMOD 0x00000010 /* map \r to \r\n on output */ +#define RAW 0x00000020 /* no i/o processing */ +#define ODDP 0x00000040 /* get/send odd parity */ +#define EVENP 0x00000080 /* get/send even parity */ +#define ANYP 0x000000c0 /* get any parity/send none */ +#define NLDELAY 0x00000300 /* \n delay */ +#define NL0 0x00000000 +#define NL1 0x00000100 /* tty 37 */ +#define NL2 0x00000200 /* vt05 */ +#define NL3 0x00000300 +#define TBDELAY 0x00000c00 /* horizontal tab delay */ +#define TAB0 0x00000000 +#define TAB1 0x00000400 /* tty 37 */ +#define TAB2 0x00000800 +#define XTABS 0x00000c00 /* expand tabs on output */ +#define CRDELAY 0x00003000 /* \r delay */ +#define CR0 0x00000000 +#define CR1 0x00001000 /* tn 300 */ +#define CR2 0x00002000 /* tty 37 */ +#define CR3 0x00003000 
/* concept 100 */ +#define VTDELAY 0x00004000 /* vertical tab delay */ +#define FF0 0x00000000 +#define FF1 0x00004000 /* tty 37 */ +#define BSDELAY 0x00008000 /* \b delay */ +#define BS0 0x00000000 +#define BS1 0x00008000 +#define ALLDELAY (NLDELAY|TBDELAY|CRDELAY|VTDELAY|BSDELAY) +#define CRTBS 0x00010000 /* do backspacing for crt */ +#define PRTERA 0x00020000 /* \ ... / erase */ +#define CRTERA 0x00040000 /* " \b " to wipe out char */ +#define TILDE 0x00080000 /* hazeltine tilde kludge */ +#define MDMBUF 0x00100000 /*start/stop output on carrier*/ +#define LITOUT 0x00200000 /* literal output */ +#define TOSTOP 0x00400000 /*SIGSTOP on background output*/ +#define FLUSHO 0x00800000 /* flush output to terminal */ +#define NOHANG 0x01000000 /* (no-op) was no SIGHUP on carrier drop */ +#define L001000 0x02000000 +#define CRTKIL 0x04000000 /* kill line with " \b " */ +#define PASS8 0x08000000 +#define CTLECH 0x10000000 /* echo control chars as ^X */ +#define PENDIN 0x20000000 /* tp->t_rawq needs reread */ +#define DECCTQ 0x40000000 /* only ^Q starts after ^S */ +#define NOFLSH 0x80000000 /* no output flush on signal */ +#define TIOCLBIS _IOW('t', 127, int) /* bis local mode bits */ +#define TIOCLBIC _IOW('t', 126, int) /* bic local mode bits */ +#define TIOCLSET _IOW('t', 125, int) /* set entire local mode word */ +#define TIOCLGET _IOR('t', 124, int) /* get local modes */ +#define LCRTBS (CRTBS>>16) +#define LPRTERA (PRTERA>>16) +#define LCRTERA (CRTERA>>16) +#define LTILDE (TILDE>>16) +#define LMDMBUF (MDMBUF>>16) +#define LLITOUT (LITOUT>>16) +#define LTOSTOP (TOSTOP>>16) +#define LFLUSHO (FLUSHO>>16) +#define LNOHANG (NOHANG>>16) +#define LCRTKIL (CRTKIL>>16) +#define LPASS8 (PASS8>>16) +#define LCTLECH (CTLECH>>16) +#define LPENDIN (PENDIN>>16) +#define LDECCTQ (DECCTQ>>16) +#define LNOFLSH (NOFLSH>>16) +#define TIOCSLTC _IOW('t',117,struct ltchars)/* set local special chars*/ +#define TIOCGLTC _IOR('t',116,struct ltchars)/* get local special chars*/ +#define 
OTIOCCONS _IO('t', 98) /* for hp300 -- sans int arg */ +#define OTTYDISC 0 +#define NETLDISC 1 +#define NTTYDISC 2 + +#endif /* !_SYS_IOCTL_COMPAT_H_ */ diff --git a/sys/sys/ipc.h b/sys/sys/ipc.h new file mode 100644 index 00000000000..cc036a8e83b --- /dev/null +++ b/sys/sys/ipc.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 1988 University of Utah. + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ipc.h 8.3 (Berkeley) 1/21/94 + */ + +/* + * SVID compatible ipc.h file + */ +#ifndef _SYS_IPC_H_ +#define _SYS_IPC_H_ + +typedef long key_t; /* XXX should be in types.h */ + +struct ipc_perm { + ushort cuid; /* creator user id */ + ushort cgid; /* creator group id */ + ushort uid; /* user id */ + ushort gid; /* group id */ + ushort mode; /* r/w permission */ + ushort seq; /* sequence # (to generate unique msg/sem/shm id) */ + key_t key; /* user specified msg/sem/shm key */ +}; + +/* common mode bits */ +#define IPC_R 00400 /* read permission */ +#define IPC_W 00200 /* write/alter permission */ + +/* SVID required constants (same values as system 5) */ +#define IPC_CREAT 01000 /* create entry if key does not exist */ +#define IPC_EXCL 02000 /* fail if key exists */ +#define IPC_NOWAIT 04000 /* error if request must wait */ + +#define IPC_PRIVATE (key_t)0 /* private key */ + +#define IPC_RMID 0 /* remove identifier */ +#define IPC_SET 1 /* set options */ +#define IPC_STAT 2 /* get options */ + +#endif /* !_SYS_IPC_H_ */ diff --git a/sys/sys/kernel.h b/sys/sys/kernel.h new file mode 100644 index 00000000000..682e6c8c194 --- /dev/null +++ b/sys/sys/kernel.h @@ -0,0 +1,59 @@ +/*- + * 
Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kernel.h 8.3 (Berkeley) 1/21/94 + */ + +/* Global variables for the kernel. */ + +/* 1.1 */ +extern long hostid; +extern char hostname[MAXHOSTNAMELEN]; +extern int hostnamelen; + +/* 1.2 */ +extern volatile struct timeval mono_time; +extern struct timeval boottime; +extern struct timeval runtime; +extern volatile struct timeval time; +extern struct timezone tz; /* XXX */ + +extern int tick; /* usec per tick (1000000 / hz) */ +extern int hz; /* system clock's frequency */ +extern int stathz; /* statistics clock's frequency */ +extern int profhz; /* profiling clock's frequency */ +extern int lbolt; /* once a second sleep address */ diff --git a/sys/sys/ktrace.h b/sys/sys/ktrace.h new file mode 100644 index 00000000000..1623c3562fe --- /dev/null +++ b/sys/sys/ktrace.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ktrace.h 8.1 (Berkeley) 6/2/93 + */ + +/* + * operations to ktrace system call (KTROP(op)) + */ +#define KTROP_SET 0 /* set trace points */ +#define KTROP_CLEAR 1 /* clear trace points */ +#define KTROP_CLEARFILE 2 /* stop all tracing to file */ +#define KTROP(o) ((o)&3) /* macro to extract operation */ +/* + * flags (ORed in with operation) + */ +#define KTRFLAG_DESCEND 4 /* perform op on all children too */ + +/* + * ktrace record header + */ +struct ktr_header { + int ktr_len; /* length of buf */ + short ktr_type; /* trace record type */ + pid_t ktr_pid; /* process id */ + char ktr_comm[MAXCOMLEN+1]; /* command name */ + struct timeval ktr_time; /* timestamp */ + caddr_t ktr_buf; +}; + +/* + * Test for kernel trace point + */ +#define KTRPOINT(p, type) \ + (((p)->p_traceflag & ((1<<(type))|KTRFAC_ACTIVE)) == (1<<(type))) + +/* + * ktrace record types + */ + +/* + * KTR_SYSCALL - system call record + */ +#define KTR_SYSCALL 1 +struct ktr_syscall { + short ktr_code; /* syscall number */ + short ktr_narg; /* number of arguments */ + /* + * followed by ktr_narg ints + */ +}; + +/* + * KTR_SYSRET - return from system call record + */ +#define KTR_SYSRET 2 +struct ktr_sysret { + short ktr_code; + short ktr_eosys; + int ktr_error; + int ktr_retval; +}; + +/* + * KTR_NAMEI - namei record + */ +#define KTR_NAMEI 3 + /* record contains pathname */ + +/* + * KTR_GENIO - trace generic process i/o + */ +#define KTR_GENIO 4 +struct ktr_genio { + int ktr_fd; + enum uio_rw ktr_rw; + /* + * followed by data successfully read/written + */ +}; + +/* + * KTR_PSIG - trace processed signal + */ +#define KTR_PSIG 5 +struct ktr_psig { + int signo; + sig_t action; + int mask; + int code; +}; + +/* + * KTR_CSW - trace context switches + */ +#define KTR_CSW 6 +struct ktr_csw { + int out; /* 1 if switch out, 0 if switch in */ + int user; /* 1 if usermode (ivcsw), 0 if kernel (vcsw) */ +}; + +/* + * kernel trace points (in p_traceflag) + */ +#define KTRFAC_MASK 0x00ffffff 
+#define KTRFAC_SYSCALL (1<<KTR_SYSCALL) +#define KTRFAC_SYSRET (1<<KTR_SYSRET) +#define KTRFAC_NAMEI (1<<KTR_NAMEI) +#define KTRFAC_GENIO (1<<KTR_GENIO) +#define KTRFAC_PSIG (1<<KTR_PSIG) +#define KTRFAC_CSW (1<<KTR_CSW) +/* + * trace flags (also in p_traceflag) + */ +#define KTRFAC_ROOT 0x80000000 /* root set this trace */ +#define KTRFAC_INHERIT 0x40000000 /* pass trace flags to children */ +#define KTRFAC_ACTIVE 0x20000000 /* ktrace logging in progress, ignore */ + +#ifndef KERNEL + +#include <sys/cdefs.h> + +__BEGIN_DECLS +int ktrace __P((const char *, int, int, pid_t)); +__END_DECLS + +#endif /* !KERNEL */ diff --git a/sys/sys/libkern.h b/sys/sys/libkern.h new file mode 100644 index 00000000000..0e465e03dfd --- /dev/null +++ b/sys/sys/libkern.h @@ -0,0 +1,98 @@ +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)libkern.h 8.1 (Berkeley) 6/10/93 + */ + +#include <sys/types.h> + +static inline int +imax(a, b) + int a, b; +{ + return (a > b ? a : b); +} +static inline int +imin(a, b) + int a, b; +{ + return (a < b ? a : b); +} +static inline long +lmax(a, b) + long a, b; +{ + return (a > b ? a : b); +} +static inline long +lmin(a, b) + long a, b; +{ + return (a < b ? a : b); +} +static inline u_int +max(a, b) + u_int a, b; +{ + return (a > b ? a : b); +} +static inline u_int +min(a, b) + u_int a, b; +{ + return (a < b ? a : b); +} +static inline u_long +ulmax(a, b) + u_long a, b; +{ + return (a > b ? a : b); +} +static inline u_long +ulmin(a, b) + u_long a, b; +{ + return (a < b ? a : b); +} + +/* Prototypes for non-quad routines. */ +int bcmp __P((const void *, const void *, size_t)); +int ffs __P((int)); +int locc __P((int, char *, u_int)); +u_long random __P((void)); +char *rindex __P((const char *, int)); +int scanc __P((u_int, u_char *, u_char *, int)); +int skpc __P((int, int, char *)); +char *strcat __P((char *, const char *)); +char *strcpy __P((char *, const char *)); +size_t strlen __P((const char *)); +char *strncpy __P((char *, const char *, size_t)); diff --git a/sys/sys/linedisc.h b/sys/sys/linedisc.h new file mode 100644 index 00000000000..58cb6fa8339 --- /dev/null +++ b/sys/sys/linedisc.h @@ -0,0 +1,123 @@ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)conf.h 8.3 (Berkeley) 1/21/94 + */ + +/* + * Definitions of device driver entry switches + */ + +struct buf; +struct proc; +struct tty; +struct uio; +struct vnode; + +struct bdevsw { + int (*d_open) __P((dev_t dev, int oflags, int devtype, + struct proc *p)); + int (*d_close) __P((dev_t dev, int fflag, int devtype, + struct proc *p)); + int (*d_strategy) __P((struct buf *bp)); + int (*d_ioctl) __P((dev_t dev, int cmd, caddr_t data, + int fflag, struct proc *p)); + int (*d_dump) (); /* parameters vary by architecture */ + int (*d_psize) __P((dev_t dev)); + int d_flags; +}; + +#ifdef KERNEL +extern struct bdevsw bdevsw[]; +#endif + +struct cdevsw { + int (*d_open) __P((dev_t dev, int oflags, int devtype, + struct proc *p)); + int (*d_close) __P((dev_t dev, int fflag, int devtype, + struct proc *)); + int (*d_read) __P((dev_t dev, struct uio *uio, int ioflag)); + int (*d_write) __P((dev_t dev, struct uio *uio, int ioflag)); + int (*d_ioctl) __P((dev_t dev, int cmd, caddr_t data, + int fflag, struct proc *p)); + int (*d_stop) __P((struct tty *tp, int rw)); + int (*d_reset) __P((int uban)); /* XXX */ + struct tty *d_ttys; + int (*d_select) __P((dev_t dev, int which, struct proc *p)); + int (*d_mmap) __P(()); + int (*d_strategy) __P((struct buf *bp)); +}; + +#ifdef KERNEL +extern struct cdevsw cdevsw[]; + +/* symbolic sleep message strings */ +extern char devopn[], devio[], devwait[], devin[], devout[]; +extern char devioc[], devcls[]; 
+#endif + +struct linesw { + int (*l_open) __P((dev_t dev, struct tty *tp)); + int (*l_close) __P((struct tty *tp, int flag)); + int (*l_read) __P((struct tty *tp, struct uio *uio, + int flag)); + int (*l_write) __P((struct tty *tp, struct uio *uio, + int flag)); + int (*l_ioctl) __P((struct tty *tp, int cmd, caddr_t data, + int flag, struct proc *p)); + int (*l_rint) __P((int c, struct tty *tp)); + int (*l_start) __P((struct tty *tp)); + int (*l_modem) __P((struct tty *tp, int flag)); +}; + +#ifdef KERNEL +extern struct linesw linesw[]; +#endif + +struct swdevt { + dev_t sw_dev; + int sw_flags; + int sw_nblks; + struct vnode *sw_vp; +}; +#define SW_FREED 0x01 +#define SW_SEQUENTIAL 0x02 +#define sw_freed sw_flags /* XXX compat */ + +#ifdef KERNEL +extern struct swdevt swdevt[]; +#endif diff --git a/sys/sys/malloc.h b/sys/sys/malloc.h new file mode 100644 index 00000000000..ba67bda1f5a --- /dev/null +++ b/sys/sys/malloc.h @@ -0,0 +1,306 @@ +/* + * Copyright (c) 1987, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)malloc.h 8.3 (Berkeley) 1/12/94 + */ + +#ifndef _SYS_MALLOC_H_ +#define _SYS_MALLOC_H_ + +#define KMEMSTATS + +/* + * flags to malloc + */ +#define M_WAITOK 0x0000 +#define M_NOWAIT 0x0001 + +/* + * Types of memory to be allocated + */ +#define M_FREE 0 /* should be on free list */ +#define M_MBUF 1 /* mbuf */ +#define M_DEVBUF 2 /* device driver memory */ +#define M_SOCKET 3 /* socket structure */ +#define M_PCB 4 /* protocol control block */ +#define M_RTABLE 5 /* routing tables */ +#define M_HTABLE 6 /* IMP host tables */ +#define M_FTABLE 7 /* fragment reassembly header */ +#define M_ZOMBIE 8 /* zombie proc status */ +#define M_IFADDR 9 /* interface address */ +#define M_SOOPTS 10 /* socket options */ +#define M_SONAME 11 /* socket name */ +#define M_NAMEI 12 /* namei path name buffer */ +#define M_GPROF 13 /* kernel profiling buffer */ +#define M_IOCTLOPS 14 /* ioctl data buffer */ +#define M_MAPMEM 15 /* mapped memory descriptors */ +#define M_CRED 16 /* credentials */ +#define M_PGRP 17 
/* process group header */ +#define M_SESSION 18 /* session header */ +#define M_IOV 19 /* large iov's */ +#define M_MOUNT 20 /* vfs mount struct */ +#define M_FHANDLE 21 /* network file handle */ +#define M_NFSREQ 22 /* NFS request header */ +#define M_NFSMNT 23 /* NFS mount structure */ +#define M_NFSNODE 24 /* NFS vnode private part */ +#define M_VNODE 25 /* Dynamically allocated vnodes */ +#define M_CACHE 26 /* Dynamically allocated cache entries */ +#define M_DQUOT 27 /* UFS quota entries */ +#define M_UFSMNT 28 /* UFS mount structure */ +#define M_SHM 29 /* SVID compatible shared memory segments */ +#define M_VMMAP 30 /* VM map structures */ +#define M_VMMAPENT 31 /* VM map entry structures */ +#define M_VMOBJ 32 /* VM object structure */ +#define M_VMOBJHASH 33 /* VM object hash structure */ +#define M_VMPMAP 34 /* VM pmap */ +#define M_VMPVENT 35 /* VM phys-virt mapping entry */ +#define M_VMPAGER 36 /* XXX: VM pager struct */ +#define M_VMPGDATA 37 /* XXX: VM pager private data */ +#define M_FILE 38 /* Open file structure */ +#define M_FILEDESC 39 /* Open file descriptor table */ +#define M_LOCKF 40 /* Byte-range locking structures */ +#define M_PROC 41 /* Proc structures */ +#define M_SUBPROC 42 /* Proc sub-structures */ +#define M_SEGMENT 43 /* Segment for LFS */ +#define M_LFSNODE 44 /* LFS vnode private part */ +#define M_FFSNODE 45 /* FFS vnode private part */ +#define M_MFSNODE 46 /* MFS vnode private part */ +#define M_NQLEASE 47 /* Nqnfs lease */ +#define M_NQMHOST 48 /* Nqnfs host address table */ +#define M_NETADDR 49 /* Export host address structure */ +#define M_NFSSVC 50 /* Nfs server structure */ +#define M_NFSUID 51 /* Nfs uid mapping structure */ +#define M_NFSD 52 /* Nfs server daemon structure */ +#define M_IPMOPTS 53 /* internet multicast options */ +#define M_IPMADDR 54 /* internet multicast address */ +#define M_IFMADDR 55 /* link-level multicast address */ +#define M_MRTABLE 56 /* multicast routing tables */ +#define M_ISOFSMNT 57 /* 
ISOFS mount structure */ +#define M_ISOFSNODE 58 /* ISOFS vnode private part */ +#define M_TEMP 74 /* misc temporary data buffers */ +#define M_LAST 75 /* Must be last type + 1 */ + +#define INITKMEMNAMES { \ + "free", /* 0 M_FREE */ \ + "mbuf", /* 1 M_MBUF */ \ + "devbuf", /* 2 M_DEVBUF */ \ + "socket", /* 3 M_SOCKET */ \ + "pcb", /* 4 M_PCB */ \ + "routetbl", /* 5 M_RTABLE */ \ + "hosttbl", /* 6 M_HTABLE */ \ + "fragtbl", /* 7 M_FTABLE */ \ + "zombie", /* 8 M_ZOMBIE */ \ + "ifaddr", /* 9 M_IFADDR */ \ + "soopts", /* 10 M_SOOPTS */ \ + "soname", /* 11 M_SONAME */ \ + "namei", /* 12 M_NAMEI */ \ + "gprof", /* 13 M_GPROF */ \ + "ioctlops", /* 14 M_IOCTLOPS */ \ + "mapmem", /* 15 M_MAPMEM */ \ + "cred", /* 16 M_CRED */ \ + "pgrp", /* 17 M_PGRP */ \ + "session", /* 18 M_SESSION */ \ + "iov", /* 19 M_IOV */ \ + "mount", /* 20 M_MOUNT */ \ + "fhandle", /* 21 M_FHANDLE */ \ + "NFS req", /* 22 M_NFSREQ */ \ + "NFS mount", /* 23 M_NFSMNT */ \ + "NFS node", /* 24 M_NFSNODE */ \ + "vnodes", /* 25 M_VNODE */ \ + "namecache", /* 26 M_CACHE */ \ + "UFS quota", /* 27 M_DQUOT */ \ + "UFS mount", /* 28 M_UFSMNT */ \ + "shm", /* 29 M_SHM */ \ + "VM map", /* 30 M_VMMAP */ \ + "VM mapent", /* 31 M_VMMAPENT */ \ + "VM object", /* 32 M_VMOBJ */ \ + "VM objhash", /* 33 M_VMOBJHASH */ \ + "VM pmap", /* 34 M_VMPMAP */ \ + "VM pvmap", /* 35 M_VMPVENT */ \ + "VM pager", /* 36 M_VMPAGER */ \ + "VM pgdata", /* 37 M_VMPGDATA */ \ + "file", /* 38 M_FILE */ \ + "file desc", /* 39 M_FILEDESC */ \ + "lockf", /* 40 M_LOCKF */ \ + "proc", /* 41 M_PROC */ \ + "subproc", /* 42 M_SUBPROC */ \ + "LFS segment", /* 43 M_SEGMENT */ \ + "LFS node", /* 44 M_LFSNODE */ \ + "FFS node", /* 45 M_FFSNODE */ \ + "MFS node", /* 46 M_MFSNODE */ \ + "NQNFS Lease", /* 47 M_NQLEASE */ \ + "NQNFS Host", /* 48 M_NQMHOST */ \ + "Export Host", /* 49 M_NETADDR */ \ + "NFS srvsock", /* 50 M_NFSSVC */ \ + "NFS uid", /* 51 M_NFSUID */ \ + "NFS daemon", /* 52 M_NFSD */ \ + "ip_moptions", /* 53 M_IPMOPTS */ \ + "in_multi", /* 54 
M_IPMADDR */ \ + "ether_multi", /* 55 M_IFMADDR */ \ + "mrt", /* 56 M_MRTABLE */ \ + "ISOFS mount", /* 57 M_ISOFSMNT */ \ + "ISOFS node", /* 58 M_ISOFSNODE */ \ + NULL, NULL, NULL, NULL, NULL, \ + NULL, NULL, NULL, NULL, NULL, \ + NULL, NULL, NULL, NULL, NULL, \ + "temp", /* 74 M_TEMP */ \ +} + +struct kmemstats { + long ks_inuse; /* # of packets of this type currently in use */ + long ks_calls; /* total packets of this type ever allocated */ + long ks_memuse; /* total memory held in bytes */ + u_short ks_limblocks; /* number of times blocked for hitting limit */ + u_short ks_mapblocks; /* number of times blocked for kernel map */ + long ks_maxused; /* maximum number ever used */ + long ks_limit; /* most that are allowed to exist */ + long ks_size; /* sizes of this thing that are allocated */ + long ks_spare; +}; + +/* + * Array of descriptors that describe the contents of each page + */ +struct kmemusage { + short ku_indx; /* bucket index */ + union { + u_short freecnt;/* for small allocations, free pieces in page */ + u_short pagecnt;/* for large allocations, pages alloced */ + } ku_un; +}; +#define ku_freecnt ku_un.freecnt +#define ku_pagecnt ku_un.pagecnt + +/* + * Set of buckets for each size of memory block that is retained + */ +struct kmembuckets { + caddr_t kb_next; /* list of free blocks */ + caddr_t kb_last; /* last free block */ + long kb_calls; /* total calls to allocate this size */ + long kb_total; /* total number of blocks allocated */ + long kb_totalfree; /* # of free elements in this bucket */ + long kb_elmpercl; /* # of elements in this sized allocation */ + long kb_highwat; /* high water mark */ + long kb_couldfree; /* over high water mark and could free */ +}; + +#ifdef KERNEL +#define MINALLOCSIZE (1 << MINBUCKET) +#define BUCKETINDX(size) \ + ((size) <= (MINALLOCSIZE * 128) \ + ? (size) <= (MINALLOCSIZE * 8) \ + ? (size) <= (MINALLOCSIZE * 2) \ + ? (size) <= (MINALLOCSIZE * 1) \ + ? 
(MINBUCKET + 0) \ + : (MINBUCKET + 1) \ + : (size) <= (MINALLOCSIZE * 4) \ + ? (MINBUCKET + 2) \ + : (MINBUCKET + 3) \ + : (size) <= (MINALLOCSIZE* 32) \ + ? (size) <= (MINALLOCSIZE * 16) \ + ? (MINBUCKET + 4) \ + : (MINBUCKET + 5) \ + : (size) <= (MINALLOCSIZE * 64) \ + ? (MINBUCKET + 6) \ + : (MINBUCKET + 7) \ + : (size) <= (MINALLOCSIZE * 2048) \ + ? (size) <= (MINALLOCSIZE * 512) \ + ? (size) <= (MINALLOCSIZE * 256) \ + ? (MINBUCKET + 8) \ + : (MINBUCKET + 9) \ + : (size) <= (MINALLOCSIZE * 1024) \ + ? (MINBUCKET + 10) \ + : (MINBUCKET + 11) \ + : (size) <= (MINALLOCSIZE * 8192) \ + ? (size) <= (MINALLOCSIZE * 4096) \ + ? (MINBUCKET + 12) \ + : (MINBUCKET + 13) \ + : (size) <= (MINALLOCSIZE * 16384) \ + ? (MINBUCKET + 14) \ + : (MINBUCKET + 15)) + +/* + * Turn virtual addresses into kmem map indices + */ +#define kmemxtob(alloc) (kmembase + (alloc) * NBPG) +#define btokmemx(addr) (((caddr_t)(addr) - kmembase) / NBPG) +#define btokup(addr) (&kmemusage[((caddr_t)(addr) - kmembase) >> CLSHIFT]) + +/* + * Macro versions for the usual cases of malloc/free + */ +#if defined(KMEMSTATS) || defined(DIAGNOSTIC) +#define MALLOC(space, cast, size, type, flags) \ + (space) = (cast)malloc((u_long)(size), type, flags) +#define FREE(addr, type) free((caddr_t)(addr), type) + +#else /* do not collect statistics */ +#define MALLOC(space, cast, size, type, flags) { \ + register struct kmembuckets *kbp = &bucket[BUCKETINDX(size)]; \ + long s = splimp(); \ + if (kbp->kb_next == NULL) { \ + (space) = (cast)malloc((u_long)(size), type, flags); \ + } else { \ + (space) = (cast)kbp->kb_next; \ + kbp->kb_next = *(caddr_t *)(space); \ + } \ + splx(s); \ +} + +#define FREE(addr, type) { \ + register struct kmembuckets *kbp; \ + register struct kmemusage *kup = btokup(addr); \ + long s = splimp(); \ + if (1 << kup->ku_indx > MAXALLOCSAVE) { \ + free((caddr_t)(addr), type); \ + } else { \ + kbp = &bucket[kup->ku_indx]; \ + if (kbp->kb_next == NULL) \ + kbp->kb_next = (caddr_t)(addr); \ +
else \ + *(caddr_t *)(kbp->kb_last) = (caddr_t)(addr); \ + *(caddr_t *)(addr) = NULL; \ + kbp->kb_last = (caddr_t)(addr); \ + } \ + splx(s); \ +} +#endif /* do not collect statistics */ + +extern struct kmemstats kmemstats[]; +extern struct kmemusage *kmemusage; +extern char *kmembase; +extern struct kmembuckets bucket[]; +extern void *malloc __P((unsigned long size, int type, int flags)); +extern void free __P((void *addr, int type)); +#endif /* KERNEL */ +#endif /* !_SYS_MALLOC_H_ */ diff --git a/sys/sys/map.h b/sys/sys/map.h new file mode 100644 index 00000000000..6cec4b55653 --- /dev/null +++ b/sys/sys/map.h @@ -0,0 +1,82 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)map.h 8.3 (Berkeley) 1/26/94 + */ + +/* + * Resource allocation maps. + * + * Associated routines manage sub-allocation of an address space using + * an array of segment descriptors. The first element of this array + * is a map structure, describing the array's extent and the name + * of the controlled object. Each additional structure represents + * a free segment of the address space. + * + * A call to rminit initializes a resource map and may also be used + * to free some address space for the map. Subsequent calls to rmalloc + * and rmfree allocate and free space in the resource map. If the resource + * map becomes too fragmented to be described in the available space, + * then some of the resource is discarded. This may lead to critical + * shortages, but is better than not checking (as the previous versions + * of these routines did) or giving up and calling panic().
The routines + * could use linked lists and call a memory allocator when they run + * out of space, but that would not solve the out of space problem when + * called at interrupt time. + * + * N.B.: The address 0 in the resource address space is not available + * as it is used internally by the resource map routines. + */ +struct map { + struct mapent *m_limit; /* address of last slot in map */ + char *m_name; /* name of resource, for messages */ +}; + +struct mapent { + long m_size; /* size of this segment of the map */ + long m_addr; /* start of segment */ +}; + +#ifdef KERNEL +#define ARGMAPSIZE 16 +struct map *kmemmap, *mbmap, *swapmap; +int nswapmap; + +long rmalloc __P((struct map *, long)); +void rmfree __P((struct map *, long, long)); +void rminit __P((struct map *, long, long, char *, int)); +#endif diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h new file mode 100644 index 00000000000..f3ea7edefe6 --- /dev/null +++ b/sys/sys/mbuf.h @@ -0,0 +1,402 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)mbuf.h 8.3 (Berkeley) 1/21/94 + */ + +#ifndef M_WAITOK +#include <sys/malloc.h> +#endif + +/* + * Mbufs are of a single size, MSIZE (machine/machparam.h), which + * includes overhead. An mbuf may add a single "mbuf cluster" of size + * MCLBYTES (also in machine/machparam.h), which has no additional overhead + * and is used instead of the internal data area; this is done when + * at least MINCLSIZE of data must be stored.
+ */ + +#define MLEN (MSIZE - sizeof(struct m_hdr)) /* normal data len */ +#define MHLEN (MLEN - sizeof(struct pkthdr)) /* data len w/pkthdr */ + +#define MINCLSIZE (MHLEN + MLEN) /* smallest amount to put in cluster */ +#define M_MAXCOMPRESS (MHLEN / 2) /* max amount to copy for compression */ + +/* + * Macros for type conversion + * mtod(m,t) - convert mbuf pointer to data pointer of correct type + * dtom(x) - convert data pointer within mbuf to mbuf pointer (XXX) + * mtocl(x) - convert pointer within cluster to cluster index # + * cltom(x) - convert cluster # to ptr to beginning of cluster + */ +#define mtod(m,t) ((t)((m)->m_data)) +#define dtom(x) ((struct mbuf *)((int)(x) & ~(MSIZE-1))) +#define mtocl(x) (((u_int)(x) - (u_int)mbutl) >> MCLSHIFT) +#define cltom(x) ((caddr_t)((u_int)mbutl + ((u_int)(x) << MCLSHIFT))) + +/* header at beginning of each mbuf: */ +struct m_hdr { + struct mbuf *mh_next; /* next buffer in chain */ + struct mbuf *mh_nextpkt; /* next chain in queue/record */ + int mh_len; /* amount of data in this mbuf */ + caddr_t mh_data; /* location of data */ + short mh_type; /* type of data in this mbuf */ + short mh_flags; /* flags; see below */ +}; + +/* record/packet header in first mbuf of chain; valid if M_PKTHDR set */ +struct pkthdr { + int len; /* total packet length */ + struct ifnet *rcvif; /* rcv interface */ +}; + +/* description of external storage mapped into mbuf, valid if M_EXT set */ +struct m_ext { + caddr_t ext_buf; /* start of buffer */ + void (*ext_free)(); /* free routine if not the usual */ + u_int ext_size; /* size of buffer, for ext_free */ +}; + +struct mbuf { + struct m_hdr m_hdr; + union { + struct { + struct pkthdr MH_pkthdr; /* M_PKTHDR set */ + union { + struct m_ext MH_ext; /* M_EXT set */ + char MH_databuf[MHLEN]; + } MH_dat; + } MH; + char M_databuf[MLEN]; /* !M_PKTHDR, !M_EXT */ + } M_dat; +}; +#define m_next m_hdr.mh_next +#define m_len m_hdr.mh_len +#define m_data m_hdr.mh_data +#define m_type m_hdr.mh_type 
+#define m_flags m_hdr.mh_flags +#define m_nextpkt m_hdr.mh_nextpkt +#define m_act m_nextpkt +#define m_pkthdr M_dat.MH.MH_pkthdr +#define m_ext M_dat.MH.MH_dat.MH_ext +#define m_pktdat M_dat.MH.MH_dat.MH_databuf +#define m_dat M_dat.M_databuf + +/* mbuf flags */ +#define M_EXT 0x0001 /* has associated external storage */ +#define M_PKTHDR 0x0002 /* start of record */ +#define M_EOR 0x0004 /* end of record */ + +/* mbuf pkthdr flags, also in m_flags */ +#define M_BCAST 0x0100 /* send/received as link-level broadcast */ +#define M_MCAST 0x0200 /* send/received as link-level multicast */ + +/* flags copied when copying m_pkthdr */ +#define M_COPYFLAGS (M_PKTHDR|M_EOR|M_BCAST|M_MCAST) + +/* mbuf types */ +#define MT_FREE 0 /* should be on free list */ +#define MT_DATA 1 /* dynamic (data) allocation */ +#define MT_HEADER 2 /* packet header */ +#define MT_SOCKET 3 /* socket structure */ +#define MT_PCB 4 /* protocol control block */ +#define MT_RTABLE 5 /* routing tables */ +#define MT_HTABLE 6 /* IMP host tables */ +#define MT_ATABLE 7 /* address resolution tables */ +#define MT_SONAME 8 /* socket name */ +#define MT_SOOPTS 10 /* socket options */ +#define MT_FTABLE 11 /* fragment reassembly header */ +#define MT_RIGHTS 12 /* access rights */ +#define MT_IFADDR 13 /* interface address */ +#define MT_CONTROL 14 /* extra-data protocol message */ +#define MT_OOBDATA 15 /* expedited data */ + +/* flags to m_get/MGET */ +#define M_DONTWAIT M_NOWAIT +#define M_WAIT M_WAITOK + +/* + * mbuf utility macros: + * + * MBUFLOCK(code) + * prevents a section of code from being interrupted by network + * drivers. + */ +#define MBUFLOCK(code) \ + { int ms = splimp(); \ + { code } \ + splx(ms); \ + } + +/* + * mbuf allocation/deallocation macros: + * + * MGET(struct mbuf *m, int how, int type) + * allocates an mbuf and initializes it to contain internal data.
+ * + * MGETHDR(struct mbuf *m, int how, int type) + * allocates an mbuf and initializes it to contain a packet header + * and internal data. + */ +#define MGET(m, how, type) { \ + MALLOC((m), struct mbuf *, MSIZE, mbtypes[type], (how)); \ + if (m) { \ + (m)->m_type = (type); \ + MBUFLOCK(mbstat.m_mtypes[type]++;) \ + (m)->m_next = (struct mbuf *)NULL; \ + (m)->m_nextpkt = (struct mbuf *)NULL; \ + (m)->m_data = (m)->m_dat; \ + (m)->m_flags = 0; \ + } else \ + (m) = m_retry((how), (type)); \ +} + +#define MGETHDR(m, how, type) { \ + MALLOC((m), struct mbuf *, MSIZE, mbtypes[type], (how)); \ + if (m) { \ + (m)->m_type = (type); \ + MBUFLOCK(mbstat.m_mtypes[type]++;) \ + (m)->m_next = (struct mbuf *)NULL; \ + (m)->m_nextpkt = (struct mbuf *)NULL; \ + (m)->m_data = (m)->m_pktdat; \ + (m)->m_flags = M_PKTHDR; \ + } else \ + (m) = m_retryhdr((how), (type)); \ +} + +/* + * Mbuf cluster macros. + * MCLALLOC(caddr_t p, int how) allocates an mbuf cluster. + * MCLGET adds such clusters to a normal mbuf; + * the flag M_EXT is set upon success. + * MCLFREE releases a reference to a cluster allocated by MCLALLOC, + * freeing the cluster if the reference count has reached 0. + * + * Normal mbuf clusters are normally treated as character arrays + * after allocation, but use the first word of the buffer as a free list + * pointer while on the free list. 
+ */ +union mcluster { + union mcluster *mcl_next; + char mcl_buf[MCLBYTES]; +}; + +#define MCLALLOC(p, how) \ + MBUFLOCK( \ + if (mclfree == 0) \ + (void)m_clalloc(1, (how)); \ + if ((p) = (caddr_t)mclfree) { \ + ++mclrefcnt[mtocl(p)]; \ + mbstat.m_clfree--; \ + mclfree = ((union mcluster *)(p))->mcl_next; \ + } \ + ) + +#define MCLGET(m, how) \ + { MCLALLOC((m)->m_ext.ext_buf, (how)); \ + if ((m)->m_ext.ext_buf != NULL) { \ + (m)->m_data = (m)->m_ext.ext_buf; \ + (m)->m_flags |= M_EXT; \ + (m)->m_ext.ext_size = MCLBYTES; \ + } \ + } + +#define MCLFREE(p) \ + MBUFLOCK ( \ + if (--mclrefcnt[mtocl(p)] == 0) { \ + ((union mcluster *)(p))->mcl_next = mclfree; \ + mclfree = (union mcluster *)(p); \ + mbstat.m_clfree++; \ + } \ + ) + +/* + * MFREE(struct mbuf *m, struct mbuf *n) + * Free a single mbuf and associated external storage. + * Place the successor, if any, in n. + */ +#ifdef notyet +#define MFREE(m, n) \ + { MBUFLOCK(mbstat.m_mtypes[(m)->m_type]--;) \ + if ((m)->m_flags & M_EXT) { \ + if ((m)->m_ext.ext_free) \ + (*((m)->m_ext.ext_free))((m)->m_ext.ext_buf, \ + (m)->m_ext.ext_size); \ + else \ + MCLFREE((m)->m_ext.ext_buf); \ + } \ + (n) = (m)->m_next; \ + FREE((m), mbtypes[(m)->m_type]); \ + } +#else /* notyet */ +#define MFREE(m, nn) \ + { MBUFLOCK(mbstat.m_mtypes[(m)->m_type]--;) \ + if ((m)->m_flags & M_EXT) { \ + MCLFREE((m)->m_ext.ext_buf); \ + } \ + (nn) = (m)->m_next; \ + FREE((m), mbtypes[(m)->m_type]); \ + } +#endif + +/* + * Copy mbuf pkthdr from from to to. + * from must have M_PKTHDR set, and to must be empty. + */ +#define M_COPY_PKTHDR(to, from) { \ + (to)->m_pkthdr = (from)->m_pkthdr; \ + (to)->m_flags = (from)->m_flags & M_COPYFLAGS; \ + (to)->m_data = (to)->m_pktdat; \ +} + +/* + * Set the m_data pointer of a newly-allocated mbuf (m_get/MGET) to place + * an object of the specified size at the end of the mbuf, longword aligned. 
+ */ +#define M_ALIGN(m, len) \ + { (m)->m_data += (MLEN - (len)) &~ (sizeof(long) - 1); } +/* + * As above, for mbufs allocated with m_gethdr/MGETHDR + * or initialized by M_COPY_PKTHDR. + */ +#define MH_ALIGN(m, len) \ + { (m)->m_data += (MHLEN - (len)) &~ (sizeof(long) - 1); } + +/* + * Compute the amount of space available + * before the current start of data in an mbuf. + */ +#define M_LEADINGSPACE(m) \ + ((m)->m_flags & M_EXT ? /* (m)->m_data - (m)->m_ext.ext_buf */ 0 : \ + (m)->m_flags & M_PKTHDR ? (m)->m_data - (m)->m_pktdat : \ + (m)->m_data - (m)->m_dat) + +/* + * Compute the amount of space available + * after the end of data in an mbuf. + */ +#define M_TRAILINGSPACE(m) \ + ((m)->m_flags & M_EXT ? (m)->m_ext.ext_buf + (m)->m_ext.ext_size - \ + ((m)->m_data + (m)->m_len) : \ + &(m)->m_dat[MLEN] - ((m)->m_data + (m)->m_len)) + +/* + * Arrange to prepend space of size plen to mbuf m. + * If a new mbuf must be allocated, how specifies whether to wait. + * If how is M_DONTWAIT and allocation fails, the original mbuf chain + * is freed and m is set to NULL. + */ +#define M_PREPEND(m, plen, how) { \ + if (M_LEADINGSPACE(m) >= (plen)) { \ + (m)->m_data -= (plen); \ + (m)->m_len += (plen); \ + } else \ + (m) = m_prepend((m), (plen), (how)); \ + if ((m) && (m)->m_flags & M_PKTHDR) \ + (m)->m_pkthdr.len += (plen); \ +} + +/* change mbuf to new type */ +#define MCHTYPE(m, t) { \ + MBUFLOCK(mbstat.m_mtypes[(m)->m_type]--; mbstat.m_mtypes[t]++;) \ + (m)->m_type = t;\ +} + +/* length to m_copy to copy all */ +#define M_COPYALL 1000000000 + +/* compatibility with 4.3 */ +#define m_copy(m, o, l) m_copym((m), (o), (l), M_DONTWAIT) + +/* + * Mbuf statistics.
+ */ +struct mbstat { + u_long m_mbufs; /* mbufs obtained from page pool */ + u_long m_clusters; /* clusters obtained from page pool */ + u_long m_spare; /* spare field */ + u_long m_clfree; /* free clusters */ + u_long m_drops; /* times failed to find space */ + u_long m_wait; /* times waited for space */ + u_long m_drain; /* times drained protocols for space */ + u_short m_mtypes[256]; /* type specific mbuf allocations */ +}; + +#ifdef KERNEL +extern struct mbuf *mbutl; /* virtual address of mclusters */ +extern char *mclrefcnt; /* cluster reference counts */ +struct mbstat mbstat; +extern int nmbclusters; +union mcluster *mclfree; +int max_linkhdr; /* largest link-level header */ +int max_protohdr; /* largest protocol header */ +int max_hdr; /* largest link+protocol header */ +int max_datalen; /* MHLEN - max_hdr */ +extern int mbtypes[]; /* XXX */ + +struct mbuf *m_copym __P((struct mbuf *, int, int, int)); +struct mbuf *m_free __P((struct mbuf *)); +struct mbuf *m_get __P((int, int)); +struct mbuf *m_getclr __P((int, int)); +struct mbuf *m_gethdr __P((int, int)); +struct mbuf *m_prepend __P((struct mbuf *, int, int)); +struct mbuf *m_pullup __P((struct mbuf *, int)); +struct mbuf *m_retry __P((int, int)); +struct mbuf *m_retryhdr __P((int, int)); +int m_clalloc __P((int, int)); +void m_copyback __P((struct mbuf *, int, int, caddr_t)); +void m_freem __P((struct mbuf *)); + +#ifdef MBTYPES +int mbtypes[] = { /* XXX */ + M_FREE, /* MT_FREE 0 should be on free list */ + M_MBUF, /* MT_DATA 1 dynamic (data) allocation */ + M_MBUF, /* MT_HEADER 2 packet header */ + M_SOCKET, /* MT_SOCKET 3 socket structure */ + M_PCB, /* MT_PCB 4 protocol control block */ + M_RTABLE, /* MT_RTABLE 5 routing tables */ + M_HTABLE, /* MT_HTABLE 6 IMP host tables */ + 0, /* MT_ATABLE 7 address resolution tables */ + M_MBUF, /* MT_SONAME 8 socket name */ + 0, /* 9 */ + M_SOOPTS, /* MT_SOOPTS 10 socket options */ + M_FTABLE, /* MT_FTABLE 11 fragment reassembly header */ + M_MBUF, /* 
MT_RIGHTS 12 access rights */ + M_IFADDR, /* MT_IFADDR 13 interface address */ + M_MBUF, /* MT_CONTROL 14 extra-data protocol message */ + M_MBUF, /* MT_OOBDATA 15 expedited data */ +#ifdef DATAKIT + 25, 26, 27, 28, 29, 30, 31, 32 /* datakit ugliness */ +#endif +}; +#endif +#endif diff --git a/sys/sys/mman.h b/sys/sys/mman.h new file mode 100644 index 00000000000..b3951c202cb --- /dev/null +++ b/sys/sys/mman.h @@ -0,0 +1,89 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)mman.h 8.1 (Berkeley) 6/2/93 + */ + +/* + * Protections are chosen from these bits, or-ed together + */ +#define PROT_READ 0x01 /* pages can be read */ +#define PROT_WRITE 0x02 /* pages can be written */ +#define PROT_EXEC 0x04 /* pages can be executed */ + +/* + * Flags contain sharing type and options. + * Sharing types; choose one. + */ +#define MAP_SHARED 0x0001 /* share changes */ +#define MAP_PRIVATE 0x0002 /* changes are private */ +#define MAP_COPY 0x0004 /* "copy" region at mmap time */ + +/* + * Other flags + */ +#define MAP_FIXED 0x0010 /* map addr must be exactly as requested */ +#define MAP_RENAME 0x0020 /* Sun: rename private pages to file */ +#define MAP_NORESERVE 0x0040 /* Sun: don't reserve needed swap area */ +#define MAP_INHERIT 0x0080 /* region is retained after exec */ +#define MAP_NOEXTEND 0x0100 /* for MAP_FILE, don't change file size */ +#define MAP_HASSEMAPHORE 0x0200 /* region may contain semaphores */ + +/* + * Mapping type; default is map from file. 
+ */ +#define MAP_ANON 0x1000 /* allocated from memory, swap space */ + +/* + * Advice to madvise + */ +#define MADV_NORMAL 0 /* no further special treatment */ +#define MADV_RANDOM 1 /* expect random page references */ +#define MADV_SEQUENTIAL 2 /* expect sequential page references */ +#define MADV_WILLNEED 3 /* will need these pages */ +#define MADV_DONTNEED 4 /* don't need these pages */ + +#ifndef KERNEL + +#include <sys/cdefs.h> + +__BEGIN_DECLS +/* Some of these int's should probably be size_t's */ +caddr_t mmap __P((caddr_t, size_t, int, int, int, off_t)); +int mprotect __P((caddr_t, size_t, int)); +int munmap __P((caddr_t, size_t)); +int msync __P((caddr_t, size_t)); +int mlock __P((caddr_t, size_t)); +int munlock __P((caddr_t, size_t)); +__END_DECLS + +#endif /* !KERNEL */ diff --git a/sys/sys/mount.h b/sys/sys/mount.h new file mode 100644 index 00000000000..4561675ef0d --- /dev/null +++ b/sys/sys/mount.h @@ -0,0 +1,418 @@ +/* + * Copyright (c) 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)mount.h 8.13 (Berkeley) 3/27/94 + */ + +#ifndef KERNEL +#include <sys/ucred.h> +#endif +#include <sys/queue.h> + +typedef struct { long val[2]; } fsid_t; /* file system id type */ + +/* + * File identifier. + * These are unique per filesystem on a single machine.
+ */ +#define MAXFIDSZ 16 + +struct fid { + u_short fid_len; /* length of data in bytes */ + u_short fid_reserved; /* force longword alignment */ + char fid_data[MAXFIDSZ]; /* data (variable length) */ +}; + +/* + * file system statistics + */ + +#define MNAMELEN 90 /* length of buffer for returned name */ + +struct statfs { + short f_type; /* type of filesystem (see below) */ + short f_flags; /* copy of mount flags */ + long f_bsize; /* fundamental file system block size */ + long f_iosize; /* optimal transfer block size */ + long f_blocks; /* total data blocks in file system */ + long f_bfree; /* free blocks in fs */ + long f_bavail; /* free blocks avail to non-superuser */ + long f_files; /* total file nodes in file system */ + long f_ffree; /* free file nodes in fs */ + fsid_t f_fsid; /* file system id */ + long f_spare[9]; /* spare for later */ + char f_mntonname[MNAMELEN]; /* directory on which mounted */ + char f_mntfromname[MNAMELEN];/* mounted filesystem */ +}; + +/* + * File system types. 
+ */ +#define MOUNT_NONE 0 +#define MOUNT_UFS 1 /* Fast Filesystem */ +#define MOUNT_NFS 2 /* Sun-compatible Network Filesystem */ +#define MOUNT_MFS 3 /* Memory-based Filesystem */ +#define MOUNT_MSDOS 4 /* MS/DOS Filesystem */ +#define MOUNT_LFS 5 /* Log-based Filesystem */ +#define MOUNT_LOFS 6 /* Loopback Filesystem */ +#define MOUNT_FDESC 7 /* File Descriptor Filesystem */ +#define MOUNT_PORTAL 8 /* Portal Filesystem */ +#define MOUNT_NULL 9 /* Minimal Filesystem Layer */ +#define MOUNT_UMAP 10 /* User/Group Identifier Remapping Filesystem */ +#define MOUNT_KERNFS 11 /* Kernel Information Filesystem */ +#define MOUNT_PROCFS 12 /* /proc Filesystem */ +#define MOUNT_AFS 13 /* Andrew Filesystem */ +#define MOUNT_CD9660 14 /* ISO9660 (aka CDROM) Filesystem */ +#define MOUNT_UNION 15 /* Union (translucent) Filesystem */ +#define MOUNT_MAXTYPE 15 + +#define INITMOUNTNAMES { \ + "none", /* 0 MOUNT_NONE */ \ + "ufs", /* 1 MOUNT_UFS */ \ + "nfs", /* 2 MOUNT_NFS */ \ + "mfs", /* 3 MOUNT_MFS */ \ + "msdos", /* 4 MOUNT_MSDOS */ \ + "lfs", /* 5 MOUNT_LFS */ \ + "lofs", /* 6 MOUNT_LOFS */ \ + "fdesc", /* 7 MOUNT_FDESC */ \ + "portal", /* 8 MOUNT_PORTAL */ \ + "null", /* 9 MOUNT_NULL */ \ + "umap", /* 10 MOUNT_UMAP */ \ + "kernfs", /* 11 MOUNT_KERNFS */ \ + "procfs", /* 12 MOUNT_PROCFS */ \ + "afs", /* 13 MOUNT_AFS */ \ + "iso9660fs", /* 14 MOUNT_CD9660 */ \ + "union", /* 15 MOUNT_UNION */ \ + 0, /* 16 MOUNT_SPARE */ \ +} + +/* + * Structure per mounted file system. Each mounted file system has an + * array of operations and an instance record. The file systems are + * put on a doubly linked list.
+ */ +LIST_HEAD(vnodelst, vnode); + +struct mount { + TAILQ_ENTRY(mount) mnt_list; /* mount list */ + struct vfsops *mnt_op; /* operations on fs */ + struct vnode *mnt_vnodecovered; /* vnode we mounted on */ + struct vnodelst mnt_vnodelist; /* list of vnodes this mount */ + int mnt_flag; /* flags */ + int mnt_maxsymlinklen; /* max size of short symlink */ + struct statfs mnt_stat; /* cache of filesystem stats */ + qaddr_t mnt_data; /* private data */ +}; + +/* + * Mount flags. + * + * Unmount uses MNT_FORCE flag. + */ +#define MNT_RDONLY 0x00000001 /* read only filesystem */ +#define MNT_SYNCHRONOUS 0x00000002 /* file system written synchronously */ +#define MNT_NOEXEC 0x00000004 /* can't exec from filesystem */ +#define MNT_NOSUID 0x00000008 /* don't honor setuid bits on fs */ +#define MNT_NODEV 0x00000010 /* don't interpret special files */ +#define MNT_UNION 0x00000020 /* union with underlying filesystem */ +#define MNT_ASYNC 0x00000040 /* file system written asynchronously */ + +/* + * exported mount flags. + */ +#define MNT_EXRDONLY 0x00000080 /* exported read only */ +#define MNT_EXPORTED 0x00000100 /* file system is exported */ +#define MNT_DEFEXPORTED 0x00000200 /* exported to the world */ +#define MNT_EXPORTANON 0x00000400 /* use anon uid mapping for everyone */ +#define MNT_EXKERB 0x00000800 /* exported with Kerberos uid mapping */ + +/* + * Flags set by internal operations. + */ +#define MNT_LOCAL 0x00001000 /* filesystem is stored locally */ +#define MNT_QUOTA 0x00002000 /* quotas are enabled on filesystem */ +#define MNT_ROOTFS 0x00004000 /* identifies the root filesystem */ +#define MNT_USER 0x00008000 /* mounted by a user */ + +/* + * Mask of flags that are visible to statfs() + */ +#define MNT_VISFLAGMASK 0x0000ffff + +/* + * filesystem control flags. + * + * MNT_MLOCK lock the mount entry so that name lookup cannot proceed + * past the mount point. This keeps the subtree stable during mounts + * and unmounts. 
+ */ +#define MNT_UPDATE 0x00010000 /* not a real mount, just an update */ +#define MNT_DELEXPORT 0x00020000 /* delete export host lists */ +#define MNT_RELOAD 0x00040000 /* reload filesystem data */ +#define MNT_FORCE 0x00080000 /* force unmount or readonly change */ +#define MNT_MLOCK 0x00100000 /* lock so that subtree is stable */ +#define MNT_MWAIT 0x00200000 /* someone is waiting for lock */ +#define MNT_MPBUSY 0x00400000 /* scan of mount point in progress */ +#define MNT_MPWANT 0x00800000 /* waiting for mount point */ +#define MNT_UNMOUNT 0x01000000 /* unmount in progress */ +#define MNT_WANTRDWR 0x02000000 /* want upgrade to read/write */ + +/* + * Operations supported on mounted file system. + */ +#ifdef KERNEL +#ifdef __STDC__ +struct nameidata; +struct mbuf; +#endif + +struct vfsops { + int (*vfs_mount) __P((struct mount *mp, char *path, caddr_t data, + struct nameidata *ndp, struct proc *p)); + int (*vfs_start) __P((struct mount *mp, int flags, + struct proc *p)); + int (*vfs_unmount) __P((struct mount *mp, int mntflags, + struct proc *p)); + int (*vfs_root) __P((struct mount *mp, struct vnode **vpp)); + int (*vfs_quotactl) __P((struct mount *mp, int cmds, uid_t uid, + caddr_t arg, struct proc *p)); + int (*vfs_statfs) __P((struct mount *mp, struct statfs *sbp, + struct proc *p)); + int (*vfs_sync) __P((struct mount *mp, int waitfor, + struct ucred *cred, struct proc *p)); + int (*vfs_vget) __P((struct mount *mp, ino_t ino, + struct vnode **vpp)); + int (*vfs_fhtovp) __P((struct mount *mp, struct fid *fhp, + struct mbuf *nam, struct vnode **vpp, + int *exflagsp, struct ucred **credanonp)); + int (*vfs_vptofh) __P((struct vnode *vp, struct fid *fhp)); + int (*vfs_init) __P((void)); +}; + +#define VFS_MOUNT(MP, PATH, DATA, NDP, P) \ + (*(MP)->mnt_op->vfs_mount)(MP, PATH, DATA, NDP, P) +#define VFS_START(MP, FLAGS, P) (*(MP)->mnt_op->vfs_start)(MP, FLAGS, P) +#define VFS_UNMOUNT(MP, FORCE, P) (*(MP)->mnt_op->vfs_unmount)(MP, FORCE, P) +#define VFS_ROOT(MP, 
VPP) (*(MP)->mnt_op->vfs_root)(MP, VPP) +#define VFS_QUOTACTL(MP,C,U,A,P) (*(MP)->mnt_op->vfs_quotactl)(MP, C, U, A, P) +#define VFS_STATFS(MP, SBP, P) (*(MP)->mnt_op->vfs_statfs)(MP, SBP, P) +#define VFS_SYNC(MP, WAIT, C, P) (*(MP)->mnt_op->vfs_sync)(MP, WAIT, C, P) +#define VFS_VGET(MP, INO, VPP) (*(MP)->mnt_op->vfs_vget)(MP, INO, VPP) +#define VFS_FHTOVP(MP, FIDP, NAM, VPP, EXFLG, CRED) \ + (*(MP)->mnt_op->vfs_fhtovp)(MP, FIDP, NAM, VPP, EXFLG, CRED) +#define VFS_VPTOFH(VP, FIDP) (*(VP)->v_mount->mnt_op->vfs_vptofh)(VP, FIDP) +#endif /* KERNEL */ + +/* + * Flags for various system call interfaces. + * + * waitfor flags to vfs_sync() and getfsstat() + */ +#define MNT_WAIT 1 +#define MNT_NOWAIT 2 + +/* + * Generic file handle + */ +struct fhandle { + fsid_t fh_fsid; /* File system id of mount point */ + struct fid fh_fid; /* File sys specific id */ +}; +typedef struct fhandle fhandle_t; + +#ifdef KERNEL +#include <net/radix.h> +#include <sys/socket.h> /* XXX for AF_MAX */ + +/* + * Network address lookup element + */ +struct netcred { + struct radix_node netc_rnodes[2]; + int netc_exflags; + struct ucred netc_anon; +}; + +/* + * Network export information + */ +struct netexport { + struct netcred ne_defexported; /* Default export */ + struct radix_node_head *ne_rtable[AF_MAX+1]; /* Individual exports */ +}; +#endif /* KERNEL */ + +/* + * Export arguments for local filesystem mount calls.
+ */ +struct export_args { + int ex_flags; /* export related flags */ + uid_t ex_root; /* mapping for root uid */ + struct ucred ex_anon; /* mapping for anonymous user */ + struct sockaddr *ex_addr; /* net address to which exported */ + int ex_addrlen; /* and the net address length */ + struct sockaddr *ex_mask; /* mask of valid bits in saddr */ + int ex_masklen; /* and the smask length */ +}; + +/* + * Arguments to mount UFS-based filesystems + */ +struct ufs_args { + char *fspec; /* block special device to mount */ + struct export_args export; /* network export information */ +}; + +#ifdef MFS +/* + * Arguments to mount MFS + */ +struct mfs_args { + char *fspec; /* name to export for statfs */ + struct export_args export; /* if exported MFSes are supported */ + caddr_t base; /* base of file system in memory */ + u_long size; /* size of file system */ +}; +#endif /* MFS */ + +#ifdef CD9660 +/* + * Arguments to mount ISO 9660 filesystems. + */ +struct iso_args { + char *fspec; /* block special device to mount */ + struct export_args export; /* network export info */ + int flags; /* mounting flags, see below */ + +}; +#define ISOFSMNT_NORRIP 0x00000001 /* disable Rock Ridge Ext.*/ +#define ISOFSMNT_GENS 0x00000002 /* enable generation numbers */ +#define ISOFSMNT_EXTATT 0x00000004 /* enable extended attributes */ +#endif /* CD9660 */ + +#ifdef NFS +/* + * File Handle (32 bytes for version 2), variable up to 1024 for version 3 + */ +union nfsv2fh { + fhandle_t fh_generic; + u_char fh_bytes[32]; +}; +typedef union nfsv2fh nfsv2fh_t; + +/* + * Arguments to mount NFS + */ +struct nfs_args { + struct sockaddr *addr; /* file server address */ + int addrlen; /* length of address */ + int sotype; /* Socket type */ + int proto; /* and Protocol */ + nfsv2fh_t *fh; /* File handle to be mounted */ + int flags; /* flags */ + int wsize; /* write size in bytes */ + int rsize; /* read size in bytes */ + int timeo; /* initial timeout in .1 secs */ + int retrans; /* times to retry 
send */ + int maxgrouplist; /* Max. size of group list */ + int readahead; /* # of blocks to readahead */ + int leaseterm; /* Term (sec) of lease */ + int deadthresh; /* Retrans threshold */ + char *hostname; /* server's name */ +}; + + +/* + * NFS mount option flags + */ +#define NFSMNT_SOFT 0x00000001 /* soft mount (hard is default) */ +#define NFSMNT_WSIZE 0x00000002 /* set write size */ +#define NFSMNT_RSIZE 0x00000004 /* set read size */ +#define NFSMNT_TIMEO 0x00000008 /* set initial timeout */ +#define NFSMNT_RETRANS 0x00000010 /* set number of request retries */ +#define NFSMNT_MAXGRPS 0x00000020 /* set maximum grouplist size */ +#define NFSMNT_INT 0x00000040 /* allow interrupts on hard mount */ +#define NFSMNT_NOCONN 0x00000080 /* Don't Connect the socket */ +#define NFSMNT_NQNFS 0x00000100 /* Use Nqnfs protocol */ +#define NFSMNT_MYWRITE 0x00000200 /* Assume writes were mine */ +#define NFSMNT_KERB 0x00000400 /* Use Kerberos authentication */ +#define NFSMNT_DUMBTIMR 0x00000800 /* Don't estimate rtt dynamically */ +#define NFSMNT_RDIRALOOK 0x00001000 /* Do lookup with readdir (nqnfs) */ +#define NFSMNT_LEASETERM 0x00002000 /* set lease term (nqnfs) */ +#define NFSMNT_READAHEAD 0x00004000 /* set read ahead */ +#define NFSMNT_DEADTHRESH 0x00008000 /* set dead server retry thresh */ +#define NFSMNT_NQLOOKLEASE 0x00010000 /* Get lease for lookup */ +#define NFSMNT_RESVPORT 0x00020000 /* Allocate a reserved port */ +#define NFSMNT_INTERNAL 0xffe00000 /* Bits set internally */ +#define NFSMNT_MNTD 0x00200000 /* Mnt server for mnt point */ +#define NFSMNT_DISMINPROG 0x00400000 /* Dismount in progress */ +#define NFSMNT_DISMNT 0x00800000 /* Dismounted */ +#define NFSMNT_SNDLOCK 0x01000000 /* Send socket lock */ +#define NFSMNT_WANTSND 0x02000000 /* Want above */ +#define NFSMNT_RCVLOCK 0x04000000 /* Rcv socket lock */ +#define NFSMNT_WANTRCV 0x08000000 /* Want above */ +#define NFSMNT_WAITAUTH 0x10000000 /* Wait for authentication */ +#define NFSMNT_HASAUTH
0x20000000 /* Has authenticator */ +#define NFSMNT_WANTAUTH 0x40000000 /* Wants an authenticator */ +#define NFSMNT_AUTHERR 0x80000000 /* Authentication error */ +#endif /* NFS */ + +#ifdef KERNEL +/* + * exported vnode operations + */ +struct mount *getvfs __P((fsid_t *)); /* return vfs given fsid */ +int vfs_export /* process mount export info */ + __P((struct mount *, struct netexport *, struct export_args *)); +struct netcred *vfs_export_lookup /* lookup host in fs export list */ + __P((struct mount *, struct netexport *, struct mbuf *)); +int vfs_lock __P((struct mount *)); /* lock a vfs */ +int vfs_mountedon __P((struct vnode *));/* is a vfs mounted on vp */ +void vfs_unlock __P((struct mount *)); /* unlock a vfs */ +extern TAILQ_HEAD(mntlist, mount) mountlist; /* mounted filesystem list */ +extern struct vfsops *vfssw[]; /* filesystem type table */ + +#else /* KERNEL */ + +#include <sys/cdefs.h> + +__BEGIN_DECLS +int fstatfs __P((int, struct statfs *)); +int getfh __P((const char *, fhandle_t *)); +int getfsstat __P((struct statfs *, long, int)); +int getmntinfo __P((struct statfs **, int)); +int mount __P((int, const char *, int, void *)); +int statfs __P((const char *, struct statfs *)); +int unmount __P((const char *, int)); +__END_DECLS + +#endif /* KERNEL */ diff --git a/sys/sys/msgbuf.h b/sys/sys/msgbuf.h new file mode 100644 index 00000000000..57ee0b6f30a --- /dev/null +++ b/sys/sys/msgbuf.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 1981, 1984, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)msgbuf.h 8.1 (Berkeley) 6/2/93 + */ + +#define MSG_BSIZE (4096 - 3 * sizeof(long)) +struct msgbuf { +#define MSG_MAGIC 0x063061 + long msg_magic; + long msg_bufx; /* write pointer */ + long msg_bufr; /* read pointer */ + char msg_bufc[MSG_BSIZE]; /* buffer */ +}; +#ifdef KERNEL +struct msgbuf *msgbufp; +#endif diff --git a/sys/sys/mtio.h b/sys/sys/mtio.h new file mode 100644 index 00000000000..7b4ef0c017c --- /dev/null +++ b/sys/sys/mtio.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)mtio.h 8.1 (Berkeley) 6/2/93 + */ + +/* + * Structures and definitions for mag tape io control commands + */ + +/* structure for MTIOCTOP - mag tape op command */ +struct mtop { + short mt_op; /* operations defined below */ + daddr_t mt_count; /* how many of them */ +}; + +/* operations */ +#define MTWEOF 0 /* write an end-of-file record */ +#define MTFSF 1 /* forward space file */ +#define MTBSF 2 /* backward space file */ +#define MTFSR 3 /* forward space record */ +#define MTBSR 4 /* backward space record */ +#define MTREW 5 /* rewind */ +#define MTOFFL 6 /* rewind and put the drive offline */ +#define MTNOP 7 /* no operation, sets status only */ +#define MTCACHE 8 /* enable controller cache */ +#define MTNOCACHE 9 /* disable controller cache */ + +/* structure for MTIOCGET - mag tape get status command */ + +struct mtget { + short mt_type; /* type of magtape device */ +/* the following two registers are grossly device dependent */ + short mt_dsreg; /* ``drive status'' register */ + short mt_erreg; /* ``error'' register */ +/* end device-dependent registers */ + short mt_resid; /* residual count */ +/* the following two are not yet implemented */ + daddr_t mt_fileno; /* file number of current position */ + daddr_t mt_blkno; /* block number of current position */ +/* end not yet implemented */ +}; + +/* + * Constants for mt_type byte. These are the same + * for controllers compatible with the types listed. 
+ */ +#define MT_ISTS 0x01 /* TS-11 */ +#define MT_ISHT 0x02 /* TM03 Massbus: TE16, TU45, TU77 */ +#define MT_ISTM 0x03 /* TM11/TE10 Unibus */ +#define MT_ISMT 0x04 /* TM78/TU78 Massbus */ +#define MT_ISUT 0x05 /* SI TU-45 emulation on Unibus */ +#define MT_ISCPC 0x06 /* SUN */ +#define MT_ISAR 0x07 /* SUN */ +#define MT_ISTMSCP 0x08 /* DEC TMSCP protocol (TU81, TK50) */ +#define MT_ISCY 0x09 /* CCI Cipher */ +#define MT_ISCT 0x0a /* HP 1/4 tape */ +#define MT_ISFHP 0x0b /* HP 7980 1/2 tape */ +#define MT_ISEXABYTE 0x0c /* Exabyte */ +#define MT_ISEXA8200 0x0c /* Exabyte EXB-8200 */ +#define MT_ISEXA8500 0x0d /* Exabyte EXB-8500 */ +#define MT_ISVIPER1 0x0e /* Archive Viper-150 */ +#define MT_ISPYTHON 0x0f /* Archive Python (DAT) */ +#define MT_ISHPDAT 0x10 /* HP 35450A DAT drive */ +#define MT_ISMFOUR 0x11 /* M4 Data 1/2 9track drive */ +#define MT_ISTK50 0x12 /* DEC SCSI TK50 */ +#define MT_ISMT02 0x13 /* Emulex MT02 SCSI tape controller */ + +/* mag tape io control commands */ +#define MTIOCTOP _IOW('m', 1, struct mtop) /* do a mag tape op */ +#define MTIOCGET _IOR('m', 2, struct mtget) /* get tape status */ +#define MTIOCIEOT _IO('m', 3) /* ignore EOT error */ +#define MTIOCEEOT _IO('m', 4) /* enable EOT error */ + +#ifndef KERNEL +#define DEFTAPE "/dev/rmt12" +#endif + +#ifdef KERNEL +/* + * minor device number + */ + +#define T_UNIT 003 /* unit selection */ +#define T_NOREWIND 004 /* no rewind on close */ +#define T_DENSEL 030 /* density select */ +#define T_800BPI 000 /* select 800 bpi */ +#define T_1600BPI 010 /* select 1600 bpi */ +#define T_6250BPI 020 /* select 6250 bpi */ +#define T_BADBPI 030 /* undefined selection */ +#endif diff --git a/sys/sys/namei.h b/sys/sys/namei.h new file mode 100644 index 00000000000..74ff3602c26 --- /dev/null +++ b/sys/sys/namei.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 1985, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)namei.h 8.2 (Berkeley) 1/4/94 + */ + +#ifndef _SYS_NAMEI_H_ +#define _SYS_NAMEI_H_ + +/* + * Encapsulation of namei parameters. + */ +struct nameidata { + /* + * Arguments to namei/lookup. 
+ */ + caddr_t ni_dirp; /* pathname pointer */ + enum uio_seg ni_segflg; /* location of pathname */ + /* u_long ni_nameiop; namei operation */ + /* u_long ni_flags; flags to namei */ + /* struct proc *ni_proc; process requesting lookup */ + /* + * Arguments to lookup. + */ + /* struct ucred *ni_cred; credentials */ + struct vnode *ni_startdir; /* starting directory */ + struct vnode *ni_rootdir; /* logical root directory */ + /* + * Results: returned from/manipulated by lookup + */ + struct vnode *ni_vp; /* vnode of result */ + struct vnode *ni_dvp; /* vnode of intermediate directory */ + /* + * Shared between namei and lookup/commit routines. + */ + long ni_pathlen; /* remaining chars in path */ + char *ni_next; /* next location in pathname */ + u_long ni_loopcnt; /* count of symlinks encountered */ + /* + * Lookup parameters: this structure describes the subset of + * information from the nameidata structure that is passed + * through the VOP interface. + */ + struct componentname { + /* + * Arguments to lookup. + */ + u_long cn_nameiop; /* namei operation */ + u_long cn_flags; /* flags to namei */ + struct proc *cn_proc; /* process requesting lookup */ + struct ucred *cn_cred; /* credentials */ + /* + * Shared between lookup and commit routines. 
+ */ + char *cn_pnbuf; /* pathname buffer */ + char *cn_nameptr; /* pointer to looked up name */ + long cn_namelen; /* length of looked up component */ + u_long cn_hash; /* hash value of looked up name */ + long cn_consume; /* chars to consume in lookup() */ + } ni_cnd; +}; + +#ifdef KERNEL +/* + * namei operations + */ +#define LOOKUP 0 /* perform name lookup only */ +#define CREATE 1 /* setup for file creation */ +#define DELETE 2 /* setup for file deletion */ +#define RENAME 3 /* setup for file renaming */ +#define OPMASK 3 /* mask for operation */ +/* + * namei operational modifier flags, stored in ni_cnd.cn_flags + */ +#define LOCKLEAF 0x0004 /* lock inode on return */ +#define LOCKPARENT 0x0008 /* want parent vnode returned locked */ +#define WANTPARENT 0x0010 /* want parent vnode returned unlocked */ +#define NOCACHE 0x0020 /* name must not be left in cache */ +#define FOLLOW 0x0040 /* follow symbolic links */ +#define NOFOLLOW 0x0000 /* do not follow symbolic links (pseudo) */ +#define MODMASK 0x00fc /* mask of operational modifiers */ +/* + * Namei parameter descriptors. + * + * SAVENAME may be set by either the callers of namei or by VOP_LOOKUP. + * If the caller of namei sets the flag (for example execve wants to + * know the name of the program that is being executed), then it must + * free the buffer. If VOP_LOOKUP sets the flag, then the buffer must + * be freed by either the commit routine or the VOP_ABORT routine. + * SAVESTART is set only by the callers of namei. It implies SAVENAME + * plus the addition of saving the parent directory that contains the + * name in ni_startdir. It allows repeated calls to lookup for the + * name being sought. The caller is responsible for releasing the + * buffer and for vrele'ing ni_startdir.
+ */ +#define NOCROSSMOUNT 0x00100 /* do not cross mount points */ +#define RDONLY 0x00200 /* lookup with read-only semantics */ +#define HASBUF 0x00400 /* has allocated pathname buffer */ +#define SAVENAME 0x00800 /* save pathname buffer */ +#define SAVESTART 0x01000 /* save starting directory */ +#define ISDOTDOT 0x02000 /* current component name is .. */ +#define MAKEENTRY 0x04000 /* entry is to be added to name cache */ +#define ISLASTCN 0x08000 /* this is last component of pathname */ +#define ISSYMLINK 0x10000 /* symlink needs interpretation */ +#define PARAMASK 0xfff00 /* mask of parameter descriptors */ +/* + * Initialization of a nameidata structure. + */ +#define NDINIT(ndp, op, flags, segflg, namep, p) { \ + (ndp)->ni_cnd.cn_nameiop = op; \ + (ndp)->ni_cnd.cn_flags = flags; \ + (ndp)->ni_segflg = segflg; \ + (ndp)->ni_dirp = namep; \ + (ndp)->ni_cnd.cn_proc = p; \ +} +#endif + +/* + * This structure describes the elements in the cache of recent + * names looked up by namei. NCHNAMLEN is sized to make structure + * size a power of two to optimize malloc's. Minimum reasonable + * size is 15. + */ + +#define NCHNAMLEN 31 /* maximum name segment length we bother with */ + +struct namecache { + struct namecache *nc_forw; /* hash chain */ + struct namecache **nc_back; /* hash chain */ + struct namecache *nc_nxt; /* LRU chain */ + struct namecache **nc_prev; /* LRU chain */ + struct vnode *nc_dvp; /* vnode of parent of name */ + u_long nc_dvpid; /* capability number of nc_dvp */ + struct vnode *nc_vp; /* vnode the name refers to */ + u_long nc_vpid; /* capability number of nc_vp */ + char nc_nlen; /* length of name */ + char nc_name[NCHNAMLEN]; /* segment name */ +}; + +#ifdef KERNEL +u_long nextvnodeid; +int namei __P((struct nameidata *ndp)); +int lookup __P((struct nameidata *ndp)); +#endif + +/* + * Stats on usefulness of namei caches.
+ */ +struct nchstats { + long ncs_goodhits; /* hits that we can really use */ + long ncs_neghits; /* negative hits that we can use */ + long ncs_badhits; /* hits we must drop */ + long ncs_falsehits; /* hits with id mismatch */ + long ncs_miss; /* misses */ + long ncs_long; /* long names that ignore cache */ + long ncs_pass2; /* names found with passes == 2 */ + long ncs_2passes; /* number of times we attempt it */ +}; +#endif /* !_SYS_NAMEI_H_ */ diff --git a/sys/sys/param.h b/sys/sys/param.h new file mode 100644 index 00000000000..91bdfd8facc --- /dev/null +++ b/sys/sys/param.h @@ -0,0 +1,216 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)param.h 8.2 (Berkeley) 1/21/94 + */ + +#define BSD 199306 /* System version (year & month). */ +#define BSD4_3 1 +#define BSD4_4 1 + +#ifndef NULL +#define NULL 0 +#endif + +#ifndef LOCORE +#include +#endif + +/* + * Machine-independent constants (some used in following include files). + * Redefined constants are from POSIX 1003.1 limits file. + * + * MAXCOMLEN should be >= sizeof(ac_comm) (see ) + * MAXLOGNAME should be >= UT_NAMESIZE (see ) + */ +#include + +#define MAXCOMLEN 16 /* max command name remembered */ +#define MAXINTERP 32 /* max interpreter file name length */ +#define MAXLOGNAME 12 /* max login name length */ +#define MAXUPRC CHILD_MAX /* max simultaneous processes */ +#define NCARGS ARG_MAX /* max bytes for an exec function */ +#define NGROUPS NGROUPS_MAX /* max number groups */ +#define NOFILE OPEN_MAX /* max open files per process */ +#define NOGROUP 65535 /* marker for empty group set member */ +#define MAXHOSTNAMELEN 256 /* max hostname size */ + +/* More types and definitions used throughout the kernel. */ +#ifdef KERNEL +#include +#include +#include +#include +#include +#include +#endif + +/* Signals. */ +#include + +/* Machine type dependent parameters. 
*/ +#include +#include + +/* + * Priorities. Note that with 32 run queues, differences less than 4 are + * insignificant. + */ +#define PSWP 0 +#define PVM 4 +#define PINOD 8 +#define PRIBIO 16 +#define PVFS 20 +#define PZERO 22 /* No longer magic, shouldn't be here. XXX */ +#define PSOCK 24 +#define PWAIT 32 +#define PLOCK 36 +#define PPAUSE 40 +#define PUSER 50 +#define MAXPRI 127 /* Priorities range from 0 through MAXPRI. */ + +#define PRIMASK 0x0ff +#define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ + +#define NZERO 0 /* default "nice" */ + +#define NBPW sizeof(int) /* number of bytes per word (integer) */ + +#define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ +#define NODEV (dev_t)(-1) /* non-existent device */ + +/* + * Clustering of hardware pages on machines with ridiculously small + * page sizes is done here. The paging subsystem deals with units of + * CLSIZE pte's describing NBPG (from machine/machparam.h) pages each. + */ +#define CLBYTES (CLSIZE*NBPG) +#define CLOFSET (CLSIZE*NBPG-1) /* for clusters, like PGOFSET */ +#define claligned(x) ((((int)(x))&CLOFSET)==0) +#define CLOFF CLOFSET +#define CLSHIFT (PGSHIFT+CLSIZELOG2) + +#if CLSIZE==1 +#define clbase(i) (i) +#define clrnd(i) (i) +#else +/* Give the base virtual address (first of CLSIZE). */ +#define clbase(i) ((i) &~ (CLSIZE-1)) +/* Round a number of clicks up to a whole cluster. */ +#define clrnd(i) (((i) + (CLSIZE-1)) &~ (CLSIZE-1)) +#endif + +#define CBLOCK 64 /* Clist block size, must be a power of 2. */ +#define CBQSIZE (CBLOCK/NBBY) /* Quote bytes/cblock - can do better. */ + /* Data chars/clist. */ +#define CBSIZE (CBLOCK - sizeof(struct cblock *) - CBQSIZE) +#define CROUND (CBLOCK - 1) /* Clist rounding. */ + +/* + * File system parameters and macros. + * + * The file system is made out of blocks of at most MAXBSIZE units, with + * smaller units (fragments) only in the last direct block. MAXBSIZE + * primarily determines the size of buffers in the buffer pool. 
It may be + * made larger without any effect on existing file systems; however making + * it smaller may make some file systems unmountable. + */ +#define MAXBSIZE MAXPHYS +#define MAXFRAG 8 + +/* + * MAXPATHLEN defines the longest permissible path length after expanding + * symbolic links. It is used to allocate a temporary buffer from the buffer + * pool in which to do the name expansion, hence should be a power of two, + * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the + * maximum number of symbolic links that may be expanded in a path name. + * It should be set high enough to allow all legitimate uses, but halt + * infinite loops reasonably quickly. + */ +#define MAXPATHLEN PATH_MAX +#define MAXSYMLINKS 8 + +/* Bit map related macros. */ +#define setbit(a,i) ((a)[(i)/NBBY] |= 1<<((i)%NBBY)) +#define clrbit(a,i) ((a)[(i)/NBBY] &= ~(1<<((i)%NBBY))) +#define isset(a,i) ((a)[(i)/NBBY] & (1<<((i)%NBBY))) +#define isclr(a,i) (((a)[(i)/NBBY] & (1<<((i)%NBBY))) == 0) + +/* Macros for counting and rounding. */ +#ifndef howmany +#define howmany(x, y) (((x)+((y)-1))/(y)) +#endif +#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) +#define powerof2(x) ((((x)-1)&(x))==0) + +/* Macros for min/max. */ +#ifndef KERNEL +#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MAX(a,b) (((a)>(b))?(a):(b)) +#endif + +/* + * Constants for setting the parameters of the kernel memory allocator. + * + * 2 ** MINBUCKET is the smallest unit of memory that will be + * allocated. It must be at least large enough to hold a pointer. + * + * Units of memory less than or equal to MAXALLOCSAVE will permanently + * allocate physical memory; requests for these size pieces of + * memory are quite fast. Allocations greater than MAXALLOCSAVE must + * always allocate and free physical memory; requests for these + * size allocations should be done infrequently as they will be slow. + * + * Constraints: CLBYTES <= MAXALLOCSAVE <= 2 ** (MINBUCKET + 14), and + * MAXALLOCSAVE must be a power of two.
+ */ +#define MINBUCKET 4 /* 4 => min allocation of 16 bytes */ +#define MAXALLOCSAVE (2 * CLBYTES) + +/* + * Scale factor for scaled integers used to count %cpu time and load avgs. + * + * The number of CPU `tick's that map to a unique `%age' can be expressed + * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that + * can be calculated (assuming 32 bits) can be closely approximated using + * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15). + * + * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', + * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024. + */ +#define FSHIFT 11 /* bits to right of fixed binary point */ +#define FSCALE (1< /* Machine-dependent proc substruct. */ +#include /* For struct selinfo. */ + +/* + * One structure allocated per session. + */ +struct session { + int s_count; /* Ref cnt; pgrps in session. */ + struct proc *s_leader; /* Session leader. */ + struct vnode *s_ttyvp; /* Vnode of controlling terminal. */ + struct tty *s_ttyp; /* Controlling terminal. */ + char s_login[MAXLOGNAME]; /* Setlogin() name. */ +}; + +/* + * One structure allocated per process group. + */ +struct pgrp { + struct pgrp *pg_hforw; /* Forward link in hash bucket. */ + struct proc *pg_mem; /* Pointer to pgrp members. */ + struct session *pg_session; /* Pointer to session. */ + pid_t pg_id; /* Pgrp id. */ + int pg_jobc; /* # procs qualifying pgrp for job control */ +}; + +/* + * Description of a process. + * + * This structure contains the information needed to manage a thread of + * control, known in UN*X as a process; it has references to substructures + * containing descriptions of things that the process uses, but may share + * with related processes. The process structure and the substructures + * are always addressible except for those marked "(PROC ONLY)" below, + * which might be addressible only on a processor on which the process + * is running. 
+ */ +struct proc { + struct proc *p_forw; /* Doubly-linked run/sleep queue. */ + struct proc *p_back; + struct proc *p_next; /* Linked list of active procs */ + struct proc **p_prev; /* and zombies. */ + + /* substructures: */ + struct pcred *p_cred; /* Process owner's identity. */ + struct filedesc *p_fd; /* Ptr to open files structure. */ + struct pstats *p_stats; /* Accounting/statistics (PROC ONLY). */ + struct plimit *p_limit; /* Process limits. */ + struct vmspace *p_vmspace; /* Address space. */ + struct sigacts *p_sigacts; /* Signal actions, state (PROC ONLY). */ + +#define p_ucred p_cred->pc_ucred +#define p_rlimit p_limit->pl_rlimit + + int p_flag; /* P_* flags. */ + char p_stat; /* S* process status. */ + char p_pad1[3]; + + pid_t p_pid; /* Process identifier. */ + struct proc *p_hash; /* Hashed based on p_pid for kill+exit+... */ + struct proc *p_pgrpnxt; /* Pointer to next process in process group. */ + struct proc *p_pptr; /* Pointer to process structure of parent. */ + struct proc *p_osptr; /* Pointer to older sibling processes. */ + +/* The following fields are all zeroed upon creation in fork. */ +#define p_startzero p_ysptr + struct proc *p_ysptr; /* Pointer to younger siblings. */ + struct proc *p_cptr; /* Pointer to youngest living child. */ + pid_t p_oppid; /* Save parent pid during ptrace. XXX */ + int p_dupfd; /* Sideways return value from fdopen. XXX */ + + /* scheduling */ + u_int p_estcpu; /* Time averaged value of p_cpticks. */ + int p_cpticks; /* Ticks of cpu time. */ + fixpt_t p_pctcpu; /* %cpu for this process during p_swtime */ + void *p_wchan; /* Sleep address. */ + char *p_wmesg; /* Reason for sleep. */ + u_int p_swtime; /* Time swapped in or out. */ + u_int p_slptime; /* Time since last blocked. */ + + struct itimerval p_realtimer; /* Alarm timer. */ + struct timeval p_rtime; /* Real time. */ + u_quad_t p_uticks; /* Statclock hits in user mode. */ + u_quad_t p_sticks; /* Statclock hits in system mode. 
*/ + u_quad_t p_iticks; /* Statclock hits processing intr. */ + + int p_traceflag; /* Kernel trace points. */ + struct vnode *p_tracep; /* Trace to vnode. */ + + int p_siglist; /* Signals arrived but not delivered. */ + + struct vnode *p_textvp; /* Vnode of executable. */ + + long p_spare[5]; /* pad to 256, avoid shifting eproc. */ + +/* End area that is zeroed on creation. */ +#define p_endzero p_startcopy + +/* The following fields are all copied upon creation in fork. */ +#define p_startcopy p_sigmask + + sigset_t p_sigmask; /* Current signal mask. */ + sigset_t p_sigignore; /* Signals being ignored. */ + sigset_t p_sigcatch; /* Signals being caught by user. */ + + u_char p_priority; /* Process priority. */ + u_char p_usrpri; /* User-priority based on p_estcpu and p_nice. */ + char p_nice; /* Process "nice" value. */ + char p_comm[MAXCOMLEN+1]; + + struct pgrp *p_pgrp; /* Pointer to process group. */ + +/* End area that is copied on creation. */ +#define p_endcopy p_thread + int p_thread; /* Id for this "thread"; Mach glue. XXX */ + struct user *p_addr; /* Kernel virtual addr of u-area (PROC ONLY). */ + struct mdproc p_md; /* Any machine-dependent fields. */ + + u_short p_xstat; /* Exit status for wait; also stop signal. */ + u_short p_acflag; /* Accounting flags. */ + struct rusage *p_ru; /* Exit information. XXX */ + +}; + +#define p_session p_pgrp->pg_session +#define p_pgid p_pgrp->pg_id + +/* Status values. */ +#define SIDL 1 /* Process being created by fork. */ +#define SRUN 2 /* Currently runnable. */ +#define SSLEEP 3 /* Sleeping on an address. */ +#define SSTOP 4 /* Process debugging or suspension. */ +#define SZOMB 5 /* Awaiting collection by parent. */ + +/* These flags are kept in p_flag. */ +#define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ +#define P_CONTROLT 0x00002 /* Has a controlling terminal. */ +#define P_INMEM 0x00004 /* Loaded into memory. */ +#define P_NOCLDSTOP 0x00008 /* No SIGCHLD when children stop.
*/ +#define P_PPWAIT 0x00010 /* Parent is waiting for child to exec/exit. */ +#define P_PROFIL 0x00020 /* Has started profiling. */ +#define P_SELECT 0x00040 /* Selecting; wakeup/waiting danger. */ +#define P_SINTR 0x00080 /* Sleep is interruptible. */ +#define P_SUGID 0x00100 /* Had set id privileges since last exec. */ +#define P_SYSTEM 0x00200 /* System proc: no sigs, stats or swapping. */ +#define P_TIMEOUT 0x00400 /* Timing out during sleep. */ +#define P_TRACED 0x00800 /* Debugged process being traced. */ +#define P_WAITED 0x01000 /* Debugging process has waited for child. */ +#define P_WEXIT 0x02000 /* Working on exiting. */ +#define P_EXEC 0x04000 /* Process called exec. */ + +/* Should probably be changed into a hold count. */ +#define P_NOSWAP 0x08000 /* Another flag to prevent swap out. */ +#define P_PHYSIO 0x10000 /* Doing physical I/O. */ + +/* Should be moved to machine-dependent areas. */ +#define P_OWEUPC 0x20000 /* Owe process an addupc() call at next ast. */ + +/* + * MOVE TO ucred.h? + * + * Shareable process credentials (always resident). This includes a reference + * to the current user credentials as well as real and saved ids that may be + * used to change ids. + */ +struct pcred { + struct ucred *pc_ucred; /* Current credentials. */ + uid_t p_ruid; /* Real user id. */ + uid_t p_svuid; /* Saved effective user id. */ + gid_t p_rgid; /* Real group id. */ + gid_t p_svgid; /* Saved effective group id. */ + int p_refcnt; /* Number of references. */ +}; + +#ifdef KERNEL +/* + * We use process IDs <= PID_MAX; PID_MAX + 1 must also fit in a pid_t, + * as it is used to represent "no process group". + */ +#define PID_MAX 30000 +#define NO_PID 30001 +#define PIDHASH(pid) ((pid) & pidhashmask) + +#define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) +#define SESSHOLD(s) ((s)->s_count++) +#define SESSRELE(s) { \ + if (--(s)->s_count == 0) \ + FREE(s, M_SESSION); \ +} + +extern struct proc *pidhash[]; /* In param.c. 
*/ +extern struct pgrp *pgrphash[]; /* In param.c. */ +extern struct proc *curproc; /* Current running proc. */ +extern struct proc proc0; /* Process slot for swapper. */ +extern int nprocs, maxproc; /* Current and max number of procs. */ +extern int pidhashmask; /* In param.c. */ + +volatile struct proc *allproc; /* List of active procs. */ +struct proc *zombproc; /* List of zombie procs. */ +struct proc *initproc, *pageproc; /* Process slots for init, pager. */ + +#define NQS 32 /* 32 run queues. */ +int whichqs; /* Bit mask summary of non-empty Q's. */ +struct prochd { + struct proc *ph_link; /* Linked list of running processes. */ + struct proc *ph_rlink; +} qs[NQS]; + +struct proc *pfind __P((pid_t)); /* Find process by id. */ +struct pgrp *pgfind __P((pid_t)); /* Find process group by id. */ + +void mi_switch __P((void)); +void resetpriority __P((struct proc *)); +void setrunnable __P((struct proc *)); +void setrunqueue __P((struct proc *)); +void sleep __P((void *chan, int pri)); +int tsleep __P((void *chan, int pri, char *wmesg, int timo)); +void unsleep __P((struct proc *)); +void wakeup __P((void *chan)); +#endif /* KERNEL */ +#endif /* !_SYS_PROC_H_ */ diff --git a/sys/sys/protosw.h b/sys/sys/protosw.h new file mode 100644 index 00000000000..76ac720d85f --- /dev/null +++ b/sys/sys/protosw.h @@ -0,0 +1,210 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)protosw.h 8.1 (Berkeley) 6/2/93 + */ + +/* + * Protocol switch table. + * + * Each protocol has a handle initializing one of these structures, + * which is used for protocol-protocol and system-protocol communication. + * + * A protocol is called through the pr_init entry before any other. + * Thereafter it is called every 200ms through the pr_fasttimo entry and + * every 500ms through the pr_slowtimo for timer based actions. + * The system will call the pr_drain entry if it is low on space and + * this should throw away any non-critical data. + * + * Protocols pass data between themselves as chains of mbufs using + * the pr_input and pr_output hooks. 
Pr_input passes data up (towards + * UNIX) and pr_output passes it down (towards the imps); control + * information passes up and down on pr_ctlinput and pr_ctloutput. + * The protocol is responsible for the space occupied by any of the + * arguments to these entries and must dispose of it. + * + * The usrreq routine interfaces protocols to the system and is + * described below. + */ +struct protosw { + short pr_type; /* socket type used for */ + struct domain *pr_domain; /* domain protocol a member of */ + short pr_protocol; /* protocol number */ + short pr_flags; /* see below */ +/* protocol-protocol hooks */ + void (*pr_input)(); /* input to protocol (from below) */ + int (*pr_output)(); /* output to protocol (from above) */ + void (*pr_ctlinput)(); /* control input (from below) */ + int (*pr_ctloutput)(); /* control output (from above) */ +/* user-protocol hook */ + int (*pr_usrreq)(); /* user request: see list below */ +/* utility hooks */ + void (*pr_init)(); /* initialization hook */ + void (*pr_fasttimo)(); /* fast timeout (200ms) */ + void (*pr_slowtimo)(); /* slow timeout (500ms) */ + void (*pr_drain)(); /* flush any excess space possible */ + int (*pr_sysctl)(); /* sysctl for protocol */ +}; + +#define PR_SLOWHZ 2 /* 2 slow timeouts per second */ +#define PR_FASTHZ 5 /* 5 fast timeouts per second */ + +/* + * Values for pr_flags. + * PR_ADDR requires PR_ATOMIC; + * PR_ADDR and PR_CONNREQUIRED are mutually exclusive.
+ */ +#define PR_ATOMIC 0x01 /* exchange atomic messages only */ +#define PR_ADDR 0x02 /* addresses given with messages */ +#define PR_CONNREQUIRED 0x04 /* connection required by protocol */ +#define PR_WANTRCVD 0x08 /* want PRU_RCVD calls */ +#define PR_RIGHTS 0x10 /* passes capabilities */ + +/* + * The arguments to usrreq are: + * (*protosw[].pr_usrreq)(up, req, m, nam, opt); + * where up is a (struct socket *), req is one of these requests, + * m is an optional mbuf chain containing a message, + * nam is an optional mbuf chain containing an address, + * and opt is a pointer to a socketopt structure or nil. + * The protocol is responsible for disposal of the mbuf chain m, + * the caller is responsible for any space held by nam and opt. + * A non-zero return from usrreq gives an + * UNIX error number which should be passed to higher level software. + */ +#define PRU_ATTACH 0 /* attach protocol to up */ +#define PRU_DETACH 1 /* detach protocol from up */ +#define PRU_BIND 2 /* bind socket to address */ +#define PRU_LISTEN 3 /* listen for connection */ +#define PRU_CONNECT 4 /* establish connection to peer */ +#define PRU_ACCEPT 5 /* accept connection from peer */ +#define PRU_DISCONNECT 6 /* disconnect from peer */ +#define PRU_SHUTDOWN 7 /* won't send any more data */ +#define PRU_RCVD 8 /* have taken data; more room now */ +#define PRU_SEND 9 /* send this data */ +#define PRU_ABORT 10 /* abort (fast DISCONNECT, DETACH) */ +#define PRU_CONTROL 11 /* control operations on protocol */ +#define PRU_SENSE 12 /* return status into m */ +#define PRU_RCVOOB 13 /* retrieve out of band data */ +#define PRU_SENDOOB 14 /* send out of band data */ +#define PRU_SOCKADDR 15 /* fetch socket's address */ +#define PRU_PEERADDR 16 /* fetch peer's address */ +#define PRU_CONNECT2 17 /* connect two sockets */ +/* begin for protocols internal use */ +#define PRU_FASTTIMO 18 /* 200ms timeout */ +#define PRU_SLOWTIMO 19 /* 500ms timeout */ +#define PRU_PROTORCV 20 /* receive from below 
*/ +#define PRU_PROTOSEND 21 /* send to below */ + +#define PRU_NREQ 21 + +#ifdef PRUREQUESTS +char *prurequests[] = { + "ATTACH", "DETACH", "BIND", "LISTEN", + "CONNECT", "ACCEPT", "DISCONNECT", "SHUTDOWN", + "RCVD", "SEND", "ABORT", "CONTROL", + "SENSE", "RCVOOB", "SENDOOB", "SOCKADDR", + "PEERADDR", "CONNECT2", "FASTTIMO", "SLOWTIMO", + "PROTORCV", "PROTOSEND", +}; +#endif + +/* + * The arguments to the ctlinput routine are + * (*protosw[].pr_ctlinput)(cmd, sa, arg); + * where cmd is one of the commands below, sa is a pointer to a sockaddr, + * and arg is an optional caddr_t argument used within a protocol family. + */ +#define PRC_IFDOWN 0 /* interface transition */ +#define PRC_ROUTEDEAD 1 /* select new route if possible ??? */ +#define PRC_QUENCH2 3 /* DEC congestion bit says slow down */ +#define PRC_QUENCH 4 /* some one said to slow down */ +#define PRC_MSGSIZE 5 /* message size forced drop */ +#define PRC_HOSTDEAD 6 /* host appears to be down */ +#define PRC_HOSTUNREACH 7 /* deprecated (use PRC_UNREACH_HOST) */ +#define PRC_UNREACH_NET 8 /* no route to network */ +#define PRC_UNREACH_HOST 9 /* no route to host */ +#define PRC_UNREACH_PROTOCOL 10 /* dst says bad protocol */ +#define PRC_UNREACH_PORT 11 /* bad port # */ +/* was PRC_UNREACH_NEEDFRAG 12 (use PRC_MSGSIZE) */ +#define PRC_UNREACH_SRCFAIL 13 /* source route failed */ +#define PRC_REDIRECT_NET 14 /* net routing redirect */ +#define PRC_REDIRECT_HOST 15 /* host routing redirect */ +#define PRC_REDIRECT_TOSNET 16 /* redirect for type of service & net */ +#define PRC_REDIRECT_TOSHOST 17 /* redirect for tos & host */ +#define PRC_TIMXCEED_INTRANS 18 /* packet lifetime expired in transit */ +#define PRC_TIMXCEED_REASS 19 /* lifetime expired on reass q */ +#define PRC_PARAMPROB 20 /* header incorrect */ + +#define PRC_NCMDS 21 + +#define PRC_IS_REDIRECT(cmd) \ + ((cmd) >= PRC_REDIRECT_NET && (cmd) <= PRC_REDIRECT_TOSHOST) + +#ifdef PRCREQUESTS +char *prcrequests[] = { + "IFDOWN", "ROUTEDEAD", "#2", 
"DEC-BIT-QUENCH2", + "QUENCH", "MSGSIZE", "HOSTDEAD", "#7", + "NET-UNREACH", "HOST-UNREACH", "PROTO-UNREACH", "PORT-UNREACH", + "#12", "SRCFAIL-UNREACH", "NET-REDIRECT", "HOST-REDIRECT", + "TOSNET-REDIRECT", "TOSHOST-REDIRECT", "TX-INTRANS", "TX-REASS", + "PARAMPROB" +}; +#endif + +/* + * The arguments to ctloutput are: + * (*protosw[].pr_ctloutput)(req, so, level, optname, optval); + * req is one of the actions listed below, so is a (struct socket *), + * level is an indication of which protocol layer the option is intended. + * optname is a protocol dependent socket option request, + * optval is a pointer to a mbuf-chain pointer, for value-return results. + * The protocol is responsible for disposal of the mbuf chain *optval + * if supplied, + * the caller is responsible for any space held by *optval, when returned. + * A non-zero return from ctloutput gives an + * UNIX error number which should be passed to higher level software. + */ +#define PRCO_GETOPT 0 +#define PRCO_SETOPT 1 + +#define PRCO_NCMDS 2 + +#ifdef PRCOREQUESTS +char *prcorequests[] = { + "GETOPT", "SETOPT", +}; +#endif + +#ifdef KERNEL +extern struct protosw *pffindproto(), *pffindtype(); +#endif diff --git a/sys/sys/ptrace.h b/sys/sys/ptrace.h new file mode 100644 index 00000000000..f7f99d474a4 --- /dev/null +++ b/sys/sys/ptrace.h @@ -0,0 +1,67 @@ +/*- + * Copyright (c) 1984, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ptrace.h 8.2 (Berkeley) 1/4/94 + */ + +#ifndef _SYS_PTRACE_H_ +#define _SYS_PTRACE_H_ + +#define PT_TRACE_ME 0 /* child declares it's being traced */ +#define PT_READ_I 1 /* read word in child's I space */ +#define PT_READ_D 2 /* read word in child's D space */ +#define PT_READ_U 3 /* read word in child's user structure */ +#define PT_WRITE_I 4 /* write word in child's I space */ +#define PT_WRITE_D 5 /* write word in child's D space */ +#define PT_WRITE_U 6 /* write word in child's user structure */ +#define PT_CONTINUE 7 /* continue the child */ +#define PT_KILL 8 /* kill the child process */ +#define PT_STEP 9 /* single step the child */ +#define PT_ATTACH 10 /* trace some running process */ +#define PT_DETACH 11 /* stop tracing a process */ + +#define PT_FIRSTMACH 32 /* for machine-specific requests */ +#include /* machine-specific requests, if any */ + +#ifdef KERNEL +void proc_reparent __P((struct proc *child, struct proc *newparent)); +#else /* !KERNEL */ + +#include + +__BEGIN_DECLS +int ptrace __P((int _request, pid_t _pid, caddr_t _addr, int _data)); +__END_DECLS + +#endif /* !KERNEL */ + +#endif /* !_SYS_PTRACE_H_ */ diff --git a/sys/sys/queue.h b/sys/sys/queue.h new file mode 100644 index 00000000000..c200c9f4ccf --- /dev/null +++ b/sys/sys/queue.h @@ -0,0 +1,245 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)queue.h 8.4 (Berkeley) 1/4/94 + */ + +#ifndef _SYS_QUEUE_H_ +#define _SYS_QUEUE_H_ + +/* + * This file defines three types of data structures: lists, tail queues, + * and circular queues. + * + * A list is headed by a single forward pointer (or an array of forward + * pointers for a hash table header). The elements are doubly linked + * so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list after + * an existing element or at the head of the list. A list may only be + * traversed in the forward direction. + * + * A tail queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. 
The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list after + * an existing element, at the head of the list, or at the end of the + * list. A tail queue may only be traversed in the forward direction. + * + * A circle queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or after + * an existing element, at the head of the list, or at the end of the list. + * A circle queue may be traversed in either direction, but has a more + * complex end of list detection. + * + * For details on the use of these macros, see the queue(3) manual page. + */ + +/* + * List definitions. + */ +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +#define LIST_ENTRY(type) \ +struct { \ + struct type *le_next; /* next element */ \ + struct type **le_prev; /* address of previous next element */ \ +} + +/* + * List functions. 
+ */ +#define LIST_INIT(head) { \ + (head)->lh_first = NULL; \ +} + +#define LIST_INSERT_AFTER(listelm, elm, field) { \ + if (((elm)->field.le_next = (listelm)->field.le_next) != NULL) \ + (listelm)->field.le_next->field.le_prev = \ + &(elm)->field.le_next; \ + (listelm)->field.le_next = (elm); \ + (elm)->field.le_prev = &(listelm)->field.le_next; \ +} + +#define LIST_INSERT_HEAD(head, elm, field) { \ + if (((elm)->field.le_next = (head)->lh_first) != NULL) \ + (head)->lh_first->field.le_prev = &(elm)->field.le_next;\ + (head)->lh_first = (elm); \ + (elm)->field.le_prev = &(head)->lh_first; \ +} + +#define LIST_REMOVE(elm, field) { \ + if ((elm)->field.le_next != NULL) \ + (elm)->field.le_next->field.le_prev = \ + (elm)->field.le_prev; \ + *(elm)->field.le_prev = (elm)->field.le_next; \ +} + +/* + * Tail queue definitions. + */ +#define TAILQ_HEAD(name, type) \ +struct name { \ + struct type *tqh_first; /* first element */ \ + struct type **tqh_last; /* addr of last next element */ \ +} + +#define TAILQ_ENTRY(type) \ +struct { \ + struct type *tqe_next; /* next element */ \ + struct type **tqe_prev; /* address of previous next element */ \ +} + +/* + * Tail queue functions. 
+ */ +#define TAILQ_INIT(head) { \ + (head)->tqh_first = NULL; \ + (head)->tqh_last = &(head)->tqh_first; \ +} + +#define TAILQ_INSERT_HEAD(head, elm, field) { \ + if (((elm)->field.tqe_next = (head)->tqh_first) != NULL) \ + (elm)->field.tqe_next->field.tqe_prev = \ + &(elm)->field.tqe_next; \ + else \ + (head)->tqh_last = &(elm)->field.tqe_next; \ + (head)->tqh_first = (elm); \ + (elm)->field.tqe_prev = &(head)->tqh_first; \ +} + +#define TAILQ_INSERT_TAIL(head, elm, field) { \ + (elm)->field.tqe_next = NULL; \ + (elm)->field.tqe_prev = (head)->tqh_last; \ + *(head)->tqh_last = (elm); \ + (head)->tqh_last = &(elm)->field.tqe_next; \ +} + +#define TAILQ_INSERT_AFTER(head, listelm, elm, field) { \ + if (((elm)->field.tqe_next = (listelm)->field.tqe_next) != NULL)\ + (elm)->field.tqe_next->field.tqe_prev = \ + &(elm)->field.tqe_next; \ + else \ + (head)->tqh_last = &(elm)->field.tqe_next; \ + (listelm)->field.tqe_next = (elm); \ + (elm)->field.tqe_prev = &(listelm)->field.tqe_next; \ +} + +#define TAILQ_REMOVE(head, elm, field) { \ + if (((elm)->field.tqe_next) != NULL) \ + (elm)->field.tqe_next->field.tqe_prev = \ + (elm)->field.tqe_prev; \ + else \ + (head)->tqh_last = (elm)->field.tqe_prev; \ + *(elm)->field.tqe_prev = (elm)->field.tqe_next; \ +} + +/* + * Circular queue definitions. + */ +#define CIRCLEQ_HEAD(name, type) \ +struct name { \ + struct type *cqh_first; /* first element */ \ + struct type *cqh_last; /* last element */ \ +} + +#define CIRCLEQ_ENTRY(type) \ +struct { \ + struct type *cqe_next; /* next element */ \ + struct type *cqe_prev; /* previous element */ \ +} + +/* + * Circular queue functions. 
+ */ +#define CIRCLEQ_INIT(head) { \ + (head)->cqh_first = (void *)(head); \ + (head)->cqh_last = (void *)(head); \ +} + +#define CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) { \ + (elm)->field.cqe_next = (listelm)->field.cqe_next; \ + (elm)->field.cqe_prev = (listelm); \ + if ((listelm)->field.cqe_next == (void *)(head)) \ + (head)->cqh_last = (elm); \ + else \ + (listelm)->field.cqe_next->field.cqe_prev = (elm); \ + (listelm)->field.cqe_next = (elm); \ +} + +#define CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) { \ + (elm)->field.cqe_next = (listelm); \ + (elm)->field.cqe_prev = (listelm)->field.cqe_prev; \ + if ((listelm)->field.cqe_prev == (void *)(head)) \ + (head)->cqh_first = (elm); \ + else \ + (listelm)->field.cqe_prev->field.cqe_next = (elm); \ + (listelm)->field.cqe_prev = (elm); \ +} + +#define CIRCLEQ_INSERT_HEAD(head, elm, field) { \ + (elm)->field.cqe_next = (head)->cqh_first; \ + (elm)->field.cqe_prev = (void *)(head); \ + if ((head)->cqh_last == (void *)(head)) \ + (head)->cqh_last = (elm); \ + else \ + (head)->cqh_first->field.cqe_prev = (elm); \ + (head)->cqh_first = (elm); \ +} + +#define CIRCLEQ_INSERT_TAIL(head, elm, field) { \ + (elm)->field.cqe_next = (void *)(head); \ + (elm)->field.cqe_prev = (head)->cqh_last; \ + if ((head)->cqh_first == (void *)(head)) \ + (head)->cqh_first = (elm); \ + else \ + (head)->cqh_last->field.cqe_next = (elm); \ + (head)->cqh_last = (elm); \ +} + +#define CIRCLEQ_REMOVE(head, elm, field) { \ + if ((elm)->field.cqe_next == (void *)(head)) \ + (head)->cqh_last = (elm)->field.cqe_prev; \ + else \ + (elm)->field.cqe_next->field.cqe_prev = \ + (elm)->field.cqe_prev; \ + if ((elm)->field.cqe_prev == (void *)(head)) \ + (head)->cqh_first = (elm)->field.cqe_next; \ + else \ + (elm)->field.cqe_prev->field.cqe_next = \ + (elm)->field.cqe_next; \ +} +#endif /* !_SYS_QUEUE_H_ */ diff --git a/sys/sys/reboot.h b/sys/sys/reboot.h new file mode 100644 index 00000000000..c3c957e17ee --- /dev/null +++ b/sys/sys/reboot.h @@ 
-0,0 +1,88 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)reboot.h 8.1 (Berkeley) 6/2/93 + */ + +/* + * Arguments to reboot system call. 
+ * These are passed to boot program in r11, + * and on to init. + */ +#define RB_AUTOBOOT 0 /* flags for system auto-booting itself */ + +#define RB_ASKNAME 0x01 /* ask for file name to reboot from */ +#define RB_SINGLE 0x02 /* reboot to single user only */ +#define RB_NOSYNC 0x04 /* dont sync before reboot */ +#define RB_HALT 0x08 /* don't reboot, just halt */ +#define RB_INITNAME 0x10 /* name given for /etc/init (unused) */ +#define RB_DFLTROOT 0x20 /* use compiled-in rootdev */ +#define RB_KDB 0x40 /* give control to kernel debugger */ +#define RB_RDONLY 0x80 /* mount root fs read-only */ +#define RB_DUMP 0x100 /* dump kernel memory before reboot */ +#define RB_MINIROOT 0x200 /* mini-root present in memory at boot time */ + +/* + * Constants for converting boot-style device number to type, + * adaptor (uba, mba, etc), unit number and partition number. + * Type (== major device number) is in the low byte + * for backward compatibility. Except for that of the "magic + * number", each mask applies to the shifted value. 
+ * Format: + * (4) (4) (4) (4) (8) (8) + * -------------------------------- + * |MA | AD| CT| UN| PART | TYPE | + * -------------------------------- + */ +#define B_ADAPTORSHIFT 24 +#define B_ADAPTORMASK 0x0f +#define B_ADAPTOR(val) (((val) >> B_ADAPTORSHIFT) & B_ADAPTORMASK) +#define B_CONTROLLERSHIFT 20 +#define B_CONTROLLERMASK 0xf +#define B_CONTROLLER(val) (((val)>>B_CONTROLLERSHIFT) & B_CONTROLLERMASK) +#define B_UNITSHIFT 16 +#define B_UNITMASK 0xf +#define B_UNIT(val) (((val) >> B_UNITSHIFT) & B_UNITMASK) +#define B_PARTITIONSHIFT 8 +#define B_PARTITIONMASK 0xff +#define B_PARTITION(val) (((val) >> B_PARTITIONSHIFT) & B_PARTITIONMASK) +#define B_TYPESHIFT 0 +#define B_TYPEMASK 0xff +#define B_TYPE(val) (((val) >> B_TYPESHIFT) & B_TYPEMASK) + +#define B_MAGICMASK ((u_long)0xf0000000) +#define B_DEVMAGIC ((u_long)0xa0000000) + +#define MAKEBOOTDEV(type, adaptor, controller, unit, partition) \ + (((type) << B_TYPESHIFT) | ((adaptor) << B_ADAPTORSHIFT) | \ + ((controller) << B_CONTROLLERSHIFT) | ((unit) << B_UNITSHIFT) | \ + ((partition) << B_PARTITIONSHIFT) | B_DEVMAGIC) diff --git a/sys/sys/resource.h b/sys/sys/resource.h new file mode 100644 index 00000000000..559f1ac6c37 --- /dev/null +++ b/sys/sys/resource.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)resource.h 8.2 (Berkeley) 1/4/94 + */ + +#ifndef _SYS_RESOURCE_H_ +#define _SYS_RESOURCE_H_ + +/* + * Process priority specifications to get/setpriority. + */ +#define PRIO_MIN -20 +#define PRIO_MAX 20 + +#define PRIO_PROCESS 0 +#define PRIO_PGRP 1 +#define PRIO_USER 2 + +/* + * Resource utilization information. 
+ */ + +#define RUSAGE_SELF 0 +#define RUSAGE_CHILDREN -1 + +struct rusage { + struct timeval ru_utime; /* user time used */ + struct timeval ru_stime; /* system time used */ + long ru_maxrss; /* max resident set size */ +#define ru_first ru_ixrss + long ru_ixrss; /* integral shared memory size */ + long ru_idrss; /* integral unshared data " */ + long ru_isrss; /* integral unshared stack " */ + long ru_minflt; /* page reclaims */ + long ru_majflt; /* page faults */ + long ru_nswap; /* swaps */ + long ru_inblock; /* block input operations */ + long ru_oublock; /* block output operations */ + long ru_msgsnd; /* messages sent */ + long ru_msgrcv; /* messages received */ + long ru_nsignals; /* signals received */ + long ru_nvcsw; /* voluntary context switches */ + long ru_nivcsw; /* involuntary " */ +#define ru_last ru_nivcsw +}; + +/* + * Resource limits + */ +#define RLIMIT_CPU 0 /* cpu time in seconds */ +#define RLIMIT_FSIZE 1 /* maximum file size */ +#define RLIMIT_DATA 2 /* data size */ +#define RLIMIT_STACK 3 /* stack size */ +#define RLIMIT_CORE 4 /* core file size */ +#define RLIMIT_RSS 5 /* resident set size */ +#define RLIMIT_MEMLOCK 6 /* locked-in-memory address space */ +#define RLIMIT_NPROC 7 /* number of processes */ +#define RLIMIT_NOFILE 8 /* number of open files */ + +#define RLIM_NLIMITS 9 /* number of resource limits */ + +#define RLIM_INFINITY (((u_quad_t)1 << 63) - 1) + +struct orlimit { + long rlim_cur; /* current (soft) limit */ + long rlim_max; /* maximum value for rlim_cur */ +}; + +struct rlimit { + quad_t rlim_cur; /* current (soft) limit */ + quad_t rlim_max; /* maximum value for rlim_cur */ +}; + +/* Load average structure. 
*/ +struct loadavg { + fixpt_t ldavg[3]; + long fscale; +}; + +#ifdef KERNEL +extern struct loadavg averunnable; + +#else +#include + +__BEGIN_DECLS +int getpriority __P((int, int)); +int getrlimit __P((int, struct rlimit *)); +int getrusage __P((int, struct rusage *)); +int setpriority __P((int, int, int)); +int setrlimit __P((int, const struct rlimit *)); +__END_DECLS + +#endif /* KERNEL */ +#endif /* !_SYS_RESOURCE_H_ */ diff --git a/sys/sys/resourcevar.h b/sys/sys/resourcevar.h new file mode 100644 index 00000000000..0f8d5e30eed --- /dev/null +++ b/sys/sys/resourcevar.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)resourcevar.h 8.3 (Berkeley) 2/22/94 + */ + +#ifndef _SYS_RESOURCEVAR_H_ +#define _SYS_RESOURCEVAR_H_ + +/* + * Kernel per-process accounting / statistics + * (not necessarily resident except when running). + */ +struct pstats { +#define pstat_startzero p_ru + struct rusage p_ru; /* stats for this proc */ + struct rusage p_cru; /* sum of stats for reaped children */ +#define pstat_endzero pstat_startcopy + +#define pstat_startcopy p_timer + struct itimerval p_timer[3]; /* virtual-time timers */ + + struct uprof { /* profile arguments */ + caddr_t pr_base; /* buffer base */ + u_long pr_size; /* buffer size */ + u_long pr_off; /* pc offset */ + u_long pr_scale; /* pc scaling */ + u_long pr_addr; /* temp storage for addr until AST */ + u_long pr_ticks; /* temp storage for ticks until AST */ + } p_prof; +#define pstat_endcopy p_start + struct timeval p_start; /* starting time */ +}; + +/* + * Kernel shareable process resource limits. Because this structure + * is moderately large but changes infrequently, it is normally + * shared copy-on-write after forks. If a group of processes + * ("threads") share modifications, the PL_SHAREMOD flag is set, + * and a copy must be made for the child of a new fork that isn't + * sharing modifications to the limits. 
+ */ +struct plimit { + struct rlimit pl_rlimit[RLIM_NLIMITS]; +#define PL_SHAREMOD 0x01 /* modifications are shared */ + int p_lflags; + int p_refcnt; /* number of references */ +}; + +/* add user profiling from AST */ +#define ADDUPROF(p) \ + addupc_task(p, \ + (p)->p_stats->p_prof.pr_addr, (p)->p_stats->p_prof.pr_ticks) + +#ifdef KERNEL +void addupc_intr __P((struct proc *p, u_long pc, u_int ticks)); +void addupc_task __P((struct proc *p, u_long pc, u_int ticks)); +struct plimit + *limcopy __P((struct plimit *lim)); +#endif +#endif /* !_SYS_RESOURCEVAR_H_ */ diff --git a/sys/sys/select.h b/sys/sys/select.h new file mode 100644 index 00000000000..a279c592fbe --- /dev/null +++ b/sys/sys/select.h @@ -0,0 +1,56 @@ +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)select.h 8.2 (Berkeley) 1/4/94 + */ + +#ifndef _SYS_SELECT_H_ +#define _SYS_SELECT_H_ + +/* + * Used to maintain information about processes that wish to be + * notified when I/O becomes possible. + */ +struct selinfo { + pid_t si_pid; /* process to be notified */ + short si_flags; /* see below */ +}; +#define SI_COLL 0x0001 /* collision occurred */ + +#ifdef KERNEL +struct proc; + +void selrecord __P((struct proc *selector, struct selinfo *)); +void selwakeup __P((struct selinfo *)); +#endif + +#endif /* !_SYS_SELECT_H_ */ diff --git a/sys/sys/selinfo.h b/sys/sys/selinfo.h new file mode 100644 index 00000000000..a279c592fbe --- /dev/null +++ b/sys/sys/selinfo.h @@ -0,0 +1,56 @@ +/*- + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)select.h 8.2 (Berkeley) 1/4/94 + */ + +#ifndef _SYS_SELECT_H_ +#define _SYS_SELECT_H_ + +/* + * Used to maintain information about processes that wish to be + * notified when I/O becomes possible. 
+ */ +struct selinfo { + pid_t si_pid; /* process to be notified */ + short si_flags; /* see below */ +}; +#define SI_COLL 0x0001 /* collision occurred */ + +#ifdef KERNEL +struct proc; + +void selrecord __P((struct proc *selector, struct selinfo *)); +void selwakeup __P((struct selinfo *)); +#endif + +#endif /* !_SYS_SELECT_H_ */ diff --git a/sys/sys/signal.h b/sys/sys/signal.h new file mode 100644 index 00000000000..8ccded41c3b --- /dev/null +++ b/sys/sys/signal.h @@ -0,0 +1,194 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)signal.h 8.2 (Berkeley) 1/21/94 + */ + +#ifndef _SYS_SIGNAL_H_ +#define _SYS_SIGNAL_H_ + +#define NSIG 32 /* counting 0; could be 33 (mask is 1-32) */ + +#ifndef _ANSI_SOURCE +#include /* sigcontext; codes for SIGILL, SIGFPE */ +#endif + +#define SIGHUP 1 /* hangup */ +#define SIGINT 2 /* interrupt */ +#define SIGQUIT 3 /* quit */ +#define SIGILL 4 /* illegal instruction (not reset when caught) */ +#ifndef _POSIX_SOURCE +#define SIGTRAP 5 /* trace trap (not reset when caught) */ +#endif +#define SIGABRT 6 /* abort() */ +#ifndef _POSIX_SOURCE +#define SIGIOT SIGABRT /* compatibility */ +#define SIGEMT 7 /* EMT instruction */ +#endif +#define SIGFPE 8 /* floating point exception */ +#define SIGKILL 9 /* kill (cannot be caught or ignored) */ +#ifndef _POSIX_SOURCE +#define SIGBUS 10 /* bus error */ +#endif +#define SIGSEGV 11 /* segmentation violation */ +#ifndef _POSIX_SOURCE +#define SIGSYS 12 /* bad argument to system call */ +#endif +#define SIGPIPE 13 /* write on a pipe with no one to read it */ +#define SIGALRM 14 /* alarm clock */ +#define SIGTERM 15 /* software termination signal from kill */ +#ifndef _POSIX_SOURCE +#define SIGURG 16 /* urgent condition on IO channel */ 
+#endif +#define SIGSTOP 17 /* sendable stop signal not from tty */ +#define SIGTSTP 18 /* stop signal from tty */ +#define SIGCONT 19 /* continue a stopped process */ +#define SIGCHLD 20 /* to parent on child stop or exit */ +#define SIGTTIN 21 /* to readers pgrp upon background tty read */ +#define SIGTTOU 22 /* like TTIN for output if (tp->t_local&LTOSTOP) */ +#ifndef _POSIX_SOURCE +#define SIGIO 23 /* input/output possible signal */ +#define SIGXCPU 24 /* exceeded CPU time limit */ +#define SIGXFSZ 25 /* exceeded file size limit */ +#define SIGVTALRM 26 /* virtual time alarm */ +#define SIGPROF 27 /* profiling time alarm */ +#define SIGWINCH 28 /* window size changes */ +#define SIGINFO 29 /* information request */ +#endif +#define SIGUSR1 30 /* user defined signal 1 */ +#define SIGUSR2 31 /* user defined signal 2 */ + +#if defined(_ANSI_SOURCE) || defined(__cplusplus) +/* + * Language spec sez we must list exactly one parameter, even though we + * actually supply three. Ugh! + */ +#define SIG_DFL (void (*)(int))0 +#define SIG_IGN (void (*)(int))1 +#define SIG_ERR (void (*)(int))-1 +#else +#define SIG_DFL (void (*)())0 +#define SIG_IGN (void (*)())1 +#define SIG_ERR (void (*)())-1 +#endif + +#ifndef _ANSI_SOURCE +typedef unsigned int sigset_t; + +/* + * Signal vector "template" used in sigaction call.
+ */ +struct sigaction { + void (*sa_handler)(); /* signal handler */ + sigset_t sa_mask; /* signal mask to apply */ + int sa_flags; /* see signal options below */ +}; +#ifndef _POSIX_SOURCE +#define SA_ONSTACK 0x0001 /* take signal on signal stack */ +#define SA_RESTART 0x0002 /* restart system on signal return */ +#define SA_DISABLE 0x0004 /* disable taking signals on alternate stack */ +#ifdef COMPAT_SUNOS +#define SA_USERTRAMP 0x0100 /* do not bounce off kernel's sigtramp */ +#endif +#endif +#define SA_NOCLDSTOP 0x0008 /* do not generate SIGCHLD on child stop */ + +/* + * Flags for sigprocmask: + */ +#define SIG_BLOCK 1 /* block specified signal set */ +#define SIG_UNBLOCK 2 /* unblock specified signal set */ +#define SIG_SETMASK 3 /* set specified signal set */ + +#ifndef _POSIX_SOURCE +#ifndef KERNEL +#include <sys/cdefs.h> +#endif +typedef void (*sig_t) __P((int)); /* type of signal function */ + +/* + * Structure used in sigaltstack call. + */ +struct sigaltstack { + char *ss_base; /* signal stack base */ + int ss_size; /* signal stack length */ + int ss_flags; /* SA_DISABLE and/or SA_ONSTACK */ +}; +#define MINSIGSTKSZ 8192 /* minimum allowable stack */ +#define SIGSTKSZ (MINSIGSTKSZ + 32768) /* recommended stack size */ + +/* + * 4.3 compatibility: + * Signal vector "template" used in sigvec call. + */ +struct sigvec { + void (*sv_handler)(); /* signal handler */ + int sv_mask; /* signal mask to apply */ + int sv_flags; /* see signal options below */ +}; + +#define SV_ONSTACK SA_ONSTACK +#define SV_INTERRUPT SA_RESTART /* same bit, opposite sense */ +#define sv_onstack sv_flags /* isn't compatibility wonderful! */ + +/* + * Structure used in sigstack call. + */ +struct sigstack { + char *ss_sp; /* signal stack pointer */ + int ss_onstack; /* current status */ +}; + +/* + * Macro for converting signal number to a mask suitable for + * sigblock().
+ */ +#define sigmask(m) (1 << ((m)-1)) + +#define BADSIG SIG_ERR + +#endif /* !_POSIX_SOURCE */ +#endif /* !_ANSI_SOURCE */ + +/* + * For historical reasons; programs expect signal's return value to be + * defined by <sys/signal.h>. + */ +__BEGIN_DECLS +void (*signal __P((int, void (*) __P((int))))) __P((int)); +__END_DECLS +#endif /* !_SYS_SIGNAL_H_ */ diff --git a/sys/sys/signalvar.h b/sys/sys/signalvar.h new file mode 100644 index 00000000000..3d7e68bc530 --- /dev/null +++ b/sys/sys/signalvar.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)signalvar.h 8.3 (Berkeley) 1/4/94 + */ + +#ifndef _SYS_SIGNALVAR_H_ /* tmp for user.h */ +#define _SYS_SIGNALVAR_H_ + +/* + * Kernel signal definitions and data structures, + * not exported to user programs. + */ + +/* + * Process signal actions and state, needed only within the process + * (not necessarily resident). + */ +struct sigacts { + sig_t ps_sigact[NSIG]; /* disposition of signals */ + sigset_t ps_catchmask[NSIG]; /* signals to be blocked */ + sigset_t ps_sigonstack; /* signals to take on sigstack */ + sigset_t ps_sigintr; /* signals that interrupt syscalls */ + sigset_t ps_oldmask; /* saved mask from before sigpause */ + int ps_flags; /* signal flags, below */ + struct sigaltstack ps_sigstk; /* sp & on stack state variable */ + int ps_sig; /* for core dump/debugger XXX */ + int ps_code; /* for core dump/debugger XXX */ + int ps_addr; /* for core dump/debugger XXX */ + sigset_t ps_usertramp; /* SunOS compat; libc sigtramp XXX */ +}; + +/* signal flags */ +#define SAS_OLDMASK 0x01 /* need to restore mask before pause */ +#define SAS_ALTSTACK 0x02 /* have alternate signal stack */ + +/* additional signal action values, used only temporarily/internally */ +#define SIG_CATCH (void (*)())2 +#define SIG_HOLD (void (*)())3 + +/* + * get signal action for process and signal; currently only for current process + */ +#define SIGACTION(p, sig) ((p)->p_sigacts->ps_sigact[(sig)]) + +/* + * Determine signal that should be delivered to
process p, the current + * process, 0 if none. If there is a pending stop signal with default + * action, the process stops in issig(). + */ +#define CURSIG(p) \ + (((p)->p_siglist == 0 || \ + ((p)->p_flag & P_TRACED) == 0 && \ + ((p)->p_siglist & ~(p)->p_sigmask) == 0) ? \ + 0 : issignal(p)) + +/* + * Clear a pending signal from a process. + */ +#define CLRSIG(p, sig) { (p)->p_siglist &= ~sigmask(sig); } + +/* + * Signal properties and actions. + * The array below categorizes the signals and their default actions + * according to the following properties: + */ +#define SA_KILL 0x01 /* terminates process by default */ +#define SA_CORE 0x02 /* ditto and coredumps */ +#define SA_STOP 0x04 /* suspend process */ +#define SA_TTYSTOP 0x08 /* ditto, from tty */ +#define SA_IGNORE 0x10 /* ignore by default */ +#define SA_CONT 0x20 /* continue if suspended */ +#define SA_CANTMASK 0x40 /* non-maskable, catchable */ + +#ifdef SIGPROP +int sigprop[NSIG + 1] = { + 0, /* unused */ + SA_KILL, /* SIGHUP */ + SA_KILL, /* SIGINT */ + SA_KILL|SA_CORE, /* SIGQUIT */ + SA_KILL|SA_CORE, /* SIGILL */ + SA_KILL|SA_CORE, /* SIGTRAP */ + SA_KILL|SA_CORE, /* SIGABRT */ + SA_KILL|SA_CORE, /* SIGEMT */ + SA_KILL|SA_CORE, /* SIGFPE */ + SA_KILL, /* SIGKILL */ + SA_KILL|SA_CORE, /* SIGBUS */ + SA_KILL|SA_CORE, /* SIGSEGV */ + SA_KILL|SA_CORE, /* SIGSYS */ + SA_KILL, /* SIGPIPE */ + SA_KILL, /* SIGALRM */ + SA_KILL, /* SIGTERM */ + SA_IGNORE, /* SIGURG */ + SA_STOP, /* SIGSTOP */ + SA_STOP|SA_TTYSTOP, /* SIGTSTP */ + SA_IGNORE|SA_CONT, /* SIGCONT */ + SA_IGNORE, /* SIGCHLD */ + SA_STOP|SA_TTYSTOP, /* SIGTTIN */ + SA_STOP|SA_TTYSTOP, /* SIGTTOU */ + SA_IGNORE, /* SIGIO */ + SA_KILL, /* SIGXCPU */ + SA_KILL, /* SIGXFSZ */ + SA_KILL, /* SIGVTALRM */ + SA_KILL, /* SIGPROF */ + SA_IGNORE, /* SIGWINCH */ + SA_IGNORE, /* SIGINFO */ + SA_KILL, /* SIGUSR1 */ + SA_KILL, /* SIGUSR2 */ +}; + +#define contsigmask (sigmask(SIGCONT)) +#define stopsigmask (sigmask(SIGSTOP) | sigmask(SIGTSTP) | \ + 
sigmask(SIGTTIN) | sigmask(SIGTTOU)) + +#endif /* SIGPROP */ + +#define sigcantmask (sigmask(SIGKILL) | sigmask(SIGSTOP)) + +#ifdef KERNEL +/* + * Machine-independent functions: + */ +int coredump __P((struct proc *p)); +void execsigs __P((struct proc *p)); +void gsignal __P((int pgid, int sig)); +int issig __P((struct proc *p)); +void pgsignal __P((struct pgrp *pgrp, int sig, int checkctty)); +void postsig __P((int sig)); +void psignal __P((struct proc *p, int sig)); +void siginit __P((struct proc *p)); +void trapsignal __P((struct proc *p, int sig, unsigned code)); + +/* + * Machine-dependent functions: + */ +void sendsig __P((sig_t action, int sig, int returnmask, unsigned code)); +#endif /* KERNEL */ +#endif /* !_SYS_SIGNALVAR_H_ */ diff --git a/sys/sys/socket.h b/sys/sys/socket.h new file mode 100644 index 00000000000..f6728e98854 --- /dev/null +++ b/sys/sys/socket.h @@ -0,0 +1,339 @@ +/* + * Copyright (c) 1982, 1985, 1986, 1988, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)socket.h 8.4 (Berkeley) 2/21/94 + */ + +#ifndef _SYS_SOCKET_H_ +#define _SYS_SOCKET_H_ + +/* + * Definitions related to sockets: types, address families, options. + */ + +/* + * Types + */ +#define SOCK_STREAM 1 /* stream socket */ +#define SOCK_DGRAM 2 /* datagram socket */ +#define SOCK_RAW 3 /* raw-protocol interface */ +#define SOCK_RDM 4 /* reliably-delivered message */ +#define SOCK_SEQPACKET 5 /* sequenced packet stream */ + +/* + * Option flags per-socket. + */ +#define SO_DEBUG 0x0001 /* turn on debugging info recording */ +#define SO_ACCEPTCONN 0x0002 /* socket has had listen() */ +#define SO_REUSEADDR 0x0004 /* allow local address reuse */ +#define SO_KEEPALIVE 0x0008 /* keep connections alive */ +#define SO_DONTROUTE 0x0010 /* just use interface addresses */ +#define SO_BROADCAST 0x0020 /* permit sending of broadcast msgs */ +#define SO_USELOOPBACK 0x0040 /* bypass hardware when possible */ +#define SO_LINGER 0x0080 /* linger on close if data present */ +#define SO_OOBINLINE 0x0100 /* leave received OOB data in line */ +#define SO_REUSEPORT 0x0200 /* allow local address & port reuse */ + +/* + * Additional options, not kept in so_options. 
+ */ +#define SO_SNDBUF 0x1001 /* send buffer size */ +#define SO_RCVBUF 0x1002 /* receive buffer size */ +#define SO_SNDLOWAT 0x1003 /* send low-water mark */ +#define SO_RCVLOWAT 0x1004 /* receive low-water mark */ +#define SO_SNDTIMEO 0x1005 /* send timeout */ +#define SO_RCVTIMEO 0x1006 /* receive timeout */ +#define SO_ERROR 0x1007 /* get error status and clear */ +#define SO_TYPE 0x1008 /* get socket type */ + +/* + * Structure used for manipulating linger option. + */ +struct linger { + int l_onoff; /* option on/off */ + int l_linger; /* linger time */ +}; + +/* + * Level number for (get/set)sockopt() to apply to socket itself. + */ +#define SOL_SOCKET 0xffff /* options for socket level */ + +/* + * Address families. + */ +#define AF_UNSPEC 0 /* unspecified */ +#define AF_LOCAL 1 /* local to host (pipes, portals) */ +#define AF_UNIX AF_LOCAL /* backward compatibility */ +#define AF_INET 2 /* internetwork: UDP, TCP, etc. */ +#define AF_IMPLINK 3 /* arpanet imp addresses */ +#define AF_PUP 4 /* pup protocols: e.g. 
BSP */ +#define AF_CHAOS 5 /* mit CHAOS protocols */ +#define AF_NS 6 /* XEROX NS protocols */ +#define AF_ISO 7 /* ISO protocols */ +#define AF_OSI AF_ISO +#define AF_ECMA 8 /* european computer manufacturers */ +#define AF_DATAKIT 9 /* datakit protocols */ +#define AF_CCITT 10 /* CCITT protocols, X.25 etc */ +#define AF_SNA 11 /* IBM SNA */ +#define AF_DECnet 12 /* DECnet */ +#define AF_DLI 13 /* DEC Direct data link interface */ +#define AF_LAT 14 /* LAT */ +#define AF_HYLINK 15 /* NSC Hyperchannel */ +#define AF_APPLETALK 16 /* Apple Talk */ +#define AF_ROUTE 17 /* Internal Routing Protocol */ +#define AF_LINK 18 /* Link layer interface */ +#define pseudo_AF_XTP 19 /* eXpress Transfer Protocol (no AF) */ +#define AF_COIP 20 /* connection-oriented IP, aka ST II */ +#define AF_CNT 21 /* Computer Network Technology */ +#define pseudo_AF_RTIP 22 /* Help Identify RTIP packets */ +#define AF_IPX 23 /* Novell Internet Protocol */ +#define AF_SIP 24 /* Simple Internet Protocol */ +#define pseudo_AF_PIP 25 /* Help Identify PIP packets */ + +#define AF_MAX 26 + +/* + * Structure used by kernel to store most + * addresses. + */ +struct sockaddr { + u_char sa_len; /* total length */ + u_char sa_family; /* address family */ + char sa_data[14]; /* actually longer; address value */ +}; + +/* + * Structure used by kernel to pass protocol + * information in raw sockets. + */ +struct sockproto { + u_short sp_family; /* address family */ + u_short sp_protocol; /* protocol */ +}; + +/* + * Protocol families, same as address families for now. 
+ */ +#define PF_UNSPEC AF_UNSPEC +#define PF_LOCAL AF_LOCAL +#define PF_UNIX PF_LOCAL /* backward compatibility */ +#define PF_INET AF_INET +#define PF_IMPLINK AF_IMPLINK +#define PF_PUP AF_PUP +#define PF_CHAOS AF_CHAOS +#define PF_NS AF_NS +#define PF_ISO AF_ISO +#define PF_OSI AF_ISO +#define PF_ECMA AF_ECMA +#define PF_DATAKIT AF_DATAKIT +#define PF_CCITT AF_CCITT +#define PF_SNA AF_SNA +#define PF_DECnet AF_DECnet +#define PF_DLI AF_DLI +#define PF_LAT AF_LAT +#define PF_HYLINK AF_HYLINK +#define PF_APPLETALK AF_APPLETALK +#define PF_ROUTE AF_ROUTE +#define PF_LINK AF_LINK +#define PF_XTP pseudo_AF_XTP /* really just proto family, no AF */ +#define PF_COIP AF_COIP +#define PF_CNT AF_CNT +#define PF_SIP AF_SIP +#define PF_IPX AF_IPX /* same format as AF_NS */ +#define PF_RTIP pseudo_AF_RTIP /* same format as AF_INET */ +#define PF_PIP pseudo_AF_PIP + +#define PF_MAX AF_MAX + +/* + * Definitions for network related sysctl, CTL_NET. + * + * Second level is protocol family. + * Third level is protocol number. + * + * Further levels are defined by the individual families below.
+ */ +#define NET_MAXID AF_MAX + +#define CTL_NET_NAMES { \ + { 0, 0 }, \ + { "unix", CTLTYPE_NODE }, \ + { "inet", CTLTYPE_NODE }, \ + { "implink", CTLTYPE_NODE }, \ + { "pup", CTLTYPE_NODE }, \ + { "chaos", CTLTYPE_NODE }, \ + { "xerox_ns", CTLTYPE_NODE }, \ + { "iso", CTLTYPE_NODE }, \ + { "emca", CTLTYPE_NODE }, \ + { "datakit", CTLTYPE_NODE }, \ + { "ccitt", CTLTYPE_NODE }, \ + { "ibm_sna", CTLTYPE_NODE }, \ + { "decnet", CTLTYPE_NODE }, \ + { "dec_dli", CTLTYPE_NODE }, \ + { "lat", CTLTYPE_NODE }, \ + { "hylink", CTLTYPE_NODE }, \ + { "appletalk", CTLTYPE_NODE }, \ + { "route", CTLTYPE_NODE }, \ + { "link_layer", CTLTYPE_NODE }, \ + { "xtp", CTLTYPE_NODE }, \ + { "coip", CTLTYPE_NODE }, \ + { "cnt", CTLTYPE_NODE }, \ + { "rtip", CTLTYPE_NODE }, \ + { "ipx", CTLTYPE_NODE }, \ + { "sip", CTLTYPE_NODE }, \ + { "pip", CTLTYPE_NODE }, \ +} + +/* + * PF_ROUTE - Routing table + * + * Three additional levels are defined: + * Fourth: address family, 0 is wildcard + * Fifth: type of info, defined below + * Sixth: flag(s) to mask with for NET_RT_FLAGS + */ +#define NET_RT_DUMP 1 /* dump; may limit to a.f. */ +#define NET_RT_FLAGS 2 /* by flags, e.g. RESOLVING */ +#define NET_RT_IFLIST 3 /* survey interface list */ +#define NET_RT_MAXID 4 + +#define CTL_NET_RT_NAMES { \ + { 0, 0 }, \ + { "dump", CTLTYPE_STRUCT }, \ + { "flags", CTLTYPE_STRUCT }, \ + { "iflist", CTLTYPE_STRUCT }, \ +} + +/* + * Maximum queue length specifiable by listen. + */ +#define SOMAXCONN 5 + +/* + * Message header for recvmsg and sendmsg calls. + * Used value-result for recvmsg, value only for sendmsg. 
+ */ +struct msghdr { + caddr_t msg_name; /* optional address */ + u_int msg_namelen; /* size of address */ + struct iovec *msg_iov; /* scatter/gather array */ + u_int msg_iovlen; /* # elements in msg_iov */ + caddr_t msg_control; /* ancillary data, see below */ + u_int msg_controllen; /* ancillary data buffer len */ + int msg_flags; /* flags on received message */ +}; + +#define MSG_OOB 0x1 /* process out-of-band data */ +#define MSG_PEEK 0x2 /* peek at incoming message */ +#define MSG_DONTROUTE 0x4 /* send without using routing tables */ +#define MSG_EOR 0x8 /* data completes record */ +#define MSG_TRUNC 0x10 /* data discarded before delivery */ +#define MSG_CTRUNC 0x20 /* control data lost before delivery */ +#define MSG_WAITALL 0x40 /* wait for full request or error */ +#define MSG_DONTWAIT 0x80 /* this message should be nonblocking */ + +/* + * Header for ancillary data objects in msg_control buffer. + * Used for additional information with/about a datagram + * not expressible by flags. The format is a sequence + * of message elements headed by cmsghdr structures. + */ +struct cmsghdr { + u_int cmsg_len; /* data byte count, including hdr */ + int cmsg_level; /* originating protocol */ + int cmsg_type; /* protocol-specific type */ +/* followed by u_char cmsg_data[]; */ +}; + +/* given pointer to struct cmsghdr, return pointer to data */ +#define CMSG_DATA(cmsg) ((u_char *)((cmsg) + 1)) + +/* given pointer to struct cmsghdr, return pointer to next cmsghdr */ +#define CMSG_NXTHDR(mhdr, cmsg) \ + (((caddr_t)(cmsg) + (cmsg)->cmsg_len + sizeof(struct cmsghdr) > \ + (mhdr)->msg_control + (mhdr)->msg_controllen) ? 
\ + (struct cmsghdr *)NULL : \ + (struct cmsghdr *)((caddr_t)(cmsg) + ALIGN((cmsg)->cmsg_len))) + +#define CMSG_FIRSTHDR(mhdr) ((struct cmsghdr *)(mhdr)->msg_control) + +/* "Socket"-level control message types: */ +#define SCM_RIGHTS 0x01 /* access rights (array of int) */ + +/* + * 4.3 compat sockaddr, move to compat file later + */ +struct osockaddr { + u_short sa_family; /* address family */ + char sa_data[14]; /* up to 14 bytes of direct address */ +}; + +/* + * 4.3-compat message header (move to compat file later). + */ +struct omsghdr { + caddr_t msg_name; /* optional address */ + int msg_namelen; /* size of address */ + struct iovec *msg_iov; /* scatter/gather array */ + int msg_iovlen; /* # elements in msg_iov */ + caddr_t msg_accrights; /* access rights sent/received */ + int msg_accrightslen; +}; + +#ifndef KERNEL + +#include <sys/cdefs.h> + +__BEGIN_DECLS +int accept __P((int, struct sockaddr *, int *)); +int bind __P((int, const struct sockaddr *, int)); +int connect __P((int, const struct sockaddr *, int)); +int getpeername __P((int, struct sockaddr *, int *)); +int getsockname __P((int, struct sockaddr *, int *)); +int getsockopt __P((int, int, int, void *, int *)); +int listen __P((int, int)); +ssize_t recv __P((int, void *, size_t, int)); +ssize_t recvfrom __P((int, void *, size_t, int, struct sockaddr *, int *)); +ssize_t recvmsg __P((int, struct msghdr *, int)); +ssize_t send __P((int, const void *, size_t, int)); +ssize_t sendto __P((int, const void *, + size_t, int, const struct sockaddr *, int)); +ssize_t sendmsg __P((int, const struct msghdr *, int)); +int setsockopt __P((int, int, int, const void *, int)); +int shutdown __P((int, int)); +int socket __P((int, int, int)); +int socketpair __P((int, int, int, int *)); +__END_DECLS + +#endif /* !KERNEL */ +#endif /* !_SYS_SOCKET_H_ */ diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h new file mode 100644 index 00000000000..ff104046c7c --- /dev/null +++ b/sys/sys/socketvar.h @@ -0,0 +1,207 @@ +/*- + *
Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)socketvar.h 8.1 (Berkeley) 6/2/93 + */ + +#include <sys/select.h> /* for struct selinfo */ + +/* + * Kernel structure per socket.
+ * Contains send and receive buffer queues, + * handle on protocol and pointer to protocol + * private data and error information. + */ +struct socket { + short so_type; /* generic type, see socket.h */ + short so_options; /* from socket call, see socket.h */ + short so_linger; /* time to linger while closing */ + short so_state; /* internal state flags SS_*, below */ + caddr_t so_pcb; /* protocol control block */ + struct protosw *so_proto; /* protocol handle */ +/* + * Variables for connection queueing. + * Socket where accepts occur is so_head in all subsidiary sockets. + * If so_head is 0, socket is not related to an accept. + * For head socket so_q0 queues partially completed connections, + * while so_q is a queue of connections ready to be accepted. + * If a connection is aborted and it has so_head set, then + * it has to be pulled out of either so_q0 or so_q. + * We allow connections to queue up based on current queue lengths + * and limit on number of queued connections for this socket. + */ + struct socket *so_head; /* back pointer to accept socket */ + struct socket *so_q0; /* queue of partial connections */ + struct socket *so_q; /* queue of incoming connections */ + short so_q0len; /* partials on so_q0 */ + short so_qlen; /* number of connections on so_q */ + short so_qlimit; /* max number queued connections */ + short so_timeo; /* connection timeout */ + u_short so_error; /* error affecting connection */ + pid_t so_pgid; /* pgid for signals */ + u_long so_oobmark; /* chars to oob mark */ +/* + * Variables for socket buffering. 
+ */ + struct sockbuf { + u_long sb_cc; /* actual chars in buffer */ + u_long sb_hiwat; /* max actual char count */ + u_long sb_mbcnt; /* chars of mbufs used */ + u_long sb_mbmax; /* max chars of mbufs to use */ + long sb_lowat; /* low water mark */ + struct mbuf *sb_mb; /* the mbuf chain */ + struct selinfo sb_sel; /* process selecting read/write */ + short sb_flags; /* flags, see below */ + short sb_timeo; /* timeout for read/write */ + } so_rcv, so_snd; +#define SB_MAX (256*1024) /* default for max chars in sockbuf */ +#define SB_LOCK 0x01 /* lock on data queue */ +#define SB_WANT 0x02 /* someone is waiting to lock */ +#define SB_WAIT 0x04 /* someone is waiting for data/space */ +#define SB_SEL 0x08 /* someone is selecting */ +#define SB_ASYNC 0x10 /* ASYNC I/O, need signals */ +#define SB_NOTIFY (SB_WAIT|SB_SEL|SB_ASYNC) +#define SB_NOINTR 0x40 /* operations not interruptible */ + + caddr_t so_tpcb; /* Wisc. protocol control block XXX */ + void (*so_upcall) __P((struct socket *so, caddr_t arg, int waitf)); + caddr_t so_upcallarg; /* Arg for above */ +}; + +/* + * Socket state bits. + */ +#define SS_NOFDREF 0x001 /* no file table ref any more */ +#define SS_ISCONNECTED 0x002 /* socket connected to a peer */ +#define SS_ISCONNECTING 0x004 /* in process of connecting to peer */ +#define SS_ISDISCONNECTING 0x008 /* in process of disconnecting */ +#define SS_CANTSENDMORE 0x010 /* can't send more data to peer */ +#define SS_CANTRCVMORE 0x020 /* can't receive more data from peer */ +#define SS_RCVATMARK 0x040 /* at mark on input */ + +#define SS_PRIV 0x080 /* privileged for broadcast, raw... */ +#define SS_NBIO 0x100 /* non-blocking ops */ +#define SS_ASYNC 0x200 /* async i/o notify */ +#define SS_ISCONFIRMING 0x400 /* deciding to accept connection req */ + + +/* + * Macros for sockets and socket buffering. + */ + +/* + * How much space is there in a socket buffer (so->so_snd or so->so_rcv)? 
+ * This is problematical if the fields are unsigned, as the space might + * still be negative (cc > hiwat or mbcnt > mbmax). Should detect + * overflow and return 0. Should use "lmin" but it doesn't exist now. + */ +#define sbspace(sb) \ + ((long) imin((int)((sb)->sb_hiwat - (sb)->sb_cc), \ + (int)((sb)->sb_mbmax - (sb)->sb_mbcnt))) + +/* do we have to send all at once on a socket? */ +#define sosendallatonce(so) \ + ((so)->so_proto->pr_flags & PR_ATOMIC) + +/* can we read something from so? */ +#define soreadable(so) \ + ((so)->so_rcv.sb_cc >= (so)->so_rcv.sb_lowat || \ + ((so)->so_state & SS_CANTRCVMORE) || \ + (so)->so_qlen || (so)->so_error) + +/* can we write something to so? */ +#define sowriteable(so) \ + (sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat && \ + (((so)->so_state&SS_ISCONNECTED) || \ + ((so)->so_proto->pr_flags&PR_CONNREQUIRED)==0) || \ + ((so)->so_state & SS_CANTSENDMORE) || \ + (so)->so_error) + +/* adjust counters in sb reflecting allocation of m */ +#define sballoc(sb, m) { \ + (sb)->sb_cc += (m)->m_len; \ + (sb)->sb_mbcnt += MSIZE; \ + if ((m)->m_flags & M_EXT) \ + (sb)->sb_mbcnt += (m)->m_ext.ext_size; \ +} + +/* adjust counters in sb reflecting freeing of m */ +#define sbfree(sb, m) { \ + (sb)->sb_cc -= (m)->m_len; \ + (sb)->sb_mbcnt -= MSIZE; \ + if ((m)->m_flags & M_EXT) \ + (sb)->sb_mbcnt -= (m)->m_ext.ext_size; \ +} + +/* + * Set lock on sockbuf sb; sleep if lock is already held. + * Unless SB_NOINTR is set on sockbuf, sleep is interruptible. + * Returns error without lock if sleep is interrupted. + */ +#define sblock(sb, wf) ((sb)->sb_flags & SB_LOCK ? \ + (((wf) == M_WAITOK) ? 
sb_lock(sb) : EWOULDBLOCK) : \ + ((sb)->sb_flags |= SB_LOCK, 0)) +/* NOTE: the ", 0" must stay inside the last alternative of sblock(); placed outside, the comma operator forces the whole macro to 0 and hides sb_lock()/EWOULDBLOCK results. */ + +/* release lock on sockbuf sb */ +#define sbunlock(sb) { \ + (sb)->sb_flags &= ~SB_LOCK; \ + if ((sb)->sb_flags & SB_WANT) { \ + (sb)->sb_flags &= ~SB_WANT; \ + wakeup((caddr_t)&(sb)->sb_flags); \ + } \ +} + +#define sorwakeup(so) { sowakeup((so), &(so)->so_rcv); \ + if ((so)->so_upcall) \ + (*((so)->so_upcall))((so), (so)->so_upcallarg, M_DONTWAIT); \ + } + +#define sowwakeup(so) sowakeup((so), &(so)->so_snd) + +#ifdef KERNEL +u_long sb_max; +/* to catch callers missing new second argument to sonewconn: */ +#define sonewconn(head, connstatus) sonewconn1((head), (connstatus)) +struct socket *sonewconn1 __P((struct socket *head, int connstatus)); + +/* strings for sleep message: */ +extern char netio[], netcon[], netcls[]; + +/* + * File operations on sockets. + */ +int soo_read __P((struct file *fp, struct uio *uio, struct ucred *cred)); +int soo_write __P((struct file *fp, struct uio *uio, struct ucred *cred)); +int soo_ioctl __P((struct file *fp, int com, caddr_t data, struct proc *p)); +int soo_select __P((struct file *fp, int which, struct proc *p)); +int soo_close __P((struct file *fp, struct proc *p)); +#endif diff --git a/sys/sys/sockio.h b/sys/sys/sockio.h new file mode 100644 index 00000000000..eb5a44a598d --- /dev/null +++ b/sys/sys/sockio.h @@ -0,0 +1,77 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)sockio.h 8.1 (Berkeley) 3/28/94 + */ + +#ifndef _SYS_SOCKIO_H_ +#define _SYS_SOCKIO_H_ + +#include <sys/ioccom.h> /* _IOR, _IOW, _IOWR request encoders */ + +/* Socket ioctl's. */ +#define SIOCSHIWAT _IOW('s', 0, int) /* set high watermark */ +#define SIOCGHIWAT _IOR('s', 1, int) /* get high watermark */ +#define SIOCSLOWAT _IOW('s', 2, int) /* set low watermark */ +#define SIOCGLOWAT _IOR('s', 3, int) /* get low watermark */ +#define SIOCATMARK _IOR('s', 7, int) /* at oob mark?
*/ +#define SIOCSPGRP _IOW('s', 8, int) /* set process group */ +#define SIOCGPGRP _IOR('s', 9, int) /* get process group */ + +#define SIOCADDRT _IOW('r', 10, struct ortentry) /* add route */ +#define SIOCDELRT _IOW('r', 11, struct ortentry) /* delete route */ + +#define SIOCSIFADDR _IOW('i', 12, struct ifreq) /* set ifnet address */ +#define OSIOCGIFADDR _IOWR('i', 13, struct ifreq) /* get ifnet address */ +#define SIOCGIFADDR _IOWR('i', 33, struct ifreq) /* get ifnet address */ +#define SIOCSIFDSTADDR _IOW('i', 14, struct ifreq) /* set p-p address */ +#define OSIOCGIFDSTADDR _IOWR('i', 15, struct ifreq) /* get p-p address */ +#define SIOCGIFDSTADDR _IOWR('i', 34, struct ifreq) /* get p-p address */ +#define SIOCSIFFLAGS _IOW('i', 16, struct ifreq) /* set ifnet flags */ +#define SIOCGIFFLAGS _IOWR('i', 17, struct ifreq) /* get ifnet flags */ +#define OSIOCGIFBRDADDR _IOWR('i', 18, struct ifreq) /* get broadcast addr */ +#define SIOCGIFBRDADDR _IOWR('i', 35, struct ifreq) /* get broadcast addr */ +#define SIOCSIFBRDADDR _IOW('i', 19, struct ifreq) /* set broadcast addr */ +#define OSIOCGIFCONF _IOWR('i', 20, struct ifconf) /* get ifnet list */ +#define SIOCGIFCONF _IOWR('i', 36, struct ifconf) /* get ifnet list */ +#define OSIOCGIFNETMASK _IOWR('i', 21, struct ifreq) /* get net addr mask */ +#define SIOCGIFNETMASK _IOWR('i', 37, struct ifreq) /* get net addr mask */ +#define SIOCSIFNETMASK _IOW('i', 22, struct ifreq) /* set net addr mask */ +#define SIOCGIFMETRIC _IOWR('i', 23, struct ifreq) /* get IF metric */ +#define SIOCSIFMETRIC _IOW('i', 24, struct ifreq) /* set IF metric */ +#define SIOCDIFADDR _IOW('i', 25, struct ifreq) /* delete IF addr */ +#define SIOCAIFADDR _IOW('i', 26, struct ifaliasreq)/* add/chg IF alias */ + +#define SIOCADDMULTI _IOW('i', 49, struct ifreq) /* add m'cast addr */ +#define SIOCDELMULTI _IOW('i', 50, struct ifreq) /* del m'cast addr */ + +#endif /* !_SYS_SOCKIO_H_ */ diff --git a/sys/sys/stat.h b/sys/sys/stat.h new file mode 100644 
index 00000000000..07020c36770 --- /dev/null +++ b/sys/sys/stat.h @@ -0,0 +1,193 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)stat.h 8.6 (Berkeley) 3/8/94 + */ + +#ifndef _SYS_STAT_H_ +#define _SYS_STAT_H_ + +#include <sys/time.h> /* struct timespec, embedded in the stat structures */ + +#ifndef _POSIX_SOURCE +struct ostat { + unsigned short st_dev; /* inode's device */ + ino_t st_ino; /* inode's number */ + mode_t st_mode; /* inode protection mode */ + nlink_t st_nlink; /* number of hard links */ + unsigned short st_uid; /* user ID of the file's owner */ + unsigned short st_gid; /* group ID of the file's group */ + unsigned short st_rdev; /* device type */ + long st_size; /* file size, in bytes */ + struct timespec st_atimespec; /* time of last access */ + struct timespec st_mtimespec; /* time of last data modification */ + struct timespec st_ctimespec; /* time of last file status change */ + long st_blksize; /* optimal blocksize for I/O */ + long st_blocks; /* blocks allocated for file */ + unsigned long st_flags; /* user defined flags for file */ + unsigned long st_gen; /* file generation number */ +}; +#endif /* !_POSIX_SOURCE */ + +struct stat { + dev_t st_dev; /* inode's device */ + ino_t st_ino; /* inode's number */ + mode_t st_mode; /* inode protection mode */ + nlink_t st_nlink; /* number of hard links */ + uid_t st_uid; /* user ID of the file's owner */ + gid_t st_gid; /* group ID of the file's group */ + dev_t st_rdev; /* device type */ + struct timespec st_atimespec; /* time of last access */ + struct timespec st_mtimespec; /* time of last data modification */ + struct timespec st_ctimespec; /* time of last
file status change */ + off_t st_size; /* file size, in bytes */ + quad_t st_blocks; /* blocks allocated for file */ + unsigned long st_blksize; /* optimal blocksize for I/O */ + unsigned long st_flags; /* user defined flags for file */ + unsigned long st_gen; /* file generation number */ + long st_lspare; + quad_t st_qspare[2]; +}; +#define st_atime st_atimespec.ts_sec +#define st_mtime st_mtimespec.ts_sec +#define st_ctime st_ctimespec.ts_sec + +#define S_ISUID 0004000 /* set user id on execution */ +#define S_ISGID 0002000 /* set group id on execution */ +#ifndef _POSIX_SOURCE +#define S_ISTXT 0001000 /* sticky bit */ +#endif + +#define S_IRWXU 0000700 /* RWX mask for owner */ +#define S_IRUSR 0000400 /* R for owner */ +#define S_IWUSR 0000200 /* W for owner */ +#define S_IXUSR 0000100 /* X for owner */ + +#ifndef _POSIX_SOURCE +#define S_IREAD S_IRUSR +#define S_IWRITE S_IWUSR +#define S_IEXEC S_IXUSR +#endif + +#define S_IRWXG 0000070 /* RWX mask for group */ +#define S_IRGRP 0000040 /* R for group */ +#define S_IWGRP 0000020 /* W for group */ +#define S_IXGRP 0000010 /* X for group */ + +#define S_IRWXO 0000007 /* RWX mask for other */ +#define S_IROTH 0000004 /* R for other */ +#define S_IWOTH 0000002 /* W for other */ +#define S_IXOTH 0000001 /* X for other */ + +#ifndef _POSIX_SOURCE +#define S_IFMT 0170000 /* type of file mask */ +#define S_IFIFO 0010000 /* named pipe (fifo) */ +#define S_IFCHR 0020000 /* character special */ +#define S_IFDIR 0040000 /* directory */ +#define S_IFBLK 0060000 /* block special */ +#define S_IFREG 0100000 /* regular */ +#define S_IFLNK 0120000 /* symbolic link */ +#define S_IFSOCK 0140000 /* socket */ +#define S_ISVTX 0001000 /* save swapped text even after use */ +#endif + +#define S_ISDIR(m) ((m & 0170000) == 0040000) /* directory */ +#define S_ISCHR(m) ((m & 0170000) == 0020000) /* char special */ +#define S_ISBLK(m) ((m & 0170000) == 0060000) /* block special */ +#define S_ISREG(m) ((m & 0170000) == 0100000) /* regular 
file */ +#define S_ISFIFO(m) ((m & 0170000) == 0010000 || \ + (m & 0170000) == 0140000) /* fifo or socket */ +#ifndef _POSIX_SOURCE +#define S_ISLNK(m) ((m & 0170000) == 0120000) /* symbolic link */ +#define S_ISSOCK(m) ((m & 0170000) == 0010000 || \ + (m & 0170000) == 0140000) /* fifo or socket */ +#endif + +#ifndef _POSIX_SOURCE +#define ACCESSPERMS (S_IRWXU|S_IRWXG|S_IRWXO) /* 0777 */ + /* 7777 */ +#define ALLPERMS (S_ISUID|S_ISGID|S_ISTXT|S_IRWXU|S_IRWXG|S_IRWXO) + /* 0666 */ +#define DEFFILEMODE (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH) + +#define S_BLKSIZE 512 /* block size used in the stat struct */ + +/* + * Definitions of flags stored in file flags word. + * + * Super-user and owner changeable flags. + */ +#define UF_SETTABLE 0x0000ffff /* mask of owner changeable flags */ +#define UF_NODUMP 0x00000001 /* do not dump file */ +#define UF_IMMUTABLE 0x00000002 /* file may not be changed */ +#define UF_APPEND 0x00000004 /* writes to file may only append */ +/* + * Super-user changeable flags. + */ +#define SF_SETTABLE 0xffff0000 /* mask of superuser changeable flags */ +#define SF_ARCHIVED 0x00010000 /* file is archived */ +#define SF_IMMUTABLE 0x00020000 /* file may not be changed */ +#define SF_APPEND 0x00040000 /* writes to file may only append */ + +#ifdef KERNEL +/* + * Shorthand abbreviations of above.
+ */ +#define APPEND (UF_APPEND | SF_APPEND) +#define IMMUTABLE (UF_IMMUTABLE | SF_IMMUTABLE) +#endif +#endif + +#ifndef KERNEL +#include <sys/cdefs.h> /* __BEGIN_DECLS, __END_DECLS, __P */ + +__BEGIN_DECLS +int chmod __P((const char *, mode_t)); +int fstat __P((int, struct stat *)); +int mkdir __P((const char *, mode_t)); +int mkfifo __P((const char *, mode_t)); +int stat __P((const char *, struct stat *)); +mode_t umask __P((mode_t)); +#ifndef _POSIX_SOURCE +int chflags __P((const char *, u_long)); +int fchflags __P((int, u_long)); +int fchmod __P((int, mode_t)); +int lstat __P((const char *, struct stat *)); +#endif +__END_DECLS +#endif +#endif /* !_SYS_STAT_H_ */ diff --git a/sys/sys/syscall.h b/sys/sys/syscall.h new file mode 100644 index 00000000000..8df8eb4fc51 --- /dev/null +++ b/sys/sys/syscall.h @@ -0,0 +1,186 @@ +/* + * System call numbers. + * + * DO NOT EDIT-- this file is automatically generated. + * created from @(#)syscalls.master 8.2 (Berkeley) 1/13/94 + */ + +#define SYS_syscall 0 +#define SYS_exit 1 +#define SYS_fork 2 +#define SYS_read 3 +#define SYS_write 4 +#define SYS_open 5 +#define SYS_close 6 +#define SYS_wait4 7 + /* 8 is old creat */ +#define SYS_link 9 +#define SYS_unlink 10 + /* 11 is obsolete execv */ +#define SYS_chdir 12 +#define SYS_fchdir 13 +#define SYS_mknod 14 +#define SYS_chmod 15 +#define SYS_chown 16 +#define SYS_break 17 +#define SYS_getfsstat 18 + /* 19 is old lseek */ +#define SYS_getpid 20 +#define SYS_mount 21 +#define SYS_unmount 22 +#define SYS_setuid 23 +#define SYS_getuid 24 +#define SYS_geteuid 25 +#define SYS_ptrace 26 +#define SYS_recvmsg 27 +#define SYS_sendmsg 28 +#define SYS_recvfrom 29 +#define SYS_accept 30 +#define SYS_getpeername 31 +#define SYS_getsockname 32 +#define SYS_access 33 +#define SYS_chflags 34 +#define SYS_fchflags 35 +#define SYS_sync 36 +#define SYS_kill 37 + /* 38 is old stat */ +#define SYS_getppid 39 + /* 40 is old lstat */ +#define SYS_dup 41 +#define SYS_pipe 42 +#define SYS_getegid 43 +#define SYS_profil 44 +#define SYS_ktrace 45
+#define SYS_sigaction 46 +#define SYS_getgid 47 +#define SYS_sigprocmask 48 +#define SYS_getlogin 49 +#define SYS_setlogin 50 +#define SYS_acct 51 +#define SYS_sigpending 52 +#define SYS_sigaltstack 53 +#define SYS_ioctl 54 +#define SYS_reboot 55 +#define SYS_revoke 56 +#define SYS_symlink 57 +#define SYS_readlink 58 +#define SYS_execve 59 +#define SYS_umask 60 +#define SYS_chroot 61 + /* 62 is old fstat */ + /* 63 is old getkerninfo */ + /* 64 is old getpagesize */ +#define SYS_msync 65 +#define SYS_vfork 66 + /* 67 is obsolete vread */ + /* 68 is obsolete vwrite */ +#define SYS_sbrk 69 +#define SYS_sstk 70 + /* 71 is old mmap */ +#define SYS_vadvise 72 +#define SYS_munmap 73 +#define SYS_mprotect 74 +#define SYS_madvise 75 + /* 76 is obsolete vhangup */ + /* 77 is obsolete vlimit */ +#define SYS_mincore 78 +#define SYS_getgroups 79 +#define SYS_setgroups 80 +#define SYS_getpgrp 81 +#define SYS_setpgid 82 +#define SYS_setitimer 83 + /* 84 is old wait */ +#define SYS_swapon 85 +#define SYS_getitimer 86 + /* 87 is old gethostname */ + /* 88 is old sethostname */ +#define SYS_getdtablesize 89 +#define SYS_dup2 90 +#define SYS_fcntl 92 +#define SYS_select 93 +#define SYS_fsync 95 +#define SYS_setpriority 96 +#define SYS_socket 97 +#define SYS_connect 98 + /* 99 is old accept */ +#define SYS_getpriority 100 + /* 101 is old send */ + /* 102 is old recv */ +#define SYS_sigreturn 103 +#define SYS_bind 104 +#define SYS_setsockopt 105 +#define SYS_listen 106 + /* 107 is obsolete vtimes */ + /* 108 is old sigvec */ + /* 109 is old sigblock */ + /* 110 is old sigsetmask */ +#define SYS_sigsuspend 111 + /* 112 is old sigstack */ + /* 113 is old recvmsg */ + /* 114 is old sendmsg */ +#define SYS_vtrace 115 + /* 115 is obsolete vtrace */ +#define SYS_gettimeofday 116 +#define SYS_getrusage 117 +#define SYS_getsockopt 118 +#define SYS_resuba 119 +#define SYS_readv 120 +#define SYS_writev 121 +#define SYS_settimeofday 122 +#define SYS_fchown 123 +#define SYS_fchmod 124 + /* 125 
is old recvfrom */ + /* 126 is old setreuid */ + /* 127 is old setregid */ +#define SYS_rename 128 + /* 129 is old truncate */ + /* 130 is old ftruncate */ +#define SYS_flock 131 +#define SYS_mkfifo 132 +#define SYS_sendto 133 +#define SYS_shutdown 134 +#define SYS_socketpair 135 +#define SYS_mkdir 136 +#define SYS_rmdir 137 +#define SYS_utimes 138 + /* 139 is obsolete 4.2 sigreturn */ +#define SYS_adjtime 140 + /* 141 is old getpeername */ + /* 142 is old gethostid */ + /* 143 is old sethostid */ + /* 144 is old getrlimit */ + /* 145 is old setrlimit */ + /* 146 is old killpg */ +#define SYS_setsid 147 +#define SYS_quotactl 148 + /* 149 is old quota */ + /* 150 is old getsockname */ +#define SYS_nfssvc 155 + /* 156 is old getdirentries */ +#define SYS_statfs 157 +#define SYS_fstatfs 158 +#define SYS_getfh 161 +#define SYS_shmsys 171 +#define SYS_setgid 181 +#define SYS_setegid 182 +#define SYS_seteuid 183 +#define SYS_lfs_bmapv 184 +#define SYS_lfs_markv 185 +#define SYS_lfs_segclean 186 +#define SYS_lfs_segwait 187 +#define SYS_stat 188 +#define SYS_fstat 189 +#define SYS_lstat 190 +#define SYS_pathconf 191 +#define SYS_fpathconf 192 +#define SYS_getrlimit 194 +#define SYS_setrlimit 195 +#define SYS_getdirentries 196 +#define SYS_mmap 197 +#define SYS___syscall 198 +#define SYS_lseek 199 +#define SYS_truncate 200 +#define SYS_ftruncate 201 +#define SYS___sysctl 202 +#define SYS_mlock 203 +#define SYS_munlock 204 diff --git a/sys/sys/sysctl.h b/sys/sys/sysctl.h new file mode 100644 index 00000000000..4ad83a74542 --- /dev/null +++ b/sys/sys/sysctl.h @@ -0,0 +1,344 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Karels at Berkeley Software Design, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)sysctl.h 8.1 (Berkeley) 6/2/93 + */ + +#ifndef _SYS_SYSCTL_H_ +#define _SYS_SYSCTL_H_ + +/* + * These are for the eproc structure defined below. + */ +#ifndef KERNEL +#include <sys/time.h> +#include <sys/ucred.h> +#include <sys/proc.h> +#include <vm/vm.h> +#endif + +/* + * Definitions for sysctl call. The sysctl call uses a hierarchical name + * for objects that can be examined or modified. The name is expressed as + * a sequence of integers.
Like a file path name, the meaning of each + * component depends on its place in the hierarchy. The top-level and kern + * identifiers are defined here, and other identifiers are defined in the + * respective subsystem header files. + */ + +#define CTL_MAXNAME 12 /* largest number of components supported */ + +/* + * Each subsystem defined by sysctl defines a list of variables + * for that subsystem. Each name is either a node with further + * levels defined below it, or it is a leaf of some particular + * type given below. Each sysctl level defines a set of name/type + * pairs to be used by sysctl(1) in manipulating the subsystem. + */ +struct ctlname { + char *ctl_name; /* subsystem name */ + int ctl_type; /* type of name */ +}; +#define CTLTYPE_NODE 1 /* name is a node */ +#define CTLTYPE_INT 2 /* name describes an integer */ +#define CTLTYPE_STRING 3 /* name describes a string */ +#define CTLTYPE_QUAD 4 /* name describes a 64-bit number */ +#define CTLTYPE_STRUCT 5 /* name describes a structure */ + +/* + * Top-level identifiers + */ +#define CTL_UNSPEC 0 /* unused */ +#define CTL_KERN 1 /* "high kernel": proc, limits */ +#define CTL_VM 2 /* virtual memory */ +#define CTL_FS 3 /* file system, mount type is next */ +#define CTL_NET 4 /* network, see socket.h */ +#define CTL_DEBUG 5 /* debugging parameters */ +#define CTL_HW 6 /* generic cpu/io */ +#define CTL_MACHDEP 7 /* machine dependent */ +#define CTL_USER 8 /* user-level */ +#define CTL_MAXID 9 /* number of valid top-level ids */ + +#define CTL_NAMES { \ + { 0, 0 }, \ + { "kern", CTLTYPE_NODE }, \ + { "vm", CTLTYPE_NODE }, \ + { "fs", CTLTYPE_NODE }, \ + { "net", CTLTYPE_NODE }, \ + { "debug", CTLTYPE_NODE }, \ + { "hw", CTLTYPE_NODE }, \ + { "machdep", CTLTYPE_NODE }, \ + { "user", CTLTYPE_NODE }, \ +} + +/* + * CTL_KERN identifiers + */ +#define KERN_OSTYPE 1 /* string: system version */ +#define KERN_OSRELEASE 2 /* string: system release */ +#define KERN_OSREV 3 /* int: system revision */ +#define 
KERN_VERSION 4 /* string: compile time info */ +#define KERN_MAXVNODES 5 /* int: max vnodes */ +#define KERN_MAXPROC 6 /* int: max processes */ +#define KERN_MAXFILES 7 /* int: max open files */ +#define KERN_ARGMAX 8 /* int: max arguments to exec */ +#define KERN_SECURELVL 9 /* int: system security level */ +#define KERN_HOSTNAME 10 /* string: hostname */ +#define KERN_HOSTID 11 /* int: host identifier */ +#define KERN_CLOCKRATE 12 /* struct: struct clockrate */ +#define KERN_VNODE 13 /* struct: vnode structures */ +#define KERN_PROC 14 /* struct: process entries */ +#define KERN_FILE 15 /* struct: file entries */ +#define KERN_PROF 16 /* node: kernel profiling info */ +#define KERN_POSIX1 17 /* int: POSIX.1 version */ +#define KERN_NGROUPS 18 /* int: # of supplemental group ids */ +#define KERN_JOB_CONTROL 19 /* int: is job control available */ +#define KERN_SAVED_IDS 20 /* int: saved set-user/group-ID */ +#define KERN_BOOTTIME 21 /* struct: time kernel was booted */ +#define KERN_MAXID 22 /* number of valid kern ids */ + +#define CTL_KERN_NAMES { \ + { 0, 0 }, \ + { "ostype", CTLTYPE_STRING }, \ + { "osrelease", CTLTYPE_STRING }, \ + { "osrevision", CTLTYPE_INT }, \ + { "version", CTLTYPE_STRING }, \ + { "maxvnodes", CTLTYPE_INT }, \ + { "maxproc", CTLTYPE_INT }, \ + { "maxfiles", CTLTYPE_INT }, \ + { "argmax", CTLTYPE_INT }, \ + { "securelevel", CTLTYPE_INT }, \ + { "hostname", CTLTYPE_STRING }, \ + { "hostid", CTLTYPE_INT }, \ + { "clockrate", CTLTYPE_STRUCT }, \ + { "vnode", CTLTYPE_STRUCT }, \ + { "proc", CTLTYPE_STRUCT }, \ + { "file", CTLTYPE_STRUCT }, \ + { "profiling", CTLTYPE_NODE }, \ + { "posix1version", CTLTYPE_INT }, \ + { "ngroups", CTLTYPE_INT }, \ + { "job_control", CTLTYPE_INT }, \ + { "saved_ids", CTLTYPE_INT }, \ + { "boottime", CTLTYPE_STRUCT }, \ +} + +/* + * KERN_PROC subtypes + */ +#define KERN_PROC_ALL 0 /* everything */ +#define KERN_PROC_PID 1 /* by process id */ +#define KERN_PROC_PGRP 2 /* by process group id */ +#define 
KERN_PROC_SESSION 3 /* by session of pid */ +#define KERN_PROC_TTY 4 /* by controlling tty */ +#define KERN_PROC_UID 5 /* by effective uid */ +#define KERN_PROC_RUID 6 /* by real uid */ + +/* + * KERN_PROC subtype ops return arrays of augmented proc structures: + */ +struct kinfo_proc { + struct proc kp_proc; /* proc structure */ + struct eproc { + struct proc *e_paddr; /* address of proc */ + struct session *e_sess; /* session pointer */ + struct pcred e_pcred; /* process credentials */ + struct ucred e_ucred; /* current credentials */ +#ifdef sparc + struct { + segsz_t vm_rssize; /* resident set size */ + segsz_t vm_tsize; /* text size */ + segsz_t vm_dsize; /* data size */ + segsz_t vm_ssize; /* stack size */ + } e_vm; +#else + struct vmspace e_vm; /* address space */ +#endif + pid_t e_ppid; /* parent process id */ + pid_t e_pgid; /* process group id */ + short e_jobc; /* job control counter */ + dev_t e_tdev; /* controlling tty dev */ + pid_t e_tpgid; /* tty process group id */ + struct session *e_tsess; /* tty session pointer */ +#define WMESGLEN 7 + char e_wmesg[WMESGLEN+1]; /* wchan message */ + segsz_t e_xsize; /* text size */ + short e_xrssize; /* text rss */ + short e_xccount; /* text references */ + short e_xswrss; + long e_flag; +#define EPROC_CTTY 0x01 /* controlling tty vnode active */ +#define EPROC_SLEADER 0x02 /* session leader */ + char e_login[MAXLOGNAME]; /* setlogin() name */ + long e_spare[4]; + } kp_eproc; +}; + +/* + * CTL_HW identifiers + */ +#define HW_MACHINE 1 /* string: machine class */ +#define HW_MODEL 2 /* string: specific machine model */ +#define HW_NCPU 3 /* int: number of cpus */ +#define HW_BYTEORDER 4 /* int: machine byte order */ +#define HW_PHYSMEM 5 /* int: total memory */ +#define HW_USERMEM 6 /* int: non-kernel memory */ +#define HW_PAGESIZE 7 /* int: software page size */ +#define HW_DISKNAMES 8 /* strings: disk drive names */ +#define HW_DISKSTATS 9 /* struct: diskstats[] */ +#define HW_MAXID 10 /* number of valid hw ids 
*/ + +#define CTL_HW_NAMES { \ + { 0, 0 }, \ + { "machine", CTLTYPE_STRING }, \ + { "model", CTLTYPE_STRING }, \ + { "ncpu", CTLTYPE_INT }, \ + { "byteorder", CTLTYPE_INT }, \ + { "physmem", CTLTYPE_INT }, \ + { "usermem", CTLTYPE_INT }, \ + { "pagesize", CTLTYPE_INT }, \ + { "disknames", CTLTYPE_STRUCT }, \ + { "diskstats", CTLTYPE_STRUCT }, \ +} + +/* + * CTL_USER definitions + */ +#define USER_CS_PATH 1 /* string: _CS_PATH */ +#define USER_BC_BASE_MAX 2 /* int: BC_BASE_MAX */ +#define USER_BC_DIM_MAX 3 /* int: BC_DIM_MAX */ +#define USER_BC_SCALE_MAX 4 /* int: BC_SCALE_MAX */ +#define USER_BC_STRING_MAX 5 /* int: BC_STRING_MAX */ +#define USER_COLL_WEIGHTS_MAX 6 /* int: COLL_WEIGHTS_MAX */ +#define USER_EXPR_NEST_MAX 7 /* int: EXPR_NEST_MAX */ +#define USER_LINE_MAX 8 /* int: LINE_MAX */ +#define USER_RE_DUP_MAX 9 /* int: RE_DUP_MAX */ +#define USER_POSIX2_VERSION 10 /* int: POSIX2_VERSION */ +#define USER_POSIX2_C_BIND 11 /* int: POSIX2_C_BIND */ +#define USER_POSIX2_C_DEV 12 /* int: POSIX2_C_DEV */ +#define USER_POSIX2_CHAR_TERM 13 /* int: POSIX2_CHAR_TERM */ +#define USER_POSIX2_FORT_DEV 14 /* int: POSIX2_FORT_DEV */ +#define USER_POSIX2_FORT_RUN 15 /* int: POSIX2_FORT_RUN */ +#define USER_POSIX2_LOCALEDEF 16 /* int: POSIX2_LOCALEDEF */ +#define USER_POSIX2_SW_DEV 17 /* int: POSIX2_SW_DEV */ +#define USER_POSIX2_UPE 18 /* int: POSIX2_UPE */ +#define USER_STREAM_MAX 19 /* int: POSIX2_STREAM_MAX */ +#define USER_TZNAME_MAX 20 /* int: POSIX2_TZNAME_MAX */ +#define USER_MAXID 21 /* number of valid user ids */ + +#define CTL_USER_NAMES { \ + { 0, 0 }, \ + { "cs_path", CTLTYPE_STRING }, \ + { "bc_base_max", CTLTYPE_INT }, \ + { "bc_dim_max", CTLTYPE_INT }, \ + { "bc_scale_max", CTLTYPE_INT }, \ + { "bc_string_max", CTLTYPE_INT }, \ + { "coll_weights_max", CTLTYPE_INT }, \ + { "expr_nest_max", CTLTYPE_INT }, \ + { "line_max", CTLTYPE_INT }, \ + { "re_dup_max", CTLTYPE_INT }, \ + { "posix2_version", CTLTYPE_INT }, \ + { "posix2_c_bind", CTLTYPE_INT }, \ + {
"posix2_c_dev", CTLTYPE_INT }, \ + { "posix2_char_term", CTLTYPE_INT }, \ + { "posix2_fort_dev", CTLTYPE_INT }, \ + { "posix2_fort_run", CTLTYPE_INT }, \ + { "posix2_localedef", CTLTYPE_INT }, \ + { "posix2_sw_dev", CTLTYPE_INT }, \ + { "posix2_upe", CTLTYPE_INT }, \ + { "stream_max", CTLTYPE_INT }, \ + { "tzname_max", CTLTYPE_INT }, \ +} + +/* + * CTL_DEBUG definitions + * + * Second level identifier specifies which debug variable. + * Third level identifier specifies which structure component. + */ +#define CTL_DEBUG_NAME 0 /* string: variable name */ +#define CTL_DEBUG_VALUE 1 /* int: variable value */ +#define CTL_DEBUG_MAXID 20 + +#ifdef KERNEL +#ifdef DEBUG +/* + * CTL_DEBUG variables. + * + * These are declared as separate variables so that they can be + * individually initialized at the location of their associated + * variable. The loader prevents multiple use by issuing errors + * if a variable is initialized in more than one place. They are + * aggregated into an array in debug_sysctl(), so that it can + * conveniently locate them when queried. If more debugging + * variables are added, they must also be declared here and also + * entered into the array. + */ +struct ctldebug { + char *debugname; /* name of debugging variable */ + int *debugvar; /* pointer to debugging variable */ +}; +extern struct ctldebug debug0, debug1, debug2, debug3, debug4; +extern struct ctldebug debug5, debug6, debug7, debug8, debug9; +extern struct ctldebug debug10, debug11, debug12, debug13, debug14; +extern struct ctldebug debug15, debug16, debug17, debug18, debug19; +#endif /* DEBUG */ + +/* + * Internal sysctl function calling convention: + * + * (*sysctlfn)(name, namelen, oldval, oldlenp, newval, newlen); + * + * The name parameter points at the next component of the name to be + * interpreted. The namelen parameter is the number of integers in + * the name.
+ */ +typedef int (sysctlfn) + __P((int *, u_int, void *, size_t *, void *, size_t, struct proc *)); + +int sysctl_int __P((void *, size_t *, void *, size_t, int *)); +int sysctl_rdint __P((void *, size_t *, void *, int)); +int sysctl_string __P((void *, size_t *, void *, size_t, char *, int)); +int sysctl_rdstring __P((void *, size_t *, void *, char *)); +int sysctl_rdstruct __P((void *, size_t *, void *, void *, int)); +void fill_eproc __P((struct proc *, struct eproc *)); + +#else /* !KERNEL */ +#include + +__BEGIN_DECLS +int sysctl __P((int *, u_int, void *, size_t *, void *, size_t)); +__END_DECLS +#endif /* KERNEL */ +#endif /* !_SYS_SYSCTL_H_ */ diff --git a/sys/sys/syslimits.h b/sys/sys/syslimits.h new file mode 100644 index 00000000000..550000c6503 --- /dev/null +++ b/sys/sys/syslimits.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)syslimits.h 8.1 (Berkeley) 6/2/93 + */ + +#define ARG_MAX 20480 /* max bytes for an exec function */ +#define CHILD_MAX 40 /* max simultaneous processes */ +#define LINK_MAX 32767 /* max file link count */ +#define MAX_CANON 255 /* max bytes in term canon input line */ +#define MAX_INPUT 255 /* max bytes in terminal input */ +#define NAME_MAX 255 /* max bytes in a file name */ +#define NGROUPS_MAX 16 /* max supplemental group id's */ +#define OPEN_MAX 64 /* max open files per process */ +#define PATH_MAX 1024 /* max bytes in pathname */ +#define PIPE_BUF 512 /* max bytes for atomic pipe writes */ + +#define BC_BASE_MAX 99 /* max ibase/obase values in bc(1) */ +#define BC_DIM_MAX 2048 /* max array elements in bc(1) */ +#define BC_SCALE_MAX 99 /* max scale value in bc(1) */ +#define BC_STRING_MAX 1000 /* max const string length in bc(1) */ +#define COLL_WEIGHTS_MAX 0 /* max weights for order keyword */ +#define EXPR_NEST_MAX 32 /* max expressions nested in expr(1) */ +#define LINE_MAX 2048 /* max bytes in an input line */ +#define RE_DUP_MAX 255 /* max RE's in interval notation */ diff --git a/sys/sys/syslog.h b/sys/sys/syslog.h new file mode 100644 index 00000000000..935db2d4484 --- 
/dev/null +++ b/sys/sys/syslog.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)syslog.h 8.1 (Berkeley) 6/2/93 + */ + +#define _PATH_LOG "/dev/log" + +/* + * priorities/facilities are encoded into a single 32-bit quantity, where the + * bottom 3 bits are the priority (0-7) and the top 29 bits are the facility + * (0-big number). Both the priorities and the facilities map roughly + * one-to-one to strings in the syslogd(8) source code. This mapping is + * included in this file. + * + * priorities (these are ordered) + */ +#define LOG_EMERG 0 /* system is unusable */ +#define LOG_ALERT 1 /* action must be taken immediately */ +#define LOG_CRIT 2 /* critical conditions */ +#define LOG_ERR 3 /* error conditions */ +#define LOG_WARNING 4 /* warning conditions */ +#define LOG_NOTICE 5 /* normal but significant condition */ +#define LOG_INFO 6 /* informational */ +#define LOG_DEBUG 7 /* debug-level messages */ + +#define LOG_PRIMASK 0x07 /* mask to extract priority part (internal) */ + /* extract priority */ +#define LOG_PRI(p) ((p) & LOG_PRIMASK) +#define LOG_MAKEPRI(fac, pri) (((fac) << 3) | (pri)) + +#ifdef SYSLOG_NAMES +#define INTERNAL_NOPRI 0x10 /* the "no priority" priority */ + /* mark "facility" */ +#define INTERNAL_MARK LOG_MAKEPRI(LOG_NFACILITIES, 0) +typedef struct _code { + char *c_name; + int c_val; +} CODE; + +CODE prioritynames[] = { + "alert", LOG_ALERT, + "crit", LOG_CRIT, + "debug", LOG_DEBUG, + "emerg", LOG_EMERG, + "err", LOG_ERR, + "error", LOG_ERR, /* DEPRECATED */ + "info", LOG_INFO, + "none", INTERNAL_NOPRI, /* INTERNAL */ + "notice", LOG_NOTICE, + "panic", LOG_EMERG, /* DEPRECATED */ + "warn", LOG_WARNING, /* DEPRECATED */ + "warning", LOG_WARNING, + NULL, -1, +}; +#endif + +/* facility codes */ +#define LOG_KERN (0<<3) /* kernel messages */ +#define LOG_USER (1<<3) /* random user-level messages */ +#define LOG_MAIL (2<<3) /* mail system */ +#define LOG_DAEMON (3<<3) /* system daemons */ +#define LOG_AUTH (4<<3) /* security/authorization messages */ +#define LOG_SYSLOG (5<<3) /* messages generated internally by
syslogd */ +#define LOG_LPR (6<<3) /* line printer subsystem */ +#define LOG_NEWS (7<<3) /* network news subsystem */ +#define LOG_UUCP (8<<3) /* UUCP subsystem */ +#define LOG_CRON (9<<3) /* clock daemon */ +#define LOG_AUTHPRIV (10<<3) /* security/authorization messages (private) */ +#define LOG_FTP (11<<3) /* ftp daemon */ + + /* other codes through 15 reserved for system use */ +#define LOG_LOCAL0 (16<<3) /* reserved for local use */ +#define LOG_LOCAL1 (17<<3) /* reserved for local use */ +#define LOG_LOCAL2 (18<<3) /* reserved for local use */ +#define LOG_LOCAL3 (19<<3) /* reserved for local use */ +#define LOG_LOCAL4 (20<<3) /* reserved for local use */ +#define LOG_LOCAL5 (21<<3) /* reserved for local use */ +#define LOG_LOCAL6 (22<<3) /* reserved for local use */ +#define LOG_LOCAL7 (23<<3) /* reserved for local use */ + +#define LOG_NFACILITIES 24 /* current number of facilities */ +#define LOG_FACMASK 0x03f8 /* mask to extract facility part */ + /* facility of pri */ +#define LOG_FAC(p) (((p) & LOG_FACMASK) >> 3) + +#ifdef SYSLOG_NAMES +CODE facilitynames[] = { + "auth", LOG_AUTH, + "authpriv", LOG_AUTHPRIV, + "cron", LOG_CRON, + "daemon", LOG_DAEMON, + "ftp", LOG_FTP, + "kern", LOG_KERN, + "lpr", LOG_LPR, + "mail", LOG_MAIL, + "mark", INTERNAL_MARK, /* INTERNAL */ + "news", LOG_NEWS, + "security", LOG_AUTH, /* DEPRECATED */ + "syslog", LOG_SYSLOG, + "user", LOG_USER, + "uucp", LOG_UUCP, + "local0", LOG_LOCAL0, + "local1", LOG_LOCAL1, + "local2", LOG_LOCAL2, + "local3", LOG_LOCAL3, + "local4", LOG_LOCAL4, + "local5", LOG_LOCAL5, + "local6", LOG_LOCAL6, + "local7", LOG_LOCAL7, + NULL, -1, +}; +#endif + +#ifdef KERNEL +#define LOG_PRINTF -1 /* pseudo-priority to indicate use of printf */ +#endif + +/* + * arguments to setlogmask. + */ +#define LOG_MASK(pri) (1 << (pri)) /* mask for one priority */ +#define LOG_UPTO(pri) ((1 << ((pri)+1)) - 1) /* all priorities through pri */ + +/* + * Option flags for openlog. + * + * LOG_ODELAY no longer does anything. 
+ * LOG_NDELAY is the inverse of what it used to be. + */ +#define LOG_PID 0x01 /* log the pid with each message */ +#define LOG_CONS 0x02 /* log on the console if errors in sending */ +#define LOG_ODELAY 0x04 /* delay open until first syslog() (default) */ +#define LOG_NDELAY 0x08 /* don't delay open */ +#define LOG_NOWAIT 0x10 /* don't wait for console forks: DEPRECATED */ +#define LOG_PERROR 0x20 /* log to stderr as well */ + +#ifndef KERNEL + +/* + * Don't use va_list in the vsyslog() prototype. Va_list is typedef'd in two + * places (<machine/varargs.h> and <machine/stdarg.h>), so if we include one + * of them here we may collide with the utility's includes. It's unreasonable + * for utilities to have to include one of them to include syslog.h, so we get + * _BSD_VA_LIST_ from <machine/ansi.h> and use it. + */ +#include <machine/ansi.h> +#include <sys/cdefs.h> + +__BEGIN_DECLS +void closelog __P((void)); +void openlog __P((const char *, int, int)); +int setlogmask __P((int)); +void syslog __P((int, const char *, ...)); +void vsyslog __P((int, const char *, _BSD_VA_LIST_)); +__END_DECLS + +#endif /* !KERNEL */ diff --git a/sys/sys/systm.h b/sys/sys/systm.h new file mode 100644 index 00000000000..91cb64bd5fa --- /dev/null +++ b/sys/sys/systm.h @@ -0,0 +1,165 @@ +/*- + * Copyright (c) 1982, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)systm.h 8.4 (Berkeley) 2/23/94 + */ + +/* + * The `securelevel' variable controls the security level of the system. + * It can only be decreased by process 1 (/sbin/init). + * + * Security levels are as follows: + * -1 permanently insecure mode - always run system in level 0 mode. + * 0 insecure mode - immutable and append-only flags may be turned off. + * All devices may be read or written subject to permission modes.
+ * 1 secure mode - immutable and append-only flags may not be changed; + * raw disks of mounted filesystems, /dev/mem, and /dev/kmem are + * read-only. + * 2 highly secure mode - same as (1) plus raw disks are always + * read-only whether mounted or not. This level precludes tampering + * with filesystems by unmounting them, but also inhibits running + * newfs while the system is secured. + * + * In normal operation, the system runs in level 0 mode while single user + * and in level 1 mode while multiuser. If level 2 mode is desired while + * running multiuser, it can be set in the multiuser startup script + * (/etc/rc.local) using sysctl(1). If it is desired to run the system + * in level 0 mode while multiuser, initialize the variable securelevel + * in /sys/kern/kern_sysctl.c to -1. Note that it is NOT initialized to + * zero as that would allow the vmunix binary to be patched to -1. + * Without initialization, securelevel loads in the BSS area which only + * comes into existence when the kernel is loaded and hence cannot be + * patched by a stalking hacker. 
+ */ +extern int securelevel; /* system security level */ +extern const char *panicstr; /* panic message */ +extern char version[]; /* system version */ +extern char copyright[]; /* system copyright */ + +extern int nblkdev; /* number of entries in bdevsw */ +extern int nchrdev; /* number of entries in cdevsw */ +extern int nswdev; /* number of swap devices */ +extern int nswap; /* size of swap space */ + +extern int selwait; /* select timeout address */ + +extern u_char curpriority; /* priority of current process */ + +extern int maxmem; /* max memory per process */ +extern int physmem; /* physical memory */ + +extern dev_t dumpdev; /* dump device */ +extern long dumplo; /* offset into dumpdev */ + +extern dev_t rootdev; /* root device */ +extern struct vnode *rootvp; /* vnode equivalent to above */ + +extern dev_t swapdev; /* swapping device */ +extern struct vnode *swapdev_vp;/* vnode equivalent to above */ + +extern struct sysent { /* system call table */ + int sy_narg; /* number of arguments */ + int (*sy_call)(); /* implementing function */ +} sysent[]; + +extern int boothowto; /* reboot flags, from console subsystem */ + +/* casts to keep lint happy */ +#define insque(q,p) _insque((caddr_t)q,(caddr_t)p) +#define remque(q) _remque((caddr_t)q) + +/* + * General function declarations. 
+ */ +int nullop __P((void)); +int enodev __P((void)); +int enoioctl __P((void)); +int enxio __P((void)); +int eopnotsupp __P((void)); +int seltrue __P((dev_t dev, int which, struct proc *p)); +void *hashinit __P((int count, int type, u_long *hashmask)); + +#ifdef __GNUC__ +volatile void panic __P((const char *, ...)); +#else +void panic __P((const char *, ...)); +#endif +void tablefull __P((const char *)); +void addlog __P((const char *, ...)); +void log __P((int, const char *, ...)); +void printf __P((const char *, ...)); +int sprintf __P((char *buf, const char *, ...)); +void ttyprintf __P((struct tty *, const char *, ...)); + +void bcopy __P((const void *from, void *to, u_int len)); +void ovbcopy __P((const void *from, void *to, u_int len)); +void bzero __P((void *buf, u_int len)); + +int copystr __P((void *kfaddr, void *kdaddr, u_int len, u_int *done)); +int copyinstr __P((void *udaddr, void *kaddr, u_int len, u_int *done)); +int copyoutstr __P((void *kaddr, void *udaddr, u_int len, u_int *done)); +int copyin __P((void *udaddr, void *kaddr, u_int len)); +int copyout __P((void *kaddr, void *udaddr, u_int len)); + +int fubyte __P((void *base)); +#ifdef notdef +int fuibyte __P((void *base)); +#endif +int subyte __P((void *base, int byte)); +int suibyte __P((void *base, int byte)); +int fuword __P((void *base)); +int fuiword __P((void *base)); +int suword __P((void *base, int word)); +int suiword __P((void *base, int word)); + +int hzto __P((struct timeval *tv)); +void timeout __P((void (*func)(void *), void *arg, int ticks)); +void untimeout __P((void (*func)(void *), void *arg)); +void realitexpire __P((void *)); + +struct clockframe; +void hardclock __P((struct clockframe *frame)); +void softclock __P((void)); +void statclock __P((struct clockframe *frame)); + +void initclocks __P((void)); + +void startprofclock __P((struct proc *)); +void stopprofclock __P((struct proc *)); +void setstatclockrate __P((int hzrate)); + +#include diff --git a/sys/sys/tablet.h 
b/sys/sys/tablet.h new file mode 100644 index 00000000000..cbb3f23d006 --- /dev/null +++ b/sys/sys/tablet.h @@ -0,0 +1,94 @@ +/*- + * Copyright (c) 1985, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tablet.h 8.3 (Berkeley) 1/4/94 + */ + +#ifndef _SYS_TABLET_H_ +#define _SYS_TABLET_H_ + +/* + * Tablet line discipline. + */ +#include + +/* + * Reads on the tablet return one of the following structures, depending on + * the underlying tablet type. The first two are defined such that a read of + * sizeof (gtcopos) on a non-gtco tablet will return meaningful info. The + * in-proximity bit is simulated where the tablet does not directly provide + * the information. + */ +struct tbpos { + int xpos, ypos; /* raw x-y coordinates */ + short status; /* buttons/pen down */ +#define TBINPROX 0100000 /* pen in proximity of tablet */ + short scount; /* sample count */ +}; + +struct gtcopos { + int xpos, ypos; /* raw x-y coordinates */ + short status; /* as above */ + short scount; /* sample count */ + short xtilt, ytilt; /* raw tilt */ + short pressure; + short pad; /* pad to longword boundary */ +}; + +struct polpos { + short p_x, p_y, p_z; /* raw 3-space coordinates */ + short p_azi, p_pit, p_rol; /* azimuth, pitch, and roll */ + short p_stat; /* status, as above */ + char p_key; /* calculator input keyboard */ +}; + +#define BIOSMODE _IOW('b', 1, int) /* set mode bit(s) */ +#define BIOGMODE _IOR('b', 2, int) /* get mode bit(s) */ +#define TBMODE 0xfff0 /* mode bits: */ +#define TBPOINT 0x0010 /* single point */ +#define TBRUN 0x0000 /* runs contin. 
*/ +#define TBSTOP 0x0020 /* shut-up */ +#define TBGO 0x0000 /* ~TBSTOP */ +#define TBTYPE 0x000f /* tablet type: */ +#define TBUNUSED 0x0 +#define TBHITACHI 0x1 /* hitachi tablet */ +#define TBTIGER 0x2 /* hitachi tiger */ +#define TBGTCO 0x3 /* gtco */ +#define TBPOL 0x4 /* polhemus 3space */ +#define TBHDG 0x5 /* hdg-1111b, low res */ +#define TBHDGHIRES 0x6 /* hdg-1111b, high res */ +#define TBDIGI 0x7 /* gtco digi-pad, low res */ +#define TBDIGIHIRES 0x8 /* gtco digi-pad, high res */ +#define BIOSTYPE _IOW('b', 3, int) /* set tablet type */ +#define BIOGTYPE _IOR('b', 4, int) /* get tablet type*/ + +#endif /* !_SYS_TABLET_H_ */ diff --git a/sys/sys/termios.h b/sys/sys/termios.h new file mode 100644 index 00000000000..4ad04a10fb1 --- /dev/null +++ b/sys/sys/termios.h @@ -0,0 +1,278 @@ +/* + * Copyright (c) 1988, 1989, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)termios.h 8.3 (Berkeley) 3/28/94 + */ + +#ifndef _SYS_TERMIOS_H_ +#define _SYS_TERMIOS_H_ + +/* + * Special Control Characters + * + * Index into c_cc[] character array. + * + * Name Subscript Enabled by + */ +#define VEOF 0 /* ICANON */ +#define VEOL 1 /* ICANON */ +#ifndef _POSIX_SOURCE +#define VEOL2 2 /* ICANON */ +#endif +#define VERASE 3 /* ICANON */ +#ifndef _POSIX_SOURCE +#define VWERASE 4 /* ICANON */ +#endif +#define VKILL 5 /* ICANON */ +#ifndef _POSIX_SOURCE +#define VREPRINT 6 /* ICANON */ +#endif +/* 7 spare 1 */ +#define VINTR 8 /* ISIG */ +#define VQUIT 9 /* ISIG */ +#define VSUSP 10 /* ISIG */ +#ifndef _POSIX_SOURCE +#define VDSUSP 11 /* ISIG */ +#endif +#define VSTART 12 /* IXON, IXOFF */ +#define VSTOP 13 /* IXON, IXOFF */ +#ifndef _POSIX_SOURCE +#define VLNEXT 14 /* IEXTEN */ +#define VDISCARD 15 /* IEXTEN */ +#endif +#define VMIN 16 /* !ICANON */ +#define VTIME 17 /* !ICANON */ +#ifndef _POSIX_SOURCE +#define VSTATUS 18 /* ICANON */ +/* 19 spare 2 */ +#endif +#define NCCS 20 + +#define _POSIX_VDISABLE ((unsigned char)'\377') + +#ifndef _POSIX_SOURCE +#define CCEQ(val, c) (c == val ? 
val != _POSIX_VDISABLE : 0) +#endif + +/* + * Input flags - software input processing + */ +#define IGNBRK 0x00000001 /* ignore BREAK condition */ +#define BRKINT 0x00000002 /* map BREAK to SIGINTR */ +#define IGNPAR 0x00000004 /* ignore (discard) parity errors */ +#define PARMRK 0x00000008 /* mark parity and framing errors */ +#define INPCK 0x00000010 /* enable checking of parity errors */ +#define ISTRIP 0x00000020 /* strip 8th bit off chars */ +#define INLCR 0x00000040 /* map NL into CR */ +#define IGNCR 0x00000080 /* ignore CR */ +#define ICRNL 0x00000100 /* map CR to NL (ala CRMOD) */ +#define IXON 0x00000200 /* enable output flow control */ +#define IXOFF 0x00000400 /* enable input flow control */ +#ifndef _POSIX_SOURCE +#define IXANY 0x00000800 /* any char will restart after stop */ +#define IMAXBEL 0x00002000 /* ring bell on input queue full */ +#endif /*_POSIX_SOURCE */ + +/* + * Output flags - software output processing + */ +#define OPOST 0x00000001 /* enable following output processing */ +#ifndef _POSIX_SOURCE +#define ONLCR 0x00000002 /* map NL to CR-NL (ala CRMOD) */ +#define OXTABS 0x00000004 /* expand tabs to spaces */ +#define ONOEOT 0x00000008 /* discard EOT's (^D) on output) */ +#endif /*_POSIX_SOURCE */ + +/* + * Control flags - hardware control of terminal + */ +#ifndef _POSIX_SOURCE +#define CIGNORE 0x00000001 /* ignore control flags */ +#endif +#define CSIZE 0x00000300 /* character size mask */ +#define CS5 0x00000000 /* 5 bits (pseudo) */ +#define CS6 0x00000100 /* 6 bits */ +#define CS7 0x00000200 /* 7 bits */ +#define CS8 0x00000300 /* 8 bits */ +#define CSTOPB 0x00000400 /* send 2 stop bits */ +#define CREAD 0x00000800 /* enable receiver */ +#define PARENB 0x00001000 /* parity enable */ +#define PARODD 0x00002000 /* odd parity, else even */ +#define HUPCL 0x00004000 /* hang up on last close */ +#define CLOCAL 0x00008000 /* ignore modem status lines */ +#ifndef _POSIX_SOURCE +#define CCTS_OFLOW 0x00010000 /* CTS flow control of output */ 
+#define CRTSCTS CCTS_OFLOW /* ??? */ +#define CRTS_IFLOW 0x00020000 /* RTS flow control of input */ +#define MDMBUF 0x00100000 /* flow control output via Carrier */ +#endif + + +/* + * "Local" flags - dumping ground for other state + * + * Warning: some flags in this structure begin with + * the letter "I" and look like they belong in the + * input flag. + */ + +#ifndef _POSIX_SOURCE +#define ECHOKE 0x00000001 /* visual erase for line kill */ +#endif /*_POSIX_SOURCE */ +#define ECHOE 0x00000002 /* visually erase chars */ +#define ECHOK 0x00000004 /* echo NL after line kill */ +#define ECHO 0x00000008 /* enable echoing */ +#define ECHONL 0x00000010 /* echo NL even if ECHO is off */ +#ifndef _POSIX_SOURCE +#define ECHOPRT 0x00000020 /* visual erase mode for hardcopy */ +#define ECHOCTL 0x00000040 /* echo control chars as ^(Char) */ +#endif /*_POSIX_SOURCE */ +#define ISIG 0x00000080 /* enable signals INTR, QUIT, [D]SUSP */ +#define ICANON 0x00000100 /* canonicalize input lines */ +#ifndef _POSIX_SOURCE +#define ALTWERASE 0x00000200 /* use alternate WERASE algorithm */ +#endif /*_POSIX_SOURCE */ +#define IEXTEN 0x00000400 /* enable DISCARD and LNEXT */ +#define EXTPROC 0x00000800 /* external processing */ +#define TOSTOP 0x00400000 /* stop background jobs from output */ +#ifndef _POSIX_SOURCE +#define FLUSHO 0x00800000 /* output being flushed (state) */ +#define NOKERNINFO 0x02000000 /* no kernel output from VSTATUS */ +#define PENDIN 0x20000000 /* XXX retype pending input (state) */ +#endif /*_POSIX_SOURCE */ +#define NOFLSH 0x80000000 /* don't flush after interrupt */ + +typedef unsigned long tcflag_t; +typedef unsigned char cc_t; +typedef long speed_t; + +struct termios { + tcflag_t c_iflag; /* input flags */ + tcflag_t c_oflag; /* output flags */ + tcflag_t c_cflag; /* control flags */ + tcflag_t c_lflag; /* local flags */ + cc_t c_cc[NCCS]; /* control chars */ + long c_ispeed; /* input speed */ + long c_ospeed; /* output speed */ +}; + +/* + * Commands passed to 
tcsetattr() for setting the termios structure. + */ +#define TCSANOW 0 /* make change immediate */ +#define TCSADRAIN 1 /* drain output, then change */ +#define TCSAFLUSH 2 /* drain output, flush input */ +#ifndef _POSIX_SOURCE +#define TCSASOFT 0x10 /* flag - don't alter h.w. state */ +#endif + +/* + * Standard speeds + */ +#define B0 0 +#define B50 50 +#define B75 75 +#define B110 110 +#define B134 134 +#define B150 150 +#define B200 200 +#define B300 300 +#define B600 600 +#define B1200 1200 +#define B1800 1800 +#define B2400 2400 +#define B4800 4800 +#define B9600 9600 +#define B19200 19200 +#define B38400 38400 +#ifndef _POSIX_SOURCE +#define B7200 7200 +#define B14400 14400 +#define B28800 28800 +#define B57600 57600 +#define B76800 76800 +#define B115200 115200 +#define B230400 230400 +#define EXTA 19200 +#define EXTB 38400 +#endif /* !_POSIX_SOURCE */ + +#ifndef KERNEL + +#define TCIFLUSH 1 +#define TCOFLUSH 2 +#define TCIOFLUSH 3 +#define TCOOFF 1 +#define TCOON 2 +#define TCIOFF 3 +#define TCION 4 + +#include + +__BEGIN_DECLS +speed_t cfgetispeed __P((const struct termios *)); +speed_t cfgetospeed __P((const struct termios *)); +int cfsetispeed __P((struct termios *, speed_t)); +int cfsetospeed __P((struct termios *, speed_t)); +int tcgetattr __P((int, struct termios *)); +int tcsetattr __P((int, int, const struct termios *)); +int tcdrain __P((int)); +int tcflow __P((int, int)); +int tcflush __P((int, int)); +int tcsendbreak __P((int, int)); + +#ifndef _POSIX_SOURCE +void cfmakeraw __P((struct termios *)); +int cfsetspeed __P((struct termios *, speed_t)); +#endif /* !_POSIX_SOURCE */ +__END_DECLS + +#endif /* !KERNEL */ + +#ifndef _POSIX_SOURCE + +/* + * Include tty ioctl's that aren't just for backwards compatibility + * with the old tty driver. These ioctl definitions were previously + * in . + */ +#include +#endif + +/* + * END OF PROTECTED INCLUDE. 
+ */ +#endif /* !_SYS_TERMIOS_H_ */ + +#ifndef _POSIX_SOURCE +#include +#endif diff --git a/sys/sys/time.h b/sys/sys/time.h new file mode 100644 index 00000000000..53227712a3b --- /dev/null +++ b/sys/sys/time.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)time.h 8.1 (Berkeley) 6/2/93 + */ + +#ifndef _SYS_TIME_H_ +#define _SYS_TIME_H_ + +/* + * Structure returned by gettimeofday(2) system call, + * and used in other calls. + */ +struct timeval { + long tv_sec; /* seconds */ + long tv_usec; /* and microseconds */ +}; + +/* + * Structure defined by POSIX.4 to be like a timeval. + */ +struct timespec { + long ts_sec; /* seconds */ + long ts_nsec; /* and nanoseconds */ +}; + +#define TIMEVAL_TO_TIMESPEC(tv, ts) { \ + (ts)->ts_sec = (tv)->tv_sec; \ + (ts)->ts_nsec = (tv)->tv_usec * 1000; \ +} +#define TIMESPEC_TO_TIMEVAL(tv, ts) { \ + (tv)->tv_sec = (ts)->ts_sec; \ + (tv)->tv_usec = (ts)->ts_nsec / 1000; \ +} + +struct timezone { + int tz_minuteswest; /* minutes west of Greenwich */ + int tz_dsttime; /* type of dst correction */ +}; +#define DST_NONE 0 /* not on dst */ +#define DST_USA 1 /* USA style dst */ +#define DST_AUST 2 /* Australian style dst */ +#define DST_WET 3 /* Western European dst */ +#define DST_MET 4 /* Middle European dst */ +#define DST_EET 5 /* Eastern European dst */ +#define DST_CAN 6 /* Canada */ + +/* Operations on timevals. */ +#define timerclear(tvp) (tvp)->tv_sec = (tvp)->tv_usec = 0 +#define timerisset(tvp) ((tvp)->tv_sec || (tvp)->tv_usec) +#define timercmp(tvp, uvp, cmp) \ + (((tvp)->tv_sec == (uvp)->tv_sec) ? 
\ + ((tvp)->tv_usec cmp (uvp)->tv_usec) : \ + ((tvp)->tv_sec cmp (uvp)->tv_sec)) + +/* + * Names of the interval timers, and structure + * defining a timer setting. + */ +#define ITIMER_REAL 0 +#define ITIMER_VIRTUAL 1 +#define ITIMER_PROF 2 + +struct itimerval { + struct timeval it_interval; /* timer interval */ + struct timeval it_value; /* current value */ +}; + +/* + * Getkerninfo clock information structure + */ +struct clockinfo { + int hz; /* clock frequency */ + int tick; /* micro-seconds per hz tick */ + int stathz; /* statistics clock frequency */ + int profhz; /* profiling clock frequency */ +}; + +#ifndef KERNEL +#include <time.h> + +#ifndef _POSIX_SOURCE +#include <sys/cdefs.h> + +__BEGIN_DECLS +int adjtime __P((const struct timeval *, struct timeval *)); +int getitimer __P((int, struct itimerval *)); +int gettimeofday __P((struct timeval *, struct timezone *)); +int setitimer __P((int, const struct itimerval *, struct itimerval *)); +int settimeofday __P((const struct timeval *, const struct timezone *)); +int utimes __P((const char *, const struct timeval *)); +__END_DECLS +#endif /* !POSIX */ + +#endif /* !KERNEL */ + +#endif /* !_SYS_TIME_H_ */ diff --git a/sys/sys/timeb.h b/sys/sys/timeb.h new file mode 100644 index 00000000000..2ab010514b6 --- /dev/null +++ b/sys/sys/timeb.h @@ -0,0 +1,47 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)timeb.h 8.2 (Berkeley) 1/21/94 + */ + +/* The ftime(2) system call structure -- deprecated. 
*/ +struct timeb { + time_t time; /* seconds since the Epoch */ + unsigned short millitm; /* + milliseconds since the Epoch */ + short timezone; /* minutes west of CUT */ + short dstflag; /* DST == non-zero */ +}; diff --git a/sys/sys/times.h b/sys/sys/times.h new file mode 100644 index 00000000000..23a15008291 --- /dev/null +++ b/sys/sys/times.h @@ -0,0 +1,65 @@ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)times.h 8.4 (Berkeley) 1/21/94 + */ + +#ifndef _SYS_TIMES_H_ +#define _SYS_TIMES_H_ + +#include <machine/ansi.h> + +#ifdef _BSD_CLOCK_T_ +typedef _BSD_CLOCK_T_ clock_t; +#undef _BSD_CLOCK_T_ +#endif + +struct tms { + clock_t tms_utime; /* User CPU time */ + clock_t tms_stime; /* System CPU time */ + clock_t tms_cutime; /* User CPU time of terminated child procs */ + clock_t tms_cstime; /* System CPU time of terminated child procs */ +}; + +#ifndef KERNEL +#include <sys/cdefs.h> + +__BEGIN_DECLS +clock_t times __P((struct tms *)); +__END_DECLS +#endif +#endif /* !_SYS_TIMES_H_ */ diff --git a/sys/sys/timetc.h b/sys/sys/timetc.h new file mode 100644 index 00000000000..53227712a3b --- /dev/null +++ b/sys/sys/timetc.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)time.h 8.1 (Berkeley) 6/2/93 + */ + +#ifndef _SYS_TIME_H_ +#define _SYS_TIME_H_ + +/* + * Structure returned by gettimeofday(2) system call, + * and used in other calls. + */ +struct timeval { + long tv_sec; /* seconds */ + long tv_usec; /* and microseconds */ +}; + +/* + * Structure defined by POSIX.4 to be like a timeval. 
+ */ +struct timespec { + long ts_sec; /* seconds */ + long ts_nsec; /* and nanoseconds */ +}; + +#define TIMEVAL_TO_TIMESPEC(tv, ts) { \ + (ts)->ts_sec = (tv)->tv_sec; \ + (ts)->ts_nsec = (tv)->tv_usec * 1000; \ +} +#define TIMESPEC_TO_TIMEVAL(tv, ts) { \ + (tv)->tv_sec = (ts)->ts_sec; \ + (tv)->tv_usec = (ts)->ts_nsec / 1000; \ +} + +struct timezone { + int tz_minuteswest; /* minutes west of Greenwich */ + int tz_dsttime; /* type of dst correction */ +}; +#define DST_NONE 0 /* not on dst */ +#define DST_USA 1 /* USA style dst */ +#define DST_AUST 2 /* Australian style dst */ +#define DST_WET 3 /* Western European dst */ +#define DST_MET 4 /* Middle European dst */ +#define DST_EET 5 /* Eastern European dst */ +#define DST_CAN 6 /* Canada */ + +/* Operations on timevals. */ +#define timerclear(tvp) (tvp)->tv_sec = (tvp)->tv_usec = 0 +#define timerisset(tvp) ((tvp)->tv_sec || (tvp)->tv_usec) +#define timercmp(tvp, uvp, cmp) \ + (((tvp)->tv_sec == (uvp)->tv_sec) ? \ + ((tvp)->tv_usec cmp (uvp)->tv_usec) : \ + ((tvp)->tv_sec cmp (uvp)->tv_sec)) + +/* + * Names of the interval timers, and structure + * defining a timer setting. 
+ */ +#define ITIMER_REAL 0 +#define ITIMER_VIRTUAL 1 +#define ITIMER_PROF 2 + +struct itimerval { + struct timeval it_interval; /* timer interval */ + struct timeval it_value; /* current value */ +}; + +/* + * Getkerninfo clock information structure + */ +struct clockinfo { + int hz; /* clock frequency */ + int tick; /* micro-seconds per hz tick */ + int stathz; /* statistics clock frequency */ + int profhz; /* profiling clock frequency */ +}; + +#ifndef KERNEL +#include <time.h> + +#ifndef _POSIX_SOURCE +#include <sys/cdefs.h> + +__BEGIN_DECLS +int adjtime __P((const struct timeval *, struct timeval *)); +int getitimer __P((int, struct itimerval *)); +int gettimeofday __P((struct timeval *, struct timezone *)); +int setitimer __P((int, const struct itimerval *, struct itimerval *)); +int settimeofday __P((const struct timeval *, const struct timezone *)); +int utimes __P((const char *, const struct timeval *)); +__END_DECLS +#endif /* !POSIX */ + +#endif /* !KERNEL */ + +#endif /* !_SYS_TIME_H_ */ diff --git a/sys/sys/tprintf.h b/sys/sys/tprintf.h new file mode 100644 index 00000000000..5b83aaec029 --- /dev/null +++ b/sys/sys/tprintf.h @@ -0,0 +1,41 @@ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tprintf.h 8.1 (Berkeley) 6/2/93 + */ + +typedef struct session *tpr_t; + +tpr_t tprintf_open __P((struct proc *)); +void tprintf_close __P((tpr_t)); + +void tprintf __P((tpr_t, const char *fmt, ...)); diff --git a/sys/sys/trace.h b/sys/sys/trace.h new file mode 100644 index 00000000000..d401f1459d7 --- /dev/null +++ b/sys/sys/trace.h @@ -0,0 +1,116 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)trace.h 8.1 (Berkeley) 6/2/93 + */ + +/* + * File system buffer tracing points; all trace <pack(dev, size), bn> + */ +#define TR_BREADHIT 0 /* buffer read found in cache */ +#define TR_BREADMISS 1 /* buffer read not in cache */ +#define TR_BWRITE 2 /* buffer written */ +#define TR_BREADHITRA 3 /* buffer read-ahead found in cache */ +#define TR_BREADMISSRA 4 /* buffer read-ahead not in cache */ +#define TR_XFODMISS 5 /* exe fod read */ +#define TR_XFODHIT 6 /* exe fod read */ +#define TR_BRELSE 7 /* brelse */ +#define TR_BREALLOC 8 /* expand/contract a buffer */ + +/* + * Memory allocator trace points; all trace the amount of memory involved + */ +#define TR_MALL 10 /* memory allocated */ + +/* + * Paging trace points: all are <vaddr, pid> + */ +#define TR_INTRANS 20 /* page intransit block */ +#define TR_EINTRANS 21 /* page intransit wait done */ +#define TR_FRECLAIM 22 /* reclaim from free list */ +#define TR_RECLAIM 23 /* reclaim from loop */ +#define TR_XSFREC 24 /* reclaim from free list instead of drum */ +#define TR_XIFREC 25 /* reclaim from free list instead of fsys */ +#define TR_WAITMEM 26 /* wait for memory in pagein */ +#define TR_EWAITMEM 27 /* end memory wait in pagein */ +#define TR_ZFOD 28 /* zfod page fault */ +#define TR_EXFOD 29 /* exec fod page fault */ +#define TR_VRFOD 30 /* vread fod page fault */ +#define TR_CACHEFOD 31 /* fod in file system cache */ +#define TR_SWAPIN 32 /* drum page fault */ +#define TR_PGINDONE 33 /* page in done */ +#define TR_SWAPIO 34 /* swap i/o request arrives */ + +/* + * System call trace points. + */ +#define TR_VADVISE 40 /* vadvise occurred with <arg, pid> */ + +/* + * Miscellaneous + */ +#define TR_STAMP 45 /* user said vtrace(VTR_STAMP, value); */ + +/* + * This defines the size of the trace flags array. + */ +#define TR_NFLAGS 100 /* generous */ + +#define TRCSIZ 4096 + +/* + * Specifications of the vtrace() system call, which takes one argument.
+ */ +#define VTRACE 64+51 + +#define VTR_DISABLE 0 /* set a trace flag to 0 */ +#define VTR_ENABLE 1 /* set a trace flag to 1 */ +#define VTR_VALUE 2 /* return value of a trace flag */ +#define VTR_UALARM 3 /* set alarm to go off (sig 16) */ + /* in specified number of hz */ +#define VTR_STAMP 4 /* user specified stamp */ + +#ifdef KERNEL +#ifdef TRACE +struct proc *traceproc; +int tracewhich, tracebuf[TRCSIZ]; +u_int tracex; +char traceflags[TR_NFLAGS]; +#define pack(v,b) (((v)->v_mount->mnt_stat.f_fsid.val[0])<<16)|(b) +#define trace(a,b,c) { \ + if (traceflags[a]) \ + trace1(a,b,c); \ +} +#else +#define trace(a,b,c) +#endif +#endif diff --git a/sys/sys/tty.h b/sys/sys/tty.h new file mode 100644 index 00000000000..4a89b0382ad --- /dev/null +++ b/sys/sys/tty.h @@ -0,0 +1,217 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty.h 8.6 (Berkeley) 1/21/94 + */ + +#include <sys/termios.h> +#include <sys/select.h> /* For struct selinfo. */ + +/* + * Clists are character lists, which is a variable length linked list + * of cblocks, with a count of the number of characters in the list. + */ +struct clist { + int c_cc; /* Number of characters in the clist. */ + char *c_cf; /* Pointer to the first cblock. */ + char *c_cl; /* Pointer to the last cblock. */ +}; + +/* + * Per-tty structure. + * + * Should be split in two, into device and tty drivers. + * Glue could be masks of what to echo and circular buffer + * (low, high, timeout). + */ +struct tty { + struct clist t_rawq; /* Device raw input queue. */ + long t_rawcc; /* Raw input queue statistics. */ + struct clist t_canq; /* Device canonical queue. */ + long t_cancc; /* Canonical queue statistics. */ + struct clist t_outq; /* Device output queue. */ + long t_outcc; /* Output queue statistics. */ + char t_line; /* Interface to device drivers. */ + dev_t t_dev; /* Device.
*/ + int t_state; /* Device and driver (TS*) state. */ + int t_flags; /* Tty flags. */ + struct pgrp *t_pgrp; /* Foreground process group. */ + struct session *t_session; /* Enclosing session. */ + struct selinfo t_rsel; /* Tty read/oob select. */ + struct selinfo t_wsel; /* Tty write select. */ + struct termios t_termios; /* Termios state. */ + struct winsize t_winsize; /* Window size. */ + /* Start output. */ + void (*t_oproc) __P((struct tty *)); + /* Stop output. */ + void (*t_stop) __P((struct tty *, int)); + /* Set hardware state. */ + int (*t_param) __P((struct tty *, struct termios *)); + void *t_sc; /* XXX: net/if_sl.c:sl_softc. */ + short t_column; /* Tty output column. */ + short t_rocount, t_rocol; /* Tty. */ + short t_hiwat; /* High water mark. */ + short t_lowat; /* Low water mark. */ + short t_gen; /* Generation number. */ +}; + +#define t_cc t_termios.c_cc +#define t_cflag t_termios.c_cflag +#define t_iflag t_termios.c_iflag +#define t_ispeed t_termios.c_ispeed +#define t_lflag t_termios.c_lflag +#define t_min t_termios.c_min +#define t_oflag t_termios.c_oflag +#define t_ospeed t_termios.c_ospeed +#define t_time t_termios.c_time + +#define TTIPRI 25 /* Sleep priority for tty reads. */ +#define TTOPRI 26 /* Sleep priority for tty writes. */ + +#define TTMASK 15 +#define OBUFSIZ 100 +#define TTYHOG 1024 + +#ifdef KERNEL +#define TTMAXHIWAT roundup(2048, CBSIZE) +#define TTMINHIWAT roundup(100, CBSIZE) +#define TTMAXLOWAT 256 +#define TTMINLOWAT 32 +#endif + +/* These flags are kept in t_state. */ +#define TS_ASLEEP 0x00001 /* Process waiting for tty. */ +#define TS_ASYNC 0x00002 /* Tty in async I/O mode. */ +#define TS_BUSY 0x00004 /* Draining output. */ +#define TS_CARR_ON 0x00008 /* Carrier is present. */ +#define TS_FLUSH 0x00010 /* Outq has been flushed during DMA. */ +#define TS_ISOPEN 0x00020 /* Open has completed. */ +#define TS_TBLOCK 0x00040 /* Further input blocked. */ +#define TS_TIMEOUT 0x00080 /* Wait for output char processing. 
*/ +#define TS_TTSTOP 0x00100 /* Output paused. */ +#define TS_WOPEN 0x00200 /* Open in progress. */ +#define TS_XCLUDE 0x00400 /* Tty requires exclusivity. */ + +/* State for intra-line fancy editing work. */ +#define TS_BKSL 0x00800 /* State for lowercase \ work. */ +#define TS_CNTTB 0x01000 /* Counting tab width, ignore FLUSHO. */ +#define TS_ERASE 0x02000 /* Within a \.../ for PRTRUB. */ +#define TS_LNCH 0x04000 /* Next character is literal. */ +#define TS_TYPEN 0x08000 /* Retyping suspended input (PENDIN). */ +#define TS_LOCAL (TS_BKSL | TS_CNTTB | TS_ERASE | TS_LNCH | TS_TYPEN) + +/* Character type information. */ +#define ORDINARY 0 +#define CONTROL 1 +#define BACKSPACE 2 +#define NEWLINE 3 +#define TAB 4 +#define VTAB 5 +#define RETURN 6 + +struct speedtab { + int sp_speed; /* Speed. */ + int sp_code; /* Code. */ +}; + +/* Modem control commands (driver). */ +#define DMSET 0 +#define DMBIS 1 +#define DMBIC 2 +#define DMGET 3 + +/* Flags on a character passed to ttyinput. */ +#define TTY_CHARMASK 0x000000ff /* Character mask */ +#define TTY_QUOTE 0x00000100 /* Character quoted */ +#define TTY_ERRORMASK 0xff000000 /* Error mask */ +#define TTY_FE 0x01000000 /* Framing error or BREAK condition */ +#define TTY_PE 0x02000000 /* Parity error */ + +/* Is tp controlling terminal for p? */ +#define isctty(p, tp) \ + ((p)->p_session == (tp)->t_session && (p)->p_flag & P_CONTROLT) + +/* Is p in background of tp? */ +#define isbackground(p, tp) \ + (isctty((p), (tp)) && (p)->p_pgrp != (tp)->t_pgrp) + +#ifdef KERNEL +extern struct ttychars ttydefaults; + +/* Symbolic sleep message strings. 
*/ +extern char ttyin[], ttyout[], ttopen[], ttclos[], ttybg[], ttybuf[]; + +int b_to_q __P((char *cp, int cc, struct clist *q)); +void catq __P((struct clist *from, struct clist *to)); +void clist_init __P((void)); +int getc __P((struct clist *q)); +void ndflush __P((struct clist *q, int cc)); +int ndqb __P((struct clist *q, int flag)); +char *nextc __P((struct clist *q, char *cp, int *c)); +int putc __P((int c, struct clist *q)); +int q_to_b __P((struct clist *q, char *cp, int cc)); +int unputc __P((struct clist *q)); + +int nullmodem __P((struct tty *tp, int flag)); +int tputchar __P((int c, struct tty *tp)); +int ttioctl __P((struct tty *tp, int com, void *data, int flag)); +int ttread __P((struct tty *tp, struct uio *uio, int flag)); +void ttrstrt __P((void *tp)); +int ttselect __P((dev_t device, int rw, struct proc *p)); +void ttsetwater __P((struct tty *tp)); +int ttspeedtab __P((int speed, struct speedtab *table)); +int ttstart __P((struct tty *tp)); +void ttwakeup __P((struct tty *tp)); +int ttwrite __P((struct tty *tp, struct uio *uio, int flag)); +void ttychars __P((struct tty *tp)); +int ttycheckoutq __P((struct tty *tp, int wait)); +int ttyclose __P((struct tty *tp)); +void ttyflush __P((struct tty *tp, int rw)); +void ttyinfo __P((struct tty *tp)); +int ttyinput __P((int c, struct tty *tp)); +int ttylclose __P((struct tty *tp, int flag)); +int ttymodem __P((struct tty *tp, int flag)); +int ttyopen __P((dev_t device, struct tty *tp)); +int ttyoutput __P((int c, struct tty *tp)); +void ttypend __P((struct tty *tp)); +void ttyretype __P((struct tty *tp)); +void ttyrub __P((int c, struct tty *tp)); +int ttysleep __P((struct tty *tp, + void *chan, int pri, char *wmesg, int timeout)); +int ttywait __P((struct tty *tp)); +int ttywflush __P((struct tty *tp)); +#endif diff --git a/sys/sys/ttychars.h b/sys/sys/ttychars.h new file mode 100644 index 00000000000..1a23aa77091 --- /dev/null +++ b/sys/sys/ttychars.h @@ -0,0 +1,63 @@ +/*- + * Copyright (c) 1982, 1986, 
1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ttychars.h 8.2 (Berkeley) 1/4/94 + */ + +/* + * 4.3 COMPATIBILITY FILE + * + * User visible structures and constants related to terminal handling. 
+ */ +#ifndef _SYS_TTYCHARS_H_ +#define _SYS_TTYCHARS_H_ + +struct ttychars { + char tc_erase; /* erase last character */ + char tc_kill; /* erase entire line */ + char tc_intrc; /* interrupt */ + char tc_quitc; /* quit */ + char tc_startc; /* start output */ + char tc_stopc; /* stop output */ + char tc_eofc; /* end-of-file */ + char tc_brkc; /* input delimiter (like nl) */ + char tc_suspc; /* stop process signal */ + char tc_dsuspc; /* delayed stop process signal */ + char tc_rprntc; /* reprint line */ + char tc_flushc; /* flush output (toggles) */ + char tc_werasc; /* word erase */ + char tc_lnextc; /* literal next character */ +}; +#ifdef USE_OLD_TTY +#include <sys/ttydefaults.h> /* to pick up character defaults */ +#endif +#endif /* !_SYS_TTYCHARS_H_ */ diff --git a/sys/sys/ttycom.h b/sys/sys/ttycom.h new file mode 100644 index 00000000000..a12d8d00354 --- /dev/null +++ b/sys/sys/ttycom.h @@ -0,0 +1,128 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ttycom.h 8.1 (Berkeley) 3/28/94 + */ + +#ifndef _SYS_TTYCOM_H_ +#define _SYS_TTYCOM_H_ + +#include + +/* + * Tty ioctl's except for those supported only for backwards compatibility + * with the old tty driver. + */ + +/* + * Window/terminal size structure. This information is stored by the kernel + * in order to provide a consistent interface, but is not used by the kernel. 
+ */ +struct winsize { + unsigned short ws_row; /* rows, in characters */ + unsigned short ws_col; /* columns, in characters */ + unsigned short ws_xpixel; /* horizontal size, pixels */ + unsigned short ws_ypixel; /* vertical size, pixels */ +}; + +#define TIOCMODG _IOR('t', 3, int) /* get modem control state */ +#define TIOCMODS _IOW('t', 4, int) /* set modem control state */ +#define TIOCM_LE 0001 /* line enable (TIOCM_* values are octal single-bit masks) */ +#define TIOCM_DTR 0002 /* data terminal ready */ +#define TIOCM_RTS 0004 /* request to send */ +#define TIOCM_ST 0010 /* secondary transmit */ +#define TIOCM_SR 0020 /* secondary receive */ +#define TIOCM_CTS 0040 /* clear to send */ +#define TIOCM_CAR 0100 /* carrier detect */ +#define TIOCM_CD TIOCM_CAR /* alias for TIOCM_CAR */ +#define TIOCM_RNG 0200 /* ring */ +#define TIOCM_RI TIOCM_RNG /* alias for TIOCM_RNG */ +#define TIOCM_DSR 0400 /* data set ready */ + /* 8-10 compat */ +#define TIOCEXCL _IO('t', 13) /* set exclusive use of tty */ +#define TIOCNXCL _IO('t', 14) /* reset exclusive use of tty */ + /* 15 unused */ +#define TIOCFLUSH _IOW('t', 16, int) /* flush buffers */ + /* 17-18 compat */ +#define TIOCGETA _IOR('t', 19, struct termios) /* get termios struct */ +#define TIOCSETA _IOW('t', 20, struct termios) /* set termios struct */ +#define TIOCSETAW _IOW('t', 21, struct termios) /* drain output, then set termios struct */ +#define TIOCSETAF _IOW('t', 22, struct termios) /* drain output, flush input, then set termios struct */ +#define TIOCGETD _IOR('t', 26, int) /* get line discipline */ +#define TIOCSETD _IOW('t', 27, int) /* set line discipline */ + /* 127-124 compat */ +#define TIOCSBRK _IO('t', 123) /* set break bit */ +#define TIOCCBRK _IO('t', 122) /* clear break bit */ +#define TIOCSDTR _IO('t', 121) /* set data terminal ready */ +#define TIOCCDTR _IO('t', 120) /* clear data terminal ready */ +#define TIOCGPGRP _IOR('t', 119, int) /* get pgrp of tty */ +#define TIOCSPGRP _IOW('t', 118, int) /* set pgrp of tty */ + /* 117-116 compat */ +#define TIOCOUTQ _IOR('t', 115, int) /* output queue size */ +#define TIOCSTI _IOW('t', 114,
char) /* simulate terminal input */ +#define TIOCNOTTY _IO('t', 113) /* void tty association */ +#define TIOCPKT _IOW('t', 112, int) /* pty: set/clear packet mode */ +#define TIOCPKT_DATA 0x00 /* data packet (no TIOCPKT_* flag bit set) */ +#define TIOCPKT_FLUSHREAD 0x01 /* flush packet */ +#define TIOCPKT_FLUSHWRITE 0x02 /* flush packet */ +#define TIOCPKT_STOP 0x04 /* stop output */ +#define TIOCPKT_START 0x08 /* start output */ +#define TIOCPKT_NOSTOP 0x10 /* no more ^S, ^Q */ +#define TIOCPKT_DOSTOP 0x20 /* now do ^S ^Q */ +#define TIOCPKT_IOCTL 0x40 /* state change of pty driver */ +#define TIOCSTOP _IO('t', 111) /* stop output, like ^S */ +#define TIOCSTART _IO('t', 110) /* start output, like ^Q */ +#define TIOCMSET _IOW('t', 109, int) /* set all modem bits */ +#define TIOCMBIS _IOW('t', 108, int) /* bis modem bits */ +#define TIOCMBIC _IOW('t', 107, int) /* bic modem bits */ +#define TIOCMGET _IOR('t', 106, int) /* get all modem bits */ +#define TIOCREMOTE _IOW('t', 105, int) /* remote input editing */ +#define TIOCGWINSZ _IOR('t', 104, struct winsize) /* get window size */ +#define TIOCSWINSZ _IOW('t', 103, struct winsize) /* set window size */ +#define TIOCUCNTL _IOW('t', 102, int) /* pty: set/clear user control mode */ +#define UIOCCMD(n) _IO('u', n) /* user control op "n" */ +#define TIOCCONS _IOW('t', 98, int) /* become virtual console */ +#define TIOCSCTTY _IO('t', 97) /* become controlling tty */ +#define TIOCEXT _IOW('t', 96, int) /* pty: external processing */ +#define TIOCSIG _IO('t', 95) /* pty: generate signal */ +#define TIOCDRAIN _IO('t', 94) /* wait till output drained */ + +#define TTYDISC 0 /* termios tty line discipline */ +#define TABLDISC 3 /* tablet discipline */ +#define SLIPDISC 4 /* serial IP discipline */ + +#endif /* !_SYS_TTYCOM_H_ */ diff --git a/sys/sys/ttydefaults.h b/sys/sys/ttydefaults.h new file mode 100644 index 00000000000..1a8aaa5bd00 --- /dev/null +++ b/sys/sys/ttydefaults.h @@ -0,0 +1,96 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the
University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ttydefaults.h 8.4 (Berkeley) 1/21/94 + */ + +/* + * System wide defaults for terminal state. + */ +#ifndef _SYS_TTYDEFAULTS_H_ +#define _SYS_TTYDEFAULTS_H_ + +/* + * Defaults on "first" open. + */ +#define TTYDEF_IFLAG (BRKINT | ISTRIP | ICRNL | IMAXBEL | IXON | IXANY) +#define TTYDEF_OFLAG (OPOST | ONLCR | OXTABS) +#define TTYDEF_LFLAG (ECHO | ICANON | ISIG | IEXTEN | ECHOE|ECHOKE|ECHOCTL) +#define TTYDEF_CFLAG (CREAD | CS7 | PARENB | HUPCL) +#define TTYDEF_SPEED (B9600) + +/* + * Control Character Defaults + */ +#define CTRL(x) ((x)&037) /* keep the low five bits of x; argument parenthesized so any expression is masked as a whole */ +#define CEOF CTRL('d') +#define CEOL ((unsigned char)'\377') /* XXX avoid _POSIX_VDISABLE */ +#define CERASE 0177 +#define CINTR CTRL('c') +#define CSTATUS ((unsigned char)'\377') /* XXX avoid _POSIX_VDISABLE */ +#define CKILL CTRL('u') +#define CMIN 1 +#define CQUIT 034 /* FS, ^\ */ +#define CSUSP CTRL('z') +#define CTIME 0 +#define CDSUSP CTRL('y') +#define CSTART CTRL('q') +#define CSTOP CTRL('s') +#define CLNEXT CTRL('v') +#define CDISCARD CTRL('o') +#define CWERASE CTRL('w') +#define CREPRINT CTRL('r') +#define CEOT CEOF +/* compat */ +#define CBRK CEOL +#define CRPRNT CREPRINT +#define CFLUSH CDISCARD + +/* PROTECTED INCLUSION ENDS HERE */ +#endif /* !_SYS_TTYDEFAULTS_H_ */ + +/* + * #define TTYDEFCHARS to include an array of default control characters.
+ */ +#ifdef TTYDEFCHARS /* this section is not covered by an include guard; the undef below stops a later include from emitting it again */ +cc_t ttydefchars[NCCS] = { /* a definition, not a declaration; initializers are positional -- presumably in c_cc index order, TODO confirm against termios.h */ + CEOF, CEOL, CEOL, CERASE, CWERASE, CKILL, CREPRINT, + _POSIX_VDISABLE, CINTR, CQUIT, CSUSP, CDSUSP, CSTART, CSTOP, CLNEXT, + CDISCARD, CMIN, CTIME, CSTATUS, _POSIX_VDISABLE +}; +#undef TTYDEFCHARS +#endif diff --git a/sys/sys/ttydev.h b/sys/sys/ttydev.h new file mode 100644 index 00000000000..c52a2136926 --- /dev/null +++ b/sys/sys/ttydev.h @@ -0,0 +1,60 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ttydev.h 8.2 (Berkeley) 1/4/94 + */ + +/* COMPATIBILITY HEADER FILE */ + +#ifndef _SYS_TTYDEV_H_ +#define _SYS_TTYDEV_H_ + +#ifdef USE_OLD_TTY /* speed codes for the old tty interface: small consecutive codes 0-15, not the baud rates themselves */ +#define B0 0 +#define B50 1 +#define B75 2 +#define B110 3 +#define B134 4 +#define B150 5 +#define B200 6 +#define B300 7 +#define B600 8 +#define B1200 9 +#define B1800 10 +#define B2400 11 +#define B4800 12 +#define B9600 13 +#define EXTA 14 +#define EXTB 15 +#endif /* USE_OLD_TTY */ + +#endif /* !_SYS_TTYDEV_H_ */ diff --git a/sys/sys/types.h b/sys/sys/types.h new file mode 100644 index 00000000000..76d2975d31b --- /dev/null +++ b/sys/sys/types.h @@ -0,0 +1,162 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)types.h 8.4 (Berkeley) 1/21/94 + */ + +#ifndef _SYS_TYPES_H_ +#define _SYS_TYPES_H_ + +/* Machine type dependent parameters. 
*/ +#include <machine/endian.h> /* NOTE(review): header name was missing and is reconstructed -- confirm */ + +#ifndef _POSIX_SOURCE +typedef unsigned char u_char; +typedef unsigned short u_short; +typedef unsigned int u_int; +typedef unsigned long u_long; +typedef unsigned short ushort; /* Sys V compatibility */ +typedef unsigned int uint; /* Sys V compatibility */ +#endif + +typedef unsigned long long u_quad_t; /* quads */ +typedef long long quad_t; +typedef quad_t * qaddr_t; + +typedef char * caddr_t; /* core address */ +typedef long daddr_t; /* disk address */ +typedef unsigned long dev_t; /* device number */ +typedef unsigned long fixpt_t; /* fixed point number */ +typedef unsigned long gid_t; /* group id */ +typedef unsigned long ino_t; /* inode number */ +typedef unsigned short mode_t; /* permissions */ +typedef unsigned short nlink_t; /* link count */ +typedef quad_t off_t; /* file offset */ +typedef long pid_t; /* process id */ +typedef long segsz_t; /* segment size */ +typedef long swblk_t; /* swap offset */ +typedef unsigned long uid_t; /* user id */ + +/* + * This belongs in unistd.h, but is placed here to ensure that programs + * casting the second parameter of lseek to off_t will get the correct + * version of lseek. + */ +#ifndef KERNEL +#include <sys/cdefs.h> /* for __BEGIN_DECLS, __END_DECLS and __P used just below */ +__BEGIN_DECLS +off_t lseek __P((int, off_t, int)); +__END_DECLS +#endif + +#ifndef _POSIX_SOURCE +#define major(x) ((int)(((u_int)(x) >> 8)&0xff)) /* major number: bits 8-15 of x */ +#define minor(x) ((int)((x)&0xff)) /* minor number: low 8 bits of x */ +#define makedev(x,y) ((dev_t)(((x)<<8) | (y))) /* create dev_t: x shifted up 8 bits, ORed with y */ +#endif + +#include <machine/ansi.h> /* NOTE(review): reconstructed name; expected to supply the _BSD_*_T_ macros tested below -- confirm */ +#include <machine/types.h> /* NOTE(review): reconstructed name -- confirm */ + +#ifdef _BSD_CLOCK_T_ +typedef _BSD_CLOCK_T_ clock_t; +#undef _BSD_CLOCK_T_ +#endif + +#ifdef _BSD_SIZE_T_ +typedef _BSD_SIZE_T_ size_t; +#undef _BSD_SIZE_T_ +#endif + +#ifdef _BSD_SSIZE_T_ +typedef _BSD_SSIZE_T_ ssize_t; +#undef _BSD_SSIZE_T_ +#endif + +#ifdef _BSD_TIME_T_ +typedef _BSD_TIME_T_ time_t; +#undef _BSD_TIME_T_ +#endif + +#ifndef _POSIX_SOURCE +#define NBBY 8 /* number of bits in a byte */ + +/* + * Select uses bit masks of file descriptors in longs.
These macros + * manipulate such bit fields (the filesystem macros use chars). + * FD_SETSIZE may be defined by the user, but the default here should + * be enough for most uses. + */ +#ifndef FD_SETSIZE +#define FD_SETSIZE 256 +#endif + +typedef long fd_mask; /* one word of descriptor bits */ +#define NFDBITS (sizeof(fd_mask) * NBBY) /* bits per mask */ + +#ifndef howmany +#define howmany(x, y) (((x)+((y)-1))/(y)) /* x divided by y, rounded up */ +#endif + +typedef struct fd_set { + fd_mask fds_bits[howmany(FD_SETSIZE, NFDBITS)]; +} fd_set; + +#define FD_SET(n, p) ((p)->fds_bits[(n)/NFDBITS] |= ((unsigned long)1 << ((n) % NFDBITS))) /* bit is built as unsigned long, the width of fd_mask: an int 1 cannot reach the upper bits when long is wider than int */ +#define FD_CLR(n, p) ((p)->fds_bits[(n)/NFDBITS] &= ~((unsigned long)1 << ((n) % NFDBITS))) +#define FD_ISSET(n, p) (((p)->fds_bits[(n)/NFDBITS] & ((unsigned long)1 << ((n) % NFDBITS))) != 0) /* yields 0 or 1, so the result survives being stored in an int */ +#define FD_COPY(f, t) bcopy(f, t, sizeof(*(f))) +#define FD_ZERO(p) bzero(p, sizeof(*(p))) + +#if defined(__STDC__) && defined(KERNEL) +/* + * Forward structure declarations for function prototypes. We include the + * common structures that cross subsystem boundaries here; others are mostly + * used in the same place that the structure is defined. + */ +struct proc; +struct pgrp; +struct ucred; +struct rusage; +struct file; +struct buf; +struct tty; +struct uio; +#endif + +#endif /* !_POSIX_SOURCE */ +#endif /* !_SYS_TYPES_H_ */ diff --git a/sys/sys/ucred.h b/sys/sys/ucred.h new file mode 100644 index 00000000000..d3ee02dbde3 --- /dev/null +++ b/sys/sys/ucred.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ucred.h 8.2 (Berkeley) 1/4/94 + */ + +#ifndef _SYS_UCRED_H_ +#define _SYS_UCRED_H_ + +/* + * Credentials. 
+ */ +struct ucred { + u_short cr_ref; /* reference count */ + uid_t cr_uid; /* effective user id */ + short cr_ngroups; /* number of groups */ + gid_t cr_groups[NGROUPS]; /* groups */ +}; +#define cr_gid cr_groups[0] /* the group id lives in the first groups slot */ +#define NOCRED ((struct ucred *)-1) /* no credential available */ +#define FSCRED ((struct ucred *)-2) /* filesystem credential */ + +#ifdef KERNEL +#define crhold(cr) ((cr)->cr_ref++) /* take a reference by bumping cr_ref; whole expansion parenthesized so it is safe inside a larger expression */ +struct ucred *crget(); +struct ucred *crcopy(); +struct ucred *crdup(); +#endif /* KERNEL */ + +#endif /* !_SYS_UCRED_H_ */ diff --git a/sys/sys/uio.h b/sys/sys/uio.h new file mode 100644 index 00000000000..3356ebfee89 --- /dev/null +++ b/sys/sys/uio.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 1982, 1986, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uio.h 8.5 (Berkeley) 2/22/94 + */ + +#ifndef _SYS_UIO_H_ +#define _SYS_UIO_H_ + +/* + * XXX + * iov_base should be a void *. + */ +struct iovec { + char *iov_base; /* Base address. */ + size_t iov_len; /* Length. */ +}; + +enum uio_rw { UIO_READ, UIO_WRITE }; + +/* Segment flag values. */ +enum uio_seg { + UIO_USERSPACE, /* from user data space */ + UIO_SYSSPACE, /* from system space */ + UIO_USERISPACE /* from user I space */ +}; + +#ifdef KERNEL +struct uio { + struct iovec *uio_iov; + int uio_iovcnt; + off_t uio_offset; + int uio_resid; + enum uio_seg uio_segflg; /* one of the enum uio_seg segment flag values */ + enum uio_rw uio_rw; /* UIO_READ or UIO_WRITE */ + struct proc *uio_procp; +}; + +/* + * Limits + */ +#define UIO_MAXIOV 1024 /* max 1K of iov's */ +#define UIO_SMALLIOV 8 /* 8 on stack, else malloc */ +#endif /* KERNEL */ + +#ifndef KERNEL +#include <sys/cdefs.h> /* for __BEGIN_DECLS, __END_DECLS and __P used just below */ + +__BEGIN_DECLS +ssize_t readv __P((int, const struct iovec *, int)); +ssize_t writev __P((int, const struct iovec *, int)); +__END_DECLS +#endif /* !KERNEL */ +#endif /* !_SYS_UIO_H_ */ diff --git a/sys/sys/un.h b/sys/sys/un.h new file mode 100644 index 00000000000..3e214a26bb5 --- /dev/null +++ b/sys/sys/un.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)un.h 8.1 (Berkeley) 6/2/93 + */ + +/* + * Definitions for UNIX IPC domain. 
+ */ +struct sockaddr_un { + u_char sun_len; /* sockaddr len including null */ + u_char sun_family; /* AF_UNIX */ + char sun_path[104]; /* path name (gag) */ +}; + +#ifdef KERNEL +int unp_discard(); /* old-style declaration: parameter list left unspecified */ +#else + +/* actual length of an initialized sockaddr_un: the bytes ahead of sun_path plus strlen(sun_path); the terminating NUL is not counted, sun_path must be NUL-terminated, and the includer must have strlen declared */ +#define SUN_LEN(su) \ + (sizeof(*(su)) - sizeof((su)->sun_path) + strlen((su)->sun_path)) +#endif diff --git a/sys/sys/unistd.h b/sys/sys/unistd.h new file mode 100644 index 00000000000..e086f6f6e39 --- /dev/null +++ b/sys/sys/unistd.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)unistd.h 8.2 (Berkeley) 1/7/94 + */ + +#ifndef _SYS_UNISTD_H_ +#define _SYS_UNISTD_H_ + +/* compile-time symbolic constants */ +#define _POSIX_JOB_CONTROL /* implementation supports job control */ + +/* + * Although we have saved user/group IDs, we do not use them in setuid + * as described in POSIX 1003.1, because the feature does not work for + * root. We use the saved IDs in seteuid/setegid, which are not currently + * part of the POSIX 1003.1 specification. 
+ */ +#ifdef _NOT_AVAILABLE /* _POSIX_SAVED_IDS is advertised only if _NOT_AVAILABLE is defined */ +#define _POSIX_SAVED_IDS /* saved set-user-ID and set-group-ID */ +#endif + +#define _POSIX_VERSION 198808L +#define _POSIX2_VERSION 199212L + +/* execution-time symbolic constants */ + /* chown requires appropriate privileges */ +#define _POSIX_CHOWN_RESTRICTED 1 + /* too-long path components generate errors */ +#define _POSIX_NO_TRUNC 1 + /* may disable terminal special characters */ +#define _POSIX_VDISABLE ((unsigned char)'\377') + +/* access function: F_OK is zero; X_OK, W_OK and R_OK are distinct single-bit values */ +#define F_OK 0 /* test for existence of file */ +#define X_OK 0x01 /* test for execute or search permission */ +#define W_OK 0x02 /* test for write permission */ +#define R_OK 0x04 /* test for read permission */ + +/* whence values for lseek(2) */ +#define SEEK_SET 0 /* set file offset to offset */ +#define SEEK_CUR 1 /* set file offset to current plus offset */ +#define SEEK_END 2 /* set file offset to EOF plus offset */ + +#ifndef _POSIX_SOURCE +/* older names for the lseek(2) whence values; POSIX 1003.1 renamed them to SEEK_* */ +#define L_SET SEEK_SET +#define L_INCR SEEK_CUR +#define L_XTND SEEK_END +#endif + +/* configurable pathname variables (the name argument of pathconf and fpathconf in POSIX) */ +#define _PC_LINK_MAX 1 +#define _PC_MAX_CANON 2 +#define _PC_MAX_INPUT 3 +#define _PC_NAME_MAX 4 +#define _PC_PATH_MAX 5 +#define _PC_PIPE_BUF 6 +#define _PC_CHOWN_RESTRICTED 7 +#define _PC_NO_TRUNC 8 +#define _PC_VDISABLE 9 + +/* configurable system variables (the name argument of sysconf in POSIX) */ +#define _SC_ARG_MAX 1 +#define _SC_CHILD_MAX 2 +#define _SC_CLK_TCK 3 +#define _SC_NGROUPS_MAX 4 +#define _SC_OPEN_MAX 5 +#define _SC_JOB_CONTROL 6 +#define _SC_SAVED_IDS 7 +#define _SC_VERSION 8 +#define _SC_BC_BASE_MAX 9 +#define _SC_BC_DIM_MAX 10 +#define _SC_BC_SCALE_MAX 11 +#define _SC_BC_STRING_MAX 12 +#define _SC_COLL_WEIGHTS_MAX 13 +#define _SC_EXPR_NEST_MAX 14 +#define _SC_LINE_MAX 15 +#define _SC_RE_DUP_MAX 16 +#define _SC_2_VERSION 17 +#define _SC_2_C_BIND 18 +#define _SC_2_C_DEV 19 +#define _SC_2_CHAR_TERM 20 +#define _SC_2_FORT_DEV 21 +#define _SC_2_FORT_RUN 22 +#define _SC_2_LOCALEDEF
23 +#define _SC_2_SW_DEV 24 +#define _SC_2_UPE 25 +#define _SC_STREAM_MAX 26 +#define _SC_TZNAME_MAX 27 + +/* configurable system strings (the name argument of confstr in POSIX) */ +#define _CS_PATH 1 + +#endif /* !_SYS_UNISTD_H_ */ diff --git a/sys/sys/unpcb.h b/sys/sys/unpcb.h new file mode 100644 index 00000000000..efcfd0e23c1 --- /dev/null +++ b/sys/sys/unpcb.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)unpcb.h 8.1 (Berkeley) 6/2/93 + */ + +/* + * Protocol control block for an active + * instance of a UNIX internal protocol. + * + * A socket may be associated with a vnode in the + * file system. If so, the unp_vnode pointer holds + * a reference count to this vnode, which should be irele'd + * when the socket goes away. + * + * A socket may be connected to another socket, in which + * case the control block of the socket to which it is connected + * is given by unp_conn. + * + * A socket may be referenced by a number of sockets (e.g. several + * sockets may be connected to a datagram socket.) These sockets + * are in a linked list starting with unp_refs, linked through + * unp_nextref and null-terminated. Note that a socket may be referenced + * by a number of other sockets and may also reference a socket (not + * necessarily one which is referencing it). This generates + * the need for unp_refs and unp_nextref to be separate fields. + * + * Stream sockets keep copies of receive sockbuf sb_cc and sb_mbcnt + * so that changes in the sockbuf may be computed to modify + * back pressure on the sender accordingly.
+ */ +struct unpcb { + struct socket *unp_socket; /* pointer back to socket */ + struct vnode *unp_vnode; /* if associated with file */ + ino_t unp_ino; /* fake inode number */ + struct unpcb *unp_conn; /* control block of connected socket */ + struct unpcb *unp_refs; /* referencing socket linked list */ + struct unpcb *unp_nextref; /* link in unp_refs list */ + struct mbuf *unp_addr; /* bound address of socket */ + int unp_cc; /* copy of rcv.sb_cc */ + int unp_mbcnt; /* copy of rcv.sb_mbcnt */ +}; + +#define sotounpcb(so) ((struct unpcb *)((so)->so_pcb)) diff --git a/sys/sys/user.h b/sys/sys/user.h new file mode 100644 index 00000000000..85fdd130c2d --- /dev/null +++ b/sys/sys/user.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)user.h 8.2 (Berkeley) 9/23/93 + */ + +#include +#ifndef KERNEL +/* stuff that *used* to be included by user.h, or is now needed */ +#include +#include +#include +#include +#include +#endif +#include +#include +#include /* XXX */ +#include + + +/* + * Per process structure containing data that isn't needed in core + * when the process isn't running (esp. when swapped out). + * This structure may or may not be at the same kernel address + * in all processes. + */ + +struct user { + struct pcb u_pcb; + + struct sigacts u_sigacts; /* p_sigacts points here (use it!) */ + struct pstats u_stats; /* p_stats points here (use it!) */ + + /* + * Remaining fields only for core dump and/or ptrace-- + * not valid at other times! + */ + struct kinfo_proc u_kproc; /* proc + eproc */ + struct md_coredump u_md; /* machine dependent glop */ +}; + +/* + * Redefinitions to make the debuggers happy for now... This subterfuge + * brought to you by coredump() and trace_req(). These fields are *only* + * valid at those times! 
+ */ +#define U_ar0 u_kproc.kp_proc.p_md.md_regs /* copy of curproc->p_md.md_regs */ +#define U_tsize u_kproc.kp_eproc.e_vm.vm_tsize +#define U_dsize u_kproc.kp_eproc.e_vm.vm_dsize +#define U_ssize u_kproc.kp_eproc.e_vm.vm_ssize +#define U_sig u_sigacts.ps_sig +#define U_code u_sigacts.ps_code + +#ifndef KERNEL +#define u_ar0 U_ar0 +#define u_tsize U_tsize +#define u_dsize U_dsize +#define u_ssize U_ssize +#define u_sig U_sig +#define u_code U_code +#endif /* KERNEL */ diff --git a/sys/sys/utsname.h b/sys/sys/utsname.h new file mode 100644 index 00000000000..aa0f2c75ab6 --- /dev/null +++ b/sys/sys/utsname.h @@ -0,0 +1,56 @@ +/*- + * Copyright (c) 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Chuck Karish of Mindcraft, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)utsname.h 8.1 (Berkeley) 1/4/94 + */ + +#ifndef _SYS_UTSNAME_H +#define _SYS_UTSNAME_H + +struct utsname { + char sysname[256]; /* Name of this OS. */ + char nodename[256]; /* Name of this network node. */ + char release[256]; /* Release level. */ + char version[256]; /* Version level. */ + char machine[256]; /* Hardware type. */ +}; + +#include + +__BEGIN_DECLS +int uname __P((struct utsname *)); +__END_DECLS + +#endif /* !_SYS_UTSNAME_H */ diff --git a/sys/sys/vadvise.h b/sys/sys/vadvise.h new file mode 100644 index 00000000000..be793e8e721 --- /dev/null +++ b/sys/sys/vadvise.h @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vadvise.h 8.1 (Berkeley) 6/2/93 + */ + +/* + * Parameters to vadvise() to tell system of particular paging + * behaviour: + * VA_NORM Normal strategy + * VA_ANOM Sampling page behaviour is not a win, don't bother + * Suitable during GCs in LISP, or sequential or random + * page referencing. + * VA_SEQL Sequential behaviour expected. + * VA_FLUSH Invalidate all page table entries. 
+ */ +#define VA_NORM 0 +#define VA_ANOM 1 +#define VA_SEQL 2 +#define VA_FLUSH 3 diff --git a/sys/sys/vcmd.h b/sys/sys/vcmd.h new file mode 100644 index 00000000000..de27ec1b0af --- /dev/null +++ b/sys/sys/vcmd.h @@ -0,0 +1,43 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vcmd.h 8.1 (Berkeley) 6/2/93 + */ + +#include + +#define VPRINT 0100 +#define VPLOT 0200 +#define VPRINTPLOT 0400 + +#define VGETSTATE _IOR('v', 0, int) +#define VSETSTATE _IOW('v', 1, int) diff --git a/sys/sys/vlimit.h b/sys/sys/vlimit.h new file mode 100644 index 00000000000..b6457e64ddf --- /dev/null +++ b/sys/sys/vlimit.h @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vlimit.h 8.1 (Berkeley) 6/2/93 + */ + +/* + * Limits for u.u_limit[i], per process, inherited. + */ +#define LIM_NORAISE 0 /* if <> 0, can't raise limits */ +#define LIM_CPU 1 /* max secs cpu time */ +#define LIM_FSIZE 2 /* max size of file created */ +#define LIM_DATA 3 /* max growth of data space */ +#define LIM_STACK 4 /* max growth of stack */ +#define LIM_CORE 5 /* max size of ``core'' file */ +#define LIM_MAXRSS 6 /* max desired data+stack core usage */ + +#define NLIMITS 6 + +#define INFINITY 0x7fffffff diff --git a/sys/sys/vmmeter.h b/sys/sys/vmmeter.h new file mode 100644 index 00000000000..f0b3d57f336 --- /dev/null +++ b/sys/sys/vmmeter.h @@ -0,0 +1,147 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vmmeter.h 8.1 (Berkeley) 6/2/93 + */ + +/* + * System wide statistics counters. + */ +struct vmmeter { + /* + * General system activity. + */ + unsigned v_swtch; /* context switches */ + unsigned v_trap; /* calls to trap */ + unsigned v_syscall; /* calls to syscall() */ + unsigned v_intr; /* device interrupts */ + unsigned v_soft; /* software interrupts */ + unsigned v_faults; /* total faults taken */ + /* + * Virtual memory activity. 
+ */ + unsigned v_lookups; /* object cache lookups */ + unsigned v_hits; /* object cache hits */ + unsigned v_vm_faults; /* number of address memory faults */ + unsigned v_cow_faults; /* number of copy-on-writes */ + unsigned v_swpin; /* swapins */ + unsigned v_swpout; /* swapouts */ + unsigned v_pswpin; /* pages swapped in */ + unsigned v_pswpout; /* pages swapped out */ + unsigned v_pageins; /* number of pageins */ + unsigned v_pageouts; /* number of pageouts */ + unsigned v_pgpgin; /* pages paged in */ + unsigned v_pgpgout; /* pages paged out */ + unsigned v_intrans; /* intransit blocking page faults */ + unsigned v_reactivated; /* number of pages reactivated from free list */ + unsigned v_rev; /* revolutions of the hand */ + unsigned v_scan; /* scans in page out daemon */ + unsigned v_dfree; /* pages freed by daemon */ + unsigned v_pfree; /* pages freed by exiting processes */ + unsigned v_zfod; /* pages zero filled on demand */ + unsigned v_nzfod; /* number of zfod's created */ + /* + * Distribution of page usages. 
+ */ + unsigned v_page_size; /* page size in bytes */ + unsigned v_kernel_pages;/* number of pages in use by kernel */ + unsigned v_free_target; /* number of pages desired free */ + unsigned v_free_min; /* minimum number of pages desired free */ + unsigned v_free_count; /* number of pages free */ + unsigned v_wire_count; /* number of pages wired down */ + unsigned v_active_count;/* number of pages active */ + unsigned v_inactive_target; /* number of pages desired inactive */ + unsigned v_inactive_count; /* number of pages inactive */ +}; +#ifdef KERNEL +struct vmmeter cnt; +#endif + +/* systemwide totals computed every five seconds */ +struct vmtotal +{ + short t_rq; /* length of the run queue */ + short t_dw; /* jobs in ``disk wait'' (neg priority) */ + short t_pw; /* jobs in page wait */ + short t_sl; /* jobs sleeping in core */ + short t_sw; /* swapped out runnable/short block jobs */ + long t_vm; /* total virtual memory */ + long t_avm; /* active virtual memory */ + long t_rm; /* total real memory in use */ + long t_arm; /* active real memory */ + long t_vmshr; /* shared virtual memory */ + long t_avmshr; /* active shared virtual memory */ + long t_rmshr; /* shared real memory */ + long t_armshr; /* active shared real memory */ + long t_free; /* free memory pages */ +}; +#ifdef KERNEL +struct vmtotal total; +#endif + +/* + * Optional instrumentation. 
+ */ +#ifdef PGINPROF + +#define NDMON 128 +#define NSMON 128 + +#define DRES 20 +#define SRES 5 + +#define PMONMIN 20 +#define PRES 50 +#define NPMON 64 + +#define RMONMIN 130 +#define RRES 5 +#define NRMON 64 + +/* data and stack size distribution counters */ +unsigned int dmon[NDMON+1]; +unsigned int smon[NSMON+1]; + +/* page in time distribution counters */ +unsigned int pmon[NPMON+2]; + +/* reclaim time distribution counters */ +unsigned int rmon[NRMON+2]; + +int pmonmin; +int pres; +int rmonmin; +int rres; + +unsigned rectime; /* accumulator for reclaim times */ +unsigned pgintime; /* accumulator for page in times */ +#endif diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h new file mode 100644 index 00000000000..fa51d994a21 --- /dev/null +++ b/sys/sys/vnode.h @@ -0,0 +1,397 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vnode.h 8.7 (Berkeley) 2/4/94 + */ + +#include + +/* + * The vnode is the focus of all file activity in UNIX. There is a + * unique vnode allocated for each active file, each current directory, + * each mounted-on file, text file, and the root. + */ + +/* + * Vnode types. VNON means no type. + */ +enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD }; + +/* + * Vnode tag types. + * These are for the benefit of external programs only (e.g., pstat) + * and should NEVER be inspected by the kernel. + */ +enum vtagtype { + VT_NON, VT_UFS, VT_NFS, VT_MFS, VT_PC, VT_LFS, VT_LOFS, VT_FDESC, + VT_PORTAL, VT_NULL, VT_UMAP, VT_KERNFS, VT_PROCFS, VT_AFS, VT_ISOFS, + VT_UNION +}; + +/* + * Each underlying filesystem allocates its own private area and hangs + * it from v_data. If non-null, this area is freed in getnewvnode(). 
+ */ +LIST_HEAD(buflists, buf); + +struct vnode { + u_long v_flag; /* vnode flags (see below) */ + short v_usecount; /* reference count of users */ + short v_writecount; /* reference count of writers */ + long v_holdcnt; /* page & buffer references */ + daddr_t v_lastr; /* last read (read-ahead) */ + u_long v_id; /* capability identifier */ + struct mount *v_mount; /* ptr to vfs we are in */ + int (**v_op)(); /* vnode operations vector */ + TAILQ_ENTRY(vnode) v_freelist; /* vnode freelist */ + LIST_ENTRY(vnode) v_mntvnodes; /* vnodes for mount point */ + struct buflists v_cleanblkhd; /* clean blocklist head */ + struct buflists v_dirtyblkhd; /* dirty blocklist head */ + long v_numoutput; /* num of writes in progress */ + enum vtype v_type; /* vnode type */ + union { + struct mount *vu_mountedhere;/* ptr to mounted vfs (VDIR) */ + struct socket *vu_socket; /* unix ipc (VSOCK) */ + caddr_t vu_vmdata; /* private data for vm (VREG) */ + struct specinfo *vu_specinfo; /* device (VCHR, VBLK) */ + struct fifoinfo *vu_fifoinfo; /* fifo (VFIFO) */ + } v_un; + struct nqlease *v_lease; /* Soft reference to lease */ + daddr_t v_lastw; /* last write (write cluster) */ + daddr_t v_cstart; /* start block of cluster */ + daddr_t v_lasta; /* last allocation */ + int v_clen; /* length of current cluster */ + int v_ralen; /* Read-ahead length */ + daddr_t v_maxra; /* last readahead block */ + long v_spare[7]; /* round to 128 bytes */ + enum vtagtype v_tag; /* type of underlying data */ + void *v_data; /* private data for fs */ +}; +#define v_mountedhere v_un.vu_mountedhere +#define v_socket v_un.vu_socket +#define v_vmdata v_un.vu_vmdata +#define v_specinfo v_un.vu_specinfo +#define v_fifoinfo v_un.vu_fifoinfo + +/* + * Vnode flags. 
+ */ +#define VROOT 0x0001 /* root of its file system */ +#define VTEXT 0x0002 /* vnode is a pure text prototype */ +#define VSYSTEM 0x0004 /* vnode being used by kernel */ +#define VXLOCK 0x0100 /* vnode is locked to change underlying type */ +#define VXWANT 0x0200 /* process is waiting for vnode */ +#define VBWAIT 0x0400 /* waiting for output to complete */ +#define VALIASED 0x0800 /* vnode has an alias */ +#define VDIROP 0x1000 /* LFS: vnode is involved in a directory op */ + +/* + * Vnode attributes. A field value of VNOVAL represents a field whose value + * is unavailable (getattr) or which is not to be changed (setattr). + */ +struct vattr { + enum vtype va_type; /* vnode type (for create) */ + u_short va_mode; /* file's access mode and type */ + short va_nlink; /* number of references to file */ + uid_t va_uid; /* owner user id */ + gid_t va_gid; /* owner group id */ + long va_fsid; /* file system id (dev for now) */ + long va_fileid; /* file id */ + u_quad_t va_size; /* file size in bytes */ + long va_blocksize; /* blocksize preferred for i/o */ + struct timespec va_atime; /* time of last access */ + struct timespec va_mtime; /* time of last modification */ + struct timespec va_ctime; /* time file changed */ + u_long va_gen; /* generation number of file */ + u_long va_flags; /* flags defined for file */ + dev_t va_rdev; /* device the special file represents */ + u_quad_t va_bytes; /* bytes of disk space held by file */ + u_quad_t va_filerev; /* file modification number */ + u_int va_vaflags; /* operations flags, see below */ + long va_spare; /* remain quad aligned */ +}; + +/* + * Flags for va_vaflags. + */ +#define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ + +/* + * Flags for ioflag. 
+ */ +#define IO_UNIT 0x01 /* do I/O as atomic unit */ +#define IO_APPEND 0x02 /* append write to end */ +#define IO_SYNC 0x04 /* do I/O synchronously */ +#define IO_NODELOCKED 0x08 /* underlying node already locked */ +#define IO_NDELAY 0x10 /* FNDELAY flag set in file table */ + +/* + * Modes. Some values same as Ixxx entries from inode.h for now. + */ +#define VSUID 04000 /* set user id on execution */ +#define VSGID 02000 /* set group id on execution */ +#define VSVTX 01000 /* save swapped text even after use */ +#define VREAD 00400 /* read, write, execute permissions */ +#define VWRITE 00200 +#define VEXEC 00100 + +/* + * Token indicating no attribute value yet assigned. + */ +#define VNOVAL (-1) + +#ifdef KERNEL +/* + * Convert between vnode types and inode formats (since POSIX.1 + * defines mode word of stat structure in terms of inode formats). + */ +extern enum vtype iftovt_tab[]; +extern int vttoif_tab[]; +#define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) +#define VTTOIF(indx) (vttoif_tab[(int)(indx)]) +#define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) + +/* + * Flags to various vnode functions. 
+ */ +#define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ +#define FORCECLOSE 0x0002 /* vflush: force file closure */ +#define WRITECLOSE 0x0004 /* vflush: only close writeable files */ +#define DOCLOSE 0x0008 /* vclean: close active files */ +#define V_SAVE 0x0001 /* vinvalbuf: sync file first */ +#define V_SAVEMETA 0x0002 /* vinvalbuf: leave indirect blocks */ + +#ifdef DIAGNOSTIC +#define HOLDRELE(vp) holdrele(vp) +#define VATTR_NULL(vap) vattr_null(vap) +#define VHOLD(vp) vhold(vp) +#define VREF(vp) vref(vp) + +void holdrele __P((struct vnode *)); +void vattr_null __P((struct vattr *)); +void vhold __P((struct vnode *)); +void vref __P((struct vnode *)); +#else +#define HOLDRELE(vp) (vp)->v_holdcnt-- /* decrease buf or page ref */ +#define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ +#define VHOLD(vp) (vp)->v_holdcnt++ /* increase buf or page ref */ +#define VREF(vp) (vp)->v_usecount++ /* increase reference */ +#endif + +#define NULLVP ((struct vnode *)NULL) + +/* + * Global vnode data. + */ +extern struct vnode *rootvnode; /* root (i.e. "/") vnode */ +extern int desiredvnodes; /* number of vnodes desired */ +extern struct vattr va_null; /* predefined null vattr structure */ + +/* + * Macro/function to check for client cache inconsistency w.r.t. leasing. + */ +#define LEASE_READ 0x1 /* Check lease for readers */ +#define LEASE_WRITE 0x2 /* Check lease for modifiers */ + +#ifdef NFS +void lease_check __P((struct vnode *vp, struct proc *p, + struct ucred *ucred, int flag)); +void lease_updatetime __P((int deltat)); +#define LEASE_CHECK(vp, p, cred, flag) lease_check((vp), (p), (cred), (flag)) +#define LEASE_UPDATETIME(dt) lease_updatetime(dt) +#else +#define LEASE_CHECK(vp, p, cred, flag) +#define LEASE_UPDATETIME(dt) +#endif /* NFS */ +#endif /* KERNEL */ + + +/* + * Mods for extensibility. 
+ */ + +/* + * Flags for vdesc_flags: + */ +#define VDESC_MAX_VPS 16 +/* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ +#define VDESC_VP0_WILLRELE 0x0001 +#define VDESC_VP1_WILLRELE 0x0002 +#define VDESC_VP2_WILLRELE 0x0004 +#define VDESC_VP3_WILLRELE 0x0008 +#define VDESC_NOMAP_VPP 0x0100 +#define VDESC_VPP_WILLRELE 0x0200 + +/* + * VDESC_NO_OFFSET is used to identify the end of the offset list + * and in places where no such field exists. + */ +#define VDESC_NO_OFFSET -1 + +/* + * This structure describes the vnode operation taking place. + */ +struct vnodeop_desc { + int vdesc_offset; /* offset in vector--first for speed */ + char *vdesc_name; /* a readable name for debugging */ + int vdesc_flags; /* VDESC_* flags */ + + /* + * These ops are used by bypass routines to map and locate arguments. + * Creds and procs are not needed in bypass routines, but sometimes + * they are useful to (for example) transport layers. + * Nameidata is useful because it has a cred in it. + */ + int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ + int vdesc_vpp_offset; /* return vpp location */ + int vdesc_cred_offset; /* cred location, if any */ + int vdesc_proc_offset; /* proc location, if any */ + int vdesc_componentname_offset; /* if any */ + /* + * Finally, we've got a list of private data (about each operation) + * for each transport layer. (Support to manage this list is not + * yet part of BSD.) + */ + caddr_t *vdesc_transports; +}; + +#ifdef KERNEL +/* + * A list of all the operation descs. + */ +extern struct vnodeop_desc *vnodeop_descs[]; + + +/* + * This macro is very helpful in defining those offsets in the vdesc struct. + * + * This is stolen from X11R4. I ignored all the fancy stuff for + * Crays, so if you decide to port this to such a serious machine, + * you might want to consult Intrinsic.h's XtOffset{,Of,To}. 
+ */ +#define VOPARG_OFFSET(p_type,field) \ + ((int) (((char *) (&(((p_type)NULL)->field))) - ((char *) NULL))) +#define VOPARG_OFFSETOF(s_type,field) \ + VOPARG_OFFSET(s_type*,field) +#define VOPARG_OFFSETTO(S_TYPE,S_OFFSET,STRUCT_P) \ + ((S_TYPE)(((char*)(STRUCT_P))+(S_OFFSET))) + + +/* + * This structure is used to configure the new vnodeops vector. + */ +struct vnodeopv_entry_desc { + struct vnodeop_desc *opve_op; /* which operation this is */ + int (*opve_impl)(); /* code implementing this operation */ +}; +struct vnodeopv_desc { + /* ptr to the ptr to the vector where op should go */ + int (***opv_desc_vector_p)(); + struct vnodeopv_entry_desc *opv_desc_ops; /* null terminated list */ +}; + +/* + * A default routine which just returns an error. + */ +int vn_default_error __P((void)); + +/* + * A generic structure. + * This can be used by bypass routines to identify generic arguments. + */ +struct vop_generic_args { + struct vnodeop_desc *a_desc; + /* other random data follows, presumably */ +}; + +/* + * VOCALL calls an op given an ops vector. We break it out because BSD's + * vclean changes the ops vector and then wants to call ops with the old + * vector. + */ +#define VOCALL(OPSV,OFF,AP) (( *((OPSV)[(OFF)])) (AP)) + +/* + * This call works for vnodes in the kernel. + */ +#define VCALL(VP,OFF,AP) VOCALL((VP)->v_op,(OFF),(AP)) +#define VDESC(OP) (& __CONCAT(OP,_desc)) +#define VOFFSET(OP) (VDESC(OP)->vdesc_offset) + +/* + * Finally, include the default set of vnode operations. + */ +#include + +/* + * Public vnode manipulation functions. 
+ */ +struct file; +struct mount; +struct nameidata; +struct proc; +struct stat; +struct ucred; +struct uio; +struct vattr; +struct vnode; +struct vop_bwrite_args; + +int bdevvp __P((dev_t dev, struct vnode **vpp)); +int getnewvnode __P((enum vtagtype tag, + struct mount *mp, int (**vops)(), struct vnode **vpp)); +int vinvalbuf __P((struct vnode *vp, int save, struct ucred *cred, + struct proc *p, int slpflag, int slptimeo)); +void vattr_null __P((struct vattr *vap)); +int vcount __P((struct vnode *vp)); +int vget __P((struct vnode *vp, int lockflag)); +void vgone __P((struct vnode *vp)); +void vgoneall __P((struct vnode *vp)); +int vn_bwrite __P((struct vop_bwrite_args *ap)); +int vn_close __P((struct vnode *vp, + int flags, struct ucred *cred, struct proc *p)); +int vn_closefile __P((struct file *fp, struct proc *p)); +int vn_ioctl __P((struct file *fp, int com, caddr_t data, struct proc *p)); +int vn_open __P((struct nameidata *ndp, int fmode, int cmode)); +int vn_rdwr __P((enum uio_rw rw, struct vnode *vp, caddr_t base, + int len, off_t offset, enum uio_seg segflg, int ioflg, + struct ucred *cred, int *aresid, struct proc *p)); +int vn_read __P((struct file *fp, struct uio *uio, struct ucred *cred)); +int vn_select __P((struct file *fp, int which, struct proc *p)); +int vn_stat __P((struct vnode *vp, struct stat *sb, struct proc *p)); +int vn_write __P((struct file *fp, struct uio *uio, struct ucred *cred)); +struct vnode * + checkalias __P((struct vnode *vp, dev_t nvp_rdev, struct mount *mp)); +void vput __P((struct vnode *vp)); +void vref __P((struct vnode *vp)); +void vrele __P((struct vnode *vp)); +#endif /* KERNEL */ diff --git a/sys/sys/vsio.h b/sys/sys/vsio.h new file mode 100644 index 00000000000..d84218cc238 --- /dev/null +++ b/sys/sys/vsio.h @@ -0,0 +1,153 @@ +/*- + * Copyright (c) 1987, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vsio.h 8.1 (Berkeley) 6/2/93 + */ + + /**************************************************************************** + * * + * Copyright (c) 1983, 1984 by * + * DIGITAL EQUIPMENT CORPORATION, Maynard, Massachusetts. * + * All rights reserved. 
* + * * + * This software is furnished on an as-is basis and may be used and copied * + * only with inclusion of the above copyright notice. This software or any * + * other copies thereof may be provided or otherwise made available to * + * others only for non-commercial purposes. No title to or ownership of * + * the software is hereby transferred. * + * * + * The information in this software is subject to change without notice * + * and should not be construed as a commitment by DIGITAL EQUIPMENT * + * CORPORATION. * + * * + * DIGITAL assumes no responsibility for the use or reliability of its * + * software on equipment which is not supplied by DIGITAL. * + * * + * * + ****************************************************************************/ +/* + * vsio.h - VS100 I/O command definitions + * + * Author: Christopher A. Kent + * Digital Equipment Corporation + * Western Research Lab + * Date: Tue Jun 21 1983 + */ + +/* + * Possible ioctl calls + */ + +#define VSIOINIT _IO('V', 0) /* init the device */ +#define VSIOSTART _IOW('V', 1, int) /* start microcode */ +#define VSIOABORT _IO('V', 2) /* abort a command chain */ +#define VSIOPWRUP _IO('V', 3) /* power-up reset */ +#define VSIOGETVER _IOR('V', 4, int) /* get rom version */ +#define VSIOSYNC _IO('V', 6) /* synch with device */ +#define VSIOBBACTL _IOW('V', 8, int) /* control the BBA */ +#define VSIOFIBCTL _IOW('V', 9, int) /* lamp on/off */ +#define VSIOFIBRETRY _IOW('V',10, int) /* fiber retries */ +#define VSIOGETSTATS _IOR('V',11, vsStats) /* get statistics */ +#define VSIOGETIOA _IOR('V',13, vsIoAddrAddr)/* get ioreg address */ +#define VSIOUSERWAIT _IO('V', 15) /* wait for user I/O completion */ +#define VSIOWAITGO _IOW('V', 16, caddr_t) /* wait then go */ + + +#define VSIO_OFF 0 /* option off */ +#define VSIO_ON 1 /* option on */ + +#define VS_FIB_FINITE 1 /* finite retries */ +#define VS_FIB_INFINITE 2 /* infinite retries */ + +/* + * Event queue entries + */ + +typedef struct _vs_event{ + u_short 
vse_x; /* x position */ + u_short vse_y; /* y position */ + u_short vse_time; /* 10 millisecond units (button only) */ + char vse_type; /* button or motion? */ + u_char vse_key; /* the key (button only) */ + char vse_direction; /* which direction (button only) */ + char vse_device; /* which device (button only) */ +}vsEvent; + +#define VSE_BUTTON 0 /* button moved */ +#define VSE_MMOTION 1 /* mouse moved */ +#define VSE_TMOTION 2 /* tablet moved */ + +#define VSE_KBTUP 0 /* up */ +#define VSE_KBTDOWN 1 /* down */ + +#define VSE_MOUSE 1 /* mouse */ +#define VSE_DKB 2 /* main keyboard */ +#define VSE_TABLET 3 /* graphics tablet */ +#define VSE_AUX 4 /* auxiliary */ +#define VSE_CONSOLE 5 /* console */ + +typedef struct _vsStats{ + int errors; /* count errors */ + int unsolIntr; /* count unsolicited interrupts */ + int overruns; /* event queue overruns */ + int flashes; /* flashes on fiber link */ + int ignites; /* times turned on */ + int douses; /* times turned off */ + int linkErrors; /* link errors */ +}vsStats; + +typedef struct _vs_cursor{ + short x; + short y; +}vsCursor; + +typedef struct _vs_box { + short bottom; + short right; + short left; + short top; +}vsBox; + +typedef struct _vsIoAddr { + short *ioreg; + short status; + caddr_t obuff; + int obufflen; + int reloc; + vsEvent *ibuff; + int iqsize; /* may assume power of 2 */ + int ihead; /* atomic write */ + int itail; /* atomic read */ + vsCursor mouse; /* atomic read/write */ + vsBox mbox; /* atomic read/write */ +} vsIoAddr; +typedef vsIoAddr *vsIoAddrAddr; diff --git a/sys/sys/wait.h b/sys/sys/wait.h new file mode 100644 index 00000000000..33a68d9f33f --- /dev/null +++ b/sys/sys/wait.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)wait.h 8.1 (Berkeley) 6/2/93 + */ + +/* + * This file holds definitions relevant to the wait4 system call + * and the alternate interfaces that use it (wait, wait3, waitpid). + */ + +/* + * Macros to test the exit status returned by wait + * and extract the relevant values.
+ */ +#ifdef _POSIX_SOURCE +#define _W_INT(i) (i) +#else +#define _W_INT(w) (*(int *)&(w)) /* convert union wait to int */ +#define WCOREFLAG 0200 +#endif + +#define _WSTATUS(x) (_W_INT(x) & 0177) +#define _WSTOPPED 0177 /* _WSTATUS if process is stopped */ +#define WIFSTOPPED(x) (_WSTATUS(x) == _WSTOPPED) +#define WSTOPSIG(x) (_W_INT(x) >> 8) +#define WIFSIGNALED(x) (_WSTATUS(x) != _WSTOPPED && _WSTATUS(x) != 0) +#define WTERMSIG(x) (_WSTATUS(x)) +#define WIFEXITED(x) (_WSTATUS(x) == 0) +#define WEXITSTATUS(x) (_W_INT(x) >> 8) +#ifndef _POSIX_SOURCE +#define WCOREDUMP(x) (_W_INT(x) & WCOREFLAG) + +#define W_EXITCODE(ret, sig) ((ret) << 8 | (sig)) +#define W_STOPCODE(sig) ((sig) << 8 | _WSTOPPED) +#endif + +/* + * Option bits for the third argument of wait4. WNOHANG causes the + * wait to not hang if there are no stopped or terminated processes, rather + * returning an error indication in this case (pid==0). WUNTRACED + * indicates that the caller should receive status about untraced children + * which stop due to signals. If children are stopped and a wait without + * this option is done, it is as though they were still running... nothing + * about them is returned. + */ +#define WNOHANG 1 /* don't hang in wait */ +#define WUNTRACED 2 /* tell about stopped, untraced children */ + +#ifndef _POSIX_SOURCE +/* POSIX extensions and 4.2/4.3 compatibility: */ + +/* + * Tokens for special values of the "pid" parameter to wait4. + */ +#define WAIT_ANY (-1) /* any process */ +#define WAIT_MYPGRP 0 /* any process in my process group */ + +#include + +/* + * Deprecated: + * Structure of the information in the status word returned by wait4. + * If w_stopval==WSTOPPED, then the second structure describes + * the information returned, else the first. + */ +union wait { + int w_status; /* used in syscall */ + /* + * Terminated process status.
+ */ + struct { +#if BYTE_ORDER == LITTLE_ENDIAN + unsigned int w_Termsig:7, /* termination signal */ + w_Coredump:1, /* core dump indicator */ + w_Retcode:8, /* exit code if w_termsig==0 */ + w_Filler:16; /* upper bits filler */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + unsigned int w_Filler:16, /* upper bits filler */ + w_Retcode:8, /* exit code if w_termsig==0 */ + w_Coredump:1, /* core dump indicator */ + w_Termsig:7; /* termination signal */ +#endif + } w_T; + /* + * Stopped process status. Returned + * only for traced children unless requested + * with the WUNTRACED option bit. + */ + struct { +#if BYTE_ORDER == LITTLE_ENDIAN + unsigned int w_Stopval:8, /* == W_STOPPED if stopped */ + w_Stopsig:8, /* signal that stopped us */ + w_Filler:16; /* upper bits filler */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + unsigned int w_Filler:16, /* upper bits filler */ + w_Stopsig:8, /* signal that stopped us */ + w_Stopval:8; /* == W_STOPPED if stopped */ +#endif + } w_S; +}; +#define w_termsig w_T.w_Termsig +#define w_coredump w_T.w_Coredump +#define w_retcode w_T.w_Retcode +#define w_stopval w_S.w_Stopval +#define w_stopsig w_S.w_Stopsig + +#define WSTOPPED _WSTOPPED +#endif /* _POSIX_SOURCE */ + +#ifndef KERNEL +#include +#include + +__BEGIN_DECLS +struct rusage; /* forward declaration */ + +pid_t wait __P((int *)); +pid_t waitpid __P((pid_t, int *, int)); +#ifndef _POSIX_SOURCE +pid_t wait3 __P((int *, int, struct rusage *)); +pid_t wait4 __P((pid_t, int *, int, struct rusage *)); +#endif +__END_DECLS +#endif diff --git a/sys/tools/vnode_if.awk b/sys/tools/vnode_if.awk new file mode 100644 index 00000000000..e190fa04836 --- /dev/null +++ b/sys/tools/vnode_if.awk @@ -0,0 +1,433 @@ +#!/bin/sh - +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. 
Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 +# + +# Script to produce VFS front-end sugar. +# +# usage: vnode_if.sh srcfile +# (where srcfile is currently /sys/kern/vnode_if.src) +# +# These awk scripts are not particularly well written, specifically they +# don't use arrays well and figure out the same information repeatedly. +# Please rewrite them if you actually understand how to use awk. Note, +# they use nawk extensions and gawk's toupper. 
+ +if [ $# -ne 1 ] ; then + echo 'usage: vnode_if.sh srcfile' + exit 1 +fi + +# Name of the source file. +SRC=$1 + +# Names of the created files. +CFILE=vnode_if.c +HEADER=vnode_if.h + +# Awk program (must support nawk extensions and gawk's "toupper") +# Use "awk" at Berkeley, "gawk" elsewhere. +AWK=awk + +# Print out header information for vnode_if.h. +cat << END_OF_LEADING_COMMENT > $HEADER +/* + * This file is produced automatically. + * Do not modify anything in here by hand. + * + * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 + */ + +extern struct vnodeop_desc vop_default_desc; +END_OF_LEADING_COMMENT + +# Awk script to take vnode_if.src and turn it into vnode_if.h. +$AWK ' + NF == 0 || $0 ~ "^#" { + next; + } + { + # Get the function name. + name = $1; + uname = toupper(name); + + # Get the function arguments. + for (c1 = 0;; ++c1) { + if (getline <= 0) + exit + if ($0 ~ "^};") + break; + a[c1] = $0; + } + + # Print out the vop_F_args structure (one a_-prefixed member per argument). + printf("struct %s_args {\n\tstruct vnodeop_desc *a_desc;\n", + name); + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); # index of the first character after any leading stars + printf("%sa_%s\n", + substr(t[c4], 1, beg - 1), substr(t[c4], beg)); + } + printf("};\n"); + + # Print out extern declaration. + printf("extern struct vnodeop_desc %s_desc;\n", name); + + # Print out inline struct.
+ printf("static inline int %s(", uname); + sep = ", "; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = ")\n"; + c3 = split(a[c2], t); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s", substr(t[c3], beg, end - beg), sep); + } + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); # index of the first character after any leading stars + printf("%s%s\n", + substr(t[c4], 1, beg - 1), substr(t[c4], beg)); + } + printf("{\n\tstruct %s_args a;\n\n", name); + printf("\ta.a_desc = VDESC(%s);\n", name); + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("a.a_%s = %s\n", + substr(t[c3], beg, end - beg), substr(t[c3], beg)); + } + c1 = split(a[0], t); + beg = match(t[c1], "[^*]"); + end = match(t[c1], ";"); + printf("\treturn (VCALL(%s, VOFFSET(%s), &a));\n}\n", + substr(t[c1], beg, end - beg), name); + }' < $SRC >> $HEADER + +# Print out header information for vnode_if.c. +cat << END_OF_LEADING_COMMENT > $CFILE +/* + * This file is produced automatically. + * Do not modify anything in here by hand. + * + * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 + */ + +#include +#include +#include + +struct vnodeop_desc vop_default_desc = { + 0, + "default", + 0, + NULL, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; + +END_OF_LEADING_COMMENT + +# Awk script to take vnode_if.src and turn it into vnode_if.c. +$AWK 'function kill_surrounding_ws (s) { + sub (/^[ \t]*/, "", s); + sub (/[ \t]*$/, "", s); + return s; + } + function bail (msg) { print msg | "cat 1>&2"; exit 1; } # report a malformed source line on stderr and stop + function read_args() { + numargs = 0; + while ((getline ln) > 0) { + if (ln ~ /}/) { + break; + }; + + # Delete comments, if any. + gsub (/\/\*.*\*\//, "", ln); + + # Delete leading/trailing space. + ln = kill_surrounding_ws(ln); + + # Pick off direction.
+ if (1 == sub(/^INOUT[ \t]+/, "", ln)) + dir = "INOUT"; + else if (1 == sub(/^IN[ \t]+/, "", ln)) + dir = "IN"; + else if (1 == sub(/^OUT[ \t]+/, "", ln)) + dir = "OUT"; + else + bail("No IN/OUT direction for \"" ln "\"."); + + # check for "WILLRELE" + if (1 == sub(/^WILLRELE[ \t]+/, "", ln)) { + rele = "WILLRELE"; + } else { + rele = "WONTRELE"; + }; + + # kill trailing ; + if (1 != sub (/;$/, "", ln)) { + bail("Missing end-of-line ; in \"" ln "\"."); + }; + + # pick off variable name + if (!(i = match(ln, /[A-Za-z0-9_]+$/))) { + bail("Missing var name \"a_foo\" in \"" ln "\"."); + }; + arg = substr (ln, i); + # Want to cut the name off ln in place, but nawk cannot. + # Hack around this. + ln = substr(ln, 1, i-1); + + # what is left must be type + # (but clean it up some) + type = ln; + gsub (/[ \t]+/, " ", type); # condense whitespace + type = kill_surrounding_ws(type); + + # (boy this was easier in Perl) + + numargs++; + dirs[numargs] = dir; + reles[numargs] = rele; + types[numargs] = type; + args[numargs] = arg; + }; + } + + function generate_operation_vp_offsets() { + printf ("int %s_vp_offsets[] = {\n", name); + # as a side effect, figure out the releflags + releflags = ""; + vpnum = 0; + for (i=1; i<=numargs; i++) { + if (types[i] == "struct vnode *") { + printf ("\tVOPARG_OFFSETOF(struct %s_args,a_%s),\n", + name, args[i]); + if (reles[i] == "WILLRELE") { + releflags = releflags "|VDESC_VP" vpnum "_WILLRELE"; + }; + vpnum++; + }; + }; + sub (/^\|/, "", releflags); + print "\tVDESC_NO_OFFSET"; + print "};"; + } + + function find_arg_with_type (type) { + for (i=1; i<=numargs; i++) { + if (types[i] == type) { + return "VOPARG_OFFSETOF(struct " name "_args,a_" args[i] ")"; + }; + }; + return "VDESC_NO_OFFSET"; + } + + function generate_operation_desc() { + printf ("struct vnodeop_desc %s_desc = {\n", name); + # offset + printf ("\t0,\n"); + # printable name + printf ("\t\"%s\",\n", name); + # flags + vppwillrele = ""; + for (i=1; i<=numargs; i++) { + if (types[i] == "struct vnode **"
&& + (reles[i] == "WILLRELE")) { + vppwillrele = "|VDESC_VPP_WILLRELE"; + }; + }; + if (releflags == "") { + printf ("\t0%s,\n", vppwillrele); + } else { + printf ("\t%s%s,\n", releflags, vppwillrele); + }; + # vp offsets + printf ("\t%s_vp_offsets,\n", name); + # vpp (if any) + printf ("\t%s,\n", find_arg_with_type("struct vnode **")); + # cred (if any) + printf ("\t%s,\n", find_arg_with_type("struct ucred *")); + # proc (if any) + printf ("\t%s,\n", find_arg_with_type("struct proc *")); + # componentname + printf ("\t%s,\n", find_arg_with_type("struct componentname *")); + # transport layer information + printf ("\tNULL,\n};\n"); + } + + NF == 0 || $0 ~ "^#" { + next; + } + { + # get the function name + name = $1; + + # get the function arguments + read_args(); + + # Print out the vop_F_vp_offsets structure. This all depends + # on naming conventions and nothing else. + generate_operation_vp_offsets(); + + # Print out the vnodeop_desc structure. + generate_operation_desc(); + + printf "\n"; + + }' < $SRC >> $CFILE +# THINGS THAT DON'T WORK RIGHT YET. +# +# Two existing BSD vnodeops (bwrite and strategy) don't take any vnodes as +# arguments. This means that these operations can't function successfully +# through a bypass routine. +# +# Bwrite and strategy will be replaced when the VM page/buffer cache +# integration happens. +# +# To get around this problem for now we handle these ops as special cases. 
+ +cat << END_OF_SPECIAL_CASES >> $HEADER +#include +struct vop_strategy_args { + struct vnodeop_desc *a_desc; + struct buf *a_bp; +}; +extern struct vnodeop_desc vop_strategy_desc; +static inline int VOP_STRATEGY(bp) + struct buf *bp; +{ + struct vop_strategy_args a; + + a.a_desc = VDESC(vop_strategy); + a.a_bp = bp; + return (VCALL((bp)->b_vp, VOFFSET(vop_strategy), &a)); +} + +struct vop_bwrite_args { + struct vnodeop_desc *a_desc; + struct buf *a_bp; +}; +extern struct vnodeop_desc vop_bwrite_desc; +static inline int VOP_BWRITE(bp) + struct buf *bp; +{ + struct vop_bwrite_args a; + + a.a_desc = VDESC(vop_bwrite); + a.a_bp = bp; + return (VCALL((bp)->b_vp, VOFFSET(vop_bwrite), &a)); +} +END_OF_SPECIAL_CASES + +cat << END_OF_SPECIAL_CASES >> $CFILE +int vop_strategy_vp_offsets[] = { + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_strategy_desc = { + 0, + "vop_strategy", + 0, + vop_strategy_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; +int vop_bwrite_vp_offsets[] = { + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_bwrite_desc = { + 0, + "vop_bwrite", + 0, + vop_bwrite_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; +END_OF_SPECIAL_CASES + +# Add the vfs_op_descs array to the C file. +$AWK ' + BEGIN { + printf("\nstruct vnodeop_desc *vfs_op_descs[] = {\n"); + printf("\t&vop_default_desc, /* MUST BE FIRST */\n"); + printf("\t&vop_strategy_desc, /* XXX: SPECIAL CASE */\n"); + printf("\t&vop_bwrite_desc, /* XXX: SPECIAL CASE */\n"); + } + END { + printf("\tNULL\n};\n"); + } + NF == 0 || $0 ~ "^#" { + next; + } + { + # Get the function name. + printf("\t&%s_desc,\n", $1); + + # Skip the function arguments. 
+ for (;;) { + if (getline <= 0) + exit + if ($0 ~ "^};") + break; + } + }' < $SRC >> $CFILE + diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c new file mode 100644 index 00000000000..cdd2e4b2b35 --- /dev/null +++ b/sys/ufs/ffs/ffs_alloc.c @@ -0,0 +1,1474 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_alloc.c 8.8 (Berkeley) 2/21/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +extern u_long nextgennumber; + +static daddr_t ffs_alloccg __P((struct inode *, int, daddr_t, int)); +static daddr_t ffs_alloccgblk __P((struct fs *, struct cg *, daddr_t)); +static daddr_t ffs_clusteralloc __P((struct inode *, int, daddr_t, int)); +static ino_t ffs_dirpref __P((struct fs *)); +static daddr_t ffs_fragextend __P((struct inode *, int, long, int, int)); +static void ffs_fserr __P((struct fs *, u_int, char *)); +static u_long ffs_hashalloc + __P((struct inode *, int, long, int, u_long (*)())); +static ino_t ffs_nodealloccg __P((struct inode *, int, daddr_t, int)); +static daddr_t ffs_mapsearch __P((struct fs *, struct cg *, daddr_t, int)); + +/* + * Allocate a block in the file system. + * + * The size of the requested block is given, which must be some + * multiple of fs_fsize and <= fs_bsize. + * A preference may be optionally specified. If a preference is given + * the following hierarchy is used to allocate a block: + * 1) allocate the requested block. + * 2) allocate a rotationally optimal block in the same cylinder. + * 3) allocate a block in the same cylinder group. + * 4) quadratically rehash into other cylinder groups, until an + * available block is located.
+ * If no block preference is given the following hierarchy is used + * to allocate a block: + * 1) allocate a block in the cylinder group that contains the + * inode for the file. + * 2) quadratically rehash into other cylinder groups, until an + * available block is located. + */ +ffs_alloc(ip, lbn, bpref, size, cred, bnp) + register struct inode *ip; + daddr_t lbn, bpref; + int size; + struct ucred *cred; + daddr_t *bnp; +{ + register struct fs *fs; + daddr_t bno; + int cg, error; + + *bnp = 0; + fs = ip->i_fs; +#ifdef DIAGNOSTIC + if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { + printf("dev = 0x%x, bsize = %d, size = %d, fs = %s\n", + ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt); + panic("ffs_alloc: bad size"); + } + if (cred == NOCRED) + panic("ffs_alloc: missing credential\n"); +#endif /* DIAGNOSTIC */ + if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) + goto nospace; + if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0) + goto nospace; +#ifdef QUOTA + if (error = chkdq(ip, (long)btodb(size), cred, 0)) + return (error); +#endif + if (bpref >= fs->fs_size) + bpref = 0; + if (bpref == 0) + cg = ino_to_cg(fs, ip->i_number); + else + cg = dtog(fs, bpref); + bno = (daddr_t)ffs_hashalloc(ip, cg, (long)bpref, size, + (u_long (*)())ffs_alloccg); + if (bno > 0) { + ip->i_blocks += btodb(size); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + *bnp = bno; + return (0); + } +#ifdef QUOTA + /* + * Restore user's disk quota because allocation failed. + */ + (void) chkdq(ip, (long)-btodb(size), cred, FORCE); +#endif +nospace: + ffs_fserr(fs, cred->cr_uid, "file system full"); + uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); + return (ENOSPC); +} + +/* + * Reallocate a fragment to a bigger size + * + * The number and size of the old block is given, and a preference + * and new size is also specified. The allocator attempts to extend + * the original block.
Failing that, the regular block allocator is + * invoked to get an appropriate block. + */ +ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) + register struct inode *ip; + daddr_t lbprev; + daddr_t bpref; + int osize, nsize; + struct ucred *cred; + struct buf **bpp; +{ + register struct fs *fs; + struct buf *bp; + int cg, request, error; + daddr_t bprev, bno; + + *bpp = 0; + fs = ip->i_fs; +#ifdef DIAGNOSTIC + if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || + (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { + printf( + "dev = 0x%x, bsize = %d, osize = %d, nsize = %d, fs = %s\n", + ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt); + panic("ffs_realloccg: bad size"); + } + if (cred == NOCRED) + panic("ffs_realloccg: missing credential\n"); +#endif /* DIAGNOSTIC */ + if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0) + goto nospace; + if ((bprev = ip->i_db[lbprev]) == 0) { + printf("dev = 0x%x, bsize = %d, bprev = %d, fs = %s\n", + ip->i_dev, fs->fs_bsize, bprev, fs->fs_fsmnt); + panic("ffs_realloccg: bad bprev"); + } + /* + * Read in the old fragment; its buffer is grown to nsize below. + */ + if (error = bread(ITOV(ip), lbprev, osize, NOCRED, &bp)) { + brelse(bp); + return (error); + } +#ifdef QUOTA + if (error = chkdq(ip, (long)btodb(nsize - osize), cred, 0)) { + brelse(bp); + return (error); + } +#endif + /* + * Check for extension in the existing location. + */ + cg = dtog(fs, bprev); + if (bno = ffs_fragextend(ip, cg, (long)bprev, osize, nsize)) { + if (bp->b_blkno != fsbtodb(fs, bno)) + panic("bad blockno"); + ip->i_blocks += btodb(nsize - osize); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + allocbuf(bp, nsize); + bp->b_flags |= B_DONE; + bzero((char *)bp->b_data + osize, (u_int)nsize - osize); + *bpp = bp; + return (0); + } + /* + * Allocate a new disk location. + */ + if (bpref >= fs->fs_size) + bpref = 0; + switch ((int)fs->fs_optim) { + case FS_OPTSPACE: + /* + * Allocate an exact sized fragment.
Although this makes + * best use of space, we will waste time relocating it if + * the file continues to grow. If the fragmentation is + * less than half of the minimum free reserve, we choose + * to begin optimizing for time. + */ + request = nsize; + if (fs->fs_minfree < 5 || + fs->fs_cstotal.cs_nffree > + fs->fs_dsize * fs->fs_minfree / (2 * 100)) + break; + log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n", + fs->fs_fsmnt); + fs->fs_optim = FS_OPTTIME; + break; + case FS_OPTTIME: + /* + * At this point we have discovered a file that is trying to + * grow a small fragment to a larger fragment. To save time, + * we allocate a full sized block, then free the unused portion. + * If the file continues to grow, the `ffs_fragextend' call + * above will be able to grow it in place without further + * copying. If aberrant programs cause disk fragmentation to + * grow within 2% of the free reserve, we choose to begin + * optimizing for space. + */ + request = fs->fs_bsize; + if (fs->fs_cstotal.cs_nffree < + fs->fs_dsize * (fs->fs_minfree - 2) / 100) + break; + log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n", + fs->fs_fsmnt); + fs->fs_optim = FS_OPTSPACE; + break; + default: + printf("dev = 0x%x, optim = %d, fs = %s\n", + ip->i_dev, fs->fs_optim, fs->fs_fsmnt); + panic("ffs_realloccg: bad optim"); + /* NOTREACHED */ + } + bno = (daddr_t)ffs_hashalloc(ip, cg, (long)bpref, request, + (u_long (*)())ffs_alloccg); + if (bno > 0) { + bp->b_blkno = fsbtodb(fs, bno); + (void) vnode_pager_uncache(ITOV(ip)); + ffs_blkfree(ip, bprev, (long)osize); + if (nsize < request) + ffs_blkfree(ip, bno + numfrags(fs, nsize), + (long)(request - nsize)); + ip->i_blocks += btodb(nsize - osize); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + allocbuf(bp, nsize); + bp->b_flags |= B_DONE; + bzero((char *)bp->b_data + osize, (u_int)nsize - osize); + *bpp = bp; + return (0); + } +#ifdef QUOTA + /* + * Restore user's disk quota because allocation failed.
+ */ + (void) chkdq(ip, (long)-btodb(nsize - osize), cred, FORCE); +#endif + brelse(bp); +nospace: + /* + * no space available: log it, tell the user, and fail with ENOSPC + */ + ffs_fserr(fs, cred->cr_uid, "file system full"); + uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); + return (ENOSPC); +} + +/* + * Reallocate a sequence of blocks into a contiguous sequence of blocks. + * + * The vnode and an array of buffer pointers for a range of sequential + * logical blocks to be made contiguous is given. The allocator attempts + * to find a range of sequential blocks starting as close as possible to + * an fs_rotdelay offset from the end of the allocation for the logical + * block immediately preceding the current range. If successful, the + * physical block numbers in the buffer pointers and in the inode are + * changed to reflect the new allocation. If unsuccessful, the allocation + * is left unchanged. The success in doing the reallocation is returned. + * Note that the error return is not reflected back to the user. Rather + * the previous block allocation will be used.
+ */ +#include +int doasyncfree = 1; +struct ctldebug debug14 = { "doasyncfree", &doasyncfree }; +int +ffs_reallocblks(ap) + struct vop_reallocblks_args /* { + struct vnode *a_vp; + struct cluster_save *a_buflist; + } */ *ap; +{ + struct fs *fs; + struct inode *ip; + struct vnode *vp; + struct buf *sbp, *ebp; + daddr_t *bap, *sbap, *ebap; + struct cluster_save *buflist; + daddr_t start_lbn, end_lbn, soff, eoff, newblk, blkno; + struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; + int i, len, start_lvl, end_lvl, pref, ssize; + + vp = ap->a_vp; + ip = VTOI(vp); + fs = ip->i_fs; + if (fs->fs_contigsumsize <= 0) + return (ENOSPC); + buflist = ap->a_buflist; + len = buflist->bs_nchildren; + start_lbn = buflist->bs_children[0]->b_lblkno; + end_lbn = start_lbn + len - 1; +#ifdef DIAGNOSTIC + for (i = 1; i < len; i++) + if (buflist->bs_children[i]->b_lblkno != start_lbn + i) + panic("ffs_reallocblks: non-cluster"); +#endif + /* + * If the latest allocation is in a new cylinder group, assume that + * the filesystem has decided to move and do not force it back to + * the previous cylinder group. + */ + if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != + dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) + return (ENOSPC); + if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || + ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) + return (ENOSPC); + /* + * Get the starting offset and block map for the first block. + */ + if (start_lvl == 0) { + sbap = &ip->i_db[0]; + soff = start_lbn; + } else { + idp = &start_ap[start_lvl - 1]; + if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { + brelse(sbp); + return (ENOSPC); + } + sbap = (daddr_t *)sbp->b_data; + soff = idp->in_off; + } + /* + * Find the preferred location for the cluster. + */ + pref = ffs_blkpref(ip, start_lbn, soff, sbap); + /* + * If the block range spans two block maps, get the second map. 
+ */ + if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { + ssize = len; + } else { +#ifdef DIAGNOSTIC + if (start_ap[start_lvl-1].in_lbn == idp->in_lbn) + panic("ffs_reallocblk: start == end"); +#endif + ssize = len - (idp->in_off + 1); + if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) + goto fail; + ebap = (daddr_t *)ebp->b_data; + } + /* + * Search the block map looking for an allocation of the desired size. + */ + if ((newblk = (daddr_t)ffs_hashalloc(ip, dtog(fs, pref), (long)pref, + len, (u_long (*)())ffs_clusteralloc)) == 0) + goto fail; + /* + * We have found a new contiguous block. + * + * First we have to replace the old block pointers with the new + * block pointers in the inode and indirect blocks associated + * with the file. + */ + blkno = newblk; + for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { + if (i == ssize) + bap = ebap; +#ifdef DIAGNOSTIC + if (buflist->bs_children[i]->b_blkno != fsbtodb(fs, *bap)) + panic("ffs_reallocblks: alloc mismatch"); +#endif + *bap++ = blkno; + } + /* + * Next we must write out the modified inode and indirect blocks. + * For strict correctness, the writes should be synchronous since + * the old block values may have been written to disk. In practise + * they are almost never written, but if we are concerned about + * strict correctness, the `doasyncfree' flag should be set to zero. + * + * The test on `doasyncfree' should be changed to test a flag + * that shows whether the associated buffers and inodes have + * been written. The flag should be set when the cluster is + * started and cleared whenever the buffer or inode is flushed. + * We can then check below to see if it is set, and do the + * synchronous write only when it has been cleared. 
+ */ + if (sbap != &ip->i_db[0]) { + if (doasyncfree) + bdwrite(sbp); + else + bwrite(sbp); + } else { + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (!doasyncfree) + VOP_UPDATE(vp, &time, &time, MNT_WAIT); + } + if (ssize < len) + if (doasyncfree) + bdwrite(ebp); + else + bwrite(ebp); + /* + * Last, free the old blocks and assign the new blocks to the buffers. + */ + for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { + ffs_blkfree(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), + fs->fs_bsize); + buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); + } + return (0); + +fail: + if (ssize < len) + brelse(ebp); + if (sbap != &ip->i_db[0]) + brelse(sbp); + return (ENOSPC); +} + +/* + * Allocate an inode in the file system. + * + * If allocating a directory, use ffs_dirpref to select the inode. + * If allocating in a directory, the following hierarchy is followed: + * 1) allocate the preferred inode. + * 2) allocate an inode in the same cylinder group. + * 3) quadradically rehash into other cylinder groups, until an + * available inode is located. + * If no inode preference is given the following heirarchy is used + * to allocate an inode: + * 1) allocate an inode in cylinder group 0. + * 2) quadradically rehash into other cylinder groups, until an + * available inode is located. 
+ */ +ffs_valloc(ap) + struct vop_valloc_args /* { + struct vnode *a_pvp; + int a_mode; + struct ucred *a_cred; + struct vnode **a_vpp; + } */ *ap; +{ + register struct vnode *pvp = ap->a_pvp; + register struct inode *pip; + register struct fs *fs; + register struct inode *ip; + mode_t mode = ap->a_mode; + ino_t ino, ipref; + int cg, error; + + *ap->a_vpp = NULL; + pip = VTOI(pvp); + fs = pip->i_fs; + if (fs->fs_cstotal.cs_nifree == 0) + goto noinodes; + + if ((mode & IFMT) == IFDIR) + ipref = ffs_dirpref(fs); + else + ipref = pip->i_number; + if (ipref >= fs->fs_ncg * fs->fs_ipg) + ipref = 0; + cg = ino_to_cg(fs, ipref); + ino = (ino_t)ffs_hashalloc(pip, cg, (long)ipref, mode, ffs_nodealloccg); + if (ino == 0) + goto noinodes; + error = VFS_VGET(pvp->v_mount, ino, ap->a_vpp); + if (error) { + VOP_VFREE(pvp, ino, mode); + return (error); + } + ip = VTOI(*ap->a_vpp); + if (ip->i_mode) { + printf("mode = 0%o, inum = %d, fs = %s\n", + ip->i_mode, ip->i_number, fs->fs_fsmnt); + panic("ffs_valloc: dup alloc"); + } + if (ip->i_blocks) { /* XXX */ + printf("free inode %s/%d had %d blocks\n", + fs->fs_fsmnt, ino, ip->i_blocks); + ip->i_blocks = 0; + } + ip->i_flags = 0; + /* + * Set up a new generation number for this inode. + */ + if (++nextgennumber < (u_long)time.tv_sec) + nextgennumber = time.tv_sec; + ip->i_gen = nextgennumber; + return (0); +noinodes: + ffs_fserr(fs, ap->a_cred->cr_uid, "out of inodes"); + uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt); + return (ENOSPC); +} + +/* + * Find a cylinder to place a directory. + * + * The policy implemented by this algorithm is to select from + * among those cylinder groups with above the average number of + * free inodes, the one with the smallest number of directories. 
+ */ +static ino_t +ffs_dirpref(fs) + register struct fs *fs; +{ + int cg, minndir, mincg, avgifree; + + avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; + minndir = fs->fs_ipg; + mincg = 0; + for (cg = 0; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_ndir < minndir && + fs->fs_cs(fs, cg).cs_nifree >= avgifree) { + mincg = cg; + minndir = fs->fs_cs(fs, cg).cs_ndir; + } + return ((ino_t)(fs->fs_ipg * mincg)); +} + +/* + * Select the desired position for the next block in a file. The file is + * logically divided into sections. The first section is composed of the + * direct blocks. Each additional section contains fs_maxbpg blocks. + * + * If no blocks have been allocated in the first section, the policy is to + * request a block in the same cylinder group as the inode that describes + * the file. If no blocks have been allocated in any other section, the + * policy is to place the section in a cylinder group with a greater than + * average number of free blocks. An appropriate cylinder group is found + * by using a rotor that sweeps the cylinder groups. When a new group of + * blocks is needed, the sweep begins in the cylinder group following the + * cylinder group from which the previous allocation was made. The sweep + * continues until a cylinder group with greater than the average number + * of free blocks is found. If the allocation is for the first block in an + * indirect block, the information on the previous allocation is unavailable; + * here a best guess is made based upon the logical block number being + * allocated. + * + * If a section is already partially allocated, the policy is to + * contiguously allocate fs_maxcontig blocks. The end of one of these + * contiguous blocks and the beginning of the next is physically separated + * so that the disk head will be in transit between them for at least + * fs_rotdelay milliseconds. This is to allow time for the processor to + * schedule another I/O transfer. 
+ */ +daddr_t +ffs_blkpref(ip, lbn, indx, bap) + struct inode *ip; + daddr_t lbn; + int indx; + daddr_t *bap; +{ + register struct fs *fs; + register int cg; + int avgbfree, startcg; + daddr_t nextblk; + + fs = ip->i_fs; + if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { + if (lbn < NDADDR) { + cg = ino_to_cg(fs, ip->i_number); + return (fs->fs_fpg * cg + fs->fs_frag); + } + /* + * Find a cylinder with greater than average number of + * unused data blocks. + */ + if (indx == 0 || bap[indx - 1] == 0) + startcg = + ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; + else + startcg = dtog(fs, bap[indx - 1]) + 1; + startcg %= fs->fs_ncg; + avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; + for (cg = startcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + fs->fs_cgrotor = cg; + return (fs->fs_fpg * cg + fs->fs_frag); + } + for (cg = 0; cg <= startcg; cg++) + if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + fs->fs_cgrotor = cg; + return (fs->fs_fpg * cg + fs->fs_frag); + } + return (NULL); + } + /* + * One or more previous blocks have been laid out. If less + * than fs_maxcontig previous blocks are contiguous, the + * next block is requested contiguously, otherwise it is + * requested rotationally delayed by fs_rotdelay milliseconds. + */ + nextblk = bap[indx - 1] + fs->fs_frag; + if (indx < fs->fs_maxcontig || bap[indx - fs->fs_maxcontig] + + blkstofrags(fs, fs->fs_maxcontig) != nextblk) + return (nextblk); + if (fs->fs_rotdelay != 0) + /* + * Here we convert ms of delay to frags as: + * (frags) = (ms) * (rev/sec) * (sect/rev) / + * ((sect/frag) * (ms/sec)) + * then round up to the next block. + */ + nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect / + (NSPF(fs) * 1000), fs->fs_frag); + return (nextblk); +} + +/* + * Implement the cylinder overflow algorithm. + * + * The policy implemented by this algorithm is: + * 1) allocate the block in its requested cylinder group. 
+ * 2) quadradically rehash on the cylinder group number. + * 3) brute force search for a free block. + */ +/*VARARGS5*/ +static u_long +ffs_hashalloc(ip, cg, pref, size, allocator) + struct inode *ip; + int cg; + long pref; + int size; /* size for data blocks, mode for inodes */ + u_long (*allocator)(); +{ + register struct fs *fs; + long result; + int i, icg = cg; + + fs = ip->i_fs; + /* + * 1: preferred cylinder group + */ + result = (*allocator)(ip, cg, pref, size); + if (result) + return (result); + /* + * 2: quadratic rehash + */ + for (i = 1; i < fs->fs_ncg; i *= 2) { + cg += i; + if (cg >= fs->fs_ncg) + cg -= fs->fs_ncg; + result = (*allocator)(ip, cg, 0, size); + if (result) + return (result); + } + /* + * 3: brute force search + * Note that we start at i == 2, since 0 was checked initially, + * and 1 is always checked in the quadratic rehash. + */ + cg = (icg + 2) % fs->fs_ncg; + for (i = 2; i < fs->fs_ncg; i++) { + result = (*allocator)(ip, cg, 0, size); + if (result) + return (result); + cg++; + if (cg == fs->fs_ncg) + cg = 0; + } + return (NULL); +} + +/* + * Determine whether a fragment can be extended. + * + * Check to see if the necessary fragments are available, and + * if they are, allocate them. 
+ */ +static daddr_t +ffs_fragextend(ip, cg, bprev, osize, nsize) + struct inode *ip; + int cg; + long bprev; + int osize, nsize; +{ + register struct fs *fs; + register struct cg *cgp; + struct buf *bp; + long bno; + int frags, bbase; + int i, error; + + fs = ip->i_fs; + if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) + return (NULL); + frags = numfrags(fs, nsize); + bbase = fragnum(fs, bprev); + if (bbase > fragnum(fs, (bprev + frags - 1))) { + /* cannot extend across a block boundary */ + return (NULL); + } + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) { + brelse(bp); + return (NULL); + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp)) { + brelse(bp); + return (NULL); + } + cgp->cg_time = time.tv_sec; + bno = dtogd(fs, bprev); + for (i = numfrags(fs, osize); i < frags; i++) + if (isclr(cg_blksfree(cgp), bno + i)) { + brelse(bp); + return (NULL); + } + /* + * the current fragment can be extended + * deduct the count on fragment being extended into + * increase the count on the remaining fragment (if any) + * allocate the extended piece + */ + for (i = frags; i < fs->fs_frag - bbase; i++) + if (isclr(cg_blksfree(cgp), bno + i)) + break; + cgp->cg_frsum[i - numfrags(fs, osize)]--; + if (i != frags) + cgp->cg_frsum[i - frags]++; + for (i = numfrags(fs, osize); i < frags; i++) { + clrbit(cg_blksfree(cgp), bno + i); + cgp->cg_cs.cs_nffree--; + fs->fs_cstotal.cs_nffree--; + fs->fs_cs(fs, cg).cs_nffree--; + } + fs->fs_fmod = 1; + bdwrite(bp); + return (bprev); +} + +/* + * Determine whether a block can be allocated. + * + * Check to see if a block of the appropriate size is available, + * and if it is, allocate it. 
+ */ +static daddr_t +ffs_alloccg(ip, cg, bpref, size) + struct inode *ip; + int cg; + daddr_t bpref; + int size; +{ + register struct fs *fs; + register struct cg *cgp; + struct buf *bp; + register int i; + int error, bno, frags, allocsiz; + + fs = ip->i_fs; + if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) + return (NULL); + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) { + brelse(bp); + return (NULL); + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp) || + (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) { + brelse(bp); + return (NULL); + } + cgp->cg_time = time.tv_sec; + if (size == fs->fs_bsize) { + bno = ffs_alloccgblk(fs, cgp, bpref); + bdwrite(bp); + return (bno); + } + /* + * check to see if any fragments are already available + * allocsiz is the size which will be allocated, hacking + * it down to a smaller size if necessary + */ + frags = numfrags(fs, size); + for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) + if (cgp->cg_frsum[allocsiz] != 0) + break; + if (allocsiz == fs->fs_frag) { + /* + * no fragments were available, so a block will be + * allocated, and hacked up + */ + if (cgp->cg_cs.cs_nbfree == 0) { + brelse(bp); + return (NULL); + } + bno = ffs_alloccgblk(fs, cgp, bpref); + bpref = dtogd(fs, bno); + for (i = frags; i < fs->fs_frag; i++) + setbit(cg_blksfree(cgp), bpref + i); + i = fs->fs_frag - frags; + cgp->cg_cs.cs_nffree += i; + fs->fs_cstotal.cs_nffree += i; + fs->fs_cs(fs, cg).cs_nffree += i; + fs->fs_fmod = 1; + cgp->cg_frsum[i]++; + bdwrite(bp); + return (bno); + } + bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); + if (bno < 0) { + brelse(bp); + return (NULL); + } + for (i = 0; i < frags; i++) + clrbit(cg_blksfree(cgp), bno + i); + cgp->cg_cs.cs_nffree -= frags; + fs->fs_cstotal.cs_nffree -= frags; + fs->fs_cs(fs, cg).cs_nffree -= frags; + fs->fs_fmod = 1; + cgp->cg_frsum[allocsiz]--; + if (frags != allocsiz) + cgp->cg_frsum[allocsiz - 
frags]++; + bdwrite(bp); + return (cg * fs->fs_fpg + bno); +} + +/* + * Allocate a block in a cylinder group. + * + * This algorithm implements the following policy: + * 1) allocate the requested block. + * 2) allocate a rotationally optimal block in the same cylinder. + * 3) allocate the next available block on the block rotor for the + * specified cylinder group. + * Note that this routine only allocates fs_bsize blocks; these + * blocks may be fragmented by the routine that allocates them. + */ +static daddr_t +ffs_alloccgblk(fs, cgp, bpref) + register struct fs *fs; + register struct cg *cgp; + daddr_t bpref; +{ + daddr_t bno, blkno; + int cylno, pos, delta; + short *cylbp; + register int i; + + if (bpref == 0 || dtog(fs, bpref) != cgp->cg_cgx) { + bpref = cgp->cg_rotor; + goto norot; + } + bpref = blknum(fs, bpref); + bpref = dtogd(fs, bpref); + /* + * if the requested block is available, use it + */ + if (ffs_isblock(fs, cg_blksfree(cgp), fragstoblks(fs, bpref))) { + bno = bpref; + goto gotit; + } + /* + * check for a block available on the same cylinder + */ + cylno = cbtocylno(fs, bpref); + if (cg_blktot(cgp)[cylno] == 0) + goto norot; + if (fs->fs_cpc == 0) { + /* + * Block layout information is not available. + * Leaving bpref unchanged means we take the + * next available free block following the one + * we just allocated. Hopefully this will at + * least hit a track cache on drives of unknown + * geometry (e.g. SCSI). + */ + goto norot; + } + /* + * check the summary information to see if a block is + * available in the requested cylinder starting at the + * requested rotational position and proceeding around. + */ + cylbp = cg_blks(fs, cgp, cylno); + pos = cbtorpos(fs, bpref); + for (i = pos; i < fs->fs_nrpos; i++) + if (cylbp[i] > 0) + break; + if (i == fs->fs_nrpos) + for (i = 0; i < pos; i++) + if (cylbp[i] > 0) + break; + if (cylbp[i] > 0) { + /* + * found a rotational position, now find the actual + * block. A panic if none is actually there. 
+ */ + pos = cylno % fs->fs_cpc; + bno = (cylno - pos) * fs->fs_spc / NSPB(fs); + if (fs_postbl(fs, pos)[i] == -1) { + printf("pos = %d, i = %d, fs = %s\n", + pos, i, fs->fs_fsmnt); + panic("ffs_alloccgblk: cyl groups corrupted"); + } + for (i = fs_postbl(fs, pos)[i];; ) { + if (ffs_isblock(fs, cg_blksfree(cgp), bno + i)) { + bno = blkstofrags(fs, (bno + i)); + goto gotit; + } + delta = fs_rotbl(fs)[i]; + if (delta <= 0 || + delta + i > fragstoblks(fs, fs->fs_fpg)) + break; + i += delta; + } + printf("pos = %d, i = %d, fs = %s\n", pos, i, fs->fs_fsmnt); + panic("ffs_alloccgblk: can't find blk in cyl"); + } +norot: + /* + * no blocks in the requested cylinder, so take next + * available one in this cylinder group. + */ + bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); + if (bno < 0) + return (NULL); + cgp->cg_rotor = bno; +gotit: + blkno = fragstoblks(fs, bno); + ffs_clrblock(fs, cg_blksfree(cgp), (long)blkno); + ffs_clusteracct(fs, cgp, blkno, -1); + cgp->cg_cs.cs_nbfree--; + fs->fs_cstotal.cs_nbfree--; + fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; + cylno = cbtocylno(fs, bno); + cg_blks(fs, cgp, cylno)[cbtorpos(fs, bno)]--; + cg_blktot(cgp)[cylno]--; + fs->fs_fmod = 1; + return (cgp->cg_cgx * fs->fs_fpg + bno); +} + +/* + * Determine whether a cluster can be allocated. + * + * We do not currently check for optimal rotational layout if there + * are multiple choices in the same cylinder group. Instead we just + * take the first one that we find following bpref. 
+ */ +static daddr_t +ffs_clusteralloc(ip, cg, bpref, len) + struct inode *ip; + int cg; + daddr_t bpref; + int len; +{ + register struct fs *fs; + register struct cg *cgp; + struct buf *bp; + int i, run, bno, bit, map; + u_char *mapp; + + fs = ip->i_fs; + if (fs->fs_cs(fs, cg).cs_nbfree < len) + return (NULL); + if (bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, + NOCRED, &bp)) + goto fail; + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp)) + goto fail; + /* + * Check to see if a cluster of the needed size (or bigger) is + * available in this cylinder group. + */ + for (i = len; i <= fs->fs_contigsumsize; i++) + if (cg_clustersum(cgp)[i] > 0) + break; + if (i > fs->fs_contigsumsize) + goto fail; + /* + * Search the cluster map to find a big enough cluster. + * We take the first one that we find, even if it is larger + * than we need as we prefer to get one close to the previous + * block allocation. We do not search before the current + * preference point as we do not want to allocate a block + * that is allocated before the previous one (as we will + * then have to wait for another pass of the elevator + * algorithm before it will be read). We prefer to fail and + * be recalled to try an allocation in the next cylinder group. + */ + if (dtog(fs, bpref) != cg) + bpref = 0; + else + bpref = fragstoblks(fs, dtogd(fs, blknum(fs, bpref))); + mapp = &cg_clustersfree(cgp)[bpref / NBBY]; + map = *mapp++; + bit = 1 << (bpref % NBBY); + for (run = 0, i = bpref; i < cgp->cg_nclusterblks; i++) { + if ((map & bit) == 0) { + run = 0; + } else { + run++; + if (run == len) + break; + } + if ((i & (NBBY - 1)) != (NBBY - 1)) { + bit <<= 1; + } else { + map = *mapp++; + bit = 1; + } + } + if (i == cgp->cg_nclusterblks) + goto fail; + /* + * Allocate the cluster that we have found. 
+ */ + bno = cg * fs->fs_fpg + blkstofrags(fs, i - run + 1); + len = blkstofrags(fs, len); + for (i = 0; i < len; i += fs->fs_frag) + if (ffs_alloccgblk(fs, cgp, bno + i) != bno + i) + panic("ffs_clusteralloc: lost block"); + brelse(bp); + return (bno); + +fail: + brelse(bp); + return (0); +} + +/* + * Determine whether an inode can be allocated. + * + * Check to see if an inode is available, and if it is, + * allocate it using the following policy: + * 1) allocate the requested inode. + * 2) allocate the next available inode after the requested + * inode in the specified cylinder group. + */ +static ino_t +ffs_nodealloccg(ip, cg, ipref, mode) + struct inode *ip; + int cg; + daddr_t ipref; + int mode; +{ + register struct fs *fs; + register struct cg *cgp; + struct buf *bp; + int error, start, len, loc, map, i; + + fs = ip->i_fs; + if (fs->fs_cs(fs, cg).cs_nifree == 0) + return (NULL); + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) { + brelse(bp); + return (NULL); + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) { + brelse(bp); + return (NULL); + } + cgp->cg_time = time.tv_sec; + if (ipref) { + ipref %= fs->fs_ipg; + if (isclr(cg_inosused(cgp), ipref)) + goto gotit; + } + start = cgp->cg_irotor / NBBY; + len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); + loc = skpc(0xff, len, &cg_inosused(cgp)[start]); + if (loc == 0) { + len = start + 1; + start = 0; + loc = skpc(0xff, len, &cg_inosused(cgp)[0]); + if (loc == 0) { + printf("cg = %d, irotor = %d, fs = %s\n", + cg, cgp->cg_irotor, fs->fs_fsmnt); + panic("ffs_nodealloccg: map corrupted"); + /* NOTREACHED */ + } + } + i = start + len - loc; + map = cg_inosused(cgp)[i]; + ipref = i * NBBY; + for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) { + if ((map & i) == 0) { + cgp->cg_irotor = ipref; + goto gotit; + } + } + printf("fs = %s\n", fs->fs_fsmnt); + panic("ffs_nodealloccg: block not in map"); + /* NOTREACHED */ +gotit: 
+ setbit(cg_inosused(cgp), ipref); + cgp->cg_cs.cs_nifree--; + fs->fs_cstotal.cs_nifree--; + fs->fs_cs(fs, cg).cs_nifree--; + fs->fs_fmod = 1; + if ((mode & IFMT) == IFDIR) { + cgp->cg_cs.cs_ndir++; + fs->fs_cstotal.cs_ndir++; + fs->fs_cs(fs, cg).cs_ndir++; + } + bdwrite(bp); + return (cg * fs->fs_ipg + ipref); +} + +/* + * Free a block or fragment. + * + * The specified block or fragment is placed back in the + * free map. If a fragment is deallocated, a possible + * block reassembly is checked. + */ +ffs_blkfree(ip, bno, size) + register struct inode *ip; + daddr_t bno; + long size; +{ + register struct fs *fs; + register struct cg *cgp; + struct buf *bp; + daddr_t blkno; + int i, error, cg, blk, frags, bbase; + + fs = ip->i_fs; + if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { + printf("dev = 0x%x, bsize = %d, size = %d, fs = %s\n", + ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt); + panic("blkfree: bad size"); + } + cg = dtog(fs, bno); + if ((u_int)bno >= fs->fs_size) { + printf("bad block %d, ino %d\n", bno, ip->i_number); + ffs_fserr(fs, ip->i_uid, "bad block"); + return; + } + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) { + brelse(bp); + return; + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp)) { + brelse(bp); + return; + } + cgp->cg_time = time.tv_sec; + bno = dtogd(fs, bno); + if (size == fs->fs_bsize) { + blkno = fragstoblks(fs, bno); + if (ffs_isblock(fs, cg_blksfree(cgp), blkno)) { + printf("dev = 0x%x, block = %d, fs = %s\n", + ip->i_dev, bno, fs->fs_fsmnt); + panic("blkfree: freeing free block"); + } + ffs_setblock(fs, cg_blksfree(cgp), blkno); + ffs_clusteracct(fs, cgp, blkno, 1); + cgp->cg_cs.cs_nbfree++; + fs->fs_cstotal.cs_nbfree++; + fs->fs_cs(fs, cg).cs_nbfree++; + i = cbtocylno(fs, bno); + cg_blks(fs, cgp, i)[cbtorpos(fs, bno)]++; + cg_blktot(cgp)[i]++; + } else { + bbase = bno - fragnum(fs, bno); + /* + * decrement the counts associated with the old frags + */ + 
blk = blkmap(fs, cg_blksfree(cgp), bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, -1); + /* + * deallocate the fragment + */ + frags = numfrags(fs, size); + for (i = 0; i < frags; i++) { + if (isset(cg_blksfree(cgp), bno + i)) { + printf("dev = 0x%x, block = %d, fs = %s\n", + ip->i_dev, bno + i, fs->fs_fsmnt); + panic("blkfree: freeing free frag"); + } + setbit(cg_blksfree(cgp), bno + i); + } + cgp->cg_cs.cs_nffree += i; + fs->fs_cstotal.cs_nffree += i; + fs->fs_cs(fs, cg).cs_nffree += i; + /* + * add back in counts associated with the new frags + */ + blk = blkmap(fs, cg_blksfree(cgp), bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, 1); + /* + * if a complete block has been reassembled, account for it + */ + blkno = fragstoblks(fs, bbase); + if (ffs_isblock(fs, cg_blksfree(cgp), blkno)) { + cgp->cg_cs.cs_nffree -= fs->fs_frag; + fs->fs_cstotal.cs_nffree -= fs->fs_frag; + fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; + ffs_clusteracct(fs, cgp, blkno, 1); + cgp->cg_cs.cs_nbfree++; + fs->fs_cstotal.cs_nbfree++; + fs->fs_cs(fs, cg).cs_nbfree++; + i = cbtocylno(fs, bbase); + cg_blks(fs, cgp, i)[cbtorpos(fs, bbase)]++; + cg_blktot(cgp)[i]++; + } + } + fs->fs_fmod = 1; + bdwrite(bp); +} + +/* + * Free an inode. + * + * The specified inode is placed back in the free map. 
+ */ +int +ffs_vfree(ap) + struct vop_vfree_args /* { + struct vnode *a_pvp; + ino_t a_ino; + int a_mode; + } */ *ap; +{ + register struct fs *fs; + register struct cg *cgp; + register struct inode *pip; + ino_t ino = ap->a_ino; + struct buf *bp; + int error, cg; + + pip = VTOI(ap->a_pvp); + fs = pip->i_fs; + if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg) + panic("ifree: range: dev = 0x%x, ino = %d, fs = %s\n", + pip->i_dev, ino, fs->fs_fsmnt); + cg = ino_to_cg(fs, ino); + error = bread(pip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) { + brelse(bp); + return (0); + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp)) { + brelse(bp); + return (0); + } + cgp->cg_time = time.tv_sec; + ino %= fs->fs_ipg; + if (isclr(cg_inosused(cgp), ino)) { + printf("dev = 0x%x, ino = %d, fs = %s\n", + pip->i_dev, ino, fs->fs_fsmnt); + if (fs->fs_ronly == 0) + panic("ifree: freeing free inode"); + } + clrbit(cg_inosused(cgp), ino); + if (ino < cgp->cg_irotor) + cgp->cg_irotor = ino; + cgp->cg_cs.cs_nifree++; + fs->fs_cstotal.cs_nifree++; + fs->fs_cs(fs, cg).cs_nifree++; + if ((ap->a_mode & IFMT) == IFDIR) { + cgp->cg_cs.cs_ndir--; + fs->fs_cstotal.cs_ndir--; + fs->fs_cs(fs, cg).cs_ndir--; + } + fs->fs_fmod = 1; + bdwrite(bp); + return (0); +} + +/* + * Find a block of the specified size in the specified cylinder group. + * + * It is a panic if a request is made to find a block if none are + * available. 
+ */ +static daddr_t +ffs_mapsearch(fs, cgp, bpref, allocsiz) + register struct fs *fs; + register struct cg *cgp; + daddr_t bpref; + int allocsiz; +{ + daddr_t bno; + int start, len, loc, i; + int blk, field, subfield, pos; + + /* + * find the fragment by searching through the free block + * map for an appropriate bit pattern + */ + if (bpref) + start = dtogd(fs, bpref) / NBBY; + else + start = cgp->cg_frotor / NBBY; + len = howmany(fs->fs_fpg, NBBY) - start; + loc = scanc((u_int)len, (u_char *)&cg_blksfree(cgp)[start], + (u_char *)fragtbl[fs->fs_frag], + (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); + if (loc == 0) { + len = start + 1; + start = 0; + loc = scanc((u_int)len, (u_char *)&cg_blksfree(cgp)[0], + (u_char *)fragtbl[fs->fs_frag], + (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); + if (loc == 0) { + printf("start = %d, len = %d, fs = %s\n", + start, len, fs->fs_fsmnt); + panic("ffs_alloccg: map corrupted"); + /* NOTREACHED */ + } + } + bno = (start + len - loc) * NBBY; + cgp->cg_frotor = bno; + /* + * found the byte in the map + * sift through the bits to find the selected frag + */ + for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { + blk = blkmap(fs, cg_blksfree(cgp), bno); + blk <<= 1; + field = around[allocsiz]; + subfield = inside[allocsiz]; + for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { + if ((blk & field) == subfield) + return (bno + pos); + field <<= 1; + subfield <<= 1; + } + } + printf("bno = %d, fs = %s\n", bno, fs->fs_fsmnt); + panic("ffs_alloccg: block not in map"); + return (-1); +} + +/* + * Update the cluster map because of an allocation or free. + * + * Cnt == 1 means free; cnt == -1 means allocating. 
+ */ +ffs_clusteracct(fs, cgp, blkno, cnt) + struct fs *fs; + struct cg *cgp; + daddr_t blkno; + int cnt; +{ + long *sump; + u_char *freemapp, *mapp; + int i, start, end, forw, back, map, bit; + + if (fs->fs_contigsumsize <= 0) + return; + freemapp = cg_clustersfree(cgp); + sump = cg_clustersum(cgp); + /* + * Allocate or clear the actual block. + */ + if (cnt > 0) + setbit(freemapp, blkno); + else + clrbit(freemapp, blkno); + /* + * Find the size of the cluster going forward. + */ + start = blkno + 1; + end = start + fs->fs_contigsumsize; + if (end >= cgp->cg_nclusterblks) + end = cgp->cg_nclusterblks; + mapp = &freemapp[start / NBBY]; + map = *mapp++; + bit = 1 << (start % NBBY); + for (i = start; i < end; i++) { + if ((map & bit) == 0) + break; + if ((i & (NBBY - 1)) != (NBBY - 1)) { + bit <<= 1; + } else { + map = *mapp++; + bit = 1; + } + } + forw = i - start; + /* + * Find the size of the cluster going backward. + */ + start = blkno - 1; + end = start - fs->fs_contigsumsize; + if (end < 0) + end = -1; + mapp = &freemapp[start / NBBY]; + map = *mapp--; + bit = 1 << (start % NBBY); + for (i = start; i > end; i--) { + if ((map & bit) == 0) + break; + if ((i & (NBBY - 1)) != 0) { + bit >>= 1; + } else { + map = *mapp--; + bit = 1 << (NBBY - 1); + } + } + back = start - i; + /* + * Account for old cluster and the possibly new forward and + * back clusters. + */ + i = back + forw + 1; + if (i > fs->fs_contigsumsize) + i = fs->fs_contigsumsize; + sump[i] += cnt; + if (back > 0) + sump[back] -= cnt; + if (forw > 0) + sump[forw] -= cnt; +} + +/* + * Fserr prints the name of a file system with an error diagnostic. 
+ * + * The form of the error message is: + * fs: error message + */ +static void +ffs_fserr(fs, uid, cp) + struct fs *fs; + u_int uid; + char *cp; +{ + + log(LOG_ERR, "uid %d on %s: %s\n", uid, fs->fs_fsmnt, cp); +} diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c new file mode 100644 index 00000000000..752feec9947 --- /dev/null +++ b/sys/ufs/ffs/ffs_balloc.c @@ -0,0 +1,282 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_balloc.c 8.4 (Berkeley) 9/23/93 + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +
/*
 * Balloc defines the structure of file system storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 *
 *	ip	inode of the file
 *	bn	logical block number; a negative value yields EFBIG
 *	size	number of bytes of the block that will be in use; only
 *		consulted for direct blocks, where it is rounded up to
 *		a whole number of fragments
 *	cred	credentials passed through to the allocator
 *	bpp	out: buffer for the requested block; set to NULL on
 *		entry and filled in only on success
 *	flags	B_SYNC   - write modified blocks with bwrite instead of
 *			   an asynchronous or delayed write
 *		B_CLRBUF - clear a freshly allocated data block; for an
 *			   existing indirectly-addressed data block,
 *			   read it rather than just getblk it
 *
 * Returns 0 on success, otherwise the error from the failing
 * allocation, read or write.
 */
ffs_balloc(ip, bn, size, cred, bpp, flags)
	register struct inode *ip;
	register daddr_t bn;
	int size;
	struct ucred *cred;
	struct buf **bpp;
	int flags;
{
	register struct fs *fs;
	register daddr_t nb;
	struct buf *bp, *nbp;
	struct vnode *vp = ITOV(ip);
	struct indir indirs[NIADDR + 2];
	daddr_t newb, lbn, *bap, pref;
	int osize, nsize, num, i, error;

	*bpp = NULL;
	if (bn < 0)
		return (EFBIG);
	fs = ip->i_fs;
	lbn = bn;	/* bn is kept unchanged; lbn is what the indirect path uses */

	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	nb = lblkno(fs, ip->i_size);
	if (nb < NDADDR && nb < bn) {
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			error = ffs_realloccg(ip, nb,
				ffs_blkpref(ip, nb, (int)nb, &ip->i_db[0]),
				osize, (int)fs->fs_bsize, cred, &bp);
			if (error)
				return (error);
			/* The file now ends on the boundary after block nb. */
			ip->i_size = (nb + 1) * fs->fs_bsize;
			vnode_pager_setsize(vp, (u_long)ip->i_size);
			ip->i_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & B_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first NDADDR blocks are direct blocks
	 */
	if (bn < NDADDR) {
		nb = ip->i_db[bn];
		/*
		 * Block exists and the file already extends to its end:
		 * just read it.
		 */
		if (nb != 0 && ip->i_size >= (bn + 1) * fs->fs_bsize) {
			error = bread(vp, bn, fs->fs_bsize, NOCRED, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				/* Existing fragment is already big enough. */
				error = bread(vp, bn, osize, NOCRED, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
			} else {
				error = ffs_realloccg(ip, bn,
					ffs_blkpref(ip, bn, (int)bn, &ip->i_db[0]),
					osize, nsize, cred, &bp);
				if (error)
					return (error);
			}
		} else {
			/*
			 * Nothing allocated yet: a fragment suffices when
			 * this block will hold the end of the file,
			 * otherwise take a full block.
			 */
			if (ip->i_size < (bn + 1) * fs->fs_bsize)
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			error = ffs_alloc(ip, bn,
				ffs_blkpref(ip, bn, (int)bn, &ip->i_db[0]),
				nsize, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, bn, nsize, 0, 0);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & B_CLRBUF)
				clrbuf(bp);
		}
		ip->i_db[bn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if (error = ufs_getlbns(vp, bn, indirs, &num))
		return(error);
#ifdef DIAGNOSTIC
	if (num < 1)
		panic ("ffs_balloc: ufs_bmaparray returned indirect block\n");
#endif
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;		/* from here on num is the index of the last indirs[] entry used */
	nb = ip->i_ib[indirs[0].in_off];
	if (nb == 0) {
		pref = ffs_blkpref(ip, lbn, 0, (daddr_t *)0);
		if (error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    cred, &newb))
			return (error);
		nb = newb;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0);
		bp->b_blkno = fsbtodb(fs, newb);
		clrbuf(bp);
		/*
		 * Write synchronously so that indirect blocks
		 * never point at garbage.
		 */
		if (error = bwrite(bp)) {
			ffs_blkfree(ip, nb, fs->fs_bsize);
			return (error);
		}
		/* Only now is the inode made to point at the new block. */
		ip->i_ib[indirs[0].in_off] = newb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			return (error);
		}
		bap = (daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		/*
		 * At the last level nb is the data block address; leave
		 * the loop with bp (and bap) still held.
		 */
		if (i == num)
			break;
		/*
		 * i is advanced before the test below, so from here on
		 * indirs[i] is the next level down and indirs[i - 1]
		 * is the block just read.
		 */
		i += 1;
		if (nb != 0) {
			brelse(bp);
			continue;
		}
		if (pref == 0)
			pref = ffs_blkpref(ip, lbn, 0, (daddr_t *)0);
		if (error =
		    ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) {
			brelse(bp);
			return (error);
		}
		nb = newb;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		clrbuf(nbp);
		/*
		 * Write synchronously so that indirect blocks
		 * never point at garbage.
		 */
		if (error = bwrite(nbp)) {
			ffs_blkfree(ip, nb, fs->fs_bsize);
			brelse(bp);
			return (error);
		}
		/* Hook the new child into its parent indirect block. */
		bap[indirs[i - 1].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & B_SYNC) {
			bwrite(bp);
		} else {
			bdwrite(bp);
		}
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		pref = ffs_blkpref(ip, lbn, indirs[i].in_off, &bap[0]);
		if (error = ffs_alloc(ip,
		    lbn, pref, (int)fs->fs_bsize, cred, &newb)) {
			brelse(bp);
			return (error);
		}
		nb = newb;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & B_CLRBUF)
			clrbuf(nbp);
		/* Record the data block in the last-level indirect block. */
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & B_SYNC) {
			bwrite(bp);
		} else {
			bdwrite(bp);
		}
		*bpp = nbp;
		return (0);
	}
	/* The data block already exists; the indirect block is not modified. */
	brelse(bp);
	if (flags & B_CLRBUF) {
		error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
		if (error) {
			brelse(nbp);
			return (error);
		}
	} else {
		/* No read: hand back a buffer that only has its disk address set. */
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	*bpp = nbp;
	return (0);
}
diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h new file mode 100644 index 00000000000..ab467a272a9 --- /dev/null +++ b/sys/ufs/ffs/ffs_extern.h @@ -0,0 +1,101 @@ +/*- + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4.
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_extern.h 8.3 (Berkeley) 4/16/94 + */ + +struct buf; +struct fid; +struct fs; +struct inode; +struct mount; +struct nameidata; +struct proc; +struct statfs; +struct timeval; +struct ucred; +struct uio; +struct vnode; +struct mbuf; + +__BEGIN_DECLS +int ffs_alloc __P((struct inode *, + daddr_t, daddr_t, int, struct ucred *, daddr_t *)); +int ffs_balloc __P((struct inode *, + daddr_t, int, struct ucred *, struct buf **, int)); +int ffs_blkatoff __P((struct vop_blkatoff_args *)); +int ffs_blkfree __P((struct inode *, daddr_t, long)); +daddr_t ffs_blkpref __P((struct inode *, daddr_t, int, daddr_t *)); +int ffs_bmap __P((struct vop_bmap_args *)); +void ffs_clrblock __P((struct fs *, u_char *, daddr_t)); +int ffs_fhtovp __P((struct mount *, struct fid *, struct mbuf *, + struct vnode **, int *, struct ucred **)); +void ffs_fragacct __P((struct fs *, int, long [], int)); +int ffs_fsync __P((struct vop_fsync_args *)); +int ffs_init __P((void)); +int ffs_isblock __P((struct fs *, u_char *, daddr_t)); 
+int ffs_mount __P((struct mount *, + char *, caddr_t, struct nameidata *, struct proc *)); +int ffs_mountfs __P((struct vnode *, struct mount *, struct proc *)); +int ffs_mountroot __P((void)); +int ffs_read __P((struct vop_read_args *)); +int ffs_reallocblks __P((struct vop_reallocblks_args *)); +int ffs_realloccg __P((struct inode *, + daddr_t, daddr_t, int, int, struct ucred *, struct buf **)); +int ffs_reclaim __P((struct vop_reclaim_args *)); +void ffs_setblock __P((struct fs *, u_char *, daddr_t)); +int ffs_statfs __P((struct mount *, struct statfs *, struct proc *)); +int ffs_sync __P((struct mount *, int, struct ucred *, struct proc *)); +int ffs_truncate __P((struct vop_truncate_args *)); +int ffs_unmount __P((struct mount *, int, struct proc *)); +int ffs_update __P((struct vop_update_args *)); +int ffs_valloc __P((struct vop_valloc_args *)); +int ffs_vfree __P((struct vop_vfree_args *)); +int ffs_vget __P((struct mount *, ino_t, struct vnode **)); +int ffs_vptofh __P((struct vnode *, struct fid *)); +int ffs_write __P((struct vop_write_args *)); + +int bwrite(); /* FFS needs a bwrite routine. XXX */ + +#ifdef DIAGNOSTIC +void ffs_checkoverlap __P((struct buf *, struct inode *)); +#endif +__END_DECLS + +extern int (**ffs_vnodeop_p)(); +extern int (**ffs_specop_p)(); +#ifdef FIFO +extern int (**ffs_fifoop_p)(); +#define FFS_FIFOOPS ffs_fifoop_p +#else +#define FFS_FIFOOPS NULL +#endif diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c new file mode 100644 index 00000000000..b45aee53552 --- /dev/null +++ b/sys/ufs/ffs/ffs_inode.c @@ -0,0 +1,488 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ffs_inode.c 8.5 (Berkeley) 12/30/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include + +static int ffs_indirtrunc __P((struct inode *, daddr_t, daddr_t, daddr_t, int, + long *)); + +int +ffs_init() +{ + return (ufs_init()); +} + +/* + * Update the access, modified, and inode change times as specified by the + * IACCESS, IUPDATE, and ICHANGE flags respectively. The IMODIFIED flag is + * used to specify that the inode needs to be updated but that the times have + * already been set. The access and modified times are taken from the second + * and third parameters; the inode change time is always taken from the current + * time. If waitfor is set, then wait for the disk write of the inode to + * complete. + */ +int +ffs_update(ap) + struct vop_update_args /* { + struct vnode *a_vp; + struct timeval *a_access; + struct timeval *a_modify; + int a_waitfor; + } */ *ap; +{ + register struct fs *fs; + struct buf *bp; + struct inode *ip; + int error; + + ip = VTOI(ap->a_vp); + if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) { + ip->i_flag &= + ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); + return (0); + } + if ((ip->i_flag & + (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0) + return (0); + if (ip->i_flag & IN_ACCESS) + ip->i_atime.ts_sec = ap->a_access->tv_sec; + if (ip->i_flag & IN_UPDATE) { + ip->i_mtime.ts_sec = ap->a_modify->tv_sec; + ip->i_modrev++; + } + if (ip->i_flag & IN_CHANGE) + ip->i_ctime.ts_sec = time.tv_sec; + ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); + fs = ip->i_fs; + /* + * Ensure that uid and gid are correct. This is a temporary + * fix until fsck has been changed to do the update. 
+ */ + if (fs->fs_inodefmt < FS_44INODEFMT) { /* XXX */ + ip->i_din.di_ouid = ip->i_uid; /* XXX */ + ip->i_din.di_ogid = ip->i_gid; /* XXX */ + } /* XXX */ + if (error = bread(ip->i_devvp, + fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int)fs->fs_bsize, NOCRED, &bp)) { + brelse(bp); + return (error); + } + *((struct dinode *)bp->b_data + + ino_to_fsbo(fs, ip->i_number)) = ip->i_din; + if (ap->a_waitfor) + return (bwrite(bp)); + else { + bdwrite(bp); + return (0); + } +} + +#define SINGLE 0 /* index of single indirect block */ +#define DOUBLE 1 /* index of double indirect block */ +#define TRIPLE 2 /* index of triple indirect block */ +/* + * Truncate the inode oip to at most length size, freeing the + * disk blocks. + */ +ffs_truncate(ap) + struct vop_truncate_args /* { + struct vnode *a_vp; + off_t a_length; + int a_flags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *ovp = ap->a_vp; + register daddr_t lastblock; + register struct inode *oip; + daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR]; + daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; + off_t length = ap->a_length; + register struct fs *fs; + struct buf *bp; + int offset, size, level; + long count, nblocks, vflags, blocksreleased = 0; + struct timeval tv; + register int i; + int aflags, error, allerror; + off_t osize; + + oip = VTOI(ovp); + tv = time; + if (ovp->v_type == VLNK && + oip->i_size < ovp->v_mount->mnt_maxsymlinklen) { +#ifdef DIAGNOSTIC + if (length != 0) + panic("ffs_truncate: partial truncate of symlink"); +#endif + bzero((char *)&oip->i_shortlink, (u_int)oip->i_size); + oip->i_size = 0; + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (VOP_UPDATE(ovp, &tv, &tv, 1)); + } + if (oip->i_size == length) { + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (VOP_UPDATE(ovp, &tv, &tv, 0)); + } +#ifdef QUOTA + if (error = getinoquota(oip)) + return (error); +#endif + vnode_pager_setsize(ovp, (u_long)length); + fs = oip->i_fs; + osize = 
oip->i_size; + /* + * Lengthen the size of the file. We must ensure that the + * last byte of the file is allocated. Since the smallest + * value of oszie is 0, length will be at least 1. + */ + if (osize < length) { + offset = blkoff(fs, length - 1); + lbn = lblkno(fs, length - 1); + aflags = B_CLRBUF; + if (ap->a_flags & IO_SYNC) + aflags |= B_SYNC; + if (error = ffs_balloc(oip, lbn, offset + 1, ap->a_cred, &bp, + aflags)) + return (error); + oip->i_size = length; + (void) vnode_pager_uncache(ovp); + if (aflags & IO_SYNC) + bwrite(bp); + else + bawrite(bp); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (VOP_UPDATE(ovp, &tv, &tv, 1)); + } + /* + * Shorten the size of the file. If the file is not being + * truncated to a block boundry, the contents of the + * partial block following the end of the file must be + * zero'ed in case it ever become accessable again because + * of subsequent file growth. + */ + offset = blkoff(fs, length); + if (offset == 0) { + oip->i_size = length; + } else { + lbn = lblkno(fs, length); + aflags = B_CLRBUF; + if (ap->a_flags & IO_SYNC) + aflags |= B_SYNC; + if (error = ffs_balloc(oip, lbn, offset, ap->a_cred, &bp, + aflags)) + return (error); + oip->i_size = length; + size = blksize(fs, oip, lbn); + (void) vnode_pager_uncache(ovp); + bzero((char *)bp->b_data + offset, (u_int)(size - offset)); + allocbuf(bp, size); + if (aflags & IO_SYNC) + bwrite(bp); + else + bawrite(bp); + } + /* + * Calculate index into inode's block list of + * last direct and indirect blocks (if any) + * which we want to keep. Lastblock is -1 when + * the file is truncated to 0. + */ + lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1; + lastiblock[SINGLE] = lastblock - NDADDR; + lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); + lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); + nblocks = btodb(fs->fs_bsize); + /* + * Update file and block pointers on disk before we start freeing + * blocks. 
If we crash before free'ing blocks below, the blocks + * will be returned to the free list. lastiblock values are also + * normalized to -1 for calls to ffs_indirtrunc below. + */ + bcopy((caddr_t)&oip->i_db[0], (caddr_t)oldblks, sizeof oldblks); + for (level = TRIPLE; level >= SINGLE; level--) + if (lastiblock[level] < 0) { + oip->i_ib[level] = 0; + lastiblock[level] = -1; + } + for (i = NDADDR - 1; i > lastblock; i--) + oip->i_db[i] = 0; + oip->i_flag |= IN_CHANGE | IN_UPDATE; + if (error = VOP_UPDATE(ovp, &tv, &tv, MNT_WAIT)) + allerror = error; + /* + * Having written the new inode to disk, save its new configuration + * and put back the old block pointers long enough to process them. + * Note that we save the new block configuration so we can check it + * when we are done. + */ + bcopy((caddr_t)&oip->i_db[0], (caddr_t)newblks, sizeof newblks); + bcopy((caddr_t)oldblks, (caddr_t)&oip->i_db[0], sizeof oldblks); + oip->i_size = osize; + vflags = ((length > 0) ? V_SAVE : 0) | V_SAVEMETA; + allerror = vinvalbuf(ovp, vflags, ap->a_cred, ap->a_p, 0, 0); + + /* + * Indirect blocks first. + */ + indir_lbn[SINGLE] = -NDADDR; + indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1; + indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; + for (level = TRIPLE; level >= SINGLE; level--) { + bn = oip->i_ib[level]; + if (bn != 0) { + error = ffs_indirtrunc(oip, indir_lbn[level], + fsbtodb(fs, bn), lastiblock[level], level, &count); + if (error) + allerror = error; + blocksreleased += count; + if (lastiblock[level] < 0) { + oip->i_ib[level] = 0; + ffs_blkfree(oip, bn, fs->fs_bsize); + blocksreleased += nblocks; + } + } + if (lastiblock[level] >= 0) + goto done; + } + + /* + * All whole direct blocks or frags. 
+ */ + for (i = NDADDR - 1; i > lastblock; i--) { + register long bsize; + + bn = oip->i_db[i]; + if (bn == 0) + continue; + oip->i_db[i] = 0; + bsize = blksize(fs, oip, i); + ffs_blkfree(oip, bn, bsize); + blocksreleased += btodb(bsize); + } + if (lastblock < 0) + goto done; + + /* + * Finally, look for a change in size of the + * last direct block; release any frags. + */ + bn = oip->i_db[lastblock]; + if (bn != 0) { + long oldspace, newspace; + + /* + * Calculate amount of space we're giving + * back as old block size minus new block size. + */ + oldspace = blksize(fs, oip, lastblock); + oip->i_size = length; + newspace = blksize(fs, oip, lastblock); + if (newspace == 0) + panic("itrunc: newspace"); + if (oldspace - newspace > 0) { + /* + * Block number of space to be free'd is + * the old block # plus the number of frags + * required for the storage we're keeping. + */ + bn += numfrags(fs, newspace); + ffs_blkfree(oip, bn, oldspace - newspace); + blocksreleased += btodb(oldspace - newspace); + } + } +done: +#ifdef DIAGNOSTIC + for (level = SINGLE; level <= TRIPLE; level++) + if (newblks[NDADDR + level] != oip->i_ib[level]) + panic("itrunc1"); + for (i = 0; i < NDADDR; i++) + if (newblks[i] != oip->i_db[i]) + panic("itrunc2"); + if (length == 0 && + (ovp->v_dirtyblkhd.lh_first || ovp->v_cleanblkhd.lh_first)) + panic("itrunc3"); +#endif /* DIAGNOSTIC */ + /* + * Put back the real size. + */ + oip->i_size = length; + oip->i_blocks -= blocksreleased; + if (oip->i_blocks < 0) /* sanity */ + oip->i_blocks = 0; + oip->i_flag |= IN_CHANGE; +#ifdef QUOTA + (void) chkdq(oip, -blocksreleased, NOCRED, 0); +#endif + return (allerror); +} + +/* + * Release blocks associated with the inode ip and stored in the indirect + * block bn. Blocks are free'd in LIFO order up to (but not including) + * lastbn. If level is greater than SINGLE, the block is an indirect block + * and recursive calls to indirtrunc must be used to cleanse other indirect + * blocks. 
+ * + * NB: triple indirect blocks are untested. + */
/*
 *	ip	inode being truncated
 *	lbn	logical block number under which the indirect block is
 *		held in the buffer cache
 *	dbn	device block address of the indirect block
 *	lastbn	last block to keep, relative to this indirect block;
 *		-1 to free everything it references
 *	level	SINGLE, DOUBLE or TRIPLE
 *	countp	out: number of device blocks released (set to 0 when the
 *		indirect block cannot be read)
 *
 * Returns 0 or the last error seen; freeing continues after an error.
 */
static int
ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp)
	register struct inode *ip;
	daddr_t lbn, lastbn;
	daddr_t dbn;
	int level;
	long *countp;
{
	register int i;
	struct buf *bp;
	register struct fs *fs = ip->i_fs;
	register daddr_t *bap;
	struct vnode *vp;
	daddr_t *copy, nb, nlbn, last;
	long blkcount, factor;
	int nblocks, blocksreleased = 0;
	int error = 0, allerror = 0;

	/*
	 * Calculate index in current block of last
	 * block to be kept.  -1 indicates the entire
	 * block so we need not calculate the index.
	 */
	/* factor = number of data blocks spanned by one entry at this level */
	factor = 1;
	for (i = SINGLE; i < level; i++)
		factor *= NINDIR(fs);
	last = lastbn;
	if (lastbn > 0)
		last /= factor;
	nblocks = btodb(fs->fs_bsize);
	/*
	 * Get buffer of block pointers, zero those entries corresponding
	 * to blocks to be free'd, and update on disk copy first.  Since
	 * double(triple) indirect before single(double) indirect, calls
	 * to bmap on these blocks will fail.  However, we already have
	 * the on disk address, so we have to set the b_blkno field
	 * explicitly instead of letting bread do everything for us.
	 */
	vp = ITOV(ip);
	bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		/* Braces must be here in case trace evaluates to nothing. */
		trace(TR_BREADHIT, pack(vp, fs->fs_bsize), lbn);
	} else {
		/* Not cached: issue the read ourselves at the known address. */
		trace(TR_BREADMISS, pack(vp, fs->fs_bsize), lbn);
		curproc->p_stats->p_ru.ru_inblock++;	/* pay for read */
		bp->b_flags |= B_READ;
		if (bp->b_bcount > bp->b_bufsize)
			panic("ffs_indirtrunc: bad buffer size");
		bp->b_blkno = dbn;
		VOP_STRATEGY(bp);
		error = biowait(bp);
	}
	if (error) {
		brelse(bp);
		*countp = 0;
		return (error);
	}

	/*
	 * Keep a private copy of the pointers, zero the entries after
	 * "last" in the buffer and write it out before any of the blocks
	 * they named are freed.  The copy is what gets walked below.
	 */
	bap = (daddr_t *)bp->b_data;
	MALLOC(copy, daddr_t *, fs->fs_bsize, M_TEMP, M_WAITOK);
	bcopy((caddr_t)bap, (caddr_t)copy, (u_int)fs->fs_bsize);
	bzero((caddr_t)&bap[last + 1],
	  (u_int)(NINDIR(fs) - (last + 1)) * sizeof (daddr_t));
	/* Nothing is kept: flag the buffer B_INVAL before it is written. */
	if (last == -1)
		bp->b_flags |= B_INVAL;
	error = bwrite(bp);
	if (error)
		allerror = error;
	bap = copy;

	/*
	 * Recursively free totally unused blocks.
	 */
	for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last;
	    i--, nlbn += factor) {
		nb = bap[i];
		if (nb == 0)
			continue;
		if (level > SINGLE) {
			if (error = ffs_indirtrunc(ip, nlbn,
			    fsbtodb(fs, nb), (daddr_t)-1, level - 1, &blkcount))
				allerror = error;
			blocksreleased += blkcount;
		}
		ffs_blkfree(ip, nb, fs->fs_bsize);
		blocksreleased += nblocks;
	}

	/*
	 * Recursively free last partial block.
	 */
	/*
	 * The loop above stopped with i == last; nlbn has been stepped
	 * along with i and is passed as the child's lbn.
	 */
	if (level > SINGLE && lastbn >= 0) {
		last = lastbn % factor;
		nb = bap[i];
		if (nb != 0) {
			if (error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb),
			    last, level - 1, &blkcount))
				allerror = error;
			blocksreleased += blkcount;
		}
	}
	FREE(copy, M_TEMP);
	*countp = blocksreleased;
	return (allerror);
}
diff --git a/sys/ufs/ffs/ffs_subr.c b/sys/ufs/ffs/ffs_subr.c new file mode 100644 index 00000000000..c251b16e697 --- /dev/null +++ b/sys/ufs/ffs/ffs_subr.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_subr.c 8.2 (Berkeley) 9/21/93 + */ + +#include +#include + +#ifdef KERNEL +#include +#include +#include +#include +#include +#include + +/* + * Return buffer with the contents of block "offset" from the beginning of + * directory "ip". 
If "res" is non-zero, fill it in with a pointer to the + * remaining space in the directory. + */ +int +ffs_blkatoff(ap) + struct vop_blkatoff_args /* { + struct vnode *a_vp; + off_t a_offset; + char **a_res; + struct buf **a_bpp; + } */ *ap; +{ + struct inode *ip; + register struct fs *fs; + struct buf *bp; + daddr_t lbn; + int bsize, error; + + ip = VTOI(ap->a_vp); + fs = ip->i_fs; + lbn = lblkno(fs, ap->a_offset); + bsize = blksize(fs, ip, lbn); + + *ap->a_bpp = NULL; + if (error = bread(ap->a_vp, lbn, bsize, NOCRED, &bp)) { + brelse(bp); + return (error); + } + if (ap->a_res) + *ap->a_res = (char *)bp->b_data + blkoff(fs, ap->a_offset); + *ap->a_bpp = bp; + return (0); +} +#endif + +/* + * Update the frsum fields to reflect addition or deletion + * of some frags. + */ +void +ffs_fragacct(fs, fragmap, fraglist, cnt) + struct fs *fs; + int fragmap; + long fraglist[]; + int cnt; +{ + int inblk; + register int field, subfield; + register int siz, pos; + + inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1; + fragmap <<= 1; + for (siz = 1; siz < fs->fs_frag; siz++) { + if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0) + continue; + field = around[siz]; + subfield = inside[siz]; + for (pos = siz; pos <= fs->fs_frag; pos++) { + if ((fragmap & field) == subfield) { + fraglist[siz] += cnt; + pos += siz; + field <<= siz; + subfield <<= siz; + } + field <<= 1; + subfield <<= 1; + } + } +} + +#if defined(KERNEL) && defined(DIAGNOSTIC) +void +ffs_checkoverlap(bp, ip) + struct buf *bp; + struct inode *ip; +{ + register struct buf *ebp, *ep; + register daddr_t start, last; + struct vnode *vp; + + ebp = &buf[nbuf]; + start = bp->b_blkno; + last = start + btodb(bp->b_bcount) - 1; + for (ep = buf; ep < ebp; ep++) { + if (ep == bp || (ep->b_flags & B_INVAL) || + ep->b_vp == NULLVP) + continue; + if (VOP_BMAP(ep->b_vp, (daddr_t)0, &vp, (daddr_t)0, NULL)) + continue; + if (vp != ip->i_devvp) + continue; + /* look for overlap */ + if (ep->b_bcount == 0 || ep->b_blkno > last || 
+ ep->b_blkno + btodb(ep->b_bcount) <= start) + continue; + vprint("Disk overlap", vp); + (void)printf("\tstart %d, end %d overlap start %d, end %d\n", + start, last, ep->b_blkno, + ep->b_blkno + btodb(ep->b_bcount) - 1); + panic("Disk buffer overlap"); + } +} +#endif /* DIAGNOSTIC */ + +/* + * block operations + * + * check if a block is available + */ +int +ffs_isblock(fs, cp, h) + struct fs *fs; + unsigned char *cp; + daddr_t h; +{ + unsigned char mask; + + switch ((int)fs->fs_frag) { + case 8: + return (cp[h] == 0xff); + case 4: + mask = 0x0f << ((h & 0x1) << 2); + return ((cp[h >> 1] & mask) == mask); + case 2: + mask = 0x03 << ((h & 0x3) << 1); + return ((cp[h >> 2] & mask) == mask); + case 1: + mask = 0x01 << (h & 0x7); + return ((cp[h >> 3] & mask) == mask); + default: + panic("ffs_isblock"); + } +} + +/* + * take a block out of the map + */ +void +ffs_clrblock(fs, cp, h) + struct fs *fs; + u_char *cp; + daddr_t h; +{ + + switch ((int)fs->fs_frag) { + case 8: + cp[h] = 0; + return; + case 4: + cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2)); + return; + case 2: + cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1)); + return; + case 1: + cp[h >> 3] &= ~(0x01 << (h & 0x7)); + return; + default: + panic("ffs_clrblock"); + } +} + +/* + * put a block into the map + */ +void +ffs_setblock(fs, cp, h) + struct fs *fs; + unsigned char *cp; + daddr_t h; +{ + + switch ((int)fs->fs_frag) { + + case 8: + cp[h] = 0xff; + return; + case 4: + cp[h >> 1] |= (0x0f << ((h & 0x1) << 2)); + return; + case 2: + cp[h >> 2] |= (0x03 << ((h & 0x3) << 1)); + return; + case 1: + cp[h >> 3] |= (0x01 << (h & 0x7)); + return; + default: + panic("ffs_setblock"); + } +} diff --git a/sys/ufs/ffs/ffs_tables.c b/sys/ufs/ffs/ffs_tables.c new file mode 100644 index 00000000000..8cf46b0150a --- /dev/null +++ b/sys/ufs/ffs/ffs_tables.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ffs_tables.c 8.1 (Berkeley) 6/11/93 + */ + +#include + +/* + * Bit patterns for identifying fragments in the block map + * used as ((map & around) == inside) + */ +int around[9] = { + 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff +}; +int inside[9] = { + 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe +}; + +/* + * Given a block map bit pattern, the frag tables tell whether a + * particular size fragment is available. + * + * used as: + * if ((1 << (size - 1)) & fragtbl[fs->fs_frag][map] { + * at least one fragment of the indicated size is available + * } + * + * These tables are used by the scanc instruction on the VAX to + * quickly find an appropriate fragment. + */ +u_char fragtbl124[256] = { + 0x00, 0x16, 0x16, 0x2a, 0x16, 0x16, 0x26, 0x4e, + 0x16, 0x16, 0x16, 0x3e, 0x2a, 0x3e, 0x4e, 0x8a, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x2a, 0x3e, 0x6e, 0xaa, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x26, 0x36, 0x36, 0x2e, 0x36, 0x36, 0x26, 0x6e, + 0x36, 0x36, 0x36, 0x3e, 0x2e, 0x3e, 0x6e, 0xae, + 0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e, + 0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe, + 0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e, + 0x3e, 0x3e, 0x3e, 
0x3e, 0x2a, 0x3e, 0x6e, 0xaa, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe, + 0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e, + 0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce, + 0x8a, 0x9e, 0x9e, 0xaa, 0x9e, 0x9e, 0xae, 0xce, + 0x9e, 0x9e, 0x9e, 0xbe, 0xaa, 0xbe, 0xce, 0x8a, +}; + +u_char fragtbl8[256] = { + 0x00, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x04, + 0x01, 0x01, 0x01, 0x03, 0x02, 0x03, 0x04, 0x08, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x02, 0x03, 0x03, 0x02, 0x04, 0x05, 0x08, 0x10, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, + 0x04, 0x05, 0x05, 0x06, 0x08, 0x09, 0x10, 0x20, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, + 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a, + 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x10, 0x11, 0x20, 0x40, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, + 0x05, 0x05, 0x05, 0x07, 0x09, 0x09, 0x11, 0x21, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, + 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, + 0x02, 0x03, 0x03, 0x02, 0x06, 0x07, 0x0a, 0x12, + 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, + 0x05, 0x05, 0x05, 0x07, 0x06, 0x07, 0x04, 0x0c, + 0x08, 0x09, 0x09, 0x0a, 0x09, 0x09, 0x0a, 0x0c, + 0x10, 0x11, 0x11, 0x12, 0x20, 0x21, 0x40, 0x80, +}; + +/* + * The actual fragtbl 
array. + */ +u_char *fragtbl[MAXFRAG + 1] = { + 0, fragtbl124, fragtbl124, 0, fragtbl124, 0, 0, 0, fragtbl8, +}; diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c new file mode 100644 index 00000000000..505dd5db8cb --- /dev/null +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -0,0 +1,843 @@ +/* + * Copyright (c) 1989, 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_vfsops.c 8.8 (Berkeley) 4/18/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include + +int ffs_sbupdate __P((struct ufsmount *, int)); + +struct vfsops ufs_vfsops = { + ffs_mount, + ufs_start, + ffs_unmount, + ufs_root, + ufs_quotactl, + ffs_statfs, + ffs_sync, + ffs_vget, + ffs_fhtovp, + ffs_vptofh, + ffs_init, +}; + +extern u_long nextgennumber; + +/* + * Called by main() when ufs is going to be mounted as root. + * + * Name is updated by mount(8) after booting. + */ +#define ROOTNAME "root_device" + +ffs_mountroot() +{ + extern struct vnode *rootvp; + register struct fs *fs; + register struct mount *mp; + struct proc *p = curproc; /* XXX */ + struct ufsmount *ump; + u_int size; + int error; + + /* + * Get vnodes for swapdev and rootdev. 
+ */ + if (bdevvp(swapdev, &swapdev_vp) || bdevvp(rootdev, &rootvp)) + panic("ffs_mountroot: can't setup bdevvp's"); + + mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + mp->mnt_op = &ufs_vfsops; + mp->mnt_flag = MNT_RDONLY; + if (error = ffs_mountfs(rootvp, mp, p)) { + free(mp, M_MOUNT); + return (error); + } + if (error = vfs_lock(mp)) { + (void)ffs_unmount(mp, 0, p); + free(mp, M_MOUNT); + return (error); + } + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mp->mnt_flag |= MNT_ROOTFS; + mp->mnt_vnodecovered = NULLVP; + ump = VFSTOUFS(mp); + fs = ump->um_fs; + bzero(fs->fs_fsmnt, sizeof(fs->fs_fsmnt)); + fs->fs_fsmnt[0] = '/'; + bcopy((caddr_t)fs->fs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, + MNAMELEN); + (void) copystr(ROOTNAME, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + (void)ffs_statfs(mp, &mp->mnt_stat, p); + vfs_unlock(mp); + inittodr(fs->fs_time); + return (0); +} + +/* + * VFS Operations. + * + * mount system call + */ +int +ffs_mount(mp, path, data, ndp, p) + register struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + struct vnode *devvp; + struct ufs_args args; + struct ufsmount *ump; + register struct fs *fs; + u_int size; + int error, flags; + + if (error = copyin(data, (caddr_t)&args, sizeof (struct ufs_args))) + return (error); + /* + * If updating, check whether changing from read-only to + * read/write; if there is no device name, that's all we do. 
+ */ + if (mp->mnt_flag & MNT_UPDATE) { + ump = VFSTOUFS(mp); + fs = ump->um_fs; + error = 0; + if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { + flags = WRITECLOSE; + if (mp->mnt_flag & MNT_FORCE) + flags |= FORCECLOSE; + if (vfs_busy(mp)) + return (EBUSY); + error = ffs_flushfiles(mp, flags, p); + vfs_unbusy(mp); + } + if (!error && (mp->mnt_flag & MNT_RELOAD)) + error = ffs_reload(mp, ndp->ni_cnd.cn_cred, p); + if (error) + return (error); + if (fs->fs_ronly && (mp->mnt_flag & MNT_WANTRDWR)) + fs->fs_ronly = 0; + if (args.fspec == 0) { + /* + * Process export requests. + */ + return (vfs_export(mp, &ump->um_export, &args.export)); + } + } + /* + * Not an update, or updating the name: look up the name + * and verify that it refers to a sensible block device. + */ + NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); + if (error = namei(ndp)) + return (error); + devvp = ndp->ni_vp; + + if (devvp->v_type != VBLK) { + vrele(devvp); + return (ENOTBLK); + } + if (major(devvp->v_rdev) >= nblkdev) { + vrele(devvp); + return (ENXIO); + } + if ((mp->mnt_flag & MNT_UPDATE) == 0) + error = ffs_mountfs(devvp, mp, p); + else { + if (devvp != ump->um_devvp) + error = EINVAL; /* needs translation */ + else + vrele(devvp); + } + if (error) { + vrele(devvp); + return (error); + } + ump = VFSTOUFS(mp); + fs = ump->um_fs; + (void) copyinstr(path, fs->fs_fsmnt, sizeof(fs->fs_fsmnt) - 1, &size); + bzero(fs->fs_fsmnt + size, sizeof(fs->fs_fsmnt) - size); + bcopy((caddr_t)fs->fs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, + MNAMELEN); + (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + (void)ffs_statfs(mp, &mp->mnt_stat, p); + return (0); +} + +/* + * Reload all incore data for a filesystem (used after running fsck on + * the root filesystem and finding things to fix). The filesystem must + * be mounted read-only. 
+ * + * Things to do to update the mount: + * 1) invalidate all cached meta-data. + * 2) re-read superblock from disk. + * 3) re-read summary information from disk. + * 4) invalidate all inactive vnodes. + * 5) invalidate all cached file data. + * 6) re-read inode data for all active vnodes. + */ +ffs_reload(mountp, cred, p) + register struct mount *mountp; + struct ucred *cred; + struct proc *p; +{ + register struct vnode *vp, *nvp, *devvp; + struct inode *ip; + struct csum *space; + struct buf *bp; + struct fs *fs; + int i, blks, size, error; + + if ((mountp->mnt_flag & MNT_RDONLY) == 0) + return (EINVAL); + /* + * Step 1: invalidate all cached meta-data. + */ + devvp = VFSTOUFS(mountp)->um_devvp; + if (vinvalbuf(devvp, 0, cred, p, 0, 0)) + panic("ffs_reload: dirty1"); + /* + * Step 2: re-read superblock from disk. + */ + if (error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) + return (error); + fs = (struct fs *)bp->b_data; + if (fs->fs_magic != FS_MAGIC || fs->fs_bsize > MAXBSIZE || + fs->fs_bsize < sizeof(struct fs)) { + brelse(bp); + return (EIO); /* XXX needs translation */ + } + fs = VFSTOUFS(mountp)->um_fs; + bcopy(&fs->fs_csp[0], &((struct fs *)bp->b_data)->fs_csp[0], + sizeof(fs->fs_csp)); + bcopy(bp->b_data, fs, (u_int)fs->fs_sbsize); + if (fs->fs_sbsize < SBSIZE) + bp->b_flags |= B_INVAL; + brelse(bp); + ffs_oldfscompat(fs); + /* + * Step 3: re-read summary information from disk. + */ + blks = howmany(fs->fs_cssize, fs->fs_fsize); + space = fs->fs_csp[0]; + for (i = 0; i < blks; i += fs->fs_frag) { + size = fs->fs_bsize; + if (i + fs->fs_frag > blks) + size = (blks - i) * fs->fs_fsize; + if (error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, + NOCRED, &bp)) + return (error); + bcopy(bp->b_data, fs->fs_csp[fragstoblks(fs, i)], (u_int)size); + brelse(bp); + } +loop: + for (vp = mountp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { + nvp = vp->v_mntvnodes.le_next; + /* + * Step 4: invalidate all inactive vnodes. 
+ */ + if (vp->v_usecount == 0) { + vgone(vp); + continue; + } + /* + * Step 5: invalidate all cached file data. + */ + if (vget(vp, 1)) + goto loop; + if (vinvalbuf(vp, 0, cred, p, 0, 0)) + panic("ffs_reload: dirty2"); + /* + * Step 6: re-read inode data for all active vnodes. + */ + ip = VTOI(vp); + if (error = + bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int)fs->fs_bsize, NOCRED, &bp)) { + vput(vp); + return (error); + } + ip->i_din = *((struct dinode *)bp->b_data + + ino_to_fsbo(fs, ip->i_number)); + brelse(bp); + vput(vp); + if (vp->v_mount != mountp) + goto loop; + } + return (0); +} + +/* + * Common code for mount and mountroot + */ +int +ffs_mountfs(devvp, mp, p) + register struct vnode *devvp; + struct mount *mp; + struct proc *p; +{ + register struct ufsmount *ump; + struct buf *bp; + register struct fs *fs; + dev_t dev = devvp->v_rdev; + struct partinfo dpart; + caddr_t base, space; + int havepart = 0, blks; + int error, i, size; + int ronly; + extern struct vnode *rootvp; + + /* + * Disallow multiple mounts of the same device. + * Disallow mounting of a device that is currently in use + * (except for root, which might share swap device for miniroot). + * Flush out any old buffers remaining from a previous use. + */ + if (error = vfs_mountedon(devvp)) + return (error); + if (vcount(devvp) > 1 && devvp != rootvp) + return (EBUSY); + if (error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0)) + return (error); + + ronly = (mp->mnt_flag & MNT_RDONLY) != 0; + if (error = VOP_OPEN(devvp, ronly ? 
FREAD : FREAD|FWRITE, FSCRED, p)) + return (error); + if (VOP_IOCTL(devvp, DIOCGPART, (caddr_t)&dpart, FREAD, NOCRED, p) != 0) + size = DEV_BSIZE; + else { + havepart = 1; + size = dpart.disklab->d_secsize; + } + + bp = NULL; + ump = NULL; + if (error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) + goto out; + fs = (struct fs *)bp->b_data; + if (fs->fs_magic != FS_MAGIC || fs->fs_bsize > MAXBSIZE || + fs->fs_bsize < sizeof(struct fs)) { + error = EINVAL; /* XXX needs translation */ + goto out; + } + ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK); + bzero((caddr_t)ump, sizeof *ump); + ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, + M_WAITOK); + bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize); + if (fs->fs_sbsize < SBSIZE) + bp->b_flags |= B_INVAL; + brelse(bp); + bp = NULL; + fs = ump->um_fs; + fs->fs_ronly = ronly; + if (ronly == 0) + fs->fs_fmod = 1; + blks = howmany(fs->fs_cssize, fs->fs_fsize); + base = space = malloc((u_long)fs->fs_cssize, M_UFSMNT, + M_WAITOK); + for (i = 0; i < blks; i += fs->fs_frag) { + size = fs->fs_bsize; + if (i + fs->fs_frag > blks) + size = (blks - i) * fs->fs_fsize; + error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, + NOCRED, &bp); + if (error) { + free(base, M_UFSMNT); + goto out; + } + bcopy(bp->b_data, space, (u_int)size); + fs->fs_csp[fragstoblks(fs, i)] = (struct csum *)space; + space += size; + brelse(bp); + bp = NULL; + } + mp->mnt_data = (qaddr_t)ump; + mp->mnt_stat.f_fsid.val[0] = (long)dev; + mp->mnt_stat.f_fsid.val[1] = MOUNT_UFS; + mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; + mp->mnt_flag |= MNT_LOCAL; + ump->um_mountp = mp; + ump->um_dev = dev; + ump->um_devvp = devvp; + ump->um_nindir = fs->fs_nindir; + ump->um_bptrtodb = fs->fs_fsbtodb; + ump->um_seqinc = fs->fs_frag; + for (i = 0; i < MAXQUOTAS; i++) + ump->um_quotas[i] = NULLVP; + devvp->v_specflags |= SI_MOUNTEDON; + ffs_oldfscompat(fs); + return (0); +out: + if (bp) + brelse(bp); + (void)VOP_CLOSE(devvp, ronly ? 
FREAD : FREAD|FWRITE, NOCRED, p); + if (ump) { + free(ump->um_fs, M_UFSMNT); + free(ump, M_UFSMNT); + mp->mnt_data = (qaddr_t)0; + } + return (error); +} + +/* + * Sanity checks for old file systems. + * + * XXX - goes away some day. + */ +ffs_oldfscompat(fs) + struct fs *fs; +{ + int i; + + fs->fs_npsect = max(fs->fs_npsect, fs->fs_nsect); /* XXX */ + fs->fs_interleave = max(fs->fs_interleave, 1); /* XXX */ + if (fs->fs_postblformat == FS_42POSTBLFMT) /* XXX */ + fs->fs_nrpos = 8; /* XXX */ + if (fs->fs_inodefmt < FS_44INODEFMT) { /* XXX */ + quad_t sizepb = fs->fs_bsize; /* XXX */ + /* XXX */ + fs->fs_maxfilesize = fs->fs_bsize * NDADDR - 1; /* XXX */ + for (i = 0; i < NIADDR; i++) { /* XXX */ + sizepb *= NINDIR(fs); /* XXX */ + fs->fs_maxfilesize += sizepb; /* XXX */ + } /* XXX */ + fs->fs_qbmask = ~fs->fs_bmask; /* XXX */ + fs->fs_qfmask = ~fs->fs_fmask; /* XXX */ + } /* XXX */ + return (0); +} + +/* + * unmount system call + */ +int +ffs_unmount(mp, mntflags, p) + struct mount *mp; + int mntflags; + struct proc *p; +{ + register struct ufsmount *ump; + register struct fs *fs; + int error, flags, ronly; + + flags = 0; + if (mntflags & MNT_FORCE) { + if (mp->mnt_flag & MNT_ROOTFS) + return (EINVAL); + flags |= FORCECLOSE; + } + if (error = ffs_flushfiles(mp, flags, p)) + return (error); + ump = VFSTOUFS(mp); + fs = ump->um_fs; + ronly = !fs->fs_ronly; + ump->um_devvp->v_specflags &= ~SI_MOUNTEDON; + error = VOP_CLOSE(ump->um_devvp, ronly ? FREAD : FREAD|FWRITE, + NOCRED, p); + vrele(ump->um_devvp); + free(fs->fs_csp[0], M_UFSMNT); + free(fs, M_UFSMNT); + free(ump, M_UFSMNT); + mp->mnt_data = (qaddr_t)0; + mp->mnt_flag &= ~MNT_LOCAL; + return (error); +} + +/* + * Flush out all the files in a filesystem. 
+ */ +ffs_flushfiles(mp, flags, p) + register struct mount *mp; + int flags; + struct proc *p; +{ + extern int doforce; + register struct ufsmount *ump; + int i, error; + + if (!doforce) + flags &= ~FORCECLOSE; + ump = VFSTOUFS(mp); +#ifdef QUOTA + if (mp->mnt_flag & MNT_QUOTA) { + if (error = vflush(mp, NULLVP, SKIPSYSTEM|flags)) + return (error); + for (i = 0; i < MAXQUOTAS; i++) { + if (ump->um_quotas[i] == NULLVP) + continue; + quotaoff(p, mp, i); + } + /* + * Here we fall through to vflush again to ensure + * that we have gotten rid of all the system vnodes. + */ + } +#endif + error = vflush(mp, NULLVP, flags); + return (error); +} + +/* + * Get file system statistics. + */ +int +ffs_statfs(mp, sbp, p) + struct mount *mp; + register struct statfs *sbp; + struct proc *p; +{ + register struct ufsmount *ump; + register struct fs *fs; + + ump = VFSTOUFS(mp); + fs = ump->um_fs; + if (fs->fs_magic != FS_MAGIC) + panic("ffs_statfs"); + sbp->f_type = MOUNT_UFS; + sbp->f_bsize = fs->fs_fsize; + sbp->f_iosize = fs->fs_bsize; + sbp->f_blocks = fs->fs_dsize; + sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + + fs->fs_cstotal.cs_nffree; + sbp->f_bavail = (fs->fs_dsize * (100 - fs->fs_minfree) / 100) - + (fs->fs_dsize - sbp->f_bfree); + sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO; + sbp->f_ffree = fs->fs_cstotal.cs_nifree; + if (sbp != &mp->mnt_stat) { + bcopy((caddr_t)mp->mnt_stat.f_mntonname, + (caddr_t)&sbp->f_mntonname[0], MNAMELEN); + bcopy((caddr_t)mp->mnt_stat.f_mntfromname, + (caddr_t)&sbp->f_mntfromname[0], MNAMELEN); + } + return (0); +} + +/* + * Go through the disk queues to initiate sandbagged IO; + * go through the inodes to write those that have been modified; + * initiate the writing of the super block if it has been modified. + * + * Note: we are always called with the filesystem marked `MPBUSY'. 
+ */ +int +ffs_sync(mp, waitfor, cred, p) + struct mount *mp; + int waitfor; + struct ucred *cred; + struct proc *p; +{ + register struct vnode *vp; + register struct inode *ip; + register struct ufsmount *ump = VFSTOUFS(mp); + register struct fs *fs; + int error, allerror = 0; + + fs = ump->um_fs; + /* + * Write back modified superblock. + * Consistency check that the superblock + * is still in the buffer cache. + */ + if (fs->fs_fmod != 0) { + if (fs->fs_ronly != 0) { /* XXX */ + printf("fs = %s\n", fs->fs_fsmnt); + panic("update: rofs mod"); + } + fs->fs_fmod = 0; + fs->fs_time = time.tv_sec; + allerror = ffs_sbupdate(ump, waitfor); + } + /* + * Write back each (modified) inode. + */ +loop: + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = vp->v_mntvnodes.le_next) { + /* + * If the vnode that we are about to sync is no longer + * associated with this mount point, start over. + */ + if (vp->v_mount != mp) + goto loop; + if (VOP_ISLOCKED(vp)) + continue; + ip = VTOI(vp); + if ((ip->i_flag & + (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && + vp->v_dirtyblkhd.lh_first == NULL) + continue; + if (vget(vp, 1)) + goto loop; + if (error = VOP_FSYNC(vp, cred, waitfor, p)) + allerror = error; + vput(vp); + } + /* + * Force stale file system control information to be flushed. + */ + if (error = VOP_FSYNC(ump->um_devvp, cred, waitfor, p)) + allerror = error; +#ifdef QUOTA + qsync(mp); +#endif + return (allerror); +} + +/* + * Look up a FFS dinode number to find its incore vnode, otherwise read it + * in from disk. If it is in core, wait for the lock bit to clear, then + * return the inode locked. Detection and handling of mount points must be + * done by the calling routine. 
+ */ +int +ffs_vget(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + register struct fs *fs; + register struct inode *ip; + struct ufsmount *ump; + struct buf *bp; + struct vnode *vp; + dev_t dev; + int i, type, error; + + ump = VFSTOUFS(mp); + dev = ump->um_dev; + if ((*vpp = ufs_ihashget(dev, ino)) != NULL) + return (0); + + /* Allocate a new vnode/inode. */ + if (error = getnewvnode(VT_UFS, mp, ffs_vnodeop_p, &vp)) { + *vpp = NULL; + return (error); + } + type = ump->um_devvp->v_tag == VT_MFS ? M_MFSNODE : M_FFSNODE; /* XXX */ + MALLOC(ip, struct inode *, sizeof(struct inode), type, M_WAITOK); + bzero((caddr_t)ip, sizeof(struct inode)); + vp->v_data = ip; + ip->i_vnode = vp; + ip->i_fs = fs = ump->um_fs; + ip->i_dev = dev; + ip->i_number = ino; +#ifdef QUOTA + for (i = 0; i < MAXQUOTAS; i++) + ip->i_dquot[i] = NODQUOT; +#endif + /* + * Put it onto its hash chain and lock it so that other requests for + * this inode will block if they arrive while we are sleeping waiting + * for old data structures to be purged or for the contents of the + * disk portion of this inode to be read. + */ + ufs_ihashins(ip); + + /* Read in the disk contents for the inode, copy into the inode. */ + if (error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), + (int)fs->fs_bsize, NOCRED, &bp)) { + /* + * The inode does not contain anything useful, so it would + * be misleading to leave it on its hash chain. With mode + * still zero, it will be unlinked and returned to the free + * list by vput(). + */ + vput(vp); + brelse(bp); + *vpp = NULL; + return (error); + } + ip->i_din = *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ino)); + brelse(bp); + + /* + * Initialize the vnode from the inode, check for aliases. + * Note that the underlying vnode may have changed. + */ + if (error = ufs_vinit(mp, ffs_specop_p, FFS_FIFOOPS, &vp)) { + vput(vp); + *vpp = NULL; + return (error); + } + /* + * Finish inode initialization now that aliasing has been resolved. 
+ */ + ip->i_devvp = ump->um_devvp; + VREF(ip->i_devvp); + /* + * Set up a generation number for this inode if it does not + * already have one. This should only happen on old filesystems. + */ + if (ip->i_gen == 0) { + if (++nextgennumber < (u_long)time.tv_sec) + nextgennumber = time.tv_sec; + ip->i_gen = nextgennumber; + if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) + ip->i_flag |= IN_MODIFIED; + } + /* + * Ensure that uid and gid are correct. This is a temporary + * fix until fsck has been changed to do the update. + */ + if (fs->fs_inodefmt < FS_44INODEFMT) { /* XXX */ + ip->i_uid = ip->i_din.di_ouid; /* XXX */ + ip->i_gid = ip->i_din.di_ogid; /* XXX */ + } /* XXX */ + + *vpp = vp; + return (0); +} + +/* + * File handle to vnode + * + * Have to be really careful about stale file handles: + * - check that the inode number is valid + * - call ffs_vget() to get the locked inode + * - check for an unallocated inode (i_mode == 0) + * - check that the given client host has export rights and return + * those rights via. exflagsp and credanonp + */ +int +ffs_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) + register struct mount *mp; + struct fid *fhp; + struct mbuf *nam; + struct vnode **vpp; + int *exflagsp; + struct ucred **credanonp; +{ + register struct ufid *ufhp; + struct fs *fs; + + ufhp = (struct ufid *)fhp; + fs = VFSTOUFS(mp)->um_fs; + if (ufhp->ufid_ino < ROOTINO || + ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg) + return (ESTALE); + return (ufs_check_export(mp, ufhp, nam, vpp, exflagsp, credanonp)); +} + +/* + * Vnode pointer to File handle + */ +/* ARGSUSED */ +ffs_vptofh(vp, fhp) + struct vnode *vp; + struct fid *fhp; +{ + register struct inode *ip; + register struct ufid *ufhp; + + ip = VTOI(vp); + ufhp = (struct ufid *)fhp; + ufhp->ufid_len = sizeof(struct ufid); + ufhp->ufid_ino = ip->i_number; + ufhp->ufid_gen = ip->i_gen; + return (0); +} + +/* + * Write a superblock and associated information back to disk. 
+ */ +int +ffs_sbupdate(mp, waitfor) + struct ufsmount *mp; + int waitfor; +{ + register struct fs *fs = mp->um_fs; + register struct buf *bp; + int blks; + caddr_t space; + int i, size, error = 0; + + bp = getblk(mp->um_devvp, SBLOCK, (int)fs->fs_sbsize, 0, 0); + bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); + /* Restore compatibility to old file systems. XXX */ + if (fs->fs_postblformat == FS_42POSTBLFMT) /* XXX */ + ((struct fs *)bp->b_data)->fs_nrpos = -1; /* XXX */ + if (waitfor == MNT_WAIT) + error = bwrite(bp); + else + bawrite(bp); + blks = howmany(fs->fs_cssize, fs->fs_fsize); + space = (caddr_t)fs->fs_csp[0]; + for (i = 0; i < blks; i += fs->fs_frag) { + size = fs->fs_bsize; + if (i + fs->fs_frag > blks) + size = (blks - i) * fs->fs_fsize; + bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), + size, 0, 0); + bcopy(space, bp->b_data, (u_int)size); + space += size; + if (waitfor == MNT_WAIT) + error = bwrite(bp); + else + bawrite(bp); + } + return (error); +} diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c new file mode 100644 index 00000000000..59814f2f378 --- /dev/null +++ b/sys/ufs/ffs/ffs_vnops.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_vnops.c 8.7 (Berkeley) 2/3/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +/* Global vfs data structures for ufs. 
*/ +int (**ffs_vnodeop_p)(); +struct vnodeopv_entry_desc ffs_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, ufs_lookup }, /* lookup */ + { &vop_create_desc, ufs_create }, /* create */ + { &vop_mknod_desc, ufs_mknod }, /* mknod */ + { &vop_open_desc, ufs_open }, /* open */ + { &vop_close_desc, ufs_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, ufs_getattr }, /* getattr */ + { &vop_setattr_desc, ufs_setattr }, /* setattr */ + { &vop_read_desc, ffs_read }, /* read */ + { &vop_write_desc, ffs_write }, /* write */ + { &vop_ioctl_desc, ufs_ioctl }, /* ioctl */ + { &vop_select_desc, ufs_select }, /* select */ + { &vop_mmap_desc, ufs_mmap }, /* mmap */ + { &vop_fsync_desc, ffs_fsync }, /* fsync */ + { &vop_seek_desc, ufs_seek }, /* seek */ + { &vop_remove_desc, ufs_remove }, /* remove */ + { &vop_link_desc, ufs_link }, /* link */ + { &vop_rename_desc, ufs_rename }, /* rename */ + { &vop_mkdir_desc, ufs_mkdir }, /* mkdir */ + { &vop_rmdir_desc, ufs_rmdir }, /* rmdir */ + { &vop_symlink_desc, ufs_symlink }, /* symlink */ + { &vop_readdir_desc, ufs_readdir }, /* readdir */ + { &vop_readlink_desc, ufs_readlink }, /* readlink */ + { &vop_abortop_desc, ufs_abortop }, /* abortop */ + { &vop_inactive_desc, ufs_inactive }, /* inactive */ + { &vop_reclaim_desc, ufs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, ufs_bmap }, /* bmap */ + { &vop_strategy_desc, ufs_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, ufs_pathconf }, /* pathconf */ + { &vop_advlock_desc, ufs_advlock }, /* advlock */ + { &vop_blkatoff_desc, ffs_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, ffs_valloc }, /* valloc */ + { &vop_reallocblks_desc, ffs_reallocblks }, /* reallocblks */ + { &vop_vfree_desc, ffs_vfree }, /* vfree */ + { 
&vop_truncate_desc, ffs_truncate }, /* truncate */ + { &vop_update_desc, ffs_update }, /* update */ + { &vop_bwrite_desc, vn_bwrite }, + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc ffs_vnodeop_opv_desc = + { &ffs_vnodeop_p, ffs_vnodeop_entries }; + +int (**ffs_specop_p)(); +struct vnodeopv_entry_desc ffs_specop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, spec_lookup }, /* lookup */ + { &vop_create_desc, spec_create }, /* create */ + { &vop_mknod_desc, spec_mknod }, /* mknod */ + { &vop_open_desc, spec_open }, /* open */ + { &vop_close_desc, ufsspec_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, ufs_getattr }, /* getattr */ + { &vop_setattr_desc, ufs_setattr }, /* setattr */ + { &vop_read_desc, ufsspec_read }, /* read */ + { &vop_write_desc, ufsspec_write }, /* write */ + { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ + { &vop_select_desc, spec_select }, /* select */ + { &vop_mmap_desc, spec_mmap }, /* mmap */ + { &vop_fsync_desc, ffs_fsync }, /* fsync */ + { &vop_seek_desc, spec_seek }, /* seek */ + { &vop_remove_desc, spec_remove }, /* remove */ + { &vop_link_desc, spec_link }, /* link */ + { &vop_rename_desc, spec_rename }, /* rename */ + { &vop_mkdir_desc, spec_mkdir }, /* mkdir */ + { &vop_rmdir_desc, spec_rmdir }, /* rmdir */ + { &vop_symlink_desc, spec_symlink }, /* symlink */ + { &vop_readdir_desc, spec_readdir }, /* readdir */ + { &vop_readlink_desc, spec_readlink }, /* readlink */ + { &vop_abortop_desc, spec_abortop }, /* abortop */ + { &vop_inactive_desc, ufs_inactive }, /* inactive */ + { &vop_reclaim_desc, ufs_reclaim }, /* reclaim */ + { &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, spec_bmap }, /* bmap */ + { &vop_strategy_desc, spec_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, 
spec_pathconf }, /* pathconf */ + { &vop_advlock_desc, spec_advlock }, /* advlock */ + { &vop_blkatoff_desc, spec_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, spec_valloc }, /* valloc */ + { &vop_reallocblks_desc, spec_reallocblks }, /* reallocblks */ + { &vop_vfree_desc, ffs_vfree }, /* vfree */ + { &vop_truncate_desc, spec_truncate }, /* truncate */ + { &vop_update_desc, ffs_update }, /* update */ + { &vop_bwrite_desc, vn_bwrite }, + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc ffs_specop_opv_desc = + { &ffs_specop_p, ffs_specop_entries }; + +#ifdef FIFO +int (**ffs_fifoop_p)(); +struct vnodeopv_entry_desc ffs_fifoop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, fifo_lookup }, /* lookup */ + { &vop_create_desc, fifo_create }, /* create */ + { &vop_mknod_desc, fifo_mknod }, /* mknod */ + { &vop_open_desc, fifo_open }, /* open */ + { &vop_close_desc, ufsfifo_close }, /* close */ + { &vop_access_desc, ufs_access }, /* access */ + { &vop_getattr_desc, ufs_getattr }, /* getattr */ + { &vop_setattr_desc, ufs_setattr }, /* setattr */ + { &vop_read_desc, ufsfifo_read }, /* read */ + { &vop_write_desc, ufsfifo_write }, /* write */ + { &vop_ioctl_desc, fifo_ioctl }, /* ioctl */ + { &vop_select_desc, fifo_select }, /* select */ + { &vop_mmap_desc, fifo_mmap }, /* mmap */ + { &vop_fsync_desc, ffs_fsync }, /* fsync */ + { &vop_seek_desc, fifo_seek }, /* seek */ + { &vop_remove_desc, fifo_remove }, /* remove */ + { &vop_link_desc, fifo_link }, /* link */ + { &vop_rename_desc, fifo_rename }, /* rename */ + { &vop_mkdir_desc, fifo_mkdir }, /* mkdir */ + { &vop_rmdir_desc, fifo_rmdir }, /* rmdir */ + { &vop_symlink_desc, fifo_symlink }, /* symlink */ + { &vop_readdir_desc, fifo_readdir }, /* readdir */ + { &vop_readlink_desc, fifo_readlink }, /* readlink */ + { &vop_abortop_desc, fifo_abortop }, /* abortop */ + { &vop_inactive_desc, ufs_inactive }, /* inactive */ + { &vop_reclaim_desc, ufs_reclaim }, /* reclaim */ + 
{ &vop_lock_desc, ufs_lock }, /* lock */ + { &vop_unlock_desc, ufs_unlock }, /* unlock */ + { &vop_bmap_desc, fifo_bmap }, /* bmap */ + { &vop_strategy_desc, fifo_strategy }, /* strategy */ + { &vop_print_desc, ufs_print }, /* print */ + { &vop_islocked_desc, ufs_islocked }, /* islocked */ + { &vop_pathconf_desc, fifo_pathconf }, /* pathconf */ + { &vop_advlock_desc, fifo_advlock }, /* advlock */ + { &vop_blkatoff_desc, fifo_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, fifo_valloc }, /* valloc */ + { &vop_reallocblks_desc, fifo_reallocblks }, /* reallocblks */ + { &vop_vfree_desc, ffs_vfree }, /* vfree */ + { &vop_truncate_desc, fifo_truncate }, /* truncate */ + { &vop_update_desc, ffs_update }, /* update */ + { &vop_bwrite_desc, vn_bwrite }, + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc ffs_fifoop_opv_desc = + { &ffs_fifoop_p, ffs_fifoop_entries }; +#endif /* FIFO */ + +#ifdef DEBUG +/* + * Enabling cluster read/write operations. + */ +#include +int doclusterread = 1; +struct ctldebug debug11 = { "doclusterread", &doclusterread }; +int doclusterwrite = 1; +struct ctldebug debug12 = { "doclusterwrite", &doclusterwrite }; +#else +/* XXX for ufs_readwrite */ +#define doclusterread 1 +#define doclusterwrite 1 +#endif + +#include + +/* + * Synch an open file. + * + * Hands every buffer on the vnode's dirty list to bawrite() or + * bwrite(), then, if a_waitfor is MNT_WAIT, sleeps while + * v_numoutput is non-zero, and finally pushes the inode with + * VOP_UPDATE. Returns the value of VOP_UPDATE. + */ +/* ARGSUSED */ +int +ffs_fsync(ap) + struct vop_fsync_args /* { + struct vnode *a_vp; + struct ucred *a_cred; + int a_waitfor; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct buf *bp; + struct timeval tv; + struct buf *nbp; + int s; + + /* + * Flush all dirty buffers associated with a vnode. 
+ */ +loop: + s = splbio(); + for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { + nbp = bp->b_vnbufs.le_next; + if ((bp->b_flags & B_BUSY)) + continue; + if ((bp->b_flags & B_DELWRI) == 0) + panic("ffs_fsync: not dirty"); + bremfree(bp); + bp->b_flags |= B_BUSY; + splx(s); + /* + * Wait for I/O associated with indirect blocks to complete, + * since there is no way to quickly wait for them below. + */ + if (bp->b_vp == vp || ap->a_waitfor == MNT_NOWAIT) + (void) bawrite(bp); + else + (void) bwrite(bp); + goto loop; + } + if (ap->a_waitfor == MNT_WAIT) { + while (vp->v_numoutput) { + vp->v_flag |= VBWAIT; + sleep((caddr_t)&vp->v_numoutput, PRIBIO + 1); + } +#ifdef DIAGNOSTIC + if (vp->v_dirtyblkhd.lh_first) { + vprint("ffs_fsync: dirty", vp); + /* + * Restore the saved priority before restarting: the + * splbio() at `loop' would otherwise store the already + * raised level in `s', and the final splx(s) would + * leave us at splbio on return. + */ + splx(s); + goto loop; + } +#endif + } + splx(s); + tv = time; + return (VOP_UPDATE(ap->a_vp, &tv, &tv, ap->a_waitfor == MNT_WAIT)); +} diff --git a/sys/ufs/ffs/fs.h b/sys/ufs/ffs/fs.h new file mode 100644 index 00000000000..bef052feef4 --- /dev/null +++ b/sys/ufs/ffs/fs.h @@ -0,0 +1,489 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)fs.h 8.7 (Berkeley) 4/19/94 + */ + +/* + * Each disk drive contains some number of file systems. + * A file system consists of a number of cylinder groups. + * Each cylinder group has inodes and data. + * + * A file system is described by its super-block, which in turn + * describes the cylinder groups. The super-block is critical + * data and is replicated in each cylinder group to protect against + * catastrophic loss. This is done at `newfs' time and the critical + * super-block data does not change, so the copies need not be + * referenced further unless disaster strikes. + * + * For file system fs, the offsets of the various blocks of interest + * are given in the super block as: + * [fs->fs_sblkno] Super-block + * [fs->fs_cblkno] Cylinder group block + * [fs->fs_iblkno] Inode blocks + * [fs->fs_dblkno] Data blocks + * The beginning of cylinder group cg in fs, is given by + * the ``cgbase(fs, cg)'' macro. + * + * The first boot and super blocks are given in absolute disk addresses. 
+ * The byte-offset forms are preferred, as they don't imply a sector size. + */ +#define BBSIZE 8192 +#define SBSIZE 8192 +#define BBOFF ((off_t)(0)) +#define SBOFF ((off_t)(BBOFF + BBSIZE)) +#define BBLOCK ((daddr_t)(0)) +#define SBLOCK ((daddr_t)(BBLOCK + BBSIZE / DEV_BSIZE)) + +/* + * Addresses stored in inodes are capable of addressing fragments + * of `blocks'. File system blocks of at most size MAXBSIZE can + * be optionally broken into 2, 4, or 8 pieces, each of which is + * addressable; these pieces may be DEV_BSIZE, or some multiple of + * a DEV_BSIZE unit. + * + * Large files consist of exclusively large data blocks. To avoid + * undue wasted disk space, the last data block of a small file may be + * allocated as only as many fragments of a large block as are + * necessary. The file system format retains only a single pointer + * to such a fragment, which is a piece of a single large block that + * has been divided. The size of such a fragment is determinable from + * information in the inode, using the ``blksize(fs, ip, lbn)'' macro. + * + * The file system records space availability at the fragment level; + * to determine block availability, aligned fragments are examined. + */ + +/* + * MINBSIZE is the smallest allowable block size. + * In order to ensure that it is possible to create files of size + * 2^32 with only two levels of indirection, MINBSIZE is set to 4096. + * MINBSIZE must be big enough to hold a cylinder group block, + * thus changes to (struct cg) must keep its size within MINBSIZE. + * Note that super blocks are always of size SBSIZE, + * and that both SBSIZE and MAXBSIZE must be >= MINBSIZE. + */ +#define MINBSIZE 4096 + +/* + * The path name on which the file system is mounted is maintained + * in fs_fsmnt. MAXMNTLEN defines the amount of space allocated in + * the super block for this name. + * The limit on the amount of summary information per file system + * is defined by MAXCSBUFS. 
It is currently parameterized for a + * maximum of two million cylinders. + */ +#define MAXMNTLEN 512 +#define MAXCSBUFS 32 + +/* + * A summary of contiguous blocks of various sizes is maintained + * in each cylinder group. Normally this is set by the initial + * value of fs_maxcontig. To conserve space, a maximum summary size + * is set by FS_MAXCONTIG. + */ +#define FS_MAXCONTIG 16 + +/* + * MINFREE gives the minimum acceptable percentage of file system + * blocks which must remain free. If the freelist drops below this level + * only the superuser may continue to allocate blocks. This may + * be set to 0 if no reserve of free blocks is deemed necessary, + * however throughput drops by fifty percent if the file system + * is run at between 95% and 100% full; thus the minimum default + * value of fs_minfree is 5%. To get good clustering performance, + * 10% is a better choice, but the default defined here is 5%. + * With 10% free space, fragmentation is not a + * problem, so we choose to optimize for time. + */ +#define MINFREE 5 +#define DEFAULTOPT FS_OPTTIME + +/* + * Per cylinder group information; summarized in blocks allocated + * from first cylinder group data blocks. These blocks have to be + * read in from fs_csaddr (size fs_cssize) in addition to the + * super block. + * + * N.B. sizeof(struct csum) must be a power of two in order for + * the ``fs_cs'' macro to work (see below). + */ +struct csum { + long cs_ndir; /* number of directories */ + long cs_nbfree; /* number of free blocks */ + long cs_nifree; /* number of free inodes */ + long cs_nffree; /* number of free frags */ +}; + +/* + * Super block for a file system. 
+ */ +struct fs { + struct fs *fs_link; /* linked list of file systems */ + struct fs *fs_rlink; /* used for incore super blocks */ + daddr_t fs_sblkno; /* addr of super-block in filesys */ + daddr_t fs_cblkno; /* offset of cyl-block in filesys */ + daddr_t fs_iblkno; /* offset of inode-blocks in filesys */ + daddr_t fs_dblkno; /* offset of first data after cg */ + long fs_cgoffset; /* cylinder group offset in cylinder */ + long fs_cgmask; /* used to calc mod fs_ntrak */ + time_t fs_time; /* last time written */ + long fs_size; /* number of blocks in fs */ + long fs_dsize; /* number of data blocks in fs */ + long fs_ncg; /* number of cylinder groups */ + long fs_bsize; /* size of basic blocks in fs */ + long fs_fsize; /* size of frag blocks in fs */ + long fs_frag; /* number of frags in a block in fs */ +/* these are configuration parameters */ + long fs_minfree; /* minimum percentage of free blocks */ + long fs_rotdelay; /* num of ms for optimal next block */ + long fs_rps; /* disk revolutions per second */ +/* these fields can be computed from the others */ + long fs_bmask; /* ``blkoff'' calc of blk offsets */ + long fs_fmask; /* ``fragoff'' calc of frag offsets */ + long fs_bshift; /* ``lblkno'' calc of logical blkno */ + long fs_fshift; /* ``numfrags'' calc number of frags */ +/* these are configuration parameters */ + long fs_maxcontig; /* max number of contiguous blks */ + long fs_maxbpg; /* max number of blks per cyl group */ +/* these fields can be computed from the others */ + long fs_fragshift; /* block to frag shift */ + long fs_fsbtodb; /* fsbtodb and dbtofsb shift constant */ + long fs_sbsize; /* actual size of super block */ + long fs_csmask; /* csum block offset */ + long fs_csshift; /* csum block number */ + long fs_nindir; /* value of NINDIR */ + long fs_inopb; /* value of INOPB */ + long fs_nspf; /* value of NSPF */ +/* yet another configuration parameter */ + long fs_optim; /* optimization preference, see below */ +/* these fields are derived 
from the hardware */ + long fs_npsect; /* # sectors/track including spares */ + long fs_interleave; /* hardware sector interleave */ + long fs_trackskew; /* sector 0 skew, per track */ + long fs_headswitch; /* head switch time, usec */ + long fs_trkseek; /* track-to-track seek, usec */ +/* sizes determined by number of cylinder groups and their sizes */ + daddr_t fs_csaddr; /* blk addr of cyl grp summary area */ + long fs_cssize; /* size of cyl grp summary area */ + long fs_cgsize; /* cylinder group size */ +/* these fields are derived from the hardware */ + long fs_ntrak; /* tracks per cylinder */ + long fs_nsect; /* sectors per track */ + long fs_spc; /* sectors per cylinder */ +/* this comes from the disk driver partitioning */ + long fs_ncyl; /* cylinders in file system */ +/* these fields can be computed from the others */ + long fs_cpg; /* cylinders per group */ + long fs_ipg; /* inodes per group */ + long fs_fpg; /* blocks per group * fs_frag */ +/* this data must be re-computed after crashes */ + struct csum fs_cstotal; /* cylinder summary information */ +/* these fields are cleared at mount time */ + char fs_fmod; /* super block modified flag */ + char fs_clean; /* file system is clean flag */ + char fs_ronly; /* mounted read-only flag */ + char fs_flags; /* currently unused flag */ + char fs_fsmnt[MAXMNTLEN]; /* name mounted on */ +/* these fields retain the current block allocation info */ + long fs_cgrotor; /* last cg searched */ + struct csum *fs_csp[MAXCSBUFS];/* list of fs_cs info buffers */ + long fs_cpc; /* cyl per cycle in postbl */ + short fs_opostbl[16][8]; /* old rotation block list head */ + long fs_sparecon[50]; /* reserved for future constants */ + long fs_contigsumsize; /* size of cluster summary array */ + long fs_maxsymlinklen; /* max length of an internal symlink */ + long fs_inodefmt; /* format of on-disk inodes */ + u_quad_t fs_maxfilesize; /* maximum representable file size */ + quad_t fs_qbmask; /* ~fs_bmask - for use with quad size 
*/ + quad_t fs_qfmask; /* ~fs_fmask - for use with quad size */ + long fs_state; /* validate fs_clean field */ + long fs_postblformat; /* format of positional layout tables */ + long fs_nrpos; /* number of rotational positions */ + long fs_postbloff; /* (short) rotation block list head */ + long fs_rotbloff; /* (u_char) blocks for each rotation */ + long fs_magic; /* magic number */ + u_char fs_space[1]; /* list of blocks for each rotation */ +/* actually longer */ +}; +/* + * Filesystem identification + */ +#define FS_MAGIC 0x011954 /* the fast filesystem magic number */ +#define FS_OKAY 0x7c269d38 /* superblock checksum */ +#define FS_42INODEFMT -1 /* 4.2BSD inode format */ +#define FS_44INODEFMT 2 /* 4.4BSD inode format */ +/* + * Preference for optimization. + */ +#define FS_OPTTIME 0 /* minimize allocation time */ +#define FS_OPTSPACE 1 /* minimize disk fragmentation */ + +/* + * Rotational layout table format types + */ +#define FS_42POSTBLFMT -1 /* 4.2BSD rotational table format */ +#define FS_DYNAMICPOSTBLFMT 1 /* dynamic rotational table format */ +/* + * Macros for access to superblock array structures + */ +#define fs_postbl(fs, cylno) \ + (((fs)->fs_postblformat == FS_42POSTBLFMT) \ + ? ((fs)->fs_opostbl[cylno]) \ + : ((short *)((char *)(fs) + (fs)->fs_postbloff) + (cylno) * (fs)->fs_nrpos)) +#define fs_rotbl(fs) \ + (((fs)->fs_postblformat == FS_42POSTBLFMT) \ + ? ((fs)->fs_space) \ + : ((u_char *)((char *)(fs) + (fs)->fs_rotbloff))) + +/* + * The size of a cylinder group is calculated by CGSIZE. The maximum size + * is limited by the fact that cylinder groups are at most one block. + * Its size is derived from the size of the maps maintained in the + * cylinder group and the (struct cg) size. 
+ */ +#define CGSIZE(fs) \ + /* base cg */ (sizeof(struct cg) + sizeof(long) + \ + /* blktot size */ (fs)->fs_cpg * sizeof(long) + \ + /* blks size */ (fs)->fs_cpg * (fs)->fs_nrpos * sizeof(short) + \ + /* inode map */ howmany((fs)->fs_ipg, NBBY) + \ + /* block map */ howmany((fs)->fs_cpg * (fs)->fs_spc / NSPF(fs), NBBY) +\ + /* if present */ ((fs)->fs_contigsumsize <= 0 ? 0 : \ + /* cluster sum */ (fs)->fs_contigsumsize * sizeof(long) + \ + /* cluster map */ howmany((fs)->fs_cpg * (fs)->fs_spc / NSPB(fs), NBBY))) + +/* + * Convert cylinder group to base address of its global summary info. + * + * N.B. This macro assumes that sizeof(struct csum) is a power of two. + */ +#define fs_cs(fs, indx) \ + fs_csp[(indx) >> (fs)->fs_csshift][(indx) & ~(fs)->fs_csmask] + +/* + * Cylinder group block for a file system. + */ +#define CG_MAGIC 0x090255 +struct cg { + struct cg *cg_link; /* linked list of cyl groups */ + long cg_magic; /* magic number */ + time_t cg_time; /* time last written */ + long cg_cgx; /* we are the cgx'th cylinder group */ + short cg_ncyl; /* number of cyl's this cg */ + short cg_niblk; /* number of inode blocks this cg */ + long cg_ndblk; /* number of data blocks this cg */ + struct csum cg_cs; /* cylinder summary information */ + long cg_rotor; /* position of last used block */ + long cg_frotor; /* position of last used frag */ + long cg_irotor; /* position of last used inode */ + long cg_frsum[MAXFRAG]; /* counts of available frags */ + long cg_btotoff; /* (long) block totals per cylinder */ + long cg_boff; /* (short) free block positions */ + long cg_iusedoff; /* (char) used inode map */ + long cg_freeoff; /* (u_char) free block map */ + long cg_nextfreeoff; /* (u_char) next available space */ + long cg_clustersumoff; /* (long) counts of avail clusters */ + long cg_clusteroff; /* (char) free cluster map */ + long cg_nclusterblks; /* number of clusters this cg */ + long cg_sparecon[13]; /* reserved for future use */ + u_char cg_space[1]; /* space for 
cylinder group maps */ +/* actually longer */ +}; +/* + * Macros for access to cylinder group array structures + */ +#define cg_blktot(cgp) \ + (((cgp)->cg_magic != CG_MAGIC) \ + ? (((struct ocg *)(cgp))->cg_btot) \ + : ((long *)((char *)(cgp) + (cgp)->cg_btotoff))) +#define cg_blks(fs, cgp, cylno) \ + (((cgp)->cg_magic != CG_MAGIC) \ + ? (((struct ocg *)(cgp))->cg_b[cylno]) \ + : ((short *)((char *)(cgp) + (cgp)->cg_boff) + (cylno) * (fs)->fs_nrpos)) +#define cg_inosused(cgp) \ + (((cgp)->cg_magic != CG_MAGIC) \ + ? (((struct ocg *)(cgp))->cg_iused) \ + : ((char *)((char *)(cgp) + (cgp)->cg_iusedoff))) +#define cg_blksfree(cgp) \ + (((cgp)->cg_magic != CG_MAGIC) \ + ? (((struct ocg *)(cgp))->cg_free) \ + : ((u_char *)((char *)(cgp) + (cgp)->cg_freeoff))) +#define cg_chkmagic(cgp) \ + ((cgp)->cg_magic == CG_MAGIC || ((struct ocg *)(cgp))->cg_magic == CG_MAGIC) +#define cg_clustersfree(cgp) \ + ((u_char *)((char *)(cgp) + (cgp)->cg_clusteroff)) +#define cg_clustersum(cgp) \ + ((long *)((char *)(cgp) + (cgp)->cg_clustersumoff)) + +/* + * The following structure is defined + * for compatibility with old file systems. 
+ */ +struct ocg { + struct ocg *cg_link; /* linked list of cyl groups */ + struct ocg *cg_rlink; /* used for incore cyl groups */ + time_t cg_time; /* time last written */ + long cg_cgx; /* we are the cgx'th cylinder group */ + short cg_ncyl; /* number of cyl's this cg */ + short cg_niblk; /* number of inode blocks this cg */ + long cg_ndblk; /* number of data blocks this cg */ + struct csum cg_cs; /* cylinder summary information */ + long cg_rotor; /* position of last used block */ + long cg_frotor; /* position of last used frag */ + long cg_irotor; /* position of last used inode */ + long cg_frsum[8]; /* counts of available frags */ + long cg_btot[32]; /* block totals per cylinder */ + short cg_b[32][8]; /* positions of free blocks */ + char cg_iused[256]; /* used inode map */ + long cg_magic; /* magic number */ + u_char cg_free[1]; /* free block map */ +/* actually longer */ +}; + +/* + * Turn file system block numbers into disk block addresses. + * This maps file system blocks to device size blocks. + */ +#define fsbtodb(fs, b) ((b) << (fs)->fs_fsbtodb) +#define dbtofsb(fs, b) ((b) >> (fs)->fs_fsbtodb) + +/* + * Cylinder group macros to locate things in cylinder groups. + * They calc file system addresses of cylinder group data structures. + */ +#define cgbase(fs, c) ((daddr_t)((fs)->fs_fpg * (c))) +#define cgdmin(fs, c) (cgstart(fs, c) + (fs)->fs_dblkno) /* 1st data */ +#define cgimin(fs, c) (cgstart(fs, c) + (fs)->fs_iblkno) /* inode blk */ +#define cgsblock(fs, c) (cgstart(fs, c) + (fs)->fs_sblkno) /* super blk */ +#define cgtod(fs, c) (cgstart(fs, c) + (fs)->fs_cblkno) /* cg block */ +#define cgstart(fs, c) \ + (cgbase(fs, c) + (fs)->fs_cgoffset * ((c) & ~((fs)->fs_cgmask))) + +/* + * Macros for handling inode numbers: + * inode number to file system block offset. + * inode number to cylinder group number. + * inode number to file system block address. 
+ */ +#define ino_to_cg(fs, x) ((x) / (fs)->fs_ipg) +#define ino_to_fsba(fs, x) \ + ((daddr_t)(cgimin(fs, ino_to_cg(fs, x)) + \ + (blkstofrags((fs), (((x) % (fs)->fs_ipg) / INOPB(fs)))))) +#define ino_to_fsbo(fs, x) ((x) % INOPB(fs)) + +/* + * Give cylinder group number for a file system block. + * Give cylinder group block number for a file system block. + */ +#define dtog(fs, d) ((d) / (fs)->fs_fpg) +#define dtogd(fs, d) ((d) % (fs)->fs_fpg) + +/* + * Extract the bits for a block from a map. + * Compute the cylinder and rotational position of a cyl block addr. + */ +#define blkmap(fs, map, loc) \ + (((map)[(loc) / NBBY] >> ((loc) % NBBY)) & (0xff >> (NBBY - (fs)->fs_frag))) +#define cbtocylno(fs, bno) \ + ((bno) * NSPF(fs) / (fs)->fs_spc) +#define cbtorpos(fs, bno) \ + (((bno) * NSPF(fs) % (fs)->fs_spc / (fs)->fs_nsect * (fs)->fs_trackskew + \ + (bno) * NSPF(fs) % (fs)->fs_spc % (fs)->fs_nsect * (fs)->fs_interleave) % \ + (fs)->fs_nsect * (fs)->fs_nrpos / (fs)->fs_npsect) + +/* + * The following macros optimize certain frequently calculated + * quantities by using shifts and masks in place of divisions + * modulos and multiplications. 
+ */ +#define blkoff(fs, loc) /* calculates (loc % fs->fs_bsize) */ \ + ((loc) & (fs)->fs_qbmask) +#define fragoff(fs, loc) /* calculates (loc % fs->fs_fsize) */ \ + ((loc) & (fs)->fs_qfmask) +#define lblktosize(fs, blk) /* calculates (blk * fs->fs_bsize) */ \ + ((blk) << (fs)->fs_bshift) +#define lblkno(fs, loc) /* calculates (loc / fs->fs_bsize) */ \ + ((loc) >> (fs)->fs_bshift) +#define numfrags(fs, loc) /* calculates (loc / fs->fs_fsize) */ \ + ((loc) >> (fs)->fs_fshift) +#define blkroundup(fs, size) /* calculates roundup(size, fs->fs_bsize) */ \ + (((size) + (fs)->fs_qbmask) & (fs)->fs_bmask) +#define fragroundup(fs, size) /* calculates roundup(size, fs->fs_fsize) */ \ + (((size) + (fs)->fs_qfmask) & (fs)->fs_fmask) +#define fragstoblks(fs, frags) /* calculates (frags / fs->fs_frag) */ \ + ((frags) >> (fs)->fs_fragshift) +#define blkstofrags(fs, blks) /* calculates (blks * fs->fs_frag) */ \ + ((blks) << (fs)->fs_fragshift) +#define fragnum(fs, fsb) /* calculates (fsb % fs->fs_frag) */ \ + ((fsb) & ((fs)->fs_frag - 1)) +#define blknum(fs, fsb) /* calculates rounddown(fsb, fs->fs_frag) */ \ + ((fsb) &~ ((fs)->fs_frag - 1)) + +/* + * Determine the number of available frags given a + * percentage to hold in reserve + */ +#define freespace(fs, percentreserved) \ + (blkstofrags((fs), (fs)->fs_cstotal.cs_nbfree) + \ + (fs)->fs_cstotal.cs_nffree - ((fs)->fs_dsize * (percentreserved) / 100)) + +/* + * Determining the size of a file block in the file system. + */ +#define blksize(fs, ip, lbn) \ + (((lbn) >= NDADDR || (ip)->i_size >= ((lbn) + 1) << (fs)->fs_bshift) \ + ? (fs)->fs_bsize \ + : (fragroundup(fs, blkoff(fs, (ip)->i_size)))) +#define dblksize(fs, dip, lbn) \ + (((lbn) >= NDADDR || (dip)->di_size >= ((lbn) + 1) << (fs)->fs_bshift) \ + ? (fs)->fs_bsize \ + : (fragroundup(fs, blkoff(fs, (dip)->di_size)))) + +/* + * Number of disk sectors per block; assumes DEV_BSIZE byte sector size. 
+ */ +#define NSPB(fs) ((fs)->fs_nspf << (fs)->fs_fragshift) +#define NSPF(fs) ((fs)->fs_nspf) + +/* + * INOPB is the number of inodes in a secondary storage block. + */ +#define INOPB(fs) ((fs)->fs_inopb) +#define INOPF(fs) ((fs)->fs_inopb >> (fs)->fs_fragshift) + +/* + * NINDIR is the number of indirects in a file system block. + */ +#define NINDIR(fs) ((fs)->fs_nindir) + +extern int inside[], around[]; +extern u_char *fragtbl[]; diff --git a/sys/ufs/lfs/README b/sys/ufs/lfs/README new file mode 100644 index 00000000000..724b18fb9ea --- /dev/null +++ b/sys/ufs/lfs/README @@ -0,0 +1,139 @@ +# @(#)README 8.1 (Berkeley) 6/11/93 + +The file system is reasonably stable, but incomplete. There are +places where cleaning performance can be improved dramatically (see +comments in lfs_syscalls.c). For details on the implementation, +performance and why garbage collection always wins, see Dr. Margo +Seltzer's thesis available for anonymous ftp from toe.cs.berkeley.edu, +in the directory pub/personal/margo/thesis.ps.Z, or the January 1993 +USENIX paper. + +Missing Functionality: + Multiple block sizes and/or fragments are not yet implemented. + +---------- +The disk is laid out in segments. The first segment starts 8K into the +disk (the first 8K is used for boot information). Each segment is composed +of the following: + + An optional super block + One or more groups of: + segment summary + 0 or more data blocks + 0 or more inode blocks + +The segment summary and inode/data blocks start after the super block (if +present), and grow toward the end of the segment. + + _______________________________________________ + | | | | | + | summary | data/inode | summary | data/inode | + | block | blocks | block | blocks | ... + |_________|____________|_________|____________| + +The data/inode blocks following a summary block are described by the +summary block. 
In order to permit the segment to be written in any order +and in a forward direction only, a checksum is calculated across the +blocks described by the summary. Additionally, the summary is checksummed +and timestamped. Both of these are intended for recovery; the former is +to make it easy to determine that it *is* a summary block and the latter +is to make it easy to determine when recovery is finished for partially +written segments. These checksums are also used by the cleaner. + + Summary block (detail) + ________________ + | sum cksum | + | data cksum | + | next segment | + | timestamp | + | FINFO count | + | inode count | + | flags | + |______________| + | FINFO-1 | 0 or more file info structures, identifying the + | . | blocks in the segment. + | . | + | . | + | FINFO-N | + | inode-N | + | . | + | . | + | . | 0 or more inode daddr_t's, identifying the inode + | inode-1 | blocks in the segment. + |______________| + +Inode blocks are blocks of on-disk inodes in the same format as those in +the FFS. However, spare[0] contains the inode number of the inode so we +can find a particular inode on a page. They are packed page_size / +sizeof(inode) to a block. Data blocks are exactly as in the FFS. Both +inodes and data blocks move around the file system at will. + +The file system is described by a super-block which is replicated and +occurs as the first block of the first and other segments. (The maximum +number of super-blocks is MAXNUMSB). Each super-block maintains a list +of the disk addresses of all the super-blocks. The super-block maintains +a small amount of checkpoint information, essentially just enough to find +the inode for the IFILE (fs->lfs_idaddr). + +The IFILE is visible in the file system, as inode number IFILE_INUM. It +contains information shared between the kernel and various user processes. + + Ifile (detail) + ________________ + | cleaner info | Cleaner information per file system. (Page + | | granularity.) 
+ |______________| + | segment | Space available and last modified times per + | usage table | segment. (Page granularity.) + |______________| + | IFILE-1 | Per inode status information: current version #, + | . | if currently allocated, last access time and + | . | current disk address of containing inode block. + | . | If current disk address is LFS_UNUSED_DADDR, the + | IFILE-N | inode is not in use, and it's on the free list. + |______________| + + +First Segment at Creation Time: +_____________________________________________________________ +| | | | | | | | +| 8K pad | Super | summary | inode | ifile | root | l + f | +| | block | | block | | dir | dir | +|________|_______|_________|_______|_______|_______|_______| + ^ + Segment starts here. + +Some differences from the Sprite LFS implementation. + +1. The Sprite LFS implementation placed the ifile metadata and the super block + at fixed locations. This implementation replicates the super block + and puts each at a fixed location. The checkpoint data is divided into + two parts -- just enough information to find the IFILE is stored in + two of the super blocks, although it is not toggled between them as in + the Sprite implementation. (This was deliberate, to avoid a single + point of failure.) The remaining checkpoint information is treated as + a regular file, which means that the cleaner info, the segment usage + table and the ifile metadata are stored in normal log segments. + (Tastes great, less filling...) + +2. The segment layout is radically different in Sprite; this implementation + uses something a lot like network framing, where data/inode blocks are + written asynchronously, and a checksum is used to validate any set of + summary and data/inode blocks. Sprite writes summary blocks synchronously + after the data/inode blocks have been written and the existence of the + summary block validates the data/inode blocks. 
This permits us to write + everything contiguously, even partial segments and their summaries, whereas + Sprite is forced to seek (from the end of the data inode to the summary + which lives at the end of the segment). Additionally, writing the summary + synchronously should cost about 1/2 a rotation per summary. + +3. Sprite LFS distinguishes between different types of blocks in the segment. + Other than inode blocks and data blocks, we don't. + +4. Sprite LFS traverses the IFILE looking for free blocks. We maintain a + free list threaded through the IFILE entries. + +5. The cleaner runs in user space, as opposed to kernel space. It shares + information with the kernel by reading/writing the IFILE and through + cleaner specific system calls. + diff --git a/sys/ufs/lfs/TODO b/sys/ufs/lfs/TODO new file mode 100644 index 00000000000..ace8f5eaef6 --- /dev/null +++ b/sys/ufs/lfs/TODO @@ -0,0 +1,116 @@ +# @(#)TODO 8.1 (Berkeley) 6/11/93 + +NOTE: Changed the lookup on a page of inodes to search from the back +in case the same inode gets written twice on the same page. + +Make sure that if you are writing a file, but not all the blocks +make it into a single segment, that you do not write the inode in +that segment. + +Keith: + Why not delete the lfs_bmapv call, just mark everything dirty + that isn't deleted/truncated? Get some numbers about + what percentage of the stuff that the cleaner thinks + might be live is live. If it's high, get rid of lfs_bmapv. + + There is a nasty problem in that it may take *more* room to write + the data to clean a segment than is returned by the new segment + because of indirect blocks in segment 2 being dirtied by the data + being copied into the log from segment 1. The suggested solution + at this point is to detect it when we have no space left on the + filesystem, write the extra data into the last segment (leaving + no clean ones), make it a checkpoint and shut down the file system + for fixing by a utility reading the raw partition. 
Argument is + that this should never happen and is practically impossible to fix + since the cleaner would have to theoretically build a model of the + entire filesystem in memory to detect the condition occurring. + A file coalescing cleaner will help avoid the problem, and one + that reads/writes from the raw disk could fix it. + +DONE Currently, inodes are being flushed to disk synchronously upon + creation -- see ufs_makeinode. However, only the inode + is flushed, the directory "name" is written using VOP_BWRITE, + so it's not synchronous. Possible solutions: 1: get some + ordering in the writes so that inode/directory entries get + stuffed into the same segment. 2: do both synchronously + 3: add Mendel's information into the stream so we log + creation/deletion of inodes. 4: do some form of partial + segment when changing the inode (creation/deletion/rename). +DONE Fix i_block increment for indirect blocks. + If the file system is tar'd, extracted on top of another LFS, the + IFILE ain't worth diddly. Is the cleaner writing the IFILE? + If not, let's make it read-only. +DONE Delete unnecessary source from utils in main-line source tree. +DONE Make sure that we're counting meta blocks in the inode i_block count. + Overlap the version and nextfree fields in the IFILE +DONE Vinvalbuf (Kirk): + Why writing blocks that are no longer useful? + Are the semantics of close such that blocks have to be flushed? + How specify in the buf chain the blocks that don't need + to be written? (Different numbering of indirect blocks.) + +Margo: + Change so that only search one sector of inode block file for the + inode by using sector addresses in the ifile instead of + logical disk addresses. + Fix the use of the ifile version field to use the generation + number instead. +DONE Unmount; not doing a bgetvp (VHOLD) in lfs_newbuf call. +DONE Document in the README file where the checkpoint information is + on disk. + Variable block sizes (Margo/Keith). 
+ Switch the byte accounting to sector accounting. +DONE Check lfs.h and make sure that the #defines/structures are all + actually needed. +DONE Add a check in lfs_segment.c so that if the segment is empty, + we don't write it. + Need to keep vnode v_numoutput up to date for pending writes? +DONE USENIX paper (Carl/Margo). + + +Evelyn: + lfsck: If a file that's being executed is deleted, the version number + isn't updated, and lfsck has to figure this out; the case is the same + as having an inode that no directory references, so the file should + be reattached into lost+found. + Recovery/fsck. + +Carl: + Investigate: clustering of reads (if blocks in the segment are ordered, + should read them all) and writes (McVoy paper). + Investigate: should the access time be part of the IFILE: + pro: theoretically, saves disk writes + con: caching inodes should obviate this advantage + the IFILE is already humongous + Cleaner. + Port to OSF/1 (Carl/Keith). + Currently there's no notion of write error checking. + + Failed data/inode writes should be rescheduled (kernel level + bad blocking). + + Failed superblock writes should cause selection of new + superblock for checkpointing. + +FUTURE FANTASIES: ============ + ++ unrm, versioning ++ transactions ++ extended cleaner policies (hot/cold data, data placement) + +============================== +Problem with the concept of multiple buffer headers referencing the segment: +Positives: + Don't lock down 1 segment per file system of physical memory. + Don't copy from buffers to segment memory. + Don't tie down the bus to transfer 1M. + Works on controllers supporting less than large transfers. + Disk can start writing immediately instead of waiting 1/2 rotation + and the full transfer. +Negatives: + Have to do segment write then segment summary write, since the latter + is what verifies that the segment is okay. (Is there another way + to do this?)
+============================== + +The algorithm for selecting the disk addresses of the super-blocks +has to be available to the user program which checks the file system. + +(Currently in newfs, becomes a common subroutine.) diff --git a/sys/ufs/lfs/lfs.h b/sys/ufs/lfs/lfs.h new file mode 100644 index 00000000000..87b8c22ccc0 --- /dev/null +++ b/sys/ufs/lfs/lfs.h @@ -0,0 +1,353 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs.h 8.3 (Berkeley) 9/23/93 + */ + +#define LFS_LABELPAD 8192 /* LFS label size */ +#define LFS_SBPAD 8192 /* LFS superblock size */ + +/* + * XXX + * This is a kluge and NEEDS to go away. + * + * Right now, ufs code handles most of the calls for directory operations + * such as create, mkdir, link, etc. As a result VOP_UPDATE is being + * called with waitfor set (since ffs does these things synchronously). + * Since LFS does not want to do these synchronously, we treat the last + * argument to lfs_update as a set of flags. If LFS_SYNC is set, then + * the update should be synchronous, if not, do it asynchronously. + * Unfortunately, this means that LFS won't work with NFS yet because + * NFS goes through paths that will make normal calls to ufs which will + * call lfs with a last argument of 1. + */ +#define LFS_SYNC 0x02 + +/* On-disk and in-memory checkpoint segment usage structure. 
*/ +typedef struct segusage SEGUSE; +struct segusage { + u_long su_nbytes; /* number of live bytes */ + u_long su_lastmod; /* SEGUSE last modified timestamp */ + u_short su_nsums; /* number of summaries in segment */ + u_short su_ninos; /* number of inode blocks in seg */ +#define SEGUSE_ACTIVE 0x1 /* segment is currently being written */ +#define SEGUSE_DIRTY 0x2 /* segment has data in it */ +#define SEGUSE_SUPERBLOCK 0x4 /* segment contains a superblock */ + u_long su_flags; +}; + +#define SEGUPB(fs) (1 << (fs)->lfs_sushift) +#define SEGTABSIZE_SU(fs) \ + (((fs)->lfs_nseg + SEGUPB(fs) - 1) >> (fs)->lfs_sushift) + +/* On-disk file information. One per file with data blocks in the segment. */ +typedef struct finfo FINFO; +struct finfo { + u_long fi_nblocks; /* number of blocks */ + u_long fi_version; /* version number */ + u_long fi_ino; /* inode number */ + long fi_blocks[1]; /* array of logical block numbers */ +}; + +/* On-disk and in-memory super block. */ +struct lfs { +#define LFS_MAGIC 0x070162 + u_long lfs_magic; /* magic number */ +#define LFS_VERSION 1 + u_long lfs_version; /* version number */ + + u_long lfs_size; /* number of blocks in fs */ + u_long lfs_ssize; /* number of blocks per segment */ + u_long lfs_dsize; /* number of disk blocks in fs */ + u_long lfs_bsize; /* file system block size */ + u_long lfs_fsize; /* size of frag blocks in fs */ + u_long lfs_frag; /* number of frags in a block in fs */ + +/* Checkpoint region. 
*/ + ino_t lfs_free; /* start of the free list */ + u_long lfs_bfree; /* number of free disk blocks */ + u_long lfs_nfiles; /* number of allocated inodes */ + long lfs_avail; /* blocks available for writing */ + u_long lfs_uinodes; /* inodes in cache not yet on disk */ + daddr_t lfs_idaddr; /* inode file disk address */ + ino_t lfs_ifile; /* inode file inode number */ + daddr_t lfs_lastseg; /* address of last segment written */ + daddr_t lfs_nextseg; /* address of next segment to write */ + daddr_t lfs_curseg; /* current segment being written */ + daddr_t lfs_offset; /* offset in curseg for next partial */ + daddr_t lfs_lastpseg; /* address of last partial written */ + u_long lfs_tstamp; /* time stamp */ + +/* These are configuration parameters. */ + u_long lfs_minfree; /* minimum percentage of free blocks */ + +/* These fields can be computed from the others. */ + u_quad_t lfs_maxfilesize; /* maximum representable file size */ + u_long lfs_dbpseg; /* disk blocks per segment */ + u_long lfs_inopb; /* inodes per block */ + u_long lfs_ifpb; /* IFILE entries per block */ + u_long lfs_sepb; /* SEGUSE entries per block */ + u_long lfs_nindir; /* indirect pointers per block */ + u_long lfs_nseg; /* number of segments */ + u_long lfs_nspf; /* number of sectors per fragment */ + u_long lfs_cleansz; /* cleaner info size in blocks */ + u_long lfs_segtabsz; /* segment table size in blocks */ + + u_long lfs_segmask; /* calculate offset within a segment */ + u_long lfs_segshift; /* fast mult/div for segments */ + u_long lfs_bmask; /* calc block offset from file offset */ + u_long lfs_bshift; /* calc block number from file offset */ + u_long lfs_ffmask; /* calc frag offset from file offset */ + u_long lfs_ffshift; /* fast mult/div for frag from file */ + u_long lfs_fbmask; /* calc frag offset from block offset */ + u_long lfs_fbshift; /* fast mult/div for frag from block */ + u_long lfs_fsbtodb; /* fsbtodb and dbtofsb shift constant */ + u_long lfs_sushift; /* fast mult/div for 
segusage table */ + +#define LFS_MIN_SBINTERVAL 5 /* minimum superblock segment spacing */ +#define LFS_MAXNUMSB 10 /* superblock disk offsets */ + daddr_t lfs_sboffs[LFS_MAXNUMSB]; + +/* These fields are set at mount time and are meaningless on disk. */ + struct segment *lfs_sp; /* current segment being written */ + struct vnode *lfs_ivnode; /* vnode for the ifile */ + u_long lfs_seglock; /* single-thread the segment writer */ + pid_t lfs_lockpid; /* pid of lock holder */ + u_long lfs_iocount; /* number of ios pending */ + u_long lfs_writer; /* don't allow any dirops to start */ + u_long lfs_dirops; /* count of active directory ops */ + u_long lfs_doifile; /* Write ifile blocks on next write */ + u_long lfs_nactive; /* Number of segments since last ckp */ + u_char lfs_fmod; /* super block modified flag */ + u_char lfs_clean; /* file system is clean flag */ + u_char lfs_ronly; /* mounted read-only flag */ + u_char lfs_flags; /* currently unused flag */ + u_char lfs_fsmnt[MNAMELEN]; /* name mounted on */ + u_char pad[3]; /* long-align */ + +/* Checksum; valid on disk. */ + u_long lfs_cksum; /* checksum for superblock checking */ +}; + +/* + * Inode 0 is the out-of-band inode number, inode 1 is the inode number for + * the IFILE, the root inode is 2 and the lost+found inode is 3. + */ + +/* Fixed inode numbers. */ +#define LFS_UNUSED_INUM 0 /* out of band inode number */ +#define LFS_IFILE_INUM 1 /* IFILE inode number */ +#define LOSTFOUNDINO 3 /* lost+found inode number */ +#define LFS_FIRST_INUM 4 /* first free inode number */ + +/* Address calculations for metadata located in the inode */ +#define S_INDIR(fs) -NDADDR +#define D_INDIR(fs) (S_INDIR(fs) - NINDIR(fs) - 1) +#define T_INDIR(fs) (D_INDIR(fs) - NINDIR(fs) * NINDIR(fs) - 1) + +/* Unassigned disk address. 
*/ +#define UNASSIGNED -1 + +/* Unused logical block number */ +#define LFS_UNUSED_LBN -1 + +typedef struct ifile IFILE; +struct ifile { + u_long if_version; /* inode version number */ +#define LFS_UNUSED_DADDR 0 /* out-of-band daddr */ + daddr_t if_daddr; /* inode disk address */ + ino_t if_nextfree; /* next-unallocated inode */ +}; + +/* + * Cleaner information structure. This resides in the ifile and is used + * to pass information between the cleaner and the kernel. + */ +typedef struct _cleanerinfo { + u_long clean; /* K: number of clean segments */ + u_long dirty; /* K: number of dirty segments */ +} CLEANERINFO; + +#define CLEANSIZE_SU(fs) \ + ((sizeof(CLEANERINFO) + (fs)->lfs_bsize - 1) >> (fs)->lfs_bshift) + +/* + * All summary blocks are the same size, so we can always read a summary + * block easily from a segment. + */ +#define LFS_SUMMARY_SIZE 512 + +/* On-disk segment summary information */ +typedef struct segsum SEGSUM; +struct segsum { + u_long ss_sumsum; /* check sum of summary block */ + u_long ss_datasum; /* check sum of data */ + daddr_t ss_next; /* next segment */ + u_long ss_create; /* creation time stamp */ + u_short ss_nfinfo; /* number of file info structures */ + u_short ss_ninos; /* number of inodes in summary */ +#define SS_DIROP 0x01 /* segment begins a dirop */ +#define SS_CONT 0x02 /* more partials to finish this write*/ + u_short ss_flags; /* used for directory operations */ + u_short ss_pad; /* extra space */ + /* FINFO's and inode daddr's... */ +}; + +/* NINDIR is the number of indirects in a file system block. */ +#define NINDIR(fs) ((fs)->lfs_nindir) + +/* INOPB is the number of inodes in a secondary storage block. 
*/ +#define INOPB(fs) ((fs)->lfs_inopb) + +#define blksize(fs) ((fs)->lfs_bsize) +#define blkoff(fs, loc) ((loc) & (fs)->lfs_bmask) +#define fsbtodb(fs, b) ((b) << (fs)->lfs_fsbtodb) +#define dbtofsb(fs, b) ((b) >> (fs)->lfs_fsbtodb) +#define lblkno(fs, loc) ((loc) >> (fs)->lfs_bshift) +#define lblktosize(fs, blk) ((blk) << (fs)->lfs_bshift) +#define numfrags(fs, loc) /* calculates (loc / fs->fs_fsize) */ \ + ((loc) >> (fs)->lfs_bshift) + +#define datosn(fs, daddr) /* disk address to segment number */ \ + (((daddr) - (fs)->lfs_sboffs[0]) / fsbtodb((fs), (fs)->lfs_ssize)) +#define sntoda(fs, sn) /* segment number to disk address */ \ + ((daddr_t)((sn) * ((fs)->lfs_ssize << (fs)->lfs_fsbtodb) + \ + (fs)->lfs_sboffs[0])) + +/* Read in the block with the cleaner info from the ifile. */ +#define LFS_CLEANERINFO(CP, F, BP) { \ + VTOI((F)->lfs_ivnode)->i_flag |= IN_ACCESS; \ + if (bread((F)->lfs_ivnode, \ + (daddr_t)0, (F)->lfs_bsize, NOCRED, &(BP))) \ + panic("lfs: ifile read"); \ + (CP) = (CLEANERINFO *)(BP)->b_data; \ +} + +/* Read in the block with a specific inode from the ifile. */ +#define LFS_IENTRY(IP, F, IN, BP) { \ + int _e; \ + VTOI((F)->lfs_ivnode)->i_flag |= IN_ACCESS; \ + if (_e = bread((F)->lfs_ivnode, \ + (IN) / (F)->lfs_ifpb + (F)->lfs_cleansz + (F)->lfs_segtabsz,\ + (F)->lfs_bsize, NOCRED, &(BP))) \ + panic("lfs: ifile read %d", _e); \ + (IP) = (IFILE *)(BP)->b_data + (IN) % (F)->lfs_ifpb; \ +} + +/* Read in the block with a specific segment usage entry from the ifile. */ +#define LFS_SEGENTRY(SP, F, IN, BP) { \ + int _e; \ + VTOI((F)->lfs_ivnode)->i_flag |= IN_ACCESS; \ + if (_e = bread((F)->lfs_ivnode, \ + ((IN) >> (F)->lfs_sushift) + (F)->lfs_cleansz, \ + (F)->lfs_bsize, NOCRED, &(BP))) \ + panic("lfs: ifile read: %d", _e); \ + (SP) = (SEGUSE *)(BP)->b_data + ((IN) & (F)->lfs_sepb - 1); \ +} + +/* + * Determine if there is enough room currently available to write db + * disk blocks. 
We need enough blocks for the new blocks, the current, + * inode blocks, a summary block, plus potentially the ifile inode and + * the segment usage table, plus an ifile page. + */ +#define LFS_FITS(fs, db) \ + ((long)((db + ((fs)->lfs_uinodes + INOPB((fs))) / INOPB((fs)) + \ + fsbtodb(fs, 1) + LFS_SUMMARY_SIZE / DEV_BSIZE + \ + (fs)->lfs_segtabsz)) < (fs)->lfs_avail) + +/* Determine if a buffer belongs to the ifile */ +#define IS_IFILE(bp) (VTOI(bp->b_vp)->i_number == LFS_IFILE_INUM) + +/* + * Structures used by lfs_bmapv and lfs_markv to communicate information + * about inodes and data blocks. + */ +typedef struct block_info { + ino_t bi_inode; /* inode # */ + daddr_t bi_lbn; /* logical block w/in file */ + daddr_t bi_daddr; /* disk address of block */ + time_t bi_segcreate; /* origin segment create time */ + int bi_version; /* file version number */ + void *bi_bp; /* data buffer */ +} BLOCK_INFO; + +/* In-memory description of a segment about to be written. */ +struct segment { + struct lfs *fs; /* file system pointer */ + struct buf **bpp; /* pointer to buffer array */ + struct buf **cbpp; /* pointer to next available bp */ + struct buf **start_bpp; /* pointer to first bp in this set */ + struct buf *ibp; /* buffer pointer to inode page */ + struct finfo *fip; /* current fileinfo pointer */ + struct vnode *vp; /* vnode being gathered */ + void *segsum; /* segment summary info */ + u_long ninodes; /* number of inodes in this segment */ + u_long seg_bytes_left; /* bytes left in segment */ + u_long sum_bytes_left; /* bytes left in summary block */ + u_long seg_number; /* number of this segment */ + daddr_t *start_lbp; /* beginning lbn for this set */ +#define SEGM_CKP 0x01 /* doing a checkpoint */ +#define SEGM_CLEAN 0x02 /* cleaner call; don't sort */ +#define SEGM_SYNC 0x04 /* wait for segment */ + u_long seg_flags; /* run-time flags for this segment */ +}; + +#define ISSPACE(F, BB, C) \ + (((C)->cr_uid == 0 && (F)->lfs_bfree >= (BB)) || \ + ((C)->cr_uid != 0 
&& IS_FREESPACE(F, BB))) + +#define IS_FREESPACE(F, BB) \ + ((F)->lfs_bfree > ((F)->lfs_dsize * (F)->lfs_minfree / 100 + (BB))) + +#define ISSPACE_XXX(F, BB) \ + ((F)->lfs_bfree >= (BB)) + +#define DOSTATS +#ifdef DOSTATS +/* Statistics Counters */ +struct lfs_stats { + int segsused; + int psegwrites; + int psyncwrites; + int pcleanwrites; + int blocktot; + int cleanblocks; + int ncheckpoints; + int nwrites; + int nsync_writes; + int wait_exceeded; + int write_exceeded; + int flush_invoked; +}; +extern struct lfs_stats lfs_stats; +#endif diff --git a/sys/ufs/lfs/lfs_alloc.c b/sys/ufs/lfs/lfs_alloc.c new file mode 100644 index 00000000000..3f06c813930 --- /dev/null +++ b/sys/ufs/lfs/lfs_alloc.c @@ -0,0 +1,251 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_alloc.c 8.4 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include + +extern u_long nextgennumber; + +/* Allocate a new inode. */ +/* ARGSUSED */ +int +lfs_valloc(ap) + struct vop_valloc_args /* { + struct vnode *a_pvp; + int a_mode; + struct ucred *a_cred; + struct vnode **a_vpp; + } */ *ap; +{ + struct lfs *fs; + struct buf *bp; + struct ifile *ifp; + struct inode *ip; + struct vnode *vp; + daddr_t blkno; + ino_t new_ino; + u_long i, max; + int error; + + /* Get the head of the freelist. */ + fs = VTOI(ap->a_pvp)->i_lfs; + new_ino = fs->lfs_free; +#ifdef ALLOCPRINT + printf("lfs_ialloc: allocate inode %d\n", new_ino); +#endif + + /* + * Remove the inode from the free list and write the new start + * of the free list into the superblock. + */ + LFS_IENTRY(ifp, fs, new_ino, bp); + if (ifp->if_daddr != LFS_UNUSED_DADDR) + panic("lfs_ialloc: inuse inode on the free list"); + fs->lfs_free = ifp->if_nextfree; + brelse(bp); + + /* Extend IFILE so that the next lfs_valloc will succeed. 
*/ + if (fs->lfs_free == LFS_UNUSED_INUM) { + vp = fs->lfs_ivnode; + ip = VTOI(vp); + blkno = lblkno(fs, ip->i_size); + lfs_balloc(vp, fs->lfs_bsize, blkno, &bp); + ip->i_size += fs->lfs_bsize; + vnode_pager_setsize(vp, (u_long)ip->i_size); + vnode_pager_uncache(vp); + + i = (blkno - fs->lfs_segtabsz - fs->lfs_cleansz) * + fs->lfs_ifpb; + fs->lfs_free = i; + max = i + fs->lfs_ifpb; + for (ifp = (struct ifile *)bp->b_data; i < max; ++ifp) { + ifp->if_version = 1; + ifp->if_daddr = LFS_UNUSED_DADDR; + ifp->if_nextfree = ++i; + } + ifp--; + ifp->if_nextfree = LFS_UNUSED_INUM; + if (error = VOP_BWRITE(bp)) + return (error); + } + + /* Create a vnode to associate with the inode. */ + if (error = lfs_vcreate(ap->a_pvp->v_mount, new_ino, &vp)) + return (error); + + + ip = VTOI(vp); + /* Zero out the direct and indirect block addresses. */ + bzero(&ip->i_din, sizeof(struct dinode)); + ip->i_din.di_inumber = new_ino; + + /* Set a new generation number for this inode. */ + if (++nextgennumber < (u_long)time.tv_sec) + nextgennumber = time.tv_sec; + ip->i_gen = nextgennumber; + + /* Insert into the inode hash table. */ + ufs_ihashins(ip); + + if (error = ufs_vinit(vp->v_mount, lfs_specop_p, LFS_FIFOOPS, &vp)) { + vput(vp); + *ap->a_vpp = NULL; + return (error); + } + + *ap->a_vpp = vp; + vp->v_flag |= VDIROP; + VREF(ip->i_devvp); + + /* Set superblock modified bit and increment file count. */ + fs->lfs_fmod = 1; + ++fs->lfs_nfiles; + return (0); +} + +/* Create a new vnode/inode pair and initialize what fields we can. */ +int +lfs_vcreate(mp, ino, vpp) + struct mount *mp; + ino_t ino; + struct vnode **vpp; +{ + extern int (**lfs_vnodeop_p)(); + struct inode *ip; + struct ufsmount *ump; + int error, i; + + /* Create the vnode. */ + if (error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, vpp)) { + *vpp = NULL; + return (error); + } + + /* Get a pointer to the private mount structure. */ + ump = VFSTOUFS(mp); + + /* Initialize the inode. 
*/ + MALLOC(ip, struct inode *, sizeof(struct inode), M_LFSNODE, M_WAITOK); + (*vpp)->v_data = ip; + ip->i_vnode = *vpp; + ip->i_devvp = ump->um_devvp; + ip->i_flag = IN_MODIFIED; + ip->i_dev = ump->um_dev; + ip->i_number = ip->i_din.di_inumber = ino; +ip->i_din.di_spare[0] = 0xdeadbeef; +ip->i_din.di_spare[1] = 0xdeadbeef; + ip->i_lfs = ump->um_lfs; +#ifdef QUOTA + for (i = 0; i < MAXQUOTAS; i++) + ip->i_dquot[i] = NODQUOT; +#endif + ip->i_lockf = 0; + ip->i_diroff = 0; + ip->i_mode = 0; + ip->i_size = 0; + ip->i_blocks = 0; + ++ump->um_lfs->lfs_uinodes; + return (0); +} + +/* Free an inode. */ +/* ARGUSED */ +int +lfs_vfree(ap) + struct vop_vfree_args /* { + struct vnode *a_pvp; + ino_t a_ino; + int a_mode; + } */ *ap; +{ + SEGUSE *sup; + struct buf *bp; + struct ifile *ifp; + struct inode *ip; + struct lfs *fs; + daddr_t old_iaddr; + ino_t ino; + + /* Get the inode number and file system. */ + ip = VTOI(ap->a_pvp); + fs = ip->i_lfs; + ino = ip->i_number; + if (ip->i_flag & IN_MODIFIED) { + --fs->lfs_uinodes; + ip->i_flag &= + ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); + } + /* + * Set the ifile's inode entry to unused, increment its version number + * and link it into the free chain. + */ + LFS_IENTRY(ifp, fs, ino, bp); + old_iaddr = ifp->if_daddr; + ifp->if_daddr = LFS_UNUSED_DADDR; + ++ifp->if_version; + ifp->if_nextfree = fs->lfs_free; + fs->lfs_free = ino; + (void) VOP_BWRITE(bp); + + if (old_iaddr != LFS_UNUSED_DADDR) { + LFS_SEGENTRY(sup, fs, datosn(fs, old_iaddr), bp); +#ifdef DIAGNOSTIC + if (sup->su_nbytes < sizeof(struct dinode)) + panic("lfs_vfree: negative byte count (segment %d)\n", + datosn(fs, old_iaddr)); +#endif + sup->su_nbytes -= sizeof(struct dinode); + (void) VOP_BWRITE(bp); + } + + /* Set superblock modified bit and decrement file count. 
*/ + fs->lfs_fmod = 1; + --fs->lfs_nfiles; + return (0); +} diff --git a/sys/ufs/lfs/lfs_balloc.c b/sys/ufs/lfs/lfs_balloc.c new file mode 100644 index 00000000000..b56bc9ec51b --- /dev/null +++ b/sys/ufs/lfs/lfs_balloc.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_balloc.c 8.1 (Berkeley) 6/11/93 + */ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include + +int +lfs_balloc(vp, iosize, lbn, bpp) + struct vnode *vp; + u_long iosize; + daddr_t lbn; + struct buf **bpp; +{ + struct buf *ibp, *bp; + struct inode *ip; + struct lfs *fs; + struct indir indirs[NIADDR+2]; + daddr_t daddr; + int bb, error, i, num; + + ip = VTOI(vp); + fs = ip->i_lfs; + + /* + * Three cases: it's a block beyond the end of file, it's a block in + * the file that may or may not have been assigned a disk address or + * we're writing an entire block. Note, if the daddr is unassigned, + * the block might still have existed in the cache (if it was read + * or written earlier). If it did, make sure we don't count it as a + * new block or zero out its contents. If it did not, make sure + * we allocate any necessary indirect blocks. 
+ */ + + *bpp = NULL; + if (error = ufs_bmaparray(vp, lbn, &daddr, &indirs[0], &num, NULL )) + return (error); + + *bpp = bp = getblk(vp, lbn, fs->lfs_bsize, 0, 0); + bb = VFSTOUFS(vp->v_mount)->um_seqinc; + if (daddr == UNASSIGNED) + /* May need to allocate indirect blocks */ + for (i = 1; i < num; ++i) + if (!indirs[i].in_exists) { + ibp = + getblk(vp, indirs[i].in_lbn, fs->lfs_bsize, + 0, 0); + if (!(ibp->b_flags & (B_DONE | B_DELWRI))) { + if (!ISSPACE(fs, bb, curproc->p_ucred)){ + ibp->b_flags |= B_INVAL; + brelse(ibp); + error = ENOSPC; + } else { + ip->i_blocks += bb; + ip->i_lfs->lfs_bfree -= bb; + clrbuf(ibp); + error = VOP_BWRITE(ibp); + } + } else + panic ("Indirect block should not exist"); + } + if (error) { + if (bp) + brelse(bp); + return(error); + } + + + /* Now, we may need to allocate the data block */ + if (!(bp->b_flags & (B_CACHE | B_DONE | B_DELWRI))) { + if (daddr == UNASSIGNED) + if (!ISSPACE(fs, bb, curproc->p_ucred)) { + bp->b_flags |= B_INVAL; + brelse(bp); + return(ENOSPC); + } else { + ip->i_blocks += bb; + ip->i_lfs->lfs_bfree -= bb; + if (iosize != fs->lfs_bsize) + clrbuf(bp); + } + else if (iosize == fs->lfs_bsize) + bp->b_blkno = daddr; /* Skip the I/O */ + else { + bp->b_blkno = daddr; + bp->b_flags |= B_READ; + VOP_STRATEGY(bp); + return(biowait(bp)); + } + } + return (error); +} diff --git a/sys/ufs/lfs/lfs_bio.c b/sys/ufs/lfs/lfs_bio.c new file mode 100644 index 00000000000..0f021f17208 --- /dev/null +++ b/sys/ufs/lfs/lfs_bio.c @@ -0,0 +1,187 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)lfs_bio.c	8.4 (Berkeley) 12/30/93
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+/*
+ * LFS block write function.
+ *
+ * XXX
+ * No write cost accounting is done.
+ * This is almost certainly wrong for synchronous operations and NFS.
+ */
+int	lfs_allclean_wakeup;		/* Cleaner wakeup address. */
+int	locked_queue_count;		/* XXX Count of locked-down buffers. */
+int	lfs_writing;			/* Set if already kicked off a writer
+					   because of buffer space */
+/*
+#define WRITE_THRESHHOLD	((nbuf >> 2) - 10)
+#define WAIT_THRESHHOLD		((nbuf >> 1) - 10)
+*/
+#define WAIT_THRESHHOLD		(nbuf - (nbuf >> 2) - 10)
+#define WRITE_THRESHHOLD	((nbuf >> 1) - 10)
+#define LFS_BUFWAIT	2
+
+/*
+ * lfs_bwrite --
+ *	VOP_BWRITE for LFS: queue ap->a_bp as a locked delayed write instead
+ *	of starting I/O.  A buffer that is already B_LOCKED is only released.
+ *
+ * Returns 0, or the tsleep() error if the wait for free space was
+ * interrupted (the buffer is released in that case as well).
+ */
+int
+lfs_bwrite(ap)
+	struct vop_bwrite_args /* {
+		struct buf *a_bp;
+	} */ *ap;
+{
+	register struct buf *bp = ap->a_bp;
+	struct lfs *fs;
+	struct inode *ip;
+	int error, s;
+
+	/*
+	 * Set the delayed write flag and use reassignbuf to move the buffer
+	 * from the clean list to the dirty one.
+	 *
+	 * Set the B_LOCKED flag and unlock the buffer, causing brelse to move
+	 * the buffer onto the LOCKED free list. This is necessary, otherwise
+	 * getnewbuf() would try to reclaim the buffers using bawrite, which
+	 * isn't going to work.
+	 *
+	 * XXX we don't let meta-data writes run out of space because they can
+	 * come from the segment writer. We need to make sure that there is
+	 * enough space reserved so that there's room to write meta-data
+	 * blocks.
+	 */
+	if (!(bp->b_flags & B_LOCKED)) {
+		fs = VFSTOUFS(bp->b_vp->v_mount)->um_lfs;
+		while (!LFS_FITS(fs, fsbtodb(fs, 1)) && !IS_IFILE(bp) &&
+		    bp->b_lblkno > 0) {
+			/* Out of space, need cleaner to run */
+			wakeup(&lfs_allclean_wakeup);
+			/*
+			 * The last argument of tsleep() is an integer tick
+			 * count, not a pointer; 0 means no timeout.
+			 */
+			if (error = tsleep(&fs->lfs_avail, PCATCH | PUSER,
+			    "cleaner", 0)) {
+				brelse(bp);
+				return (error);
+			}
+		}
+		ip = VTOI((bp)->b_vp);
+		/* Count the inode once, on its first transition to dirty. */
+		if (!(ip->i_flag & IN_MODIFIED))
+			++fs->lfs_uinodes;
+		ip->i_flag |= IN_CHANGE | IN_MODIFIED | IN_UPDATE;
+		fs->lfs_avail -= fsbtodb(fs, 1);
+		++locked_queue_count;
+		bp->b_flags |= B_DELWRI | B_LOCKED;
+		bp->b_flags &= ~(B_READ | B_ERROR);
+		s = splbio();
+		reassignbuf(bp, bp->b_vp);
+		splx(s);
+	}
+	brelse(bp);
+	return (0);
+}
+
+/*
+ * XXX
+ * This routine flushes buffers out of the B_LOCKED queue when LFS has too
+ * many locked down. Eventually the pageout daemon will simply call LFS
+ * when pages need to be reclaimed. 
Note, we have one static count of locked
+ * buffers, so we can't have more than a single file system. To make this
+ * work for multiple file systems, put the count into the mount structure.
+ */
+void
+lfs_flush()
+{
+	register struct mount *mp;
+
+#ifdef DOSTATS
+	++lfs_stats.write_exceeded;
+#endif
+	if (lfs_writing)
+		return;
+	lfs_writing = 1;
+	for (mp = mountlist.tqh_first; mp != NULL; mp = mp->mnt_list.tqe_next) {
+		/* The lock check below is to avoid races with unmount. */
+		if (mp->mnt_stat.f_type == MOUNT_LFS &&
+		    (mp->mnt_flag & (MNT_MLOCK|MNT_RDONLY|MNT_UNMOUNT)) == 0 &&
+		    !((((struct ufsmount *)mp->mnt_data))->ufsmount_u.lfs)->lfs_dirops ) {
+			/*
+			 * We set the queue to 0 here because we are about to
+			 * write all the dirty buffers we have. If more come
+			 * in while we're writing the segment, they may not
+			 * get written, so we want the count to reflect these
+			 * new writes after the segwrite completes.
+			 */
+#ifdef DOSTATS
+			++lfs_stats.flush_invoked;
+#endif
+			lfs_segwrite(mp, 0);
+		}
+	}
+	lfs_writing = 0;
+}
+
+/*
+ * lfs_check --
+ *	Throttle a caller that is about to dirty block blkno of vp.  Nothing
+ *	is done if the block is already incore().  Otherwise lfs_flush() is
+ *	called once locked_queue_count exceeds WRITE_THRESHHOLD, and the
+ *	caller sleeps in LFS_BUFWAIT-second slices while the count exceeds
+ *	WAIT_THRESHHOLD.
+ *
+ * Returns 0, or the tsleep() error when the sleep was interrupted.
+ */
+int
+lfs_check(vp, blkno)
+	struct vnode *vp;
+	daddr_t blkno;
+{
+	extern int lfs_allclean_wakeup;
+	int error;
+
+	error = 0;
+	if (incore(vp, blkno))
+		return (0);
+	if (locked_queue_count > WRITE_THRESHHOLD)
+		lfs_flush();
+
+	/* If out of buffers, wait on writer */
+	while (locked_queue_count > WAIT_THRESHHOLD) {
+#ifdef DOSTATS
+		++lfs_stats.wait_exceeded;
+#endif
+		error = tsleep(&locked_queue_count, PCATCH | PUSER, "buffers",
+		    hz * LFS_BUFWAIT);
+		/*
+		 * EWOULDBLOCK only means the slice timed out: re-check the
+		 * count and do not report it to the caller.  Any other
+		 * error is a caught signal; looping on it would spin, since
+		 * tsleep() keeps returning at once while it is pending.
+		 */
+		if (error == EWOULDBLOCK)
+			error = 0;
+		else if (error)
+			break;
+	}
+
+	return (error);
+}
diff --git a/sys/ufs/lfs/lfs_cksum.c b/sys/ufs/lfs/lfs_cksum.c
new file mode 100644
index 00000000000..77b011aa2c4
--- /dev/null
+++ b/sys/ufs/lfs/lfs_cksum.c
@@ -0,0 +1,58 @@
+/*-
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_cksum.c 8.1 (Berkeley) 6/11/93 + */ + +#include + +/* + * Simple, general purpose, fast checksum. Data must be short-aligned. + * Returns a u_long in case we ever want to do something more rigorous. + * + * XXX + * Use the TCP/IP checksum instead. 
+ */
+/*
+ * cksum --
+ *	XOR together the u_short words of the len bytes at str and return
+ *	the result.  A trailing odd byte is ignored (len is rounded down to
+ *	a multiple of sizeof(u_short)); len == 0 yields 0.
+ */
+u_long
+cksum(str, len)
+	register void *str;
+	register size_t len;
+{
+	register u_short *sp;
+	register u_long sum;
+
+	len &= ~(sizeof(u_short) - 1);
+	/*
+	 * Walk a typed pointer: incrementing a cast expression
+	 * ("++(u_short *)str") is not valid ISO C.
+	 */
+	sp = (u_short *)str;
+	for (sum = 0; len; len -= sizeof(u_short))
+		sum ^= *sp++;
+	return (sum);
+}
diff --git a/sys/ufs/lfs/lfs_debug.c b/sys/ufs/lfs/lfs_debug.c
new file mode 100644
index 00000000000..cc28d609023
--- /dev/null
+++ b/sys/ufs/lfs/lfs_debug.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)lfs_debug.c	8.1 (Berkeley) 6/11/93
+ */
+
+#ifdef DEBUG
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+/*
+ * lfs_dump_super --
+ *	Debug aid (compiled only under DEBUG): print the fields of the
+ *	superblock lfsp with printf, four labelled values per line, followed
+ *	by the superblock disk addresses and the checkpoint fields.
+ *
+ * NOTE(review): the conversion specifiers (%d, %lx, %qx) have to match the
+ * field types declared in the lfs structure, which is not visible here --
+ * confirm against the header before changing any of them.
+ */
+void
+lfs_dump_super(lfsp)
+	struct lfs *lfsp;
+{
+	int i;
+
+	(void)printf("%s%lx\t%s%lx\t%s%d\t%s%d\n",
+		"magic ", lfsp->lfs_magic,
+		"version ", lfsp->lfs_version,
+		"size ", lfsp->lfs_size,
+		"ssize ", lfsp->lfs_ssize);
+	(void)printf("%s%d\t%s%d\t%s%d\t%s%d\n",
+		"dsize ", lfsp->lfs_dsize,
+		"bsize ", lfsp->lfs_bsize,
+		"fsize ", lfsp->lfs_fsize,
+		"frag ", lfsp->lfs_frag);
+
+	(void)printf("%s%d\t%s%d\t%s%d\t%s%d\n",
+		"minfree ", lfsp->lfs_minfree,
+		"inopb ", lfsp->lfs_inopb,
+		"ifpb ", lfsp->lfs_ifpb,
+		"nindir ", lfsp->lfs_nindir);
+
+	(void)printf("%s%d\t%s%d\t%s%d\t%s%d\n",
+		"nseg ", lfsp->lfs_nseg,
+		"nspf ", lfsp->lfs_nspf,
+		"cleansz ", lfsp->lfs_cleansz,
+		"segtabsz ", lfsp->lfs_segtabsz);
+
+	(void)printf("%s%lx\t%s%d\t%s%lx\t%s%d\n",
+		"segmask ", lfsp->lfs_segmask,
+		"segshift ", lfsp->lfs_segshift,
+		"bmask ", lfsp->lfs_bmask,
+		"bshift ", lfsp->lfs_bshift);
+
+	(void)printf("%s%lx\t%s%d\t%s%lx\t%s%d\n",
+		"ffmask ", lfsp->lfs_ffmask,
+		"ffshift ", lfsp->lfs_ffshift,
+		"fbmask ", lfsp->lfs_fbmask,
+		"fbshift ", lfsp->lfs_fbshift);
+
+	(void)printf("%s%d\t%s%d\t%s%lx\t%s%qx\n",
+		"sushift ", lfsp->lfs_sushift,
+		"fsbtodb ", lfsp->lfs_fsbtodb,
+		"cksum ", lfsp->lfs_cksum,
+		"maxfilesize ", lfsp->lfs_maxfilesize);
+
+	/* One hex address per entry of lfs_sboffs[], all on one line. */
+	(void)printf("Superblock disk addresses:");
+	for (i = 0; i < LFS_MAXNUMSB; i++)
+		(void)printf(" %lx", lfsp->lfs_sboffs[i]);
+	(void)printf("\n");
+
+	(void)printf("Checkpoint Info\n");
+	(void)printf("%s%d\t%s%lx\t%s%d\n",
+		"free ", lfsp->lfs_free,
+		"idaddr ", lfsp->lfs_idaddr,
+		"ifile ", lfsp->lfs_ifile);
+	(void)printf("%s%lx\t%s%d\t%s%lx\t%s%lx\t%s%lx\t%s%lx\n",
+		"bfree ", lfsp->lfs_bfree,
+		"nfiles ", lfsp->lfs_nfiles,
+		"lastseg ", lfsp->lfs_lastseg,
+		"nextseg ", lfsp->lfs_nextseg,
+		"curseg ", lfsp->lfs_curseg,
+		"offset ", lfsp->lfs_offset);
+	(void)printf("tstamp %lx\n", lfsp->lfs_tstamp);
+}
+
+/*
+ * lfs_dump_dinode --
+ *	Debug aid: print mode, link count, owner, size and inode number of
+ *	the on-disk inode dip, then its NDADDR direct and NIADDR indirect
+ *	block addresses in hex.
+ *
+ * NOTE(review): di_size is printed with %lu; verify that this matches the
+ * declared width of di_size in struct dinode.
+ */
+void
+lfs_dump_dinode(dip)
+	struct dinode *dip;
+{
+	int i;
+
+	(void)printf("%s%u\t%s%d\t%s%u\t%s%u\t%s%lu\n",
+		"mode ", dip->di_mode,
+		"nlink ", dip->di_nlink,
+		"uid ", dip->di_uid,
+		"gid ", dip->di_gid,
+		"size ", dip->di_size);
+	(void)printf("inum %ld\n", dip->di_inumber);
+	(void)printf("Direct Addresses\n");
+	for (i = 0; i < NDADDR; i++) {
+		(void)printf("\t%lx", dip->di_db[i]);
+		/* Break the line after every sixth address. */
+		if ((i % 6) == 5)
+			(void)printf("\n");
+	}
+	for (i = 0; i < NIADDR; i++)
+		(void)printf("\t%lx", dip->di_ib[i]);
+	(void)printf("\n");
+}
+#endif /* DEBUG */
diff --git a/sys/ufs/lfs/lfs_extern.h b/sys/ufs/lfs/lfs_extern.h
new file mode 100644
index 00000000000..c1157ade02a
--- /dev/null
+++ b/sys/ufs/lfs/lfs_extern.h
@@ -0,0 +1,106 @@
+/*-
+ * Copyright (c) 1991, 1993, 1994
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)lfs_extern.h 8.2 (Berkeley) 4/16/94 + */ + +struct fid; +struct mount; +struct nameidata; +struct proc; +struct statfs; +struct timeval; +struct inode; +struct uio; +struct mbuf; + +__BEGIN_DECLS +u_long cksum __P((void *, size_t)); /* XXX */ +int lfs_balloc __P((struct vnode *, u_long, daddr_t, struct buf **)); +int lfs_blkatoff __P((struct vop_blkatoff_args *)); +int lfs_bwrite __P((struct vop_bwrite_args *)); +int lfs_check __P((struct vnode *, daddr_t)); +int lfs_close __P((struct vop_close_args *)); +int lfs_create __P((struct vop_create_args *)); +int lfs_fhtovp __P((struct mount *, struct fid *, struct mbuf *, + struct vnode **, int *, struct ucred **)); +int lfs_fsync __P((struct vop_fsync_args *)); +int lfs_getattr __P((struct vop_getattr_args *)); +struct dinode * + lfs_ifind __P((struct lfs *, ino_t, struct dinode *)); +int lfs_inactive __P((struct vop_inactive_args *)); +int lfs_init __P((void)); +int lfs_initseg __P((struct lfs *)); +int lfs_link __P((struct vop_link_args *)); +int lfs_makeinode __P((int, struct nameidata *, struct inode **)); +int lfs_mkdir __P((struct vop_mkdir_args *)); +int lfs_mknod __P((struct vop_mknod_args *)); +int lfs_mount __P((struct mount *, + char *, caddr_t, struct nameidata *, struct proc *)); +int lfs_mountroot __P((void)); +struct buf * + lfs_newbuf __P((struct vnode *, daddr_t, size_t)); +int lfs_read __P((struct vop_read_args *)); +int lfs_remove __P((struct vop_remove_args *)); +int lfs_rmdir __P((struct vop_rmdir_args *)); +int lfs_rename __P((struct vop_rename_args *)); +void lfs_seglock __P((struct lfs *, unsigned long flags)); +void lfs_segunlock __P((struct lfs *)); +int lfs_segwrite __P((struct mount *, int)); +int lfs_statfs __P((struct mount *, struct statfs *, struct proc *)); +int lfs_symlink __P((struct vop_symlink_args *)); +int lfs_sync __P((struct mount *, int, struct ucred *, struct proc *)); +int lfs_truncate __P((struct vop_truncate_args *)); +int lfs_unmount __P((struct mount *, int, 
struct proc *)); +int lfs_update __P((struct vop_update_args *)); +int lfs_valloc __P((struct vop_valloc_args *)); +int lfs_vcreate __P((struct mount *, ino_t, struct vnode **)); +int lfs_vfree __P((struct vop_vfree_args *)); +int lfs_vflush __P((struct vnode *)); +int lfs_vget __P((struct mount *, ino_t, struct vnode **)); +int lfs_vptofh __P((struct vnode *, struct fid *)); +int lfs_vref __P((struct vnode *)); +void lfs_vunref __P((struct vnode *)); +int lfs_write __P((struct vop_write_args *)); +#ifdef DEBUG +void lfs_dump_dinode __P((struct dinode *)); +void lfs_dump_super __P((struct lfs *)); +#endif +__END_DECLS +extern int (**lfs_vnodeop_p)(); +extern int (**lfs_specop_p)(); +#ifdef FIFO +extern int (**lfs_fifoop_p)(); +#define LFS_FIFOOPS lfs_fifoop_p +#else +#define LFS_FIFOOPS NULL +#endif diff --git a/sys/ufs/lfs/lfs_inode.c b/sys/ufs/lfs/lfs_inode.c new file mode 100644 index 00000000000..1a06aa23ed8 --- /dev/null +++ b/sys/ufs/lfs/lfs_inode.c @@ -0,0 +1,359 @@ +/* + * Copyright (c) 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)lfs_inode.c	8.5 (Berkeley) 12/30/93
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+/* LFS initialization: simply defers to ufs_init() and returns its result. */
+int
+lfs_init()
+{
+	return (ufs_init());
+}
+
+/*
+ * Search a block for a specific dinode.
+ *
+ * dip points at the first of INOPB(fs) dinodes; the scan runs from the last
+ * one backwards and returns the first whose di_inumber equals ino.  Panics
+ * if none matches, so the function never returns NULL.
+ */
+struct dinode *
+lfs_ifind(fs, ino, dip)
+	struct lfs *fs;
+	ino_t ino;
+	register struct dinode *dip;
+{
+	register int cnt;
+	register struct dinode *ldip;
+
+	for (cnt = INOPB(fs), ldip = dip + (cnt - 1); cnt--; --ldip)
+		if (ldip->di_inumber == ino)
+			return (ldip);
+
+	panic("lfs_ifind: dinode %u not found", ino);
+	/* NOTREACHED */
+}
+
+/*
+ * lfs_update --
+ *	Fold the pending IN_ACCESS/IN_UPDATE/IN_CHANGE flags of the inode
+ *	into its time stamps and leave it marked IN_MODIFIED.  Nothing is
+ *	done on a read-only mount or when none of the four flags is set.
+ *
+ * Returns 0, or the result of lfs_vflush() when a_waitfor has LFS_SYNC set.
+ */
+int
+lfs_update(ap)
+	struct vop_update_args /* {
+		struct vnode *a_vp;
+		struct timeval *a_access;
+		struct timeval *a_modify;
+		int a_waitfor;
+	} */ *ap;
+{
+	struct vnode *vp = ap->a_vp;
+	struct inode *ip;
+
+	if (vp->v_mount->mnt_flag & MNT_RDONLY)
+		return (0);
+	ip = VTOI(vp);
+	if ((ip->i_flag &
+	    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0)
+		return (0);
+	if (ip->i_flag & IN_ACCESS)
+		ip->i_atime.ts_sec = ap->a_access->tv_sec;
+	if (ip->i_flag & IN_UPDATE) {
+		ip->i_mtime.ts_sec = ap->a_modify->tv_sec;
+		(ip)->i_modrev++;
+	}
+	/* ctime comes from the kernel clock, not from the caller. */
+	if (ip->i_flag & IN_CHANGE)
+		ip->i_ctime.ts_sec = time.tv_sec;
+	ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE);
+
+	/* Count the inode once, on its first transition to IN_MODIFIED. */
+	if (!(ip->i_flag & IN_MODIFIED))
+		++(VFSTOUFS(vp->v_mount)->um_lfs->lfs_uinodes);
+	ip->i_flag |= IN_MODIFIED;
+
+	/* If sync, push back the vnode and any dirty blocks it may have. */
+	return (ap->a_waitfor & LFS_SYNC ? lfs_vflush(vp) : 0);
+}
+
+/*
+ * Update segment usage information when removing a block.
+ *
+ * Both macros below are not self-contained: they read and write the locals
+ * sup, sup_bp, fs, lastseg, seg, num, daddr, e1 and blocksreleased of the
+ * function that expands them.  UPDATE_SEGUSE charges the num blocks
+ * accumulated for segment lastseg against that segment's su_nbytes; SEGDEC
+ * accounts for one block at daddr, flushing the running count whenever the
+ * segment changes.  A daddr of 0 is ignored.
+ */
+#define UPDATE_SEGUSE \
+	if (lastseg != -1) { \
+		LFS_SEGENTRY(sup, fs, lastseg, sup_bp); \
+		if ((num << fs->lfs_bshift) > sup->su_nbytes) \
+			panic("lfs_truncate: negative bytes in segment %d\n", \
+			    lastseg); \
+		sup->su_nbytes -= num << fs->lfs_bshift; \
+		e1 = VOP_BWRITE(sup_bp); \
+		blocksreleased += num; \
+	}
+
+#define SEGDEC { \
+	if (daddr != 0) { \
+		if (lastseg != (seg = datosn(fs, daddr))) { \
+			UPDATE_SEGUSE; \
+			num = 1; \
+			lastseg = seg; \
+		} else \
+			++num; \
+	} \
+}
+
+/*
+ * Truncate the inode ip to at most length size. Update segment usage
+ * table information. 
+ */
+/* ARGSUSED */
+int
+lfs_truncate(ap)
+	struct vop_truncate_args /* {
+		struct vnode *a_vp;
+		off_t a_length;
+		int a_flags;
+		struct ucred *a_cred;
+		struct proc *a_p;
+	} */ *ap;
+{
+	register struct indir *inp;
+	register int i;
+	register daddr_t *daddrp;
+	register struct vnode *vp = ap->a_vp;
+	off_t length = ap->a_length;
+	struct buf *bp, *sup_bp;
+	struct timeval tv;
+	struct ifile *ifp;
+	struct inode *ip;
+	struct lfs *fs;
+	struct indir a[NIADDR + 2], a_end[NIADDR + 2];
+	SEGUSE *sup;
+	daddr_t daddr, lastblock, lbn, olastblock;
+	long off, a_released, blocksreleased, i_released;
+	int e1, e2, depth, lastseg, num, offset, seg, size;
+
+	ip = VTOI(vp);
+	tv = time;
+	/* Short symlinks live in the inode itself: just clear them. */
+	if (vp->v_type == VLNK && vp->v_mount->mnt_maxsymlinklen > 0) {
+#ifdef DIAGNOSTIC
+		if (length != 0)
+			panic("lfs_truncate: partial truncate of symlink");
+#endif
+		bzero((char *)&ip->i_shortlink, (u_int)ip->i_size);
+		ip->i_size = 0;
+		ip->i_flag |= IN_CHANGE | IN_UPDATE;
+		return (VOP_UPDATE(vp, &tv, &tv, 0));
+	}
+	vnode_pager_setsize(vp, (u_long)length);
+
+	fs = ip->i_lfs;
+
+	/* If length is larger than the file, just update the times. */
+	if (ip->i_size <= length) {
+		ip->i_flag |= IN_CHANGE | IN_UPDATE;
+		return (VOP_UPDATE(vp, &tv, &tv, 0));
+	}
+
+	/*
+	 * Calculate index into inode's block list of last direct and indirect
+	 * blocks (if any) which we want to keep. Lastblock is 0 when the
+	 * file is truncated to 0.
+	 */
+	lastblock = lblkno(fs, length + fs->lfs_bsize - 1);
+	olastblock = lblkno(fs, ip->i_size + fs->lfs_bsize - 1) - 1;
+
+	/*
+	 * Update the size of the file. If the file is not being truncated to
+	 * a block boundary, the contents of the partial block following the
+	 * end of the file must be zeroed in case it ever becomes accessible
+	 * again because of subsequent file growth.
+	 */
+	offset = blkoff(fs, length);
+	if (offset == 0)
+		ip->i_size = length;
+	else {
+		lbn = lblkno(fs, length);
+#ifdef QUOTA
+		if (e1 = getinoquota(ip))
+			return (e1);
+#endif
+		if (e1 = bread(vp, lbn, fs->lfs_bsize, NOCRED, &bp))
+			return (e1);
+		ip->i_size = length;
+		size = blksize(fs);
+		(void)vnode_pager_uncache(vp);
+		bzero((char *)bp->b_data + offset, (u_int)(size - offset));
+		allocbuf(bp, size);
+		if (e1 = VOP_BWRITE(bp))
+			return (e1);
+	}
+	/*
+	 * Modify sup->su_nbyte counters for each deleted block; keep track
+	 * of number of blocks removed for ip->i_blocks.
+	 */
+	blocksreleased = 0;
+	num = 0;
+	lastseg = -1;
+
+	/* Walk backwards from the old last block down to lastblock. */
+	for (lbn = olastblock; lbn >= lastblock;) {
+		/* XXX use run length from bmap array to make this faster */
+		ufs_bmaparray(vp, lbn, &daddr, a, &depth, NULL);
+		/* Remember the indirect path of the very last block. */
+		if (lbn == olastblock)
+			for (i = NIADDR + 2; i--;)
+				a_end[i] = a[i];
+		switch (depth) {
+		case 0:				/* Direct block. */
+			daddr = ip->i_db[lbn];
+			SEGDEC;
+			ip->i_db[lbn] = 0;
+			--lbn;
+			break;
+#ifdef DIAGNOSTIC
+		case 1:				/* An indirect block. */
+			panic("lfs_truncate: ufs_bmaparray returned depth 1");
+			/* NOTREACHED */
+#endif
+		default:			/* Chain of indirect blocks. */
+			inp = a + --depth;
+			/*
+			 * Not at the start of this indirect block yet: step
+			 * back to its first entry (or to lastblock, if that
+			 * is nearer) and look again.
+			 */
+			if (inp->in_off > 0 && lbn != lastblock) {
+				lbn -= inp->in_off < lbn - lastblock ?
+				    inp->in_off : lbn - lastblock;
+				break;
+			}
+			for (; depth && (inp->in_off == 0 || lbn == lastblock);
+			    --inp, --depth) {
+				if (bread(vp,
+				    inp->in_lbn, fs->lfs_bsize, NOCRED, &bp))
+					panic("lfs_truncate: bread bno %d",
+					    inp->in_lbn);
+				daddrp = (daddr_t *)bp->b_data + inp->in_off;
+				for (i = inp->in_off;
+				    i++ <= a_end[depth].in_off;) {
+					daddr = *daddrp++;
+					SEGDEC;
+				}
+				a_end[depth].in_off = NINDIR(fs) - 1;
+				/*
+				 * An indirect block released from entry 0 is
+				 * dropped; otherwise zero its tail and write
+				 * it back.
+				 */
+				if (inp->in_off == 0)
+					brelse (bp);
+				else {
+					bzero((daddr_t *)bp->b_data +
+					    inp->in_off, fs->lfs_bsize -
+					    inp->in_off * sizeof(daddr_t));
+					if (e1 = VOP_BWRITE(bp))
+						return (e1);
+				}
+			}
+			if (depth == 0 && a[1].in_off == 0) {
+				off = a[0].in_off;
+				daddr = ip->i_ib[off];
+				SEGDEC;
+				ip->i_ib[off] = 0;
+			}
+			if (lbn == lastblock || lbn <= NDADDR)
+				--lbn;
+			else {
+				lbn -= NINDIR(fs);
+				if (lbn < lastblock)
+					lbn = lastblock;
+			}
+		}
+	}
+	/* Flush the block count still pending for the last segment seen. */
+	UPDATE_SEGUSE;
+
+	/* If truncating the file to 0, update the version number. */
+	if (length == 0) {
+		LFS_IENTRY(ifp, fs, ip->i_number, bp);
+		++ifp->if_version;
+		(void) VOP_BWRITE(bp);
+	}
+
+	/*
+	 * NOTE(review): inside the DIAGNOSTIC clamp below blocksreleased is
+	 * assigned ip->i_blocks and is then passed through fsbtodb() again
+	 * by the two statements that follow -- the units look mixed; confirm.
+	 */
+#ifdef DIAGNOSTIC
+	if (ip->i_blocks < fsbtodb(fs, blocksreleased)) {
+		printf("lfs_truncate: block count < 0\n");
+		blocksreleased = ip->i_blocks;
+	}
+#endif
+	ip->i_blocks -= fsbtodb(fs, blocksreleased);
+	fs->lfs_bfree += fsbtodb(fs, blocksreleased);
+	ip->i_flag |= IN_CHANGE | IN_UPDATE;
+	/*
+	 * Traverse dirty block list counting number of dirty buffers
+	 * that are being deleted out of the cache, so that the lfs_avail
+	 * field can be updated.
+	 */
+	a_released = 0;
+	i_released = 0;
+	for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = bp->b_vnbufs.le_next)
+		if (bp->b_flags & B_LOCKED) {
+			++a_released;
+			/*
+			 * XXX
+			 * When buffers are created in the cache, their block
+			 * number is set equal to their logical block number.
+			 * If that is still true, we are assuming that the
+			 * blocks are new (not yet on disk) and weren't
+			 * counted above. However, there is a slight chance
+			 * that a block's disk address is equal to its logical
+			 * block number in which case, we'll get an overcounting
+			 * here.
+			 */
+			if (bp->b_blkno == bp->b_lblkno)
+				++i_released;
+		}
+	blocksreleased = fsbtodb(fs, i_released);
+#ifdef DIAGNOSTIC
+	if (blocksreleased > ip->i_blocks) {
+		printf("lfs_inode: Warning! %s\n",
+		    "more blocks released from inode than are in inode");
+		blocksreleased = ip->i_blocks;
+	}
+#endif
+	fs->lfs_bfree += blocksreleased;
+	ip->i_blocks -= blocksreleased;
+#ifdef DIAGNOSTIC
+	if (length == 0 && ip->i_blocks != 0)
+		printf("lfs_inode: Warning! %s%d%s\n",
+		    "Truncation to zero, but ", ip->i_blocks,
+		    " blocks left on inode");
+#endif
+	fs->lfs_avail += fsbtodb(fs, a_released);
+	/* Keep (V_SAVE) cached buffers only when some of the file remains. */
+	e1 = vinvalbuf(vp, (length > 0) ? V_SAVE : 0, ap->a_cred, ap->a_p,
+	    0, 0);
+	e2 = VOP_UPDATE(vp, &tv, &tv, 0);
+	/* The vinvalbuf() error takes precedence over the update error. */
+	return (e1 ? e1 : e2 ? e2 : 0);
+}
diff --git a/sys/ufs/lfs/lfs_segment.c b/sys/ufs/lfs/lfs_segment.c
new file mode 100644
index 00000000000..249d59ddda5
--- /dev/null
+++ b/sys/ufs/lfs/lfs_segment.c
@@ -0,0 +1,1111 @@
+/*
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_segment.c 8.5 (Berkeley) 1/4/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +extern int count_lock_queue __P((void)); + +#define MAX_ACTIVE 10 +/* + * Determine if it's OK to start a partial in this segment, or if we need + * to go on to a new segment. 
+ */ +#define LFS_PARTIAL_FITS(fs) \ + ((fs)->lfs_dbpseg - ((fs)->lfs_offset - (fs)->lfs_curseg) > \ + 1 << (fs)->lfs_fsbtodb) + +void lfs_callback __P((struct buf *)); +void lfs_gather __P((struct lfs *, struct segment *, + struct vnode *, int (*) __P((struct lfs *, struct buf *)))); +int lfs_gatherblock __P((struct segment *, struct buf *, int *)); +void lfs_iset __P((struct inode *, daddr_t, time_t)); +int lfs_match_data __P((struct lfs *, struct buf *)); +int lfs_match_dindir __P((struct lfs *, struct buf *)); +int lfs_match_indir __P((struct lfs *, struct buf *)); +int lfs_match_tindir __P((struct lfs *, struct buf *)); +void lfs_newseg __P((struct lfs *)); +void lfs_shellsort __P((struct buf **, daddr_t *, register int)); +void lfs_supercallback __P((struct buf *)); +void lfs_updatemeta __P((struct segment *)); +int lfs_vref __P((struct vnode *)); +void lfs_vunref __P((struct vnode *)); +void lfs_writefile __P((struct lfs *, struct segment *, struct vnode *)); +int lfs_writeinode __P((struct lfs *, struct segment *, struct inode *)); +int lfs_writeseg __P((struct lfs *, struct segment *)); +void lfs_writesuper __P((struct lfs *)); +void lfs_writevnodes __P((struct lfs *fs, struct mount *mp, + struct segment *sp, int dirops)); + +int lfs_allclean_wakeup; /* Cleaner wakeup address. */ + +/* Statistics Counters */ +#define DOSTATS +struct lfs_stats lfs_stats; + +/* op values to lfs_writevnodes */ +#define VN_REG 0 +#define VN_DIROP 1 +#define VN_EMPTY 2 + +/* + * Ifile and meta data blocks are not marked busy, so segment writes MUST be + * single threaded. Currently, there are two paths into lfs_segwrite, sync() + * and getnewbuf(). They both mark the file system busy. Lfs_vflush() + * explicitly marks the file system busy. So lfs_segwrite is safe. I think. 
+ */ +/* Flush one vnode (dirty blocks plus inode) into the log; returns 0, or lfs_segwrite()'s result when lfs_nactive > MAX_ACTIVE. */ +int +lfs_vflush(vp) + struct vnode *vp; +{ + struct inode *ip; + struct lfs *fs; + struct segment *sp; + + fs = VFSTOUFS(vp->v_mount)->um_lfs; + if (fs->lfs_nactive > MAX_ACTIVE) + return(lfs_segwrite(vp->v_mount, SEGM_SYNC|SEGM_CKP)); + lfs_seglock(fs, SEGM_SYNC); + sp = fs->lfs_sp; + + + ip = VTOI(vp); + if (vp->v_dirtyblkhd.lh_first == NULL) + lfs_writevnodes(fs, vp->v_mount, sp, VN_EMPTY); + + do { + do { + if (vp->v_dirtyblkhd.lh_first != NULL) + lfs_writefile(fs, sp, vp); + } while (lfs_writeinode(fs, sp, ip)); + + } while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM); /* the redo applies only to the ifile */ + +#ifdef DOSTATS + ++lfs_stats.nwrites; + if (sp->seg_flags & SEGM_SYNC) + ++lfs_stats.nsync_writes; + if (sp->seg_flags & SEGM_CKP) + ++lfs_stats.ncheckpoints; +#endif + lfs_segunlock(fs); + return (0); +} +/* Write every dirty vnode on mp's vnode list except the ifile; op is one of the VN_* values. */ +void +lfs_writevnodes(fs, mp, sp, op) + struct lfs *fs; + struct mount *mp; + struct segment *sp; + int op; +{ + struct inode *ip; + struct vnode *vp; + +loop: + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = vp->v_mntvnodes.le_next) { + /* + * If the vnode that we are about to sync is no longer + * associated with this mount point, start over. + */ + if (vp->v_mount != mp) + goto loop; + + /* XXX ignore dirops for now + if (op == VN_DIROP && !(vp->v_flag & VDIROP) || + op != VN_DIROP && (vp->v_flag & VDIROP)) + continue; + */ + + if (op == VN_EMPTY && vp->v_dirtyblkhd.lh_first) + continue; + + if (vp->v_type == VNON) + continue; + + if (lfs_vref(vp)) + continue; + + /* + * Write the inode/file if dirty and it's not + * the IFILE. + */ + ip = VTOI(vp); + if ((ip->i_flag & + (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE) || + vp->v_dirtyblkhd.lh_first != NULL) && + ip->i_number != LFS_IFILE_INUM) { + if (vp->v_dirtyblkhd.lh_first != NULL) + lfs_writefile(fs, sp, vp); + (void) lfs_writeinode(fs, sp, ip); + } + vp->v_flag &= ~VDIROP; + lfs_vunref(vp); + } +} +/* Write mp's dirty vnodes into the log, checkpointing when SEGM_CKP is set or lfs_nactive > MAX_ACTIVE. */ +int +lfs_segwrite(mp, flags) + struct mount *mp; + int flags; /* Do a checkpoint. 
*/ +{ + struct buf *bp; + struct inode *ip; + struct lfs *fs; + struct segment *sp; + struct vnode *vp; + SEGUSE *segusep; + daddr_t ibno; + CLEANERINFO *cip; + int clean, do_ckp, error, i; + + fs = VFSTOUFS(mp)->um_lfs; + + /* + * If we have 2 or fewer clean segments (clean <= 2), wait until the cleaner + * writes. + */ + do { + LFS_CLEANERINFO(cip, fs, bp); + clean = cip->clean; + brelse(bp); + if (clean <= 2) { + printf ("segs clean: %d\n", clean); + wakeup(&lfs_allclean_wakeup); + if (error = tsleep(&fs->lfs_avail, PRIBIO + 1, + "lfs writer", 0)) + return (error); + } + } while (clean <= 2 ); + + /* + * Allocate a segment structure and enough space to hold pointers to + * the maximum possible number of buffers which can be described in a + * single summary block. + */ + do_ckp = flags & SEGM_CKP || fs->lfs_nactive > MAX_ACTIVE; + lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0)); + sp = fs->lfs_sp; + + lfs_writevnodes(fs, mp, sp, VN_REG); + + /* XXX ignore ordering of dirops for now */ + /* XXX + fs->lfs_writer = 1; + if (fs->lfs_dirops && (error = + tsleep(&fs->lfs_writer, PRIBIO + 1, "lfs writer", 0))) { + free(sp->bpp, M_SEGMENT); + free(sp, M_SEGMENT); + fs->lfs_writer = 0; + return (error); + } + + lfs_writevnodes(fs, mp, sp, VN_DIROP); + */ + + /* + * If we are doing a checkpoint, mark everything since the + * last checkpoint as no longer ACTIVE. 
+ */ + if (do_ckp) + for (ibno = fs->lfs_cleansz + fs->lfs_segtabsz; + --ibno >= fs->lfs_cleansz; ) { + if (bread(fs->lfs_ivnode, ibno, fs->lfs_bsize, + NOCRED, &bp)) + + panic("lfs: ifile read"); + segusep = (SEGUSE *)bp->b_data; + for (i = fs->lfs_sepb; i--; segusep++) + segusep->su_flags &= ~SEGUSE_ACTIVE; + + error = VOP_BWRITE(bp); /* NOTE(review): error is not examined after this */ + } + + if (do_ckp || fs->lfs_doifile) { +redo: + vp = fs->lfs_ivnode; + while (vget(vp, 1)); /* retry until vget() returns 0 */ + ip = VTOI(vp); + if (vp->v_dirtyblkhd.lh_first != NULL) + lfs_writefile(fs, sp, vp); + (void)lfs_writeinode(fs, sp, ip); + vput(vp); + if (lfs_writeseg(fs, sp) && do_ckp) + goto redo; + } else + (void) lfs_writeseg(fs, sp); + + /* + * If the I/O count is non-zero, sleep until it reaches zero. At the + * moment, the user's process hangs around so we can sleep. (The sleep itself is in lfs_segunlock().) + */ + /* XXX ignore dirops for now + fs->lfs_writer = 0; + fs->lfs_doifile = 0; + wakeup(&fs->lfs_dirops); + */ + +#ifdef DOSTATS + ++lfs_stats.nwrites; + if (sp->seg_flags & SEGM_SYNC) + ++lfs_stats.nsync_writes; + if (sp->seg_flags & SEGM_CKP) + ++lfs_stats.ncheckpoints; +#endif + lfs_segunlock(fs); + return (0); +} + +/* + * Write the dirty blocks associated with a vnode. The FINFO reserved here is given back if no block is gathered. + */ +void +lfs_writefile(fs, sp, vp) + struct lfs *fs; + struct segment *sp; + struct vnode *vp; +{ + struct buf *bp; + struct finfo *fip; + IFILE *ifp; + + if (sp->seg_bytes_left < fs->lfs_bsize || + sp->sum_bytes_left < sizeof(struct finfo)) + (void) lfs_writeseg(fs, sp); + + sp->sum_bytes_left -= sizeof(struct finfo) - sizeof(daddr_t); + ++((SEGSUM *)(sp->segsum))->ss_nfinfo; + + fip = sp->fip; + fip->fi_nblocks = 0; + fip->fi_ino = VTOI(vp)->i_number; + LFS_IENTRY(ifp, fs, fip->fi_ino, bp); + fip->fi_version = ifp->if_version; + brelse(bp); + + /* + * It may not be necessary to write the meta-data blocks at this point, + * as the roll-forward recovery code should be able to reconstruct the + * list. 
+ */ + lfs_gather(fs, sp, vp, lfs_match_data); + lfs_gather(fs, sp, vp, lfs_match_indir); + lfs_gather(fs, sp, vp, lfs_match_dindir); +#ifdef TRIPLE + lfs_gather(fs, sp, vp, lfs_match_tindir); +#endif + + fip = sp->fip; + if (fip->fi_nblocks != 0) { + sp->fip = + (struct finfo *)((caddr_t)fip + sizeof(struct finfo) + + sizeof(daddr_t) * (fip->fi_nblocks - 1)); + sp->start_lbp = &sp->fip->fi_blocks[0]; + } else { /* nothing gathered: undo the FINFO reservation */ + sp->sum_bytes_left += sizeof(struct finfo) - sizeof(daddr_t); + --((SEGSUM *)(sp->segsum))->ss_nfinfo; + } +} +/* Copy ip's inode into the segment's inode block and record its new disk address; returns redo_ifile (0 if the inode is not dirty). */ +int +lfs_writeinode(fs, sp, ip) + struct lfs *fs; + struct segment *sp; + struct inode *ip; +{ + struct buf *bp, *ibp; + IFILE *ifp; + SEGUSE *sup; + daddr_t daddr; + ino_t ino; + int error, i, ndx; + int redo_ifile = 0; + + if (!(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE))) + return(0); + + /* Allocate a new inode block if necessary. */ + if (sp->ibp == NULL) { + /* Allocate a new segment if necessary. */ + if (sp->seg_bytes_left < fs->lfs_bsize || + sp->sum_bytes_left < sizeof(daddr_t)) + (void) lfs_writeseg(fs, sp); + + /* Get next inode block. */ + daddr = fs->lfs_offset; + fs->lfs_offset += fsbtodb(fs, 1); + sp->ibp = *sp->cbpp++ = + lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, daddr, + fs->lfs_bsize); + /* Zero out inode numbers */ + for (i = 0; i < INOPB(fs); ++i) + ((struct dinode *)sp->ibp->b_data)[i].di_inumber = 0; + ++sp->start_bpp; + fs->lfs_avail -= fsbtodb(fs, 1); + /* Set remaining space counters. */ + sp->seg_bytes_left -= fs->lfs_bsize; + sp->sum_bytes_left -= sizeof(daddr_t); + ndx = LFS_SUMMARY_SIZE / sizeof(daddr_t) - + sp->ninodes / INOPB(fs) - 1; + ((daddr_t *)(sp->segsum))[ndx] = daddr; /* inode block addresses fill the summary from its end backwards */ + } + + /* Update the inode times and copy the inode onto the inode page. 
*/ + if (ip->i_flag & IN_MODIFIED) + --fs->lfs_uinodes; + ITIMES(ip, &time, &time); + ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); + bp = sp->ibp; + ((struct dinode *)bp->b_data)[sp->ninodes % INOPB(fs)] = ip->i_din; + /* Increment inode count in segment summary block. */ + ++((SEGSUM *)(sp->segsum))->ss_ninos; + + /* If this page is full, set flag to allocate a new page. */ + if (++sp->ninodes % INOPB(fs) == 0) + sp->ibp = NULL; + + /* + * If updating the ifile, update the super-block. Update the disk + * address and access times for this inode in the ifile. + */ + ino = ip->i_number; + if (ino == LFS_IFILE_INUM) { + daddr = fs->lfs_idaddr; + fs->lfs_idaddr = bp->b_blkno; + } else { + LFS_IENTRY(ifp, fs, ino, ibp); + daddr = ifp->if_daddr; + ifp->if_daddr = bp->b_blkno; + error = VOP_BWRITE(ibp); + } + + /* + * No need to update segment usage if there was no former inode address + * or if the last inode address is in the current partial segment. + */ + if (daddr != LFS_UNUSED_DADDR && + !(daddr >= fs->lfs_lastpseg && daddr <= bp->b_blkno)) { + LFS_SEGENTRY(sup, fs, datosn(fs, daddr), bp); +#ifdef DIAGNOSTIC + if (sup->su_nbytes < sizeof(struct dinode)) { + /* XXX -- Change to a panic. */ + printf("lfs: negative bytes (segment %d)\n", + datosn(fs, daddr)); + panic("negative bytes"); + } +#endif + sup->su_nbytes -= sizeof(struct dinode); + redo_ifile = + (ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED)); + error = VOP_BWRITE(bp); + } + return (redo_ifile); +} +/* Add bp to the segment; returns 1 (bp not added) if the full segment had to be written first, else 0. */ +int +lfs_gatherblock(sp, bp, sptr) + struct segment *sp; + struct buf *bp; + int *sptr; +{ + struct lfs *fs; + int version; + + /* + * If full, finish this segment. We may be doing I/O, so + * release and reacquire the splbio(). 
+ */ +#ifdef DIAGNOSTIC + if (sp->vp == NULL) + panic ("lfs_gatherblock: Null vp in segment"); +#endif + fs = sp->fs; + if (sp->sum_bytes_left < sizeof(daddr_t) || + sp->seg_bytes_left < fs->lfs_bsize) { + if (sptr) + splx(*sptr); + lfs_updatemeta(sp); + + version = sp->fip->fi_version; /* saved so it can be put into the FINFO of the new segment */ + (void) lfs_writeseg(fs, sp); + + sp->fip->fi_version = version; + sp->fip->fi_ino = VTOI(sp->vp)->i_number; + /* Add the current file to the segment summary. */ + ++((SEGSUM *)(sp->segsum))->ss_nfinfo; + sp->sum_bytes_left -= + sizeof(struct finfo) - sizeof(daddr_t); + + if (sptr) + *sptr = splbio(); + return(1); + } + + /* Insert into the buffer list, update the FINFO block. */ + bp->b_flags |= B_GATHERED; + *sp->cbpp++ = bp; + sp->fip->fi_blocks[sp->fip->fi_nblocks++] = bp->b_lblkno; + + sp->sum_bytes_left -= sizeof(daddr_t); + sp->seg_bytes_left -= fs->lfs_bsize; + return(0); +} +/* Gather vp's dirty buffers accepted by match() into the segment, rescanning the list after each segment write. */ +void +lfs_gather(fs, sp, vp, match) + struct lfs *fs; + struct segment *sp; + struct vnode *vp; + int (*match) __P((struct lfs *, struct buf *)); +{ + struct buf *bp; + int s; + + sp->vp = vp; + s = splbio(); +loop: for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = bp->b_vnbufs.le_next) { + if (bp->b_flags & B_BUSY || !match(fs, bp) || + bp->b_flags & B_GATHERED) + continue; +#ifdef DIAGNOSTIC + if (!(bp->b_flags & B_DELWRI)) + panic("lfs_gather: bp not B_DELWRI"); + if (!(bp->b_flags & B_LOCKED)) + panic("lfs_gather: bp not B_LOCKED"); +#endif + if (lfs_gatherblock(sp, bp, &s)) + goto loop; + } + splx(s); + lfs_updatemeta(sp); + sp->vp = NULL; +} + + +/* + * Update the metadata that points to the blocks listed in the FINFO + * array. 
+ */ +void +lfs_updatemeta(sp) + struct segment *sp; +{ + SEGUSE *sup; + struct buf *bp; + struct lfs *fs; + struct vnode *vp; + struct indir a[NIADDR + 2], *ap; + struct inode *ip; + daddr_t daddr, lbn, off; + int db_per_fsb, error, i, nblocks, num; + + vp = sp->vp; + nblocks = &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp; /* FINFO entries added since start_lbp */ + if (vp == NULL || nblocks == 0) + return; + + /* Sort the blocks (skipped when SEGM_CLEAN is set). */ + if (!(sp->seg_flags & SEGM_CLEAN)) + lfs_shellsort(sp->start_bpp, sp->start_lbp, nblocks); + + /* + * Assign disk addresses, and update references to the logical + * block and the segment usage information. + */ + fs = sp->fs; + db_per_fsb = fsbtodb(fs, 1); + for (i = nblocks; i--; ++sp->start_bpp) { + lbn = *sp->start_lbp++; + (*sp->start_bpp)->b_blkno = off = fs->lfs_offset; + fs->lfs_offset += db_per_fsb; + + if (error = ufs_bmaparray(vp, lbn, &daddr, a, &num, NULL)) + panic("lfs_updatemeta: ufs_bmaparray %d", error); + ip = VTOI(vp); + switch (num) { + case 0: /* pointer is in the inode's i_db[] array */ + ip->i_db[lbn] = off; + break; + case 1: /* pointer is in the inode's i_ib[] array */ + ip->i_ib[a[0].in_off] = off; + break; + default: /* pointer is in block ap->in_lbn: read it and patch slot ap->in_off */ + ap = &a[num - 1]; + if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED, &bp)) + panic("lfs_updatemeta: bread bno %d", + ap->in_lbn); + /* + * Bread may create a new indirect block which needs + * to get counted for the inode. + */ + if (bp->b_blkno == -1 && !(bp->b_flags & B_CACHE)) { +printf ("Updatemeta allocating indirect block: shouldn't happen\n"); + ip->i_blocks += btodb(fs->lfs_bsize); + fs->lfs_bfree -= btodb(fs->lfs_bsize); + } + ((daddr_t *)bp->b_data)[ap->in_off] = off; + VOP_BWRITE(bp); + } + + /* Update segment usage information. */ + if (daddr != UNASSIGNED && + !(daddr >= fs->lfs_lastpseg && daddr <= off)) { + LFS_SEGENTRY(sup, fs, datosn(fs, daddr), bp); +#ifdef DIAGNOSTIC + if (sup->su_nbytes < fs->lfs_bsize) { + /* XXX -- Change to a panic. 
*/ + printf("lfs: negative bytes (segment %d)\n", + datosn(fs, daddr)); + panic ("Negative Bytes"); + } +#endif + sup->su_nbytes -= fs->lfs_bsize; + error = VOP_BWRITE(bp); + } + } +} + +/* + * Start a new partial segment; returns 1 if it had to advance to the next segment, else 0. + */ +int +lfs_initseg(fs) + struct lfs *fs; +{ + struct segment *sp; + SEGUSE *sup; + SEGSUM *ssp; + struct buf *bp; + int repeat; + + sp = fs->lfs_sp; + + repeat = 0; + /* Advance to the next segment if another partial segment does not fit. */ + if (!LFS_PARTIAL_FITS(fs)) { + /* Wake up any cleaning procs waiting on this file system. */ + wakeup(&lfs_allclean_wakeup); + + lfs_newseg(fs); + repeat = 1; + fs->lfs_offset = fs->lfs_curseg; + sp->seg_number = datosn(fs, fs->lfs_curseg); + sp->seg_bytes_left = fs->lfs_dbpseg * DEV_BSIZE; + + /* + * If the segment contains a superblock, update the offset + * and summary address to skip over it. + */ + LFS_SEGENTRY(sup, fs, sp->seg_number, bp); + if (sup->su_flags & SEGUSE_SUPERBLOCK) { + fs->lfs_offset += LFS_SBPAD / DEV_BSIZE; + sp->seg_bytes_left -= LFS_SBPAD; + } + brelse(bp); + } else { + sp->seg_number = datosn(fs, fs->lfs_curseg); + sp->seg_bytes_left = (fs->lfs_dbpseg - + (fs->lfs_offset - fs->lfs_curseg)) * DEV_BSIZE; + } + fs->lfs_lastpseg = fs->lfs_offset; + + sp->fs = fs; + sp->ibp = NULL; + sp->ninodes = 0; + + /* Get a new buffer for SEGSUM and enter it into the buffer list. */ + sp->cbpp = sp->bpp; + *sp->cbpp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, fs->lfs_offset, + LFS_SUMMARY_SIZE); + sp->segsum = (*sp->cbpp)->b_data; + bzero(sp->segsum, LFS_SUMMARY_SIZE); + sp->start_bpp = ++sp->cbpp; + fs->lfs_offset += LFS_SUMMARY_SIZE / DEV_BSIZE; + + /* Set pointer to SEGSUM, initialize it. */ + ssp = sp->segsum; + ssp->ss_next = fs->lfs_nextseg; + ssp->ss_nfinfo = ssp->ss_ninos = 0; + + /* Set pointer to first FINFO, initialize it. 
*/ + sp->fip = (struct finfo *)(sp->segsum + sizeof(SEGSUM)); + sp->fip->fi_nblocks = 0; + sp->start_lbp = &sp->fip->fi_blocks[0]; + + sp->seg_bytes_left -= LFS_SUMMARY_SIZE; + sp->sum_bytes_left = LFS_SUMMARY_SIZE - sizeof(SEGSUM); + + return(repeat); +} + +/* + * Make lfs_nextseg the current segment and pick the next non-dirty segment as the new lfs_nextseg (nothing is returned). + */ +void +lfs_newseg(fs) + struct lfs *fs; +{ + CLEANERINFO *cip; + SEGUSE *sup; + struct buf *bp; + int curseg, isdirty, sn; + + LFS_SEGENTRY(sup, fs, datosn(fs, fs->lfs_nextseg), bp); + sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE; + sup->su_nbytes = 0; + sup->su_nsums = 0; + sup->su_ninos = 0; + (void) VOP_BWRITE(bp); + + LFS_CLEANERINFO(cip, fs, bp); + --cip->clean; + ++cip->dirty; + (void) VOP_BWRITE(bp); + + fs->lfs_lastseg = fs->lfs_curseg; + fs->lfs_curseg = fs->lfs_nextseg; + for (sn = curseg = datosn(fs, fs->lfs_curseg);;) { + sn = (sn + 1) % fs->lfs_nseg; + if (sn == curseg) /* wrapped around: every other segment is dirty */ + panic("lfs_nextseg: no clean segments"); + LFS_SEGENTRY(sup, fs, sn, bp); + isdirty = sup->su_flags & SEGUSE_DIRTY; + brelse(bp); + if (!isdirty) + break; + } + + ++fs->lfs_nactive; + fs->lfs_nextseg = sntoda(fs, sn); +#ifdef DOSTATS + ++lfs_stats.segsused; +#endif +} +/* Write the partial segment held in sp and start a new one; returns lfs_initseg(fs) || do_again. */ +int +lfs_writeseg(fs, sp) + struct lfs *fs; + struct segment *sp; +{ + extern int locked_queue_count; + struct buf **bpp, *bp, *cbp; + SEGUSE *sup; + SEGSUM *ssp; + dev_t i_dev; + size_t size; + u_long *datap, *dp; + int ch_per_blk, do_again, i, nblocks, num, s; + int (*strategy)__P((struct vop_strategy_args *)); + struct vop_strategy_args vop_strategy_a; + u_short ninos; + char *p; + + /* + * If there are no buffers other than the segment summary to write + * and it is not a checkpoint, don't do anything. On a checkpoint, + * even if there aren't any buffers, you need to write the superblock. + */ + if ((nblocks = sp->cbpp - sp->bpp) == 1) /* only the summary buffer is queued */ + return (0); + + ssp = (SEGSUM *)sp->segsum; + + /* Update the segment usage information. 
*/ + LFS_SEGENTRY(sup, fs, sp->seg_number, bp); + ninos = (ssp->ss_ninos + INOPB(fs) - 1) / INOPB(fs); /* inode blocks, rounded up */ + sup->su_nbytes += nblocks - 1 - ninos << fs->lfs_bshift; /* parsed as (nblocks - 1 - ninos) << lfs_bshift */ + sup->su_nbytes += ssp->ss_ninos * sizeof(struct dinode); + sup->su_nbytes += LFS_SUMMARY_SIZE; + sup->su_lastmod = time.tv_sec; + sup->su_ninos += ninos; + ++sup->su_nsums; + do_again = !(bp->b_flags & B_GATHERED); + (void)VOP_BWRITE(bp); + /* + * Compute checksum across data and then across summary; the first + * block (the summary block) is skipped. Set the create time here + * so that it's guaranteed to be later than the inode mod times. + * + * XXX + * Fix this to do it inline, instead of malloc/copy. + */ + datap = dp = malloc(nblocks * sizeof(u_long), M_SEGMENT, M_WAITOK); + for (bpp = sp->bpp, i = nblocks - 1; i--;) { + if ((*++bpp)->b_flags & B_INVAL) { + if (copyin((*bpp)->b_saveaddr, dp++, sizeof(u_long))) + panic("lfs_writeseg: copyin failed"); + } else + *dp++ = ((u_long *)(*bpp)->b_data)[0]; /* only the first word of each block is summed */ + } + ssp->ss_create = time.tv_sec; + ssp->ss_datasum = cksum(datap, (nblocks - 1) * sizeof(u_long)); + ssp->ss_sumsum = + cksum(&ssp->ss_datasum, LFS_SUMMARY_SIZE - sizeof(ssp->ss_sumsum)); + free(datap, M_SEGMENT); +#ifdef DIAGNOSTIC + if (fs->lfs_bfree < fsbtodb(fs, ninos) + LFS_SUMMARY_SIZE / DEV_BSIZE) + panic("lfs_writeseg: No diskspace for summary"); +#endif + fs->lfs_bfree -= (fsbtodb(fs, ninos) + LFS_SUMMARY_SIZE / DEV_BSIZE); + + i_dev = VTOI(fs->lfs_ivnode)->i_dev; + strategy = VTOI(fs->lfs_ivnode)->i_devvp->v_op[VOFFSET(vop_strategy)]; + + /* + * When we simply write the blocks we lose a rotation for every block + * written. To avoid this problem, we allocate memory in chunks, copy + * the buffers into the chunk and write the chunk. MAXPHYS is the + * largest size I/O devices can handle. + * When the data is copied to the chunk, turn off the B_LOCKED bit + * and brelse the buffer (which will move them to the LRU list). 
Add + * the B_CALL flag to the buffer header so we can count I/O's for the + * checkpoints and so we can release the allocated memory. + * + * XXX + * This should be removed if the new virtual memory system allows us to + * easily make the buffers contiguous in kernel memory and if that's + * fast enough. + */ + ch_per_blk = MAXPHYS / fs->lfs_bsize; /* despite its name: file system blocks per MAXPHYS-sized chunk */ + for (bpp = sp->bpp, i = nblocks; i;) { + num = ch_per_blk; + if (num > i) + num = i; + i -= num; + size = num * fs->lfs_bsize; + + cbp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, + (*bpp)->b_blkno, size); + cbp->b_dev = i_dev; + cbp->b_flags |= B_ASYNC | B_BUSY; + + s = splbio(); + ++fs->lfs_iocount; + for (p = cbp->b_data; num--;) { + bp = *bpp++; + /* + * Fake buffers from the cleaner are marked as B_INVAL. + * We need to copy the data from user space rather than + * from the buffer indicated. + * XXX == what do I do on an error? + */ + if (bp->b_flags & B_INVAL) { + if (copyin(bp->b_saveaddr, p, bp->b_bcount)) + panic("lfs_writeseg: copyin failed"); + } else + bcopy(bp->b_data, p, bp->b_bcount); + p += bp->b_bcount; + if (bp->b_flags & B_LOCKED) + --locked_queue_count; + bp->b_flags &= ~(B_ERROR | B_READ | B_DELWRI | + B_LOCKED | B_GATHERED); + if (bp->b_flags & B_CALL) { + /* if B_CALL, it was created with newbuf */ + brelvp(bp); + if (!(bp->b_flags & B_INVAL)) + free(bp->b_data, M_SEGMENT); + free(bp, M_SEGMENT); + } else { + bremfree(bp); + bp->b_flags |= B_DONE; + reassignbuf(bp, bp->b_vp); + brelse(bp); + } + } + ++cbp->b_vp->v_numoutput; + splx(s); + cbp->b_bcount = p - (char *)cbp->b_data; + /* + * XXXX This is a gross and disgusting hack. Since these + * buffers are physically addressed, they hang off the + * device vnode (devvp). As a result, they have no way + * of getting to the LFS superblock or lfs structure to + * keep track of the number of I/O's pending. So, I am + * going to stuff the fs into the saveaddr field of + * the buffer (yuk). 
+ */ + cbp->b_saveaddr = (caddr_t)fs; + vop_strategy_a.a_desc = VDESC(vop_strategy); + vop_strategy_a.a_bp = cbp; + (strategy)(&vop_strategy_a); + } + /* + * XXX + * Vinvalbuf can move locked buffers off the locked queue + * and we have no way of knowing about this. So, after + * doing a big write, we recalculate how many buffers are + * really still left on the locked queue. + */ + locked_queue_count = count_lock_queue(); + wakeup(&locked_queue_count); +#ifdef DOSTATS + ++lfs_stats.psegwrites; + lfs_stats.blocktot += nblocks - 1; + if (fs->lfs_sp->seg_flags & SEGM_SYNC) + ++lfs_stats.psyncwrites; + if (fs->lfs_sp->seg_flags & SEGM_CLEAN) { + ++lfs_stats.pcleanwrites; + lfs_stats.cleanblocks += nblocks - 1; + } +#endif + return (lfs_initseg(fs) || do_again); +} +/* Checksum the in-core superblock and start an asynchronous write of a copy addressed at lfs_sboffs[0]. */ +void +lfs_writesuper(fs) + struct lfs *fs; +{ + struct buf *bp; + dev_t i_dev; + int (*strategy) __P((struct vop_strategy_args *)); + int s; + struct vop_strategy_args vop_strategy_a; + + i_dev = VTOI(fs->lfs_ivnode)->i_dev; + strategy = VTOI(fs->lfs_ivnode)->i_devvp->v_op[VOFFSET(vop_strategy)]; + + /* Checksum the superblock and copy it into a buffer. */ + fs->lfs_cksum = cksum(fs, sizeof(struct lfs) - sizeof(fs->lfs_cksum)); + bp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, fs->lfs_sboffs[0], + LFS_SBPAD); + *(struct lfs *)bp->b_data = *fs; + + /* XXX Toggle between first two superblocks; for now just write first */ + bp->b_dev = i_dev; + bp->b_flags |= B_BUSY | B_CALL | B_ASYNC; + bp->b_flags &= ~(B_DONE | B_ERROR | B_READ | B_DELWRI); + bp->b_iodone = lfs_supercallback; + vop_strategy_a.a_desc = VDESC(vop_strategy); + vop_strategy_a.a_bp = bp; + s = splbio(); + ++bp->b_vp->v_numoutput; + splx(s); + (strategy)(&vop_strategy_a); +} + +/* + * Logical block number match routines used when traversing the dirty block + * chain. 
+ */ +int +lfs_match_data(fs, bp) + struct lfs *fs; + struct buf *bp; +{ + return (bp->b_lblkno >= 0); /* non-negative logical block number */ +} +/* The three routines below accept negative lbns, told apart by (-lbn - NDADDR) % NINDIR(fs) being 0, 1 or 2. */ +int +lfs_match_indir(fs, bp) + struct lfs *fs; + struct buf *bp; +{ + int lbn; + + lbn = bp->b_lblkno; + return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 0); +} + +int +lfs_match_dindir(fs, bp) + struct lfs *fs; + struct buf *bp; +{ + int lbn; + + lbn = bp->b_lblkno; + return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 1); +} + +int +lfs_match_tindir(fs, bp) + struct lfs *fs; + struct buf *bp; +{ + int lbn; + + lbn = bp->b_lblkno; + return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 2); +} + +/* + * Allocate a new buffer header, plus a data area of size rounded up to DEV_BSIZE; lfs_callback is set as b_iodone. + */ +struct buf * +lfs_newbuf(vp, daddr, size) + struct vnode *vp; + daddr_t daddr; + size_t size; +{ + struct buf *bp; + size_t nbytes; + + nbytes = roundup(size, DEV_BSIZE); + bp = malloc(sizeof(struct buf), M_SEGMENT, M_WAITOK); + bzero(bp, sizeof(struct buf)); + if (nbytes) /* no data area is allocated when this is 0 */ + bp->b_data = malloc(nbytes, M_SEGMENT, M_WAITOK); + bgetvp(vp, bp); + bp->b_bufsize = size; + bp->b_bcount = size; + bp->b_lblkno = daddr; + bp->b_blkno = daddr; + bp->b_error = 0; + bp->b_resid = 0; + bp->b_iodone = lfs_callback; + bp->b_flags |= B_BUSY | B_CALL | B_NOCACHE; + return (bp); +} +/* b_iodone routine set by lfs_newbuf(): drop lfs_iocount, wake sleepers when it reaches zero, free the buffer. */ +void +lfs_callback(bp) + struct buf *bp; +{ + struct lfs *fs; + + fs = (struct lfs *)bp->b_saveaddr; +#ifdef DIAGNOSTIC + if (fs->lfs_iocount == 0) + panic("lfs_callback: zero iocount\n"); +#endif + if (--fs->lfs_iocount == 0) + wakeup(&fs->lfs_iocount); + + brelvp(bp); + free(bp->b_data, M_SEGMENT); + free(bp, M_SEGMENT); +} +/* Detach the buffer from its vnode and free its data area and header. */ +void +lfs_supercallback(bp) + struct buf *bp; +{ + brelvp(bp); + free(bp->b_data, M_SEGMENT); + free(bp, M_SEGMENT); +} + +/* + * Shellsort (diminishing increment sort) from Data Structures and + * Algorithms, Aho, Hopcroft and Ullman, 1983 Edition, page 290; + * see also Knuth Vol. 3, page 84. The increments are selected from + * formula (8), page 95. Roughly O(N^3/2). 
+ */ +/* + * This is our own private copy of shellsort because we want to sort + * two parallel arrays (the array of buffer pointers and the array of + * logical block numbers) simultaneously. Note that we cast the array + * of logical block numbers to an unsigned in this routine so that the + * negative block numbers (meta data blocks) sort AFTER the data blocks. NOTE(review): no such cast is visible below; the comparison is on the daddr_t elements directly -- confirm. + */ +void +lfs_shellsort(bp_array, lb_array, nmemb) + struct buf **bp_array; + daddr_t *lb_array; + register int nmemb; +{ + static int __rsshell_increments[] = { 4, 1, 0 }; /* the 0 entry ends the outer loop */ + register int incr, *incrp, t1, t2; + struct buf *bp_temp; + u_long lb_temp; + + for (incrp = __rsshell_increments; incr = *incrp++;) + for (t1 = incr; t1 < nmemb; ++t1) + for (t2 = t1 - incr; t2 >= 0;) + if (lb_array[t2] > lb_array[t2 + incr]) { + lb_temp = lb_array[t2]; + lb_array[t2] = lb_array[t2 + incr]; + lb_array[t2 + incr] = lb_temp; + bp_temp = bp_array[t2]; + bp_array[t2] = bp_array[t2 + incr]; + bp_array[t2 + incr] = bp_temp; + t2 -= incr; + } else + break; +} + +/* + * Check VXLOCK. Return 1 if the vnode is locked. Otherwise, vget it and return vget()'s result. + */ +lfs_vref(vp) + register struct vnode *vp; +{ + + if (vp->v_flag & VXLOCK) + return(1); + return (vget(vp, 0)); +} + +void +lfs_vunref(vp) + register struct vnode *vp; +{ + extern int lfs_no_inactive; + + /* + * This is vrele except that we do not want to VOP_INACTIVE + * this vnode. Rather than inline vrele here, we use a global + * flag to tell lfs_inactive not to run. Yes, it's gross. + */ + lfs_no_inactive = 1; + vrele(vp); + lfs_no_inactive = 0; +} diff --git a/sys/ufs/lfs/lfs_subr.c b/sys/ufs/lfs/lfs_subr.c new file mode 100644 index 00000000000..afcd8c29b3f --- /dev/null +++ b/sys/ufs/lfs/lfs_subr.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_subr.c 8.2 (Berkeley) 9/21/93 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * Return buffer with the contents of block "offset" from the beginning of + * directory "ip". If "res" is non-zero, fill it in with a pointer to the + * remaining space in the directory. 
+ */ +int +lfs_blkatoff(ap) + struct vop_blkatoff_args /* { + struct vnode *a_vp; + off_t a_offset; + char **a_res; + struct buf **a_bpp; + } */ *ap; +{ + register struct lfs *fs; + struct inode *ip; + struct buf *bp; + daddr_t lbn; + int bsize, error; + + ip = VTOI(ap->a_vp); + fs = ip->i_lfs; + lbn = lblkno(fs, ap->a_offset); + bsize = blksize(fs); + + *ap->a_bpp = NULL; /* stays NULL if the read below fails */ + if (error = bread(ap->a_vp, lbn, bsize, NOCRED, &bp)) { + brelse(bp); /* bp is released even though bread() reported an error */ + return (error); + } + if (ap->a_res) + *ap->a_res = (char *)bp->b_data + blkoff(fs, ap->a_offset); + *ap->a_bpp = bp; + return (0); +} + + +/* + * lfs_seglock -- + * Single thread the segment writer. A re-entry by the pid that holds the lock only bumps the count and ORs in flags. + */ +void +lfs_seglock(fs, flags) + struct lfs *fs; + unsigned long flags; +{ + struct segment *sp; + int s; + + if (fs->lfs_seglock) + if (fs->lfs_lockpid == curproc->p_pid) { + ++fs->lfs_seglock; + fs->lfs_sp->seg_flags |= flags; + return; + } else while (fs->lfs_seglock) /* held by another pid: sleep until the count is zero */ + (void)tsleep(&fs->lfs_seglock, PRIBIO + 1, + "lfs seglock", 0); + + fs->lfs_seglock = 1; + fs->lfs_lockpid = curproc->p_pid; + + sp = fs->lfs_sp = malloc(sizeof(struct segment), M_SEGMENT, M_WAITOK); + sp->bpp = malloc(((LFS_SUMMARY_SIZE - sizeof(SEGSUM)) / + sizeof(daddr_t) + 1) * sizeof(struct buf *), M_SEGMENT, M_WAITOK); + sp->seg_flags = flags; + sp->vp = NULL; + (void) lfs_initseg(fs); + + /* + * Keep a cumulative count of the outstanding I/O operations. If the + * disk drive catches up with us it could go to zero before we finish, + * so we artificially increment it by one until we've scheduled all of + * the writes we intend to do. + */ + s = splbio(); + ++fs->lfs_iocount; + splx(s); +} +/* + * lfs_segunlock -- + * Drop one hold on the segment writer lock. 
+ */ +void +lfs_segunlock(fs) + struct lfs *fs; +{ + struct segment *sp; + unsigned long sync, ckp; + int s; + + if (fs->lfs_seglock == 1) { /* last hold: tear down the segment state */ + + sp = fs->lfs_sp; + sync = sp->seg_flags & SEGM_SYNC; + ckp = sp->seg_flags & SEGM_CKP; + if (sp->bpp != sp->cbpp) { + /* Free allocated segment summary */ + fs->lfs_offset -= LFS_SUMMARY_SIZE / DEV_BSIZE; + brelvp(*sp->bpp); + free((*sp->bpp)->b_data, M_SEGMENT); + free(*sp->bpp, M_SEGMENT); + } else + printf ("unlock to 0 with no summary"); + free(sp->bpp, M_SEGMENT); + free(sp, M_SEGMENT); + + /* + * If the I/O count is non-zero, sleep until it reaches zero. + * At the moment, the user's process hangs around so we can + * sleep. + */ + s = splbio(); + --fs->lfs_iocount; /* drops the extra count taken in lfs_seglock() */ + /* + * We let checkpoints happen asynchronously. That means + * that during recovery, we have to roll forward between + * the two segments described by the first and second + * superblocks to make sure that the checkpoint described + * by a superblock completed. + */ + if (sync && fs->lfs_iocount) + (void)tsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs vflush", 0); + splx(s); + if (ckp) { + fs->lfs_nactive = 0; + lfs_writesuper(fs); + } + --fs->lfs_seglock; + fs->lfs_lockpid = 0; + wakeup(&fs->lfs_seglock); + } else if (fs->lfs_seglock == 0) { + panic ("Seglock not held"); + } else { /* nested hold: just drop the count */ + --fs->lfs_seglock; + } +} diff --git a/sys/ufs/lfs/lfs_syscalls.c b/sys/ufs/lfs/lfs_syscalls.c new file mode 100644 index 00000000000..666595e6b59 --- /dev/null +++ b/sys/ufs/lfs/lfs_syscalls.c @@ -0,0 +1,562 @@ +/*- + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_syscalls.c 8.5 (Berkeley) 4/20/94 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#define BUMP_FIP(SP) \ + (SP)->fip = (FINFO *) (&(SP)->fip->fi_blocks[(SP)->fip->fi_nblocks]) + +#define INC_FINFO(SP) ++((SEGSUM *)((SP)->segsum))->ss_nfinfo +#define DEC_FINFO(SP) --((SEGSUM *)((SP)->segsum))->ss_nfinfo + +/* + * Before committing to add something to a segment summary, make sure there + * is enough room. S is the bytes added to the summary. 
+ */
+/* CHECK_SEG expands to code that uses locals named `sp' and `fs'. */
+#define CHECK_SEG(s)			\
+if (sp->sum_bytes_left < (s)) {		\
+	(void) lfs_writeseg(fs, sp);	\
+}
+/* Build a placeholder buffer whose data still lives at a user address. */
+struct buf *lfs_fakebuf __P((struct vnode *, int, size_t, caddr_t));
+
+/*
+ * lfs_markv:
+ *
+ * This will mark inodes and blocks dirty, so they are written into the log.
+ * It will block until all the blocks have been written. The segment create
+ * time passed in the block_info and inode_info structures is used to decide
+ * if the data is valid for each block (in case some process dirtied a block
+ * or inode that is being cleaned between the determination that a block is
+ * live and the lfs_markv call).
+ *
+ * The caller must pass suser(); the BLOCK_INFO array is copied in from
+ * uap->blkiov and walked in order, with consecutive entries that share a
+ * bi_inode treated as one file.
+ *
+ * 0 on success
+ * -1/errno is return on error.
+ */
+struct lfs_markv_args {
+	fsid_t *fsidp;		/* file system */
+	BLOCK_INFO *blkiov;	/* block array */
+	int blkcnt;		/* count of block array entries */
+};
+int
+lfs_markv(p, uap, retval)
+	struct proc *p;
+	struct lfs_markv_args *uap;
+	int *retval;
+{
+	struct segment *sp;
+	BLOCK_INFO *blkp;
+	IFILE *ifp;
+	struct buf *bp, **bpp;
+	struct inode *ip;
+	struct lfs *fs;
+	struct mount *mntp;
+	struct vnode *vp;
+	fsid_t fsid;
+	void *start;
+	ino_t lastino;
+	daddr_t b_daddr, v_daddr;
+	u_long bsize;
+	int cnt, error;
+
+	if (error = suser(p->p_ucred, &p->p_acflag))
+		return (error);
+
+	if (error = copyin(uap->fsidp, &fsid, sizeof(fsid_t)))
+		return (error);
+	if ((mntp = getvfs(&fsid)) == NULL)
+		return (EINVAL);
+
+	/*
+	 * NOTE(review): blkcnt comes straight from the caller and is not
+	 * range-checked before it sizes the allocation and the copyin.
+	 */
+	cnt = uap->blkcnt;
+	start = malloc(cnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
+	if (error = copyin(uap->blkiov, start, cnt * sizeof(BLOCK_INFO)))
+		goto err1;
+
+	/* Mark blocks/inodes dirty. */
+	fs = VFSTOUFS(mntp)->um_lfs;
+	bsize = fs->lfs_bsize;
+	error = 0;
+
+	lfs_seglock(fs, SEGM_SYNC | SEGM_CLEAN);
+	sp = fs->lfs_sp;
+	for (v_daddr = LFS_UNUSED_DADDR, lastino = LFS_UNUSED_INUM,
+	    blkp = start; cnt--; ++blkp) {
+		/*
+		 * Get the IFILE entry (only once) and see if the file still
+		 * exists.
+		 */
+		if (lastino != blkp->bi_inode) {
+			if (lastino != LFS_UNUSED_INUM) {
+				/* Finish up last file */
+				if (sp->fip->fi_nblocks == 0) {
+					DEC_FINFO(sp);
+					sp->sum_bytes_left +=
+					    sizeof(FINFO) - sizeof(daddr_t);
+				} else {
+					lfs_updatemeta(sp);
+					BUMP_FIP(sp);
+				}
+
+				/*
+				 * NOTE(review): ip and vp are only assigned
+				 * after a successful lfs_fastvget() below.
+				 * If the previous file was skipped because
+				 * its v_daddr was LFS_UNUSED_DADDR, lastino
+				 * is still set and this path uses ip/vp
+				 * from an earlier file (or uninitialized).
+				 * The post-loop code guards the same calls
+				 * with sp->vp; confirm whether this one
+				 * should too.
+				 */
+				lfs_writeinode(fs, sp, ip);
+				lfs_vunref(vp);
+			}
+
+			/* Start a new file */
+			CHECK_SEG(sizeof(FINFO));
+			sp->sum_bytes_left -= sizeof(FINFO) - sizeof(daddr_t);
+			INC_FINFO(sp);
+			sp->start_lbp = &sp->fip->fi_blocks[0];
+			sp->vp = NULL;
+			sp->fip->fi_version = blkp->bi_version;
+			sp->fip->fi_nblocks = 0;
+			sp->fip->fi_ino = blkp->bi_inode;
+			lastino = blkp->bi_inode;
+			/* The ifile's own address is kept in the superblock. */
+			if (blkp->bi_inode == LFS_IFILE_INUM)
+				v_daddr = fs->lfs_idaddr;
+			else {
+				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
+				v_daddr = ifp->if_daddr;
+				brelse(bp);
+			}
+			/* No disk address: skip every entry for this inode. */
+			if (v_daddr == LFS_UNUSED_DADDR)
+				continue;
+
+			/* Get the vnode/inode. */
+			if (lfs_fastvget(mntp, blkp->bi_inode, v_daddr, &vp,
+			    blkp->bi_lbn == LFS_UNUSED_LBN ?
+			    blkp->bi_bp : NULL)) {
+#ifdef DIAGNOSTIC
+				printf("lfs_markv: VFS_VGET failed (%d)\n",
+				    blkp->bi_inode);
+#endif
+				/*
+				 * NOTE(review): the FINFO counted by
+				 * INC_FINFO above is not backed out on this
+				 * path, because resetting lastino bypasses
+				 * the "finish up" code -- confirm.
+				 */
+				lastino = LFS_UNUSED_INUM;
+				v_daddr = LFS_UNUSED_DADDR;
+				continue;
+			}
+			sp->vp = vp;
+			ip = VTOI(vp);
+		} else if (v_daddr == LFS_UNUSED_DADDR)
+			continue;
+
+		/* If this BLOCK_INFO didn't contain a block, keep going. */
+		if (blkp->bi_lbn == LFS_UNUSED_LBN)
+			continue;
+		/* Skip blocks that no longer live where the caller saw them. */
+		if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) ||
+		    b_daddr != blkp->bi_daddr)
+			continue;
+		/*
+		 * If we got to here, then we are keeping the block. If it
+		 * is an indirect block, we want to actually put it in the
+		 * buffer cache so that it can be updated in the finish_meta
+		 * section. If it's not, we need to allocate a fake buffer
+		 * so that writeseg can perform the copyin and write the buffer.
+		 */
+		if (blkp->bi_lbn >= 0)	/* Data Block */
+			bp = lfs_fakebuf(vp, blkp->bi_lbn, bsize,
+			    blkp->bi_bp);
+		else {
+			bp = getblk(vp, blkp->bi_lbn, bsize, 0, 0);
+			/* Only copy in if the cache holds no valid contents. */
+			if (!(bp->b_flags & (B_DELWRI | B_DONE | B_CACHE)) &&
+			    (error = copyin(blkp->bi_bp, bp->b_data,
+			    bsize)))
+				goto err2;
+			if (error = VOP_BWRITE(bp))
+				goto err2;
+		}
+		/* Retry for as long as lfs_gatherblock() returns non-zero. */
+		while (lfs_gatherblock(sp, bp, NULL));
+	}
+	/* Finish the last file, but only if a vnode was obtained for it. */
+	if (sp->vp) {
+		if (sp->fip->fi_nblocks == 0) {
+			DEC_FINFO(sp);
+			sp->sum_bytes_left +=
+			    sizeof(FINFO) - sizeof(daddr_t);
+		} else
+			lfs_updatemeta(sp);
+
+		lfs_writeinode(fs, sp, ip);
+		lfs_vunref(vp);
+	}
+	(void) lfs_writeseg(fs, sp);
+	lfs_segunlock(fs);
+	free(start, M_SEGMENT);
+	return (error);
+
+/*
+ * XXX
+ * If we come in to error 2, we might have indirect blocks that were
+ * updated and now have bad block pointers. I don't know what to do
+ * about this.
+ */
+
+err2:	lfs_vunref(vp);
+	/* Free up fakebuffers */
+	for (bpp = --sp->cbpp; bpp >= sp->bpp; --bpp)
+		if ((*bpp)->b_flags & B_CALL) {
+			brelvp(*bpp);
+			free(*bpp, M_SEGMENT);
+		} else
+			brelse(*bpp);
+	lfs_segunlock(fs);
+err1:
+	free(start, M_SEGMENT);
+	return (error);
+}
+
+/*
+ * lfs_bmapv:
+ *
+ * This will fill in the current disk address for arrays of blocks.
+ *
+ * 0 on success
+ * -1/errno is return on error.
+ */
+struct lfs_bmapv_args {
+	fsid_t *fsidp;		/* file system */
+	BLOCK_INFO *blkiov;	/* block array */
+	int blkcnt;		/* count of block array entries */
+};
+/*
+ * Copy in the caller's BLOCK_INFO array, store the current disk address of
+ * each (bi_inode, bi_lbn) pair in bi_daddr, and copy the array back out.
+ * Entries whose bi_lbn is LFS_UNUSED_LBN are left untouched; entries whose
+ * vnode or block mapping cannot be obtained get LFS_UNUSED_DADDR.
+ *
+ * Returns 0 on success, or the error from suser(), copyin() or copyout();
+ * EINVAL if the fsid does not name a mounted file system.
+ */
+int
+lfs_bmapv(p, uap, retval)
+	struct proc *p;
+	struct lfs_bmapv_args *uap;
+	int *retval;
+{
+	BLOCK_INFO *blkp;
+	struct mount *mntp;
+	struct vnode *vp;
+	fsid_t fsid;
+	void *start;
+	daddr_t daddr;
+	int cnt, error, step;
+
+	if (error = suser(p->p_ucred, &p->p_acflag))
+		return (error);
+
+	if (error = copyin(uap->fsidp, &fsid, sizeof(fsid_t)))
+		return (error);
+	if ((mntp = getvfs(&fsid)) == NULL)
+		return (EINVAL);
+
+	cnt = uap->blkcnt;
+	start = blkp = malloc(cnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
+	if (error = copyin(uap->blkiov, blkp, cnt * sizeof(BLOCK_INFO))) {
+		free(blkp, M_SEGMENT);
+		return (error);
+	}
+
+	for (step = cnt; step--; ++blkp) {
+		if (blkp->bi_lbn == LFS_UNUSED_LBN)
+			continue;
+		/* Could be a deadlock ? */
+		if (VFS_VGET(mntp, blkp->bi_inode, &vp))
+			daddr = LFS_UNUSED_DADDR;
+		else {
+			if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &daddr, NULL))
+				daddr = LFS_UNUSED_DADDR;
+			vput(vp);
+		}
+		blkp->bi_daddr = daddr;
+	}
+	/*
+	 * Report a failed copyout instead of claiming success: the caller
+	 * would otherwise act on addresses that were never delivered.
+	 */
+	error = copyout(start, uap->blkiov, cnt * sizeof(BLOCK_INFO));
+	free(start, M_SEGMENT);
+	return (error);
+}
+
+/*
+ * lfs_segclean:
+ *
+ * Mark the segment clean.
+ *
+ * 0 on success
+ * -1/errno is return on error.
+ */
+struct lfs_segclean_args {
+	fsid_t *fsidp;		/* file system */
+	u_long segment;		/* segment number */
+};
+/*
+ * Super-user only.  Refuses (EBUSY) the segment currently being written
+ * and any segment flagged SEGUSE_ACTIVE; otherwise credits the segment's
+ * space back to lfs_avail/lfs_bfree, clears SEGUSE_DIRTY, adjusts the
+ * cleaner-info clean/dirty counts and wakes sleepers on lfs_avail.
+ */
+int
+lfs_segclean(p, uap, retval)
+	struct proc *p;
+	struct lfs_segclean_args *uap;
+	int *retval;
+{
+	CLEANERINFO *cinfo;
+	SEGUSE *segp;
+	struct buf *bp;
+	struct mount *mp;
+	struct lfs *fs;
+	fsid_t fsid;
+	int error;
+
+	error = suser(p->p_ucred, &p->p_acflag);
+	if (error)
+		return (error);
+
+	error = copyin(uap->fsidp, &fsid, sizeof(fsid_t));
+	if (error)
+		return (error);
+
+	mp = getvfs(&fsid);
+	if (mp == NULL)
+		return (EINVAL);
+
+	fs = VFSTOUFS(mp)->um_lfs;
+
+	/* The segment being written right now cannot be cleaned. */
+	if (uap->segment == datosn(fs, fs->lfs_curseg))
+		return (EBUSY);
+
+	LFS_SEGENTRY(segp, fs, uap->segment, bp);
+	if ((segp->su_flags & SEGUSE_ACTIVE) != 0) {
+		brelse(bp);
+		return (EBUSY);
+	}
+
+	/* Give the segment's space back to the free-space accounting. */
+	fs->lfs_avail += fsbtodb(fs, fs->lfs_ssize) - 1;
+	fs->lfs_bfree += (segp->su_nsums * LFS_SUMMARY_SIZE / DEV_BSIZE) +
+	    segp->su_ninos * btodb(fs->lfs_bsize);
+	segp->su_flags &= ~SEGUSE_DIRTY;
+	(void) VOP_BWRITE(bp);
+
+	/* One more clean segment, one fewer dirty one. */
+	LFS_CLEANERINFO(cinfo, fs, bp);
+	++cinfo->clean;
+	--cinfo->dirty;
+	(void) VOP_BWRITE(bp);
+
+	wakeup(&fs->lfs_avail);
+	return (0);
+}
+
+/*
+ * lfs_segwait:
+ *
+ * This will block until a segment in file system fsid is written. A timeout
+ * in milliseconds may be specified which will awake the cleaner automatically.
+ * An fsid of -1 means any file system, and a timeout of 0 means forever.
+ *
+ * 0 on success
+ * 1 on timeout
+ * -1/errno is return on error.
+ */
+struct lfs_segwait_args {
+	fsid_t *fsidp;		/* file system */
+	struct timeval *tv;	/* timeout */
+};
+/*
+ * Sleep until woken on the chosen file system's lfs_nextseg, or on
+ * lfs_allclean_wakeup when the fsid does not resolve to a mount.  As
+ * written, the function returns EINTR when tsleep() returns ERESTART and
+ * 0 for every other tsleep() result, including a timeout.
+ */
+int
+lfs_segwait(p, uap, retval)
+	struct proc *p;
+	struct lfs_segwait_args *uap;
+	int *retval;
+{
+	extern int lfs_allclean_wakeup;
+	struct mount *mntp;
+	struct timeval atv;
+	fsid_t fsid;
+	void *addr;
+	u_long timeout;
+	int error, s;
+
+	if (error = suser(p->p_ucred, &p->p_acflag)) {
+		return (error);
+}
+#ifdef WHEN_QUADS_WORK
+	if (error = copyin(uap->fsidp, &fsid, sizeof(fsid_t)))
+		return (error);
+	if (fsid == (fsid_t)-1)
+		addr = &lfs_allclean_wakeup;
+	else {
+		if ((mntp = getvfs(&fsid)) == NULL)
+			return (EINVAL);
+		addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg;
+	}
+#else
+	if (error = copyin(uap->fsidp, &fsid, sizeof(fsid_t)))
+		return (error);
+	/* Without the quad compare, any unknown fsid means "any fs". */
+	if ((mntp = getvfs(&fsid)) == NULL)
+		addr = &lfs_allclean_wakeup;
+	else
+		addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg;
+#endif
+
+	if (uap->tv) {
+		if (error = copyin(uap->tv, &atv, sizeof(struct timeval)))
+			return (error);
+		if (itimerfix(&atv))
+			return (EINVAL);
+		/* Turn the relative timeout into ticks, with the clock held. */
+		s = splclock();
+		timevaladd(&atv, (struct timeval *)&time);
+		timeout = hzto(&atv);
+		splx(s);
+	} else
+		timeout = 0;
+
+	error = tsleep(addr, PCATCH | PUSER, "segment", timeout);
+	return (error == ERESTART ? EINTR : 0);
+}
+
+/*
+ * VFS_VGET call specialized for the cleaner. The cleaner already knows the
+ * daddr from the ifile, so don't look it up again. If the cleaner is
+ * processing IINFO structures, it may have the ondisk inode already, so
+ * don't go retrieving it again.
+ *
+ * mp/ino identify the inode, daddr is the disk address to read it from
+ * when dinp is NULL, and dinp (if non-NULL) is a user-space address of a
+ * struct dinode to copy in instead.  On success *vpp holds the vnode and
+ * 0 is returned; on failure *vpp is NULL and the error is returned.
+ */
+int
+lfs_fastvget(mp, ino, daddr, vpp, dinp)
+	struct mount *mp;
+	ino_t ino;
+	daddr_t daddr;
+	struct vnode **vpp;
+	struct dinode *dinp;
+{
+	register struct inode *ip;
+	struct vnode *vp;
+	struct ufsmount *ump;
+	struct buf *bp;
+	dev_t dev;
+	int error;
+
+	ump = VFSTOUFS(mp);
+	dev = ump->um_dev;
+	/*
+	 * This is playing fast and loose. Someone may have the inode
+	 * locked, in which case they are going to be distinctly unhappy
+	 * if we trash something.
+	 */
+	if ((*vpp = ufs_ihashlookup(dev, ino)) != NULL) {
+		lfs_vref(*vpp);
+		if ((*vpp)->v_flag & VXLOCK)
+			printf ("Cleaned vnode VXLOCKED\n");
+		ip = VTOI(*vpp);
+		/* IN_LOCKED is an i_flag bit, like IN_MODIFIED below. */
+		if (ip->i_flag & IN_LOCKED)
+			printf("cleaned vnode locked\n");
+		/* Count the inode as dirty only on the clean->dirty edge. */
+		if (!(ip->i_flag & IN_MODIFIED))
+			++ump->um_lfs->lfs_uinodes;
+		ip->i_flag |= IN_MODIFIED;
+		return (0);
+	}
+
+	/* Allocate new vnode/inode. */
+	if (error = lfs_vcreate(mp, ino, &vp)) {
+		*vpp = NULL;
+		return (error);
+	}
+
+	/*
+	 * Put it onto its hash chain and lock it so that other requests for
+	 * this inode will block if they arrive while we are sleeping waiting
+	 * for old data structures to be purged or for the contents of the
+	 * disk portion of this inode to be read.
+	 */
+	ip = VTOI(vp);
+	ufs_ihashins(ip);
+
+	/*
+	 * XXX
+	 * This may not need to be here, logically it should go down with
+	 * the i_devvp initialization.
+	 * Ask Kirk.
+	 */
+	ip->i_lfs = ump->um_lfs;
+
+	/*
+	 * Read in the disk contents for the inode, copy into the inode.
+	 * The braces matter: without them the `else' pairs with the inner
+	 * copyin test, so a successful copyin is immediately overwritten
+	 * by the disk read and a NULL dinp reads nothing at all.
+	 */
+	if (dinp) {
+		if (error = copyin(dinp, &ip->i_din, sizeof(struct dinode))) {
+			/*
+			 * Same cleanup as the bread failure below: do not
+			 * leave a half-initialized inode on the hash chain.
+			 */
+			ufs_ihashrem(ip);
+			lfs_vunref(vp);
+			*vpp = NULL;
+			return (error);
+		}
+	} else {
+		if (error = bread(ump->um_devvp, daddr,
+		    (int)ump->um_lfs->lfs_bsize, NOCRED, &bp)) {
+			/*
+			 * The inode does not contain anything useful, so it
+			 * would be misleading to leave it on its hash chain.
+			 * Iput() will return it to the free list.
+			 */
+			ufs_ihashrem(ip);
+
+			/* Unlock and discard unneeded inode. */
+			lfs_vunref(vp);
+			brelse(bp);
+			*vpp = NULL;
+			return (error);
+		}
+		ip->i_din =
+		    *lfs_ifind(ump->um_lfs, ino, (struct dinode *)bp->b_data);
+		brelse(bp);
+	}
+
+	/* Inode was just read from user space or disk, make sure it's locked */
+	ip->i_flag |= IN_LOCKED;
+
+	/*
+	 * Initialize the vnode from the inode, check for aliases. In all
+	 * cases re-init ip, the underlying vnode/inode may have changed.
+	 */
+	if (error = ufs_vinit(mp, lfs_specop_p, LFS_FIFOOPS, &vp)) {
+		lfs_vunref(vp);
+		*vpp = NULL;
+		return (error);
+	}
+	/*
+	 * Finish inode initialization now that aliasing has been resolved.
+	 */
+	ip->i_devvp = ump->um_devvp;
+	ip->i_flag |= IN_MODIFIED;
+	++ump->um_lfs->lfs_uinodes;
+	VREF(ip->i_devvp);
+	*vpp = vp;
+	return (0);
+}
+/*
+ * Allocate a buffer via lfs_newbuf() for block lbn of vp whose contents
+ * still live at user address uaddr (saved in b_saveaddr).  The buffer is
+ * sized to `size' bytes and marked B_INVAL.
+ */
+struct buf *
+lfs_fakebuf(vp, lbn, size, uaddr)
+	struct vnode *vp;
+	int lbn;
+	size_t size;
+	caddr_t uaddr;
+{
+	struct buf *bp;
+
+	bp = lfs_newbuf(vp, lbn, 0);
+	bp->b_saveaddr = uaddr;
+	bp->b_bufsize = size;
+	bp->b_bcount = size;
+	bp->b_flags |= B_INVAL;
+	return (bp);
+}
diff --git a/sys/ufs/lfs/lfs_vfsops.c b/sys/ufs/lfs/lfs_vfsops.c
new file mode 100644
index 00000000000..0c8186e2322
--- /dev/null
+++ b/sys/ufs/lfs/lfs_vfsops.c
@@ -0,0 +1,573 @@
+/*
+ * Copyright (c) 1989, 1991, 1993, 1994
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)lfs_vfsops.c	8.7 (Berkeley) 4/16/94
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+int lfs_mountfs __P((struct vnode *, struct mount *, struct proc *));
+
+/* VFS operations vector for LFS; UFS routines are shared where listed. */
+struct vfsops lfs_vfsops = {
+	lfs_mount,
+	ufs_start,
+	lfs_unmount,
+	ufs_root,
+	ufs_quotactl,
+	lfs_statfs,
+	lfs_sync,
+	lfs_vget,
+	lfs_fhtovp,
+	lfs_vptofh,
+	lfs_init,
+};
+
+/* Mounting LFS as the root file system is not implemented; panics. */
+int
+lfs_mountroot()
+{
+	panic("lfs_mountroot");		/* XXX -- implement */
+}
+
+/*
+ * VFS Operations.
+ *
+ * mount system call
+ *
+ * Copies in a struct ufs_args from `data'.  Exported mounts are refused
+ * with EINVAL.  For MNT_UPDATE a read-only mount may be upgraded to
+ * read/write; with no device name only the export request is processed.
+ * Otherwise args.fspec is looked up, must be a block device, and is
+ * handed to lfs_mountfs().  Returns 0 or an errno value.
+ */
+lfs_mount(mp, path, data, ndp, p)
+	register struct mount *mp;
+	char *path;
+	caddr_t data;
+	struct nameidata *ndp;
+	struct proc *p;
+{
+	struct vnode *devvp;
+	struct ufs_args args;
+	struct ufsmount *ump;
+	register struct lfs *fs;				/* LFS */
+	u_int size;
+	int error;
+
+	if (error = copyin(data, (caddr_t)&args, sizeof (struct ufs_args)))
+		return (error);
+
+	/* Until LFS can do NFS right. XXX */
+	if (args.export.ex_flags & MNT_EXPORTED)
+		return (EINVAL);
+
+	/*
+	 * If updating, check whether changing from read-only to
+	 * read/write; if there is no device name, that's all we do.
+	 */
+	if (mp->mnt_flag & MNT_UPDATE) {
+		ump = VFSTOUFS(mp);
+#ifdef NOTLFS							/* LFS */
+		fs = ump->um_fs;
+		if (fs->fs_ronly && (mp->mnt_flag & MNT_RDONLY) == 0)
+			fs->fs_ronly = 0;
+#else
+		fs = ump->um_lfs;
+		if (fs->lfs_ronly && (mp->mnt_flag & MNT_RDONLY) == 0)
+			fs->lfs_ronly = 0;
+#endif
+		if (args.fspec == 0) {
+			/*
+			 * Process export requests.
+			 */
+			return (vfs_export(mp, &ump->um_export, &args.export));
+		}
+	}
+	/*
+	 * Not an update, or updating the name: look up the name
+	 * and verify that it refers to a sensible block device.
+	 */
+	NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p);
+	if (error = namei(ndp))
+		return (error);
+	devvp = ndp->ni_vp;
+	if (devvp->v_type != VBLK) {
+		vrele(devvp);
+		return (ENOTBLK);
+	}
+	if (major(devvp->v_rdev) >= nblkdev) {
+		vrele(devvp);
+		return (ENXIO);
+	}
+	if ((mp->mnt_flag & MNT_UPDATE) == 0)
+		error = lfs_mountfs(devvp, mp, p);		/* LFS */
+	else {
+		if (devvp != ump->um_devvp)
+			error = EINVAL;	/* needs translation */
+		else
+			vrele(devvp);
+	}
+	if (error) {
+		vrele(devvp);
+		return (error);
+	}
+	ump = VFSTOUFS(mp);
+	fs = ump->um_lfs;					/* LFS */
+#ifdef NOTLFS							/* LFS */
+	(void) copyinstr(path, fs->fs_fsmnt, sizeof(fs->fs_fsmnt) - 1, &size);
+	bzero(fs->fs_fsmnt + size, sizeof(fs->fs_fsmnt) - size);
+	bcopy((caddr_t)fs->fs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname,
+	    MNAMELEN);
+	(void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1,
+	    &size);
+	bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
+	(void) ufs_statfs(mp, &mp->mnt_stat, p);
+#else
+	/* Record the mount point and device names, zero-padded. */
+	(void)copyinstr(path, fs->lfs_fsmnt, sizeof(fs->lfs_fsmnt) - 1, &size);
+	bzero(fs->lfs_fsmnt + size, sizeof(fs->lfs_fsmnt) - size);
+	bcopy((caddr_t)fs->lfs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname,
+	    MNAMELEN);
+	(void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1,
+	    &size);
+	bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
+	(void) lfs_statfs(mp, &mp->mnt_stat, p);
+#endif
+	return (0);
+}
+
+/*
+ * Common code for mount and mountroot
+ * LFS specific
+ *
+ * Opens devvp, reads and sanity-checks the superblock, allocates the
+ * ufsmount and in-core struct lfs, fills in the mount structure and pins
+ * the ifile vnode.  Returns 0 or an errno value; on failure the device is
+ * closed again and everything allocated here is released.
+ */
+int
+lfs_mountfs(devvp, mp, p)
+	register struct vnode *devvp;
+	struct mount *mp;
+	struct proc *p;
+{
+	extern struct vnode *rootvp;
+	register struct lfs *fs;
+	register struct ufsmount *ump;
+	struct vnode *vp;
+	struct buf *bp;
+	struct partinfo dpart;
+	dev_t dev;
+	int error, i, ronly, size;
+
+	/*
+	 * Disallow multiple mounts of the same device.
+	 * Disallow mounting of a device that is currently in use
+	 * (except for root, which might share swap device for miniroot).
+	 * Flush out any old buffers remaining from a previous use.
+	 */
+	if (error = vfs_mountedon(devvp))
+		return (error);
+	if (vcount(devvp) > 1 && devvp != rootvp)
+		return (EBUSY);
+	if (error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0))
+		return (error);
+
+	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
+	if (error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p))
+		return (error);
+
+	/* Fall back to DEV_BSIZE when the driver has no partition info. */
+	if (VOP_IOCTL(devvp, DIOCGPART, (caddr_t)&dpart, FREAD, NOCRED, p) != 0)
+		size = DEV_BSIZE;
+	else {
+		size = dpart.disklab->d_secsize;
+#ifdef NEVER_USED
+		dpart.part->p_fstype = FS_LFS;
+		dpart.part->p_fsize = fs->lfs_fsize;	/* frag size */
+		dpart.part->p_frag = fs->lfs_frag;	/* frags per block */
+		dpart.part->p_cpg = fs->lfs_segshift;	/* segment shift */
+#endif
+	}
+
+	/* Don't free random space on error. */
+	bp = NULL;
+	ump = NULL;
+
+	/* Read in the superblock. */
+	if (error = bread(devvp, LFS_LABELPAD / size, LFS_SBPAD, NOCRED, &bp))
+		goto out;
+	fs = (struct lfs *)bp->b_data;
+
+	/* Check the basics. */
+	if (fs->lfs_magic != LFS_MAGIC || fs->lfs_bsize > MAXBSIZE ||
+	    fs->lfs_bsize < sizeof(struct lfs)) {
+		error = EINVAL;		/* XXX needs translation */
+		goto out;
+	}
+
+	/* Allocate the mount structure, copy the superblock into it. */
+	ump = (struct ufsmount *)malloc(sizeof *ump, M_UFSMNT, M_WAITOK);
+	fs = ump->um_lfs = malloc(sizeof(struct lfs), M_UFSMNT, M_WAITOK);
+	bcopy(bp->b_data, fs, sizeof(struct lfs));
+	if (sizeof(struct lfs) < LFS_SBPAD)			/* XXX why? */
+		bp->b_flags |= B_INVAL;
+	brelse(bp);
+	bp = NULL;
+
+	/* Set up the I/O information */
+	fs->lfs_iocount = 0;
+
+	/* Set up the ifile and lock aflags */
+	fs->lfs_doifile = 0;
+	fs->lfs_writer = 0;
+	fs->lfs_dirops = 0;
+	fs->lfs_seglock = 0;
+
+	/* Set the file system readonly/modify bits. */
+	fs->lfs_ronly = ronly;
+	if (ronly == 0)
+		fs->lfs_fmod = 1;
+
+	/* Initialize the mount structure. */
+	dev = devvp->v_rdev;
+	mp->mnt_data = (qaddr_t)ump;
+	mp->mnt_stat.f_fsid.val[0] = (long)dev;
+	mp->mnt_stat.f_fsid.val[1] = MOUNT_LFS;
+	mp->mnt_flag |= MNT_LOCAL;
+	ump->um_mountp = mp;
+	ump->um_dev = dev;
+	ump->um_devvp = devvp;
+	ump->um_bptrtodb = 0;
+	ump->um_seqinc = 1 << fs->lfs_fsbtodb;
+	ump->um_nindir = fs->lfs_nindir;
+	for (i = 0; i < MAXQUOTAS; i++)
+		ump->um_quotas[i] = NULLVP;
+	devvp->v_specflags |= SI_MOUNTEDON;
+
+	/*
+	 * We use the ifile vnode for almost every operation. Instead of
+	 * retrieving it from the hash table each time we retrieve it here,
+	 * artificially increment the reference count and keep a pointer
+	 * to it in the incore copy of the superblock.
+	 */
+	if (error = VFS_VGET(mp, LFS_IFILE_INUM, &vp))
+		goto out;
+	fs->lfs_ivnode = vp;
+	VREF(vp);
+	vput(vp);
+
+	return (0);
+out:
+	if (bp)
+		brelse(bp);
+	(void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, NOCRED, p);
+	if (ump) {
+		/*
+		 * ump is only non-NULL once SI_MOUNTEDON has been set above;
+		 * clear it so a failed mount does not leave the device
+		 * permanently looking mounted to vfs_mountedon().
+		 */
+		devvp->v_specflags &= ~SI_MOUNTEDON;
+		free(ump->um_lfs, M_UFSMNT);
+		free(ump, M_UFSMNT);
+		mp->mnt_data = (qaddr_t)0;
+	}
+	return (error);
+}
+
+/*
+ * unmount system call
+ *
+ * Flushes the mount's vnodes (forcibly with MNT_FORCE, if `doforce' is
+ * set and this is not the root fs), marks the file system clean, syncs,
+ * releases the ifile vnode, closes the device and frees the in-core
+ * structures.  Returns 0 or an errno value.
+ */
+lfs_unmount(mp, mntflags, p)
+	struct mount *mp;
+	int mntflags;
+	struct proc *p;
+{
+	extern int doforce;
+	register struct ufsmount *ump;
+	register struct lfs *fs;
+	int i, error, flags, ronly;
+
+	flags = 0;
+	if (mntflags & MNT_FORCE) {
+		if (!doforce || (mp->mnt_flag & MNT_ROOTFS))
+			return (EINVAL);
+		flags |= FORCECLOSE;
+	}
+
+	ump = VFSTOUFS(mp);
+	fs = ump->um_lfs;
+#ifdef QUOTA
+	if (mp->mnt_flag & MNT_QUOTA) {
+		if (error = vflush(mp, fs->lfs_ivnode, SKIPSYSTEM|flags))
+			return (error);
+		for (i = 0; i < MAXQUOTAS; i++) {
+			if (ump->um_quotas[i] == NULLVP)
+				continue;
+			quotaoff(p, mp, i);
+		}
+		/*
+		 * Here we fall through to vflush again to ensure
+		 * that we have gotten rid of all the system vnodes.
+		 */
+	}
+#endif
+	if (error = vflush(mp, fs->lfs_ivnode, flags))
+		return (error);
+	fs->lfs_clean = 1;
+	if (error = VFS_SYNC(mp, 1, p->p_ucred, p))
+		return (error);
+	if (fs->lfs_ivnode->v_dirtyblkhd.lh_first)
+		panic("lfs_unmount: still dirty blocks on ifile vnode\n");
+	vrele(fs->lfs_ivnode);
+	vgone(fs->lfs_ivnode);
+
+	/*
+	 * Close with the same mode expression lfs_mountfs() opened with.
+	 * The flag must not be negated here, or a read-only mount is
+	 * closed FREAD|FWRITE and a read/write one FREAD.
+	 */
+	ronly = fs->lfs_ronly;
+	ump->um_devvp->v_specflags &= ~SI_MOUNTEDON;
+	error = VOP_CLOSE(ump->um_devvp,
+	    ronly ? FREAD : FREAD|FWRITE, NOCRED, p);
+	vrele(ump->um_devvp);
+	free(fs, M_UFSMNT);
+	free(ump, M_UFSMNT);
+	mp->mnt_data = (qaddr_t)0;
+	mp->mnt_flag &= ~MNT_LOCAL;
+	return (error);
+}
+
+/*
+ * Get file system statistics.
+ */
+lfs_statfs(mp, sbp, p)
+	struct mount *mp;
+	register struct statfs *sbp;
+	struct proc *p;
+{
+	register struct lfs *fs;
+
+	fs = VFSTOUFS(mp)->um_lfs;
+	if (fs->lfs_magic != LFS_MAGIC)
+		panic("lfs_statfs: magic");
+
+	/* Sizes and block counts, reported in file system blocks. */
+	sbp->f_type = MOUNT_LFS;
+	sbp->f_bsize = fs->lfs_bsize;
+	sbp->f_iosize = fs->lfs_bsize;
+	sbp->f_blocks = dbtofsb(fs,fs->lfs_dsize);
+	sbp->f_bfree = dbtofsb(fs, fs->lfs_bfree);
+
+	/* Space left for ordinary users once the minfree reserve is held. */
+	sbp->f_bavail = (fs->lfs_dsize * (100 - fs->lfs_minfree) / 100) -
+	    (fs->lfs_dsize - fs->lfs_bfree);
+	sbp->f_bavail = dbtofsb(fs, sbp->f_bavail);
+
+	sbp->f_files = fs->lfs_nfiles;
+	sbp->f_ffree = sbp->f_bfree * INOPB(fs);
+
+	/* The mount's own statfs already carries the names. */
+	if (sbp == &mp->mnt_stat)
+		return (0);
+
+	bcopy((caddr_t)mp->mnt_stat.f_mntonname,
+	    (caddr_t)&sbp->f_mntonname[0], MNAMELEN);
+	bcopy((caddr_t)mp->mnt_stat.f_mntfromname,
+	    (caddr_t)&sbp->f_mntfromname[0], MNAMELEN);
+	return (0);
+}
+
+/*
+ * Go through the disk queues to initiate sandbagged IO;
+ * go through the inodes to write those that have been modified;
+ * initiate the writing of the super block if it has been modified.
+ *
+ * Note: we are always called with the filesystem marked `MPBUSY'.
+ */
+lfs_sync(mp, waitfor, cred, p)
+	struct mount *mp;
+	int waitfor;
+	struct ucred *cred;
+	struct proc *p;
+{
+	int error, segflags;
+
+	/* All syncs must be checkpoints until roll-forward is implemented. */
+	segflags = SEGM_CKP;
+	if (waitfor)
+		segflags |= SEGM_SYNC;
+	error = lfs_segwrite(mp, segflags);
+#ifdef QUOTA
+	qsync(mp);
+#endif
+	return (error);
+}
+
+/*
+ * Look up an LFS dinode number to find its incore vnode. If not already
+ * in core, read it in from the specified device. Return the inode locked.
+ * Detection and handling of mount points must be done by the calling routine.
+ */
+int
+lfs_vget(mp, ino, vpp)
+	struct mount *mp;
+	ino_t ino;
+	struct vnode **vpp;
+{
+	register struct lfs *fs;
+	register struct inode *ip;
+	struct buf *bp;
+	struct ifile *ifp;
+	struct vnode *vp;
+	struct ufsmount *ump;
+	daddr_t daddr;
+	dev_t dev;
+	int error;
+
+	/* Fast path: the inode is already in the hash table. */
+	ump = VFSTOUFS(mp);
+	dev = ump->um_dev;
+	if ((*vpp = ufs_ihashget(dev, ino)) != NULL)
+		return (0);
+
+	/* Translate the inode number to a disk address. */
+	fs = ump->um_lfs;
+	if (ino == LFS_IFILE_INUM)
+		daddr = fs->lfs_idaddr;
+	else {
+		LFS_IENTRY(ifp, fs, ino, bp);
+		daddr = ifp->if_daddr;
+		brelse(bp);
+		/* An ifile entry with no address means no such inode. */
+		if (daddr == LFS_UNUSED_DADDR)
+			return (ENOENT);
+	}
+
+	/* Allocate new vnode/inode. */
+	if (error = lfs_vcreate(mp, ino, &vp)) {
+		*vpp = NULL;
+		return (error);
+	}
+
+	/*
+	 * Put it onto its hash chain and lock it so that other requests for
+	 * this inode will block if they arrive while we are sleeping waiting
+	 * for old data structures to be purged or for the contents of the
+	 * disk portion of this inode to be read.
+	 */
+	ip = VTOI(vp);
+	ufs_ihashins(ip);
+
+	/*
+	 * XXX
+	 * This may not need to be here, logically it should go down with
+	 * the i_devvp initialization.
+	 * Ask Kirk.
+	 */
+	ip->i_lfs = ump->um_lfs;
+
+	/* Read in the disk contents for the inode, copy into the inode. */
+	if (error =
+	    bread(ump->um_devvp, daddr, (int)fs->lfs_bsize, NOCRED, &bp)) {
+		/*
+		 * The inode does not contain anything useful, so it would
+		 * be misleading to leave it on its hash chain. With mode
+		 * still zero, it will be unlinked and returned to the free
+		 * list by vput().
+		 */
+		vput(vp);
+		brelse(bp);
+		*vpp = NULL;
+		return (error);
+	}
+	/* Pick this inode's dinode out of the block just read. */
+	ip->i_din = *lfs_ifind(fs, ino, (struct dinode *)bp->b_data);
+	brelse(bp);
+
+	/*
+	 * Initialize the vnode from the inode, check for aliases. In all
+	 * cases re-init ip, the underlying vnode/inode may have changed.
+	 */
+	if (error = ufs_vinit(mp, lfs_specop_p, LFS_FIFOOPS, &vp)) {
+		vput(vp);
+		*vpp = NULL;
+		return (error);
+	}
+	/*
+	 * Finish inode initialization now that aliasing has been resolved.
+	 */
+	ip->i_devvp = ump->um_devvp;
+	VREF(ip->i_devvp);
+	*vpp = vp;
+	return (0);
+}
+
+/*
+ * File handle to vnode
+ *
+ * Have to be really careful about stale file handles:
+ * - check that the inode number is valid
+ * - call lfs_vget() to get the locked inode
+ * - check for an unallocated inode (i_mode == 0)
+ * - check that the given client host has export rights and return
+ *   those rights via. exflagsp and credanonp
+ *
+ * XXX
+ * use ifile to see if inode is allocated instead of reading off disk
+ * what is the relationship between my generational number and the NFS
+ * generational number.
+ */
+int
+lfs_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp)
+	register struct mount *mp;
+	struct fid *fhp;
+	struct mbuf *nam;
+	struct vnode **vpp;
+	int *exflagsp;
+	struct ucred **credanonp;
+{
+	register struct ufid *ufhp;
+
+	/* Only the lower bound is checked here; the rest is delegated. */
+	ufhp = (struct ufid *)fhp;
+	if (ufhp->ufid_ino < ROOTINO)
+		return (ESTALE);
+	return (ufs_check_export(mp, ufhp, nam, vpp, exflagsp, credanonp));
+}
+
+/*
+ * Vnode pointer to File handle
+ *
+ * Fills the caller's fid (viewed as a struct ufid) with the length,
+ * inode number and generation of vp's inode.  Always returns 0.
+ */
+/* ARGSUSED */
+lfs_vptofh(vp, fhp)
+	struct vnode *vp;
+	struct fid *fhp;
+{
+	register struct inode *ip;
+	register struct ufid *ufhp;
+
+	ip = VTOI(vp);
+	ufhp = (struct ufid *)fhp;
+	ufhp->ufid_len = sizeof(struct ufid);
+	ufhp->ufid_ino = ip->i_number;
+	ufhp->ufid_gen = ip->i_gen;
+	return (0);
+}
diff --git a/sys/ufs/lfs/lfs_vnops.c b/sys/ufs/lfs/lfs_vnops.c
new file mode 100644
index 00000000000..fc6bd480d22
--- /dev/null
+++ b/sys/ufs/lfs/lfs_vnops.c
@@ -0,0 +1,487 @@
+/*
+ * Copyright (c) 1986, 1989, 1991, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lfs_vnops.c 8.5 (Berkeley) 12/30/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +/* Global vfs data structures for lfs. 
*/
+/*
+ * Operations vector for regular LFS files and directories: each entry
+ * pairs a vnode operation descriptor with the routine that implements it.
+ */
+int (**lfs_vnodeop_p)();
+struct vnodeopv_entry_desc lfs_vnodeop_entries[] = {
+	{ &vop_default_desc, vn_default_error },
+	{ &vop_lookup_desc, ufs_lookup },		/* lookup */
+	{ &vop_create_desc, ufs_create },		/* create */
+	{ &vop_mknod_desc, ufs_mknod },			/* mknod */
+	{ &vop_open_desc, ufs_open },			/* open */
+	{ &vop_close_desc, lfs_close },			/* close */
+	{ &vop_access_desc, ufs_access },		/* access */
+	{ &vop_getattr_desc, lfs_getattr },		/* getattr */
+	{ &vop_setattr_desc, ufs_setattr },		/* setattr */
+	{ &vop_read_desc, lfs_read },			/* read */
+	{ &vop_write_desc, lfs_write },			/* write */
+	{ &vop_ioctl_desc, ufs_ioctl },			/* ioctl */
+	{ &vop_select_desc, ufs_select },		/* select */
+	{ &vop_mmap_desc, ufs_mmap },			/* mmap */
+	{ &vop_fsync_desc, lfs_fsync },			/* fsync */
+	{ &vop_seek_desc, ufs_seek },			/* seek */
+	{ &vop_remove_desc, ufs_remove },		/* remove */
+	{ &vop_link_desc, ufs_link },			/* link */
+	{ &vop_rename_desc, ufs_rename },		/* rename */
+	{ &vop_mkdir_desc, ufs_mkdir },			/* mkdir */
+	{ &vop_rmdir_desc, ufs_rmdir },			/* rmdir */
+	{ &vop_symlink_desc, ufs_symlink },		/* symlink */
+	{ &vop_readdir_desc, ufs_readdir },		/* readdir */
+	{ &vop_readlink_desc, ufs_readlink },		/* readlink */
+	{ &vop_abortop_desc, ufs_abortop },		/* abortop */
+	{ &vop_inactive_desc, lfs_inactive },		/* inactive */
+	{ &vop_reclaim_desc, ufs_reclaim },		/* reclaim */
+	{ &vop_lock_desc, ufs_lock },			/* lock */
+	{ &vop_unlock_desc, ufs_unlock },		/* unlock */
+	{ &vop_bmap_desc, ufs_bmap },			/* bmap */
+	{ &vop_strategy_desc, ufs_strategy },		/* strategy */
+	{ &vop_print_desc, ufs_print },			/* print */
+	{ &vop_islocked_desc, ufs_islocked },		/* islocked */
+	{ &vop_pathconf_desc, ufs_pathconf },		/* pathconf */
+	{ &vop_advlock_desc, ufs_advlock },		/* advlock */
+	{ &vop_blkatoff_desc, lfs_blkatoff },		/* blkatoff */
+	{ &vop_valloc_desc, lfs_valloc },		/* valloc */
+	{ &vop_vfree_desc, lfs_vfree },			/* vfree */
+	{ &vop_truncate_desc, lfs_truncate },		/* truncate */
+	{ &vop_update_desc, lfs_update },		/* update */
+	{ &vop_bwrite_desc, lfs_bwrite },		/* bwrite */
+	{ (struct vnodeop_desc*)NULL, (int(*)())NULL }
+};
+struct vnodeopv_desc lfs_vnodeop_opv_desc =
+	{ &lfs_vnodeop_p, lfs_vnodeop_entries };
+
+/*
+ * Operations vector for special (device) files on LFS: device semantics
+ * come from the spec_* routines, inode bookkeeping from ufs and lfs ones.
+ */
+int (**lfs_specop_p)();
+struct vnodeopv_entry_desc lfs_specop_entries[] = {
+	{ &vop_default_desc, vn_default_error },
+	{ &vop_lookup_desc, spec_lookup },		/* lookup */
+	{ &vop_create_desc, spec_create },		/* create */
+	{ &vop_mknod_desc, spec_mknod },		/* mknod */
+	{ &vop_open_desc, spec_open },			/* open */
+	{ &vop_close_desc, ufsspec_close },		/* close */
+	{ &vop_access_desc, ufs_access },		/* access */
+	{ &vop_getattr_desc, lfs_getattr },		/* getattr */
+	{ &vop_setattr_desc, ufs_setattr },		/* setattr */
+	{ &vop_read_desc, ufsspec_read },		/* read */
+	{ &vop_write_desc, ufsspec_write },		/* write */
+	{ &vop_ioctl_desc, spec_ioctl },		/* ioctl */
+	{ &vop_select_desc, spec_select },		/* select */
+	{ &vop_mmap_desc, spec_mmap },			/* mmap */
+	{ &vop_fsync_desc, spec_fsync },		/* fsync */
+	{ &vop_seek_desc, spec_seek },			/* seek */
+	{ &vop_remove_desc, spec_remove },		/* remove */
+	{ &vop_link_desc, spec_link },			/* link */
+	{ &vop_rename_desc, spec_rename },		/* rename */
+	{ &vop_mkdir_desc, spec_mkdir },		/* mkdir */
+	{ &vop_rmdir_desc, spec_rmdir },		/* rmdir */
+	{ &vop_symlink_desc, spec_symlink },		/* symlink */
+	{ &vop_readdir_desc, spec_readdir },		/* readdir */
+	{ &vop_readlink_desc, spec_readlink },		/* readlink */
+	{ &vop_abortop_desc, spec_abortop },		/* abortop */
+	{ &vop_inactive_desc, lfs_inactive },		/* inactive */
+	{ &vop_reclaim_desc, ufs_reclaim },		/* reclaim */
+	{ &vop_lock_desc, ufs_lock },			/* lock */
+	{ &vop_unlock_desc, ufs_unlock },		/* unlock */
+	{ &vop_bmap_desc, spec_bmap },			/* bmap */
+	{ &vop_strategy_desc, spec_strategy },		/* strategy */
+	{ &vop_print_desc, ufs_print },			/* print */
+	{ &vop_islocked_desc, ufs_islocked },		/* islocked */
+	{ &vop_pathconf_desc, spec_pathconf },		/* pathconf */
+	{ &vop_advlock_desc, spec_advlock },		/* advlock */
+	{ &vop_blkatoff_desc, spec_blkatoff },		/* blkatoff */
+	{ &vop_valloc_desc, spec_valloc },		/* valloc */
+	{ &vop_vfree_desc, lfs_vfree },			/* vfree */
+	{ &vop_truncate_desc, spec_truncate },		/* truncate */
+	{ &vop_update_desc, lfs_update },		/* update */
+	{ &vop_bwrite_desc, lfs_bwrite },		/* bwrite */
+	{ (struct vnodeop_desc*)NULL, (int(*)())NULL }
+};
+struct vnodeopv_desc lfs_specop_opv_desc =
+	{ &lfs_specop_p, lfs_specop_entries };
+
+#ifdef FIFO
+/*
+ * Operations vector for FIFOs on LFS, compiled only when FIFO is defined.
+ */
+int (**lfs_fifoop_p)();
+struct vnodeopv_entry_desc lfs_fifoop_entries[] = {
+	{ &vop_default_desc, vn_default_error },
+	{ &vop_lookup_desc, fifo_lookup },		/* lookup */
+	{ &vop_create_desc, fifo_create },		/* create */
+	{ &vop_mknod_desc, fifo_mknod },		/* mknod */
+	{ &vop_open_desc, fifo_open },			/* open */
+	{ &vop_close_desc, ufsfifo_close },		/* close */
+	{ &vop_access_desc, ufs_access },		/* access */
+	{ &vop_getattr_desc, lfs_getattr },		/* getattr */
+	{ &vop_setattr_desc, ufs_setattr },		/* setattr */
+	{ &vop_read_desc, ufsfifo_read },		/* read */
+	{ &vop_write_desc, ufsfifo_write },		/* write */
+	{ &vop_ioctl_desc, fifo_ioctl },		/* ioctl */
+	{ &vop_select_desc, fifo_select },		/* select */
+	{ &vop_mmap_desc, fifo_mmap },			/* mmap */
+	{ &vop_fsync_desc, fifo_fsync },		/* fsync */
+	{ &vop_seek_desc, fifo_seek },			/* seek */
+	{ &vop_remove_desc, fifo_remove },		/* remove */
+	{ &vop_link_desc, fifo_link },			/* link */
+	{ &vop_rename_desc, fifo_rename },		/* rename */
+	{ &vop_mkdir_desc, fifo_mkdir },		/* mkdir */
+	{ &vop_rmdir_desc, fifo_rmdir },		/* rmdir */
+	{ &vop_symlink_desc, fifo_symlink },		/* symlink */
+	{ &vop_readdir_desc, fifo_readdir },		/* readdir */
+	{ &vop_readlink_desc, fifo_readlink },		/* readlink */
+	{ &vop_abortop_desc, fifo_abortop },		/* abortop */
+	{ &vop_inactive_desc, lfs_inactive },		/* inactive */
+	{ &vop_reclaim_desc, ufs_reclaim },		/* reclaim */
+	{ &vop_lock_desc, ufs_lock },			/* lock */
+	{ &vop_unlock_desc, ufs_unlock },		/* unlock */
+	{ &vop_bmap_desc, fifo_bmap },			/* bmap */
+	{ &vop_strategy_desc, fifo_strategy },		/* strategy */
+	{ &vop_print_desc, ufs_print },			/* print */
+	{ &vop_islocked_desc, ufs_islocked },		/* islocked */
+	{ &vop_pathconf_desc, fifo_pathconf },		/* pathconf */
+	{ &vop_advlock_desc, fifo_advlock },		/* advlock */
+	{ &vop_blkatoff_desc, fifo_blkatoff },		/* blkatoff */
+	{ &vop_valloc_desc, fifo_valloc },		/* valloc */
+	{ &vop_vfree_desc, lfs_vfree },			/* vfree */
+	{ &vop_truncate_desc, fifo_truncate },		/* truncate */
+	{ &vop_update_desc, lfs_update },		/* update */
+	{ &vop_bwrite_desc, lfs_bwrite },		/* bwrite */
+	{ (struct vnodeop_desc*)NULL, (int(*)())NULL }
+};
+struct vnodeopv_desc lfs_fifoop_opv_desc =
+	{ &lfs_fifoop_p, lfs_fifoop_entries };
+#endif /* FIFO */
+
+/* LFS_READWRITE is defined only around the include that follows. */
+#define	LFS_READWRITE
+#include
+#undef	LFS_READWRITE
+
+/*
+ * Synch an open file.
+ *
+ * Implemented as an inode update stamped with the current time; the
+ * update is synchronous (LFS_SYNC) only when a_waitfor is MNT_WAIT.
+ */
+/* ARGSUSED */
+lfs_fsync(ap)
+	struct vop_fsync_args /* {
+		struct vnode *a_vp;
+		struct ucred *a_cred;
+		int a_waitfor;
+		struct proc *a_p;
+	} */ *ap;
+{
+	struct timeval tv;
+
+	/* Snapshot the global time so both timestamps are identical. */
+	tv = time;
+	return (VOP_UPDATE(ap->a_vp, &tv, &tv,
+	    ap->a_waitfor == MNT_WAIT ? LFS_SYNC : 0));
+}
+
+/*
+ * These macros are used to bracket UFS directory ops, so that we can
+ * identify all the pages touched during directory ops which need to
+ * be ordered and flushed atomically, so that they may be recovered.
+ */ +#define SET_DIROP(fs) { \ + if ((fs)->lfs_writer) \ + tsleep(&(fs)->lfs_dirops, PRIBIO + 1, "lfs_dirop", 0); \ + ++(fs)->lfs_dirops; \ + (fs)->lfs_doifile = 1; \ +} + +#define SET_ENDOP(fs) { \ + --(fs)->lfs_dirops; \ + if (!(fs)->lfs_dirops) \ + wakeup(&(fs)->lfs_writer); \ +} + +#define MARK_VNODE(dvp) (dvp)->v_flag |= VDIROP + +int +lfs_symlink(ap) + struct vop_symlink_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + char *a_target; + } */ *ap; +{ + int ret; + + SET_DIROP(VTOI(ap->a_dvp)->i_lfs); + MARK_VNODE(ap->a_dvp); + ret = ufs_symlink(ap); + SET_ENDOP(VTOI(ap->a_dvp)->i_lfs); + return (ret); +} + +int +lfs_mknod(ap) + struct vop_mknod_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + int ret; + + SET_DIROP(VTOI(ap->a_dvp)->i_lfs); + MARK_VNODE(ap->a_dvp); + ret = ufs_mknod(ap); + SET_ENDOP(VTOI(ap->a_dvp)->i_lfs); + return (ret); +} + +int +lfs_create(ap) + struct vop_create_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + int ret; + + SET_DIROP(VTOI(ap->a_dvp)->i_lfs); + MARK_VNODE(ap->a_dvp); + ret = ufs_create(ap); + SET_ENDOP(VTOI(ap->a_dvp)->i_lfs); + return (ret); +} + +int +lfs_mkdir(ap) + struct vop_mkdir_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + int ret; + + SET_DIROP(VTOI(ap->a_dvp)->i_lfs); + MARK_VNODE(ap->a_dvp); + ret = ufs_mkdir(ap); + SET_ENDOP(VTOI(ap->a_dvp)->i_lfs); + return (ret); +} + +int +lfs_remove(ap) + struct vop_remove_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + int ret; + + SET_DIROP(VTOI(ap->a_dvp)->i_lfs); + MARK_VNODE(ap->a_dvp); + MARK_VNODE(ap->a_vp); + ret = ufs_remove(ap); + SET_ENDOP(VTOI(ap->a_dvp)->i_lfs); + return (ret); +} + +int +lfs_rmdir(ap) + 
struct vop_rmdir_args /* { + struct vnodeop_desc *a_desc; + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + int ret; + + SET_DIROP(VTOI(ap->a_dvp)->i_lfs); + MARK_VNODE(ap->a_dvp); + MARK_VNODE(ap->a_vp); + ret = ufs_rmdir(ap); + SET_ENDOP(VTOI(ap->a_dvp)->i_lfs); + return (ret); +} + +int +lfs_link(ap) + struct vop_link_args /* { + struct vnode *a_vp; + struct vnode *a_tdvp; + struct componentname *a_cnp; + } */ *ap; +{ + int ret; + + SET_DIROP(VTOI(ap->a_vp)->i_lfs); + MARK_VNODE(ap->a_vp); + ret = ufs_link(ap); + SET_ENDOP(VTOI(ap->a_vp)->i_lfs); + return (ret); +} + +int +lfs_rename(ap) + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap; +{ + int ret; + + SET_DIROP(VTOI(ap->a_fdvp)->i_lfs); + MARK_VNODE(ap->a_fdvp); + MARK_VNODE(ap->a_tdvp); + ret = ufs_rename(ap); + SET_ENDOP(VTOI(ap->a_fdvp)->i_lfs); + return (ret); +} +/* XXX hack to avoid calling ITIMES in getattr */ +int +lfs_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct inode *ip = VTOI(vp); + register struct vattr *vap = ap->a_vap; + /* + * Copy from inode table + */ + vap->va_fsid = ip->i_dev; + vap->va_fileid = ip->i_number; + vap->va_mode = ip->i_mode & ~IFMT; + vap->va_nlink = ip->i_nlink; + vap->va_uid = ip->i_uid; + vap->va_gid = ip->i_gid; + vap->va_rdev = (dev_t)ip->i_rdev; + vap->va_size = ip->i_din.di_size; + vap->va_atime = ip->i_atime; + vap->va_mtime = ip->i_mtime; + vap->va_ctime = ip->i_ctime; + vap->va_flags = ip->i_flags; + vap->va_gen = ip->i_gen; + /* this doesn't belong here */ + if (vp->v_type == VBLK) + vap->va_blocksize = BLKDEV_IOSIZE; + else if (vp->v_type == VCHR) + vap->va_blocksize = MAXBSIZE; + else + vap->va_blocksize = 
vp->v_mount->mnt_stat.f_iosize; + vap->va_bytes = dbtob(ip->i_blocks); + vap->va_type = vp->v_type; + vap->va_filerev = ip->i_modrev; + return (0); +} +/* + * Close called + * + * XXX -- we were using ufs_close, but since it updates the + * times on the inode, we might need to bump the uinodes + * count. + */ +/* ARGSUSED */ +int +lfs_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct inode *ip = VTOI(vp); + int mod; + + if (vp->v_usecount > 1 && !(ip->i_flag & IN_LOCKED)) { + mod = ip->i_flag & IN_MODIFIED; + ITIMES(ip, &time, &time); + if (!mod && ip->i_flag & IN_MODIFIED) + ip->i_lfs->lfs_uinodes++; + } + return (0); +} + +/* + * Stub inactive routine that avoid calling ufs_inactive in some cases. + */ +int lfs_no_inactive = 0; + +int +lfs_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + if (lfs_no_inactive) + return (0); + return (ufs_inactive(ap)); +} diff --git a/sys/ufs/mfs/mfs_extern.h b/sys/ufs/mfs/mfs_extern.h new file mode 100644 index 00000000000..e357faf6fa5 --- /dev/null +++ b/sys/ufs/mfs/mfs_extern.h @@ -0,0 +1,60 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)mfs_extern.h 8.1 (Berkeley) 6/11/93 + */ + +struct buf; +struct mount; +struct nameidata; +struct proc; +struct statfs; +struct ucred; +struct vnode; + +__BEGIN_DECLS +int mfs_badop __P((void)); +int mfs_bmap __P((struct vop_bmap_args *)); +int mfs_close __P((struct vop_close_args *)); +void mfs_doio __P((struct buf *bp, caddr_t base)); +int mfs_inactive __P((struct vop_inactive_args *)); /* XXX */ +int mfs_reclaim __P((struct vop_reclaim_args *)); /* XXX */ +int mfs_init __P((void)); +int mfs_ioctl __P((struct vop_ioctl_args *)); +int mfs_mount __P((struct mount *mp, + char *path, caddr_t data, struct nameidata *ndp, struct proc *p)); +int mfs_open __P((struct vop_open_args *)); +int mfs_print __P((struct vop_print_args *)); /* XXX */ +int mfs_start __P((struct mount *mp, int flags, struct proc *p)); +int mfs_statfs __P((struct mount *mp, struct statfs *sbp, struct proc *p)); +int mfs_strategy __P((struct vop_strategy_args *)); /* XXX */ +__END_DECLS diff --git a/sys/ufs/mfs/mfs_vfsops.c b/sys/ufs/mfs/mfs_vfsops.c new file mode 100644 index 00000000000..3fcbdf37928 --- /dev/null +++ b/sys/ufs/mfs/mfs_vfsops.c @@ -0,0 +1,307 @@ +/* + * Copyright (c) 1989, 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)mfs_vfsops.c 8.4 (Berkeley) 4/16/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +caddr_t mfs_rootbase; /* address of mini-root in kernel virtual memory */ +u_long mfs_rootsize; /* size of mini-root in bytes */ + +static int mfs_minor; /* used for building internal dev_t */ + +extern int (**mfs_vnodeop_p)(); + +/* + * mfs vfs operations. + */ +struct vfsops mfs_vfsops = { + mfs_mount, + mfs_start, + ffs_unmount, + ufs_root, + ufs_quotactl, + mfs_statfs, + ffs_sync, + ffs_vget, + ffs_fhtovp, + ffs_vptofh, + mfs_init, +}; + +/* + * Called by main() when mfs is going to be mounted as root. 
+ * + * Name is updated by mount(8) after booting. + */ +#define ROOTNAME "mfs_root" + +mfs_mountroot() +{ + extern struct vnode *rootvp; + register struct fs *fs; + register struct mount *mp; + struct proc *p = curproc; /* XXX */ + struct ufsmount *ump; + struct mfsnode *mfsp; + u_int size; + int error; + + /* + * Get vnodes for swapdev and rootdev. + */ + if (bdevvp(swapdev, &swapdev_vp) || bdevvp(rootdev, &rootvp)) + panic("mfs_mountroot: can't setup bdevvp's"); + + mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + mp->mnt_op = &mfs_vfsops; + mp->mnt_flag = MNT_RDONLY; + mfsp = malloc(sizeof *mfsp, M_MFSNODE, M_WAITOK); + rootvp->v_data = mfsp; + rootvp->v_op = mfs_vnodeop_p; + rootvp->v_tag = VT_MFS; + mfsp->mfs_baseoff = mfs_rootbase; + mfsp->mfs_size = mfs_rootsize; + mfsp->mfs_vnode = rootvp; + mfsp->mfs_pid = p->p_pid; + mfsp->mfs_buflist = (struct buf *)0; + if (error = ffs_mountfs(rootvp, mp, p)) { + free(mp, M_MOUNT); + free(mfsp, M_MFSNODE); + return (error); + } + if (error = vfs_lock(mp)) { + (void)ffs_unmount(mp, 0, p); + free(mp, M_MOUNT); + free(mfsp, M_MFSNODE); + return (error); + } + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mp->mnt_flag |= MNT_ROOTFS; + mp->mnt_vnodecovered = NULLVP; + ump = VFSTOUFS(mp); + fs = ump->um_fs; + bzero(fs->fs_fsmnt, sizeof(fs->fs_fsmnt)); + fs->fs_fsmnt[0] = '/'; + bcopy((caddr_t)fs->fs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, + MNAMELEN); + (void) copystr(ROOTNAME, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + (void)ffs_statfs(mp, &mp->mnt_stat, p); + vfs_unlock(mp); + inittodr((time_t)0); + return (0); +} + +/* + * This is called early in boot to set the base address and size + * of the mini-root. 
+ */ +mfs_initminiroot(base) + caddr_t base; +{ + struct fs *fs = (struct fs *)(base + SBOFF); + extern int (*mountroot)(); + + /* check for valid super block */ + if (fs->fs_magic != FS_MAGIC || fs->fs_bsize > MAXBSIZE || + fs->fs_bsize < sizeof(struct fs)) + return (0); + mountroot = mfs_mountroot; + mfs_rootbase = base; + mfs_rootsize = fs->fs_fsize * fs->fs_size; + rootdev = makedev(255, mfs_minor++); + return (mfs_rootsize); +} + +/* + * VFS Operations. + * + * mount system call + */ +/* ARGSUSED */ +int +mfs_mount(mp, path, data, ndp, p) + register struct mount *mp; + char *path; + caddr_t data; + struct nameidata *ndp; + struct proc *p; +{ + struct vnode *devvp; + struct mfs_args args; + struct ufsmount *ump; + register struct fs *fs; + register struct mfsnode *mfsp; + u_int size; + int flags, error; + + if (error = copyin(data, (caddr_t)&args, sizeof (struct mfs_args))) + return (error); + + /* + * If updating, check whether changing from read-only to + * read/write; if there is no device name, that's all we do. 
+ */ + if (mp->mnt_flag & MNT_UPDATE) { + ump = VFSTOUFS(mp); + fs = ump->um_fs; + if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { + flags = WRITECLOSE; + if (mp->mnt_flag & MNT_FORCE) + flags |= FORCECLOSE; + if (vfs_busy(mp)) + return (EBUSY); + error = ffs_flushfiles(mp, flags, p); + vfs_unbusy(mp); + if (error) + return (error); + } + if (fs->fs_ronly && (mp->mnt_flag & MNT_WANTRDWR)) + fs->fs_ronly = 0; +#ifdef EXPORTMFS + if (args.fspec == 0) + return (vfs_export(mp, &ump->um_export, &args.export)); +#endif + return (0); + } + error = getnewvnode(VT_MFS, (struct mount *)0, mfs_vnodeop_p, &devvp); + if (error) + return (error); + devvp->v_type = VBLK; + if (checkalias(devvp, makedev(255, mfs_minor++), (struct mount *)0)) + panic("mfs_mount: dup dev"); + mfsp = (struct mfsnode *)malloc(sizeof *mfsp, M_MFSNODE, M_WAITOK); + devvp->v_data = mfsp; + mfsp->mfs_baseoff = args.base; + mfsp->mfs_size = args.size; + mfsp->mfs_vnode = devvp; + mfsp->mfs_pid = p->p_pid; + mfsp->mfs_buflist = (struct buf *)0; + if (error = ffs_mountfs(devvp, mp, p)) { + mfsp->mfs_buflist = (struct buf *)-1; + vrele(devvp); + return (error); + } + ump = VFSTOUFS(mp); + fs = ump->um_fs; + (void) copyinstr(path, fs->fs_fsmnt, sizeof(fs->fs_fsmnt) - 1, &size); + bzero(fs->fs_fsmnt + size, sizeof(fs->fs_fsmnt) - size); + bcopy((caddr_t)fs->fs_fsmnt, (caddr_t)mp->mnt_stat.f_mntonname, + MNAMELEN); + (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, + &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + (void) mfs_statfs(mp, &mp->mnt_stat, p); + return (0); +} + +int mfs_pri = PWAIT | PCATCH; /* XXX prob. temp */ + +/* + * Used to grab the process and keep it in the kernel to service + * memory filesystem I/O requests. + * + * Loop servicing I/O requests. + * Copy the requested data into or out of the memory filesystem + * address space. 
+ */ +/* ARGSUSED */ +int +mfs_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + register struct vnode *vp = VFSTOUFS(mp)->um_devvp; + register struct mfsnode *mfsp = VTOMFS(vp); + register struct buf *bp; + register caddr_t base; + int error = 0; + + base = mfsp->mfs_baseoff; + while (mfsp->mfs_buflist != (struct buf *)(-1)) { + while (bp = mfsp->mfs_buflist) { + mfsp->mfs_buflist = bp->b_actf; + mfs_doio(bp, base); + wakeup((caddr_t)bp); + } + /* + * If a non-ignored signal is received, try to unmount. + * If that fails, clear the signal (it has been "processed"), + * otherwise we will loop here, as tsleep will always return + * EINTR/ERESTART. + */ + if (error = tsleep((caddr_t)vp, mfs_pri, "mfsidl", 0)) + if (dounmount(mp, 0, p) != 0) + CLRSIG(p, CURSIG(p)); + } + return (error); +} + +/* + * Get file system statistics. + */ +mfs_statfs(mp, sbp, p) + struct mount *mp; + struct statfs *sbp; + struct proc *p; +{ + int error; + + error = ffs_statfs(mp, sbp, p); + sbp->f_type = MOUNT_MFS; + return (error); +} diff --git a/sys/ufs/mfs/mfs_vnops.c b/sys/ufs/mfs/mfs_vnops.c new file mode 100644 index 00000000000..71adf069b1d --- /dev/null +++ b/sys/ufs/mfs/mfs_vnops.c @@ -0,0 +1,432 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)mfs_vnops.c 8.3 (Berkeley) 9/21/93 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include + +#if !defined(hp300) && !defined(i386) && !defined(mips) && !defined(sparc) && !defined(luna68k) +static int mfsmap_want; /* 1 => need kernel I/O resources */ +struct map mfsmap[MFS_MAPSIZE]; +extern char mfsiobuf[]; +#endif + +/* + * mfs vnode operations. 
+ */ +int (**mfs_vnodeop_p)(); +struct vnodeopv_entry_desc mfs_vnodeop_entries[] = { + { &vop_default_desc, vn_default_error }, + { &vop_lookup_desc, mfs_lookup }, /* lookup */ + { &vop_create_desc, mfs_create }, /* create */ + { &vop_mknod_desc, mfs_mknod }, /* mknod */ + { &vop_open_desc, mfs_open }, /* open */ + { &vop_close_desc, mfs_close }, /* close */ + { &vop_access_desc, mfs_access }, /* access */ + { &vop_getattr_desc, mfs_getattr }, /* getattr */ + { &vop_setattr_desc, mfs_setattr }, /* setattr */ + { &vop_read_desc, mfs_read }, /* read */ + { &vop_write_desc, mfs_write }, /* write */ + { &vop_ioctl_desc, mfs_ioctl }, /* ioctl */ + { &vop_select_desc, mfs_select }, /* select */ + { &vop_mmap_desc, mfs_mmap }, /* mmap */ + { &vop_fsync_desc, spec_fsync }, /* fsync */ + { &vop_seek_desc, mfs_seek }, /* seek */ + { &vop_remove_desc, mfs_remove }, /* remove */ + { &vop_link_desc, mfs_link }, /* link */ + { &vop_rename_desc, mfs_rename }, /* rename */ + { &vop_mkdir_desc, mfs_mkdir }, /* mkdir */ + { &vop_rmdir_desc, mfs_rmdir }, /* rmdir */ + { &vop_symlink_desc, mfs_symlink }, /* symlink */ + { &vop_readdir_desc, mfs_readdir }, /* readdir */ + { &vop_readlink_desc, mfs_readlink }, /* readlink */ + { &vop_abortop_desc, mfs_abortop }, /* abortop */ + { &vop_inactive_desc, mfs_inactive }, /* inactive */ + { &vop_reclaim_desc, mfs_reclaim }, /* reclaim */ + { &vop_lock_desc, mfs_lock }, /* lock */ + { &vop_unlock_desc, mfs_unlock }, /* unlock */ + { &vop_bmap_desc, mfs_bmap }, /* bmap */ + { &vop_strategy_desc, mfs_strategy }, /* strategy */ + { &vop_print_desc, mfs_print }, /* print */ + { &vop_islocked_desc, mfs_islocked }, /* islocked */ + { &vop_pathconf_desc, mfs_pathconf }, /* pathconf */ + { &vop_advlock_desc, mfs_advlock }, /* advlock */ + { &vop_blkatoff_desc, mfs_blkatoff }, /* blkatoff */ + { &vop_valloc_desc, mfs_valloc }, /* valloc */ + { &vop_vfree_desc, mfs_vfree }, /* vfree */ + { &vop_truncate_desc, mfs_truncate }, /* truncate */ + { 
&vop_update_desc, mfs_update }, /* update */ + { &vop_bwrite_desc, mfs_bwrite }, /* bwrite */ + { (struct vnodeop_desc*)NULL, (int(*)())NULL } +}; +struct vnodeopv_desc mfs_vnodeop_opv_desc = + { &mfs_vnodeop_p, mfs_vnodeop_entries }; + +/* + * Vnode Operations. + * + * Open called to allow memory filesystem to initialize and + * validate before actual IO. Record our process identifier + * so we can tell when we are doing I/O to ourself. + */ +/* ARGSUSED */ +int +mfs_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + if (ap->a_vp->v_type != VBLK) { + panic("mfs_ioctl not VBLK"); + /* NOTREACHED */ + } + return (0); +} + +/* + * Ioctl operation. + */ +/* ARGSUSED */ +int +mfs_ioctl(ap) + struct vop_ioctl_args /* { + struct vnode *a_vp; + int a_command; + caddr_t a_data; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + return (ENOTTY); +} + +/* + * Pass I/O requests to the memory filesystem process. + */ +int +mfs_strategy(ap) + struct vop_strategy_args /* { + struct buf *a_bp; + } */ *ap; +{ + register struct buf *bp = ap->a_bp; + register struct mfsnode *mfsp; + struct vnode *vp; + struct proc *p = curproc; /* XXX */ + + if (!vfinddev(bp->b_dev, VBLK, &vp) || vp->v_usecount == 0) + panic("mfs_strategy: bad dev"); + mfsp = VTOMFS(vp); + /* check for mini-root access */ + if (mfsp->mfs_pid == 0) { + caddr_t base; + + base = mfsp->mfs_baseoff + (bp->b_blkno << DEV_BSHIFT); + if (bp->b_flags & B_READ) + bcopy(base, bp->b_data, bp->b_bcount); + else + bcopy(bp->b_data, base, bp->b_bcount); + biodone(bp); + } else if (mfsp->mfs_pid == p->p_pid) { + mfs_doio(bp, mfsp->mfs_baseoff); + } else { + bp->b_actf = mfsp->mfs_buflist; + mfsp->mfs_buflist = bp; + wakeup((caddr_t)vp); + } + return (0); +} + +#if defined(vax) || defined(tahoe) +/* + * Memory file system I/O. 
*
 * Essentially play ubasetup() and disk interrupt service routine by
 * doing the copies to or from the memfs process. If doing physio
 * (i.e. pagein), we must map the I/O through the kernel virtual
 * address space.
 *
 * bp is the buffer to service and base is the start of the memory
 * file system's region.  A copy failure is recorded in b_error and
 * B_ERROR; the buffer is always completed with biodone().
 */
void
mfs_doio(bp, base)
	register struct buf *bp;
	caddr_t base;
{
	register struct pte *pte, *ppte;
	register caddr_t vaddr;
	int off, npf, npf2, reg;
	caddr_t kernaddr, offset;

	/*
	 * For phys I/O, map the b_data into kernel virtual space using
	 * the Mfsiomap pte's.
	 */
	if ((bp->b_flags & B_PHYS) == 0) {
		kernaddr = bp->b_data;
	} else {
		if (bp->b_flags & (B_PAGET | B_UAREA | B_DIRTY))
			panic("swap on memfs?");
		off = (int)bp->b_data & PGOFSET;
		npf = btoc(bp->b_bcount + off);
		/*
		 * Get some mapping page table entries
		 */
		while ((reg = rmalloc(mfsmap, (long)npf)) == 0) {
			/* None free: sleep until the release path wakes us. */
			mfsmap_want++;
			sleep((caddr_t)&mfsmap_want, PZERO-1);
		}
		/* Map indices run one higher than Mfsiomap/mfsiobuf slots. */
		reg--;
		pte = vtopte(bp->b_proc, btop(bp->b_data));
		/*
		 * Do vmaccess() but with the Mfsiomap page table.
		 */
		ppte = &Mfsiomap[reg];
		vaddr = &mfsiobuf[reg * NBPG];
		kernaddr = vaddr + off;
		for (npf2 = npf; npf2; npf2--) {
			mapin(ppte, (u_int)vaddr, pte->pg_pfnum,
				(int)(PG_V|PG_KW));
#if defined(tahoe)
			if ((bp->b_flags & B_READ) == 0)
				mtpr(P1DC, vaddr);
#endif
			ppte++;
			pte++;
			vaddr += NBPG;
		}
	}
	/* Address of the requested block within the memfs region. */
	offset = base + (bp->b_blkno << DEV_BSHIFT);
	if (bp->b_flags & B_READ)
		bp->b_error = copyin(offset, kernaddr, bp->b_bcount);
	else
		bp->b_error = copyout(kernaddr, offset, bp->b_bcount);
	if (bp->b_error)
		bp->b_flags |= B_ERROR;
	/*
	 * Release pte's used by physical I/O.
	 */
	if (bp->b_flags & B_PHYS) {
		/* ++reg undoes the earlier reg-- to recover the map index. */
		rmfree(mfsmap, (long)npf, (long)++reg);
		if (mfsmap_want) {
			mfsmap_want = 0;
			wakeup((caddr_t)&mfsmap_want);
		}
	}
	biodone(bp);
}
#endif	/* vax || tahoe */

#if defined(hp300) || defined(i386) || defined(mips) || defined(sparc) || defined(luna68k)
/*
 * Memory file system I/O.
+ * + * Trivial on the HP since buffer has already been mapping into KVA space. + */ +void +mfs_doio(bp, base) + register struct buf *bp; + caddr_t base; +{ + + base += (bp->b_blkno << DEV_BSHIFT); + if (bp->b_flags & B_READ) + bp->b_error = copyin(base, bp->b_data, bp->b_bcount); + else + bp->b_error = copyout(bp->b_data, base, bp->b_bcount); + if (bp->b_error) + bp->b_flags |= B_ERROR; + biodone(bp); +} +#endif + +/* + * This is a noop, simply returning what one has been given. + */ +int +mfs_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + } */ *ap; +{ + + if (ap->a_vpp != NULL) + *ap->a_vpp = ap->a_vp; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn; + return (0); +} + +/* + * Memory filesystem close routine + */ +/* ARGSUSED */ +int +mfs_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct mfsnode *mfsp = VTOMFS(vp); + register struct buf *bp; + int error; + + /* + * Finish any pending I/O requests. + */ + while (bp = mfsp->mfs_buflist) { + mfsp->mfs_buflist = bp->b_actf; + mfs_doio(bp, mfsp->mfs_baseoff); + wakeup((caddr_t)bp); + } + /* + * On last close of a memory filesystem + * we must invalidate any in core blocks, so that + * we can, free up its vnode. + */ + if (error = vinvalbuf(vp, 1, ap->a_cred, ap->a_p, 0, 0)) + return (error); + /* + * There should be no way to have any more uses of this + * vnode, so if we find any other uses, it is a panic. + */ + if (vp->v_usecount > 1) + printf("mfs_close: ref count %d > 1\n", vp->v_usecount); + if (vp->v_usecount > 1 || mfsp->mfs_buflist) + panic("mfs_close"); + /* + * Send a request to the filesystem server to exit. 
+ */ + mfsp->mfs_buflist = (struct buf *)(-1); + wakeup((caddr_t)vp); + return (0); +} + +/* + * Memory filesystem inactive routine + */ +/* ARGSUSED */ +int +mfs_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct mfsnode *mfsp = VTOMFS(ap->a_vp); + + if (mfsp->mfs_buflist && mfsp->mfs_buflist != (struct buf *)(-1)) + panic("mfs_inactive: not inactive (mfs_buflist %x)", + mfsp->mfs_buflist); + return (0); +} + +/* + * Reclaim a memory filesystem devvp so that it can be reused. + */ +int +mfs_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + + FREE(ap->a_vp->v_data, M_MFSNODE); + ap->a_vp->v_data = NULL; + return (0); +} + +/* + * Print out the contents of an mfsnode. + */ +int +mfs_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct mfsnode *mfsp = VTOMFS(ap->a_vp); + + printf("tag VT_MFS, pid %d, base %d, size %d\n", mfsp->mfs_pid, + mfsp->mfs_baseoff, mfsp->mfs_size); + return (0); +} + +/* + * Block device bad operation + */ +int +mfs_badop() +{ + + panic("mfs_badop called\n"); + /* NOTREACHED */ +} + +/* + * Memory based filesystem initialization. + */ +mfs_init() +{ + +#if !defined(hp300) && !defined(i386) && !defined(mips) && !defined(sparc) && !defined(luna68k) + rminit(mfsmap, (long)MFS_MAPREG, (long)1, "mfs mapreg", MFS_MAPSIZE); +#endif +} diff --git a/sys/ufs/mfs/mfsiom.h b/sys/ufs/mfs/mfsiom.h new file mode 100644 index 00000000000..98aca855f6a --- /dev/null +++ b/sys/ufs/mfs/mfsiom.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)mfsiom.h 8.1 (Berkeley) 6/11/93 + */ + +#define MFS_MAPREG (MAXPHYS/NBPG + 2) /* Kernel mapping pte's */ +#define MFS_MAPSIZE 10 /* Size of alloc map for pte's */ diff --git a/sys/ufs/mfs/mfsnode.h b/sys/ufs/mfs/mfsnode.h new file mode 100644 index 00000000000..4480ab02407 --- /dev/null +++ b/sys/ufs/mfs/mfsnode.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)mfsnode.h 8.2 (Berkeley) 8/11/93 + */ + +/* + * This structure defines the control data for the memory based file system. 
+ */
+
+/*
+ * mfs_buflist doubles as a control word: besides heading the queue of
+ * pending requests, mfs_close() stores (struct buf *)(-1) in it as the
+ * request for the filesystem server to exit.
+ */
+struct mfsnode {
+	struct	vnode *mfs_vnode;	/* vnode associated with this mfsnode */
+	caddr_t	mfs_baseoff;		/* base of file system in memory */
+	long	mfs_size;		/* size of memory file system */
+	pid_t	mfs_pid;		/* supporting process pid */
+	struct	buf *mfs_buflist;	/* list of I/O requests */
+	long	mfs_spare[4];
+};
+
+/*
+ * Convert between mfsnode pointers and vnode pointers
+ */
+#define VTOMFS(vp)	((struct mfsnode *)(vp)->v_data)
+#define MFSTOV(mfsp)	((mfsp)->mfs_vnode)
+
+/*
+ * Prototypes for MFS operations on vnodes.
+ *
+ * Each name below is a cast of an existing function to the matching
+ * vnode-operation signature rather than a separate implementation:
+ * most operations map to mfs_badop(), which panics when called; the
+ * three locking operations map to nullop; mfs_bwrite maps to vn_bwrite.
+ */
+#define	mfs_lookup ((int (*) __P((struct vop_lookup_args *)))mfs_badop)
+#define	mfs_create ((int (*) __P((struct vop_create_args *)))mfs_badop)
+#define	mfs_mknod ((int (*) __P((struct vop_mknod_args *)))mfs_badop)
+#define	mfs_access ((int (*) __P((struct vop_access_args *)))mfs_badop)
+#define	mfs_getattr ((int (*) __P((struct vop_getattr_args *)))mfs_badop)
+#define	mfs_setattr ((int (*) __P((struct vop_setattr_args *)))mfs_badop)
+#define	mfs_read ((int (*) __P((struct vop_read_args *)))mfs_badop)
+#define	mfs_write ((int (*) __P((struct vop_write_args *)))mfs_badop)
+#define	mfs_select ((int (*) __P((struct vop_select_args *)))mfs_badop)
+#define	mfs_mmap ((int (*) __P((struct vop_mmap_args *)))mfs_badop)
+#define	mfs_seek ((int (*) __P((struct vop_seek_args *)))mfs_badop)
+#define	mfs_remove ((int (*) __P((struct vop_remove_args *)))mfs_badop)
+#define	mfs_link ((int (*) __P((struct vop_link_args *)))mfs_badop)
+#define	mfs_rename ((int (*) __P((struct vop_rename_args *)))mfs_badop)
+#define	mfs_mkdir ((int (*) __P((struct vop_mkdir_args *)))mfs_badop)
+#define	mfs_rmdir ((int (*) __P((struct vop_rmdir_args *)))mfs_badop)
+#define	mfs_symlink ((int (*) __P((struct vop_symlink_args *)))mfs_badop)
+#define	mfs_readdir ((int (*) __P((struct vop_readdir_args *)))mfs_badop)
+#define	mfs_readlink ((int (*) __P((struct vop_readlink_args *)))mfs_badop)
+#define	mfs_abortop ((int (*) __P((struct vop_abortop_args *)))mfs_badop)
+#define	mfs_lock ((int (*) __P((struct vop_lock_args *)))nullop)
+#define	mfs_unlock ((int (*) __P((struct vop_unlock_args *)))nullop)
+#define	mfs_islocked ((int (*) __P((struct vop_islocked_args *)))nullop)
+#define	mfs_pathconf ((int (*) __P((struct vop_pathconf_args *)))mfs_badop)
+#define	mfs_advlock ((int (*) __P((struct vop_advlock_args *)))mfs_badop)
+#define	mfs_blkatoff ((int (*) __P((struct vop_blkatoff_args *)))mfs_badop)
+#define	mfs_valloc ((int (*) __P((struct vop_valloc_args *)))mfs_badop)
+#define	mfs_vfree ((int (*) __P((struct vop_vfree_args *)))mfs_badop)
+#define	mfs_truncate ((int (*) __P((struct vop_truncate_args *)))mfs_badop)
+#define	mfs_update ((int (*) __P((struct vop_update_args *)))mfs_badop)
+#define	mfs_bwrite ((int (*) __P((struct vop_bwrite_args *)))vn_bwrite)
diff --git a/sys/ufs/ufs/dinode.h b/sys/ufs/ufs/dinode.h
new file mode 100644
index 00000000000..5b9915d9cfd
--- /dev/null
+++ b/sys/ufs/ufs/dinode.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 1982, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3.
All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)dinode.h 8.3 (Berkeley) 1/21/94
+ */
+
+/*
+ * The root inode is the root of the file system. Inode 0 can't be used for
+ * normal purposes and historically bad blocks were linked to inode 1, thus
+ * the root inode is 2. (Inode 1 is no longer used for this purpose, however
+ * numerous dump tapes make this assumption, so we are stuck with it).
+ */
+#define	ROOTINO	((ino_t)2)
+
+/*
+ * A dinode contains all the meta-data associated with a UFS file.
+ * This structure defines the on-disk format of a dinode.
+ *
+ * The number that opens each field comment is the field's byte offset.
+ * The offsets assume 4-byte longs and 8-byte timespecs, which puts the
+ * end of di_spare at byte 128.
+ */
+
+#define	NDADDR	12			/* Direct addresses in inode. */
+#define	NIADDR	3			/* Indirect addresses in inode. */
+
+struct dinode {
+	u_short		di_mode;	/*   0: IFMT and permissions. */
+	short		di_nlink;	/*   2: File link count. */
+	union {
+		u_short	oldids[2];	/*   4: Ffs: old user and group ids. */
+		ino_t	inumber;	/*   4: Lfs: inode number. */
+	} di_u;
+	u_quad_t	di_size;	/*   8: File byte count. */
+	struct timespec	di_atime;	/*  16: Last access time. */
+	struct timespec	di_mtime;	/*  24: Last modified time. */
+	struct timespec	di_ctime;	/*  32: Last inode change time. */
+	daddr_t		di_db[NDADDR];	/*  40: Direct disk blocks. */
+	daddr_t		di_ib[NIADDR];	/*  88: Indirect disk blocks. */
+	u_long		di_flags;	/* 100: Status flags (chflags). */
+	long		di_blocks;	/* 104: Blocks actually held. */
+	long		di_gen;		/* 108: Generation number. */
+	u_long		di_uid;		/* 112: File owner. */
+	u_long		di_gid;		/* 116: File group. */
+	long		di_spare[2];	/* 120: Reserved; currently unused */
+};
+
+/*
+ * The di_db fields may be overlaid with other information for
+ * file types that do not have associated disk storage. Block
+ * and character devices overlay the first data block with their
+ * dev_t value. Short symbolic links place their path in the
+ * di_db area.
+ *
+ * MAXSYMLINKLEN is the byte size of the di_db and di_ib arrays taken
+ * together.
+ */
+#define	di_inumber	di_u.inumber
+#define	di_ogid		di_u.oldids[1]
+#define	di_ouid		di_u.oldids[0]
+#define	di_rdev		di_db[0]
+#define	di_shortlink	di_db
+#define	MAXSYMLINKLEN	((NDADDR + NIADDR) * sizeof(daddr_t))
+
+/* File modes. */
+#define	IEXEC		0000100		/* Executable. */
+#define	IWRITE		0000200		/* Writeable. */
+#define	IREAD		0000400		/* Readable. */
+#define	ISVTX		0001000		/* Sticky bit. */
+#define	ISGID		0002000		/* Set-gid. */
+#define	ISUID		0004000		/* Set-uid. */
+
+/* File types. */
+#define	IFMT		0170000		/* Mask of file type. */
+#define	IFIFO		0010000		/* Named pipe (fifo). */
+#define	IFCHR		0020000		/* Character device. */
+#define	IFDIR		0040000		/* Directory file. */
+#define	IFBLK		0060000		/* Block device. */
+#define	IFREG		0100000		/* Regular file. */
+#define	IFLNK		0120000		/* Symbolic link. */
+#define	IFSOCK		0140000		/* UNIX domain socket.
*/ diff --git a/sys/ufs/ufs/dir.h b/sys/ufs/ufs/dir.h new file mode 100644 index 00000000000..c51bd1cf6e1 --- /dev/null +++ b/sys/ufs/ufs/dir.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)dir.h 8.2 (Berkeley) 1/21/94 + */ + +#ifndef _DIR_H_ +#define _DIR_H_ + +/* + * A directory consists of some number of blocks of DIRBLKSIZ + * bytes, where DIRBLKSIZ is chosen such that it can be transferred + * to disk in a single atomic operation (e.g. 512 bytes on most machines). + * + * Each DIRBLKSIZ byte block contains some number of directory entry + * structures, which are of variable length. Each directory entry has + * a struct direct at the front of it, containing its inode number, + * the length of the entry, and the length of the name contained in + * the entry. These are followed by the name padded to a 4 byte boundary + * with null bytes. All names are guaranteed null terminated. + * The maximum length of a name in a directory is MAXNAMLEN. + * + * The macro DIRSIZ(fmt, dp) gives the amount of space required to represent + * a directory entry. Free space in a directory is represented by + * entries which have dp->d_reclen > DIRSIZ(fmt, dp). All DIRBLKSIZ bytes + * in a directory block are claimed by the directory entries. This + * usually results in the last entry in a directory having a large + * dp->d_reclen. When entries are deleted from a directory, the + * space is returned to the previous entry in the same directory + * block by increasing its dp->d_reclen. If the first entry of + * a directory block is free, then its dp->d_ino is set to 0. 
+ * Entries other than the first in a directory do not normally have
+ * dp->d_ino set to 0.
+ */
+#define DIRBLKSIZ	DEV_BSIZE
+#define	MAXNAMLEN	255
+
+struct	direct {
+	u_long	d_ino;			/* inode number of entry */
+	u_short	d_reclen;		/* length of this record */
+	u_char	d_type; 		/* file type, see below */
+	u_char	d_namlen;		/* length of string in d_name */
+	char	d_name[MAXNAMLEN + 1];	/* name with length <= MAXNAMLEN */
+};
+
+/*
+ * File types
+ *
+ * Each value is the corresponding file-type bit pattern (mask 0170000)
+ * shifted right by 12 bits, which is exactly what IFTODT() computes.
+ */
+#define	DT_UNKNOWN	 0
+#define	DT_FIFO		 1
+#define	DT_CHR		 2
+#define	DT_DIR		 4
+#define	DT_BLK		 6
+#define	DT_REG		 8
+#define	DT_LNK		10
+#define	DT_SOCK		12
+
+/*
+ * Convert between stat structure types and directory types.
+ */
+#define	IFTODT(mode)	(((mode) & 0170000) >> 12)
+#define	DTTOIF(dirtype)	((dirtype) << 12)
+
+/*
+ * The DIRSIZ macro gives the minimum record length which will hold
+ * the directory entry. This requires the amount of space in struct direct
+ * without the d_name field, plus enough space for the name with a terminating
+ * null byte (dp->d_namlen+1), rounded up to a 4 byte boundary.
+ *
+ * The old directory format has a single u_short name length where the
+ * new format has d_type followed by d_namlen (compare struct
+ * odirtemplate with struct dirtemplate). On a little-endian machine
+ * the low byte of that u_short lies where d_type is now, so for old
+ * format entries the name length is read from d_type.
+ */
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+#define DIRSIZ(oldfmt, dp) \
+    ((oldfmt) ? \
+    ((sizeof (struct direct) - (MAXNAMLEN+1)) + (((dp)->d_type+1 + 3) &~ 3)) : \
+    ((sizeof (struct direct) - (MAXNAMLEN+1)) + (((dp)->d_namlen+1 + 3) &~ 3)))
+#else
+#define DIRSIZ(oldfmt, dp) \
+    ((sizeof (struct direct) - (MAXNAMLEN+1)) + (((dp)->d_namlen+1 + 3) &~ 3))
+#endif
+#define OLDDIRFMT	1
+#define NEWDIRFMT	0
+
+/*
+ * Template for manipulating directories.
+ * Should use struct direct's, but the name field there
+ * is MAXNAMLEN + 1 bytes, and this just won't do.
+ */
+struct dirtemplate {
+	u_long	dot_ino;
+	short	dot_reclen;
+	u_char	dot_type;
+	u_char	dot_namlen;
+	char	dot_name[4];		/* must be multiple of 4 */
+	u_long	dotdot_ino;
+	short	dotdot_reclen;
+	u_char	dotdot_type;
+	u_char	dotdot_namlen;
+	char	dotdot_name[4];		/* ditto */
+};
+
+/*
+ * This is the old format of directories, sans type element.
+ */
+struct odirtemplate {
+	u_long	dot_ino;
+	short	dot_reclen;
+	u_short	dot_namlen;
+	char	dot_name[4];		/* must be multiple of 4 */
+	u_long	dotdot_ino;
+	short	dotdot_reclen;
+	u_short	dotdot_namlen;
+	char	dotdot_name[4];		/* ditto */
+};
+#endif /* !_DIR_H_ */
diff --git a/sys/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h
new file mode 100644
index 00000000000..df155967a7d
--- /dev/null
+++ b/sys/ufs/ufs/inode.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 1982, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)inode.h 8.4 (Berkeley) 1/21/94 + */ + +#include + +/* + * Theoretically, directories can be more than 2Gb in length, however, in + * practice this seems unlikely. So, we define the type doff_t as a long + * to keep down the cost of doing lookup on a 32-bit machine. If you are + * porting to a 64-bit architecture, you should make doff_t the same as off_t. + */ +#define doff_t long + +/* + * The inode is used to describe each active (or recently active) + * file in the UFS filesystem. It is composed of two types of + * information. The first part is the information that is needed + * only while the file is active (such as the identity of the file + * and linkage to speed its lookup). The second part is the + * permannent meta-data associated with the file which is read + * in from the permanent dinode from long term storage when the + * file becomes active, and is put back when the file is no longer + * being used. + */ +struct inode { + struct inode *i_next; /* Hash chain forward. */ + struct inode **i_prev; /* Hash chain back. */ + struct vnode *i_vnode; /* Vnode associated with this inode. */ + struct vnode *i_devvp; /* Vnode for block I/O. */ + u_long i_flag; /* I* flags. 
*/ + dev_t i_dev; /* Device associated with the inode. */ + ino_t i_number; /* The identity of the inode. */ + union { /* Associated filesystem. */ + struct fs *fs; /* FFS */ + struct lfs *lfs; /* LFS */ + } inode_u; +#define i_fs inode_u.fs +#define i_lfs inode_u.lfs + struct dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */ + u_quad_t i_modrev; /* Revision level for lease. */ + struct lockf *i_lockf; /* Head of byte-level lock list. */ + pid_t i_lockholder; /* DEBUG: holder of inode lock. */ + pid_t i_lockwaiter; /* DEBUG: latest blocked for inode lock. */ + /* + * Side effects; used during directory lookup. + */ + long i_count; /* Size of free slot in directory. */ + doff_t i_endoff; /* End of useful stuff in directory. */ + doff_t i_diroff; /* Offset in dir, where we found last entry. */ + doff_t i_offset; /* Offset of free space in directory. */ + ino_t i_ino; /* Inode number of found directory. */ + u_long i_reclen; /* Size of found directory entry. */ + long i_spare[11]; /* Spares to round up to 128 bytes. */ + /* + * The on-disk dinode itself. + */ + struct dinode i_din; /* 128 bytes of the on-disk dinode. */ +}; + +#define i_atime i_din.di_atime +#define i_blocks i_din.di_blocks +#define i_ctime i_din.di_ctime +#define i_db i_din.di_db +#define i_flags i_din.di_flags +#define i_gen i_din.di_gen +#define i_gid i_din.di_gid +#define i_ib i_din.di_ib +#define i_mode i_din.di_mode +#define i_mtime i_din.di_mtime +#define i_nlink i_din.di_nlink +#define i_rdev i_din.di_rdev +#define i_shortlink i_din.di_shortlink +#define i_size i_din.di_size +#define i_uid i_din.di_uid + +/* These flags are kept in i_flag. */ +#define IN_ACCESS 0x0001 /* Access time update request. */ +#define IN_CHANGE 0x0002 /* Inode change time update request. */ +#define IN_EXLOCK 0x0004 /* File has exclusive lock. */ +#define IN_LOCKED 0x0008 /* Inode lock. */ +#define IN_LWAIT 0x0010 /* Process waiting on file lock. */ +#define IN_MODIFIED 0x0020 /* Inode has been modified. 
*/ +#define IN_RENAME 0x0040 /* Inode is being renamed. */ +#define IN_SHLOCK 0x0080 /* File has shared lock. */ +#define IN_UPDATE 0x0100 /* Modification time update request. */ +#define IN_WANTED 0x0200 /* Inode is wanted by a process. */ + +#ifdef KERNEL +/* + * Structure used to pass around logical block paths generated by + * ufs_getlbns and used by truncate and bmap code. + */ +struct indir { + daddr_t in_lbn; /* Logical block number. */ + int in_off; /* Offset in buffer. */ + int in_exists; /* Flag if the block exists. */ +}; + +/* Convert between inode pointers and vnode pointers. */ +#define VTOI(vp) ((struct inode *)(vp)->v_data) +#define ITOV(ip) ((ip)->i_vnode) + +#define ITIMES(ip, t1, t2) { \ + if ((ip)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) { \ + (ip)->i_flag |= IN_MODIFIED; \ + if ((ip)->i_flag & IN_ACCESS) \ + (ip)->i_atime.ts_sec = (t1)->tv_sec; \ + if ((ip)->i_flag & IN_UPDATE) { \ + (ip)->i_mtime.ts_sec = (t2)->tv_sec; \ + (ip)->i_modrev++; \ + } \ + if ((ip)->i_flag & IN_CHANGE) \ + (ip)->i_ctime.ts_sec = time.tv_sec; \ + (ip)->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); \ + } \ +} + +/* This overlays the fid structure (see mount.h). */ +struct ufid { + u_short ufid_len; /* Length of structure. */ + u_short ufid_pad; /* Force long alignment. */ + ino_t ufid_ino; /* File number (ino). */ + long ufid_gen; /* Generation number. */ +}; +#endif /* KERNEL */ diff --git a/sys/ufs/ufs/lockf.h b/sys/ufs/ufs/lockf.h new file mode 100644 index 00000000000..0ec61dbb0cf --- /dev/null +++ b/sys/ufs/ufs/lockf.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Scooter Morris at Genentech Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)lockf.h 8.1 (Berkeley) 6/11/93 + */ + +/* + * The lockf structure is a kernel structure which contains the information + * associated with a byte range lock. The lockf structures are linked into + * the inode structure. Locks are sorted by the starting byte of the lock for + * efficiency. 
+ */
+struct lockf {
+	short	lf_flags;	 /* Lock semantics: F_POSIX, F_FLOCK, F_WAIT */
+	short	lf_type;	 /* Lock type: F_RDLCK, F_WRLCK */
+	off_t	lf_start;	 /* The byte # of the start of the lock */
+	off_t	lf_end;		 /* The byte # of the end of the lock (-1=EOF)*/
+	caddr_t	lf_id;		 /* The id of the resource holding the lock */
+	struct	inode *lf_inode; /* Back pointer to the inode */
+	struct	lockf *lf_next;	 /* A pointer to the next lock on this inode */
+	struct	lockf *lf_block; /* The list of blocked locks */
+};
+
+/* Maximum length of sleep chains to traverse to try and detect deadlock. */
+#define MAXDEPTH 50
+
+/*
+ * Byte-range lock routines. Only the prototypes live here; __P and
+ * the __BEGIN_DECLS/__END_DECLS brackets are not defined by this
+ * header and must already be in scope (presumably from <sys/cdefs.h>).
+ */
+__BEGIN_DECLS
+void	 lf_addblock __P((struct lockf *, struct lockf *));
+int	 lf_clearlock __P((struct lockf *));
+int	 lf_findoverlap __P((struct lockf *,
+	    struct lockf *, int, struct lockf ***, struct lockf **));
+struct lockf *
+	 lf_getblock __P((struct lockf *));
+int	 lf_getlock __P((struct lockf *, struct flock *));
+int	 lf_setlock __P((struct lockf *));
+void	 lf_split __P((struct lockf *, struct lockf *));
+void	 lf_wakelock __P((struct lockf *));
+__END_DECLS
+
+/* Debugging aids, declared only when LOCKF_DEBUG is defined. */
+#ifdef LOCKF_DEBUG
+extern int lockf_debug;
+
+__BEGIN_DECLS
+void	lf_print __P((char *, struct lockf *));
+void	lf_printlist __P((char *, struct lockf *));
+__END_DECLS
+#endif
diff --git a/sys/ufs/ufs/quota.h b/sys/ufs/ufs/quota.h
new file mode 100644
index 00000000000..11efb402c91
--- /dev/null
+++ b/sys/ufs/ufs/quota.h
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Robert Elz at The University of Melbourne.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)quota.h 8.1 (Berkeley) 6/11/93 + */ + +#ifndef _QUOTA_ +#define _QUOTA_ + +/* + * Definitions for disk quotas imposed on the average user + * (big brother finally hits UNIX). + * + * The following constants define the amount of time given a user before the + * soft limits are treated as hard limits (usually resulting in an allocation + * failure). The timer is started when the user crosses their soft limit, it + * is reset when they go below their soft limit. 
+ */ +#define MAX_IQ_TIME (7*24*60*60) /* 1 week */ +#define MAX_DQ_TIME (7*24*60*60) /* 1 week */ + +/* + * The following constants define the usage of the quota file array in the + * ufsmount structure and dquot array in the inode structure. The semantics + * of the elements of these arrays are defined in the routine getinoquota; + * the remainder of the quota code treats them generically and need not be + * inspected when changing the size of the array. + */ +#define MAXQUOTAS 2 +#define USRQUOTA 0 /* element used for user quotas */ +#define GRPQUOTA 1 /* element used for group quotas */ + +/* + * Definitions for the default names of the quotas files. + */ +#define INITQFNAMES { \ + "user", /* USRQUOTA */ \ + "group", /* GRPQUOTA */ \ + "undefined", \ +}; +#define QUOTAFILENAME "quota" +#define QUOTAGROUP "operator" + +/* + * Command definitions for the 'quotactl' system call. The commands are + * broken into a main command defined below and a subcommand that is used + * to convey the type of quota that is being manipulated (see above). + */ +#define SUBCMDMASK 0x00ff +#define SUBCMDSHIFT 8 +#define QCMD(cmd, type) (((cmd) << SUBCMDSHIFT) | ((type) & SUBCMDMASK)) + +#define Q_QUOTAON 0x0100 /* enable quotas */ +#define Q_QUOTAOFF 0x0200 /* disable quotas */ +#define Q_GETQUOTA 0x0300 /* get limits and usage */ +#define Q_SETQUOTA 0x0400 /* set limits and usage */ +#define Q_SETUSE 0x0500 /* set usage */ +#define Q_SYNC 0x0600 /* sync disk copy of a filesystems quotas */ + +/* + * The following structure defines the format of the disk quota file + * (as it appears on disk) - the file is an array of these structures + * indexed by user or group number. The setquota system call establishes + * the vnode for each quota file (a pointer is retained in the ufsmount + * structure). 
+ */
+struct dqblk {
+	u_long	dqb_bhardlimit;	/* absolute limit on disk blks alloc */
+	u_long	dqb_bsoftlimit;	/* preferred limit on disk blks */
+	u_long	dqb_curblocks;	/* current block count */
+	u_long	dqb_ihardlimit;	/* maximum # allocated inodes + 1 */
+	u_long	dqb_isoftlimit;	/* preferred inode limit */
+	u_long	dqb_curinodes;	/* current # allocated inodes */
+	time_t	dqb_btime;	/* time limit for excessive disk use */
+	time_t	dqb_itime;	/* time limit for excessive files */
+};
+
+/*
+ * The following structure records disk usage for a user or group on a
+ * filesystem. There is one allocated for each quota that exists on any
+ * filesystem for the current user or group. A cache is kept of recently
+ * used entries.
+ *
+ * The limits and usage themselves live in the embedded dq_dqb member, which
+ * has the same layout as a quota file record (struct dqblk).
+ */
+struct dquot {
+	struct	dquot *dq_forw, **dq_back;	/* hash list */
+	struct	dquot *dq_freef, **dq_freeb;	/* free list */
+	short	dq_flags;		/* flags, see below */
+	short	dq_cnt;			/* count of active references */
+	short	dq_spare;		/* unused spare padding */
+	short	dq_type;		/* quota type of this dquot */
+	u_long	dq_id;			/* identifier this applies to */
+	struct	ufsmount *dq_ump;	/* filesystem that this is taken from */
+	struct	dqblk dq_dqb;		/* actual usage & quotas */
+};
+/*
+ * Flag values (bits stored in dq_flags).
+ */
+#define	DQ_LOCK		0x01		/* this quota locked (no MODS) */
+#define	DQ_WANT		0x02		/* wakeup on unlock */
+#define	DQ_MOD		0x04		/* this quota modified since read */
+#define	DQ_FAKE		0x08		/* no limits here, just usage */
+#define	DQ_BLKS		0x10		/* has been warned about blk limit */
+#define	DQ_INODS	0x20		/* has been warned about inode limit */
+/*
+ * Shorthand notation.
+ */
+#define	dq_bhardlimit	dq_dqb.dqb_bhardlimit
+#define	dq_bsoftlimit	dq_dqb.dqb_bsoftlimit
+#define	dq_curblocks	dq_dqb.dqb_curblocks
+#define	dq_ihardlimit	dq_dqb.dqb_ihardlimit
+#define	dq_isoftlimit	dq_dqb.dqb_isoftlimit
+#define	dq_curinodes	dq_dqb.dqb_curinodes
+#define	dq_btime	dq_dqb.dqb_btime
+#define	dq_itime	dq_dqb.dqb_itime
+
+/*
+ * If the system has never checked for a quota for this file, then it is set
+ * to NODQUOT. Once a write attempt is made the inode pointer is set to
+ * reference a dquot structure.
+ */
+#define	NODQUOT		((struct dquot *) 0)
+
+/*
+ * Flags to chkdq() and chkiq()
+ */
+#define	FORCE	0x01	/* force usage changes independent of limits */
+#define	CHOWN	0x02	/* (advisory) change initiated by chown */
+
+/*
+ * Macros to avoid subroutine calls to trivial functions.
+ * DQREF() takes a reference on a dquot: with DIAGNOSTIC it calls dqref(),
+ * otherwise it increments dq_cnt in line. The in-line form is fully
+ * parenthesized so it stays a single expression wherever it is expanded.
+ */
+#ifdef DIAGNOSTIC
+#define	DQREF(dq)	dqref(dq)
+#else
+#define	DQREF(dq)	((dq)->dq_cnt++)
+#endif
+
+#include <sys/cdefs.h>
+
+struct dquot;
+struct inode;
+struct mount;
+struct proc;
+struct ucred;
+struct ufsmount;
+struct vnode;
+__BEGIN_DECLS
+int	chkdq __P((struct inode *, long, struct ucred *, int));
+int	chkdqchg __P((struct inode *, long, struct ucred *, int));
+int	chkiq __P((struct inode *, long, struct ucred *, int));
+int	chkiqchg __P((struct inode *, long, struct ucred *, int));
+void	dqflush __P((struct vnode *));
+int	dqget __P((struct vnode *,
+	    u_long, struct ufsmount *, int, struct dquot **));
+void	dqinit __P((void));
+void	dqref __P((struct dquot *));
+void	dqrele __P((struct vnode *, struct dquot *));
+int	dqsync __P((struct vnode *, struct dquot *));
+int	getinoquota __P((struct inode *));
+int	getquota __P((struct mount *, u_long, int, caddr_t));
+int	qsync __P((struct mount *mp));
+int	quotaoff __P((struct proc *, struct mount *, int));
+int	quotaon __P((struct proc *, struct mount *, int, caddr_t));
+int	setquota __P((struct mount *, u_long, int, caddr_t));
+int	setuse __P((struct mount *, u_long, int, caddr_t));
+int	ufs_quotactl __P((struct mount *,
+	    int, uid_t, caddr_t, struct proc *));
+__END_DECLS
+
+#ifdef DIAGNOSTIC
+__BEGIN_DECLS
+void	chkdquot __P((struct inode *));
+__END_DECLS
+#endif
+
+#endif /* _QUOTA_ */
diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c
new file mode 100644
index 00000000000..bcd838d036a
--- /dev/null
+++ b/sys/ufs/ufs/ufs_bmap.c
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 1989, 1991, 1993
+ *	The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)ufs_bmap.c 8.6 (Berkeley) 1/21/94
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+/*
+ * Bmap converts the logical block number of a file to its physical block
+ * number on the disk. The conversion is done by using the logical block
+ * number to index into the array of block pointers described by the dinode.
+ *
+ * Always stores the inode's i_devvp through a_vpp when a_vpp is supplied.
+ * When a_bnp is NULL no translation is wanted and 0 is returned; otherwise
+ * the result of ufs_bmaparray() is returned.
+ */
+int
+ufs_bmap(ap)
+	struct vop_bmap_args /* {
+		struct vnode *a_vp;
+		daddr_t a_bn;
+		struct vnode **a_vpp;
+		daddr_t *a_bnp;
+		int *a_runp;
+	} */ *ap;
+{
+	struct vnode *vp = ap->a_vp;
+
+	/* Report the underlying vnode if the caller asked for it. */
+	if (ap->a_vpp != NULL)
+		*ap->a_vpp = VTOI(vp)->i_devvp;
+
+	/* Translate only when a logical to physical mapping is requested. */
+	if (ap->a_bnp != NULL)
+		return (ufs_bmaparray(vp, ap->a_bn, ap->a_bnp, NULL, NULL,
+		    ap->a_runp));
+	return (0);
+}
+
+/*
+ * Indirect blocks are now on the vnode for the file. They are given negative
+ * logical block numbers. Indirect blocks are addressed by the negative
+ * address of the first data block to which they point. Double indirect blocks
+ * are addressed by one less than the address of the first indirect block to
+ * which they point. Triple indirect blocks are addressed by one less than
+ * the address of the first double indirect block to which they point.
+ *
+ * ufs_bmaparray does the bmap conversion, and if requested returns the
+ * array of logical blocks which must be traversed to get to a block.
+ * Each entry contains the offset into that block that gets you to the
+ * next block and the disk address of the block (if it is assigned).
+ *
+ * vp/bn identify the file and logical block. *bnp receives the disk block
+ * number, or -1 when no block is assigned. ap and nump must be both NULL
+ * or both non-NULL; when they are NULL a local array and counter are used.
+ * If runp is non-NULL, *runp is set to the number of following blocks found
+ * to be sequential on disk (bounded by maxrun). Returns 0 on success, or
+ * the error reported by ufs_getlbns() or biowait().
+ */
+
+int
+ufs_bmaparray(vp, bn, bnp, ap, nump, runp)
+	struct vnode *vp;
+	register daddr_t bn;
+	daddr_t *bnp;
+	struct indir *ap;
+	int *nump;
+	int *runp;
+{
+	register struct inode *ip;
+	struct buf *bp;
+	struct ufsmount *ump;
+	struct mount *mp;
+	struct vnode *devvp;
+	struct indir a[NIADDR], *xap;
+	daddr_t daddr;
+	long metalbn;
+	int error, maxrun, num;
+
+	ip = VTOI(vp);
+	mp = vp->v_mount;
+	ump = VFSTOUFS(mp);
+#ifdef DIAGNOSTIC
+	if (ap != NULL && nump == NULL || ap == NULL && nump != NULL)
+		panic("ufs_bmaparray: invalid arguments");
+#endif
+
+	if (runp) {
+		/*
+		 * XXX
+		 * If MAXBSIZE is the largest transfer the disks can handle,
+		 * we probably want maxrun to be 1 block less so that we
+		 * don't create a block larger than the device can handle.
+		 */
+		*runp = 0;
+		maxrun = MAXBSIZE / mp->mnt_stat.f_iosize - 1;
+	}
+
+	/* Fall back to the local array and counter if none were passed in. */
+	xap = ap == NULL ? a : ap;
+	if (!nump)
+		nump = &num;
+	if (error = ufs_getlbns(vp, bn, xap, nump))
+		return (error);
+
+	num = *nump;
+	if (num == 0) {
+		/* Direct block: the address comes straight from the inode. */
+		*bnp = blkptrtodb(ump, ip->i_db[bn]);
+		if (*bnp == 0)
+			*bnp = -1;
+		else if (runp)
+			for (++bn; bn < NDADDR && *runp < maxrun &&
+			    is_sequential(ump, ip->i_db[bn - 1], ip->i_db[bn]);
+			    ++bn, ++*runp);
+		return (0);
+	}
+
+
+	/* Get disk address out of indirect block array */
+	daddr = ip->i_ib[xap->in_off];
+
+	devvp = VFSTOUFS(vp->v_mount)->um_devvp;
+	for (bp = NULL, ++xap; --num; ++xap) {
+		/*
+		 * Exit the loop if there is no disk address assigned yet and
+		 * the indirect block isn't in the cache, or if we were
+		 * looking for an indirect block and we've found it.
+		 */
+
+		metalbn = xap->in_lbn;
+		if (daddr == 0 && !incore(vp, metalbn) || metalbn == bn)
+			break;
+		/*
+		 * If we get here, we've either got the block in the cache
+		 * or we have a disk address for it, go fetch it.
+		 */
+		if (bp)
+			brelse(bp);
+
+		xap->in_exists = 1;
+		bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0);
+		if (bp->b_flags & (B_DONE | B_DELWRI)) {
+			trace(TR_BREADHIT, pack(vp, size), metalbn);
+		}
+#ifdef DIAGNOSTIC
+		else if (!daddr)
+			panic("ufs_bmaparray: indirect block not in cache");
+#endif
+		else {
+			trace(TR_BREADMISS, pack(vp, size), metalbn);
+			bp->b_blkno = blkptrtodb(ump, daddr);
+			bp->b_flags |= B_READ;
+			VOP_STRATEGY(bp);
+			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
+			if (error = biowait(bp)) {
+				brelse(bp);
+				return (error);
+			}
+		}
+
+		/* Follow the pointer stored in this indirect block. */
+		daddr = ((daddr_t *)bp->b_data)[xap->in_off];
+		if (num == 1 && daddr && runp)
+			for (bn = xap->in_off + 1;
+			    bn < MNINDIR(ump) && *runp < maxrun &&
+			    is_sequential(ump, ((daddr_t *)bp->b_data)[bn - 1],
+			    ((daddr_t *)bp->b_data)[bn]);
+			    ++bn, ++*runp);
+	}
+	if (bp)
+		brelse(bp);
+
+	daddr = blkptrtodb(ump, daddr);
+	*bnp = daddr == 0 ? -1 : daddr;
+	return (0);
+}
+
+/*
+ * Create an array of logical block number/offset pairs which represent the
+ * path of indirect blocks required to access a data block. The first "pair"
+ * contains the logical block number of the appropriate single, double or
+ * triple indirect block and the offset into the inode indirect block array.
+ * Note, the logical block number of the inode single/double/triple indirect
+ * block appears twice in the array, once with the offset into the i_ib and
+ * once with the offset into the page itself.
+ *
+ * A negative bn is looked up by its magnitude; the original value is kept
+ * in realbn so the walk can stop at a meta-data block. For a direct block
+ * (magnitude below NDADDR) nothing is stored and *nump is 0. Returns 0 on
+ * success or EFBIG when the block lies beyond NIADDR levels of indirection.
+ * nump may be NULL; ap is written for every indirect level found.
+ */
+int
+ufs_getlbns(vp, bn, ap, nump)
+	struct vnode *vp;
+	register daddr_t bn;
+	struct indir *ap;
+	int *nump;
+{
+	long metalbn, realbn;
+	struct ufsmount *ump;
+	int blockcnt, i, numlevels, off;
+
+	ump = VFSTOUFS(vp->v_mount);
+	if (nump)
+		*nump = 0;
+	numlevels = 0;
+	realbn = bn;
+	if ((long)bn < 0)
+		bn = -(long)bn;
+
+	/* The first NDADDR blocks are direct blocks. */
+	if (bn < NDADDR)
+		return (0);
+
+	/*
+	 * Determine the number of levels of indirection. After this loop
+	 * is done, blockcnt indicates the number of data blocks possible
+	 * at the given level of indirection, and NIADDR - i is the number
+	 * of levels of indirection needed to locate the requested block.
+	 */
+	for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) {
+		if (i == 0)
+			return (EFBIG);
+		blockcnt *= MNINDIR(ump);
+		if (bn < blockcnt)
+			break;
+	}
+
+	/* Calculate the address of the first meta-block. */
+	if (realbn >= 0)
+		metalbn = -(realbn - bn + NIADDR - i);
+	else
+		metalbn = -(-realbn - bn + NIADDR - i);
+
+	/*
+	 * At each iteration, off is the offset into the bap array which is
+	 * an array of disk addresses at the current level of indirection.
+	 * The logical block number and the offset in that block are stored
+	 * into the argument array.
+	 */
+	ap->in_lbn = metalbn;
+	ap->in_off = off = NIADDR - i;
+	ap->in_exists = 0;
+	ap++;
+	for (++numlevels; i <= NIADDR; i++) {
+		/* If searching for a meta-data block, quit when found. */
+		if (metalbn == realbn)
+			break;
+
+		blockcnt /= MNINDIR(ump);
+		off = (bn / blockcnt) % MNINDIR(ump);
+
+		++numlevels;
+		ap->in_lbn = metalbn;
+		ap->in_off = off;
+		ap->in_exists = 0;
+		++ap;
+
+		metalbn -= -1 + off * blockcnt;
+	}
+	if (nump)
+		*nump = numlevels;
+	return (0);
+}
diff --git a/sys/ufs/ufs/ufs_disksubr.c b/sys/ufs/ufs/ufs_disksubr.c
new file mode 100644
index 00000000000..78dede4da77
--- /dev/null
+++ b/sys/ufs/ufs/ufs_disksubr.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1993
+ *	The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Seek sort for disks. We depend on the driver which calls us using b_resid
+ * as the current cylinder number.
+ *
+ * The argument ap structure holds a b_actf activity chain pointer on which we
+ * keep two queues, sorted in ascending cylinder order. The first queue holds
+ * those requests which are positioned after the current cylinder (in the first
+ * request); the second holds requests which came in after their cylinder number
+ * was passed. Thus we implement a one way scan, retracting after reaching the
+ * end of the drive to the first request on the second queue, at which time it
+ * becomes the first queue.
+ *
+ * A one-way scan is natural because of the way UNIX read-ahead blocks are
+ * allocated.
+ *
+ * Within one cylinder, requests are ordered by b_blkno. bp is linked into
+ * the chain after the element bq selected below.
+ */
+
+/*
+ * For portability with historic industry practice, the
+ * cylinder number has to be maintained in the `b_resid'
+ * field.
+ */
+#define	b_cylinder	b_resid
+
+void
+disksort(ap, bp)
+	register struct buf *ap, *bp;
+{
+	register struct buf *bq;
+
+	/* If the queue is empty, then it's easy. */
+	if (ap->b_actf == NULL) {
+		bp->b_actf = NULL;
+		ap->b_actf = bp;
+		return;
+	}
+
+	/*
+	 * If we lie before the first (currently active) request, then we
+	 * must locate the second request list and add ourselves to it.
+	 */
+	bq = ap->b_actf;
+	if (bp->b_cylinder < bq->b_cylinder) {
+		while (bq->b_actf) {
+			/*
+			 * Check for an ``inversion'' in the normally ascending
+			 * cylinder numbers, indicating the start of the second
+			 * request list.
+			 */
+			if (bq->b_actf->b_cylinder < bq->b_cylinder) {
+				/*
+				 * Search the second request list for the first
+				 * request at a larger cylinder number. We go
+				 * before that; if there is no such request, we
+				 * go at end.
+				 */
+				do {
+					if (bp->b_cylinder <
+					    bq->b_actf->b_cylinder)
+						goto insert;
+					if (bp->b_cylinder ==
+					    bq->b_actf->b_cylinder &&
+					    bp->b_blkno < bq->b_actf->b_blkno)
+						goto insert;
+					bq = bq->b_actf;
+				} while (bq->b_actf);
+				goto insert;		/* after last */
+			}
+			bq = bq->b_actf;
+		}
+		/*
+		 * No inversions... we will go after the last, and
+		 * be the first request in the second request list.
+		 */
+		goto insert;
+	}
+	/*
+	 * Request is at/after the current request...
+	 * sort in the first request list.
+	 */
+	while (bq->b_actf) {
+		/*
+		 * We want to go after the current request if there is an
+		 * inversion after it (i.e. it is the end of the first
+		 * request list), or if the next request is a larger cylinder
+		 * than our request.
+		 */
+		if (bq->b_actf->b_cylinder < bq->b_cylinder ||
+		    bp->b_cylinder < bq->b_actf->b_cylinder ||
+		    (bp->b_cylinder == bq->b_actf->b_cylinder &&
+		    bp->b_blkno < bq->b_actf->b_blkno))
+			goto insert;
+		bq = bq->b_actf;
+	}
+	/*
+	 * Neither a second list nor a larger request... we go at the end of
+	 * the first list, which is the same as the end of the whole shebang.
+	 */
+insert:	bp->b_actf = bq->b_actf;
+	bq->b_actf = bp;
+}
+
+/*
+ * Attempt to read a disk label from a device using the indicated strategy
+ * routine.
The label must be partly set up before this: secpercyl and
+ * anything required in the strategy routine (e.g., sector size) must be
+ * filled in before calling us. Returns NULL on success and an error
+ * string on failure.
+ *
+ * The sector at LABELSECTOR is read and scanned, in steps of sizeof(long),
+ * for a label whose two magic numbers, partition count and checksum are
+ * all acceptable; the first such label is copied into *lp.
+ */
+char *
+readdisklabel(dev, strat, lp)
+	dev_t dev;
+	int (*strat)();
+	register struct disklabel *lp;
+{
+	register struct buf *bp;
+	struct disklabel *dlp, *last;
+	char *msg = NULL;
+
+	/* Minimal defaults so the strategy routine can reach the label. */
+	if (lp->d_secperunit == 0)
+		lp->d_secperunit = 0x1fffffff;
+	lp->d_npartitions = 1;
+	if (lp->d_partitions[0].p_size == 0)
+		lp->d_partitions[0].p_size = 0x1fffffff;
+	lp->d_partitions[0].p_offset = 0;
+
+	bp = geteblk((int)lp->d_secsize);
+	bp->b_dev = dev;
+	bp->b_blkno = LABELSECTOR;
+	bp->b_bcount = lp->d_secsize;
+	bp->b_flags = B_BUSY | B_READ;
+	bp->b_cylinder = LABELSECTOR / lp->d_secpercyl;
+	(*strat)(bp);
+	if (biowait(bp)) {
+		msg = "I/O error";
+	} else {
+		/* Last position at which a whole label still fits. */
+		last = (struct disklabel *)((char *)bp->b_data +
+		    DEV_BSIZE - sizeof(*dlp));
+		for (dlp = (struct disklabel *)bp->b_data; dlp <= last;
+		    dlp = (struct disklabel *)((char *)dlp + sizeof(long))) {
+			if (dlp->d_magic == DISKMAGIC &&
+			    dlp->d_magic2 == DISKMAGIC) {
+				if (dlp->d_npartitions <= MAXPARTITIONS &&
+				    dkcksum(dlp) == 0) {
+					*lp = *dlp;
+					msg = NULL;
+					break;
+				}
+				msg = "disk label corrupted";
+			} else if (msg == NULL)
+				msg = "no disk label";
+		}
+	}
+	bp->b_flags = B_INVAL | B_AGE;
+	brelse(bp);
+	return (msg);
+}
+
+/*
+ * Check new disk label for sensibility before setting it.
+ *
+ * nlp must carry both magic numbers and a zero checksum, else EINVAL.
+ * For every partition whose bit is set in openmask, the new label must
+ * still contain it with the same offset and a size no smaller than before,
+ * else EBUSY. On success the checksum of nlp is recomputed, nlp is copied
+ * over olp and 0 is returned.
+ */
+int
+setdisklabel(olp, nlp, openmask)
+	register struct disklabel *olp, *nlp;
+	u_long openmask;
+{
+	register int i;
+	register struct partition *opp, *npp;
+
+	if (nlp->d_magic != DISKMAGIC || nlp->d_magic2 != DISKMAGIC ||
+	    dkcksum(nlp) != 0)
+		return (EINVAL);
+	while ((i = ffs((long)openmask)) != 0) {
+		i--;
+		/* Shift in the mask's own type so any bit can be cleared. */
+		openmask &= ~((u_long)1 << i);
+		if (nlp->d_npartitions <= i)
+			return (EBUSY);
+		opp = &olp->d_partitions[i];
+		npp = &nlp->d_partitions[i];
+		if (npp->p_offset != opp->p_offset || npp->p_size < opp->p_size)
+			return (EBUSY);
+		/*
+		 * Copy internally-set partition information
+		 * if new label doesn't include it. XXX
+		 */
+		if (npp->p_fstype == FS_UNUSED && opp->p_fstype != FS_UNUSED) {
+			npp->p_fstype = opp->p_fstype;
+			npp->p_fsize = opp->p_fsize;
+			npp->p_frag = opp->p_frag;
+			npp->p_cpg = opp->p_cpg;
+		}
+	}
+	nlp->d_checksum = 0;
+	nlp->d_checksum = dkcksum(nlp);
+	*olp = *nlp;
+	return (0);
+}
+
+/* encoding of disk minor numbers, should be elsewhere... */
+#define dkunit(dev)		(minor(dev) >> 3)
+#define dkpart(dev)		(minor(dev) & 07)
+#define dkminor(unit, part)	(((unit) << 3) | (part))
+
+/*
+ * Write disk label back to device after modification.
+ *
+ * The label sector is read through the partition that starts at offset 0
+ * (the one named by dev, else partition 0; EXDEV if neither does). The
+ * first valid label found in that sector is overwritten with *lp and the
+ * sector is written back. Returns 0, the biowait() error, or ESRCH when the
+ * sector holds no valid label to replace.
+ */
+int
+writedisklabel(dev, strat, lp)
+	dev_t dev;
+	int (*strat)();
+	register struct disklabel *lp;
+{
+	struct buf *bp;
+	struct disklabel *dlp;
+	int labelpart;
+	int error = 0;
+
+	labelpart = dkpart(dev);
+	if (lp->d_partitions[labelpart].p_offset != 0) {
+		if (lp->d_partitions[0].p_offset != 0)
+			return (EXDEV);			/* not quite right */
+		labelpart = 0;
+	}
+	bp = geteblk((int)lp->d_secsize);
+	bp->b_dev = makedev(major(dev), dkminor(dkunit(dev), labelpart));
+	bp->b_blkno = LABELSECTOR;
+	bp->b_bcount = lp->d_secsize;
+	bp->b_flags = B_READ;
+	(*strat)(bp);
+	if (error = biowait(bp))
+		goto done;
+	for (dlp = (struct disklabel *)bp->b_data;
+	    dlp <= (struct disklabel *)
+	      ((char *)bp->b_data + lp->d_secsize - sizeof(*dlp));
+	    dlp = (struct disklabel *)((char *)dlp + sizeof(long))) {
+		if (dlp->d_magic == DISKMAGIC && dlp->d_magic2 == DISKMAGIC &&
+		    dkcksum(dlp) == 0) {
+			*dlp = *lp;
+			bp->b_flags = B_WRITE;
+			(*strat)(bp);
+			error = biowait(bp);
+			goto done;
+		}
+	}
+	error = ESRCH;
+done:
+	brelse(bp);
+	return (error);
+}
+
+/*
+ * Compute checksum for disk label.
+ *
+ * XORs the label as 16-bit words, from its start up to the end of the
+ * d_npartitions partition entries in use. A label whose d_checksum was
+ * set from this function (computed with the field zeroed) sums to 0.
+ * The return type is spelled out to match the prototype in ufs_extern.h.
+ */
+u_int
+dkcksum(lp)
+	register struct disklabel *lp;
+{
+	register u_short *start, *end;
+	register u_short sum = 0;
+
+	start = (u_short *)lp;
+	end = (u_short *)&lp->d_partitions[lp->d_npartitions];
+	while (start < end)
+		sum ^= *start++;
+	return (sum);
+}
+
+/*
+ * Disk error is the preface to plaintive error messages
+ * about failing disk transfers. It prints messages of the form
+
+hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
+
+ * if the offset of the error in the transfer and a disk label
+ * are both available. blkdone should be -1 if the position of the error
+ * is unknown; the disklabel pointer may be null from drivers that have not
+ * been converted to use them. The message is printed with printf
+ * if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
+ * The message should be completed (with at least a newline) with printf
+ * or addlog, respectively. There is no trailing space.
+ *
+ * dname is the driver name and what the kind of error; the unit and
+ * partition letter are derived from bp->b_dev.
+ */
+void
+diskerr(bp, dname, what, pri, blkdone, lp)
+	register struct buf *bp;
+	char *dname, *what;
+	int pri, blkdone;
+	register struct disklabel *lp;
+{
+	int unit = dkunit(bp->b_dev), part = dkpart(bp->b_dev);
+	register void (*pr) __P((const char *, ...));
+	char partname = 'a' + part;
+	int sn;
+
+	if (pri != LOG_PRINTF) {
+		log(pri, "");
+		pr = addlog;
+	} else
+		pr = printf;
+	(*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what,
+	    bp->b_flags & B_READ ? "read" : "writ");
+	sn = bp->b_blkno;
+	if (bp->b_bcount <= DEV_BSIZE)
+		(*pr)("%d", sn);	/* single-block transfer */
+	else {
+		if (blkdone >= 0) {
+			sn += blkdone;	/* block at which the error occurred */
+			(*pr)("%d of ", sn);
+		}
+		(*pr)("%d-%d", bp->b_blkno,
+		    bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE);
+	}
+	if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
+#ifdef tahoe
+		sn *= DEV_BSIZE / lp->d_secsize;		/* XXX */
+#endif
+		sn += lp->d_partitions[part].p_offset;	/* now relative to the unit */
+		(*pr)(" (%s%d bn %d; cn %d", dname, unit, sn,
+		    sn / lp->d_secpercyl);
+		sn %= lp->d_secpercyl;
+		(*pr)(" tn %d sn %d)", sn / lp->d_nsectors, sn % lp->d_nsectors);
+	}
+}
diff --git a/sys/ufs/ufs/ufs_extern.h b/sys/ufs/ufs/ufs_extern.h
new file mode 100644
index 00000000000..e25923e947d
--- /dev/null
+++ b/sys/ufs/ufs/ufs_extern.h
@@ -0,0 +1,125 @@
+/*-
+ * Copyright (c) 1991, 1993, 1994
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ *	@(#)ufs_extern.h 8.3 (Berkeley) 4/16/94
+ */
+
+/*
+ * Incomplete declarations of the structures named in the prototypes below.
+ * NOTE(review): struct ufid, struct indir, struct componentname and the
+ * vop_*_args structures are used below without a declaration here;
+ * presumably the including file has already pulled in their headers.
+ */
+struct buf;
+struct direct;
+struct disklabel;
+struct fid;
+struct flock;
+struct inode;
+struct mbuf;
+struct mount;
+struct nameidata;
+struct proc;
+struct ucred;
+struct uio;
+struct vattr;
+struct vnode;
+struct ufs_args;
+
+__BEGIN_DECLS
+void	 diskerr
+	    __P((struct buf *, char *, char *, int, int, struct disklabel *));
+void	 disksort __P((struct buf *, struct buf *));
+u_int	 dkcksum __P((struct disklabel *));
+char	*readdisklabel __P((dev_t, int (*)(), struct disklabel *));
+int	 setdisklabel __P((struct disklabel *, struct disklabel *, u_long));
+int	 writedisklabel __P((dev_t, int (*)(), struct disklabel *));
+
+int	 ufs_abortop __P((struct vop_abortop_args *));
+int	 ufs_access __P((struct vop_access_args *));
+int	 ufs_advlock __P((struct vop_advlock_args *));
+int	 ufs_bmap __P((struct vop_bmap_args *));
+int	 ufs_check_export __P((struct mount *, struct ufid *, struct mbuf *,
+	    struct vnode **, int *exflagsp, struct ucred **));
+int	 ufs_checkpath __P((struct inode *, struct inode *, struct ucred *));
+int	 ufs_close __P((struct vop_close_args *));
+int	 ufs_create __P((struct vop_create_args *));
+void	 ufs_dirbad __P((struct inode *, doff_t, char *));
+int	 ufs_dirbadentry __P((struct vnode *, struct direct *, int));
+int	 ufs_dirempty __P((struct inode *, ino_t, struct ucred *));
+int	 ufs_direnter __P((struct inode *, struct vnode *,struct componentname *));
+int	 ufs_dirremove __P((struct vnode *, struct componentname*));
+int	 ufs_dirrewrite
+	    __P((struct inode *, struct inode *, struct componentname *));
+int	 ufs_getattr __P((struct vop_getattr_args *));
+int	 ufs_getlbns __P((struct vnode *, daddr_t, struct indir *, int *));
+struct vnode *
+	 ufs_ihashget __P((dev_t, ino_t));
+void	 ufs_ihashinit __P((void));
+void	 ufs_ihashins __P((struct inode *));
+struct vnode *
+	 ufs_ihashlookup __P((dev_t, ino_t));
+void	 ufs_ihashrem __P((struct inode *));
+int	 ufs_inactive __P((struct vop_inactive_args *));
+int	 ufs_init __P((void));
+int	 ufs_ioctl __P((struct vop_ioctl_args *));
+int	 ufs_islocked __P((struct vop_islocked_args *));
+int	 ufs_link __P((struct vop_link_args *));
+int	 ufs_lock __P((struct vop_lock_args *));
+int	 ufs_lookup __P((struct vop_lookup_args *));
+int	 ufs_makeinode __P((int mode, struct vnode *, struct vnode **, struct componentname *));
+int	 ufs_mkdir __P((struct vop_mkdir_args *));
+int	 ufs_mknod __P((struct vop_mknod_args *));
+int	 ufs_mmap __P((struct vop_mmap_args *));
+int	 ufs_open __P((struct vop_open_args *));
+int	 ufs_pathconf __P((struct vop_pathconf_args *));
+int	 ufs_print __P((struct vop_print_args *));
+int	 ufs_readdir __P((struct vop_readdir_args *));
+int	 ufs_readlink __P((struct vop_readlink_args *));
+int	 ufs_reclaim __P((struct vop_reclaim_args *));
+int	 ufs_remove __P((struct vop_remove_args *));
+int	 ufs_rename __P((struct vop_rename_args *));
+int	 ufs_rmdir __P((struct vop_rmdir_args *));
+int	 ufs_root __P((struct mount *, struct vnode **));
+int	 ufs_seek __P((struct vop_seek_args *));
+int	 ufs_select __P((struct vop_select_args *));
+int	 ufs_setattr __P((struct vop_setattr_args *));
+int	 ufs_start __P((struct mount *, int, struct proc *));
+int	 ufs_strategy __P((struct vop_strategy_args *));
+int	 ufs_symlink __P((struct vop_symlink_args *));
+int	 ufs_unlock __P((struct vop_unlock_args *));
+int	 ufs_vinit __P((struct mount *,
+	    int (**)(), int (**)(), struct vnode **));
+int	 ufsspec_close __P((struct vop_close_args *));
+int	 ufsspec_read __P((struct vop_read_args *));
+int	 ufsspec_write __P((struct vop_write_args *));
+
+#ifdef FIFO	/* fifo entry points are declared only when FIFO is defined */
+int	ufsfifo_read __P((struct vop_read_args *));
+int	ufsfifo_write __P((struct vop_write_args *));
+int	ufsfifo_close __P((struct vop_close_args *));
+#endif
+__END_DECLS
diff --git a/sys/ufs/ufs/ufs_ihash.c b/sys/ufs/ufs/ufs_ihash.c
new file mode 100644
index 00000000000..4a37c907ef6
--- /dev/null
+++ b/sys/ufs/ufs/ufs_ihash.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ *
The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_ihash.c 8.4 (Berkeley) 12/30/93 + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Structures associated with inode cacheing. 
+ */
+struct inode **ihashtbl;
+u_long	ihash;		/* size of hash table - 1 */
+#define	INOHASH(device, inum)	(((device) + (inum)) & ihash)
+
+/*
+ * Set up the inode hash table: hashinit() returns the bucket array and
+ * fills in ihash, which INOHASH() uses as its mask.
+ */
+void
+ufs_ihashinit()
+{
+
+	ihashtbl = hashinit(desiredvnodes, M_UFSMNT, &ihash);
+}
+
+/*
+ * Look up the in-core inode for a device/inum pair and return its vnode,
+ * or NULL when it is not in the table. A locked inode is returned as is.
+ */
+struct vnode *
+ufs_ihashlookup(device, inum)
+	dev_t device;
+	ino_t inum;
+{
+	register struct inode *ip;
+
+	for (ip = ihashtbl[INOHASH(device, inum)]; ip != NULL; ip = ip->i_next)
+		if (inum == ip->i_number && device == ip->i_dev)
+			return (ITOV(ip));
+	return (NULL);
+}
+
+/*
+ * Look up the in-core inode for a device/inum pair and return its vnode
+ * once vget() succeeds, or NULL when it is not in the table. If the inode
+ * is locked, sleep on it; after a sleep or a failed vget() the chain is
+ * searched again from the start.
+ */
+struct vnode *
+ufs_ihashget(device, inum)
+	dev_t device;
+	ino_t inum;
+{
+	register struct inode *ip;
+	struct vnode *vp;
+
+rescan:
+	for (ip = ihashtbl[INOHASH(device, inum)]; ip != NULL; ip = ip->i_next) {
+		if (inum != ip->i_number || device != ip->i_dev)
+			continue;
+		if (ip->i_flag & IN_LOCKED) {
+			ip->i_flag |= IN_WANTED;
+			sleep(ip, PINOD);
+			goto rescan;
+		}
+		vp = ITOV(ip);
+		if (!vget(vp, 1))
+			return (vp);
+		goto rescan;
+	}
+	return (NULL);
+}
+
+/*
+ * Link the inode at the head of its hash chain and mark it locked,
+ * recording the current process (or -1 if there is none) as lock holder.
+ * Panics if the inode is already locked.
+ */
+void
+ufs_ihashins(ip)
+	struct inode *ip;
+{
+	struct inode **bucket, *head;
+
+	bucket = &ihashtbl[INOHASH(ip->i_dev, ip->i_number)];
+	head = *bucket;
+	if (head != NULL)
+		head->i_prev = &ip->i_next;
+	ip->i_next = head;
+	ip->i_prev = bucket;
+	*bucket = ip;
+	if (ip->i_flag & IN_LOCKED)
+		panic("ufs_ihashins: already locked");
+	ip->i_lockholder = curproc ? curproc->p_pid : -1;
+	ip->i_flag |= IN_LOCKED;
+}
+
+/*
+ * Remove the inode from the hash table.
+ */ +void +ufs_ihashrem(ip) + register struct inode *ip; +{ + register struct inode *iq; + + if (iq = ip->i_next) + iq->i_prev = ip->i_prev; + *ip->i_prev = iq; +#ifdef DIAGNOSTIC + ip->i_next = NULL; + ip->i_prev = NULL; +#endif +} diff --git a/sys/ufs/ufs/ufs_inode.c b/sys/ufs/ufs/ufs_inode.c new file mode 100644 index 00000000000..ac876f9d34d --- /dev/null +++ b/sys/ufs/ufs/ufs_inode.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_inode.c 8.4 (Berkeley) 1/21/94 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +u_long nextgennumber; /* Next generation number to assign. */ +int prtactive = 0; /* 1 => print out reclaim of active vnodes */ + +int +ufs_init() +{ + static int first = 1; + + if (!first) + return (0); + first = 0; + +#ifdef DIAGNOSTIC + if ((sizeof(struct inode) - 1) & sizeof(struct inode)) + printf("ufs_init: bad size %d\n", sizeof(struct inode)); +#endif + ufs_ihashinit(); + dqinit(); + return (0); +} + +/* + * Last reference to an inode. If necessary, write or delete it. + */ +int +ufs_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct inode *ip = VTOI(vp); + struct timeval tv; + int mode, error; + extern int prtactive; + + if (prtactive && vp->v_usecount != 0) + vprint("ffs_inactive: pushing active", vp); + + /* Get rid of inodes related to stale file handles. 
*/ + if (ip->i_mode == 0) { + if ((vp->v_flag & VXLOCK) == 0) + vgone(vp); + return (0); + } + + error = 0; +#ifdef DIAGNOSTIC + if (VOP_ISLOCKED(vp)) + panic("ffs_inactive: locked inode"); + if (curproc) + ip->i_lockholder = curproc->p_pid; + else + ip->i_lockholder = -1; +#endif + ip->i_flag |= IN_LOCKED; + if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { +#ifdef QUOTA + if (!getinoquota(ip)) + (void)chkiq(ip, -1, NOCRED, 0); +#endif + error = VOP_TRUNCATE(vp, (off_t)0, 0, NOCRED, NULL); + ip->i_rdev = 0; + mode = ip->i_mode; + ip->i_mode = 0; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + VOP_VFREE(vp, ip->i_number, mode); + } + if (ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) { + tv = time; + VOP_UPDATE(vp, &tv, &tv, 0); + } + VOP_UNLOCK(vp); + /* + * If we are done with the inode, reclaim it + * so that it can be reused immediately. + */ + if (vp->v_usecount == 0 && ip->i_mode == 0) + vgone(vp); + return (error); +} + +/* + * Reclaim an inode so that it can be used for other purposes. + */ +int +ufs_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct inode *ip; + int i, type; + + if (prtactive && vp->v_usecount != 0) + vprint("ufs_reclaim: pushing active", vp); + /* + * Remove the inode from its hash chain. + */ + ip = VTOI(vp); + ufs_ihashrem(ip); + /* + * Purge old data structures associated with the inode. 
+ */ + cache_purge(vp); + if (ip->i_devvp) { + vrele(ip->i_devvp); + ip->i_devvp = 0; + } +#ifdef QUOTA + for (i = 0; i < MAXQUOTAS; i++) { + if (ip->i_dquot[i] != NODQUOT) { + dqrele(vp, ip->i_dquot[i]); + ip->i_dquot[i] = NODQUOT; + } + } +#endif + switch (vp->v_mount->mnt_stat.f_type) { + case MOUNT_UFS: + type = M_FFSNODE; + break; + case MOUNT_MFS: + type = M_MFSNODE; + break; + case MOUNT_LFS: + type = M_LFSNODE; + break; + default: + panic("ufs_reclaim: not ufs file"); + } + FREE(vp->v_data, type); + vp->v_data = NULL; + return (0); +} diff --git a/sys/ufs/ufs/ufs_lockf.c b/sys/ufs/ufs/ufs_lockf.c new file mode 100644 index 00000000000..cb9a7375de1 --- /dev/null +++ b/sys/ufs/ufs/ufs_lockf.c @@ -0,0 +1,707 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Scooter Morris at Genentech Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * This variable controls the maximum number of processes that will + * be checked in doing deadlock detection. + */ +int maxlockdepth = MAXDEPTH; + +#ifdef LOCKF_DEBUG +int lockf_debug = 0; +#endif + +#define NOLOCKF (struct lockf *)0 +#define SELF 0x1 +#define OTHERS 0x2 + +/* + * Set a byte-range lock. + */ +int +lf_setlock(lock) + register struct lockf *lock; +{ + register struct lockf *block; + struct inode *ip = lock->lf_inode; + struct lockf **prev, *overlap, *ltmp; + static char lockstr[] = "lockf"; + int ovcase, priority, needtolink, error; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_print("lf_setlock", lock); +#endif /* LOCKF_DEBUG */ + + /* + * Set the priority + */ + priority = PLOCK; + if (lock->lf_type == F_WRLCK) + priority += 4; + priority |= PCATCH; + /* + * Scan lock list for this file looking for locks that would block us. + */ + while (block = lf_getblock(lock)) { + /* + * Free the structure and return if nonblocking. 
+ */ + if ((lock->lf_flags & F_WAIT) == 0) { + FREE(lock, M_LOCKF); + return (EAGAIN); + } + /* + * We are blocked. Since flock style locks cover + * the whole file, there is no chance for deadlock. + * For byte-range locks we must check for deadlock. + * + * Deadlock detection is done by looking through the + * wait channels to see if there are any cycles that + * involve us. MAXDEPTH is set just to make sure we + * do not go off into neverland. + */ + if ((lock->lf_flags & F_POSIX) && + (block->lf_flags & F_POSIX)) { + register struct proc *wproc; + register struct lockf *waitblock; + int i = 0; + + /* The block is waiting on something */ + wproc = (struct proc *)block->lf_id; + while (wproc->p_wchan && + (wproc->p_wmesg == lockstr) && + (i++ < maxlockdepth)) { + waitblock = (struct lockf *)wproc->p_wchan; + /* Get the owner of the blocking lock */ + waitblock = waitblock->lf_next; + if ((waitblock->lf_flags & F_POSIX) == 0) + break; + wproc = (struct proc *)waitblock->lf_id; + if (wproc == (struct proc *)lock->lf_id) { + free(lock, M_LOCKF); + return (EDEADLK); + } + } + } + /* + * For flock type locks, we must first remove + * any shared locks that we hold before we sleep + * waiting for an exclusive lock. + */ + if ((lock->lf_flags & F_FLOCK) && + lock->lf_type == F_WRLCK) { + lock->lf_type = F_UNLCK; + (void) lf_clearlock(lock); + lock->lf_type = F_WRLCK; + } + /* + * Add our lock to the blocked list and sleep until we're free. + * Remember who blocked us (for deadlock detection). + */ + lock->lf_next = block; + lf_addblock(block, lock); +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + lf_print("lf_setlock: blocking on", block); + lf_printlist("lf_setlock", block); + } +#endif /* LOCKF_DEBUG */ + if (error = tsleep((caddr_t)lock, priority, lockstr, 0)) { + /* + * Delete ourselves from the waiting to lock list. 
+ */ + for (block = lock->lf_next; + block != NOLOCKF; + block = block->lf_block) { + if (block->lf_block != lock) + continue; + block->lf_block = block->lf_block->lf_block; + break; + } + /* + * If we did not find ourselves on the list, but + * are still linked onto a lock list, then something + * is very wrong. + */ + if (block == NOLOCKF && lock->lf_next != NOLOCKF) + panic("lf_setlock: lost lock"); + free(lock, M_LOCKF); + return (error); + } + } + /* + * No blocks!! Add the lock. Note that we will + * downgrade or upgrade any overlapping locks this + * process already owns. + * + * Skip over locks owned by other processes. + * Handle any locks that overlap and are owned by ourselves. + */ + prev = &ip->i_lockf; + block = ip->i_lockf; + needtolink = 1; + for (;;) { + if (ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap)) + block = overlap->lf_next; + /* + * Six cases: + * 0) no overlap + * 1) overlap == lock + * 2) overlap contains lock + * 3) lock contains overlap + * 4) overlap starts before lock + * 5) overlap ends after lock + */ + switch (ovcase) { + case 0: /* no overlap */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap; + } + break; + + case 1: /* overlap == lock */ + /* + * If downgrading lock, others may be + * able to acquire it. + */ + if (lock->lf_type == F_RDLCK && + overlap->lf_type == F_WRLCK) + lf_wakelock(overlap); + overlap->lf_type = lock->lf_type; + FREE(lock, M_LOCKF); + lock = overlap; /* for debug output below */ + break; + + case 2: /* overlap contains lock */ + /* + * Check for common starting point and different types. 
+ */ + if (overlap->lf_type == lock->lf_type) { + free(lock, M_LOCKF); + lock = overlap; /* for debug output below */ + break; + } + if (overlap->lf_start == lock->lf_start) { + *prev = lock; + lock->lf_next = overlap; + overlap->lf_start = lock->lf_end + 1; + } else + lf_split(overlap, lock); + lf_wakelock(overlap); + break; + + case 3: /* lock contains overlap */ + /* + * If downgrading lock, others may be able to + * acquire it, otherwise take the list. + */ + if (lock->lf_type == F_RDLCK && + overlap->lf_type == F_WRLCK) { + lf_wakelock(overlap); + } else { + ltmp = lock->lf_block; + lock->lf_block = overlap->lf_block; + lf_addblock(lock, ltmp); + } + /* + * Add the new lock if necessary and delete the overlap. + */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap->lf_next; + prev = &lock->lf_next; + needtolink = 0; + } else + *prev = overlap->lf_next; + free(overlap, M_LOCKF); + continue; + + case 4: /* overlap starts before lock */ + /* + * Add lock after overlap on the list. + */ + lock->lf_next = overlap->lf_next; + overlap->lf_next = lock; + overlap->lf_end = lock->lf_start - 1; + prev = &lock->lf_next; + lf_wakelock(overlap); + needtolink = 0; + continue; + + case 5: /* overlap ends after lock */ + /* + * Add the new lock before overlap. + */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap; + } + overlap->lf_start = lock->lf_end + 1; + lf_wakelock(overlap); + break; + } + break; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + lf_print("lf_setlock: got the lock", lock); + lf_printlist("lf_setlock", lock); + } +#endif /* LOCKF_DEBUG */ + return (0); +} + +/* + * Remove a byte-range lock on an inode. + * + * Generally, find the lock (or an overlap to that lock) + * and remove it (or shrink it), then wakeup anyone we can. 
+ */ +int +lf_clearlock(unlock) + register struct lockf *unlock; +{ + struct inode *ip = unlock->lf_inode; + register struct lockf *lf = ip->i_lockf; + struct lockf *overlap, **prev; + int ovcase; + + if (lf == NOLOCKF) + return (0); +#ifdef LOCKF_DEBUG + if (unlock->lf_type != F_UNLCK) + panic("lf_clearlock: bad type"); + if (lockf_debug & 1) + lf_print("lf_clearlock", unlock); +#endif /* LOCKF_DEBUG */ + prev = &ip->i_lockf; + while (ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap)) { + /* + * Wakeup the list of locks to be retried. + */ + lf_wakelock(overlap); + + switch (ovcase) { + + case 1: /* overlap == lock */ + *prev = overlap->lf_next; + FREE(overlap, M_LOCKF); + break; + + case 2: /* overlap contains lock: split it */ + if (overlap->lf_start == unlock->lf_start) { + overlap->lf_start = unlock->lf_end + 1; + break; + } + lf_split(overlap, unlock); + overlap->lf_next = unlock->lf_next; + break; + + case 3: /* lock contains overlap */ + *prev = overlap->lf_next; + lf = overlap->lf_next; + free(overlap, M_LOCKF); + continue; + + case 4: /* overlap starts before lock */ + overlap->lf_end = unlock->lf_start - 1; + prev = &overlap->lf_next; + lf = overlap->lf_next; + continue; + + case 5: /* overlap ends after lock */ + overlap->lf_start = unlock->lf_end + 1; + break; + } + break; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_printlist("lf_clearlock", unlock); +#endif /* LOCKF_DEBUG */ + return (0); +} + +/* + * Check whether there is a blocking lock, + * and if so return its process identifier. 
+ */ +int +lf_getlock(lock, fl) + register struct lockf *lock; + register struct flock *fl; +{ + register struct lockf *block; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_print("lf_getlock", lock); +#endif /* LOCKF_DEBUG */ + + if (block = lf_getblock(lock)) { + fl->l_type = block->lf_type; + fl->l_whence = SEEK_SET; + fl->l_start = block->lf_start; + if (block->lf_end == -1) + fl->l_len = 0; + else + fl->l_len = block->lf_end - block->lf_start + 1; + if (block->lf_flags & F_POSIX) + fl->l_pid = ((struct proc *)(block->lf_id))->p_pid; + else + fl->l_pid = -1; + } else { + fl->l_type = F_UNLCK; + } + return (0); +} + +/* + * Walk the list of locks for an inode and + * return the first blocking lock. + */ +struct lockf * +lf_getblock(lock) + register struct lockf *lock; +{ + struct lockf **prev, *overlap, *lf = lock->lf_inode->i_lockf; + int ovcase; + + prev = &lock->lf_inode->i_lockf; + while (ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap)) { + /* + * We've found an overlap, see if it blocks us + */ + if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) + return (overlap); + /* + * Nope, point to the next one on the list and + * see if it blocks us + */ + lf = overlap->lf_next; + } + return (NOLOCKF); +} + +/* + * Walk the list of locks for an inode to + * find an overlapping lock (if any). + * + * NOTE: this returns only the FIRST overlapping lock. There + * may be more than one. 
+ */ +int +lf_findoverlap(lf, lock, type, prev, overlap) + register struct lockf *lf; + struct lockf *lock; + int type; + struct lockf ***prev; + struct lockf **overlap; +{ + off_t start, end; + + *overlap = lf; + if (lf == NOLOCKF) + return (0); +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("lf_findoverlap: looking for overlap in", lock); +#endif /* LOCKF_DEBUG */ + start = lock->lf_start; + end = lock->lf_end; + while (lf != NOLOCKF) { + if (((type & SELF) && lf->lf_id != lock->lf_id) || + ((type & OTHERS) && lf->lf_id == lock->lf_id)) { + *prev = &lf->lf_next; + *overlap = lf = lf->lf_next; + continue; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("\tchecking", lf); +#endif /* LOCKF_DEBUG */ + /* + * OK, check for overlap + * + * Six cases: + * 0) no overlap + * 1) overlap == lock + * 2) overlap contains lock + * 3) lock contains overlap + * 4) overlap starts before lock + * 5) overlap ends after lock + */ + if ((lf->lf_end != -1 && start > lf->lf_end) || + (end != -1 && lf->lf_start > end)) { + /* Case 0 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("no overlap\n"); +#endif /* LOCKF_DEBUG */ + if ((type & SELF) && end != -1 && lf->lf_start > end) + return (0); + *prev = &lf->lf_next; + *overlap = lf = lf->lf_next; + continue; + } + if ((lf->lf_start == start) && (lf->lf_end == end)) { + /* Case 1 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap == lock\n"); +#endif /* LOCKF_DEBUG */ + return (1); + } + if ((lf->lf_start <= start) && + (end != -1) && + ((lf->lf_end >= end) || (lf->lf_end == -1))) { + /* Case 2 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap contains lock\n"); +#endif /* LOCKF_DEBUG */ + return (2); + } + if (start <= lf->lf_start && + (end == -1 || + (lf->lf_end != -1 && end >= lf->lf_end))) { + /* Case 3 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("lock contains overlap\n"); +#endif /* LOCKF_DEBUG */ + return (3); + } + if ((lf->lf_start < start) && + ((lf->lf_end >= start) 
|| (lf->lf_end == -1))) { + /* Case 4 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap starts before lock\n"); +#endif /* LOCKF_DEBUG */ + return (4); + } + if ((lf->lf_start > start) && + (end != -1) && + ((lf->lf_end > end) || (lf->lf_end == -1))) { + /* Case 5 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap ends after lock\n"); +#endif /* LOCKF_DEBUG */ + return (5); + } + panic("lf_findoverlap: default"); + } + return (0); +} + +/* + * Add a lock to the end of the blocked list. + */ +void +lf_addblock(lock, blocked) + struct lockf *lock; + struct lockf *blocked; +{ + register struct lockf *lf; + + if (blocked == NOLOCKF) + return; +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) { + lf_print("addblock: adding", blocked); + lf_print("to blocked list of", lock); + } +#endif /* LOCKF_DEBUG */ + if ((lf = lock->lf_block) == NOLOCKF) { + lock->lf_block = blocked; + return; + } + while (lf->lf_block != NOLOCKF) + lf = lf->lf_block; + lf->lf_block = blocked; + return; +} + +/* + * Split a lock and a contained region into + * two or three locks as necessary. + */ +void +lf_split(lock1, lock2) + register struct lockf *lock1; + register struct lockf *lock2; +{ + register struct lockf *splitlock; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) { + lf_print("lf_split", lock1); + lf_print("splitting from", lock2); + } +#endif /* LOCKF_DEBUG */ + /* + * Check to see if spliting into only two pieces. 
+ */ + if (lock1->lf_start == lock2->lf_start) { + lock1->lf_start = lock2->lf_end + 1; + lock2->lf_next = lock1; + return; + } + if (lock1->lf_end == lock2->lf_end) { + lock1->lf_end = lock2->lf_start - 1; + lock2->lf_next = lock1->lf_next; + lock1->lf_next = lock2; + return; + } + /* + * Make a new lock consisting of the last part of + * the encompassing lock + */ + MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK); + bcopy((caddr_t)lock1, (caddr_t)splitlock, sizeof *splitlock); + splitlock->lf_start = lock2->lf_end + 1; + splitlock->lf_block = NOLOCKF; + lock1->lf_end = lock2->lf_start - 1; + /* + * OK, now link it in + */ + splitlock->lf_next = lock1->lf_next; + lock2->lf_next = splitlock; + lock1->lf_next = lock2; +} + +/* + * Wakeup a blocklist + */ +void +lf_wakelock(listhead) + struct lockf *listhead; +{ + register struct lockf *blocklist, *wakelock; + + blocklist = listhead->lf_block; + listhead->lf_block = NOLOCKF; + while (blocklist != NOLOCKF) { + wakelock = blocklist; + blocklist = blocklist->lf_block; + wakelock->lf_block = NOLOCKF; + wakelock->lf_next = NOLOCKF; +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("lf_wakelock: awakening", wakelock); +#endif /* LOCKF_DEBUG */ + wakeup((caddr_t)wakelock); + } +} + +#ifdef LOCKF_DEBUG +/* + * Print out a lock. + */ +void +lf_print(tag, lock) + char *tag; + register struct lockf *lock; +{ + + printf("%s: lock 0x%lx for ", tag, lock); + if (lock->lf_flags & F_POSIX) + printf("proc %d", ((struct proc *)(lock->lf_id))->p_pid); + else + printf("id 0x%x", lock->lf_id); + printf(" in ino %d on dev <%d, %d>, %s, start %d, end %d", + lock->lf_inode->i_number, + major(lock->lf_inode->i_dev), + minor(lock->lf_inode->i_dev), + lock->lf_type == F_RDLCK ? "shared" : + lock->lf_type == F_WRLCK ? "exclusive" : + lock->lf_type == F_UNLCK ? 
"unlock" : + "unknown", lock->lf_start, lock->lf_end); + if (lock->lf_block) + printf(" block 0x%x\n", lock->lf_block); + else + printf("\n"); +} + +void +lf_printlist(tag, lock) + char *tag; + struct lockf *lock; +{ + register struct lockf *lf; + + printf("%s: Lock list for ino %d on dev <%d, %d>:\n", + tag, lock->lf_inode->i_number, + major(lock->lf_inode->i_dev), + minor(lock->lf_inode->i_dev)); + for (lf = lock->lf_inode->i_lockf; lf; lf = lf->lf_next) { + printf("\tlock 0x%lx for ", lf); + if (lf->lf_flags & F_POSIX) + printf("proc %d", ((struct proc *)(lf->lf_id))->p_pid); + else + printf("id 0x%x", lf->lf_id); + printf(", %s, start %d, end %d", + lf->lf_type == F_RDLCK ? "shared" : + lf->lf_type == F_WRLCK ? "exclusive" : + lf->lf_type == F_UNLCK ? "unlock" : + "unknown", lf->lf_start, lf->lf_end); + if (lf->lf_block) + printf(" block 0x%x\n", lf->lf_block); + else + printf("\n"); + } +} +#endif /* LOCKF_DEBUG */ diff --git a/sys/ufs/ufs/ufs_lookup.c b/sys/ufs/ufs/ufs_lookup.c new file mode 100644 index 00000000000..87c6802c79f --- /dev/null +++ b/sys/ufs/ufs/ufs_lookup.c @@ -0,0 +1,970 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_lookup.c 8.6 (Berkeley) 4/1/94 + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +struct nchstats nchstats; +#ifdef DIAGNOSTIC +int dirchk = 1; +#else +int dirchk = 0; +#endif + +#define FSFMT(vp) ((vp)->v_mount->mnt_maxsymlinklen <= 0) + +/* + * Convert a component of a pathname into a pointer to a locked inode. + * This is a very central and rather complicated routine. 
+ * If the file system is not maintained in a strict tree hierarchy, + * this can result in a deadlock situation (see comments in code below). + * + * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending + * on whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * If flag has LOCKPARENT or'ed into it and the target of the pathname + * exists, lookup returns both the target and its parent directory locked. + * When creating or renaming and LOCKPARENT is specified, the target may + * not be ".". When deleting and LOCKPARENT is specified, the target may + * be "."., but the caller must check to ensure it does an vrele and vput + * instead of two vputs. + * + * Overall outline of ufs_lookup: + * + * check accessibility of directory + * look for name in cache, if found, then if at end of path + * and deleting or creating, drop it, else return name + * search for name in directory, to found or notfound + * notfound: + * if creating, return locked directory, leaving info on available slots + * else return error + * found: + * if at end of path and deleting, return information to allow delete + * if at end of path and rewriting (RENAME and LOCKPARENT), lock target + * inode and return info to allow rewrite + * if not at end, add name to cache; if at end and neither creating + * nor deleting, add name to cache + */ +int +ufs_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + register struct vnode *vdp; /* vnode for directory being searched */ + register struct inode *dp; /* inode for directory being searched */ + struct buf *bp; /* a buffer of directory entries */ + register struct direct *ep; /* the current directory entry */ + int entryoffsetinblock; /* offset of ep in bp's buffer */ + enum {NONE, COMPACT, 
FOUND} slotstatus; + doff_t slotoffset; /* offset of area with free space */ + int slotsize; /* size of area at slotoffset */ + int slotfreespace; /* amount of space free in slot */ + int slotneeded; /* size of the entry we're seeking */ + int numdirpasses; /* strategy for directory search */ + doff_t endsearch; /* offset to end directory search */ + doff_t prevoff; /* prev entry dp->i_offset */ + struct vnode *pdp; /* saved dp during symlink work */ + struct vnode *tdp; /* returned by VFS_VGET */ + doff_t enduseful; /* pointer past last used dir slot */ + u_long bmask; /* block offset mask */ + int lockparent; /* 1 => lockparent flag is set */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int namlen, error; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + struct ucred *cred = cnp->cn_cred; + int flags = cnp->cn_flags; + int nameiop = cnp->cn_nameiop; + + bp = NULL; + slotoffset = -1; + *vpp = NULL; + vdp = ap->a_dvp; + dp = VTOI(vdp); + lockparent = flags & LOCKPARENT; + wantparent = flags & (LOCKPARENT|WANTPARENT); + + /* + * Check accessiblity of directory. + */ + if ((dp->i_mode & IFMT) != IFDIR) + return (ENOTDIR); + if (error = VOP_ACCESS(vdp, VEXEC, cred, cnp->cn_proc)) + return (error); + + /* + * We now have a segment name to search for, and a directory to search. + * + * Before tediously performing a linear scan of the directory, + * check the name cache to see if the directory/name pair + * we are looking for is known already. + */ + if (error = cache_lookup(vdp, vpp, cnp)) { + int vpid; /* capability number of vnode */ + + if (error == ENOENT) + return (error); + /* + * Get the next vnode in the path. + * See comment below starting `Step through' for + * an explaination of the locking protocol. + */ + pdp = vdp; + dp = VTOI(*vpp); + vdp = *vpp; + vpid = vdp->v_id; + if (pdp == vdp) { /* lookup on "." 
*/ + VREF(vdp); + error = 0; + } else if (flags & ISDOTDOT) { + VOP_UNLOCK(pdp); + error = vget(vdp, 1); + if (!error && lockparent && (flags & ISLASTCN)) + error = VOP_LOCK(pdp); + } else { + error = vget(vdp, 1); + if (!lockparent || error || !(flags & ISLASTCN)) + VOP_UNLOCK(pdp); + } + /* + * Check that the capability number did not change + * while we were waiting for the lock. + */ + if (!error) { + if (vpid == vdp->v_id) + return (0); + vput(vdp); + if (lockparent && pdp != vdp && (flags & ISLASTCN)) + VOP_UNLOCK(pdp); + } + if (error = VOP_LOCK(pdp)) + return (error); + vdp = pdp; + dp = VTOI(pdp); + *vpp = NULL; + } + + /* + * Suppress search for slots unless creating + * file and at end of pathname, in which case + * we watch for a place to put the new file in + * case it doesn't already exist. + */ + slotstatus = FOUND; + slotfreespace = slotsize = slotneeded = 0; + if ((nameiop == CREATE || nameiop == RENAME) && + (flags & ISLASTCN)) { + slotstatus = NONE; + slotneeded = (sizeof(struct direct) - MAXNAMLEN + + cnp->cn_namelen + 3) &~ 3; + } + + /* + * If there is cached information on a previous search of + * this directory, pick up where we last left off. + * We cache only lookups as these are the most common + * and have the greatest payoff. Caching CREATE has little + * benefit as it usually must search the entire directory + * to determine that the entry does not exist. Caching the + * location of the last DELETE or RENAME has not reduced + * profiling time and hence has been removed in the interest + * of simplicity. 
+ */ + bmask = VFSTOUFS(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; + if (nameiop != LOOKUP || dp->i_diroff == 0 || + dp->i_diroff > dp->i_size) { + entryoffsetinblock = 0; + dp->i_offset = 0; + numdirpasses = 1; + } else { + dp->i_offset = dp->i_diroff; + if ((entryoffsetinblock = dp->i_offset & bmask) && + (error = VOP_BLKATOFF(vdp, (off_t)dp->i_offset, NULL, &bp))) + return (error); + numdirpasses = 2; + nchstats.ncs_2passes++; + } + prevoff = dp->i_offset; + endsearch = roundup(dp->i_size, DIRBLKSIZ); + enduseful = 0; + +searchloop: + while (dp->i_offset < endsearch) { + /* + * If necessary, get the next directory block. + */ + if ((dp->i_offset & bmask) == 0) { + if (bp != NULL) + brelse(bp); + if (error = + VOP_BLKATOFF(vdp, (off_t)dp->i_offset, NULL, &bp)) + return (error); + entryoffsetinblock = 0; + } + /* + * If still looking for a slot, and at a DIRBLKSIZE + * boundary, have to start looking for free space again. + */ + if (slotstatus == NONE && + (entryoffsetinblock & (DIRBLKSIZ - 1)) == 0) { + slotoffset = -1; + slotfreespace = 0; + } + /* + * Get pointer to next entry. + * Full validation checks are slow, so we only check + * enough to insure forward progress through the + * directory. Complete checks can be run by patching + * "dirchk" to be true. + */ + ep = (struct direct *)((char *)bp->b_data + entryoffsetinblock); + if (ep->d_reclen == 0 || + dirchk && ufs_dirbadentry(vdp, ep, entryoffsetinblock)) { + int i; + + ufs_dirbad(dp, dp->i_offset, "mangled entry"); + i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); + dp->i_offset += i; + entryoffsetinblock += i; + continue; + } + + /* + * If an appropriate sized slot has not yet been found, + * check to see if one is available. Also accumulate space + * in the current block so that we can determine if + * compaction is viable. 
+ */ + if (slotstatus != FOUND) { + int size = ep->d_reclen; + + if (ep->d_ino != 0) + size -= DIRSIZ(FSFMT(vdp), ep); + if (size > 0) { + if (size >= slotneeded) { + slotstatus = FOUND; + slotoffset = dp->i_offset; + slotsize = ep->d_reclen; + } else if (slotstatus == NONE) { + slotfreespace += size; + if (slotoffset == -1) + slotoffset = dp->i_offset; + if (slotfreespace >= slotneeded) { + slotstatus = COMPACT; + slotsize = dp->i_offset + + ep->d_reclen - slotoffset; + } + } + } + } + + /* + * Check for a name match. + */ + if (ep->d_ino) { +# if (BYTE_ORDER == LITTLE_ENDIAN) + if (vdp->v_mount->mnt_maxsymlinklen > 0) + namlen = ep->d_namlen; + else + namlen = ep->d_type; +# else + namlen = ep->d_namlen; +# endif + if (namlen == cnp->cn_namelen && + !bcmp(cnp->cn_nameptr, ep->d_name, + (unsigned)namlen)) { + /* + * Save directory entry's inode number and + * reclen in ndp->ni_ufs area, and release + * directory buffer. + */ + dp->i_ino = ep->d_ino; + dp->i_reclen = ep->d_reclen; + brelse(bp); + goto found; + } + } + prevoff = dp->i_offset; + dp->i_offset += ep->d_reclen; + entryoffsetinblock += ep->d_reclen; + if (ep->d_ino) + enduseful = dp->i_offset; + } +/* notfound: */ + /* + * If we started in the middle of the directory and failed + * to find our target, we must check the beginning as well. + */ + if (numdirpasses == 2) { + numdirpasses--; + dp->i_offset = 0; + endsearch = dp->i_diroff; + goto searchloop; + } + if (bp != NULL) + brelse(bp); + /* + * If creating, and at end of pathname and current + * directory has not been removed, then can consider + * allowing file to be created. + */ + if ((nameiop == CREATE || nameiop == RENAME) && + (flags & ISLASTCN) && dp->i_nlink != 0) { + /* + * Access for write is interpreted as allowing + * creation of files in the directory. + */ + if (error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_proc)) + return (error); + /* + * Return an indication of where the new directory + * entry should be put. 
If we didn't find a slot, + * then set dp->i_count to 0 indicating + * that the new slot belongs at the end of the + * directory. If we found a slot, then the new entry + * can be put in the range from dp->i_offset to + * dp->i_offset + dp->i_count. + */ + if (slotstatus == NONE) { + dp->i_offset = roundup(dp->i_size, DIRBLKSIZ); + dp->i_count = 0; + enduseful = dp->i_offset; + } else { + dp->i_offset = slotoffset; + dp->i_count = slotsize; + if (enduseful < slotoffset + slotsize) + enduseful = slotoffset + slotsize; + } + dp->i_endoff = roundup(enduseful, DIRBLKSIZ); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * We return with the directory locked, so that + * the parameters we set up above will still be + * valid if we actually decide to do a direnter(). + * We return ni_vp == NULL to indicate that the entry + * does not currently exist; we leave a pointer to + * the (locked) directory inode in ndp->ni_dvp. + * The pathname buffer is saved so that the name + * can be obtained later. + * + * NB - if the directory is unlocked, then this + * information cannot be used. + */ + cnp->cn_flags |= SAVENAME; + if (!lockparent) + VOP_UNLOCK(vdp); + return (EJUSTRETURN); + } + /* + * Insert name into cache (as non-existent) if appropriate. + */ + if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) + cache_enter(vdp, *vpp, cnp); + return (ENOENT); + +found: + if (numdirpasses == 2) + nchstats.ncs_pass2++; + /* + * Check that directory length properly reflects presence + * of this entry. + */ + if (entryoffsetinblock + DIRSIZ(FSFMT(vdp), ep) > dp->i_size) { + ufs_dirbad(dp, dp->i_offset, "i_size too small"); + dp->i_size = entryoffsetinblock + DIRSIZ(FSFMT(vdp), ep); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + } + + /* + * Found component in pathname. + * If the final component of path name, save information + * in the cache as to where the entry was found. 
+ */ + if ((flags & ISLASTCN) && nameiop == LOOKUP) + dp->i_diroff = dp->i_offset &~ (DIRBLKSIZ - 1); + + /* + * If deleting, and at end of pathname, return + * parameters which can be used to remove file. + * If the wantparent flag isn't set, we return only + * the directory (in ndp->ni_dvp), otherwise we go + * on and lock the inode, being careful with ".". + */ + if (nameiop == DELETE && (flags & ISLASTCN)) { + /* + * Write access to directory required to delete files. + */ + if (error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_proc)) + return (error); + /* + * Return pointer to current entry in dp->i_offset, + * and distance past previous entry (if there + * is a previous entry in this block) in dp->i_count. + * Save directory inode pointer in ndp->ni_dvp for dirremove(). + */ + if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0) + dp->i_count = 0; + else + dp->i_count = dp->i_offset - prevoff; + if (dp->i_number == dp->i_ino) { + VREF(vdp); + *vpp = vdp; + return (0); + } + if (error = VFS_VGET(vdp->v_mount, dp->i_ino, &tdp)) + return (error); + /* + * If directory is "sticky", then user must own + * the directory, or the file in it, else she + * may not delete it (unless she's root). This + * implements append-only directories. + */ + if ((dp->i_mode & ISVTX) && + cred->cr_uid != 0 && + cred->cr_uid != dp->i_uid && + VTOI(tdp)->i_uid != cred->cr_uid) { + vput(tdp); + return (EPERM); + } + *vpp = tdp; + if (!lockparent) + VOP_UNLOCK(vdp); + return (0); + } + + /* + * If rewriting (RENAME), return the inode and the + * information required to rewrite the present directory + * Must get inode of directory entry to verify it's a + * regular file, or empty directory. + */ + if (nameiop == RENAME && wantparent && + (flags & ISLASTCN)) { + if (error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_proc)) + return (error); + /* + * Careful about locking second inode. + * This can only occur if the target is ".". 
+ */ + if (dp->i_number == dp->i_ino) + return (EISDIR); + if (error = VFS_VGET(vdp->v_mount, dp->i_ino, &tdp)) + return (error); + *vpp = tdp; + cnp->cn_flags |= SAVENAME; + if (!lockparent) + VOP_UNLOCK(vdp); + return (0); + } + + /* + * Step through the translation in the name. We do not `vput' the + * directory because we may need it again if a symbolic link + * is relative to the current directory. Instead we save it + * unlocked as "pdp". We must get the target inode before unlocking + * the directory to insure that the inode will not be removed + * before we get it. We prevent deadlock by always fetching + * inodes from the root, moving down the directory tree. Thus + * when following backward pointers ".." we must unlock the + * parent directory before getting the requested directory. + * There is a potential race condition here if both the current + * and parent directories are removed before the VFS_VGET for the + * inode associated with ".." returns. We hope that this occurs + * infrequently since we cannot avoid this race condition without + * implementing a sophisticated deadlock detection algorithm. + * Note also that this simple deadlock detection scheme will not + * work if the file system has any hard links other than ".." + * that point backwards in the directory structure. + */ + pdp = vdp; + if (flags & ISDOTDOT) { + VOP_UNLOCK(pdp); /* race to get the inode */ + if (error = VFS_VGET(vdp->v_mount, dp->i_ino, &tdp)) { + VOP_LOCK(pdp); + return (error); + } + if (lockparent && (flags & ISLASTCN) && + (error = VOP_LOCK(pdp))) { + vput(tdp); + return (error); + } + *vpp = tdp; + } else if (dp->i_number == dp->i_ino) { + VREF(vdp); /* we want ourself, ie "." */ + *vpp = vdp; + } else { + if (error = VFS_VGET(vdp->v_mount, dp->i_ino, &tdp)) + return (error); + if (!lockparent || !(flags & ISLASTCN)) + VOP_UNLOCK(pdp); + *vpp = tdp; + } + + /* + * Insert name into cache if appropriate. 
+ */ + if (cnp->cn_flags & MAKEENTRY) + cache_enter(vdp, *vpp, cnp); + return (0); +} +/* + * Report a damaged directory: print the mount point name, the + * directory inode number, the offset and the reason string, then + * panic unless MNT_RDONLY is set in mnt_stat.f_flags. + */ +void +ufs_dirbad(ip, offset, how) + struct inode *ip; + doff_t offset; + char *how; +{ + struct mount *mp; + + mp = ITOV(ip)->v_mount; + (void)printf("%s: bad dir ino %d at offset %d: %s\n", + mp->mnt_stat.f_mntonname, ip->i_number, offset, how); + if ((mp->mnt_stat.f_flags & MNT_RDONLY) == 0) + panic("bad dir"); +} + +/* + * Do consistency checking on a directory entry: + * record length must be multiple of 4 + * entry must fit in rest of its DIRBLKSIZ block + * record must be large enough to contain entry + * name is not longer than MAXNAMLEN + * name must be as long as advertised, and null terminated + * Returns 1 if any check fails, 0 if the entry passes. + * On little-endian hosts with mnt_maxsymlinklen not positive the + * name length is read from d_type rather than d_namlen. + * NOTE(review): the two printf calls look like leftover debugging + * output - confirm before keeping them. + */ +int +ufs_dirbadentry(dp, ep, entryoffsetinblock) + struct vnode *dp; + register struct direct *ep; + int entryoffsetinblock; +{ + register int i; + int namlen; + +# if (BYTE_ORDER == LITTLE_ENDIAN) + if (dp->v_mount->mnt_maxsymlinklen > 0) + namlen = ep->d_namlen; + else + namlen = ep->d_type; +# else + namlen = ep->d_namlen; +# endif + if ((ep->d_reclen & 0x3) != 0 || + ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) || + ep->d_reclen < DIRSIZ(FSFMT(dp), ep) || namlen > MAXNAMLEN) { + /*return (1); */ + printf("First bad\n"); + goto bad; + } + for (i = 0; i < namlen; i++) + if (ep->d_name[i] == '\0') { + /*return (1); */ + printf("Second bad\n"); + goto bad; + } + if (ep->d_name[i]) + goto bad; + return (ep->d_name[i]); +bad: + return(1); +} + +/* + * Write a directory entry after a call to namei, using the parameters + * that it left in nameidata. The argument ip is the inode which the new + * directory entry will refer to. Dvp is a pointer to the directory to + * be written, which was left locked by namei. Remaining parameters + * (dp->i_offset, dp->i_count) indicate how the space for the new + * entry is to be obtained.
+ */ +int +ufs_direnter(ip, dvp, cnp) + struct inode *ip; + struct vnode *dvp; + register struct componentname *cnp; +{ + register struct direct *ep, *nep; + register struct inode *dp; + struct buf *bp; + struct direct newdir; + struct iovec aiov; + struct uio auio; + u_int dsize; + int error, loc, newentrysize, spacefree; + char *dirbuf; + /* Assemble the new entry in newdir, then place it in the directory. */ +#ifdef DIAGNOSTIC + if ((cnp->cn_flags & SAVENAME) == 0) + panic("direnter: missing name"); +#endif + dp = VTOI(dvp); + newdir.d_ino = ip->i_number; + newdir.d_namlen = cnp->cn_namelen; + bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1); + if (dvp->v_mount->mnt_maxsymlinklen > 0) + newdir.d_type = IFTODT(ip->i_mode); + else { /* d_type is zeroed and, on little-endian hosts, exchanged with d_namlen */ + newdir.d_type = 0; +# if (BYTE_ORDER == LITTLE_ENDIAN) + { u_char tmp = newdir.d_namlen; + newdir.d_namlen = newdir.d_type; + newdir.d_type = tmp; } +# endif + } + newentrysize = DIRSIZ(FSFMT(dvp), &newdir); + if (dp->i_count == 0) { + /* + * If dp->i_count is 0, then namei could find no + * space in the directory. Here, dp->i_offset will + * be on a directory block boundary and we will write the + * new entry into a fresh block. The entry is written with + * IO_SYNC and its d_reclen covers the whole DIRBLKSIZ block. + */ + if (dp->i_offset & (DIRBLKSIZ - 1)) + panic("ufs_direnter: newblk"); + auio.uio_offset = dp->i_offset; + newdir.d_reclen = DIRBLKSIZ; + auio.uio_resid = newentrysize; + aiov.iov_len = newentrysize; + aiov.iov_base = (caddr_t)&newdir; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_procp = (struct proc *)0; + error = VOP_WRITE(dvp, &auio, IO_SYNC, cnp->cn_cred); + if (DIRBLKSIZ > + VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_bsize) + /* XXX should grow with balloc() */ + panic("ufs_direnter: frag size"); + else if (!error) { + dp->i_size = roundup(dp->i_size, DIRBLKSIZ); + dp->i_flag |= IN_CHANGE; + } + return (error); + } + + /* + * If dp->i_count is non-zero, then namei found space + * for the new entry in the range dp->i_offset to + * dp->i_offset + dp->i_count in the directory.
+ * To use this space, we may have to compact the entries located + * there, by copying them together towards the beginning of the + * block, leaving the free space in one usable chunk at the end. + */ + + /* + * Increase size of directory if entry eats into new space. + * This should never push the size past a new multiple of + * DIRBLKSIZE. + * + * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN. + */ + if (dp->i_offset + dp->i_count > dp->i_size) + dp->i_size = dp->i_offset + dp->i_count; + /* + * Get the block containing the space for the new directory entry. + */ + if (error = VOP_BLKATOFF(dvp, (off_t)dp->i_offset, &dirbuf, &bp)) + return (error); + /* + * Find space for the new entry. In the simple case, the entry at + * offset base will have the space. If it does not, then namei + * arranged that compacting the region dp->i_offset to + * dp->i_offset + dp->i_count would yield the + * space. + * + * In the loop, spacefree accumulates the slack (d_reclen minus + * DIRSIZ) of every entry visited, and dsize is the packed size + * of the entry currently at ep. + */ + ep = (struct direct *)dirbuf; + dsize = DIRSIZ(FSFMT(dvp), ep); + spacefree = ep->d_reclen - dsize; + for (loc = ep->d_reclen; loc < dp->i_count; ) { + nep = (struct direct *)(dirbuf + loc); + if (ep->d_ino) { + /* trim the existing slot */ + ep->d_reclen = dsize; + ep = (struct direct *)((char *)ep + dsize); + } else { + /* overwrite; nothing there; header is ours */ + spacefree += dsize; + } + dsize = DIRSIZ(FSFMT(dvp), nep); + spacefree += nep->d_reclen - dsize; + loc += nep->d_reclen; + bcopy((caddr_t)nep, (caddr_t)ep, dsize); /* slide nep down to ep */ + } + /* + * Update the pointer fields in the previous entry (if any), + * copy in the new entry, and write out the block.
+ */ + if (ep->d_ino == 0) { /* ep is free: the new entry takes over its whole record */ + if (spacefree + dsize < newentrysize) + panic("ufs_direnter: compact1"); + newdir.d_reclen = spacefree + dsize; + } else { /* ep is in use: shrink it to dsize and put the new entry after it */ + if (spacefree < newentrysize) + panic("ufs_direnter: compact2"); + newdir.d_reclen = spacefree; + ep->d_reclen = dsize; + ep = (struct direct *)((char *)ep + dsize); + } + bcopy((caddr_t)&newdir, (caddr_t)ep, (u_int)newentrysize); + error = VOP_BWRITE(bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; /* truncation below happens only when i_endoff is set and below i_size */ + if (!error && dp->i_endoff && dp->i_endoff < dp->i_size) + error = VOP_TRUNCATE(dvp, (off_t)dp->i_endoff, IO_SYNC, + cnp->cn_cred, cnp->cn_proc); + return (error); +} + +/* + * Remove a directory entry after a call to namei, using + * the parameters which it left in nameidata. The field + * dp->i_offset contains the offset into the directory of the + * entry to be eliminated. The dp->i_count field contains the + * size of the previous record in the directory. If this + * is 0, the first entry is being deleted, so we need only + * zero the inode number to mark the entry as free. If the + * entry is not the first in the directory, we must reclaim + * the space of the now empty record by adding the record size + * to the size of the previous entry. + * + * Returns the error from VOP_BLKATOFF or VOP_BWRITE. The cnp + * argument is not referenced in the body. + */ +int +ufs_dirremove(dvp, cnp) + struct vnode *dvp; + struct componentname *cnp; +{ + register struct inode *dp; + struct direct *ep; + struct buf *bp; + int error; + + dp = VTOI(dvp); + if (dp->i_count == 0) { + /* + * First entry in block: set d_ino to zero. + */ + if (error = + VOP_BLKATOFF(dvp, (off_t)dp->i_offset, (char **)&ep, &bp)) + return (error); + ep->d_ino = 0; + error = VOP_BWRITE(bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + return (error); + } + /* + * Collapse new free space into previous entry.
+ */ + if (error = VOP_BLKATOFF(dvp, (off_t)(dp->i_offset - dp->i_count), + (char **)&ep, &bp)) + return (error); + ep->d_reclen += dp->i_reclen; + error = VOP_BWRITE(bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + return (error); +} + +/* + * Rewrite an existing directory entry to point at the inode + * supplied. The parameters describing the directory entry are + * set up by a call to namei. + * + * d_type is rewritten only when mnt_maxsymlinklen is positive. + * Returns the error from VOP_BLKATOFF or VOP_BWRITE. + */ +int +ufs_dirrewrite(dp, ip, cnp) + struct inode *dp, *ip; + struct componentname *cnp; +{ + struct buf *bp; + struct direct *ep; + struct vnode *vdp = ITOV(dp); + int error; + + if (error = VOP_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp)) + return (error); + ep->d_ino = ip->i_number; + if (vdp->v_mount->mnt_maxsymlinklen > 0) + ep->d_type = IFTODT(ip->i_mode); + error = VOP_BWRITE(bp); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + return (error); +} + +/* + * Check if a directory is empty or not. + * Inode supplied must be locked. + * + * Returns 1 when every in-use entry is dot or dot-dot (the latter + * with d_ino equal to parentino), and 0 otherwise, including on a + * read error, a non-zero residual, or a zero d_reclen. + * + * Using a struct dirtemplate here is not precisely + * what we want, but better than using a struct direct. + * + * NB: does not handle corrupted directories. + */ +int +ufs_dirempty(ip, parentino, cred) + register struct inode *ip; + ino_t parentino; + struct ucred *cred; +{ + register off_t off; + struct dirtemplate dbuf; + register struct direct *dp = (struct direct *)&dbuf; + int error, count, namlen; +#define MINDIRSIZ (sizeof (struct dirtemplate) / 2) + + for (off = 0; off < ip->i_size; off += dp->d_reclen) { + error = vn_rdwr(UIO_READ, ITOV(ip), (caddr_t)dp, MINDIRSIZ, off, + UIO_SYSSPACE, IO_NODELOCKED, cred, &count, (struct proc *)0); + /* + * Since we read MINDIRSIZ, residual must + * be 0 unless we're at end of file. + */ + if (error || count != 0) + return (0); + /* avoid infinite loops */ + if (dp->d_reclen == 0) + return (0); + /* skip empty entries */ + if (dp->d_ino == 0) + continue; + /* accept only "." and ".."
*/ +# if (BYTE_ORDER == LITTLE_ENDIAN) + if (ITOV(ip)->v_mount->mnt_maxsymlinklen > 0) + namlen = dp->d_namlen; + else + namlen = dp->d_type; +# else + namlen = dp->d_namlen; +# endif + if (namlen > 2) + return (0); + if (dp->d_name[0] != '.') + return (0); + /* + * At this point namlen must be 1 or 2. + * 1 implies ".", 2 implies ".." if second + * char is also "." + */ + if (namlen == 1) + continue; + if (dp->d_name[1] == '.' && dp->d_ino == parentino) + continue; + return (0); + } + return (1); +} + +/* + * Check if source directory is in the path of the target directory. + * Target is supplied locked, source is unlocked. + * The target is always vput before returning. + * + * Walks from target toward the root by reading the dot-dot entry + * of each directory. Returns EEXIST if target is source, EINVAL + * if source is met on the way up, ENOTDIR if a non-directory or a + * malformed dot-dot entry is met, otherwise 0 or the error from + * vn_rdwr or VFS_VGET. + */ +int +ufs_checkpath(source, target, cred) + struct inode *source, *target; + struct ucred *cred; +{ + struct vnode *vp; + int error, rootino, namlen; + struct dirtemplate dirbuf; + + vp = ITOV(target); + if (target->i_number == source->i_number) { + error = EEXIST; + goto out; + } + rootino = ROOTINO; + error = 0; + if (target->i_number == rootino) + goto out; + + for (;;) { + if (vp->v_type != VDIR) { + error = ENOTDIR; + break; + } + error = vn_rdwr(UIO_READ, vp, (caddr_t)&dirbuf, + sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, + IO_NODELOCKED, cred, (int *)0, (struct proc *)0); + if (error != 0) + break; +# if (BYTE_ORDER == LITTLE_ENDIAN) + if (vp->v_mount->mnt_maxsymlinklen > 0) + namlen = dirbuf.dotdot_namlen; + else + namlen = dirbuf.dotdot_type; +# else + namlen = dirbuf.dotdot_namlen; +# endif + if (namlen != 2 || + dirbuf.dotdot_name[0] != '.' || + dirbuf.dotdot_name[1] != '.') { + error = ENOTDIR; + break; + } + if (dirbuf.dotdot_ino == source->i_number) { + error = EINVAL; + break; + } + if (dirbuf.dotdot_ino == rootino) + break; + vput(vp); /* NOTE(review): vp->v_mount is still read on the next line after this vput - confirm that is safe */ + if (error = VFS_VGET(vp->v_mount, dirbuf.dotdot_ino, &vp)) { + vp = NULL; + break; + } + } + +out: + if (error == ENOTDIR) + printf("checkpath: ..
not a directory\n"); + if (vp != NULL) + vput(vp); + return (error); +} diff --git a/sys/ufs/ufs/ufs_quota.c b/sys/ufs/ufs/ufs_quota.c new file mode 100644 index 00000000000..15cb1cfbb23 --- /dev/null +++ b/sys/ufs/ufs/ufs_quota.c @@ -0,0 +1,938 @@ +/* + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Robert Elz at The University of Melbourne. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_quota.c 8.2 (Berkeley) 12/30/93 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * Quota name to error message mapping. + * Indexed by quota type. + */ +static char *quotatypes[] = INITQFNAMES; + +/* + * Set up the quotas for an inode. + * + * This routine completely defines the semantics of quotas. + * If other criteria are to be used to establish quotas, the + * MAXQUOTAS value in quotas.h should be increased, and the + * additional dquots set up here. + * + * A quota slot that is already set (not NODQUOT) is left alone. + * Returns 0, or the first dqget error other than EINVAL. + */ +int +getinoquota(ip) + register struct inode *ip; +{ + struct ufsmount *ump; + struct vnode *vp = ITOV(ip); + int error; + + ump = VFSTOUFS(vp->v_mount); + /* + * Set up the user quota based on file uid. + * EINVAL means that quotas are not enabled. + */ + if (ip->i_dquot[USRQUOTA] == NODQUOT && + (error = + dqget(vp, ip->i_uid, ump, USRQUOTA, &ip->i_dquot[USRQUOTA])) && + error != EINVAL) + return (error); + /* + * Set up the group quota based on file gid. + * EINVAL means that quotas are not enabled. + */ + if (ip->i_dquot[GRPQUOTA] == NODQUOT && + (error = + dqget(vp, ip->i_gid, ump, GRPQUOTA, &ip->i_dquot[GRPQUOTA])) && + error != EINVAL) + return (error); + return (0); +} + +/* + * Update disk usage, and take corrective action.
+ */ +int +chkdq(ip, change, cred, flags) + register struct inode *ip; + long change; + struct ucred *cred; + int flags; +{ + register struct dquot *dq; + register int i; + int ncurblocks, error; + +#ifdef DIAGNOSTIC + if ((flags & CHOWN) == 0) + chkdquot(ip); +#endif + if (change == 0) + return (0); + if (change < 0) { /* usage shrinks: never refused; the count is clamped at zero and DQ_BLKS cleared */ + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + while (dq->dq_flags & DQ_LOCK) { + dq->dq_flags |= DQ_WANT; + sleep((caddr_t)dq, PINOD+1); + } + ncurblocks = dq->dq_curblocks + change; + if (ncurblocks >= 0) + dq->dq_curblocks = ncurblocks; + else + dq->dq_curblocks = 0; + dq->dq_flags &= ~DQ_BLKS; + dq->dq_flags |= DQ_MOD; + } + return (0); + } + if ((flags & FORCE) == 0 && cred->cr_uid != 0) { /* limits are checked only without FORCE and for a non-zero cr_uid */ + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + if (error = chkdqchg(ip, change, cred, i)) + return (error); + } + } /* checks passed or were skipped: charge every attached dquot */ + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + while (dq->dq_flags & DQ_LOCK) { + dq->dq_flags |= DQ_WANT; + sleep((caddr_t)dq, PINOD+1); + } + dq->dq_curblocks += change; + dq->dq_flags |= DQ_MOD; + } + return (0); +} + +/* + * Check for a valid change to a user's allocation. + * Issue an error message if appropriate. + * + * Returns EDQUOT if the change must be refused, 0 otherwise. + * Messages are sent with uprintf only when ip->i_uid equals + * cred->cr_uid; DQ_BLKS suppresses repeats of the failure message. + */ +int +chkdqchg(ip, change, cred, type) + struct inode *ip; + long change; + struct ucred *cred; + int type; +{ + register struct dquot *dq = ip->i_dquot[type]; + long ncurblocks = dq->dq_curblocks + change; + + /* + * If user would exceed their hard limit, disallow space allocation. + * A zero dq_bhardlimit means no hard limit is applied. + */ + if (ncurblocks >= dq->dq_bhardlimit && dq->dq_bhardlimit) { + if ((dq->dq_flags & DQ_BLKS) == 0 && + ip->i_uid == cred->cr_uid) { + uprintf("\n%s: write failed, %s disk limit reached\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type]); + dq->dq_flags |= DQ_BLKS; + } + return (EDQUOT); + } + /* + * If user is over their soft limit for too long, disallow space + * allocation.
Reset time limit as they cross their soft limit. + */ + if (ncurblocks >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) { + if (dq->dq_curblocks < dq->dq_bsoftlimit) { /* crossing the soft limit now: set the grace deadline and allow */ + dq->dq_btime = time.tv_sec + + VFSTOUFS(ITOV(ip)->v_mount)->um_btime[type]; + if (ip->i_uid == cred->cr_uid) + uprintf("\n%s: warning, %s %s\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type], "disk quota exceeded"); + return (0); + } + if (time.tv_sec > dq->dq_btime) { + if ((dq->dq_flags & DQ_BLKS) == 0 && + ip->i_uid == cred->cr_uid) { + uprintf("\n%s: write failed, %s %s\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type], + "disk quota exceeded for too long"); + dq->dq_flags |= DQ_BLKS; + } + return (EDQUOT); + } + } + return (0); +} + +/* + * Check the inode limit, applying corrective action. + * + * change is added to dq_curinodes of every dquot attached to ip. + * A negative change is never refused and the count is clamped at + * zero. A positive change is first checked with chkiqchg unless + * FORCE is set or cr_uid is 0. Returns 0 or the chkiqchg error. + */ +int +chkiq(ip, change, cred, flags) + register struct inode *ip; + long change; + struct ucred *cred; + int flags; +{ + register struct dquot *dq; + register int i; + int ncurinodes, error; + +#ifdef DIAGNOSTIC + if ((flags & CHOWN) == 0) + chkdquot(ip); +#endif + if (change == 0) + return (0); + if (change < 0) { + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + while (dq->dq_flags & DQ_LOCK) { + dq->dq_flags |= DQ_WANT; + sleep((caddr_t)dq, PINOD+1); + } + ncurinodes = dq->dq_curinodes + change; + if (ncurinodes >= 0) + dq->dq_curinodes = ncurinodes; + else + dq->dq_curinodes = 0; + dq->dq_flags &= ~DQ_INODS; + dq->dq_flags |= DQ_MOD; + } + return (0); + } + if ((flags & FORCE) == 0 && cred->cr_uid != 0) { + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + if (error = chkiqchg(ip, change, cred, i)) + return (error); + } + } + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + while (dq->dq_flags & DQ_LOCK) { + dq->dq_flags |= DQ_WANT; + sleep((caddr_t)dq, PINOD+1); + } + dq->dq_curinodes += change; + dq->dq_flags |= DQ_MOD; + } + return (0); +} + +/* + * Check
for a valid change to a users allocation. + * Issue an error message if appropriate. + */ +int +chkiqchg(ip, change, cred, type) + struct inode *ip; + long change; + struct ucred *cred; + int type; +{ + register struct dquot *dq = ip->i_dquot[type]; + long ncurinodes = dq->dq_curinodes + change; + + /* + * If user would exceed their hard limit, disallow inode allocation. + * A zero dq_ihardlimit means no hard limit is applied. The + * message is sent only when DQ_INODS is clear and ip->i_uid + * equals cred->cr_uid; DQ_INODS is then set. + */ + if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) { + if ((dq->dq_flags & DQ_INODS) == 0 && + ip->i_uid == cred->cr_uid) { + uprintf("\n%s: write failed, %s inode limit reached\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type]); + dq->dq_flags |= DQ_INODS; + } + return (EDQUOT); + } + /* + * If user is over their soft limit for too long, disallow inode + * allocation. Reset time limit as they cross their soft limit. + * Until time.tv_sec passes dq_itime the change is still allowed. + */ + if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) { + if (dq->dq_curinodes < dq->dq_isoftlimit) { + dq->dq_itime = time.tv_sec + + VFSTOUFS(ITOV(ip)->v_mount)->um_itime[type]; + if (ip->i_uid == cred->cr_uid) + uprintf("\n%s: warning, %s %s\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type], "inode quota exceeded"); + return (0); + } + if (time.tv_sec > dq->dq_itime) { + if ((dq->dq_flags & DQ_INODS) == 0 && + ip->i_uid == cred->cr_uid) { + uprintf("\n%s: write failed, %s %s\n", + ITOV(ip)->v_mount->mnt_stat.f_mntonname, + quotatypes[type], + "inode quota exceeded for too long"); + dq->dq_flags |= DQ_INODS; + } + return (EDQUOT); + } + } + return (0); +} + +#ifdef DIAGNOSTIC +/* + * On filesystems with quotas enabled, it is an error for a file to change + * size and not to have a dquot structure associated with it.
+ */ +void +chkdquot(ip) + register struct inode *ip; +{ + struct ufsmount *ump = VFSTOUFS(ITOV(ip)->v_mount); + register int i; + + for (i = 0; i < MAXQUOTAS; i++) { /* types with no quota file, or being opened or closed, are skipped */ + if (ump->um_quotas[i] == NULLVP || + (ump->um_qflags[i] & (QTF_OPENING|QTF_CLOSING))) + continue; + if (ip->i_dquot[i] == NODQUOT) { + vprint("chkdquot: missing dquot", ITOV(ip)); + panic("missing dquot"); + } + } +} +#endif + +/* + * Code to process quotactl commands. + */ + +/* + * Q_QUOTAON - set up a quota file for a particular file system. + * + * fname is a user-space path opened read-write; it must name a + * regular file, else EACCES is returned. EBUSY is returned if + * vfs_busy fails. The default grace times MAX_DQ_TIME and + * MAX_IQ_TIME are replaced by the positive dq_btime and dq_itime + * of the id 0 dquot when that dquot can be fetched. + */ +int +quotaon(p, mp, type, fname) + struct proc *p; + struct mount *mp; + register int type; + caddr_t fname; +{ + register struct ufsmount *ump = VFSTOUFS(mp); + register struct vnode *vp, **vpp; + struct vnode *nextvp; + struct dquot *dq; + int error; + struct nameidata nd; + + vpp = &ump->um_quotas[type]; + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, fname, p); + if (error = vn_open(&nd, FREAD|FWRITE, 0)) + return (error); + vp = nd.ni_vp; + VOP_UNLOCK(vp); + if (vp->v_type != VREG) { + (void) vn_close(vp, FREAD|FWRITE, p->p_ucred, p); + return (EACCES); + } + if (vfs_busy(mp)) { + (void) vn_close(vp, FREAD|FWRITE, p->p_ucred, p); + return (EBUSY); + } + if (*vpp != vp) + quotaoff(p, mp, type); + ump->um_qflags[type] |= QTF_OPENING; + mp->mnt_flag |= MNT_QUOTA; + vp->v_flag |= VSYSTEM; + *vpp = vp; + /* + * Save the credential of the process that turned on quotas. + * Set up the time limits for this quota. + */ + crhold(p->p_ucred); + ump->um_cred[type] = p->p_ucred; + ump->um_btime[type] = MAX_DQ_TIME; + ump->um_itime[type] = MAX_IQ_TIME; + if (dqget(NULLVP, 0, ump, type, &dq) == 0) { + if (dq->dq_btime > 0) + ump->um_btime[type] = dq->dq_btime; + if (dq->dq_itime > 0) + ump->um_itime[type] = dq->dq_itime; + dqrele(NULLVP, dq); + } + /* + * Search vnodes associated with this mount point, + * adding references to quota file being opened. + * NB: only need to add dquot's for inodes being modified.
+ */ +again: + for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nextvp) { + nextvp = vp->v_mntvnodes.le_next; + if (vp->v_writecount == 0) + continue; + if (vget(vp, 1)) + goto again; + if (error = getinoquota(VTOI(vp))) { + vput(vp); + break; + } + vput(vp); + if (vp->v_mntvnodes.le_next != nextvp || vp->v_mount != mp) /* vp was relinked or left this mount: rescan from the head */ + goto again; + } + ump->um_qflags[type] &= ~QTF_OPENING; + if (error) + quotaoff(p, mp, type); + vfs_unbusy(mp); + return (error); +} + +/* + * Q_QUOTAOFF - turn off disk quotas for a filesystem. + * + * Panics unless MNT_MPBUSY is set in mnt_flag. Returns 0 if no + * quota file is open for this type, else the vn_close result. + * MNT_QUOTA is cleared once no quota type has a file left. + */ +int +quotaoff(p, mp, type) + struct proc *p; + struct mount *mp; + register int type; +{ + register struct vnode *vp; + struct vnode *qvp, *nextvp; + struct ufsmount *ump = VFSTOUFS(mp); + register struct dquot *dq; + register struct inode *ip; + int error; + + if ((mp->mnt_flag & MNT_MPBUSY) == 0) + panic("quotaoff: not busy"); + if ((qvp = ump->um_quotas[type]) == NULLVP) + return (0); + ump->um_qflags[type] |= QTF_CLOSING; + /* + * Search vnodes associated with this mount point, + * deleting any references to quota file being closed. + */ +again: + for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nextvp) { + nextvp = vp->v_mntvnodes.le_next; + if (vget(vp, 1)) + goto again; + ip = VTOI(vp); + dq = ip->i_dquot[type]; + ip->i_dquot[type] = NODQUOT; + dqrele(vp, dq); + vput(vp); + if (vp->v_mntvnodes.le_next != nextvp || vp->v_mount != mp) + goto again; + } + dqflush(qvp); + qvp->v_flag &= ~VSYSTEM; + error = vn_close(qvp, FREAD|FWRITE, p->p_ucred, p); + ump->um_quotas[type] = NULLVP; + crfree(ump->um_cred[type]); + ump->um_cred[type] = NOCRED; + ump->um_qflags[type] &= ~QTF_CLOSING; + for (type = 0; type < MAXQUOTAS; type++) + if (ump->um_quotas[type] != NULLVP) + break; + if (type == MAXQUOTAS) + mp->mnt_flag &= ~MNT_QUOTA; + return (error); +} + +/* + * Q_GETQUOTA - return current values in a dqblk structure.
+ */
+/*
+ * getquota: obtain the dquot for (id, type) on mp through dqget(), copy
+ * its dqblk out to addr with copyout() and drop the reference again.
+ * Returns the dqget() error if the lookup fails, otherwise the result
+ * of copyout().
+ */
+int
+getquota(mp, id, type, addr)
+	struct mount *mp;
+	u_long id;
+	int type;
+	caddr_t addr;
+{
+	struct dquot *dq;
+	int error;
+
+	if (error = dqget(NULLVP, id, VFSTOUFS(mp), type, &dq))
+		return (error);
+	error = copyout((caddr_t)&dq->dq_dqb, addr, sizeof (struct dqblk));
+	dqrele(NULLVP, dq);
+	return (error);
+}
+
+/*
+ * Q_SETQUOTA - assign an entire dqblk structure.
+ *
+ * The new dqblk is copied in from addr, the current usage counts of
+ * the existing dquot are carried over into it, and the result replaces
+ * the dquot's dqblk.  The dquot is marked DQ_MOD so that it is written
+ * back by dqsync().
+ *
+ * Returns the copyin() or dqget() error, otherwise 0.
+ */
+int
+setquota(mp, id, type, addr)
+	struct mount *mp;
+	u_long id;
+	int type;
+	caddr_t addr;
+{
+	register struct dquot *dq;
+	struct dquot *ndq;
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct dqblk newlim;
+	int error;
+
+	if (error = copyin(addr, (caddr_t)&newlim, sizeof (struct dqblk)))
+		return (error);
+	if (error = dqget(NULLVP, id, ump, type, &ndq))
+		return (error);
+	dq = ndq;
+	/* Wait until nobody holds DQ_LOCK on this dquot. */
+	while (dq->dq_flags & DQ_LOCK) {
+		dq->dq_flags |= DQ_WANT;
+		sleep((caddr_t)dq, PINOD+1);
+	}
+	/*
+	 * Copy all but the current values.
+	 * Reset time limit if previously had no soft limit or were
+	 * under it, but now have a soft limit and are over it.
+	 */
+	newlim.dqb_curblocks = dq->dq_curblocks;
+	newlim.dqb_curinodes = dq->dq_curinodes;
+	/* For id 0 the times supplied by the caller are kept. */
+	if (dq->dq_id != 0) {
+		newlim.dqb_btime = dq->dq_btime;
+		newlim.dqb_itime = dq->dq_itime;
+	}
+	if (newlim.dqb_bsoftlimit &&
+	    dq->dq_curblocks >= newlim.dqb_bsoftlimit &&
+	    (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit))
+		newlim.dqb_btime = time.tv_sec + ump->um_btime[type];
+	if (newlim.dqb_isoftlimit &&
+	    dq->dq_curinodes >= newlim.dqb_isoftlimit &&
+	    (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit))
+		newlim.dqb_itime = time.tv_sec + ump->um_itime[type];
+	dq->dq_dqb = newlim;
+	/* Usage below the (new) soft limit: clear the per-resource flag. */
+	if (dq->dq_curblocks < dq->dq_bsoftlimit)
+		dq->dq_flags &= ~DQ_BLKS;
+	if (dq->dq_curinodes < dq->dq_isoftlimit)
+		dq->dq_flags &= ~DQ_INODS;
+	/* DQ_FAKE marks a dquot with no limit of any kind set. */
+	if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 &&
+	    dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0)
+		dq->dq_flags |= DQ_FAKE;
+	else
+		dq->dq_flags &= ~DQ_FAKE;
+	dq->dq_flags |= DQ_MOD;
+	dqrele(NULLVP, dq);
+	return (0);
+}
+
+/*
+ * Q_SETUSE - set current inode and block usage.
+ *
+ * Only dqb_curblocks and dqb_curinodes of the dqblk copied in from addr
+ * are used; the limits of the existing dquot are left alone.  The dquot
+ * is marked DQ_MOD so that it is written back by dqsync().
+ *
+ * Returns the copyin() or dqget() error, otherwise 0.
+ */
+int
+setuse(mp, id, type, addr)
+	struct mount *mp;
+	u_long id;
+	int type;
+	caddr_t addr;
+{
+	register struct dquot *dq;
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct dquot *ndq;
+	struct dqblk usage;
+	int error;
+
+	if (error = copyin(addr, (caddr_t)&usage, sizeof (struct dqblk)))
+		return (error);
+	if (error = dqget(NULLVP, id, ump, type, &ndq))
+		return (error);
+	dq = ndq;
+	/* Wait until nobody holds DQ_LOCK on this dquot. */
+	while (dq->dq_flags & DQ_LOCK) {
+		dq->dq_flags |= DQ_WANT;
+		sleep((caddr_t)dq, PINOD+1);
+	}
+	/*
+	 * Reset time limit if have a soft limit and were
+	 * previously under it, but are now over it.
+	 */
+	if (dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit &&
+	    usage.dqb_curblocks >= dq->dq_bsoftlimit)
+		dq->dq_btime = time.tv_sec + ump->um_btime[type];
+	if (dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit &&
+	    usage.dqb_curinodes >= dq->dq_isoftlimit)
+		dq->dq_itime = time.tv_sec + ump->um_itime[type];
+	dq->dq_curblocks = usage.dqb_curblocks;
+	dq->dq_curinodes = usage.dqb_curinodes;
+	/* New usage below the soft limit: clear the per-resource flag. */
+	if (dq->dq_curblocks < dq->dq_bsoftlimit)
+		dq->dq_flags &= ~DQ_BLKS;
+	if (dq->dq_curinodes < dq->dq_isoftlimit)
+		dq->dq_flags &= ~DQ_INODS;
+	dq->dq_flags |= DQ_MOD;
+	dqrele(NULLVP, dq);
+	return (0);
+}
+
+/*
+ * Q_SYNC - sync quota files to disk.
+ *
+ * Walks the vnodes of the mount and calls dqsync() on every attached
+ * dquot that is marked DQ_MOD.  Vnodes that are currently locked are
+ * skipped.  The mount must be marked MNT_MPBUSY by the caller;
+ * otherwise this panics.
+ *
+ * Always returns 0; the result of dqsync() is not propagated.
+ */
+int
+qsync(mp)
+	struct mount *mp;
+{
+	struct ufsmount *ump = VFSTOUFS(mp);
+	register struct vnode *vp, *nextvp;
+	register struct dquot *dq;
+	register int i;
+
+	/*
+	 * Check if the mount point has any quotas.
+	 * If not, simply return.
+	 */
+	if ((mp->mnt_flag & MNT_MPBUSY) == 0)
+		panic("qsync: not busy");
+	for (i = 0; i < MAXQUOTAS; i++)
+		if (ump->um_quotas[i] != NULLVP)
+			break;
+	if (i == MAXQUOTAS)
+		return (0);
+	/*
+	 * Search vnodes associated with this mount point,
+	 * synchronizing any modified dquot structures.
+	 */
+again:
+	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nextvp) {
+		/* Remember the successor so a list change can be detected. */
+		nextvp = vp->v_mntvnodes.le_next;
+		if (VOP_ISLOCKED(vp))
+			continue;
+		/* Could not obtain the vnode: restart from the list head. */
+		if (vget(vp, 1))
+			goto again;
+		for (i = 0; i < MAXQUOTAS; i++) {
+			dq = VTOI(vp)->i_dquot[i];
+			if (dq != NODQUOT && (dq->dq_flags & DQ_MOD))
+				dqsync(vp, dq);
+		}
+		vput(vp);
+		/* The vnode list changed while we worked: rescan. */
+		if (vp->v_mntvnodes.le_next != nextvp || vp->v_mount != mp)
+			goto again;
+	}
+	return (0);
+}
+
+/*
+ * Code pertaining to management of the in-core dquot data structures.
+ *
+ * dqhashtbl is the hash table of in-core dquots; it is allocated by
+ * dqinit() via hashinit(), which also fills in dqhash.  dqget() uses
+ * dqhash as a bit mask when computing a bucket index.
+ */
+struct dquot **dqhashtbl;
+u_long dqhash;
+
+/*
+ * Dquot free list.
+ *
+ * dqfreel is the head of the list of unreferenced dquots and dqback
+ * points at the link field where the next freed dquot is appended
+ * (see dqrele()).  numdquot counts the dquots allocated so far by
+ * dqget(); desireddquot is the allocation ceiling, which dqget() raises
+ * in steps of DQUOTINC.
+ */
+#define	DQUOTINC	5	/* minimum free dquots desired */
+struct dquot *dqfreel, **dqback = &dqfreel;
+long numdquot, desireddquot = DQUOTINC;
+
+/*
+ * Initialize the quota system.
+ */
+/*
+ * dqinit: allocate the dquot hash table, sized from desiredvnodes;
+ * hashinit() stores the value used as the bucket mask in dqhash.
+ */
+void
+dqinit()
+{
+
+	dqhashtbl = hashinit(desiredvnodes, M_DQUOT, &dqhash);
+}
+
+/*
+ * Obtain a dquot structure for the specified identifier and quota file
+ * reading the information from the file if necessary.
+ *
+ *	vp	vnode on whose behalf the lookup is done; the quota file
+ *		vnode is locked around the read unless vp is that vnode
+ *		(presumably because the caller already holds it locked;
+ *		confirm against callers).
+ *	id	user or group identifier to look up.
+ *	ump	UFS mount the quota file belongs to.
+ *	type	quota type, used as index into ump->um_quotas[].
+ *	dqp	out: the referenced dquot, or NODQUOT on failure.
+ *
+ * Returns 0 on success, EINVAL if no quota file is open for the type or
+ * it is being closed, EUSERS if no dquot can be allocated or recycled,
+ * or the error from reading the quota file.
+ */
+int
+dqget(vp, id, ump, type, dqp)
+	struct vnode *vp;
+	u_long id;
+	register struct ufsmount *ump;
+	register int type;
+	struct dquot **dqp;
+{
+	register struct dquot *dq, *dp, **dpp;
+	register struct vnode *dqvp;
+	struct iovec aiov;
+	struct uio auio;
+	int error;
+
+	dqvp = ump->um_quotas[type];
+	if (dqvp == NULLVP || (ump->um_qflags[type] & QTF_CLOSING)) {
+		*dqp = NODQUOT;
+		return (EINVAL);
+	}
+	/*
+	 * Check the cache first.
+	 */
+	dpp = &dqhashtbl[((((int)(dqvp)) >> 8) + id) & dqhash];
+	for (dq = *dpp; dq; dq = dq->dq_forw) {
+		if (dq->dq_id != id ||
+		    dq->dq_ump->um_quotas[dq->dq_type] != dqvp)
+			continue;
+		/*
+		 * Cache hit with no references.  Take
+		 * the structure off the free list.
+		 */
+		if (dq->dq_cnt == 0) {
+			if ((dp = dq->dq_freef) != NODQUOT)
+				dp->dq_freeb = dq->dq_freeb;
+			else
+				dqback = dq->dq_freeb;
+			*dq->dq_freeb = dp;
+		}
+		DQREF(dq);
+		*dqp = dq;
+		return (0);
+	}
+	/*
+	 * Not in cache, allocate a new one.
+	 */
+	if (dqfreel == NODQUOT && numdquot < MAXQUOTAS * desiredvnodes)
+		desireddquot += DQUOTINC;
+	if (numdquot < desireddquot) {
+		dq = (struct dquot *)malloc(sizeof *dq, M_DQUOT, M_WAITOK);
+		bzero((char *)dq, sizeof *dq);
+		numdquot++;
+	} else {
+		/* Recycle the dquot at the head of the free list. */
+		if ((dq = dqfreel) == NULL) {
+			tablefull("dquot");
+			*dqp = NODQUOT;
+			return (EUSERS);
+		}
+		if (dq->dq_cnt || (dq->dq_flags & DQ_MOD))
+			panic("free dquot isn't");
+		if ((dp = dq->dq_freef) != NODQUOT)
+			dp->dq_freeb = &dqfreel;
+		else
+			dqback = &dqfreel;
+		dqfreel = dp;
+		dq->dq_freef = NULL;
+		dq->dq_freeb = NULL;
+		/*
+		 * Unhash the dquot only if it is still on a hash chain.
+		 * dqflush() and the read-error path below leave already
+		 * unhashed dquots on the free list with dq_back == NULL,
+		 * and storing through that pointer would write to
+		 * address zero.
+		 */
+		if (dq->dq_back != NULL) {
+			if (dp = dq->dq_forw)
+				dp->dq_back = dq->dq_back;
+			*dq->dq_back = dp;
+		}
+	}
+	/*
+	 * Initialize the contents of the dquot structure.
+	 */
+	if (vp != dqvp)
+		VOP_LOCK(dqvp);
+	/* Insert at the head of the hash chain selected above. */
+	if (dp = *dpp)
+		dp->dq_back = &dq->dq_forw;
+	dq->dq_forw = dp;
+	dq->dq_back = dpp;
+	*dpp = dq;
+	DQREF(dq);
+	/* Hold DQ_LOCK while the on-disk contents are read in. */
+	dq->dq_flags = DQ_LOCK;
+	dq->dq_id = id;
+	dq->dq_ump = ump;
+	dq->dq_type = type;
+	/* The quota file is an array of struct dqblk indexed by id. */
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	aiov.iov_base = (caddr_t)&dq->dq_dqb;
+	aiov.iov_len = sizeof (struct dqblk);
+	auio.uio_resid = sizeof (struct dqblk);
+	auio.uio_offset = (off_t)(id * sizeof (struct dqblk));
+	auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_rw = UIO_READ;
+	auio.uio_procp = (struct proc *)0;
+	error = VOP_READ(dqvp, &auio, 0, ump->um_cred[type]);
+	/* Nothing was read (no record for this id): use an empty dqblk. */
+	if (auio.uio_resid == sizeof(struct dqblk) && error == 0)
+		bzero((caddr_t)&dq->dq_dqb, sizeof(struct dqblk));
+	if (vp != dqvp)
+		VOP_UNLOCK(dqvp);
+	if (dq->dq_flags & DQ_WANT)
+		wakeup((caddr_t)dq);
+	dq->dq_flags = 0;
+	/*
+	 * I/O error in reading quota file, release
+	 * quota structure and reflect problem to caller.
+	 */
+	if (error) {
+		if (dp = dq->dq_forw)
+			dp->dq_back = dq->dq_back;
+		*dq->dq_back = dp;
+		dq->dq_forw = NULL;
+		dq->dq_back = NULL;
+		dqrele(vp, dq);
+		*dqp = NODQUOT;
+		return (error);
+	}
+	/*
+	 * Check for no limit to enforce.
+	 * Initialize time values if necessary.
+	 */
+	if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 &&
+	    dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0)
+		dq->dq_flags |= DQ_FAKE;
+	if (dq->dq_id != 0) {
+		if (dq->dq_btime == 0)
+			dq->dq_btime = time.tv_sec + ump->um_btime[type];
+		if (dq->dq_itime == 0)
+			dq->dq_itime = time.tv_sec + ump->um_itime[type];
+	}
+	*dqp = dq;
+	return (0);
+}
+
+/*
+ * Obtain a reference to a dquot.
+ * Only the reference count is incremented.
+ */
+void
+dqref(dq)
+	struct dquot *dq;
+{
+
+	dq->dq_cnt++;
+}
+
+/*
+ * Release a reference to a dquot.
+ */
+/*
+ * dqrele: drop one reference on dq; NODQUOT is accepted and ignored.
+ * When the last reference goes away a modified dquot is first written
+ * back with dqsync() and the dquot is then appended to the tail of the
+ * free list.  vp is only passed through to dqsync().
+ */
+void
+dqrele(vp, dq)
+	struct vnode *vp;
+	register struct dquot *dq;
+{
+
+	if (dq == NODQUOT)
+		return;
+	/* Not the last reference: just drop the count. */
+	if (dq->dq_cnt > 1) {
+		dq->dq_cnt--;
+		return;
+	}
+	if (dq->dq_flags & DQ_MOD)
+		(void) dqsync(vp, dq);
+	/*
+	 * Re-check the count: dqsync() may sleep, so a new reference
+	 * may have been taken in the meantime.
+	 */
+	if (--dq->dq_cnt > 0)
+		return;
+	/* Append to the tail of the free list. */
+	if (dqfreel != NODQUOT) {
+		*dqback = dq;
+		dq->dq_freeb = dqback;
+	} else {
+		dqfreel = dq;
+		dq->dq_freeb = &dqfreel;
+	}
+	dq->dq_freef = NODQUOT;
+	dqback = &dq->dq_freef;
+}
+
+/*
+ * Update the disk quota in the quota file.
+ *
+ * Writes dq's dqblk to the quota file at offset dq_id * sizeof(struct
+ * dqblk) and clears DQ_MOD.  The quota file vnode is locked around the
+ * write unless vp is that vnode (presumably because the caller already
+ * holds it locked; confirm against callers).
+ *
+ * Panics if dq is NODQUOT or if no quota file is open for its type.
+ * Returns 0 if the dquot is not modified, otherwise the VOP_WRITE()
+ * error, or EIO if the write was short.
+ */
+int
+dqsync(vp, dq)
+	struct vnode *vp;
+	register struct dquot *dq;
+{
+	struct vnode *dqvp;
+	struct iovec aiov;
+	struct uio auio;
+	int error;
+
+	if (dq == NODQUOT)
+		panic("dqsync: dquot");
+	if ((dq->dq_flags & DQ_MOD) == 0)
+		return (0);
+	if ((dqvp = dq->dq_ump->um_quotas[dq->dq_type]) == NULLVP)
+		panic("dqsync: file");
+	if (vp != dqvp)
+		VOP_LOCK(dqvp);
+	/* Wait for the current DQ_LOCK holder to finish. */
+	while (dq->dq_flags & DQ_LOCK) {
+		dq->dq_flags |= DQ_WANT;
+		sleep((caddr_t)dq, PINOD+2);
+		/* Somebody else wrote it out while we slept. */
+		if ((dq->dq_flags & DQ_MOD) == 0) {
+			if (vp != dqvp)
+				VOP_UNLOCK(dqvp);
+			return (0);
+		}
+	}
+	dq->dq_flags |= DQ_LOCK;
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	aiov.iov_base = (caddr_t)&dq->dq_dqb;
+	aiov.iov_len = sizeof (struct dqblk);
+	auio.uio_resid = sizeof (struct dqblk);
+	auio.uio_offset = (off_t)(dq->dq_id * sizeof (struct dqblk));
+	auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_rw = UIO_WRITE;
+	auio.uio_procp = (struct proc *)0;
+	error = VOP_WRITE(dqvp, &auio, 0, dq->dq_ump->um_cred[dq->dq_type]);
+	/* A short write without an error code is reported as EIO. */
+	if (auio.uio_resid && error == 0)
+		error = EIO;
+	if (dq->dq_flags & DQ_WANT)
+		wakeup((caddr_t)dq);
+	/* DQ_MOD is cleared even when the write failed. */
+	dq->dq_flags &= ~(DQ_MOD|DQ_LOCK|DQ_WANT);
+	if (vp != dqvp)
+		VOP_UNLOCK(dqvp);
+	return (error);
+}
+
+/*
+ * Flush all entries from the cache for a particular vnode.
+ */ +void +dqflush(vp) + register struct vnode *vp; +{ + register struct dquot *dq, *dp, **dpp, *nextdq; + + /* + * Move all dquot's that used to refer to this quota + * file off their hash chains (they will eventually + * fall off the head of the free list and be re-used). + */ + for (dpp = &dqhashtbl[dqhash]; dpp >= dqhashtbl; dpp--) { + for (dq = *dpp; dq; dq = nextdq) { + nextdq = dq->dq_forw; + if (dq->dq_ump->um_quotas[dq->dq_type] != vp) + continue; + if (dq->dq_cnt) + panic("dqflush: stray dquot"); + if (dp = dq->dq_forw) + dp->dq_back = dq->dq_back; + *dq->dq_back = dp; + dq->dq_forw = NULL; + dq->dq_back = NULL; + dq->dq_ump = (struct ufsmount *)0; + } + } +} diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c new file mode 100644 index 00000000000..5ead2c1a9ad --- /dev/null +++ b/sys/ufs/ufs/ufs_readwrite.c @@ -0,0 +1,295 @@ +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_readwrite.c 8.7 (Berkeley) 1/21/94 + */ + +#ifdef LFS_READWRITE +#define BLKSIZE(a, b, c) blksize(a) +#define FS struct lfs +#define I_FS i_lfs +#define READ lfs_read +#define READ_S "lfs_read" +#define WRITE lfs_write +#define WRITE_S "lfs_write" +#define fs_bsize lfs_bsize +#define fs_maxfilesize lfs_maxfilesize +#else +#define BLKSIZE(a, b, c) blksize(a, b, c) +#define FS struct fs +#define I_FS i_fs +#define READ ffs_read +#define READ_S "ffs_read" +#define WRITE ffs_write +#define WRITE_S "ffs_write" +#endif + +/* + * Vnode op for reading. 
+ */ +/* ARGSUSED */ +READ(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + register struct vnode *vp; + register struct inode *ip; + register struct uio *uio; + register FS *fs; + struct buf *bp; + daddr_t lbn, nextlbn; + off_t bytesinfile; + long size, xfersize, blkoffset; + int error; + u_short mode; + + vp = ap->a_vp; + ip = VTOI(vp); + mode = ip->i_mode; + uio = ap->a_uio; + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_READ) + panic("%s: mode", READ_S); + + if (vp->v_type == VLNK) { + if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) + panic("%s: short symlink", READ_S); + } else if (vp->v_type != VREG && vp->v_type != VDIR) + panic("%s: type %d", READ_S, vp->v_type); +#endif + fs = ip->I_FS; + if ((u_quad_t)uio->uio_offset > fs->fs_maxfilesize) + return (EFBIG); + + for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { + if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) + break; + lbn = lblkno(fs, uio->uio_offset); + nextlbn = lbn + 1; + size = BLKSIZE(fs, ip, lbn); + blkoffset = blkoff(fs, uio->uio_offset); + xfersize = fs->fs_bsize - blkoffset; + if (uio->uio_resid < xfersize) + xfersize = uio->uio_resid; + if (bytesinfile < xfersize) + xfersize = bytesinfile; + +#ifdef LFS_READWRITE + (void)lfs_check(vp, lbn); + error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, &bp); +#else + if (lblktosize(fs, nextlbn) > ip->i_size) + error = bread(vp, lbn, size, NOCRED, &bp); + else if (doclusterread) + error = cluster_read(vp, + ip->i_size, lbn, size, NOCRED, &bp); + else if (lbn - 1 == vp->v_lastr) { + int nextsize = BLKSIZE(fs, ip, nextlbn); + error = breadn(vp, lbn, + size, &nextlbn, &nextsize, 1, NOCRED, &bp); + } else + error = bread(vp, lbn, size, NOCRED, &bp); +#endif + if (error) + break; + vp->v_lastr = lbn; + + /* + * We should only get non-zero b_resid when an I/O error + * has occurred, which should cause us to break above. 
+ * However, if the short read did not cause an error, + * then we want to ensure that we do not uiomove bad + * or uninitialized data. + */ + size -= bp->b_resid; + if (size < xfersize) { + if (size == 0) + break; + xfersize = size; + } + if (error = + uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio)) + break; + + if (S_ISREG(mode) && (xfersize + blkoffset == fs->fs_bsize || + uio->uio_offset == ip->i_size)) + bp->b_flags |= B_AGE; + brelse(bp); + } + if (bp != NULL) + brelse(bp); + ip->i_flag |= IN_ACCESS; + return (error); +} + +/* + * Vnode op for writing. + */ +WRITE(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + register struct vnode *vp; + register struct uio *uio; + register struct inode *ip; + register FS *fs; + struct buf *bp; + struct proc *p; + daddr_t lbn; + off_t osize; + int blkoffset, error, flags, ioflag, resid, size, xfersize; + + ioflag = ap->a_ioflag; + uio = ap->a_uio; + vp = ap->a_vp; + ip = VTOI(vp); + +#ifdef DIAGNOSTIC + if (uio->uio_rw != UIO_WRITE) + panic("%s: mode", WRITE_S); +#endif + + switch (vp->v_type) { + case VREG: + if (ioflag & IO_APPEND) + uio->uio_offset = ip->i_size; + if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) + return (EPERM); + /* FALLTHROUGH */ + case VLNK: + break; + case VDIR: + if ((ioflag & IO_SYNC) == 0) + panic("%s: nonsync dir write", WRITE_S); + break; + default: + panic("%s: type", WRITE_S); + } + + fs = ip->I_FS; + if (uio->uio_offset < 0 || + (u_quad_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) + return (EFBIG); + /* + * Maybe this should be above the vnode op call, but so long as + * file servers have no limits, I don't think it matters. 
+ */ + p = uio->uio_procp; + if (vp->v_type == VREG && p && + uio->uio_offset + uio->uio_resid > + p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { + psignal(p, SIGXFSZ); + return (EFBIG); + } + + resid = uio->uio_resid; + osize = ip->i_size; + flags = ioflag & IO_SYNC ? B_SYNC : 0; + + for (error = 0; uio->uio_resid > 0;) { + lbn = lblkno(fs, uio->uio_offset); + blkoffset = blkoff(fs, uio->uio_offset); + xfersize = fs->fs_bsize - blkoffset; + if (uio->uio_resid < xfersize) + xfersize = uio->uio_resid; +#ifdef LFS_READWRITE + (void)lfs_check(vp, lbn); + error = lfs_balloc(vp, xfersize, lbn, &bp); +#else + if (fs->fs_bsize > xfersize) + flags |= B_CLRBUF; + else + flags &= ~B_CLRBUF; + + error = ffs_balloc(ip, + lbn, blkoffset + xfersize, ap->a_cred, &bp, flags); +#endif + if (error) + break; + if (uio->uio_offset + xfersize > ip->i_size) { + ip->i_size = uio->uio_offset + xfersize; + vnode_pager_setsize(vp, (u_long)ip->i_size); + } + (void)vnode_pager_uncache(vp); + + size = BLKSIZE(fs, ip, lbn) - bp->b_resid; + if (size < xfersize) + xfersize = size; + + error = + uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); +#ifdef LFS_READWRITE + (void)VOP_BWRITE(bp); +#else + if (ioflag & IO_SYNC) + (void)bwrite(bp); + else if (xfersize + blkoffset == fs->fs_bsize) + if (doclusterwrite) + cluster_write(bp, ip->i_size); + else { + bp->b_flags |= B_AGE; + bawrite(bp); + } + else + bdwrite(bp); +#endif + if (error || xfersize == 0) + break; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + /* + * If we successfully wrote any data, and we are not the superuser + * we clear the setuid and setgid bits as a precaution against + * tampering. 
+ */ + if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0) + ip->i_mode &= ~(ISUID | ISGID); + if (error) { + if (ioflag & IO_UNIT) { + (void)VOP_TRUNCATE(vp, osize, + ioflag & IO_SYNC, ap->a_cred, uio->uio_procp); + uio->uio_offset -= resid - uio->uio_resid; + uio->uio_resid = resid; + } + } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) + error = VOP_UPDATE(vp, &time, &time, 1); + return (error); +} diff --git a/sys/ufs/ufs/ufs_vfsops.c b/sys/ufs/ufs/ufs_vfsops.c new file mode 100644 index 00000000000..f806e0b2a83 --- /dev/null +++ b/sys/ufs/ufs/ufs_vfsops.c @@ -0,0 +1,206 @@ +/* + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_vfsops.c 8.4 (Berkeley) 4/16/94 + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +/* + * Flag to permit forcible unmounting. + */ +int doforce = 1; + +/* + * Make a filesystem operational. + * Nothing to do at the moment. + */ +/* ARGSUSED */ +int +ufs_start(mp, flags, p) + struct mount *mp; + int flags; + struct proc *p; +{ + + return (0); +} + +/* + * Return the root of a filesystem. 
+ */
+/*
+ * ufs_root: fetch the vnode for inode ROOTINO through VFS_VGET() and
+ * store it in *vpp.  Returns the VFS_VGET() error, or 0 on success;
+ * *vpp is left untouched on failure.
+ */
+int
+ufs_root(mp, vpp)
+	struct mount *mp;
+	struct vnode **vpp;
+{
+	struct vnode *nvp;
+	int error;
+
+	if (error = VFS_VGET(mp, (ino_t)ROOTINO, &nvp))
+		return (error);
+	*vpp = nvp;
+	return (0);
+}
+
+/*
+ * Do operations associated with quotas
+ *
+ *	mp	mount point the request applies to.
+ *	cmds	packed request: the command sits above SUBCMDSHIFT and
+ *		the quota type in the SUBCMDMASK bits.
+ *	uid	identifier to operate on; -1 selects the caller's real
+ *		uid.
+ *	arg	user-space argument handed to the command handler.
+ *	p	calling process.
+ *
+ * Q_GETQUOTA and Q_SYNC on the caller's own id need no privilege;
+ * everything else requires suser().
+ *
+ * Returns EOPNOTSUPP when built without QUOTA, EINVAL for an unknown
+ * command or a quota type >= MAXQUOTAS, the suser() error when the
+ * caller lacks privilege, otherwise the result of the handler.
+ * Q_QUOTAOFF and Q_SYNC return 0 without doing anything when
+ * vfs_busy() fails.
+ */
+int
+ufs_quotactl(mp, cmds, uid, arg, p)
+	struct mount *mp;
+	int cmds;
+	uid_t uid;
+	caddr_t arg;
+	struct proc *p;
+{
+	int cmd, type, error;
+
+#ifndef QUOTA
+	return (EOPNOTSUPP);
+#else
+	if (uid == -1)
+		uid = p->p_cred->p_ruid;
+	cmd = cmds >> SUBCMDSHIFT;
+
+	switch (cmd) {
+	case Q_GETQUOTA:
+	case Q_SYNC:
+		if (uid == p->p_cred->p_ruid)
+			break;
+		/* fall through */
+	default:
+		if (error = suser(p->p_ucred, &p->p_acflag))
+			return (error);
+	}
+
+	/*
+	 * The quota type lives in the low SUBCMDMASK bits of the packed
+	 * word, so it has to come from cmds; cmd has already had those
+	 * bits shifted out.
+	 */
+	type = cmds & SUBCMDMASK;
+	if ((u_int)type >= MAXQUOTAS)
+		return (EINVAL);
+
+	switch (cmd) {
+
+	case Q_QUOTAON:
+		return (quotaon(p, mp, type, arg));
+
+	case Q_QUOTAOFF:
+		/* quotaoff() requires the mount to be held busy. */
+		if (vfs_busy(mp))
+			return (0);
+		error = quotaoff(p, mp, type);
+		vfs_unbusy(mp);
+		return (error);
+
+	case Q_SETQUOTA:
+		return (setquota(mp, uid, type, arg));
+
+	case Q_SETUSE:
+		return (setuse(mp, uid, type, arg));
+
+	case Q_GETQUOTA:
+		return (getquota(mp, uid, type, arg));
+
+	case Q_SYNC:
+		/* qsync() requires the mount to be held busy. */
+		if (vfs_busy(mp))
+			return (0);
+		error = qsync(mp);
+		vfs_unbusy(mp);
+		return (error);
+
+	default:
+		return (EINVAL);
+	}
+	/* NOTREACHED */
+#endif
+}
+
+/*
+ * This is the generic part of fhtovp called after the underlying
+ * filesystem has validated the file handle.
+ *
+ * Verify that a host should have access to a filesystem, and if so
+ * return a vnode for the presented file handle.
+ */ +int +ufs_check_export(mp, ufhp, nam, vpp, exflagsp, credanonp) + register struct mount *mp; + struct ufid *ufhp; + struct mbuf *nam; + struct vnode **vpp; + int *exflagsp; + struct ucred **credanonp; +{ + register struct inode *ip; + register struct netcred *np; + register struct ufsmount *ump = VFSTOUFS(mp); + struct vnode *nvp; + int error; + + /* + * Get the export permission structure for this tuple. + */ + np = vfs_export_lookup(mp, &ump->um_export, nam); + if (np == NULL) + return (EACCES); + + if (error = VFS_VGET(mp, ufhp->ufid_ino, &nvp)) { + *vpp = NULLVP; + return (error); + } + ip = VTOI(nvp); + if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen) { + vput(nvp); + *vpp = NULLVP; + return (ESTALE); + } + *vpp = nvp; + *exflagsp = np->netc_exflags; + *credanonp = &np->netc_anon; + return (0); +} diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c new file mode 100644 index 00000000000..7b7c88376b9 --- /dev/null +++ b/sys/ufs/ufs/ufs_vnops.c @@ -0,0 +1,2159 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ufs_vnops.c 8.10 (Berkeley) 4/1/94 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include + +static int ufs_chmod __P((struct vnode *, int, struct ucred *, struct proc *)); +static int ufs_chown + __P((struct vnode *, uid_t, gid_t, struct ucred *, struct proc *)); + +union _qcvt { + quad_t qcvt; + long val[2]; +}; +#define SETHIGH(q, h) { \ + union _qcvt tmp; \ + tmp.qcvt = (q); \ + tmp.val[_QUAD_HIGHWORD] = (h); \ + (q) = tmp.qcvt; \ +} +#define SETLOW(q, l) { \ + union _qcvt tmp; \ + tmp.qcvt = (q); \ + tmp.val[_QUAD_LOWWORD] = (l); \ + (q) = tmp.qcvt; \ +} + +/* + * Create a regular file + */ +int +ufs_create(ap) + struct vop_create_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + int error; + + if (error = + ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), + ap->a_dvp, ap->a_vpp, ap->a_cnp)) + return (error); + return (0); +} + +/* + * Mknod vnode call + */ +/* ARGSUSED */ +int +ufs_mknod(ap) + struct vop_mknod_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + register struct vattr *vap = ap->a_vap; + register struct vnode **vpp = ap->a_vpp; + register struct inode *ip; + int error; + + if (error = + ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), + ap->a_dvp, vpp, ap->a_cnp)) + return (error); + ip = VTOI(*vpp); + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + if (vap->va_rdev != VNOVAL) { + /* + * Want to be able to use this to make badblock + * inodes, so don't truncate the dev number. + */ + ip->i_rdev = vap->va_rdev; + } + /* + * Remove inode so that it will be reloaded by VFS_VGET and + * checked to see if it is an alias of an existing entry in + * the inode cache. 
+ */ + vput(*vpp); + (*vpp)->v_type = VNON; + vgone(*vpp); + *vpp = 0; + return (0); +} + +/* + * Open called. + * + * Nothing to do. + */ +/* ARGSUSED */ +int +ufs_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + + /* + * Files marked append-only must be opened for appending. + */ + if ((VTOI(ap->a_vp)->i_flags & APPEND) && + (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) + return (EPERM); + return (0); +} + +/* + * Close called. + * + * Update the times on the inode. + */ +/* ARGSUSED */ +int +ufs_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct inode *ip = VTOI(vp); + + if (vp->v_usecount > 1 && !(ip->i_flag & IN_LOCKED)) + ITIMES(ip, &time, &time); + return (0); +} + +int +ufs_access(ap) + struct vop_access_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct inode *ip = VTOI(vp); + register struct ucred *cred = ap->a_cred; + mode_t mask, mode = ap->a_mode; + register gid_t *gp; + int i, error; + +#ifdef DIAGNOSTIC + if (!VOP_ISLOCKED(vp)) { + vprint("ufs_access: not locked", vp); + panic("ufs_access: not locked"); + } +#endif +#ifdef QUOTA + if (mode & VWRITE) + switch (vp->v_type) { + case VDIR: + case VLNK: + case VREG: + if (error = getinoquota(ip)) + return (error); + break; + } +#endif + + /* If immutable bit set, nobody gets to write it. */ + if ((mode & VWRITE) && (ip->i_flags & IMMUTABLE)) + return (EPERM); + + /* Otherwise, user id 0 always gets access. */ + if (cred->cr_uid == 0) + return (0); + + mask = 0; + + /* Otherwise, check the owner. 
*/ + if (cred->cr_uid == ip->i_uid) { + if (mode & VEXEC) + mask |= S_IXUSR; + if (mode & VREAD) + mask |= S_IRUSR; + if (mode & VWRITE) + mask |= S_IWUSR; + return ((ip->i_mode & mask) == mask ? 0 : EACCES); + } + + /* Otherwise, check the groups. */ + for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) + if (ip->i_gid == *gp) { + if (mode & VEXEC) + mask |= S_IXGRP; + if (mode & VREAD) + mask |= S_IRGRP; + if (mode & VWRITE) + mask |= S_IWGRP; + return ((ip->i_mode & mask) == mask ? 0 : EACCES); + } + + /* Otherwise, check everyone else. */ + if (mode & VEXEC) + mask |= S_IXOTH; + if (mode & VREAD) + mask |= S_IROTH; + if (mode & VWRITE) + mask |= S_IWOTH; + return ((ip->i_mode & mask) == mask ? 0 : EACCES); +} + +/* ARGSUSED */ +int +ufs_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + register struct vnode *vp = ap->a_vp; + register struct inode *ip = VTOI(vp); + register struct vattr *vap = ap->a_vap; + + ITIMES(ip, &time, &time); + /* + * Copy from inode table + */ + vap->va_fsid = ip->i_dev; + vap->va_fileid = ip->i_number; + vap->va_mode = ip->i_mode & ~IFMT; + vap->va_nlink = ip->i_nlink; + vap->va_uid = ip->i_uid; + vap->va_gid = ip->i_gid; + vap->va_rdev = (dev_t)ip->i_rdev; + vap->va_size = ip->i_din.di_size; + vap->va_atime = ip->i_atime; + vap->va_mtime = ip->i_mtime; + vap->va_ctime = ip->i_ctime; + vap->va_flags = ip->i_flags; + vap->va_gen = ip->i_gen; + /* this doesn't belong here */ + if (vp->v_type == VBLK) + vap->va_blocksize = BLKDEV_IOSIZE; + else if (vp->v_type == VCHR) + vap->va_blocksize = MAXBSIZE; + else + vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; + vap->va_bytes = dbtob(ip->i_blocks); + vap->va_type = vp->v_type; + vap->va_filerev = ip->i_modrev; + return (0); +} + +/* + * Set attribute vnode op. 
called from several syscalls
+ *
+ * Each field of ap->a_vap that is not VNOVAL is applied to the inode
+ * behind ap->a_vp, in this order: flags, owner/group, size, access and
+ * modification times, mode.  The first failing step returns its errno
+ * at once; steps already applied are not undone.  Changing the times
+ * requires owning the file or passing suser(); when VA_UTIMES_NULL is
+ * set in va_vaflags, passing VOP_ACCESS(VWRITE) is accepted instead.
+ */
+int
+ufs_setattr(ap)
+	struct vop_setattr_args /* {
+		struct vnode *a_vp;
+		struct vattr *a_vap;
+		struct ucred *a_cred;
+		struct proc *a_p;
+	} */ *ap;
+{
+	register struct vattr *vap = ap->a_vap;
+	register struct vnode *vp = ap->a_vp;
+	register struct inode *ip = VTOI(vp);
+	register struct ucred *cred = ap->a_cred;
+	register struct proc *p = ap->a_p;
+	struct timeval atimeval, mtimeval;
+	int error;
+
+	/*
+	 * Check for unsettable attributes.
+	 * Any of these being supplied makes the whole call fail with
+	 * EINVAL before anything is modified.
+	 */
+	if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
+	    (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
+	    (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
+	    ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
+		return (EINVAL);
+	}
+	if (vap->va_flags != VNOVAL) {
+		if (cred->cr_uid != ip->i_uid &&
+		    (error = suser(cred, &p->p_acflag)))
+			return (error);
+		if (cred->cr_uid == 0) {	/* uid 0: every flag bit is replaced */
+			if ((ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) &&
+			    securelevel > 0)
+				return (EPERM);
+			ip->i_flags = vap->va_flags;
+		} else {	/* others: only the UF_SETTABLE bits are taken from the caller */
+			if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND))
+				return (EPERM);
+			ip->i_flags &= SF_SETTABLE;
+			ip->i_flags |= (vap->va_flags & UF_SETTABLE);
+		}
+		ip->i_flag |= IN_CHANGE;
+		if (vap->va_flags & (IMMUTABLE | APPEND))
+			return (0);	/* requested flags include IMMUTABLE/APPEND: remaining fields are skipped */
+	}
+	if (ip->i_flags & (IMMUTABLE | APPEND))
+		return (EPERM);
+	/*
+	 * Go through the fields and update iff not VNOVAL.
+	 */
+	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL)
+		if (error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, p))
+			return (error);
+	if (vap->va_size != VNOVAL) {
+		if (vp->v_type == VDIR)
+			return (EISDIR);
+		if (error = VOP_TRUNCATE(vp, vap->va_size, 0, cred, p))
+			return (error);
+	}
+	ip = VTOI(vp);	/* inode pointer is fetched again after the truncate */
+	if (vap->va_atime.ts_sec != VNOVAL || vap->va_mtime.ts_sec != VNOVAL) {
+		if (cred->cr_uid != ip->i_uid &&
+		    (error = suser(cred, &p->p_acflag)) &&
+		    ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
+		    (error = VOP_ACCESS(vp, VWRITE, cred, p))))
+			return (error);
+		if (vap->va_atime.ts_sec != VNOVAL)
+			ip->i_flag |= IN_ACCESS;
+		if (vap->va_mtime.ts_sec != VNOVAL)
+			ip->i_flag |= IN_CHANGE | IN_UPDATE;
+		atimeval.tv_sec = vap->va_atime.ts_sec;
+		atimeval.tv_usec = vap->va_atime.ts_nsec / 1000;	/* nanoseconds to microseconds */
+		mtimeval.tv_sec = vap->va_mtime.ts_sec;
+		mtimeval.tv_usec = vap->va_mtime.ts_nsec / 1000;
+		if (error = VOP_UPDATE(vp, &atimeval, &mtimeval, 1))
+			return (error);
+	}
+	error = 0;
+	if (vap->va_mode != (mode_t)VNOVAL)
+		error = ufs_chmod(vp, (int)vap->va_mode, cred, p);
+	return (error);
+}
+
+/*
+ * Change the mode on a file.
+ * Inode must be locked before calling.
+ *
+ * The caller must own the file or pass suser().  For a caller whose
+ * uid is not 0, S_ISTXT on a non-directory fails with EFTYPE, and
+ * ISGID fails with EPERM unless the caller is a member of the file's
+ * group.  Only the ALLPERMS bits of the inode mode are replaced.
+ */
+static int
+ufs_chmod(vp, mode, cred, p)
+	register struct vnode *vp;
+	register int mode;
+	register struct ucred *cred;
+	struct proc *p;
+{
+	register struct inode *ip = VTOI(vp);
+	int error;
+
+	if (cred->cr_uid != ip->i_uid &&
+	    (error = suser(cred, &p->p_acflag)))
+		return (error);
+	if (cred->cr_uid) {
+		if (vp->v_type != VDIR && (mode & S_ISTXT))
+			return (EFTYPE);
+		if (!groupmember(ip->i_gid, cred) && (mode & ISGID))
+			return (EPERM);
+	}
+	ip->i_mode &= ~ALLPERMS;
+	ip->i_mode |= (mode & ALLPERMS);
+	ip->i_flag |= IN_CHANGE;
+	if ((vp->v_flag & VTEXT) && (ip->i_mode & S_ISTXT) == 0)	/* VTEXT vnode whose new mode lacks S_ISTXT */
+		(void) vnode_pager_uncache(vp);
+	return (0);
+}
+
+/*
+ * Perform chown operation on inode ip;
+ * inode must be locked prior to call.
+ *
+ * A uid or gid of VNOVAL means "keep the current value".  With QUOTA
+ * configured the block and inode charges are taken off the old ids,
+ * the ids are switched, and the charges are applied to the new ids;
+ * if that fails the old ids are restored, re-charged with FORCE, and
+ * the error is returned.  On success ISUID / ISGID are cleared when
+ * the owner / group actually changed and the caller's uid is not 0.
+ */
+static int
+ufs_chown(vp, uid, gid, cred, p)
+	register struct vnode *vp;
+	uid_t uid;
+	gid_t gid;
+	struct ucred *cred;
+	struct proc *p;
+{
+	register struct inode *ip = VTOI(vp);
+	uid_t ouid;
+	gid_t ogid;
+	int error = 0;
+#ifdef QUOTA
+	register int i;
+	long change;
+#endif
+
+	if (uid == (uid_t)VNOVAL)
+		uid = ip->i_uid;
+	if (gid == (gid_t)VNOVAL)
+		gid = ip->i_gid;
+	/*
+	 * If we don't own the file, are trying to change the owner
+	 * of the file, or are not a member of the target group,
+	 * the caller must be superuser or the call fails.
+	 */
+	if ((cred->cr_uid != ip->i_uid || uid != ip->i_uid ||
+	    !groupmember((gid_t)gid, cred)) &&
+	    (error = suser(cred, &p->p_acflag)))
+		return (error);
+	ogid = ip->i_gid;
+	ouid = ip->i_uid;
+#ifdef QUOTA
+	if (error = getinoquota(ip))
+		return (error);
+	if (ouid == uid) {	/* owner unchanged: its quota entry is dropped before the charges are adjusted */
+		dqrele(vp, ip->i_dquot[USRQUOTA]);
+		ip->i_dquot[USRQUOTA] = NODQUOT;
+	}
+	if (ogid == gid) {	/* likewise for an unchanged group */
+		dqrele(vp, ip->i_dquot[GRPQUOTA]);
+		ip->i_dquot[GRPQUOTA] = NODQUOT;
+	}
+	change = ip->i_blocks;
+	(void) chkdq(ip, -change, cred, CHOWN);
+	(void) chkiq(ip, -1, cred, CHOWN);
+	for (i = 0; i < MAXQUOTAS; i++) {
+		dqrele(vp, ip->i_dquot[i]);
+		ip->i_dquot[i] = NODQUOT;
+	}
+#endif
+	ip->i_gid = gid;
+	ip->i_uid = uid;
+#ifdef QUOTA
+	if ((error = getinoquota(ip)) == 0) {
+		if (ouid == uid) {
+			dqrele(vp, ip->i_dquot[USRQUOTA]);
+			ip->i_dquot[USRQUOTA] = NODQUOT;
+		}
+		if (ogid == gid) {
+			dqrele(vp, ip->i_dquot[GRPQUOTA]);
+			ip->i_dquot[GRPQUOTA] = NODQUOT;
+		}
+		if ((error = chkdq(ip, change, cred, CHOWN)) == 0) {
+			if ((error = chkiq(ip, 1, cred, CHOWN)) == 0)
+				goto good;
+			else
+				(void) chkdq(ip, -change, cred, CHOWN|FORCE);	/* take the block charge back off again */
+		}
+		for (i = 0; i < MAXQUOTAS; i++) {
+			dqrele(vp, ip->i_dquot[i]);
+			ip->i_dquot[i] = NODQUOT;
+		}
+	}
+	ip->i_gid = ogid;	/* failure path: put the original ids back */
+	ip->i_uid = ouid;
+	if (getinoquota(ip) == 0) {
+		if (ouid == uid) {
+			dqrele(vp, ip->i_dquot[USRQUOTA]);
+			ip->i_dquot[USRQUOTA] = NODQUOT;
+		}
+		if (ogid == gid) {
+			dqrele(vp, ip->i_dquot[GRPQUOTA]);
+			ip->i_dquot[GRPQUOTA] = NODQUOT;
+		}
+		(void) chkdq(ip, change, cred, FORCE|CHOWN);
+		(void) chkiq(ip, 1, cred, FORCE|CHOWN);
+		(void) getinoquota(ip);
+	}
+	return (error);
+good:
+	if (getinoquota(ip))
+		panic("chown: lost quota");
+#endif /* QUOTA */
+	if (ouid != uid || ogid != gid)
+		ip->i_flag |= IN_CHANGE;
+	if (ouid != uid && cred->cr_uid != 0)
+		ip->i_mode &= ~ISUID;
+	if (ogid != gid && cred->cr_uid != 0)
+		ip->i_mode &= ~ISGID;
+	return (0);
+}
+
+/* ARGSUSED */
+int
+ufs_ioctl(ap)
+	struct vop_ioctl_args /* {
+		struct vnode *a_vp;
+		int  a_command;
+		caddr_t  a_data;
+		int  a_fflag;
+		struct ucred *a_cred;
+		struct proc *a_p;
+	} */ *ap;
+{
+
+	return (ENOTTY);	/* every request is rejected; the arguments are not examined */
+}
+
+/* ARGSUSED */
+int
+ufs_select(ap)
+	struct vop_select_args /* {
+		struct vnode *a_vp;
+		int  a_which;
+		int  a_fflags;
+		struct ucred *a_cred;
+		struct proc *a_p;
+	} */ *ap;
+{
+
+	/*
+	 * We should really check to see if I/O is possible.
+	 * As written the answer is always 1.
+	 */
+	return (1);
+}
+
+/*
+ * Mmap a file
+ *
+ * NB Currently unsupported.
+ * (always returns EINVAL)
+ */
+/* ARGSUSED */
+int
+ufs_mmap(ap)
+	struct vop_mmap_args /* {
+		struct vnode *a_vp;
+		int  a_fflags;
+		struct ucred *a_cred;
+		struct proc *a_p;
+	} */ *ap;
+{
+
+	return (EINVAL);
+}
+
+/*
+ * Seek on a file
+ *
+ * Nothing to do, so just return.
+ */
+/* ARGSUSED */
+int
+ufs_seek(ap)
+	struct vop_seek_args /* {
+		struct vnode *a_vp;
+		off_t  a_oldoff;
+		off_t  a_newoff;
+		struct ucred *a_cred;
+	} */ *ap;
+{
+
+	return (0);	/* no offset validation is performed here */
+}
+
+int
+ufs_remove(ap)
+	struct vop_remove_args /* {
+		struct vnode *a_dvp;	(directory holding the entry)
+		struct vnode *a_vp;	(file the entry refers to)
+		struct componentname *a_cnp;
+	} */ *ap;
+{
+	register struct inode *ip;
+	register struct vnode *vp = ap->a_vp;
+	register struct vnode *dvp = ap->a_dvp;
+	int error;
+
+	ip = VTOI(vp);
+	if ((ip->i_flags & (IMMUTABLE | APPEND)) ||
+	    (VTOI(dvp)->i_flags & APPEND)) {	/* IMMUTABLE/APPEND file or APPEND directory: refuse */
+		error = EPERM;
+		goto out;
+	}
+	if ((error = ufs_dirremove(dvp, ap->a_cnp)) == 0) {
+		ip->i_nlink--;	/* the link count drops only once the entry is gone */
+		ip->i_flag |= IN_CHANGE;
+	}
+out:
+	if (dvp == vp)	/* both arguments name the same vnode: vrele here, the single vput follows */
+		vrele(vp);
+	else
+		vput(vp);
+	vput(dvp);
+	return (error);
+}
+
+/*
+ * link vnode call
+ *
+ * NOTE(review): the link count that is bumped belongs to VTOI(tdvp)
+ * and the new entry is made with ufs_direnter(ip, vp, cnp), so in this
+ * routine a_tdvp appears to be the file being linked and a_vp the
+ * directory receiving the entry, despite the names -- confirm against
+ * the VOP_LINK callers.
+ *
+ * Fails with EXDEV across mounts, EMLINK at LINK_MAX and EPERM when
+ * the inode is IMMUTABLE or APPEND.  On a failed update or direnter
+ * the link count is put back.
+ */
+int
+ufs_link(ap)
+	struct vop_link_args /* {
+		struct vnode *a_vp;
+		struct vnode *a_tdvp;
+		struct componentname *a_cnp;
+	} */ *ap;
+{
+	register struct vnode *vp = ap->a_vp;
+	register struct vnode *tdvp = ap->a_tdvp;
+	register struct componentname *cnp = ap->a_cnp;
+	register struct inode *ip;
+	struct timeval tv;
+	int error;
+
+#ifdef DIAGNOSTIC
+	if ((cnp->cn_flags & HASBUF) == 0)
+		panic("ufs_link: no name");
+#endif
+	if (vp->v_mount != tdvp->v_mount) {
+		VOP_ABORTOP(vp, cnp);
+		error = EXDEV;
+		goto out2;
+	}
+	if (vp != tdvp && (error = VOP_LOCK(tdvp))) {
+		VOP_ABORTOP(vp, cnp);
+		goto out2;
+	}
+	ip = VTOI(tdvp);
+	if ((nlink_t)ip->i_nlink >= LINK_MAX) {
+		VOP_ABORTOP(vp, cnp);
+		error = EMLINK;
+		goto out1;
+	}
+	if (ip->i_flags & (IMMUTABLE | APPEND)) {
+		VOP_ABORTOP(vp, cnp);
+		error = EPERM;
+		goto out1;
+	}
+	ip->i_nlink++;	/* count is raised and pushed out via VOP_UPDATE before the entry is written */
+	ip->i_flag |= IN_CHANGE;
+	tv = time;
+	error = VOP_UPDATE(tdvp, &tv, &tv, 1);
+	if (!error)
+		error = ufs_direnter(ip, vp, cnp);
+	if (error) {
+		ip->i_nlink--;
+		ip->i_flag |= IN_CHANGE;
+	}
+	FREE(cnp->cn_pnbuf, M_NAMEI);
+out1:
+	if (vp != tdvp)	/* mirrors the conditional VOP_LOCK(tdvp) above */
+		VOP_UNLOCK(tdvp);
+out2:
+	vput(vp);
+	return (error);
+}
+
+
+
+/*
+ * relookup -
lookup a path name component
+ * Used by lookup to re-acquire things.
+ *
+ * Looks up the single component described by cnp in directory dvp.
+ * Returns 0 with the result in *vpp; when VOP_LOOKUP answers
+ * EJUSTRETURN the call still returns 0 (see the comment at that
+ * return about the entry not existing).  On the error paths *vpp is
+ * set to NULL and an errno is returned.
+ */
+int
+relookup(dvp, vpp, cnp)
+	struct vnode *dvp, **vpp;
+	struct componentname *cnp;
+{
+	register struct vnode *dp = 0;	/* the directory we are searching */
+	int docache;			/* == 0 do not cache last component */
+	int wantparent;			/* 1 => wantparent or lockparent flag */
+	int rdonly;			/* lookup read-only flag bit */
+	int error = 0;
+#ifdef NAMEI_DIAGNOSTIC
+	int newhash;			/* DEBUG: check name hash */
+	char *cp;			/* DEBUG: check name ptr/len */
+#endif
+
+	/*
+	 * Setup: break out flag bits into variables.
+	 * (docache is computed here but is not read again anywhere in
+	 * this function.)
+	 */
+	wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT);
+	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
+	if (cnp->cn_nameiop == DELETE ||
+	    (wantparent && cnp->cn_nameiop != CREATE))
+		docache = 0;
+	rdonly = cnp->cn_flags & RDONLY;
+	cnp->cn_flags &= ~ISSYMLINK;
+	dp = dvp;
+	VOP_LOCK(dp);
+
+/* dirloop: */
+	/*
+	 * Search a new directory.
+	 *
+	 * The cn_hash value is for use by vfs_cache.
+	 * The last component of the filename is left accessible via
+	 * cnp->cn_nameptr for callers that need the name. Callers needing
+	 * the name set the SAVENAME flag. When done, they assume
+	 * responsibility for freeing the pathname buffer.
+	 */
+#ifdef NAMEI_DIAGNOSTIC
+	for (newhash = 0, cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
+		newhash += (unsigned char)*cp;
+	if (newhash != cnp->cn_hash)
+		panic("relookup: bad hash");
+	if (cnp->cn_namelen != cp - cnp->cn_nameptr)
+		panic ("relookup: bad len");
+	if (*cp != 0)
+		panic("relookup: not last component");
+	printf("{%s}: ", cnp->cn_nameptr);
+#endif
+
+	/*
+	 * Check for degenerate name (e.g. / or "")
+	 * which is a way of talking about a directory,
+	 * e.g. like "/." or ".".
+	 */
+	if (cnp->cn_nameptr[0] == '\0') {
+		if (cnp->cn_nameiop != LOOKUP || wantparent) {
+			error = EISDIR;
+			goto bad;
+		}
+		if (dp->v_type != VDIR) {
+			error = ENOTDIR;
+			goto bad;
+		}
+		if (!(cnp->cn_flags & LOCKLEAF))
+			VOP_UNLOCK(dp);
+		*vpp = dp;
+		if (cnp->cn_flags & SAVESTART)
+			panic("lookup: SAVESTART");
+		return (0);
+	}
+
+	if (cnp->cn_flags & ISDOTDOT)
+		panic ("relookup: lookup on dot-dot");
+
+	/*
+	 * We now have a segment name to search for, and a directory to search.
+	 */
+	if (error = VOP_LOOKUP(dp, vpp, cnp)) {
+#ifdef DIAGNOSTIC
+		if (*vpp != NULL)
+			panic("leaf should be empty");
+#endif
+		if (error != EJUSTRETURN)
+			goto bad;
+		/*
+		 * If creating and at end of pathname, then can consider
+		 * allowing file to be created.
+		 */
+		if (rdonly || (dvp->v_mount->mnt_flag & MNT_RDONLY)) {
+			error = EROFS;
+			goto bad;
+		}
+		/* ASSERT(dvp == ndp->ni_startdir) */
+		if (cnp->cn_flags & SAVESTART)
+			VREF(dvp);
+		/*
+		 * We return with ni_vp NULL to indicate that the entry
+		 * doesn't currently exist, leaving a pointer to the
+		 * (possibly locked) directory inode in ndp->ni_dvp.
+		 */
+		return (0);
+	}
+	dp = *vpp;	/* from here on dp is the vnode that was found, not the directory */
+
+#ifdef DIAGNOSTIC
+	/*
+	 * Check for symbolic link
+	 */
+	if (dp->v_type == VLNK && (cnp->cn_flags & FOLLOW))
+		panic ("relookup: symlink found.\n");
+#endif
+
+	/*
+	 * Check for read-only file systems.
+	 */
+	if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) {
+		/*
+		 * Disallow directory write attempts on read-only
+		 * file systems.
+		 */
+		if (rdonly || (dp->v_mount->mnt_flag & MNT_RDONLY) ||
+		    (wantparent &&
+		     (dvp->v_mount->mnt_flag & MNT_RDONLY))) {
+			error = EROFS;
+			goto bad2;
+		}
+	}
+	/* ASSERT(dvp == ndp->ni_startdir) */
+	if (cnp->cn_flags & SAVESTART)
+		VREF(dvp);
+
+	if (!wantparent)
+		vrele(dvp);
+	if ((cnp->cn_flags & LOCKLEAF) == 0)
+		VOP_UNLOCK(dp);
+	return (0);
+
+bad2:
+	if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN))
+		VOP_UNLOCK(dvp);
+	vrele(dvp);
+bad:
+	vput(dp);	/* dp is still dvp unless the lookup succeeded above */
+	*vpp = NULL;
+	return (error);
+}
+
+
+/*
+ * Rename system call.
+ * 	rename("foo", "bar");
+ * is essentially
+ *	unlink("bar");
+ *	link("foo", "bar");
+ *	unlink("foo");
+ * but ``atomically''.  Can't do full commit without saving state in the
+ * inode on disk which isn't feasible at this time.  Best we can do is
+ * always guarantee the target exists.
+ *
+ * Basic algorithm is:
+ *
+ * 1) Bump link count on source while we're linking it to the
+ *    target.  This also ensure the inode won't be deleted out
+ *    from underneath us while we work (it may be truncated by
+ *    a concurrent `trunc' or `open' for creation).
+ * 2) Link source to destination.  If destination already exists,
+ *    delete it first.
+ * 3) Unlink source reference to inode if still around. If a
+ *    directory was moved and the parent of the destination
+ *    is different from the source, patch the ".." entry in the
+ *    directory.
+ */
+int
+ufs_rename(ap)
+	struct vop_rename_args  /* {
+		struct vnode *a_fdvp;	(source directory)
+		struct vnode *a_fvp;	(source file or directory)
+		struct componentname *a_fcnp;
+		struct vnode *a_tdvp;	(target directory)
+		struct vnode *a_tvp;	(existing target, or NULL)
+		struct componentname *a_tcnp;
+	} */ *ap;
+{
+	struct vnode *tvp = ap->a_tvp;
+	register struct vnode *tdvp = ap->a_tdvp;
+	struct vnode *fvp = ap->a_fvp;
+	register struct vnode *fdvp = ap->a_fdvp;
+	register struct componentname *tcnp = ap->a_tcnp;
+	register struct componentname *fcnp = ap->a_fcnp;
+	register struct inode *ip, *xp, *dp;
+	struct dirtemplate dirbuf;
+	struct timeval tv;
+	int doingdirectory = 0, oldparent = 0, newparent = 0;
+	int error = 0;
+	u_char namlen;
+
+#ifdef DIAGNOSTIC
+	if ((tcnp->cn_flags & HASBUF) == 0 ||
+	    (fcnp->cn_flags & HASBUF) == 0)
+		panic("ufs_rename: no name");
+#endif
+	/*
+	 * Check for cross-device rename.
+	 * (The abortit label inside this block is the shared early-exit
+	 * path: it aborts both name lookups and drops all four vnodes.)
+	 */
+	if ((fvp->v_mount != tdvp->v_mount) ||
+	    (tvp && (fvp->v_mount != tvp->v_mount))) {
+		error = EXDEV;
+abortit:
+		VOP_ABORTOP(tdvp, tcnp); /* XXX, why not in NFS? */
+		if (tdvp == tvp)
+			vrele(tdvp);
+		else
+			vput(tdvp);
+		if (tvp)
+			vput(tvp);
+		VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */
+		vrele(fdvp);
+		vrele(fvp);
+		return (error);
+	}
+
+	/*
+	 * Check if just deleting a link name.
+	 */
+	if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) ||
+	    (VTOI(tdvp)->i_flags & APPEND))) {
+		error = EPERM;
+		goto abortit;
+	}
+	if (fvp == tvp) {	/* source and target are the same vnode: the rename becomes a remove of the target name */
+		if (fvp->v_type == VDIR) {
+			error = EINVAL;
+			goto abortit;
+		}
+		VOP_ABORTOP(fdvp, fcnp);
+		vrele(fdvp);
+		vrele(fvp);
+		vput(tdvp);
+		vput(tvp);
+		tcnp->cn_flags &= ~MODMASK;
+		tcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
+		if ((tcnp->cn_flags & SAVESTART) == 0)
+			panic("ufs_rename: lost from startdir");
+		tcnp->cn_nameiop = DELETE;
+		(void) relookup(tdvp, &tvp, tcnp);	/* status discarded */
+		return (VOP_REMOVE(tdvp, tvp, tcnp));
+	}
+	if (error = VOP_LOCK(fvp))
+		goto abortit;
+	dp = VTOI(fdvp);
+	ip = VTOI(fvp);
+	if ((ip->i_flags & (IMMUTABLE | APPEND)) || (dp->i_flags & APPEND)) {
+		VOP_UNLOCK(fvp);
+		error = EPERM;
+		goto abortit;
+	}
+	if ((ip->i_mode & IFMT) == IFDIR) {
+		/*
+		 * Avoid ".", "..", and aliases of "." for obvious reasons.
+		 */
+		if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
+		    dp == ip || (fcnp->cn_flags&ISDOTDOT) ||
+		    (ip->i_flag & IN_RENAME)) {
+			VOP_UNLOCK(fvp);
+			error = EINVAL;
+			goto abortit;
+		}
+		ip->i_flag |= IN_RENAME;
+		oldparent = dp->i_number;
+		doingdirectory++;
+	}
+	vrele(fdvp);
+
+	/*
+	 * When the target exists, both the directory
+	 * and target vnodes are returned locked.
+	 */
+	dp = VTOI(tdvp);
+	xp = NULL;
+	if (tvp)
+		xp = VTOI(tvp);
+
+	/*
+	 * 1) Bump link count while we're moving stuff
+	 *    around.  If we crash somewhere before
+	 *    completing our work, the link count
+	 *    may be wrong, but correctable.
+	 */
+	ip->i_nlink++;
+	ip->i_flag |= IN_CHANGE;
+	tv = time;
+	if (error = VOP_UPDATE(fvp, &tv, &tv, 1)) {
+		VOP_UNLOCK(fvp);
+		goto bad;
+	}
+
+	/*
+	 * If ".." must be changed (ie the directory gets a new
+	 * parent) then the source directory must not be in the
+	 * directory hierarchy above the target, as this would
+	 * orphan everything below the source directory. Also
+	 * the user must have write permission in the source so
+	 * as to be able to change "..". We must repeat the call
+	 * to namei, as the parent directory is unlocked by the
+	 * call to checkpath().
+	 */
+	error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc);
+	VOP_UNLOCK(fvp);
+	if (oldparent != dp->i_number)
+		newparent = dp->i_number;
+	if (doingdirectory && newparent) {
+		if (error)	/* write access check above */
+			goto bad;
+		if (xp != NULL)
+			vput(tvp);
+		if (error = ufs_checkpath(ip, dp, tcnp->cn_cred))
+			goto out;
+		if ((tcnp->cn_flags & SAVESTART) == 0)
+			panic("ufs_rename: lost to startdir");
+		if (error = relookup(tdvp, &tvp, tcnp))
+			goto out;
+		dp = VTOI(tdvp);
+		xp = NULL;
+		if (tvp)
+			xp = VTOI(tvp);
+	}
+	/*
+	 * 2) If target doesn't exist, link the target
+	 *    to the source and unlink the source.
+	 *    Otherwise, rewrite the target directory
+	 *    entry to reference the source inode and
+	 *    expunge the original entry's existence.
+	 */
+	if (xp == NULL) {
+		if (dp->i_dev != ip->i_dev)
+			panic("rename: EXDEV");
+		/*
+		 * Account for ".." in new directory.
+		 * When source and destination have the same
+		 * parent we don't fool with the link count.
+		 */
+		if (doingdirectory && newparent) {
+			if ((nlink_t)dp->i_nlink >= LINK_MAX) {
+				error = EMLINK;
+				goto bad;
+			}
+			dp->i_nlink++;
+			dp->i_flag |= IN_CHANGE;
+			if (error = VOP_UPDATE(tdvp, &tv, &tv, 1))
+				goto bad;
+		}
+		if (error = ufs_direnter(ip, tdvp, tcnp)) {
+			if (doingdirectory && newparent) {
+				dp->i_nlink--;
+				dp->i_flag |= IN_CHANGE;
+				(void)VOP_UPDATE(tdvp, &tv, &tv, 1);
+			}
+			goto bad;
+		}
+		vput(tdvp);
+	} else {
+		if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev)
+			panic("rename: EXDEV");
+		/*
+		 * Short circuit rename(foo, foo).
+		 */
+		if (xp->i_number == ip->i_number)
+			panic("rename: same file");
+		/*
+		 * If the parent directory is "sticky", then the user must
+		 * own the parent directory, or the destination of the rename,
+		 * otherwise the destination may not be changed (except by
+		 * root). This implements append-only directories.
+		 */
+		if ((dp->i_mode & S_ISTXT) && tcnp->cn_cred->cr_uid != 0 &&
+		    tcnp->cn_cred->cr_uid != dp->i_uid &&
+		    xp->i_uid != tcnp->cn_cred->cr_uid) {
+			error = EPERM;
+			goto bad;
+		}
+		/*
+		 * Target must be empty if a directory and have no links
+		 * to it. Also, ensure source and target are compatible
+		 * (both directories, or both not directories).
+		 */
+		if ((xp->i_mode&IFMT) == IFDIR) {
+			if (!ufs_dirempty(xp, dp->i_number, tcnp->cn_cred) || 
+			    xp->i_nlink > 2) {
+				error = ENOTEMPTY;
+				goto bad;
+			}
+			if (!doingdirectory) {
+				error = ENOTDIR;
+				goto bad;
+			}
+			cache_purge(tdvp);
+		} else if (doingdirectory) {
+			error = EISDIR;
+			goto bad;
+		}
+		if (error = ufs_dirrewrite(dp, ip, tcnp))
+			goto bad;
+		/*
+		 * If the target directory is in the same
+		 * directory as the source directory,
+		 * decrement the link count on the parent
+		 * of the target directory.
+		 */
+		 if (doingdirectory && !newparent) {
+			dp->i_nlink--;
+			dp->i_flag |= IN_CHANGE;
+		}
+		vput(tdvp);
+		/*
+		 * Adjust the link count of the target to
+		 * reflect the dirrewrite above.  If this is
+		 * a directory it is empty and there are
+		 * no links to it, so we can squash the inode and
+		 * any space associated with it.  We disallowed
+		 * renaming over top of a directory with links to
+		 * it above, as the remaining link would point to
+		 * a directory without "." or ".." entries.
+		 */
+		xp->i_nlink--;
+		if (doingdirectory) {
+			if (--xp->i_nlink != 0)
+				panic("rename: linked directory");
+			error = VOP_TRUNCATE(tvp, (off_t)0, IO_SYNC,
+			    tcnp->cn_cred, tcnp->cn_proc);
+		}
+		xp->i_flag |= IN_CHANGE;
+		vput(tvp);
+		xp = NULL;
+	}
+
+	/*
+	 * 3) Unlink the source.
+	 */
+	fcnp->cn_flags &= ~MODMASK;
+	fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
+	if ((fcnp->cn_flags & SAVESTART) == 0)
+		panic("ufs_rename: lost from startdir");
+	(void) relookup(fdvp, &fvp, fcnp);	/* status discarded; only fvp is examined below */
+	if (fvp != NULL) {
+		xp = VTOI(fvp);
+		dp = VTOI(fdvp);
+	} else {
+		/*
+		 * From name has disappeared.
+		 */
+		if (doingdirectory)
+			panic("rename: lost dir entry");
+		vrele(ap->a_fvp);
+		return (0);
+	}
+	/*
+	 * Ensure that the directory entry still exists and has not
+	 * changed while the new name has been entered. If the source is
+	 * a file then the entry may have been unlinked or renamed. In
+	 * either case there is no further work to be done. If the source
+	 * is a directory then it cannot have been rmdir'ed; its link
+	 * count of three would cause a rmdir to fail with ENOTEMPTY.
+	 * The IN_RENAME flag ensures that it cannot be moved by another
+	 * rename.
+	 */
+	if (xp != ip) {
+		if (doingdirectory)
+			panic("rename: lost dir entry");
+	} else {
+		/*
+		 * If the source is a directory with a
+		 * new parent, the link count of the old
+		 * parent directory must be decremented
+		 * and ".." set to point to the new parent.
+		 */
+		if (doingdirectory && newparent) {
+			dp->i_nlink--;
+			dp->i_flag |= IN_CHANGE;
+			error = vn_rdwr(UIO_READ, fvp, (caddr_t)&dirbuf,
+				sizeof (struct dirtemplate), (off_t)0,
+				UIO_SYSSPACE, IO_NODELOCKED, 
+				tcnp->cn_cred, (int *)0, (struct proc *)0);
+			if (error == 0) {
+#				if (BYTE_ORDER == LITTLE_ENDIAN)
+					if (fvp->v_mount->mnt_maxsymlinklen <= 0)
+						namlen = dirbuf.dotdot_type;
+					else
+						namlen = dirbuf.dotdot_namlen;
+#				else
+					namlen = dirbuf.dotdot_namlen;
+#				endif
+				if (namlen != 2 ||
+				    dirbuf.dotdot_name[0] != '.' ||
+				    dirbuf.dotdot_name[1] != '.') {
+					ufs_dirbad(xp, (doff_t)12,
+					    "rename: mangled dir");
+				} else {
+					dirbuf.dotdot_ino = newparent;
+					(void) vn_rdwr(UIO_WRITE, fvp,
+					    (caddr_t)&dirbuf,
+					    sizeof (struct dirtemplate),
+					    (off_t)0, UIO_SYSSPACE,
+					    IO_NODELOCKED|IO_SYNC,
+					    tcnp->cn_cred, (int *)0,
+					    (struct proc *)0);
+					cache_purge(fdvp);
+				}
+			}
+		}
+		error = ufs_dirremove(fdvp, fcnp);
+		if (!error) {
+			xp->i_nlink--;
+			xp->i_flag |= IN_CHANGE;
+		}
+		xp->i_flag &= ~IN_RENAME;
+	}
+	if (dp)
+		vput(fdvp);
+	if (xp)
+		vput(fvp);
+	vrele(ap->a_fvp);
+	return (error);
+
+bad:
+	if (xp)
+		vput(ITOV(xp));
+	vput(ITOV(dp));
+out:
+	if (VOP_LOCK(fvp) == 0) {
+		ip->i_nlink--;	/* take back the link-count bump from step 1 */
+		ip->i_flag |= IN_CHANGE;
+		vput(fvp);
+	} else
+		vrele(fvp);
+	return (error);
+}
+
+/*
+ * A virgin directory (no blushing please).
+ *
+ * Two layouts of the initial "." and ".." entries; ufs_mkdir picks
+ * mastertemplate when the mount's mnt_maxsymlinklen is greater than 0
+ * and omastertemplate otherwise.
+ */
+static struct dirtemplate mastertemplate = {
+	0,	12,		DT_DIR,	1,	".",
+	0,	DIRBLKSIZ - 12,	DT_DIR,	2,	".."
+};
+static struct odirtemplate omastertemplate = {
+	0,	12,		1,	".",
+	0,	DIRBLKSIZ - 12,	2,	".."
+};
+
+/*
+ * Mkdir system call
+ *
+ * Allocates an inode for the new directory, writes its "." and ".."
+ * block from one of the static templates, bumps the parent's link
+ * count and finally enters the name in the parent.  On success the
+ * new vnode is handed back through ap->a_vpp; on failure the new
+ * inode's link count is zeroed and an errno is returned.
+ */
+int
+ufs_mkdir(ap)
+	struct vop_mkdir_args /* {
+		struct vnode *a_dvp;
+		struct vnode **a_vpp;
+		struct componentname *a_cnp;
+		struct vattr *a_vap;
+	} */ *ap;
+{
+	register struct vnode *dvp = ap->a_dvp;
+	register struct vattr *vap = ap->a_vap;
+	register struct componentname *cnp = ap->a_cnp;
+	register struct inode *ip, *dp;
+	struct vnode *tvp;
+	struct dirtemplate dirtemplate, *dtp;
+	struct timeval tv;
+	int error, dmode;
+
+#ifdef DIAGNOSTIC
+	if ((cnp->cn_flags & HASBUF) == 0)
+		panic("ufs_mkdir: no name");
+#endif
+	dp = VTOI(dvp);
+	if ((nlink_t)dp->i_nlink >= LINK_MAX) {
+		error = EMLINK;
+		goto out;
+	}
+	dmode = vap->va_mode & 0777;
+	dmode |= IFDIR;
+	/*
+	 * Must simulate part of ufs_makeinode here to acquire the inode,
+	 * but not have it entered in the parent directory. The entry is
+	 * made later after writing "." and ".." entries.
+	 */
+	if (error = VOP_VALLOC(dvp, dmode, cnp->cn_cred, &tvp))
+		goto out;
+	ip = VTOI(tvp);
+	ip->i_uid = cnp->cn_cred->cr_uid;
+	ip->i_gid = dp->i_gid;	/* group is taken from the parent directory */
+#ifdef QUOTA
+	if ((error = getinoquota(ip)) ||
+	    (error = chkiq(ip, 1, cnp->cn_cred, 0))) {
+		free(cnp->cn_pnbuf, M_NAMEI);
+		VOP_VFREE(tvp, ip->i_number, dmode);
+		vput(tvp);
+		vput(dvp);
+		return (error);
+	}
+#endif
+	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+	ip->i_mode = dmode;
+	tvp->v_type = VDIR;	/* Rest init'd in getnewvnode(). */
+	ip->i_nlink = 2;
+	tv = time;
+	error = VOP_UPDATE(tvp, &tv, &tv, 1);	/* NOTE(review): this status is overwritten by the VOP_UPDATE(dvp) assignment below before it is ever tested */
+
+	/*
+	 * Bump link count in parent directory
+	 * to reflect work done below.  Should
+	 * be done before reference is created
+	 * so reparation is possible if we crash.
+	 */
+	dp->i_nlink++;
+	dp->i_flag |= IN_CHANGE;
+	if (error = VOP_UPDATE(dvp, &tv, &tv, 1))
+		goto bad;
+
+	/* Initialize directory with "." and ".." from static template. */
+	if (dvp->v_mount->mnt_maxsymlinklen > 0)
+		dtp = &mastertemplate;
+	else
+		dtp = (struct dirtemplate *)&omastertemplate;
+	dirtemplate = *dtp;
+	dirtemplate.dot_ino = ip->i_number;
+	dirtemplate.dotdot_ino = dp->i_number;
+	error = vn_rdwr(UIO_WRITE, tvp, (caddr_t)&dirtemplate,
+	    sizeof (dirtemplate), (off_t)0, UIO_SYSSPACE,
+	    IO_NODELOCKED|IO_SYNC, cnp->cn_cred, (int *)0, (struct proc *)0);
+	if (error) {
+		dp->i_nlink--;
+		dp->i_flag |= IN_CHANGE;
+		goto bad;
+	}
+	if (DIRBLKSIZ > VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_bsize)
+		panic("ufs_mkdir: blksize"); /* XXX should grow with balloc() */
+	else {
+		ip->i_size = DIRBLKSIZ;
+		ip->i_flag |= IN_CHANGE;
+	}
+
+	/* Directory set up, now install its entry in the parent directory. */
+	if (error = ufs_direnter(ip, dvp, cnp)) {
+		dp->i_nlink--;
+		dp->i_flag |= IN_CHANGE;
+	}
+bad:
+	/*
+	 * No need to do an explicit VOP_TRUNCATE here, vrele will do this
+	 * for us because we set the link count to 0.
+	 */
+	if (error) {
+		ip->i_nlink = 0;
+		ip->i_flag |= IN_CHANGE;
+		vput(tvp);
+	} else
+		*ap->a_vpp = tvp;
+out:
+	FREE(cnp->cn_pnbuf, M_NAMEI);
+	vput(dvp);
+	return (error);
+}
+
+/*
+ * Rmdir system call.
+ *
+ * Fails with EINVAL for ".", ENOTEMPTY unless the link count is
+ * exactly 2 and ufs_dirempty() agrees, and EPERM when the parent is
+ * APPEND or the directory itself is IMMUTABLE or APPEND.
+ */
+int
+ufs_rmdir(ap)
+	struct vop_rmdir_args /* {
+		struct vnode *a_dvp;
+		struct vnode *a_vp;
+		struct componentname *a_cnp;
+	} */ *ap;
+{
+	register struct vnode *vp = ap->a_vp;
+	register struct vnode *dvp = ap->a_dvp;
+	register struct componentname *cnp = ap->a_cnp;
+	register struct inode *ip, *dp;
+	int error;
+
+	ip = VTOI(vp);
+	dp = VTOI(dvp);
+	/*
+	 * No rmdir "." please.
+	 */
+	if (dp == ip) {
+		vrele(dvp);
+		vput(vp);
+		return (EINVAL);
+	}
+	/*
+	 * Verify the directory is empty (and valid).
+	 * (Rmdir ".." won't be valid since
+	 *  ".." will contain a reference to
+	 *  the current directory and thus be
+	 *  non-empty.)
+	 */
+	error = 0;
+	if (ip->i_nlink != 2 ||
+	    !ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) {
+		error = ENOTEMPTY;
+		goto out;
+	}
+	if ((dp->i_flags & APPEND) || (ip->i_flags & (IMMUTABLE | APPEND))) {
+		error = EPERM;
+		goto out;
+	}
+	/*
+	 * Delete reference to directory before purging
+	 * inode.  If we crash in between, the directory
+	 * will be reattached to lost+found.
+	 */
+	if (error = ufs_dirremove(dvp, cnp))
+		goto out;
+	dp->i_nlink--;
+	dp->i_flag |= IN_CHANGE;
+	cache_purge(dvp);
+	vput(dvp);
+	dvp = NULL;	/* so the out: path does not vput it a second time */
+	/*
+	 * Truncate inode.  The only stuff left
+	 * in the directory is "." and "..".  The
+	 * "." reference is inconsequential since
+	 * we're quashing it.  The ".." reference
+	 * has already been adjusted above.  We've
+	 * removed the "." reference and the reference
+	 * in the parent directory, but there may be
+	 * other hard links so decrement by 2 and
+	 * worry about them later.
+	 */
+	ip->i_nlink -= 2;
+	error = VOP_TRUNCATE(vp, (off_t)0, IO_SYNC, cnp->cn_cred,
+	    cnp->cn_proc);
+	cache_purge(ITOV(ip));
+out:
+	if (dvp)
+		vput(dvp);
+	vput(vp);
+	return (error);
+}
+
+/*
+ * symlink -- make a symbolic link
+ *
+ * The inode is created with ufs_makeinode().  A target shorter than
+ * the mount's mnt_maxsymlinklen is copied into the inode's i_shortlink
+ * area; anything longer is written out with vn_rdwr().
+ */
+int
+ufs_symlink(ap)
+	struct vop_symlink_args /* {
+		struct vnode *a_dvp;
+		struct vnode **a_vpp;
+		struct componentname *a_cnp;
+		struct vattr *a_vap;
+		char *a_target;
+	} */ *ap;
+{
+	register struct vnode *vp, **vpp = ap->a_vpp;
+	register struct inode *ip;
+	int len, error;
+
+	if (error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp,
+	    vpp, ap->a_cnp))
+		return (error);
+	vp = *vpp;
+	len = strlen(ap->a_target);
+	if (len < vp->v_mount->mnt_maxsymlinklen) {
+		ip = VTOI(vp);
+		bcopy(ap->a_target, (char *)ip->i_shortlink, len);
+		ip->i_size = len;
+		ip->i_flag |= IN_CHANGE | IN_UPDATE;
+	} else
+		error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0,
+		    UIO_SYSSPACE, IO_NODELOCKED, ap->a_cnp->cn_cred, (int *)0,
+		    (struct proc *)0);
+	vput(vp);
+	return (error);
+}
+
+/*
+ * Vnode op for reading directories.
+ * 
+ * The routine below assumes that the on-disk format of a directory
+ * is the same as that defined by sys/dirent.h. If the on-disk
+ * format changes, then it will be necessary to do a conversion
+ * from the on-disk format that read returns to the format defined
+ * by sys/dirent.h.
+ */
+int
+ufs_readdir(ap)
+	struct vop_readdir_args /* {
+		struct vnode *a_vp;
+		struct uio *a_uio;
+		struct ucred *a_cred;
+	} */ *ap;
+{
+	register struct uio *uio = ap->a_uio;
+	int count, lost, error;
+
+	/*
+	 * Only whole DIRBLKSIZ blocks are transferred: the request is
+	 * rounded down, and the bytes trimmed off ("lost") are added
+	 * back to uio_resid before returning.
+	 */
+	count = uio->uio_resid;
+	count &= ~(DIRBLKSIZ - 1);
+	lost = uio->uio_resid - count;
+	if (count < DIRBLKSIZ || (uio->uio_offset & (DIRBLKSIZ -1)))
+		return (EINVAL);
+	uio->uio_resid = count;
+	uio->uio_iov->iov_len = count;
+#	if (BYTE_ORDER == LITTLE_ENDIAN)
+		if (ap->a_vp->v_mount->mnt_maxsymlinklen > 0) {
+			error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred);
+		} else {
+			struct dirent *dp, *edp;
+			struct uio auio;
+			struct iovec aiov;
+			caddr_t dirbuf;
+			int readcnt;
+			u_char tmp;
+
+			/*
+			 * Read into a kernel buffer first, swap the
+			 * d_namlen and d_type bytes of every entry,
+			 * then copy the converted block out.
+			 */
+			auio = *uio;
+			auio.uio_iov = &aiov;
+			auio.uio_iovcnt = 1;
+			auio.uio_segflg = UIO_SYSSPACE;
+			aiov.iov_len = count;
+			MALLOC(dirbuf, caddr_t, count, M_TEMP, M_WAITOK);
+			aiov.iov_base = dirbuf;
+			error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred);
+			if (error == 0) {
+				readcnt = count - auio.uio_resid;
+				edp = (struct dirent *)&dirbuf[readcnt];
+				for (dp = (struct dirent *)dirbuf; dp < edp; ) {
+					tmp = dp->d_namlen;
+					dp->d_namlen = dp->d_type;
+					dp->d_type = tmp;
+					if (dp->d_reclen > 0) {
+						dp = (struct dirent *)
+						    ((char *)dp + dp->d_reclen);
+					} else {
+						/* a zero record length would loop forever */
+						error = EIO;
+						break;
+					}
+				}
+				if (dp >= edp)
+					error = uiomove(dirbuf, readcnt, uio);
+			}
+			FREE(dirbuf, M_TEMP);
+		}
+#	else
+		error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred);
+#	endif
+	uio->uio_resid += lost;
+	return (error);
+}
+
+/*
+ * Return target name of a symbolic link
+ *
+ * A link whose size is below the mount's mnt_maxsymlinklen is copied
+ * straight out of the inode's i_shortlink area; longer links are read
+ * with VOP_READ.  Returns 0 or the errno of the copy/read.
+ */
+int
+ufs_readlink(ap)
+	struct vop_readlink_args /* {
+		struct vnode *a_vp;
+		struct uio *a_uio;
+		struct ucred *a_cred;
+	} */ *ap;
+{
+	register struct vnode *vp = ap->a_vp;
+	register struct inode *ip = VTOI(vp);
+	int isize;
+
+	isize = ip->i_size;
+	/*
+	 * Hand back uiomove()'s status so that a failed copy-out is
+	 * reported to the caller instead of being turned into success.
+	 */
+	if (isize < vp->v_mount->mnt_maxsymlinklen)
+		return (uiomove((char *)ip->i_shortlink, isize, ap->a_uio));
+	return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred));
+}
+
+/*
+ * Ufs
abort op, called after namei() when a CREATE/DELETE isn't actually
+ * done. If a buffer has been saved in anticipation of a CREATE, delete it.
+ *
+ * The pathname buffer is freed only when HASBUF is set and SAVESTART
+ * is clear.  Always returns 0.
+ */
+/* ARGSUSED */
+int
+ufs_abortop(ap)
+	struct vop_abortop_args /* {
+		struct vnode *a_dvp;
+		struct componentname *a_cnp;
+	} */ *ap;
+{
+	if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF)
+		FREE(ap->a_cnp->cn_pnbuf, M_NAMEI);
+	return (0);
+}
+
+/*
+ * Lock an inode. If its already locked, set the WANT bit and sleep.
+ *
+ * Sleeps first while the vnode has VXLOCK set, then while the inode
+ * has IN_LOCKED set, restarting from the top after every sleep.
+ * Returns ENOENT if the vnode's tag has become VT_NON, otherwise 0
+ * with IN_LOCKED set.
+ */
+int
+ufs_lock(ap)
+	struct vop_lock_args /* {
+		struct vnode *a_vp;
+	} */ *ap;
+{
+	register struct vnode *vp = ap->a_vp;
+	register struct inode *ip;
+	struct proc *p = curproc;	/* XXX */
+
+start:
+	while (vp->v_flag & VXLOCK) {
+		vp->v_flag |= VXWANT;
+		sleep((caddr_t)vp, PINOD);
+	}
+	if (vp->v_tag == VT_NON)
+		return (ENOENT);
+	ip = VTOI(vp);
+	if (ip->i_flag & IN_LOCKED) {
+		ip->i_flag |= IN_WANTED;
+#ifdef DIAGNOSTIC
+		if (p) {
+			if (p->p_pid == ip->i_lockholder)
+				panic("locking against myself");
+			ip->i_lockwaiter = p->p_pid;
+		} else
+			ip->i_lockwaiter = -1;
+#endif
+		(void) sleep((caddr_t)ip, PINOD);
+		goto start;	/* re-run every check from the top after waking */
+	}
+#ifdef DIAGNOSTIC
+	ip->i_lockwaiter = 0;
+	if (ip->i_lockholder != 0)
+		panic("lockholder (%d) != 0", ip->i_lockholder);
+	if (p && p->p_pid == 0)
+		printf("locking by process 0\n");
+	if (p)
+		ip->i_lockholder = p->p_pid;
+	else
+		ip->i_lockholder = -1;
+#endif
+	ip->i_flag |= IN_LOCKED;
+	return (0);
+}
+
+/*
+ * Unlock an inode.  If WANT bit is on, wakeup.
+ *
+ * lockcount starts at 90 and is incremented by the DIAGNOSTIC holder
+ * check below each time that check reaches the "lockcount++ < 100"
+ * term, so the "unlocker != lock holder" panic can fire for at most
+ * the first ten such mismatches.
+ */
+int lockcount = 90;
+int
+ufs_unlock(ap)
+	struct vop_unlock_args /* {
+		struct vnode *a_vp;
+	} */ *ap;
+{
+	register struct inode *ip = VTOI(ap->a_vp);
+	struct proc *p = curproc;	/* XXX */
+
+#ifdef DIAGNOSTIC
+	if ((ip->i_flag & IN_LOCKED) == 0) {
+		vprint("ufs_unlock: unlocked inode", ap->a_vp);
+		panic("ufs_unlock NOT LOCKED");
+	}
+	if (p && p->p_pid != ip->i_lockholder && p->p_pid > -1 &&
+	    ip->i_lockholder > -1 && lockcount++ < 100)
+		panic("unlocker (%d) != lock holder (%d)",
+		    p->p_pid, ip->i_lockholder);
+	ip->i_lockholder = 0;
+#endif
+	ip->i_flag &= ~IN_LOCKED;
+	if (ip->i_flag & IN_WANTED) {
+		ip->i_flag &= ~IN_WANTED;
+		wakeup((caddr_t)ip);	/* same channel ufs_lock sleeps on */
+	}
+	return (0);
+}
+
+/*
+ * Check for a locked inode.
+ * Returns 1 when IN_LOCKED is set in the inode flags, 0 otherwise.
+ */
+int
+ufs_islocked(ap)
+	struct vop_islocked_args /* {
+		struct vnode *a_vp;
+	} */ *ap;
+{
+
+	if (VTOI(ap->a_vp)->i_flag & IN_LOCKED)
+		return (1);
+	return (0);
+}
+
+/*
+ * Calculate the logical to physical mapping if not done already,
+ * then call the device strategy routine.
+ *
+ * b_blkno == b_lblkno is taken to mean "not mapped yet".  A mapped
+ * block number of -1 makes the buffer be cleared and completed
+ * without calling the device.  Returns the VOP_BMAP error if mapping
+ * fails, otherwise 0; the status of the device strategy call is not
+ * passed back.
+ */
+int
+ufs_strategy(ap)
+	struct vop_strategy_args /* {
+		struct buf *a_bp;
+	} */ *ap;
+{
+	register struct buf *bp = ap->a_bp;
+	register struct vnode *vp = bp->b_vp;
+	register struct inode *ip;
+	int error;
+
+	ip = VTOI(vp);
+	if (vp->v_type == VBLK || vp->v_type == VCHR)
+		panic("ufs_strategy: spec");
+	if (bp->b_blkno == bp->b_lblkno) {
+		if (error =
+		    VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL)) {
+			bp->b_error = error;
+			bp->b_flags |= B_ERROR;
+			biodone(bp);
+			return (error);
+		}
+		if ((long)bp->b_blkno == -1)
+			clrbuf(bp);
+	}
+	if ((long)bp->b_blkno == -1) {
+		biodone(bp);
+		return (0);
+	}
+	vp = ip->i_devvp;	/* from here on vp is the device vnode recorded in the inode */
+	bp->b_dev = vp->v_rdev;
+	VOCALL (vp->v_op, VOFFSET(vop_strategy), ap);
+	return (0);
+}
+
+/*
+ * Print out the contents of an inode.
+ *
+ * Prints the inode number and the major/minor device numbers, the fifo
+ * details when the vnode is a fifo (FIFO kernels only), and whether the
+ * inode is locked.  When a lock holder is recorded, a second line gives
+ * its pid and the pid of any recorded waiter.  Always returns 0.
+ */
+int
+ufs_print(ap)
+	struct vop_print_args /* {
+		struct vnode *a_vp;
+	} */ *ap;
+{
+	register struct vnode *vnp = ap->a_vp;
+	register struct inode *node = VTOI(vnp);
+
+	printf("tag VT_UFS, ino %d, on dev %d, %d", node->i_number,
+		major(node->i_dev), minor(node->i_dev));
+#ifdef FIFO
+	if (vnp->v_type == VFIFO)
+		fifo_printinfo(vnp);
+#endif /* FIFO */
+	if (node->i_flag & IN_LOCKED)
+		printf("%s\n", " (LOCKED)");
+	else
+		printf("%s\n", "");
+	if (node->i_lockholder != 0) {
+		printf("\towner pid %d", node->i_lockholder);
+		if (node->i_lockwaiter != 0)
+			printf(" waiting pid %d", node->i_lockwaiter);
+		printf("\n");
+	}
+	return (0);
+}
+
+/*
+ * Read wrapper for special devices.
+ * Marks the inode as accessed, then hands the request to the special
+ * file read operation and returns its result.
+ */
+int
+ufsspec_read(ap)
+	struct vop_read_args /* {
+		struct vnode *a_vp;
+		struct uio *a_uio;
+		int  a_ioflag;
+		struct ucred *a_cred;
+	} */ *ap;
+{
+	register struct inode *node = VTOI(ap->a_vp);
+
+	/* Set access flag. */
+	node->i_flag |= IN_ACCESS;
+	return (VOCALL (spec_vnodeop_p, VOFFSET(vop_read), ap));
+}
+
+/*
+ * Write wrapper for special devices.
+ * Marks the inode as changed and updated, then hands the request to the
+ * special file write operation and returns its result.
+ */
+int
+ufsspec_write(ap)
+	struct vop_write_args /* {
+		struct vnode *a_vp;
+		struct uio *a_uio;
+		int  a_ioflag;
+		struct ucred *a_cred;
+	} */ *ap;
+{
+	register struct inode *node = VTOI(ap->a_vp);
+
+	/* Set update and change flags. */
+	node->i_flag |= IN_CHANGE | IN_UPDATE;
+	return (VOCALL (spec_vnodeop_p, VOFFSET(vop_write), ap));
+}
+
+/*
+ * Close wrapper for special devices.
+ *
+ * Update the times on the inode then do device close.
+ *
+ * The times are only brought up to date (ITIMES) when other references to
+ * the vnode remain (v_usecount > 1) and the inode is not locked.  Returns
+ * whatever the special file close operation returns.
+ */
+int
+ufsspec_close(ap)
+	struct vop_close_args /* {
+		struct vnode *a_vp;
+		int  a_fflag;
+		struct ucred *a_cred;
+		struct proc *a_p;
+	} */ *ap;
+{
+	register struct inode *ip = VTOI(ap->a_vp);
+
+	if (ap->a_vp->v_usecount > 1 && !(ip->i_flag & IN_LOCKED))
+		ITIMES(ip, &time, &time);
+	return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap));
+}
+
+#ifdef FIFO
+/*
+ * Read wrapper for fifo's
+ *
+ * Marks the inode as accessed, then returns the result of the fifo read
+ * operation.
+ */
+int
+ufsfifo_read(ap)
+	struct vop_read_args /* {
+		struct vnode *a_vp;
+		struct uio *a_uio;
+		int  a_ioflag;
+		struct ucred *a_cred;
+	} */ *ap;
+{
+	extern int (**fifo_vnodeop_p)();
+
+	/*
+	 * Set access flag.
+	 */
+	VTOI(ap->a_vp)->i_flag |= IN_ACCESS;
+	return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_read), ap));
+}
+
+/*
+ * Write wrapper for fifo's.
+ *
+ * Marks the inode as changed and updated, then returns the result of the
+ * fifo write operation.
+ */
+int
+ufsfifo_write(ap)
+	struct vop_write_args /* {
+		struct vnode *a_vp;
+		struct uio *a_uio;
+		int  a_ioflag;
+		struct ucred *a_cred;
+	} */ *ap;
+{
+	extern int (**fifo_vnodeop_p)();
+
+	/*
+	 * Set update and change flags.
+	 */
+	VTOI(ap->a_vp)->i_flag |= IN_CHANGE | IN_UPDATE;
+	return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_write), ap));
+}
+
+/*
+ * Close wrapper for fifo's.
+ *
+ * Update the times on the inode then do device close.
+ *
+ * As with ufsspec_close(), the times are only updated when other
+ * references remain and the inode is not locked.  The return type is
+ * spelled out (it used to rely on implicit int) so that the definition
+ * matches the other wrappers in this file.
+ */
+int
+ufsfifo_close(ap)
+	struct vop_close_args /* {
+		struct vnode *a_vp;
+		int  a_fflag;
+		struct ucred *a_cred;
+		struct proc *a_p;
+	} */ *ap;
+{
+	extern int (**fifo_vnodeop_p)();
+	register struct inode *ip = VTOI(ap->a_vp);
+
+	if (ap->a_vp->v_usecount > 1 && !(ip->i_flag & IN_LOCKED))
+		ITIMES(ip, &time, &time);
+	return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap));
+}
+#endif /* FIFO */
+
+/*
+ * Return POSIX pathconf information applicable to ufs filesystems.
+ *
+ * Stores the value for the requested name in *a_retval and returns 0;
+ * names that are not handled here yield EINVAL.
+ */
+int
+ufs_pathconf(ap)
+	struct vop_pathconf_args /* {
+		struct vnode *a_vp;
+		int a_name;
+		int *a_retval;
+	} */ *ap;
+{
+
+	switch (ap->a_name) {
+	case _PC_LINK_MAX:
+		*ap->a_retval = LINK_MAX;
+		return (0);
+	case _PC_NAME_MAX:
+		*ap->a_retval = NAME_MAX;
+		return (0);
+	case _PC_PATH_MAX:
+		*ap->a_retval = PATH_MAX;
+		return (0);
+	case _PC_PIPE_BUF:
+		*ap->a_retval = PIPE_BUF;
+		return (0);
+	case _PC_CHOWN_RESTRICTED:
+		*ap->a_retval = 1;
+		return (0);
+	case _PC_NO_TRUNC:
+		*ap->a_retval = 1;
+		return (0);
+	default:
+		return (EINVAL);
+	}
+	/* NOTREACHED */
+}
+
+/*
+ * Advisory record locking support
+ *
+ * Translates the flock request into a byte range [start, end] (end is -1
+ * for "to end of file", i.e. l_len == 0), wraps it in a freshly allocated
+ * lockf structure and hands it to lf_setlock(), lf_clearlock() or
+ * lf_getlock() according to a_op.  The structure is freed here for every
+ * operation except F_SETLK, where it is passed on to lf_setlock().
+ *
+ * Returns EINVAL for an unknown l_whence, a negative start, a negative
+ * l_len or an unknown a_op; otherwise the result of the lf_* routine.
+ */
+int
+ufs_advlock(ap)
+	struct vop_advlock_args /* {
+		struct vnode *a_vp;
+		caddr_t  a_id;
+		int  a_op;
+		struct flock *a_fl;
+		int  a_flags;
+	} */ *ap;
+{
+	register struct inode *ip = VTOI(ap->a_vp);
+	register struct flock *fl = ap->a_fl;
+	register struct lockf *lock;
+	off_t start, end;
+	int error;
+
+	/*
+	 * Avoid the common case of unlocking when inode has no locks.
+	 */
+	if (ip->i_lockf == (struct lockf *)0) {
+		if (ap->a_op != F_SETLK) {
+			fl->l_type = F_UNLCK;
+			return (0);
+		}
+	}
+	/*
+	 * Convert the flock structure into a start and end.
+	 */
+	switch (fl->l_whence) {
+
+	case SEEK_SET:
+	case SEEK_CUR:
+		/*
+		 * Caller is responsible for adding any necessary offset
+		 * when SEEK_CUR is used.
+		 */
+		start = fl->l_start;
+		break;
+
+	case SEEK_END:
+		start = ip->i_size + fl->l_start;
+		break;
+
+	default:
+		return (EINVAL);
+	}
+	if (start < 0)
+		return (EINVAL);
+	/*
+	 * A negative length would produce a range whose end lies before
+	 * its start; reject it rather than hand such a range to the
+	 * lock list code.
+	 */
+	if (fl->l_len < 0)
+		return (EINVAL);
+	if (fl->l_len == 0)
+		end = -1;
+	else
+		end = start + fl->l_len - 1;
+	/*
+	 * Create the lockf structure
+	 */
+	MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
+	lock->lf_start = start;
+	lock->lf_end = end;
+	lock->lf_id = ap->a_id;
+	lock->lf_inode = ip;
+	lock->lf_type = fl->l_type;
+	lock->lf_next = (struct lockf *)0;
+	lock->lf_block = (struct lockf *)0;
+	lock->lf_flags = ap->a_flags;
+	/*
+	 * Do the requested operation.
+	 */
+	switch(ap->a_op) {
+	case F_SETLK:
+		return (lf_setlock(lock));
+
+	case F_UNLCK:
+		error = lf_clearlock(lock);
+		FREE(lock, M_LOCKF);
+		return (error);
+
+	case F_GETLK:
+		error = lf_getlock(lock, fl);
+		FREE(lock, M_LOCKF);
+		return (error);
+
+	default:
+		/* Released with FREE, like the other cases. */
+		FREE(lock, M_LOCKF);
+		return (EINVAL);
+	}
+	/* NOTREACHED */
+}
+
+/*
+ * Initialize the vnode associated with a new inode, handle aliased
+ * vnodes.
+ *
+ * The vnode type is derived from the inode mode.  Device vnodes get the
+ * specops vector; if checkalias() reports an existing alias, the inode is
+ * moved over to that vnode (rehashing it) and the new vnode is discarded.
+ * Fifo vnodes get the fifoops vector, or EOPNOTSUPP is returned when the
+ * kernel is built without FIFO.  The root inode's vnode is flagged VROOT
+ * and the inode's modrev is seeded from mono_time.  On success *vpp holds
+ * the vnode to use (possibly the alias) and 0 is returned.
+ */
+int
+ufs_vinit(mntp, specops, fifoops, vpp)
+	struct mount *mntp;
+	int (**specops)();
+	int (**fifoops)();
+	struct vnode **vpp;
+{
+	struct inode *ip;
+	struct vnode *vp, *nvp;
+
+	vp = *vpp;
+	ip = VTOI(vp);
+	switch(vp->v_type = IFTOVT(ip->i_mode)) {
+	case VCHR:
+	case VBLK:
+		vp->v_op = specops;
+		if ((nvp = checkalias(vp, ip->i_rdev, mntp)) != NULL) {
+			/*
+			 * Discard unneeded vnode, but save its inode.
+			 */
+			ufs_ihashrem(ip);
+			VOP_UNLOCK(vp);
+			nvp->v_data = vp->v_data;
+			vp->v_data = NULL;
+			vp->v_op = spec_vnodeop_p;
+			vrele(vp);
+			vgone(vp);
+			/*
+			 * Reinitialize aliased inode.
+			 */
+			vp = nvp;
+			ip->i_vnode = vp;
+			ufs_ihashins(ip);
+		}
+		break;
+	case VFIFO:
+#ifdef FIFO
+		vp->v_op = fifoops;
+		break;
+#else
+		return (EOPNOTSUPP);
+#endif
+	}
+	if (ip->i_number == ROOTINO)
+                vp->v_flag |= VROOT;
+	/*
+	 * Initialize modrev times
+	 */
+	SETHIGH(ip->i_modrev, mono_time.tv_sec);
+	SETLOW(ip->i_modrev, mono_time.tv_usec * 4294);
+	*vpp = vp;
+	return (0);
+}
+
+/*
+ * Allocate a new inode.
+ *
+ * Allocates an inode in directory dvp with VOP_VALLOC (the mode defaults
+ * to IFREG when it carries no type bits), gives it the directory's group
+ * and either the caller's uid or, for symbolic links, the directory's
+ * uid, writes it out with VOP_UPDATE and only then enters the name with
+ * ufs_direnter().  The set-gid bit is dropped when groupmember() reports
+ * the caller is not in the file's group and suser() returns non-zero.
+ *
+ * On success *vpp is the new vnode and 0 is returned; the pathname buffer
+ * is kept only if SAVESTART is set.  On any failure *vpp stays NULL, the
+ * pathname buffer is freed and the error is returned; if the inode had
+ * already been set up its link count is zeroed before the vnode is
+ * released.  dvp is released with vput() on every path.
+ */
+int
+ufs_makeinode(mode, dvp, vpp, cnp)
+	int mode;
+	struct vnode *dvp;
+	struct vnode **vpp;
+	struct componentname *cnp;
+{
+	register struct inode *ip, *pdir;
+	struct timeval tv;
+	struct vnode *tvp;
+	int error;
+
+	pdir = VTOI(dvp);
+#ifdef DIAGNOSTIC
+	if ((cnp->cn_flags & HASBUF) == 0)
+		panic("ufs_makeinode: no name");
+#endif
+	*vpp = NULL;
+	if ((mode & IFMT) == 0)
+		mode |= IFREG;
+
+	if (error = VOP_VALLOC(dvp, mode, cnp->cn_cred, &tvp)) {
+		free(cnp->cn_pnbuf, M_NAMEI);
+		vput(dvp);
+		return (error);
+	}
+	ip = VTOI(tvp);
+	ip->i_gid = pdir->i_gid;
+	if ((mode & IFMT) == IFLNK)
+		ip->i_uid = pdir->i_uid;
+	else
+		ip->i_uid = cnp->cn_cred->cr_uid;
+#ifdef QUOTA
+	if ((error = getinoquota(ip)) ||
+	    (error = chkiq(ip, 1, cnp->cn_cred, 0))) {
+		free(cnp->cn_pnbuf, M_NAMEI);
+		VOP_VFREE(tvp, ip->i_number, mode);
+		vput(tvp);
+		vput(dvp);
+		return (error);
+	}
+#endif
+	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+	ip->i_mode = mode;
+	tvp->v_type = IFTOVT(mode);	/* Rest init'd in getnewvnode(). */
+	ip->i_nlink = 1;
+	if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) &&
+	    suser(cnp->cn_cred, NULL))
+		ip->i_mode &= ~ISGID;
+
+	/*
+	 * Make sure inode goes to disk before directory entry.
+	 */
+	tv = time;
+	if (error = VOP_UPDATE(tvp, &tv, &tv, 1))
+		goto bad;
+	if (error = ufs_direnter(ip, dvp, cnp))
+		goto bad;
+	if ((cnp->cn_flags & SAVESTART) == 0)
+		FREE(cnp->cn_pnbuf, M_NAMEI);
+	vput(dvp);
+	*vpp = tvp;
+	return (0);
+
+bad:
+	/*
+	 * Write error occurred trying to update the inode
+	 * or the directory so must deallocate the inode.
+	 */
+	free(cnp->cn_pnbuf, M_NAMEI);
+	vput(dvp);
+	ip->i_nlink = 0;
+	ip->i_flag |= IN_CHANGE;
+	vput(tvp);
+	return (error);
+}
diff --git a/sys/ufs/ufs/ufsmount.h b/sys/ufs/ufs/ufsmount.h
new file mode 100644
index 00000000000..237871fdaac
--- /dev/null
+++ b/sys/ufs/ufs/ufsmount.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ *	The Regents of the University of California.
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ *	@(#)ufsmount.h	8.2 (Berkeley) 1/12/94
+ */
+
+struct buf;
+struct inode;
+struct nameidata;
+struct timeval;
+struct ucred;
+struct uio;
+struct vnode;
+struct netexport;
+
+/*
+ * This structure describes the UFS specific mount structure data.
+ * The superblock pointer is a union, reached through um_fs (FFS) or
+ * um_lfs (LFS).  The quota-related arrays have MAXQUOTAS entries each.
+ */
+struct ufsmount {
+	struct	mount *um_mountp;		/* filesystem vfs structure */
+	dev_t	um_dev;				/* device mounted */
+	struct	vnode *um_devvp;		/* block device mounted vnode */
+	union {					/* pointer to superblock */
+		struct	lfs *lfs;		/* LFS */
+		struct	fs *fs;			/* FFS */
+	} ufsmount_u;
+#define	um_fs	ufsmount_u.fs
+#define	um_lfs	ufsmount_u.lfs
+	struct	vnode *um_quotas[MAXQUOTAS];	/* pointer to quota files */
+	struct	ucred *um_cred[MAXQUOTAS];	/* quota file access cred */
+	u_long	um_nindir;			/* indirect ptrs per block */
+	u_long	um_bptrtodb;			/* indir ptr to disk block */
+	u_long	um_seqinc;			/* inc between seq blocks */
+	time_t	um_btime[MAXQUOTAS];		/* block quota time limit */
+	time_t	um_itime[MAXQUOTAS];		/* inode quota time limit */
+	char	um_qflags[MAXQUOTAS];		/* quota specific flags */
+	struct	netexport um_export;		/* export information */
+};
+/*
+ * Flags describing the state of quotas.
+ */
+#define	QTF_OPENING	0x01			/* Q_QUOTAON in progress */
+#define	QTF_CLOSING	0x02			/* Q_QUOTAOFF in progress */
+
+/* Convert mount ptr to ufsmount ptr. */
+#define VFSTOUFS(mp)	((struct ufsmount *)((mp)->mnt_data))
+
+/*
+ * Macros to access file system parameters in the ufsmount structure.
+ * Used by ufs_bmap.
+ *
+ * blkptrtodb() shifts a block pointer left by um_bptrtodb;
+ * is_sequential() is true when b equals a plus um_seqinc; MNINDIR()
+ * yields um_nindir.  Every macro argument is parenthesized so that any
+ * expression may be passed for ump.
+ */
+#define	blkptrtodb(ump, b)		((b) << (ump)->um_bptrtodb)
+#define	is_sequential(ump, a, b)	((b) == (a) + (ump)->um_seqinc)
+#define MNINDIR(ump)			((ump)->um_nindir)
+
+
diff --git a/sys/vm/device_pager.c b/sys/vm/device_pager.c
new file mode 100644
index 00000000000..235c917a0c6
--- /dev/null
+++ b/sys/vm/device_pager.c
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 1990 University of Utah.
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California.
+ * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)device_pager.c 8.5 (Berkeley) 1/12/94 + */ + +/* + * Page to/from special files. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+struct pagerlst	dev_pager_list;		/* list of managed devices */
+struct pglist	dev_pager_fakelist;	/* list of available vm_page_t's */
+
+#ifdef DEBUG
+int	dpagerdebug = 0;
+#define	DDB_FOLLOW	0x01
+#define DDB_INIT	0x02
+#define DDB_ALLOC	0x04
+#define DDB_FAIL	0x08
+#endif
+
+static vm_pager_t	 dev_pager_alloc
+			    __P((caddr_t, vm_size_t, vm_prot_t, vm_offset_t));
+static void		 dev_pager_dealloc __P((vm_pager_t));
+static int		 dev_pager_getpage
+			    __P((vm_pager_t, vm_page_t *, int, boolean_t));
+static boolean_t	 dev_pager_haspage __P((vm_pager_t, vm_offset_t));
+static void		 dev_pager_init __P((void));
+static int		 dev_pager_putpage
+			    __P((vm_pager_t, vm_page_t *, int, boolean_t));
+static vm_page_t	 dev_pager_getfake __P((vm_offset_t));
+static void		 dev_pager_putfake __P((vm_page_t));
+
+/* Operations vector through which the VM system drives this pager. */
+struct pagerops devicepagerops = {
+	dev_pager_init,
+	dev_pager_alloc,
+	dev_pager_dealloc,
+	dev_pager_getpage,
+	dev_pager_putpage,
+	dev_pager_haspage,
+	vm_pager_clusternull
+};
+
+/*
+ * Set up the (initially empty) list of device pagers and the free list
+ * of fake pages.
+ */
+static void
+dev_pager_init()
+{
+#ifdef DEBUG
+	if (dpagerdebug & DDB_FOLLOW)
+		printf("dev_pager_init()\n");
+#endif
+	TAILQ_INIT(&dev_pager_list);
+	TAILQ_INIT(&dev_pager_fakelist);
+}
+
+/*
+ * Find or create the pager for the device identified by handle.
+ *
+ * Returns NULL when the device has no usable d_mmap entry, when foff is
+ * not page aligned, or when the device's map function rejects (returns
+ * -1 for) any page of the requested range with the requested protection.
+ * Otherwise returns the pager, creating it, its private data and its VM
+ * object on first use.
+ */
+static vm_pager_t
+dev_pager_alloc(handle, size, prot, foff)
+	caddr_t handle;
+	vm_size_t size;
+	vm_prot_t prot;
+	vm_offset_t foff;
+{
+	dev_t dev;
+	vm_pager_t pager;
+	int (*mapfunc)();
+	vm_object_t object;
+	dev_pager_t devp;
+	int npages, off;
+
+#ifdef DEBUG
+	if (dpagerdebug & DDB_FOLLOW)
+		printf("dev_pager_alloc(%x, %x, %x, %x)\n",
+		       handle, size, prot, foff);
+#endif
+#ifdef DIAGNOSTIC
+	/*
+	 * Pageout to device, should never happen.
+	 */
+	if (handle == NULL)
+		panic("dev_pager_alloc called");
+#endif
+
+	/*
+	 * Make sure this device can be mapped.
+	 */
+	dev = (dev_t)handle;
+	mapfunc = cdevsw[major(dev)].d_mmap;
+	if (mapfunc == NULL || mapfunc == enodev || mapfunc == nullop)
+		return(NULL);
+
+	/*
+	 * Offset should be page aligned.
+	 */
+	if (foff & PAGE_MASK)
+		return(NULL);
+
+	/*
+	 * Check that the specified range of the device allows the
+	 * desired protection.
+	 *
+	 * XXX assumes VM_PROT_* == PROT_*
+	 */
+	npages = atop(round_page(size));
+	for (off = foff; npages--; off += PAGE_SIZE)
+		if ((*mapfunc)(dev, off, (int)prot) == -1)
+			return(NULL);
+
+	/*
+	 * Look up pager, creating as necessary.
+	 */
+top:
+	pager = vm_pager_lookup(&dev_pager_list, handle);
+	if (pager == NULL) {
+		/*
+		 * Allocate and initialize pager structs
+		 */
+		pager = (vm_pager_t)malloc(sizeof *pager, M_VMPAGER, M_WAITOK);
+		if (pager == NULL)
+			return(NULL);
+		devp = (dev_pager_t)malloc(sizeof *devp, M_VMPGDATA, M_WAITOK);
+		if (devp == NULL) {
+			free((caddr_t)pager, M_VMPAGER);
+			return(NULL);
+		}
+		pager->pg_handle = handle;
+		pager->pg_ops = &devicepagerops;
+		pager->pg_type = PG_DEVICE;
+		pager->pg_flags = 0;
+		pager->pg_data = devp;
+		TAILQ_INIT(&devp->devp_pglist);
+		/*
+		 * Allocate object and associate it with the pager.
+		 */
+		object = devp->devp_object = vm_object_allocate(0);
+		vm_object_enter(object, pager);
+		vm_object_setpager(object, pager, (vm_offset_t)0, FALSE);
+		/*
+		 * Finally, put it on the managed list so other can find it.
+		 * First we re-lookup in case someone else beat us to this
+		 * point (due to blocking in the various mallocs).  If so,
+		 * we free everything and start over.
+		 *
+		 * NOTE(review): only devp and pager are released on this
+		 * path; the object allocated and entered above is not.
+		 * Confirm whether it needs to be torn down here too.
+		 */
+		if (vm_pager_lookup(&dev_pager_list, handle)) {
+			free((caddr_t)devp, M_VMPGDATA);
+			free((caddr_t)pager, M_VMPAGER);
+			goto top;
+		}
+		TAILQ_INSERT_TAIL(&dev_pager_list, pager, pg_list);
+#ifdef DEBUG
+		if (dpagerdebug & DDB_ALLOC) {
+			printf("dev_pager_alloc: pager %x devp %x object %x\n",
+			       pager, devp, object);
+			vm_object_print(object, FALSE);
+		}
+#endif
+	} else {
+		/*
+		 * vm_object_lookup() gains a reference and also
+		 * removes the object from the cache.
+		 */
+		object = vm_object_lookup(pager);
+#ifdef DIAGNOSTIC
+		devp = (dev_pager_t)pager->pg_data;
+		if (object != devp->devp_object)
+			panic("dev_pager_alloc: bad object");
+#endif
+	}
+	return(pager);
+}
+
+/*
+ * Tear down a device pager: take it off the managed list, return all of
+ * its fake pages to the free list and free the pager and its private
+ * data.
+ */
+static void
+dev_pager_dealloc(pager)
+	vm_pager_t pager;
+{
+	dev_pager_t devp;
+	vm_object_t object;
+	vm_page_t m;
+
+#ifdef DEBUG
+	if (dpagerdebug & DDB_FOLLOW)
+		printf("dev_pager_dealloc(%x)\n", pager);
+#endif
+	TAILQ_REMOVE(&dev_pager_list, pager, pg_list);
+	/*
+	 * Get the object.
+	 * Note: cannot use vm_object_lookup since object has already
+	 * been removed from the hash chain.
+	 */
+	devp = (dev_pager_t)pager->pg_data;
+	object = devp->devp_object;
+#ifdef DEBUG
+	if (dpagerdebug & DDB_ALLOC)
+		printf("dev_pager_dealloc: devp %x object %x\n", devp, object);
+#endif
+	/*
+	 * Free up our fake pages.
+	 */
+	while ((m = devp->devp_pglist.tqh_first) != NULL) {
+		TAILQ_REMOVE(&devp->devp_pglist, m, pageq);
+		dev_pager_putfake(m);
+	}
+	free((caddr_t)devp, M_VMPGDATA);
+	free((caddr_t)pager, M_VMPAGER);
+}
+
+/*
+ * Page in a single page of a device mapping.
+ *
+ * The device's map function is asked for the page at the faulting offset;
+ * the page passed in is then freed and replaced in the object by a fake
+ * page that carries the device's physical address.  Only one page at a
+ * time is supported (anything else panics).  Returns VM_PAGER_OK.
+ */
+static int
+dev_pager_getpage(pager, mlist, npages, sync)
+	vm_pager_t pager;
+	vm_page_t *mlist;
+	int npages;
+	boolean_t sync;
+{
+	register vm_object_t object;
+	vm_offset_t offset, paddr;
+	vm_page_t page;
+	dev_t dev;
+	int (*mapfunc)(), prot;
+	int cookie;
+	vm_page_t m;
+
+#ifdef DEBUG
+	if (dpagerdebug & DDB_FOLLOW)
+		printf("dev_pager_getpage(%x, %x, %x, %x)\n",
+		       pager, mlist, npages, sync);
+#endif
+
+	if (npages != 1)
+		panic("dev_pager_getpage: cannot handle multiple pages");
+	m = *mlist;
+
+	object = m->object;
+	dev = (dev_t)pager->pg_handle;
+	offset = m->offset + object->paging_offset;
+	prot = PROT_READ;	/* XXX should pass in? */
+	mapfunc = cdevsw[major(dev)].d_mmap;
+#ifdef DIAGNOSTIC
+	if (mapfunc == NULL || mapfunc == enodev || mapfunc == nullop)
+		panic("dev_pager_getpage: no map function");
+#endif
+	/*
+	 * The map function signals failure by returning -1 (that is what
+	 * dev_pager_alloc tests for), so look at its raw result before
+	 * pmap_phys_address() turns it into a physical address.
+	 */
+	cookie = (*mapfunc)(dev, (int)offset, prot);
+#ifdef DIAGNOSTIC
+	if (cookie == -1)
+		panic("dev_pager_getpage: map function returns error");
+#endif
+	paddr = pmap_phys_address(cookie);
+	/*
+	 * Replace the passed in page with our own fake page and free
+	 * up the original.
+	 */
+	page = dev_pager_getfake(paddr);
+	TAILQ_INSERT_TAIL(&((dev_pager_t)pager->pg_data)->devp_pglist, page,
+	    pageq);
+	vm_object_lock(object);
+	vm_page_lock_queues();
+	vm_page_free(m);
+	vm_page_insert(page, object, offset);
+	vm_page_unlock_queues();
+	/* NOTE(review): m is touched here after vm_page_free(m) - confirm. */
+	PAGE_WAKEUP(m);
+	if (offset + PAGE_SIZE > object->size)
+		object->size = offset + PAGE_SIZE;	/* XXX anal */
+	vm_object_unlock(object);
+
+	return(VM_PAGER_OK);
+}
+
+/*
+ * Pageout is not supported for devices.  A call with a null pager is
+ * accepted and does nothing (reported as VM_PAGER_OK); any other call
+ * panics.
+ */
+static int
+dev_pager_putpage(pager, mlist, npages, sync)
+	vm_pager_t pager;
+	vm_page_t *mlist;
+	int npages;
+	boolean_t sync;
+{
+#ifdef DEBUG
+	if (dpagerdebug & DDB_FOLLOW)
+		printf("dev_pager_putpage(%x, %x, %x, %x)\n",
+		       pager, mlist, npages, sync);
+#endif
+	if (pager == NULL)
+		return(VM_PAGER_OK);
+	panic("dev_pager_putpage called");
+}
+
+/*
+ * Every offset is reported as present.
+ */
+static boolean_t
+dev_pager_haspage(pager, offset)
+	vm_pager_t pager;
+	vm_offset_t offset;
+{
+#ifdef DEBUG
+	if (dpagerdebug & DDB_FOLLOW)
+		printf("dev_pager_haspage(%x, %x)\n", pager, offset);
+#endif
+	return(TRUE);
+}
+
+/*
+ * Take a fake page from the free list, refilling the list with a
+ * PAGE_SIZE allocation carved into vm_page structures when it is empty,
+ * and set it up as a busy, clean, fictitious page for physical address
+ * paddr with a wire count of 1.
+ */
+static vm_page_t
+dev_pager_getfake(paddr)
+	vm_offset_t paddr;
+{
+	vm_page_t m;
+	int i;
+
+	if (dev_pager_fakelist.tqh_first == NULL) {
+		m = (vm_page_t)malloc(PAGE_SIZE, M_VMPGDATA, M_WAITOK);
+		for (i = PAGE_SIZE / sizeof(*m); i > 0; i--) {
+			TAILQ_INSERT_TAIL(&dev_pager_fakelist, m, pageq);
+			m++;
+		}
+	}
+	m = dev_pager_fakelist.tqh_first;
+	TAILQ_REMOVE(&dev_pager_fakelist, m, pageq);
+	m->flags = PG_BUSY | PG_CLEAN | PG_FAKE | PG_FICTITIOUS;
+	m->phys_addr = paddr;
+	m->wire_count = 1;
+	return(m);
+}
+
+/*
+ * Return a fake page to the free list.  Under DIAGNOSTIC, a page that
+ * is not marked PG_FICTITIOUS causes a panic.
+ */
+static void
+dev_pager_putfake(m)
+	vm_page_t m;
+{
+#ifdef DIAGNOSTIC
+	if (!(m->flags & PG_FICTITIOUS))
+		panic("dev_pager_putfake: bad page");
+#endif
+	TAILQ_INSERT_TAIL(&dev_pager_fakelist, m, pageq);
+}
diff --git a/sys/vm/device_pager.h b/sys/vm/device_pager.h
new file mode 100644
index 00000000000..8840622919d
--- /dev/null
+++ b/sys/vm/device_pager.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 1990 University of Utah.
+ * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ *	@(#)device_pager.h	8.3 (Berkeley) 12/13/93
+ */
+
+#ifndef	_DEVICE_PAGER_
+#define	_DEVICE_PAGER_	1
+
+/*
+ * Device pager private data.
+ *
+ * One of these hangs off pg_data of every device pager (it is allocated
+ * in dev_pager_alloc).  devp_pglist collects the fake pages handed out by
+ * dev_pager_getpage so that dev_pager_dealloc can give them back.
+ */
+struct devpager {
+	struct pglist	devp_pglist;	/* list of pages allocated */
+	vm_object_t	devp_object;	/* object representing this device */
+};
+typedef struct devpager	*dev_pager_t;
+
+#endif	/* _DEVICE_PAGER_ */
diff --git a/sys/vm/kern_lock.c b/sys/vm/kern_lock.c
new file mode 100644
index 00000000000..c4fa05230a7
--- /dev/null
+++ b/sys/vm/kern_lock.c
@@ -0,0 +1,534 @@
+/*
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * The Mach Operating System project at Carnegie-Mellon University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)kern_lock.c	8.1 (Berkeley) 6/11/93
+ *
+ *
+ * Copyright (c) 1987, 1990 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Avadis Tevanian, Jr., Michael Wayne Young
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
+ *  School of Computer Science
+ *  Carnegie Mellon University
+ *  Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ *	Locking primitives implementation
+ */
+
+#include
+#include
+
+#include
+
+/*
+ * XXX
+ * Stand-in for a thread identity: a "thread" is named by the address of
+ * the current process's p_thread field.
+ */
+#include
+typedef int *thread_t;
+#define	current_thread()	((thread_t)&curproc->p_thread)
+/* XXX */
+
+#if	NCPUS > 1
+
+/*
+ *	Module:		lock
+ *	Function:
+ *		Provide reader/writer synchronization.
+ *	Implementation:
+ *		Simple interlock on a bit.  Readers first interlock,
+ *		increment the reader count, then let go.  Writers hold
+ *		the interlock (thus preventing further readers), and
+ *		wait for already-accepted readers to go away.
+ */
+
+/*
+ *	The simple-lock routines are the primitives out of which
+ *	the lock package is built.  The implementation is left
+ *	to the machine-dependent code.
+ */
+
+#ifdef	notdef
+/*
+ *	A sample implementation of simple locks.
+ *	assumes:
+ *		boolean_t test_and_set(boolean_t *)
+ *			indivisibly sets the boolean to TRUE
+ *			and returns its old value
+ *		and that setting a boolean to FALSE is indivisible.
+ *
+ *	In this sample the lock is a single boolean: simple_lock()
+ *	spins until test_and_set() reports that the old value was
+ *	FALSE, simple_unlock() stores FALSE, and simple_lock_try()
+ *	makes one attempt and returns TRUE if it got the lock.
+ */
+/*
+ *	simple_lock_init initializes a simple lock.  A simple lock
+ *	may only be used for exclusive locks.
+ */
+
+void simple_lock_init(l)
+	simple_lock_t	l;
+{
+	*(boolean_t *)l = FALSE;
+}
+
+void simple_lock(l)
+	simple_lock_t	l;
+{
+	while (test_and_set((boolean_t *)l))
+		continue;
+}
+
+void simple_unlock(l)
+	simple_lock_t	l;
+{
+	*(boolean_t *)l = FALSE;
+}
+
+boolean_t simple_lock_try(l)
+	simple_lock_t	l;
+{
+    	return (!test_and_set((boolean_t *)l));
+}
+#endif /* notdef */
+#endif /* NCPUS > 1 */
+
+#if NCPUS > 1
+int lock_wait_time = 100;
+#else /* NCPUS > 1 */
+
+	/*
+	 * 	It is silly to spin on a uni-processor as if we
+	 *	thought something magical would happen to the
+	 *	want_write bit while we are executing.
+	 *
+	 *	lock_wait_time is the spin budget used by the lock
+	 *	routines before they consider sleeping; zero disables
+	 *	spinning altogether.
+	 */
+int lock_wait_time = 0;
+#endif /* NCPUS > 1 */
+
+
+/*
+ *	Routine:	lock_init
+ *	Function:
+ *		Initialize a lock; required before use.
+ *		Note that clients declare the "struct lock"
+ *		variables and then initialize them, rather
+ *		than getting a new one from this module.
+ *
+ *		The whole structure is zeroed first; the lock starts
+ *		out with no writer, no pending upgrade, no readers and
+ *		no recursive owner (thread is set to -1).  can_sleep
+ *		records whether waiters may sleep; lock_sleepable()
+ *		changes that setting later, under the interlock.
+ */
+void lock_init(l, can_sleep)
+	lock_t		l;
+	boolean_t	can_sleep;
+{
+	bzero(l, sizeof(lock_data_t));
+	simple_lock_init(&l->interlock);
+	l->want_write = FALSE;
+	l->want_upgrade = FALSE;
+	l->read_count = 0;
+	l->can_sleep = can_sleep;
+	l->thread = (char *)-1;		/* XXX */
+	l->recursion_depth = 0;
+}
+
+void lock_sleepable(l, can_sleep)
+	lock_t		l;
+	boolean_t	can_sleep;
+{
+	simple_lock(&l->interlock);
+	l->can_sleep = can_sleep;
+	simple_unlock(&l->interlock);
+}
+
+
+/*
+ *	Sleep locks.  These use the same data structure and algorithm
+ *	as the spin locks, but the process sleeps while it is waiting
+ *	for the lock.  These work on uniprocessor systems.
+ *
+ *	lock_write() acquires the lock exclusively: it first claims
+ *	want_write, then waits for read_count to drop to zero and for
+ *	any pending upgrade to clear.  lock_read() waits until neither
+ *	want_write nor want_upgrade is set and then bumps read_count.
+ *	lock_done() releases whichever kind of hold is outstanding (a
+ *	read hold, else one level of recursion, else an upgrade, else
+ *	the write hold, checked in that order) and wakes any waiters.
+ *
+ *	Each wait first spins with the interlock dropped, bounded by
+ *	lock_wait_time, and then sleeps if the lock allows it
+ *	(can_sleep) and the condition still holds.  When the calling
+ *	thread is the recorded recursive owner, lock_write() only
+ *	increments recursion_depth and lock_read() only increments
+ *	read_count.
+ */
+
+void lock_write(l)
+	register lock_t	l;
+{
+	register int	i;
+
+	simple_lock(&l->interlock);
+
+	if (((thread_t)l->thread) == current_thread()) {
+		/*
+		 *	Recursive lock.
+		 */
+		l->recursion_depth++;
+		simple_unlock(&l->interlock);
+		return;
+	}
+
+	/*
+	 *	Try to acquire the want_write bit.
+	 */
+	while (l->want_write) {
+		if ((i = lock_wait_time) > 0) {
+			simple_unlock(&l->interlock);
+			while (--i > 0 && l->want_write)
+				continue;
+			simple_lock(&l->interlock);
+		}
+
+		if (l->can_sleep && l->want_write) {
+			l->waiting = TRUE;
+			thread_sleep((int) l, &l->interlock, FALSE);
+			simple_lock(&l->interlock);
+		}
+	}
+	l->want_write = TRUE;
+
+	/* Wait for readers (and upgrades) to finish */
+
+	while ((l->read_count != 0) || l->want_upgrade) {
+		if ((i = lock_wait_time) > 0) {
+			simple_unlock(&l->interlock);
+			while (--i > 0 && (l->read_count != 0 ||
+					l->want_upgrade))
+				continue;
+			simple_lock(&l->interlock);
+		}
+
+		if (l->can_sleep && (l->read_count != 0 || l->want_upgrade)) {
+			l->waiting = TRUE;
+			thread_sleep((int) l, &l->interlock, FALSE);
+			simple_lock(&l->interlock);
+		}
+	}
+	simple_unlock(&l->interlock);
+}
+
+void lock_done(l)
+	register lock_t	l;
+{
+	simple_lock(&l->interlock);
+
+	if (l->read_count != 0)
+		l->read_count--;
+	else
+	if (l->recursion_depth != 0)
+		l->recursion_depth--;
+	else
+	if (l->want_upgrade)
+	 	l->want_upgrade = FALSE;
+	else
+	 	l->want_write = FALSE;
+
+	if (l->waiting) {
+		l->waiting = FALSE;
+		thread_wakeup((int) l);
+	}
+	simple_unlock(&l->interlock);
+}
+
+void lock_read(l)
+	register lock_t	l;
+{
+	register int	i;
+
+	simple_lock(&l->interlock);
+
+	if (((thread_t)l->thread) == current_thread()) {
+		/*
+		 *	Recursive lock.
+		 */
+		l->read_count++;
+		simple_unlock(&l->interlock);
+		return;
+	}
+
+	while (l->want_write || l->want_upgrade) {
+		if ((i = lock_wait_time) > 0) {
+			simple_unlock(&l->interlock);
+			while (--i > 0 && (l->want_write || l->want_upgrade))
+				continue;
+			simple_lock(&l->interlock);
+		}
+
+		if (l->can_sleep && (l->want_write || l->want_upgrade)) {
+			l->waiting = TRUE;
+			thread_sleep((int) l, &l->interlock, FALSE);
+			simple_lock(&l->interlock);
+		}
+	}
+
+	l->read_count++;
+	simple_unlock(&l->interlock);
+}
+
+/*
+ *	Routine:	lock_read_to_write
+ *	Function:
+ *		Improves a read-only lock to one with
+ *		write permission.  If another reader has
+ *		already requested an upgrade to a write lock,
+ *		no lock is held upon return.
+ *
+ *		Returns TRUE if the upgrade *failed*.
+ *
+ *		The caller's read hold is given up first in every
+ *		case.  A successful upgrade claims want_upgrade and
+ *		then waits (spinning, then sleeping if allowed) for
+ *		the remaining readers to go away.
+ *
+ *		lock_write_to_read(), which follows, goes the other
+ *		way: it takes a read hold and gives up one level of
+ *		recursion, else a pending upgrade, else the write
+ *		hold, then wakes any waiters.
+ */
+boolean_t lock_read_to_write(l)
+	register lock_t	l;
+{
+	register int	i;
+
+	simple_lock(&l->interlock);
+
+	l->read_count--;
+
+	if (((thread_t)l->thread) == current_thread()) {
+		/*
+		 *	Recursive lock.
+		 */
+		l->recursion_depth++;
+		simple_unlock(&l->interlock);
+		return(FALSE);
+	}
+
+	if (l->want_upgrade) {
+		/*
+		 *	Someone else has requested upgrade.
+		 *	Since we've released a read lock, wake
+		 *	him up.
+		 */
+		if (l->waiting) {
+			l->waiting = FALSE;
+			thread_wakeup((int) l);
+		}
+
+		simple_unlock(&l->interlock);
+		return (TRUE);
+	}
+
+	l->want_upgrade = TRUE;
+
+	while (l->read_count != 0) {
+		if ((i = lock_wait_time) > 0) {
+			simple_unlock(&l->interlock);
+			while (--i > 0 && l->read_count != 0)
+				continue;
+			simple_lock(&l->interlock);
+		}
+
+		if (l->can_sleep && l->read_count != 0) {
+			l->waiting = TRUE;
+			thread_sleep((int) l, &l->interlock, FALSE);
+			simple_lock(&l->interlock);
+		}
+	}
+
+	simple_unlock(&l->interlock);
+	return (FALSE);
+}
+
+void lock_write_to_read(l)
+	register lock_t	l;
+{
+	simple_lock(&l->interlock);
+
+	l->read_count++;
+	if (l->recursion_depth != 0)
+		l->recursion_depth--;
+	else
+	if (l->want_upgrade)
+		l->want_upgrade = FALSE;
+	else
+	 	l->want_write = FALSE;
+
+	if (l->waiting) {
+		l->waiting = FALSE;
+		thread_wakeup((int) l);
+	}
+
+	simple_unlock(&l->interlock);
+}
+
+
+/*
+ *	Routine:	lock_try_write
+ *	Function:
+ *		Tries to get a write lock.
+ *
+ *		Returns FALSE if the lock is not held on return.
+ *
+ *		Never spins or sleeps: the attempt fails at once if a
+ *		writer, an upgrade or any reader is present.  The
+ *		recursive owner always succeeds (recursion_depth is
+ *		incremented).
+ */
+
+boolean_t lock_try_write(l)
+	register lock_t	l;
+{
+
+	simple_lock(&l->interlock);
+
+	if (((thread_t)l->thread) == current_thread()) {
+		/*
+		 *	Recursive lock
+		 */
+		l->recursion_depth++;
+		simple_unlock(&l->interlock);
+		return(TRUE);
+	}
+
+	if (l->want_write || l->want_upgrade || l->read_count) {
+		/*
+		 *	Can't get lock.
+		 */
+		simple_unlock(&l->interlock);
+		return(FALSE);
+	}
+
+	/*
+	 *	Have lock.
+	 */
+
+	l->want_write = TRUE;
+	simple_unlock(&l->interlock);
+	return(TRUE);
+}
+
+/*
+ *	Routine:	lock_try_read
+ *	Function:
+ *		Tries to get a read lock.
+ *
+ *		Returns FALSE if the lock is not held on return.
+ */
+
+boolean_t lock_try_read(l)
+	register lock_t	l;
+{
+	simple_lock(&l->interlock);
+
+	if (((thread_t)l->thread) == current_thread()) {
+		/*
+		 *	Recursive lock
+		 */
+		l->read_count++;
+		simple_unlock(&l->interlock);
+		return(TRUE);
+	}
+
+	if (l->want_write || l->want_upgrade) {	/* a writer or upgrade is pending: fail without waiting */
+		simple_unlock(&l->interlock);
+		return(FALSE);
+	}
+
+	l->read_count++;
+	simple_unlock(&l->interlock);
+	return(TRUE);
+}
+
+/*
+ *	Routine:	lock_try_read_to_write
+ *	Function:
+ *		Improves a read-only lock to one with
+ *		write permission.  If another reader has
+ *		already requested an upgrade to a write lock,
+ *		the read lock is still held upon return.
+ *
+ *		Returns FALSE if the upgrade *failed*.
+ */
+boolean_t lock_try_read_to_write(l)
+	register lock_t	l;
+{
+
+	simple_lock(&l->interlock);
+
+	if (((thread_t)l->thread) == current_thread()) {
+		/*
+		 *	Recursive lock
+		 */
+		l->read_count--;
+		l->recursion_depth++;
+		simple_unlock(&l->interlock);
+		return(TRUE);
+	}
+
+	if (l->want_upgrade) {		/* read_count is untouched on this path, so the read hold survives */
+		simple_unlock(&l->interlock);
+		return(FALSE);
+	}
+	l->want_upgrade = TRUE;
+	l->read_count--;
+
+	while (l->read_count != 0) {	/* NOTE(review): sleeps without checking can_sleep or lock_wait_time, unlike lock_read_to_write */
+		l->waiting = TRUE;
+		thread_sleep((int) l, &l->interlock, FALSE);
+		simple_lock(&l->interlock);
+	}
+
+	simple_unlock(&l->interlock);
+	return(TRUE);
+}
+
+/*
+ *	Allow a process that has a lock for write to acquire it
+ *	recursively (for read, write, or update).
+ */
+void lock_set_recursive(l)
+	lock_t		l;
+{
+	simple_lock(&l->interlock);
+	if (!l->want_write) {
+		panic("lock_set_recursive: don't have write lock");
+	}
+	l->thread = (char *) current_thread();	/* the lock routines compare this with current_thread() */
+	simple_unlock(&l->interlock);
+}
+
+/*
+ *	Prevent a lock from being re-acquired.
+ */
+void lock_clear_recursive(l)
+	lock_t		l;
+{
+	simple_lock(&l->interlock);
+	if (((thread_t) l->thread) != current_thread()) {	/* only the recorded owner may clear */
+		panic("lock_clear_recursive: wrong thread");
+	}
+	if (l->recursion_depth == 0)	/* the owner is kept while recursive holds remain */
+		l->thread = (char *)-1;		/* XXX -- the value lock_init also stores */
+	simple_unlock(&l->interlock);
+}
diff --git a/sys/vm/lock.h b/sys/vm/lock.h
new file mode 100644
index 00000000000..26bed1f048a
--- /dev/null
+++ b/sys/vm/lock.h
@@ -0,0 +1,172 @@
+/* 
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * The Mach Operating System project at Carnegie-Mellon University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)lock.h	8.1 (Berkeley) 6/11/93
+ *
+ *
+ * Copyright (c) 1987, 1990 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Avadis Tevanian, Jr., Michael Wayne Young
+ * 
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ * 
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 
+ * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ * 
+ * Carnegie Mellon requests users of this software to return to
+ *
+ *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
+ *  School of Computer Science
+ *  Carnegie Mellon University
+ *  Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ *	Locking primitives definitions
+ */
+
+#ifndef	_LOCK_H_
+#define	_LOCK_H_
+
+#define	NCPUS	1		/* XXX -- with 1, the simple_lock operations below are empty macros */
+
+/*
+ *	A simple spin lock.
+ */
+
+struct slock {
+	int		lock_data;	/* in general 1 bit is sufficient */
+};
+
+typedef struct slock	simple_lock_data_t;	/* the spin lock object itself */
+typedef struct slock	*simple_lock_t;		/* pointer form taken by the simple_lock operations */
+
+/*
+ *	The general lock structure.
Provides for multiple readers,
+ *	upgrading from read to write, and sleeping until the lock
+ *	can be gained.
+ */
+
+struct lock {
+#ifdef	vax
+	/*
+	 *	Efficient VAX implementation -- see field description below.
+	 */
+	unsigned int	read_count:16,
+			want_upgrade:1,
+			want_write:1,
+			waiting:1,
+			can_sleep:1,
+			:0;
+
+	simple_lock_data_t	interlock;
+#else	/* vax */
+#ifdef	ns32000
+	/*
+	 *	Efficient ns32000 implementation --
+	 *	see field description below.
+	 */
+	simple_lock_data_t	interlock;
+	unsigned int	read_count:16,
+			want_upgrade:1,
+			want_write:1,
+			waiting:1,
+			can_sleep:1,
+			:0;
+
+#else	/* ns32000 */
+	/*	Only the "interlock" field is used for hardware exclusion;
+	 *	other fields are modified with normal instructions after
+	 *	acquiring the interlock bit.
+	 */
+	simple_lock_data_t
+			interlock;	/* Interlock for remaining fields */
+	boolean_t	want_write;	/* Writer is waiting, or locked for write */
+	boolean_t	want_upgrade;	/* Read-to-write upgrade waiting */
+	boolean_t	waiting;	/* Someone is sleeping on lock */
+	boolean_t	can_sleep;	/* Can attempts to lock go to sleep */
+	int		read_count;	/* Number of accepted readers */
+#endif	/* ns32000 */
+#endif	/* vax */
+	char		*thread;	/* Thread that has lock, if recursive locking allowed */
+				/* (should be thread_t, but we then have mutually
+				   recursive definitions) */
+	int		recursion_depth;/* Depth of recursion */
+};
+
+typedef struct lock	lock_data_t;	/* the lock object itself */
+typedef struct lock	*lock_t;	/* pointer form taken by the lock_* routines */
+
+#if NCPUS > 1
+__BEGIN_DECLS
+void		simple_lock __P((simple_lock_t));
+void		simple_lock_init __P((simple_lock_t));
+boolean_t	simple_lock_try __P((simple_lock_t));
+void		simple_unlock __P((simple_lock_t));
+__END_DECLS
+#else		/* No multiprocessor locking is necessary.  */
+#define	simple_lock(l)			/* expands to nothing */
+#define	simple_lock_init(l)		/* expands to nothing */
+#define	simple_lock_try(l)	(1)	/* Always succeeds.  */
+#define	simple_unlock(l)		/* expands to nothing */
+#endif
+
+/* Sleep locks must work even if no multiprocessing.
*/
+
+#define	lock_read_done(l)	lock_done(l)	/* both release names map to the same routine */
+#define	lock_write_done(l)	lock_done(l)
+
+void		lock_clear_recursive __P((lock_t));
+void		lock_done __P((lock_t));
+void		lock_init __P((lock_t, boolean_t));	/* second argument is can_sleep */
+void		lock_read __P((lock_t));
+boolean_t	lock_read_to_write __P((lock_t));	/* TRUE means the upgrade failed */
+void		lock_set_recursive __P((lock_t));
+void		lock_sleepable __P((lock_t, boolean_t));
+boolean_t	lock_try_read __P((lock_t));
+boolean_t	lock_try_read_to_write __P((lock_t));	/* FALSE means the upgrade failed */
+boolean_t	lock_try_write __P((lock_t));
+void		lock_write __P((lock_t));
+void		lock_write_to_read __P((lock_t));
+#endif /* !_LOCK_H_ */
diff --git a/sys/vm/pmap.h b/sys/vm/pmap.h
new file mode 100644
index 00000000000..63a83c90578
--- /dev/null
+++ b/sys/vm/pmap.h
@@ -0,0 +1,122 @@
+/* 
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * The Mach Operating System project at Carnegie-Mellon University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)pmap.h 8.1 (Berkeley) 6/11/93 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Avadis Tevanian, Jr. + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Machine address mapping definitions -- machine-independent + * section. 
[For machine-dependent section, see "machine/pmap.h".]
+ */
+
+#ifndef	_PMAP_VM_
+#define	_PMAP_VM_
+
+/*
+ * Each machine dependent implementation is expected to
+ * keep certain statistics.  They may do this any way they
+ * so choose, but are expected to return the statistics
+ * in the following structure.
+ */
+struct pmap_statistics {
+	long		resident_count;	/* # of pages mapped (total)*/
+	long		wired_count;	/* # of pages wired */
+};
+typedef struct pmap_statistics	*pmap_statistics_t;
+
+#include <machine/pmap.h>	/* machine-dependent section; presumably declares pmap_t used below */
+
+#ifdef KERNEL
+__BEGIN_DECLS
+void		*pmap_bootstrap_alloc __P((int));
+void		 pmap_bootstrap( /* machine dependent */ );
+void		 pmap_change_wiring __P((pmap_t, vm_offset_t, boolean_t));
+void		 pmap_clear_modify __P((vm_offset_t pa));
+void		 pmap_clear_reference __P((vm_offset_t pa));
+void		 pmap_collect __P((pmap_t));
+void		 pmap_copy __P((pmap_t,
+		    pmap_t, vm_offset_t, vm_size_t, vm_offset_t));
+void		 pmap_copy_page __P((vm_offset_t, vm_offset_t));
+pmap_t		 pmap_create __P((vm_size_t));
+void		 pmap_destroy __P((pmap_t));
+void		 pmap_enter __P((pmap_t,
+		    vm_offset_t, vm_offset_t, vm_prot_t, boolean_t));
+vm_offset_t	 pmap_extract __P((pmap_t, vm_offset_t));
+void		 pmap_init __P((vm_offset_t, vm_offset_t));
+boolean_t	 pmap_is_modified __P((vm_offset_t pa));
+boolean_t	 pmap_is_referenced __P((vm_offset_t pa));
+vm_offset_t	 pmap_map __P((vm_offset_t, vm_offset_t, vm_offset_t, int));
+void		 pmap_page_protect __P((vm_offset_t, vm_prot_t));
+void		 pmap_pageable __P((pmap_t,
+		    vm_offset_t, vm_offset_t, boolean_t));
+vm_offset_t	 pmap_phys_address __P((int));
+void		 pmap_pinit __P((pmap_t));
+void		 pmap_protect __P((pmap_t,
+		    vm_offset_t, vm_offset_t, vm_prot_t));
+void		 pmap_reference __P((pmap_t));
+void		 pmap_release __P((pmap_t));
+void		 pmap_remove __P((pmap_t, vm_offset_t, vm_offset_t));
+void		 pmap_update __P((void));
+void		 pmap_zero_page __P((vm_offset_t));
+__END_DECLS
+#endif
+
+#endif /* _PMAP_VM_ */
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
new file mode 100644
index 
00000000000..899a6cf41a0 --- /dev/null +++ b/sys/vm/swap_pager.c @@ -0,0 +1,1009 @@ +/* + * Copyright (c) 1990 University of Utah. + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ + * + * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 + */ + +/* + * Quick hack to page to dedicated partition(s). + * TODO: + * Add multiprocessor locks + * Deal with async writes in a better fashion + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#define NSWSIZES 16 /* size of swtab */ +#define MAXDADDRS 64 /* max # of disk addrs for fixed allocations */ +#ifndef NPENDINGIO +#define NPENDINGIO 64 /* max # of pending cleans */ +#endif + +#ifdef DEBUG +int swpagerdebug = 0x100; +#define SDB_FOLLOW 0x001 +#define SDB_INIT 0x002 +#define SDB_ALLOC 0x004 +#define SDB_IO 0x008 +#define SDB_WRITE 0x010 +#define SDB_FAIL 0x020 +#define SDB_ALLOCBLK 0x040 +#define SDB_FULL 0x080 +#define SDB_ANOM 0x100 +#define SDB_ANOMPANIC 0x200 +#define SDB_CLUSTER 0x400 +#define SDB_PARANOIA 0x800 +#endif + +TAILQ_HEAD(swpclean, swpagerclean); + +struct swpagerclean { + TAILQ_ENTRY(swpagerclean) spc_list; + int spc_flags; + struct buf *spc_bp; + sw_pager_t spc_swp; + vm_offset_t spc_kva; + vm_page_t spc_m; + int spc_npages; +} swcleanlist[NPENDINGIO]; +typedef struct swpagerclean *swp_clean_t; + +/* spc_flags values */ +#define SPC_FREE 0x00 +#define SPC_BUSY 0x01 +#define SPC_DONE 0x02 +#define SPC_ERROR 0x04 + +struct swtab { + vm_size_t st_osize; /* size of object (bytes) */ + int st_bsize; /* vs. 
size of swap block (DEV_BSIZE units) */ +#ifdef DEBUG + u_long st_inuse; /* number in this range in use */ + u_long st_usecnt; /* total used of this size */ +#endif +} swtab[NSWSIZES+1]; + +#ifdef DEBUG +int swap_pager_poip; /* pageouts in progress */ +int swap_pager_piip; /* pageins in progress */ +#endif + +int swap_pager_maxcluster; /* maximum cluster size */ +int swap_pager_npendingio; /* number of pager clean structs */ + +struct swpclean swap_pager_inuse; /* list of pending page cleans */ +struct swpclean swap_pager_free; /* list of free pager clean structs */ +struct pagerlst swap_pager_list; /* list of "named" anon regions */ + +static void swap_pager_init __P((void)); +static vm_pager_t swap_pager_alloc + __P((caddr_t, vm_size_t, vm_prot_t, vm_offset_t)); +static void swap_pager_clean __P((int)); +#ifdef DEBUG +static void swap_pager_clean_check __P((vm_page_t *, int, int)); +#endif +static void swap_pager_cluster + __P((vm_pager_t, vm_offset_t, + vm_offset_t *, vm_offset_t *)); +static void swap_pager_dealloc __P((vm_pager_t)); +static int swap_pager_getpage + __P((vm_pager_t, vm_page_t *, int, boolean_t)); +static boolean_t swap_pager_haspage __P((vm_pager_t, vm_offset_t)); +static int swap_pager_io __P((sw_pager_t, vm_page_t *, int, int)); +static void swap_pager_iodone __P((struct buf *)); +static int swap_pager_putpage + __P((vm_pager_t, vm_page_t *, int, boolean_t)); + +struct pagerops swappagerops = { + swap_pager_init, + swap_pager_alloc, + swap_pager_dealloc, + swap_pager_getpage, + swap_pager_putpage, + swap_pager_haspage, + swap_pager_cluster +}; + +static void +swap_pager_init() +{ + register swp_clean_t spc; + register int i, bsize; + extern int dmmin, dmmax; + int maxbsize; + +#ifdef DEBUG + if (swpagerdebug & (SDB_FOLLOW|SDB_INIT)) + printf("swpg_init()\n"); +#endif + dfltpagerops = &swappagerops; + TAILQ_INIT(&swap_pager_list); + + /* + * Allocate async IO structures. 
+ * + * XXX it would be nice if we could do this dynamically based on + * the value of nswbuf (since we are ultimately limited by that) + * but neither nswbuf or malloc has been initialized yet. So the + * structs are statically allocated above. + */ + swap_pager_npendingio = NPENDINGIO; + + /* + * Initialize clean lists + */ + TAILQ_INIT(&swap_pager_inuse); + TAILQ_INIT(&swap_pager_free); + for (i = 0, spc = swcleanlist; i < swap_pager_npendingio; i++, spc++) { + TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); + spc->spc_flags = SPC_FREE; + } + + /* + * Calculate the swap allocation constants. + */ + if (dmmin == 0) { + dmmin = DMMIN; + if (dmmin < CLBYTES/DEV_BSIZE) + dmmin = CLBYTES/DEV_BSIZE; + } + if (dmmax == 0) + dmmax = DMMAX; + + /* + * Fill in our table of object size vs. allocation size + */ + bsize = btodb(PAGE_SIZE); + if (bsize < dmmin) + bsize = dmmin; + maxbsize = btodb(sizeof(sw_bm_t) * NBBY * PAGE_SIZE); + if (maxbsize > dmmax) + maxbsize = dmmax; + for (i = 0; i < NSWSIZES; i++) { + swtab[i].st_osize = (vm_size_t) (MAXDADDRS * dbtob(bsize)); + swtab[i].st_bsize = bsize; + if (bsize <= btodb(MAXPHYS)) + swap_pager_maxcluster = dbtob(bsize); +#ifdef DEBUG + if (swpagerdebug & SDB_INIT) + printf("swpg_init: ix %d, size %x, bsize %x\n", + i, swtab[i].st_osize, swtab[i].st_bsize); +#endif + if (bsize >= maxbsize) + break; + bsize *= 2; + } + swtab[i].st_osize = 0; + swtab[i].st_bsize = bsize; +} + +/* + * Allocate a pager structure and associated resources. + * Note that if we are called from the pageout daemon (handle == NULL) + * we should not wait for memory as it could resulting in deadlock. 
+ */ +static vm_pager_t +swap_pager_alloc(handle, size, prot, foff) + caddr_t handle; + register vm_size_t size; + vm_prot_t prot; + vm_offset_t foff; +{ + register vm_pager_t pager; + register sw_pager_t swp; + struct swtab *swt; + int waitok; + +#ifdef DEBUG + if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOC)) + printf("swpg_alloc(%x, %x, %x)\n", handle, size, prot); +#endif + /* + * If this is a "named" anonymous region, look it up and + * return the appropriate pager if it exists. + */ + if (handle) { + pager = vm_pager_lookup(&swap_pager_list, handle); + if (pager != NULL) { + /* + * Use vm_object_lookup to gain a reference + * to the object and also to remove from the + * object cache. + */ + if (vm_object_lookup(pager) == NULL) + panic("swap_pager_alloc: bad object"); + return(pager); + } + } + /* + * Pager doesn't exist, allocate swap management resources + * and initialize. + */ + waitok = handle ? M_WAITOK : M_NOWAIT; + pager = (vm_pager_t)malloc(sizeof *pager, M_VMPAGER, waitok); + if (pager == NULL) + return(NULL); + swp = (sw_pager_t)malloc(sizeof *swp, M_VMPGDATA, waitok); + if (swp == NULL) { +#ifdef DEBUG + if (swpagerdebug & SDB_FAIL) + printf("swpg_alloc: swpager malloc failed\n"); +#endif + free((caddr_t)pager, M_VMPAGER); + return(NULL); + } + size = round_page(size); + for (swt = swtab; swt->st_osize; swt++) + if (size <= swt->st_osize) + break; +#ifdef DEBUG + swt->st_inuse++; + swt->st_usecnt++; +#endif + swp->sw_osize = size; + swp->sw_bsize = swt->st_bsize; + swp->sw_nblocks = (btodb(size) + swp->sw_bsize - 1) / swp->sw_bsize; + swp->sw_blocks = (sw_blk_t) + malloc(swp->sw_nblocks*sizeof(*swp->sw_blocks), + M_VMPGDATA, M_NOWAIT); + if (swp->sw_blocks == NULL) { + free((caddr_t)swp, M_VMPGDATA); + free((caddr_t)pager, M_VMPAGER); +#ifdef DEBUG + if (swpagerdebug & SDB_FAIL) + printf("swpg_alloc: sw_blocks malloc failed\n"); + swt->st_inuse--; + swt->st_usecnt--; +#endif + return(FALSE); + } + bzero((caddr_t)swp->sw_blocks, + swp->sw_nblocks * 
sizeof(*swp->sw_blocks)); + swp->sw_poip = 0; + if (handle) { + vm_object_t object; + + swp->sw_flags = SW_NAMED; + TAILQ_INSERT_TAIL(&swap_pager_list, pager, pg_list); + /* + * Consistant with other pagers: return with object + * referenced. Can't do this with handle == NULL + * since it might be the pageout daemon calling. + */ + object = vm_object_allocate(size); + vm_object_enter(object, pager); + vm_object_setpager(object, pager, 0, FALSE); + } else { + swp->sw_flags = 0; + pager->pg_list.tqe_next = NULL; + pager->pg_list.tqe_prev = NULL; + } + pager->pg_handle = handle; + pager->pg_ops = &swappagerops; + pager->pg_type = PG_SWAP; + pager->pg_flags = PG_CLUSTERPUT; + pager->pg_data = swp; + +#ifdef DEBUG + if (swpagerdebug & SDB_ALLOC) + printf("swpg_alloc: pg_data %x, %x of %x at %x\n", + swp, swp->sw_nblocks, swp->sw_bsize, swp->sw_blocks); +#endif + return(pager); +} + +static void +swap_pager_dealloc(pager) + vm_pager_t pager; +{ + register int i; + register sw_blk_t bp; + register sw_pager_t swp; + struct swtab *swt; + int s; + +#ifdef DEBUG + /* save panic time state */ + if ((swpagerdebug & SDB_ANOMPANIC) && panicstr) + return; + if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOC)) + printf("swpg_dealloc(%x)\n", pager); +#endif + /* + * Remove from list right away so lookups will fail if we + * block for pageout completion. + */ + swp = (sw_pager_t) pager->pg_data; + if (swp->sw_flags & SW_NAMED) { + TAILQ_REMOVE(&swap_pager_list, pager, pg_list); + swp->sw_flags &= ~SW_NAMED; + } +#ifdef DEBUG + for (swt = swtab; swt->st_osize; swt++) + if (swp->sw_osize <= swt->st_osize) + break; + swt->st_inuse--; +#endif + + /* + * Wait for all pageouts to finish and remove + * all entries from cleaning list. 
+ */ + s = splbio(); + while (swp->sw_poip) { + swp->sw_flags |= SW_WANTED; + (void) tsleep(swp, PVM, "swpgdealloc", 0); + } + splx(s); + swap_pager_clean(B_WRITE); + + /* + * Free left over swap blocks + */ + for (i = 0, bp = swp->sw_blocks; i < swp->sw_nblocks; i++, bp++) + if (bp->swb_block) { +#ifdef DEBUG + if (swpagerdebug & (SDB_ALLOCBLK|SDB_FULL)) + printf("swpg_dealloc: blk %x\n", + bp->swb_block); +#endif + rmfree(swapmap, swp->sw_bsize, bp->swb_block); + } + /* + * Free swap management resources + */ + free((caddr_t)swp->sw_blocks, M_VMPGDATA); + free((caddr_t)swp, M_VMPGDATA); + free((caddr_t)pager, M_VMPAGER); +} + +static int +swap_pager_getpage(pager, mlist, npages, sync) + vm_pager_t pager; + vm_page_t *mlist; + int npages; + boolean_t sync; +{ +#ifdef DEBUG + if (swpagerdebug & SDB_FOLLOW) + printf("swpg_getpage(%x, %x, %x, %x)\n", + pager, mlist, npages, sync); +#endif + return(swap_pager_io((sw_pager_t)pager->pg_data, + mlist, npages, B_READ)); +} + +static int +swap_pager_putpage(pager, mlist, npages, sync) + vm_pager_t pager; + vm_page_t *mlist; + int npages; + boolean_t sync; +{ + int flags; + +#ifdef DEBUG + if (swpagerdebug & SDB_FOLLOW) + printf("swpg_putpage(%x, %x, %x, %x)\n", + pager, mlist, npages, sync); +#endif + if (pager == NULL) { + swap_pager_clean(B_WRITE); + return (VM_PAGER_OK); /* ??? 
*/ + } + flags = B_WRITE; + if (!sync) + flags |= B_ASYNC; + return(swap_pager_io((sw_pager_t)pager->pg_data, + mlist, npages, flags)); +} + +static boolean_t +swap_pager_haspage(pager, offset) + vm_pager_t pager; + vm_offset_t offset; +{ + register sw_pager_t swp; + register sw_blk_t swb; + int ix; + +#ifdef DEBUG + if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOCBLK)) + printf("swpg_haspage(%x, %x) ", pager, offset); +#endif + swp = (sw_pager_t) pager->pg_data; + ix = offset / dbtob(swp->sw_bsize); + if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) { +#ifdef DEBUG + if (swpagerdebug & (SDB_FAIL|SDB_FOLLOW|SDB_ALLOCBLK)) + printf("swpg_haspage: %x bad offset %x, ix %x\n", + swp->sw_blocks, offset, ix); +#endif + return(FALSE); + } + swb = &swp->sw_blocks[ix]; + if (swb->swb_block) + ix = atop(offset % dbtob(swp->sw_bsize)); +#ifdef DEBUG + if (swpagerdebug & SDB_ALLOCBLK) + printf("%x blk %x+%x ", swp->sw_blocks, swb->swb_block, ix); + if (swpagerdebug & (SDB_FOLLOW|SDB_ALLOCBLK)) + printf("-> %c\n", + "FT"[swb->swb_block && (swb->swb_mask & (1 << ix))]); +#endif + if (swb->swb_block && (swb->swb_mask & (1 << ix))) + return(TRUE); + return(FALSE); +} + +static void +swap_pager_cluster(pager, offset, loffset, hoffset) + vm_pager_t pager; + vm_offset_t offset; + vm_offset_t *loffset; + vm_offset_t *hoffset; +{ + sw_pager_t swp; + register int bsize; + vm_offset_t loff, hoff; + +#ifdef DEBUG + if (swpagerdebug & (SDB_FOLLOW|SDB_CLUSTER)) + printf("swpg_cluster(%x, %x) ", pager, offset); +#endif + swp = (sw_pager_t) pager->pg_data; + bsize = dbtob(swp->sw_bsize); + if (bsize > swap_pager_maxcluster) + bsize = swap_pager_maxcluster; + + loff = offset - (offset % bsize); + if (loff >= swp->sw_osize) + panic("swap_pager_cluster: bad offset"); + + hoff = loff + bsize; + if (hoff > swp->sw_osize) + hoff = swp->sw_osize; + + *loffset = loff; + *hoffset = hoff; +#ifdef DEBUG + if (swpagerdebug & (SDB_FOLLOW|SDB_CLUSTER)) + printf("returns [%x-%x]\n", loff, hoff); +#endif +} + 
+/*
+ * swap_pager_io: start (and, for synchronous requests, finish) one swap
+ * transfer for the npages pages in mlist on behalf of pager swp.
+ *
+ * Scaled down version of swap().
+ * Assumes that PAGE_SIZE < MAXPHYS; i.e. only one operation needed.
+ * BOGUS: lower level IO routines expect a KVA so we have to map our
+ * provided physical page into the KVA to keep them happy.
+ *
+ * flags is a combination of the buf flags B_READ and B_ASYNC; a clear
+ * B_READ bit means a write (pageout).
+ *
+ * Returns:
+ *	VM_PAGER_FAIL	no swap block covers the page, a read found that the
+ *			page was never written, or rmalloc() had no space
+ *	VM_PAGER_BAD	(DEBUG with SDB_ANOM only) write with no swap block
+ *	VM_PAGER_AGAIN	async write with no free cleaning entry, or no KVA
+ *	VM_PAGER_PEND	async write started; swap_pager_iodone() finishes it
+ *	VM_PAGER_OK or VM_PAGER_ERROR once a synchronous transfer completes
+ */
+static int
+swap_pager_io(swp, mlist, npages, flags)
+	register sw_pager_t swp;
+	vm_page_t *mlist;
+	int npages;
+	int flags;
+{
+	register struct buf *bp;
+	register sw_blk_t swb;
+	register int s;
+	int ix, mask;
+	boolean_t rv;
+	vm_offset_t kva, off;
+	swp_clean_t spc;
+	vm_page_t m;
+
+#ifdef DEBUG
+	/* save panic time state */
+	if ((swpagerdebug & SDB_ANOMPANIC) && panicstr)
+		return (VM_PAGER_FAIL); /* XXX: correct return? */
+	if (swpagerdebug & (SDB_FOLLOW|SDB_IO))
+		printf("swpg_io(%x, %x, %x, %x)\n", swp, mlist, npages, flags);
+	if (flags & B_READ) {
+		if (flags & B_ASYNC)
+			panic("swap_pager_io: cannot do ASYNC reads");
+		if (npages != 1)
+			panic("swap_pager_io: cannot do clustered reads");
+	}
+#endif
+
+	/*
+	 * First determine if the page exists in the pager if this is
+	 * a sync read. This quickly handles cases where we are
+	 * following shadow chains looking for the top level object
+	 * with the page.
+	 */
+	m = *mlist;
+	off = m->offset + m->object->paging_offset;
+	ix = off / dbtob(swp->sw_bsize);
+	if (swp->sw_blocks == NULL || ix >= swp->sw_nblocks) {
+#ifdef DEBUG
+		if ((flags & B_READ) == 0 && (swpagerdebug & SDB_ANOM)) {
+			printf("swap_pager_io: no swap block on write\n");
+			return(VM_PAGER_BAD);
+		}
+#endif
+		return(VM_PAGER_FAIL);
+	}
+	swb = &swp->sw_blocks[ix];
+	/* from here on, off is the byte offset within the swap block */
+	off = off % dbtob(swp->sw_bsize);
+	if ((flags & B_READ) &&
+	    (swb->swb_block == 0 || (swb->swb_mask & (1 << atop(off))) == 0))
+		return(VM_PAGER_FAIL);
+
+	/*
+	 * For reads (pageins) and synchronous writes, we clean up
+	 * all completed async pageouts.
+	 */
+	if ((flags & B_ASYNC) == 0) {
+		s = splbio();
+		swap_pager_clean(flags&B_READ);
+#ifdef DEBUG
+		if (swpagerdebug & SDB_PARANOIA)
+			swap_pager_clean_check(mlist, npages, flags&B_READ);
+#endif
+		splx(s);
+	}
+	/*
+	 * For async writes (pageouts), we cleanup completed pageouts so
+	 * that all available resources are freed. Also tells us if this
+	 * page is already being cleaned. If it is, or no resources
+	 * are available, we try again later.
+	 */
+	else {
+		swap_pager_clean(B_WRITE);
+#ifdef DEBUG
+		if (swpagerdebug & SDB_PARANOIA)
+			swap_pager_clean_check(mlist, npages, B_WRITE);
+#endif
+		if (swap_pager_free.tqh_first == NULL) {
+#ifdef DEBUG
+			if (swpagerdebug & SDB_FAIL)
+				printf("%s: no available io headers\n",
+				    "swap_pager_io");
+#endif
+			return(VM_PAGER_AGAIN);
+		}
+	}
+
+	/*
+	 * Allocate a swap block if necessary.
+	 */
+	if (swb->swb_block == 0) {
+		swb->swb_block = rmalloc(swapmap, swp->sw_bsize);
+		if (swb->swb_block == 0) {
+#ifdef DEBUG
+			if (swpagerdebug & SDB_FAIL)
+				printf("swpg_io: rmalloc of %x failed\n",
+				       swp->sw_bsize);
+#endif
+			/*
+			 * XXX this is technically a resource shortage that
+			 * should return AGAIN, but the situation isn't likely
+			 * to be remedied just by delaying a little while and
+			 * trying again (the pageout daemon's current response
+			 * to AGAIN) so we just return FAIL.
+			 */
+			return(VM_PAGER_FAIL);
+		}
+#ifdef DEBUG
+		if (swpagerdebug & (SDB_FULL|SDB_ALLOCBLK))
+			printf("swpg_io: %x alloc blk %x at ix %x\n",
+			       swp->sw_blocks, swb->swb_block, ix);
+#endif
+	}
+
+	/*
+	 * Allocate a kernel virtual address and initialize so that PTE
+	 * is available for lower level IO drivers.
+	 */
+	kva = vm_pager_map_pages(mlist, npages, !(flags & B_ASYNC));
+	if (kva == NULL) {
+#ifdef DEBUG
+		if (swpagerdebug & SDB_FAIL)
+			printf("%s: no KVA space to map pages\n",
+			       "swap_pager_io");
+#endif
+		return(VM_PAGER_AGAIN);
+	}
+
+	/*
+	 * Get a swap buffer header and initialize it.
+	 */
+	s = splbio();
+	while (bswlist.b_actf == NULL) {
+#ifdef DEBUG
+		if (swpagerdebug & SDB_ANOM)
+			printf("swap_pager_io: wait on swbuf for %x (%d)\n",
+			       m, flags);
+#endif
+		bswlist.b_flags |= B_WANTED;
+		tsleep((caddr_t)&bswlist, PSWP+1, "swpgiobuf", 0);
+	}
+	bp = bswlist.b_actf;
+	bswlist.b_actf = bp->b_actf;
+	splx(s);
+	bp->b_flags = B_BUSY | (flags & B_READ);
+	bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */
+	bp->b_data = (caddr_t)kva;
+	bp->b_blkno = swb->swb_block + btodb(off);
+	VHOLD(swapdev_vp);
+	bp->b_vp = swapdev_vp;
+	if (swapdev_vp->v_type == VBLK)
+		bp->b_dev = swapdev_vp->v_rdev;
+	bp->b_bcount = npages * PAGE_SIZE;
+
+	/*
+	 * For writes we set up additional buffer fields, record a pageout
+	 * in progress and mark that these swap blocks are now allocated.
+	 */
+	if ((bp->b_flags & B_READ) == 0) {
+		bp->b_dirtyoff = 0;
+		bp->b_dirtyend = npages * PAGE_SIZE;
+		swapdev_vp->v_numoutput++;
+		s = splbio();
+		swp->sw_poip++;
+		splx(s);
+		/* npages consecutive one bits, starting at this page's bit */
+		mask = (~(~0 << npages)) << atop(off);
+#ifdef DEBUG
+		swap_pager_poip++;
+		if (swpagerdebug & SDB_WRITE)
+			printf("swpg_io: write: bp=%x swp=%x poip=%d\n",
+			       bp, swp, swp->sw_poip);
+		if ((swpagerdebug & SDB_ALLOCBLK) &&
+		    (swb->swb_mask & mask) != mask)
+			printf("swpg_io: %x write %d pages at %x+%x\n",
+			       swp->sw_blocks, npages, swb->swb_block,
+			       atop(off));
+		if (swpagerdebug & SDB_CLUSTER)
+			printf("swpg_io: off=%x, npg=%x, mask=%x, bmask=%x\n",
+			       off, npages, mask, swb->swb_mask);
+#endif
+		swb->swb_mask |= mask;
+	}
+	/*
+	 * If this is an async write we set up still more buffer fields
+	 * and place a "cleaning" entry on the inuse queue.
+	 */
+	if ((flags & (B_READ|B_ASYNC)) == B_ASYNC) {
+		/*
+		 * The free list was checked before we (possibly) slept for
+		 * a buffer header above.  Check again on every kernel, not
+		 * just DEBUG ones, so an empty list is a clean panic rather
+		 * than a NULL dereference inside TAILQ_REMOVE.
+		 */
+		if (swap_pager_free.tqh_first == NULL)
+			panic("swpg_io: lost spc");
+		spc = swap_pager_free.tqh_first;
+		TAILQ_REMOVE(&swap_pager_free, spc, spc_list);
+#ifdef DEBUG
+		if (spc->spc_flags != SPC_FREE)
+			panic("swpg_io: bad free spc");
+#endif
+		spc->spc_flags = SPC_BUSY;
+		spc->spc_bp = bp;
+		spc->spc_swp = swp;
+		spc->spc_kva = kva;
+		/*
+		 * Record the first page. This allows swap_pager_clean
+		 * to efficiently handle the common case of a single page.
+		 * For clusters, it allows us to locate the object easily
+		 * and we then reconstruct the rest of the mlist from spc_kva.
+		 */
+		spc->spc_m = m;
+		spc->spc_npages = npages;
+		bp->b_flags |= B_CALL;
+		bp->b_iodone = swap_pager_iodone;
+		s = splbio();
+		TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list);
+		splx(s);
+	}
+
+	/*
+	 * Finally, start the IO operation.
+	 * If it is async we are all done, otherwise we must wait for
+	 * completion and cleanup afterwards.
+	 */
+#ifdef DEBUG
+	if (swpagerdebug & SDB_IO)
+		printf("swpg_io: IO start: bp %x, db %x, va %x, pa %x\n",
+		       bp, swb->swb_block+btodb(off), kva, VM_PAGE_TO_PHYS(m));
+#endif
+	VOP_STRATEGY(bp);
+	if ((flags & (B_READ|B_ASYNC)) == B_ASYNC) {
+#ifdef DEBUG
+		if (swpagerdebug & SDB_IO)
+			printf("swpg_io: IO started: bp %x\n", bp);
+#endif
+		return(VM_PAGER_PEND);
+	}
+	s = splbio();
+#ifdef DEBUG
+	if (flags & B_READ)
+		swap_pager_piip++;
+	else
+		swap_pager_poip++;
+#endif
+	while ((bp->b_flags & B_DONE) == 0)
+		(void) tsleep(bp, PVM, "swpgio", 0);
+	if ((flags & B_READ) == 0)
+		--swp->sw_poip;
+#ifdef DEBUG
+	if (flags & B_READ)
+		--swap_pager_piip;
+	else
+		--swap_pager_poip;
+#endif
+	rv = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;
+	/* hand the buffer header back to bswlist and wake anyone waiting */
+	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
+	bp->b_actf = bswlist.b_actf;
+	bswlist.b_actf = bp;
+	if (bp->b_vp)
+		brelvp(bp);
+	if (bswlist.b_flags & B_WANTED) {
+		bswlist.b_flags &= ~B_WANTED;
+		wakeup(&bswlist);
+	}
+	/*
+	 * NOTE(review): only the first page (m) is marked clean here even
+	 * when npages > 1 -- confirm synchronous writes are never clustered.
+	 */
+	if ((flags & B_READ) == 0 && rv == VM_PAGER_OK) {
+		m->flags |= PG_CLEAN;
+		pmap_clear_modify(VM_PAGE_TO_PHYS(m));
+	}
+	splx(s);
+#ifdef DEBUG
+	if (swpagerdebug & SDB_IO)
+		printf("swpg_io: IO done: bp %x, rv %d\n", bp, rv);
+	if ((swpagerdebug & SDB_FAIL) && rv == VM_PAGER_ERROR)
+		printf("swpg_io: IO error\n");
+#endif
+	vm_pager_unmap_pages(kva, npages);
+	return(rv);
+}
+
+/*
+ * swap_pager_clean: reap finished asynchronous pageouts.
+ *
+ * Repeatedly takes an SPC_DONE entry whose object lock can be acquired off
+ * swap_pager_inuse, updates every page of its cluster (PG_CLEAN on success,
+ * PG_LAUNDRY when SPC_ERROR is set), drops the object's paging_in_progress
+ * count, unmaps the KVA and returns the entry to swap_pager_free.
+ * rw is used only by the DEBUG trace printf.
+ */
+static void
+swap_pager_clean(rw)
+	int rw;
+{
+	register swp_clean_t spc;
+	register int s, i;
+	vm_object_t object;
+	vm_page_t m;
+
+#ifdef DEBUG
+	/* save panic time state */
+	if ((swpagerdebug & SDB_ANOMPANIC) && panicstr)
+		return;
+	if (swpagerdebug & SDB_FOLLOW)
+		printf("swpg_clean(%x)\n", rw);
+#endif
+
+	for (;;) {
+		/*
+		 * Look up and removal from inuse list must be done
+		 * at splbio() to avoid conflicts with swap_pager_iodone.
+		 */
+		s = splbio();
+		for (spc = swap_pager_inuse.tqh_first;
+		     spc != NULL;
+		     spc = spc->spc_list.tqe_next) {
+			/*
+			 * If the operation is done, remove it from the
+			 * list and process it.
+			 *
+			 * XXX if we can't get the object lock we also
+			 * leave it on the list and try again later.
+			 * Is there something better we could do?
+			 */
+			if ((spc->spc_flags & SPC_DONE) &&
+			    vm_object_lock_try(spc->spc_m->object)) {
+				TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list);
+				break;
+			}
+		}
+		splx(s);
+
+		/*
+		 * No operations done, that's all we can do for now.
+		 */
+		if (spc == NULL)
+			break;
+
+		/*
+		 * Found a completed operation so finish it off.
+		 * Note: no longer at splbio since entry is off the list.
+		 */
+		m = spc->spc_m;
+		object = m->object;
+
+		/*
+		 * Process each page in the cluster.
+		 * The first page is explicitly kept in the cleaning
+		 * entry, others must be reconstructed from the KVA.
+		 */
+		for (i = 0; i < spc->spc_npages; i++) {
+			if (i)
+				m = vm_pager_atop(spc->spc_kva + ptoa(i));
+			/*
+			 * If no error mark as clean and inform the pmap
+			 * system. If there was an error, mark as dirty
+			 * so we will try again.
+			 *
+			 * XXX could get stuck doing this, should give up
+			 * after awhile.
+			 */
+			if (spc->spc_flags & SPC_ERROR) {
+				printf("%s: clean of page %x failed\n",
+				       "swap_pager_clean",
+				       VM_PAGE_TO_PHYS(m));
+				m->flags |= PG_LAUNDRY;
+			} else {
+				m->flags |= PG_CLEAN;
+				pmap_clear_modify(VM_PAGE_TO_PHYS(m));
+			}
+			m->flags &= ~PG_BUSY;
+			PAGE_WAKEUP(m);
+		}
+
+		/*
+		 * Done with the object, decrement the paging count
+		 * and unlock it.
+		 */
+		if (--object->paging_in_progress == 0)
+			wakeup(object);
+		vm_object_unlock(object);
+
+		/*
+		 * Free up KVM used and put the entry back on the list.
+		 */
+		vm_pager_unmap_pages(spc->spc_kva, spc->spc_npages);
+		spc->spc_flags = SPC_FREE;
+		TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list);
+#ifdef DEBUG
+		if (swpagerdebug & SDB_WRITE)
+			printf("swpg_clean: free spc %x\n", spc);
+#endif
+	}
+}
+
+#ifdef DEBUG
+/*
+ * swap_pager_clean_check (DEBUG only): panic if any page in mlist is still
+ * part of a cluster recorded on swap_pager_inuse.  rw only selects the
+ * "write" / "read" wording of the SDB_ANOM diagnostic.
+ */
+static void
+swap_pager_clean_check(mlist, npages, rw)
+	vm_page_t *mlist;
+	int npages;
+	int rw;
+{
+	register swp_clean_t spc;
+	boolean_t bad;
+	int i, j, s;
+	vm_page_t m;
+
+	if (panicstr)
+		return;
+
+	bad = FALSE;
+	s = splbio();
+	for (spc = swap_pager_inuse.tqh_first;
+	     spc != NULL;
+	     spc = spc->spc_list.tqe_next) {
+		for (j = 0; j < spc->spc_npages; j++) {
+			m = vm_pager_atop(spc->spc_kva + ptoa(j));
+			for (i = 0; i < npages; i++)
+				if (m == mlist[i]) {
+					if (swpagerdebug & SDB_ANOM)
+						printf(
+"swpg_clean_check: %s: page %x on list, flags %x\n",
+						rw == B_WRITE ? "write" : "read", mlist[i], spc->spc_flags);
+					bad = TRUE;
+				}
+		}
+	}
+	splx(s);
+	if (bad)
+		panic("swpg_clean_check");
+}
+#endif
+
+/*
+ * swap_pager_iodone: b_iodone handler installed by swap_pager_io() for
+ * asynchronous writes.
+ *
+ * Finds the cleaning entry that owns bp on swap_pager_inuse, marks it
+ * SPC_DONE (plus SPC_ERROR if the buffer has B_ERROR set), drops the
+ * pager's sw_poip count and wakes an SW_WANTED sleeper, returns the buffer
+ * header to bswlist, and wakes anyone sleeping on vm_pages_needed.  The
+ * entry itself stays on the inuse list until swap_pager_clean() reaps it.
+ */
+static void
+swap_pager_iodone(bp)
+	register struct buf *bp;
+{
+	register swp_clean_t spc;
+	int s;
+
+#ifdef DEBUG
+	/* save panic time state */
+	if ((swpagerdebug & SDB_ANOMPANIC) && panicstr)
+		return;
+	if (swpagerdebug & SDB_FOLLOW)
+		printf("swpg_iodone(%x)\n", bp);
+#endif
+	s = splbio();
+	for (spc = swap_pager_inuse.tqh_first;
+	     spc != NULL;
+	     spc = spc->spc_list.tqe_next)
+		if (spc->spc_bp == bp)
+			break;
+	/*
+	 * spc is dereferenced unconditionally below, so a missing entry
+	 * must be fatal on every kernel, not only when DEBUG is defined.
+	 */
+	if (spc == NULL)
+		panic("swap_pager_iodone: bp not found");
+
+	spc->spc_flags &= ~SPC_BUSY;
+	spc->spc_flags |= SPC_DONE;
+	if (bp->b_flags & B_ERROR)
+		spc->spc_flags |= SPC_ERROR;
+	spc->spc_bp = NULL;
+
+#ifdef DEBUG
+	--swap_pager_poip;
+	if (swpagerdebug & SDB_WRITE)
+		printf("swpg_iodone: bp=%x swp=%x flags=%x spc=%x poip=%x\n",
+		       bp, spc->spc_swp, spc->spc_swp->sw_flags,
+		       spc, spc->spc_swp->sw_poip);
+#endif
+
+	spc->spc_swp->sw_poip--;
+	if (spc->spc_swp->sw_flags & SW_WANTED) {
+		spc->spc_swp->sw_flags &= ~SW_WANTED;
+		wakeup(spc->spc_swp);
+	}
+
+	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
+	bp->b_actf = bswlist.b_actf;
+	bswlist.b_actf = bp;
+	if (bp->b_vp)
+		brelvp(bp);
+	if (bswlist.b_flags & B_WANTED) {
+		bswlist.b_flags &= ~B_WANTED;
+		wakeup(&bswlist);
+	}
+	wakeup(&vm_pages_needed);
+	splx(s);
+}
diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h
new file mode 100644
index 00000000000..497d92a3938
--- /dev/null
+++ b/sys/vm/swap_pager.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 1990 University of Utah.
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)swap_pager.h 8.1 (Berkeley) 6/11/93 + */ + +#ifndef _SWAP_PAGER_ +#define _SWAP_PAGER_ 1 + +/* + * In the swap pager, the backing store for an object is organized as an + * array of some number of "swap blocks". 
A swap block consists of a bitmask
+ * and some number of contiguous DEV_BSIZE disk blocks. The minimum size
+ * of a swap block is:
+ *
+ * max(PAGE_SIZE, dmmin*DEV_BSIZE) [ 32k currently ]
+ *
+ * bytes (since the pager interface is page oriented), the maximum size is:
+ *
+ * min(#bits(swb_mask)*PAGE_SIZE, dmmax*DEV_BSIZE) [ 128k currently ]
+ *
+ * where dmmin and dmmax are left over from the old VM interface. The bitmask
+ * (swb_mask) is used by swap_pager_haspage() to determine if a particular
+ * page has actually been written; i.e. the pager copy of the page is valid.
+ * All swap blocks in the backing store of an object will be the same size.
+ *
+ * The reason for variable sized swap blocks is to reduce fragmentation of
+ * swap resources. Whenever possible we allocate smaller swap blocks to
+ * smaller objects. The swap block size is determined from a table of
+ * object-size vs. swap-block-size computed at boot time.
+ */
+typedef int sw_bm_t; /* pager bitmask, one bit per page of a swap block */
+
+struct swblock {
+	sw_bm_t swb_mask; /* bitmask of valid (written) pages in this block */
+	daddr_t swb_block; /* starting disk block for this block; 0 = none allocated */
+};
+typedef struct swblock *sw_blk_t;
+
+/*
+ * Swap pager private data.
+ */
+struct swpager {
+	vm_size_t sw_osize; /* size of object we are backing (bytes) */
+	int sw_bsize; /* size of swap blocks (DEV_BSIZE units) */
+	int sw_nblocks;/* number of blocks in list (sw_blk_t units) */
+	sw_blk_t sw_blocks; /* pointer to list of swap blocks */
+	short sw_flags; /* flags (SW_* below) */
+	short sw_poip; /* pageouts in progress */
+};
+typedef struct swpager *sw_pager_t;
+
+/* sw_flags bits */
+#define SW_WANTED 0x01 /* a sleeper is waiting on this pager; cleared with wakeup() at I/O completion */
+#define SW_NAMED 0x02 /* NOTE(review): no user visible in this part of the patch -- confirm meaning */
+
+#endif /* _SWAP_PAGER_ */
diff --git a/sys/vm/vm.h b/sys/vm/vm.h
new file mode 100644
index 00000000000..85f892f29be
--- /dev/null
+++ b/sys/vm/vm.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * @(#)vm.h 8.2 (Berkeley) 12/13/93
+ */
+
+#ifndef VM_H
+#define VM_H
+
+typedef int vm_inherit_t; /* XXX: inheritance codes */
+
+union vm_map_object;
+typedef union vm_map_object vm_map_object_t;
+
+struct vm_map_entry;
+typedef struct vm_map_entry *vm_map_entry_t;
+
+struct vm_map;
+typedef struct vm_map *vm_map_t;
+
+struct vm_object;
+typedef struct vm_object *vm_object_t;
+
+struct vm_page;
+typedef struct vm_page *vm_page_t;
+
+struct pager_struct;
+typedef struct pager_struct *vm_pager_t;
+
+#include <sys/vmmeter.h>
+#include <sys/queue.h>
+#include <vm/vm_param.h>
+#include <vm/lock.h>
+#include <vm/vm_prot.h>
+#include <vm/vm_inherit.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+
+/*
+ * Shareable process virtual address space.
+ * May eventually be merged with vm_map.
+ * Several fields are temporary (text, data stuff).
+ */
+struct vmspace {
+	struct vm_map vm_map; /* VM address map */
+	struct pmap vm_pmap; /* private physical map */
+	int vm_refcnt; /* number of references */
+	caddr_t vm_shm; /* SYS5 shared memory private data XXX */
+/* we copy from vm_startcopy to the end of the structure on fork */
+#define vm_startcopy vm_rssize
+	segsz_t vm_rssize; /* current resident set size in pages */
+	segsz_t vm_swrss; /* resident set size before last swap */
+	segsz_t vm_tsize; /* text size (pages) XXX */
+	segsz_t vm_dsize; /* data size (pages) XXX */
+	segsz_t vm_ssize; /* stack size (pages) */
+	caddr_t vm_taddr; /* user virtual address of text XXX */
+	caddr_t vm_daddr; /* user virtual address of data XXX */
+	caddr_t vm_maxsaddr; /* user VA at max stack growth */
+};
+#endif /* VM_H */
diff --git a/sys/vm/vm_extern.h b/sys/vm/vm_extern.h
new file mode 100644
index 00000000000..bae5f005273
--- /dev/null
+++ b/sys/vm/vm_extern.h
@@ -0,0 +1,125 @@
+/*-
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vm_extern.h 8.2 (Berkeley) 1/12/94 + */ + +struct buf; +struct loadavg; +struct proc; +struct vmspace; +struct vmtotal; +struct mount; +struct vnode; + +#ifdef KGDB +void chgkprot __P((caddr_t, int, int)); +#endif + +#ifdef KERNEL +#ifdef TYPEDEF_FOR_UAP +int getpagesize __P((struct proc *p, void *, int *)); +int madvise __P((struct proc *, void *, int *)); +int mincore __P((struct proc *, void *, int *)); +int mprotect __P((struct proc *, void *, int *)); +int msync __P((struct proc *, void *, int *)); +int munmap __P((struct proc *, void *, int *)); +int obreak __P((struct proc *, void *, int *)); +int sbrk __P((struct proc *, void *, int *)); +int smmap __P((struct proc *, void *, int *)); +int sstk __P((struct proc *, void *, int *)); +#endif + +void assert_wait __P((int, boolean_t)); +int grow __P((struct proc *, u_int)); +void iprintf __P((const char *, ...)); +int kernacc __P((caddr_t, int, int)); +int kinfo_loadavg __P((int, char *, int *, int, int *)); +int kinfo_meter __P((int, caddr_t, int *, int, int *)); +vm_offset_t kmem_alloc __P((vm_map_t, vm_size_t)); +vm_offset_t kmem_alloc_pageable __P((vm_map_t, vm_size_t)); +vm_offset_t kmem_alloc_wait __P((vm_map_t, vm_size_t)); +void kmem_free __P((vm_map_t, vm_offset_t, vm_size_t)); +void kmem_free_wakeup __P((vm_map_t, vm_offset_t, vm_size_t)); +void kmem_init __P((vm_offset_t, vm_offset_t)); +vm_offset_t kmem_malloc __P((vm_map_t, vm_size_t, boolean_t)); +vm_map_t kmem_suballoc __P((vm_map_t, vm_offset_t *, vm_offset_t *, + vm_size_t, boolean_t)); +void loadav __P((struct loadavg *)); +void munmapfd __P((int)); +int pager_cache __P((vm_object_t, boolean_t)); +void sched __P((void)); +int svm_allocate __P((struct proc *, void *, int *)); +int svm_deallocate __P((struct proc *, void *, int *)); +int svm_inherit __P((struct proc *, void *, int *)); +int svm_protect __P((struct proc *, void *, int *)); +void swapinit __P((void)); +int swapon __P((struct proc *, void *, int *)); +void swapout 
__P((struct proc *)); +void swapout_threads __P((void)); +int swfree __P((struct proc *, int)); +void swstrategy __P((struct buf *)); +void thread_block __P((void)); +void thread_sleep __P((int, simple_lock_t, boolean_t)); +void thread_wakeup __P((int)); +int useracc __P((caddr_t, int, int)); +int vm_allocate __P((vm_map_t, + vm_offset_t *, vm_size_t, boolean_t)); +int vm_allocate_with_pager __P((vm_map_t, vm_offset_t *, + vm_size_t, boolean_t, vm_pager_t, vm_offset_t, boolean_t)); +int vm_deallocate __P((vm_map_t, vm_offset_t, vm_size_t)); +int vm_fault __P((vm_map_t, vm_offset_t, vm_prot_t, boolean_t)); +void vm_fault_copy_entry __P((vm_map_t, + vm_map_t, vm_map_entry_t, vm_map_entry_t)); +void vm_fault_unwire __P((vm_map_t, vm_offset_t, vm_offset_t)); +int vm_fault_wire __P((vm_map_t, vm_offset_t, vm_offset_t)); +int vm_fork __P((struct proc *, struct proc *, int)); +int vm_inherit __P((vm_map_t, + vm_offset_t, vm_size_t, vm_inherit_t)); +void vm_init_limits __P((struct proc *)); +void vm_mem_init __P((void)); +int vm_mmap __P((vm_map_t, vm_offset_t *, vm_size_t, + vm_prot_t, vm_prot_t, int, caddr_t, vm_offset_t)); +int vm_protect __P((vm_map_t, + vm_offset_t, vm_size_t, boolean_t, vm_prot_t)); +void vm_set_page_size __P((void)); +void vmmeter __P((void)); +struct vmspace *vmspace_alloc __P((vm_offset_t, vm_offset_t, int)); +struct vmspace *vmspace_fork __P((struct vmspace *)); +void vmspace_free __P((struct vmspace *)); +void vmtotal __P((struct vmtotal *)); +void vnode_pager_setsize __P((struct vnode *, u_long)); +void vnode_pager_umount __P((struct mount *)); +boolean_t vnode_pager_uncache __P((struct vnode *)); +void vslock __P((caddr_t, u_int)); +void vsunlock __P((caddr_t, u_int, int)); +#endif diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c new file mode 100644 index 00000000000..f60abf2b5f3 --- /dev/null +++ b/sys/vm/vm_fault.c @@ -0,0 +1,1035 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_fault.c 8.4 (Berkeley) 1/12/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. 
+ * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Page fault handling module. + */ + +#include <sys/param.h> +#include <sys/systm.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> + +/* + * vm_fault: + * + * Handle a page fault occurring at the given address, + * requiring the given permissions, in the map specified. + * If successful, the page is inserted into the + * associated physical map. + * + * NOTE: the given address should be truncated to the + * proper page address. + * + * KERN_SUCCESS is returned if the page fault is handled; otherwise, + * a standard error specifying why the fault is fatal is returned. + * + * + * The map in question must be referenced, and remains so. + * Caller may hold no locks.
+ */ +int +vm_fault(map, vaddr, fault_type, change_wiring) + vm_map_t map; + vm_offset_t vaddr; + vm_prot_t fault_type; + boolean_t change_wiring; +{ + vm_object_t first_object; + vm_offset_t first_offset; + vm_map_entry_t entry; + register vm_object_t object; + register vm_offset_t offset; + register vm_page_t m; + vm_page_t first_m; + vm_prot_t prot; + int result; + boolean_t wired; + boolean_t su; + boolean_t lookup_still_valid; + boolean_t page_exists; + vm_page_t old_m; + vm_object_t next_object; + + cnt.v_faults++; /* needs lock XXX */ +/* + * Recovery actions + */ +#define FREE_PAGE(m) { \ + PAGE_WAKEUP(m); \ + vm_page_lock_queues(); \ + vm_page_free(m); \ + vm_page_unlock_queues(); \ +} + +#define RELEASE_PAGE(m) { \ + PAGE_WAKEUP(m); \ + vm_page_lock_queues(); \ + vm_page_activate(m); \ + vm_page_unlock_queues(); \ +} + +#define UNLOCK_MAP { \ + if (lookup_still_valid) { \ + vm_map_lookup_done(map, entry); \ + lookup_still_valid = FALSE; \ + } \ +} + +#define UNLOCK_THINGS { \ + object->paging_in_progress--; \ + vm_object_unlock(object); \ + if (object != first_object) { \ + vm_object_lock(first_object); \ + FREE_PAGE(first_m); \ + first_object->paging_in_progress--; \ + vm_object_unlock(first_object); \ + } \ + UNLOCK_MAP; \ +} + +#define UNLOCK_AND_DEALLOCATE { \ + UNLOCK_THINGS; \ + vm_object_deallocate(first_object); \ +} + + RetryFault: ; + + /* + * Find the backing store object and offset into + * it to begin the search. + */ + + if ((result = vm_map_lookup(&map, vaddr, fault_type, &entry, + &first_object, &first_offset, + &prot, &wired, &su)) != KERN_SUCCESS) { + return(result); + } + lookup_still_valid = TRUE; + + if (wired) + fault_type = prot; + + first_m = NULL; + + /* + * Make a reference to this object to + * prevent its disposal while we are messing with + * it. Once we have the reference, the map is free + * to be diddled. Since objects reference their + * shadows (and copies), they will stay around as well. 
+ */ + + vm_object_lock(first_object); + + first_object->ref_count++; + first_object->paging_in_progress++; + + /* + * INVARIANTS (through entire routine): + * + * 1) At all times, we must either have the object + * lock or a busy page in some object to prevent + * some other thread from trying to bring in + * the same page. + * + * Note that we cannot hold any locks during the + * pager access or when waiting for memory, so + * we use a busy page then. + * + * Note also that we aren't as concerned about + * more than one thead attempting to pager_data_unlock + * the same page at once, so we don't hold the page + * as busy then, but do record the highest unlock + * value so far. [Unlock requests may also be delivered + * out of order.] + * + * 2) Once we have a busy page, we must remove it from + * the pageout queues, so that the pageout daemon + * will not grab it away. + * + * 3) To prevent another thread from racing us down the + * shadow chain and entering a new page in the top + * object before we do, we must keep a busy page in + * the top object while following the shadow chain. + * + * 4) We must increment paging_in_progress on any object + * for which we have a busy page, to prevent + * vm_object_collapse from removing the busy page + * without our noticing. + */ + + /* + * Search for the page at object/offset. + */ + + object = first_object; + offset = first_offset; + + /* + * See whether this page is resident + */ + + while (TRUE) { + m = vm_page_lookup(object, offset); + if (m != NULL) { + /* + * If the page is being brought in, + * wait for it and then retry. 
+ */ + if (m->flags & PG_BUSY) { +#ifdef DOTHREADS + int wait_result; + + PAGE_ASSERT_WAIT(m, !change_wiring); + UNLOCK_THINGS; + thread_block(); + wait_result = current_thread()->wait_result; + vm_object_deallocate(first_object); + if (wait_result != THREAD_AWAKENED) + return(KERN_SUCCESS); + goto RetryFault; +#else + PAGE_ASSERT_WAIT(m, !change_wiring); + UNLOCK_THINGS; + cnt.v_intrans++; + thread_block(); + vm_object_deallocate(first_object); + goto RetryFault; +#endif + } + + /* + * Remove the page from the pageout daemon's + * reach while we play with it. + */ + + vm_page_lock_queues(); + if (m->flags & PG_INACTIVE) { + TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); + m->flags &= ~PG_INACTIVE; + cnt.v_inactive_count--; + cnt.v_reactivated++; + } + + if (m->flags & PG_ACTIVE) { + TAILQ_REMOVE(&vm_page_queue_active, m, pageq); + m->flags &= ~PG_ACTIVE; + cnt.v_active_count--; + } + vm_page_unlock_queues(); + + /* + * Mark page busy for other threads. + */ + m->flags |= PG_BUSY; + break; + } + + if (((object->pager != NULL) && + (!change_wiring || wired)) + || (object == first_object)) { + + /* + * Allocate a new page for this object/offset + * pair. + */ + + m = vm_page_alloc(object, offset); + + if (m == NULL) { + UNLOCK_AND_DEALLOCATE; + VM_WAIT; + goto RetryFault; + } + } + + if (object->pager != NULL && (!change_wiring || wired)) { + int rv; + + /* + * Now that we have a busy page, we can + * release the object lock. + */ + vm_object_unlock(object); + + /* + * Call the pager to retrieve the data, if any, + * after releasing the lock on the map. + */ + UNLOCK_MAP; + cnt.v_pageins++; + rv = vm_pager_get(object->pager, m, TRUE); + + /* + * Reaquire the object lock to preserve our + * invariant. + */ + vm_object_lock(object); + + /* + * Found the page. + * Leave it busy while we play with it. + */ + if (rv == VM_PAGER_OK) { + /* + * Relookup in case pager changed page. + * Pager is responsible for disposition + * of old page if moved. 
+ */ + m = vm_page_lookup(object, offset); + + cnt.v_pgpgin++; + m->flags &= ~PG_FAKE; + m->flags |= PG_CLEAN; + pmap_clear_modify(VM_PAGE_TO_PHYS(m)); + break; + } + + /* + * IO error or page outside the range of the pager: + * cleanup and return an error. + */ + if (rv == VM_PAGER_ERROR || rv == VM_PAGER_BAD) { + FREE_PAGE(m); + UNLOCK_AND_DEALLOCATE; + return(KERN_PROTECTION_FAILURE); /* XXX */ + } + /* + * rv == VM_PAGER_FAIL: + * + * Page does not exist at this object/offset. + * Free the bogus page (waking up anyone waiting + * for it) and continue on to the next object. + * + * If this is the top-level object, we must + * leave the busy page to prevent another + * thread from rushing past us, and inserting + * the page in that object at the same time + * that we are. + */ + if (object != first_object) { + FREE_PAGE(m); + /* note that `m' is not used after this */ + } + } + + /* + * We get here if the object has no pager (or unwiring) + * or the pager doesn't have the page. + */ + if (object == first_object) + first_m = m; + + /* + * Move on to the next object. Lock the next + * object before unlocking the current one. + */ + + offset += object->shadow_offset; + next_object = object->shadow; + if (next_object == NULL) { + /* + * If there's no object left, fill the page + * in the top object with zeros. + */ + if (object != first_object) { + object->paging_in_progress--; + vm_object_unlock(object); + + object = first_object; + offset = first_offset; + m = first_m; + vm_object_lock(object); + } + first_m = NULL; + + vm_page_zero_fill(m); + cnt.v_zfod++; + m->flags &= ~PG_FAKE; + break; + } + else { + vm_object_lock(next_object); + if (object != first_object) + object->paging_in_progress--; + vm_object_unlock(object); + object = next_object; + object->paging_in_progress++; + } + } + + if ((m->flags & (PG_ACTIVE | PG_INACTIVE | PG_BUSY)) != PG_BUSY) + panic("vm_fault: active, inactive or !busy after main loop"); + + /* + * PAGE HAS BEEN FOUND. 
+ * [Loop invariant still holds -- the object lock + * is held.] + */ + + old_m = m; /* save page that would be copied */ + + /* + * If the page is being written, but isn't + * already owned by the top-level object, + * we have to copy it into a new page owned + * by the top-level object. + */ + + if (object != first_object) { + /* + * We only really need to copy if we + * want to write it. + */ + + if (fault_type & VM_PROT_WRITE) { + + /* + * If we try to collapse first_object at this + * point, we may deadlock when we try to get + * the lock on an intermediate object (since we + * have the bottom object locked). We can't + * unlock the bottom object, because the page + * we found may move (by collapse) if we do. + * + * Instead, we first copy the page. Then, when + * we have no more use for the bottom object, + * we unlock it and try to collapse. + * + * Note that we copy the page even if we didn't + * need to... that's the breaks. + */ + + /* + * We already have an empty page in + * first_object - use it. + */ + + vm_page_copy(m, first_m); + first_m->flags &= ~PG_FAKE; + + /* + * If another map is truly sharing this + * page with us, we have to flush all + * uses of the original page, since we + * can't distinguish those which want the + * original from those which need the + * new copy. + * + * XXX If we know that only one map has + * access to this page, then we could + * avoid the pmap_page_protect() call. + */ + + vm_page_lock_queues(); + vm_page_activate(m); + vm_page_deactivate(m); + pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); + vm_page_unlock_queues(); + + /* + * We no longer need the old page or object. + */ + PAGE_WAKEUP(m); + object->paging_in_progress--; + vm_object_unlock(object); + + /* + * Only use the new page below... + */ + + cnt.v_cow_faults++; + m = first_m; + object = first_object; + offset = first_offset; + + /* + * Now that we've gotten the copy out of the + * way, let's try to collapse the top object. 
+ */ + vm_object_lock(object); + /* + * But we have to play ugly games with + * paging_in_progress to do that... + */ + object->paging_in_progress--; + vm_object_collapse(object); + object->paging_in_progress++; + } + else { + prot &= ~VM_PROT_WRITE; + m->flags |= PG_COPYONWRITE; + } + } + + if (m->flags & (PG_ACTIVE|PG_INACTIVE)) + panic("vm_fault: active or inactive before copy object handling"); + + /* + * If the page is being written, but hasn't been + * copied to the copy-object, we have to copy it there. + */ + RetryCopy: + if (first_object->copy != NULL) { + vm_object_t copy_object = first_object->copy; + vm_offset_t copy_offset; + vm_page_t copy_m; + + /* + * We only need to copy if we want to write it. + */ + if ((fault_type & VM_PROT_WRITE) == 0) { + prot &= ~VM_PROT_WRITE; + m->flags |= PG_COPYONWRITE; + } + else { + /* + * Try to get the lock on the copy_object. + */ + if (!vm_object_lock_try(copy_object)) { + vm_object_unlock(object); + /* should spin a bit here... */ + vm_object_lock(object); + goto RetryCopy; + } + + /* + * Make another reference to the copy-object, + * to keep it from disappearing during the + * copy. + */ + copy_object->ref_count++; + + /* + * Does the page exist in the copy? + */ + copy_offset = first_offset + - copy_object->shadow_offset; + copy_m = vm_page_lookup(copy_object, copy_offset); + if (page_exists = (copy_m != NULL)) { + if (copy_m->flags & PG_BUSY) { +#ifdef DOTHREADS + int wait_result; + + /* + * If the page is being brought + * in, wait for it and then retry. + */ + PAGE_ASSERT_WAIT(copy_m, !change_wiring); + RELEASE_PAGE(m); + copy_object->ref_count--; + vm_object_unlock(copy_object); + UNLOCK_THINGS; + thread_block(); + wait_result = current_thread()->wait_result; + vm_object_deallocate(first_object); + if (wait_result != THREAD_AWAKENED) + return(KERN_SUCCESS); + goto RetryFault; +#else + /* + * If the page is being brought + * in, wait for it and then retry. 
+ */ + PAGE_ASSERT_WAIT(copy_m, !change_wiring); + RELEASE_PAGE(m); + copy_object->ref_count--; + vm_object_unlock(copy_object); + UNLOCK_THINGS; + thread_block(); + vm_object_deallocate(first_object); + goto RetryFault; +#endif + } + } + + /* + * If the page is not in memory (in the object) + * and the object has a pager, we have to check + * if the pager has the data in secondary + * storage. + */ + if (!page_exists) { + + /* + * If we don't allocate a (blank) page + * here... another thread could try + * to page it in, allocate a page, and + * then block on the busy page in its + * shadow (first_object). Then we'd + * trip over the busy page after we + * found that the copy_object's pager + * doesn't have the page... + */ + copy_m = vm_page_alloc(copy_object, + copy_offset); + if (copy_m == NULL) { + /* + * Wait for a page, then retry. + */ + RELEASE_PAGE(m); + copy_object->ref_count--; + vm_object_unlock(copy_object); + UNLOCK_AND_DEALLOCATE; + VM_WAIT; + goto RetryFault; + } + + if (copy_object->pager != NULL) { + vm_object_unlock(object); + vm_object_unlock(copy_object); + UNLOCK_MAP; + + page_exists = vm_pager_has_page( + copy_object->pager, + (copy_offset + copy_object->paging_offset)); + + vm_object_lock(copy_object); + + /* + * Since the map is unlocked, someone + * else could have copied this object + * and put a different copy_object + * between the two. Or, the last + * reference to the copy-object (other + * than the one we have) may have + * disappeared - if that has happened, + * we don't need to make the copy. + */ + if (copy_object->shadow != object || + copy_object->ref_count == 1) { + /* + * Gaah... start over! 
+ */ + FREE_PAGE(copy_m); + vm_object_unlock(copy_object); + vm_object_deallocate(copy_object); + /* may block */ + vm_object_lock(object); + goto RetryCopy; + } + vm_object_lock(object); + + if (page_exists) { + /* + * We didn't need the page + */ + FREE_PAGE(copy_m); + } + } + } + if (!page_exists) { + /* + * Must copy page into copy-object. + */ + vm_page_copy(m, copy_m); + copy_m->flags &= ~PG_FAKE; + + /* + * Things to remember: + * 1. The copied page must be marked 'dirty' + * so it will be paged out to the copy + * object. + * 2. If the old page was in use by any users + * of the copy-object, it must be removed + * from all pmaps. (We can't know which + * pmaps use it.) + */ + vm_page_lock_queues(); + pmap_page_protect(VM_PAGE_TO_PHYS(old_m), + VM_PROT_NONE); + copy_m->flags &= ~PG_CLEAN; + vm_page_activate(copy_m); /* XXX */ + vm_page_unlock_queues(); + + PAGE_WAKEUP(copy_m); + } + /* + * The reference count on copy_object must be + * at least 2: one for our extra reference, + * and at least one from the outside world + * (we checked that when we last locked + * copy_object). + */ + copy_object->ref_count--; + vm_object_unlock(copy_object); + m->flags &= ~PG_COPYONWRITE; + } + } + + if (m->flags & (PG_ACTIVE | PG_INACTIVE)) + panic("vm_fault: active or inactive before retrying lookup"); + + /* + * We must verify that the maps have not changed + * since our last lookup. + */ + + if (!lookup_still_valid) { + vm_object_t retry_object; + vm_offset_t retry_offset; + vm_prot_t retry_prot; + + /* + * Since map entries may be pageable, make sure we can + * take a page fault on them. + */ + vm_object_unlock(object); + + /* + * To avoid trying to write_lock the map while another + * thread has it read_locked (in vm_map_pageable), we + * do not try for write permission. If the page is + * still writable, we will get write permission. 
If it + * is not, or has been marked needs_copy, we enter the + * mapping without write permission, and will merely + * take another fault. + */ + result = vm_map_lookup(&map, vaddr, + fault_type & ~VM_PROT_WRITE, &entry, + &retry_object, &retry_offset, &retry_prot, + &wired, &su); + + vm_object_lock(object); + + /* + * If we don't need the page any longer, put it on the + * active list (the easiest thing to do here). If no + * one needs it, pageout will grab it eventually. + */ + + if (result != KERN_SUCCESS) { + RELEASE_PAGE(m); + UNLOCK_AND_DEALLOCATE; + return(result); + } + + lookup_still_valid = TRUE; + + if ((retry_object != first_object) || + (retry_offset != first_offset)) { + RELEASE_PAGE(m); + UNLOCK_AND_DEALLOCATE; + goto RetryFault; + } + + /* + * Check whether the protection has changed or the object + * has been copied while we left the map unlocked. + * Changing from read to write permission is OK - we leave + * the page write-protected, and catch the write fault. + * Changing from write to read permission means that we + * can't mark the page write-enabled after all. + */ + prot &= retry_prot; + if (m->flags & PG_COPYONWRITE) + prot &= ~VM_PROT_WRITE; + } + + /* + * (the various bits we're fiddling with here are locked by + * the object's lock) + */ + + /* XXX This distorts the meaning of the copy_on_write bit */ + + if (prot & VM_PROT_WRITE) + m->flags &= ~PG_COPYONWRITE; + + /* + * It's critically important that a wired-down page be faulted + * only once in each map for which it is wired. + */ + + if (m->flags & (PG_ACTIVE | PG_INACTIVE)) + panic("vm_fault: active or inactive before pmap_enter"); + + vm_object_unlock(object); + + /* + * Put this page into the physical map. + * We had to do the unlock above because pmap_enter + * may cause other faults. We don't put the + * page back on the active queue until later so + * that the page-out daemon won't find us (yet). 
+ */ + + pmap_enter(map->pmap, vaddr, VM_PAGE_TO_PHYS(m), prot, wired); + + /* + * If the page is not wired down, then put it where the + * pageout daemon can find it. + */ + vm_object_lock(object); + vm_page_lock_queues(); + if (change_wiring) { + if (wired) + vm_page_wire(m); + else + vm_page_unwire(m); + } + else + vm_page_activate(m); + vm_page_unlock_queues(); + + /* + * Unlock everything, and return + */ + + PAGE_WAKEUP(m); + UNLOCK_AND_DEALLOCATE; + + return(KERN_SUCCESS); + +} + +/* + * vm_fault_wire: + * + * Wire down a range of virtual addresses in a map. Returns KERN_SUCCESS, or the first nonzero status returned by vm_fault(). + */ +int +vm_fault_wire(map, start, end) + vm_map_t map; + vm_offset_t start, end; +{ + register vm_offset_t va; + register pmap_t pmap; + int rv; + + pmap = vm_map_pmap(map); + + /* + * Inform the physical mapping system that the + * range of addresses may not fault, so that + * page tables and such can be locked down as well. + */ + + pmap_pageable(pmap, start, end, FALSE); + + /* + * We simulate a fault to get the page and enter it + * in the physical map. If a fault fails, the pages wired so far ([start, va)) are unwired before its status is returned. + */ + + for (va = start; va < end; va += PAGE_SIZE) { + rv = vm_fault(map, va, VM_PROT_NONE, TRUE); + if (rv) { + if (va != start) + vm_fault_unwire(map, start, va); + return(rv); + } + } + return(KERN_SUCCESS); +} + + +/* + * vm_fault_unwire: + * + * Unwire a range of virtual addresses in a map. Panics if pmap_extract() returns 0 for any page in the range. + */ +void vm_fault_unwire(map, start, end) + vm_map_t map; + vm_offset_t start, end; +{ + + register vm_offset_t va, pa; + register pmap_t pmap; + + pmap = vm_map_pmap(map); + + /* + * Since the pages are wired down, we must be able to + * get their mappings from the physical map system.
+ */ + + vm_page_lock_queues(); + + for (va = start; va < end; va += PAGE_SIZE) { + pa = pmap_extract(pmap, va); + if (pa == (vm_offset_t) 0) { + panic("unwire: page not in pmap"); + } + pmap_change_wiring(pmap, va, FALSE); + vm_page_unwire(PHYS_TO_VM_PAGE(pa)); + } + vm_page_unlock_queues(); + + /* + * Inform the physical mapping system that the range + * of addresses may fault, so that page tables and + * such may be unwired themselves. + */ + + pmap_pageable(pmap, start, end, TRUE); + +} + +/* + * Routine: + * vm_fault_copy_entry + * Function: + * Copy all of the pages from a wired-down map entry to another. + * + * In/out conditions: + * The source and destination maps must be locked for write. + * The source map entry must be wired down (or be a sharing map + * entry corresponding to a main map entry that is wired down). + */ + +void vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry) + vm_map_t dst_map; + vm_map_t src_map; + vm_map_entry_t dst_entry; + vm_map_entry_t src_entry; +{ + + vm_object_t dst_object; + vm_object_t src_object; + vm_offset_t dst_offset; + vm_offset_t src_offset; + vm_prot_t prot; + vm_offset_t vaddr; + vm_page_t dst_m; + vm_page_t src_m; + /* src_map is not otherwise used in this routine; the increment below only quiets lint */ +#ifdef lint + src_map++; +#endif + + src_object = src_entry->object.vm_object; + src_offset = src_entry->offset; + + /* + * Create the top-level object for the destination entry. + * (Doesn't actually shadow anything - we copy the pages + * directly.) + */ + dst_object = vm_object_allocate( + (vm_size_t) (dst_entry->end - dst_entry->start)); + + dst_entry->object.vm_object = dst_object; + dst_entry->offset = 0; + + prot = dst_entry->max_protection; + + /* + * Loop through all of the pages in the entry's range, copying + * each one from the source object (it should be there) to the + * destination object.
+ */ + for (vaddr = dst_entry->start, dst_offset = 0; + vaddr < dst_entry->end; + vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) { + + /* + * Allocate a page in the destination object, retrying after VM_WAIT (with the object unlocked) until vm_page_alloc succeeds + */ + vm_object_lock(dst_object); + do { + dst_m = vm_page_alloc(dst_object, dst_offset); + if (dst_m == NULL) { + vm_object_unlock(dst_object); + VM_WAIT; + vm_object_lock(dst_object); + } + } while (dst_m == NULL); + + /* + * Find the page in the source object, and copy it in. + * (Because the source is wired down, the page will be + * in memory.) NOTE: the panic message below says vm_fault_copy_wired, not vm_fault_copy_entry. + */ + vm_object_lock(src_object); + src_m = vm_page_lookup(src_object, dst_offset + src_offset); + if (src_m == NULL) + panic("vm_fault_copy_wired: page missing"); + + vm_page_copy(src_m, dst_m); + + /* + * Enter it in the pmap (prot is dst_entry->max_protection; the wired argument is FALSE)... + */ + vm_object_unlock(src_object); + vm_object_unlock(dst_object); + + pmap_enter(dst_map->pmap, vaddr, VM_PAGE_TO_PHYS(dst_m), + prot, FALSE); + + /* + * Mark it no longer busy, and put it on the active list. + */ + vm_object_lock(dst_object); + vm_page_lock_queues(); + vm_page_activate(dst_m); + vm_page_unlock_queues(); + PAGE_WAKEUP(dst_m); + vm_object_unlock(dst_object); + } + +} diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c new file mode 100644 index 00000000000..5676ff3f7cc --- /dev/null +++ b/sys/vm/vm_glue.c @@ -0,0 +1,605 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_glue.c 8.6 (Berkeley) 1/5/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. 
+ * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +int avefree = 0; /* XXX */ +unsigned maxdmap = MAXDSIZ; /* XXX */ +int readbuffers = 0; /* XXX allow kgdb to read kernel buffer pool */ +/* kernacc: return 1 if kernel_map allows rw access (B_READ => read, otherwise write) to the pages covering [addr, addr+len), else 0; ranges overlapping the buffer pool are refused unless readbuffers is nonzero. */ +int +kernacc(addr, len, rw) + caddr_t addr; + int len, rw; +{ + boolean_t rv; + vm_offset_t saddr, eaddr; + vm_prot_t prot = rw == B_READ ? VM_PROT_READ : VM_PROT_WRITE; + + saddr = trunc_page(addr); + eaddr = round_page(addr+len); + rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot); + /* + * XXX there are still some things (e.g. the buffer cache) that + * are managed behind the VM system's back so even though an + * address is accessible in the mind of the VM system, there may + * not be physical pages where the VM thinks there are. This can + * lead to bogus allocation of pages in the kernel address space + * or worse, inconsistencies at the pmap level. We only worry + * about the buffer cache for now. + */ + if (!readbuffers && rv && (eaddr > (vm_offset_t)buffers && + saddr < (vm_offset_t)buffers + MAXBSIZE * nbuf)) + rv = FALSE; + return(rv == TRUE); +} +/* useracc: return 1 if the current process's map allows rw access (B_READ => read, otherwise write) to the pages covering [addr, addr+len), else 0. */ +int +useracc(addr, len, rw) + caddr_t addr; + int len, rw; +{ + boolean_t rv; + vm_prot_t prot = rw == B_READ ?
VM_PROT_READ : VM_PROT_WRITE; + + rv = vm_map_check_protection(&curproc->p_vmspace->vm_map, + trunc_page(addr), round_page(addr+len), prot); + return(rv == TRUE); +} + +#ifdef KGDB +/* + * Change protections on kernel pages from addr to addr+len + * (presumably so debugger can plant a breakpoint). + * + * We force the protection change at the pmap level. If we were + * to use vm_map_protect a change to allow writing would be lazily- + * applied meaning we would still take a protection fault, something + * we really don't want to do. It would also fragment the kernel + * map unnecessarily. We cannot use pmap_protect since it also won't + * enforce a write-enable request. Using pmap_enter is the only way + * we can ensure the change takes place properly. + */ +void +chgkprot(addr, len, rw) + register caddr_t addr; + int len, rw; +{ + vm_prot_t prot; + vm_offset_t pa, sva, eva; + + prot = rw == B_READ ? VM_PROT_READ : VM_PROT_READ|VM_PROT_WRITE; + eva = round_page(addr + len); + for (sva = trunc_page(addr); sva < eva; sva += PAGE_SIZE) { + /* + * Extract physical address for the page. + * We use a cheesy hack to differentiate physical + * page 0 from an invalid mapping, not that it + * really matters: the lookup address has its low bit set, and that bit is masked off the result before pmap_enter. + */ + pa = pmap_extract(kernel_pmap, sva|1); + if (pa == 0) + panic("chgkprot: invalid page"); + pmap_enter(kernel_pmap, sva, pa&~1, prot, TRUE); + } +} +#endif +/* vslock: call vm_map_pageable(..., FALSE) on the pages of the current process's map covering [addr, addr+len). */ +void +vslock(addr, len) + caddr_t addr; + u_int len; +{ + vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page(addr), + round_page(addr+len), FALSE); +} +/* vsunlock: counterpart of vslock, calling vm_map_pageable(..., TRUE) on the same range; dirtied is only touched under lint. */ +void +vsunlock(addr, len, dirtied) + caddr_t addr; + u_int len; + int dirtied; +{ +#ifdef lint + dirtied++; +#endif + vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page(addr), + round_page(addr+len), TRUE); +} + +/* + * Implement fork's actions on an address space.
+ * Here we arrange for the address space to be copied or referenced, + * allocate a user struct (pcb and kernel stack), then call the + * machine-dependent layer to fill those in and make the new process + * ready to run. + * NOTE: the kernel stack may be at a different location in the child + * process, and thus addresses of automatic variables may be invalid + * after cpu_fork returns in the child process. We do nothing here + * after cpu_fork returns, other than pass its return value back to the caller. + */ +int +vm_fork(p1, p2, isvfork) + register struct proc *p1, *p2; + int isvfork; +{ + register struct user *up; + vm_offset_t addr; + +#ifdef i386 + /* + * avoid copying any of the parent's pagetables or other per-process + * objects that reside in the map by marking all of them non-inheritable + */ + (void)vm_map_inherit(&p1->p_vmspace->vm_map, + UPT_MIN_ADDRESS-UPAGES*NBPG, VM_MAX_ADDRESS, VM_INHERIT_NONE); +#endif + p2->p_vmspace = vmspace_fork(p1->p_vmspace); + +#ifdef SYSVSHM + if (p1->p_vmspace->vm_shm) + shmfork(p1, p2, isvfork); +#endif + +#ifndef i386 + /* + * Allocate a wired-down (for now) pcb and kernel stack for the process + */ + addr = kmem_alloc_pageable(kernel_map, ctob(UPAGES)); + if (addr == 0) + panic("vm_fork: no more kernel virtual memory"); + vm_map_pageable(kernel_map, addr, addr + ctob(UPAGES), FALSE); +#else +/* XXX somehow, on 386, occasionally pageout removes active, wired down kstack, +and pagetables, WITHOUT going through vm_page_unwire! Why this appears to work is +not yet clear, yet it does... */ + addr = kmem_alloc(kernel_map, ctob(UPAGES)); + if (addr == 0) + panic("vm_fork: no more kernel virtual memory"); +#endif + up = (struct user *)addr; + p2->p_addr = up; + + /* + * p_stats and p_sigacts currently point at fields + * in the user struct but not at &u, instead at p_addr. + * Copy p_sigacts and parts of p_stats; zero the rest + * of p_stats (statistics).
+ */ + p2->p_stats = &up->u_stats; + p2->p_sigacts = &up->u_sigacts; + up->u_sigacts = *p1->p_sigacts; + bzero(&up->u_stats.pstat_startzero, + (unsigned) ((caddr_t)&up->u_stats.pstat_endzero - + (caddr_t)&up->u_stats.pstat_startzero)); + bcopy(&p1->p_stats->pstat_startcopy, &up->u_stats.pstat_startcopy, + ((caddr_t)&up->u_stats.pstat_endcopy - + (caddr_t)&up->u_stats.pstat_startcopy)); + +#ifdef i386 + { u_int addr = UPT_MIN_ADDRESS - UPAGES*NBPG; struct vm_map *vp; + /* this addr shadows the outer addr; the child's range from it up to UPT_MAX_ADDRESS is deallocated, allocated afresh and marked VM_INHERIT_NONE */ + vp = &p2->p_vmspace->vm_map; + (void)vm_deallocate(vp, addr, UPT_MAX_ADDRESS - addr); + (void)vm_allocate(vp, &addr, UPT_MAX_ADDRESS - addr, FALSE); + (void)vm_map_inherit(vp, addr, UPT_MAX_ADDRESS, VM_INHERIT_NONE); + } +#endif + /* + * cpu_fork will copy and update the kernel stack and pcb, + * and make the child ready to run. It marks the child + * so that it can return differently than the parent. + * It returns twice, once in the parent process and + * once in the child. + */ + return (cpu_fork(p1, p2)); +} + +/* + * Set default limits for VM system. + * Called for proc 0, and then inherited by all others. + */ +void +vm_init_limits(p) + register struct proc *p; +{ + + /* + * Set up the initial limits on process VM. + * Set the maximum resident set size to be all + * of (reasonably) available memory. This causes + * any single, large process to start random page + * replacement once it fills memory. Only rlim_cur is assigned for RLIMIT_RSS; its rlim_max is not touched here. + */ + p->p_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ; + p->p_rlimit[RLIMIT_STACK].rlim_max = MAXSSIZ; + p->p_rlimit[RLIMIT_DATA].rlim_cur = DFLDSIZ; + p->p_rlimit[RLIMIT_DATA].rlim_max = MAXDSIZ; + p->p_rlimit[RLIMIT_RSS].rlim_cur = ptoa(cnt.v_free_count); +} + +#include + +#ifdef DEBUG +int enableswap = 1; +int swapdebug = 0; +#define SDB_FOLLOW 1 +#define SDB_SWAPIN 2 +#define SDB_SWAPOUT 4 +#endif + +/* + * Brutally simple: + * 1. Attempt to swapin every swapped-out, runnable process in + * order of priority. + * 2. If not enough memory, wake the pageout daemon and let it + * clear some space.
+ */ +void +scheduler() +{ + register struct proc *p; + register int pri; + struct proc *pp; + int ppri; + vm_offset_t addr; + vm_size_t size; + /* Each pass picks the SRUN process without P_INMEM that has the largest p_swtime + p_slptime - 8 * p_nice. */ +loop: +#ifdef DEBUG + while (!enableswap) + sleep((caddr_t)&proc0, PVM); +#endif + pp = NULL; + ppri = INT_MIN; + for (p = (struct proc *)allproc; p != NULL; p = p->p_next) { + if (p->p_stat == SRUN && (p->p_flag & P_INMEM) == 0) { + pri = p->p_swtime + p->p_slptime - p->p_nice * 8; + if (pri > ppri) { + pp = p; + ppri = pri; + } + } + } +#ifdef DEBUG + if (swapdebug & SDB_FOLLOW) + printf("sched: running, procp %x pri %d\n", pp, ppri); +#endif + /* + * Nothing to do, back to sleep (on &proc0) + */ + if ((p = pp) == NULL) { + sleep((caddr_t)&proc0, PVM); + goto loop; + } + + /* + * We would like to bring someone in. + * This part is really bogus because we could deadlock on memory + * despite our feeble check. + */ + size = round_page(ctob(UPAGES)); + addr = (vm_offset_t) p->p_addr; + if (cnt.v_free_count > atop(size)) { +#ifdef DEBUG + if (swapdebug & SDB_SWAPIN) + printf("swapin: pid %d(%s)@%x, pri %d free %d\n", + p->p_pid, p->p_comm, p->p_addr, + ppri, cnt.v_free_count); +#endif + vm_map_pageable(kernel_map, addr, addr+size, FALSE); + /* + * Some architectures need to be notified when the + * user area has moved to new physical page(s) (e.g. + * see pmax/pmax/vm_machdep.c). + */ + cpu_swapin(p); + (void) splstatclock(); + if (p->p_stat == SRUN) + setrunqueue(p); + p->p_flag |= P_INMEM; + (void) spl0(); + p->p_swtime = 0; + goto loop; + } + /* + * Not enough memory, jab the pageout daemon and wait until the + * coast is clear.
+ */ +#ifdef DEBUG + if (swapdebug & SDB_FOLLOW) + printf("sched: no room for pid %d(%s), free %d\n", + p->p_pid, p->p_comm, cnt.v_free_count); +#endif + (void) splhigh(); + VM_WAIT; + (void) spl0(); +#ifdef DEBUG + if (swapdebug & SDB_FOLLOW) + printf("sched: room again, free %d\n", cnt.v_free_count); +#endif + goto loop; +} +/* swappable(p): true when P_INMEM is set in p_flag and none of P_SYSTEM, P_NOSWAP, P_WEXIT, P_PHYSIO is. */ +#define swappable(p) \ + (((p)->p_flag & \ + (P_SYSTEM | P_INMEM | P_NOSWAP | P_WEXIT | P_PHYSIO)) == P_INMEM) + +/* + * Swapout is driven by the pageout daemon. Very simple, we find eligible + * procs and unwire their u-areas. We try to always "swap" at least one + * process in case we need the room for a swapin. + * If any procs have been sleeping/stopped for at least maxslp seconds, + * they are swapped. Else, we swap the longest-sleeping or stopped process, + * if any, otherwise the longest-resident process (a fallback candidate is only recorded when its p_slptime or p_swtime is greater than 0). + */ +void +swapout_threads() +{ + register struct proc *p; + struct proc *outp, *outp2; + int outpri, outpri2; + int didswap = 0; + extern int maxslp; + +#ifdef DEBUG + if (!enableswap) + return; +#endif + outp = outp2 = NULL; + outpri = outpri2 = 0; + for (p = (struct proc *)allproc; p != NULL; p = p->p_next) { + if (!swappable(p)) + continue; + switch (p->p_stat) { + case SRUN: + if (p->p_swtime > outpri2) { + outp2 = p; + outpri2 = p->p_swtime; + } + continue; + + case SSLEEP: + case SSTOP: + if (p->p_slptime >= maxslp) { + swapout(p); + didswap++; + } else if (p->p_slptime > outpri) { + outp = p; + outpri = p->p_slptime; + } + continue; + } + } + /* + * If we didn't get rid of any real duds, toss out the next most + * likely sleeping/stopped or running candidate. We only do this + * if we are real low on memory since we don't gain much by doing + * it (UPAGES pages).
+ */ + if (didswap == 0 && + cnt.v_free_count <= atop(round_page(ctob(UPAGES)))) { + if ((p = outp) == 0) + p = outp2; +#ifdef DEBUG + if (swapdebug & SDB_SWAPOUT) + printf("swapout_threads: no duds, try procp %x\n", p); +#endif + if (p) + swapout(p); + } +} +/* swapout: except on i386, make p's u-area pageable again and call pmap_collect on its pmap; then clear P_INMEM, call remrq(p) if p is SRUN, and zero p_swtime. */ +void +swapout(p) + register struct proc *p; +{ + vm_offset_t addr; + vm_size_t size; + +#ifdef DEBUG + if (swapdebug & SDB_SWAPOUT) + printf("swapout: pid %d(%s)@%x, stat %x pri %d free %d\n", + p->p_pid, p->p_comm, p->p_addr, p->p_stat, + p->p_slptime, cnt.v_free_count); +#endif + size = round_page(ctob(UPAGES)); + addr = (vm_offset_t) p->p_addr; +#if defined(hp300) || defined(luna68k) + /* + * Ugh! u-area is double mapped to a fixed address behind the + * back of the VM system and accesses are usually through that + * address rather than the per-process address. Hence reference + * and modify information are recorded at the fixed address and + * lost at context switch time. We assume the u-struct and + * kernel stack are always accessed/modified and force it to be so. + */ + { + register int i; + volatile long tmp; + + for (i = 0; i < UPAGES; i++) { + tmp = *(long *)addr; *(long *)addr = tmp; + addr += NBPG; + } + addr = (vm_offset_t) p->p_addr; + } +#endif +#ifdef mips + /* + * Be sure to save the floating point coprocessor state before + * paging out the u-struct.
+ */ + { + extern struct proc *machFPCurProcPtr; + + if (p == machFPCurProcPtr) { + MachSaveCurFPState(p); + machFPCurProcPtr = (struct proc *)0; + } + } +#endif +#ifndef i386 /* temporary measure till we find spontaneous unwire of kstack */ + vm_map_pageable(kernel_map, addr, addr+size, TRUE); + pmap_collect(vm_map_pmap(&p->p_vmspace->vm_map)); +#endif + (void) splhigh(); + p->p_flag &= ~P_INMEM; + if (p->p_stat == SRUN) + remrq(p); + (void) spl0(); + p->p_swtime = 0; +} + +/* + * The rest of these routines fake thread handling + */ +/* assert_wait: record event in curproc->p_thread; ruptible is only touched under lint. */ +void +assert_wait(event, ruptible) + int event; + boolean_t ruptible; +{ +#ifdef lint + ruptible++; +#endif + curproc->p_thread = event; +} +/* thread_block: at splhigh, sleep at PVM on curproc->p_thread if it is nonzero, then restore the previous spl. */ +void +thread_block() +{ + int s = splhigh(); + + if (curproc->p_thread) + sleep((caddr_t)curproc->p_thread, PVM); + splx(s); +} +/* thread_sleep: at splhigh, record event in curproc->p_thread, release lock, and sleep on event at PVM if p_thread is nonzero. */ +void +thread_sleep(event, lock, ruptible) + int event; + simple_lock_t lock; + boolean_t ruptible; +{ +#ifdef lint + ruptible++; +#endif + int s = splhigh(); + /* NOTE(review): when lint is defined, the ruptible++ statement precedes the declaration of s */ + curproc->p_thread = event; + simple_unlock(lock); + if (curproc->p_thread) + sleep((caddr_t)event, PVM); + splx(s); +} +/* thread_wakeup: call wakeup() on event at splhigh, then restore the previous spl. */ +void +thread_wakeup(event) + int event; +{ + int s = splhigh(); + + wakeup((caddr_t)event); + splx(s); +} + +/* + * DEBUG stuff + */ + +int indent = 0; + +#include /* see subr_prf.c */ +/* iprintf: print one tab per full 8 units of the global indent and one space per remaining unit, then hand fmt and the argument list to printf through the %r conversion. */ +/*ARGSUSED2*/ +void +#if __STDC__ +iprintf(const char *fmt, ...) +#else +iprintf(fmt /* , va_alist */) + char *fmt; + /* va_dcl */ +#endif +{ + register int i; + va_list ap; + + for (i = indent; i >= 8; i -= 8) + printf("\t"); + while (--i >= 0) + printf(" "); + va_start(ap, fmt); + printf("%r", fmt, ap); + va_end(ap); +} diff --git a/sys/vm/vm_inherit.h b/sys/vm/vm_inherit.h new file mode 100644 index 00000000000..455f91c9390 --- /dev/null +++ b/sys/vm/vm_inherit.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved.
+ * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_inherit.h 8.1 (Berkeley) 6/11/93 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. 
+ * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Virtual memory map inheritance definitions. + */ + +#ifndef _VM_INHERIT_ +#define _VM_INHERIT_ + +/* + * Enumeration of valid values for vm_inherit_t. + */ + +#define VM_INHERIT_SHARE ((vm_inherit_t) 0) /* share with child */ +#define VM_INHERIT_COPY ((vm_inherit_t) 1) /* copy into child */ +#define VM_INHERIT_NONE ((vm_inherit_t) 2) /* absent from child */ +#define VM_INHERIT_DONATE_COPY ((vm_inherit_t) 3) /* copy and delete */ + +#define VM_INHERIT_DEFAULT VM_INHERIT_COPY + +#endif /* _VM_INHERIT_ */ diff --git a/sys/vm/vm_init.c b/sys/vm/vm_init.c new file mode 100644 index 00000000000..4874f9e707a --- /dev/null +++ b/sys/vm/vm_init.c @@ -0,0 +1,103 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_init.c 8.1 (Berkeley) 6/11/93 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. 
+ * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Initialize the Virtual Memory subsystem. + */ + +#include + +#include +#include +#include + +/* + * vm_init initializes the virtual memory system. + * This is done only by the first cpu up. + * + * The start and end address of physical memory is passed in. + */ + +void vm_mem_init() +{ + extern vm_offset_t avail_start, avail_end; + extern vm_offset_t virtual_avail, virtual_end; + + /* + * Initializes resident memory structures. + * From here on, all physical memory is accounted for, + * and we use only virtual addresses. 
+ */ + vm_set_page_size(); + vm_page_startup(&avail_start, &avail_end); + + /* + * Initialize other VM packages + */ + vm_object_init(virtual_end - VM_MIN_KERNEL_ADDRESS); + vm_map_startup(); + kmem_init(virtual_avail, virtual_end); + pmap_init(avail_start, avail_end); + vm_pager_init(); +} diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c new file mode 100644 index 00000000000..7e4db63abf2 --- /dev/null +++ b/sys/vm/vm_kern.c @@ -0,0 +1,450 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_kern.c 8.3 (Berkeley) 1/12/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Kernel memory management. + */ + +#include +#include + +#include +#include +#include +#include + +/* + * kmem_alloc_pageable: + * + * Allocate pageable memory to the kernel's address map. + * map must be "kernel_map" below. 
+ */ + +vm_offset_t kmem_alloc_pageable(map, size) + vm_map_t map; + register vm_size_t size; +{ + vm_offset_t addr; + register int result; + +#if 0 + if (map != kernel_map) + panic("kmem_alloc_pageable: not called with kernel_map"); +#endif + + size = round_page(size); + + addr = vm_map_min(map); + result = vm_map_find(map, NULL, (vm_offset_t) 0, + &addr, size, TRUE); + if (result != KERN_SUCCESS) { + return(0); + } + + return(addr); +} + +/* + * Allocate wired-down memory in the kernel's address map + * or a submap. + */ +vm_offset_t kmem_alloc(map, size) + register vm_map_t map; + register vm_size_t size; +{ + vm_offset_t addr; + register vm_offset_t offset; + extern vm_object_t kernel_object; + vm_offset_t i; + + size = round_page(size); + + /* + * Use the kernel object for wired-down kernel pages. + * Assume that no region of the kernel object is + * referenced more than once. + */ + + /* + * Locate sufficient space in the map. This will give us the + * final virtual address for the new memory, and thus will tell + * us the offset within the kernel map. + */ + vm_map_lock(map); + if (vm_map_findspace(map, 0, size, &addr)) { + vm_map_unlock(map); + return (0); + } + offset = addr - VM_MIN_KERNEL_ADDRESS; + vm_object_reference(kernel_object); + vm_map_insert(map, kernel_object, offset, addr, addr + size); + vm_map_unlock(map); + + /* + * Guarantee that there are pages already in this object + * before calling vm_map_pageable. This is to prevent the + * following scenario: + * + * 1) Threads have swapped out, so that there is a + * pager for the kernel_object. + * 2) The kmsg zone is empty, and so we are kmem_allocing + * a new page for it. + * 3) vm_map_pageable calls vm_fault; there is no page, + * but there is a pager, so we call + * pager_data_request. But the kmsg zone is empty, + * so we must kmem_alloc. + * 4) goto 1 + * 5) Even if the kmsg zone is not empty: when we get + * the data back from the pager, it will be (very + * stale) non-zero data. 
kmem_alloc is defined to + * return zero-filled memory. + * + * We're intentionally not activating the pages we allocate + * to prevent a race with page-out. vm_map_pageable will wire + * the pages. + */ + + vm_object_lock(kernel_object); + for (i = 0 ; i < size; i+= PAGE_SIZE) { + vm_page_t mem; + + while ((mem = vm_page_alloc(kernel_object, offset+i)) == NULL) { + vm_object_unlock(kernel_object); + VM_WAIT; + vm_object_lock(kernel_object); + } + vm_page_zero_fill(mem); + mem->flags &= ~PG_BUSY; + } + vm_object_unlock(kernel_object); + + /* + * And finally, mark the data as non-pageable. + */ + + (void) vm_map_pageable(map, (vm_offset_t) addr, addr + size, FALSE); + + /* + * Try to coalesce the map + */ + + vm_map_simplify(map, addr); + + return(addr); +} + +/* + * kmem_free: + * + * Release a region of kernel virtual memory allocated + * with kmem_alloc, and return the physical pages + * associated with that region. + */ +void kmem_free(map, addr, size) + vm_map_t map; + register vm_offset_t addr; + vm_size_t size; +{ + (void) vm_map_remove(map, trunc_page(addr), round_page(addr + size)); +} + +/* + * kmem_suballoc: + * + * Allocates a map to manage a subrange + * of the kernel virtual address space. 
+ * + * Arguments are as follows: + * + * parent Map to take range from + * size Size of range to find + * min, max Returned endpoints of map + * pageable Can the region be paged + */ +vm_map_t kmem_suballoc(parent, min, max, size, pageable) + register vm_map_t parent; + vm_offset_t *min, *max; + register vm_size_t size; + boolean_t pageable; +{ + register int ret; + vm_map_t result; + + size = round_page(size); + + *min = (vm_offset_t) vm_map_min(parent); + ret = vm_map_find(parent, NULL, (vm_offset_t) 0, + min, size, TRUE); + if (ret != KERN_SUCCESS) { + printf("kmem_suballoc: bad status return of %d.\n", ret); + panic("kmem_suballoc"); + } + *max = *min + size; + pmap_reference(vm_map_pmap(parent)); + result = vm_map_create(vm_map_pmap(parent), *min, *max, pageable); + if (result == NULL) + panic("kmem_suballoc: cannot create submap"); + if ((ret = vm_map_submap(parent, *min, *max, result)) != KERN_SUCCESS) + panic("kmem_suballoc: unable to change range to submap"); + return(result); +} + +/* + * Allocate wired-down memory in the kernel's address map for the higher + * level kernel memory allocator (kern/kern_malloc.c). We cannot use + * kmem_alloc() because we may need to allocate memory at interrupt + * level where we cannot block (canwait == FALSE). + * + * This routine has its own private kernel submap (kmem_map) and object + * (kmem_object). This, combined with the fact that only malloc uses + * this routine, ensures that we will never block in map or object waits. + * + * Note that this still only works in a uni-processor environment and + * when called at splhigh(). + * + * We don't worry about expanding the map (adding entries) since entries + * for wired maps are statically allocated. 
 *
 * Returns the start address of the new range.  Returns 0 when
 * canwait is FALSE and either no space is found in the map or a page
 * cannot be allocated; panics when canwait is TRUE and no space is found.
 */
vm_offset_t
kmem_malloc(map, size, canwait)
	register vm_map_t	map;
	register vm_size_t	size;
	boolean_t		canwait;
{
	register vm_offset_t	offset, i;
	vm_map_entry_t		entry;
	vm_offset_t		addr;
	vm_page_t		m;
	extern vm_object_t	kmem_object;

	/*
	 * NOTE(review): the panic text names "kern_malloc_alloc" while the
	 * other panics in this function use the "kmem_malloc:" prefix.
	 */
	if (map != kmem_map && map != mb_map)
		panic("kern_malloc_alloc: map != {kmem,mb}_map");

	size = round_page(size);
	addr = vm_map_min(map);

	/*
	 * Locate sufficient space in the map.  This will give us the
	 * final virtual address for the new memory, and thus will tell
	 * us the offset within the kernel map.
	 */
	vm_map_lock(map);
	if (vm_map_findspace(map, 0, size, &addr)) {
		vm_map_unlock(map);
		if (canwait)		/* XXX  should wait */
			panic("kmem_malloc: %s too small",
			    map == kmem_map ? "kmem_map" : "mb_map");
		return (0);
	}
	/*
	 * NOTE(review): the object offset is taken relative to kmem_map
	 * even when map is mb_map -- presumably both maps share
	 * kmem_object's offset space; confirm against where mb_map is
	 * created.
	 */
	offset = addr - vm_map_min(kmem_map);
	vm_object_reference(kmem_object);
	/* NOTE(review): the status returned by vm_map_insert is not examined. */
	vm_map_insert(map, kmem_object, offset, addr, addr + size);

	/*
	 * If we can wait, just mark the range as wired
	 * (will fault pages as necessary).
	 */
	if (canwait) {
		vm_map_unlock(map);
		(void) vm_map_pageable(map, (vm_offset_t) addr, addr + size,
				       FALSE);
		vm_map_simplify(map, addr);
		return(addr);
	}

	/*
	 * If we cannot wait then we must allocate all memory up front,
	 * pulling it off the active queue to prevent pageout.
	 * The map lock taken above is still held from here on.
	 */
	vm_object_lock(kmem_object);
	for (i = 0; i < size; i += PAGE_SIZE) {
		m = vm_page_alloc(kmem_object, offset + i);

		/*
		 * Ran out of space, free everything up and return.
		 * Don't need to lock page queues here as we know
		 * that the pages we got aren't on any queues.
		 */
		if (m == NULL) {
			/* Walk back over the pages already allocated. */
			while (i != 0) {
				i -= PAGE_SIZE;
				m = vm_page_lookup(kmem_object, offset + i);
				vm_page_free(m);
			}
			vm_object_unlock(kmem_object);
			/* Undo the map insertion before dropping the map lock. */
			vm_map_delete(map, addr, addr + size);
			vm_map_unlock(map);
			return(0);
		}
#if 0
		vm_page_zero_fill(m);
#endif
		m->flags &= ~PG_BUSY;
	}
	vm_object_unlock(kmem_object);

	/*
	 * Mark map entry as non-pageable.
	 * Assert: vm_map_insert() will never be able to extend the previous
	 * entry so there will be a new entry exactly corresponding to this
	 * address range and it will have wired_count == 0.
	 */
	if (!vm_map_lookup_entry(map, addr, &entry) ||
	    entry->start != addr || entry->end != addr + size ||
	    entry->wired_count)
		panic("kmem_malloc: entry not found or misaligned");
	entry->wired_count++;

	/*
	 * Loop thru pages, entering them in the pmap.
	 * (We cannot add them to the wired count without
	 * wrapping the vm_page_queue_lock in splimp...)
	 */
	for (i = 0; i < size; i += PAGE_SIZE) {
		vm_object_lock(kmem_object);
		m = vm_page_lookup(kmem_object, offset + i);
		vm_object_unlock(kmem_object);
		pmap_enter(map->pmap, addr + i, VM_PAGE_TO_PHYS(m),
			   VM_PROT_DEFAULT, TRUE);
	}
	vm_map_unlock(map);

	vm_map_simplify(map, addr);
	return(addr);
}

/*
 *	kmem_alloc_wait
 *
 *	Allocates pageable memory from a sub-map of the kernel.  If the submap
 *	has no room, the caller sleeps waiting for more memory in the submap.
 *
 *	Returns the start address of the range, or 0 when the page-rounded
 *	size exceeds vm_map_max(map) - vm_map_min(map), i.e. the request
 *	could never fit.
 */
vm_offset_t kmem_alloc_wait(map, size)
	vm_map_t	map;
	vm_size_t	size;
{
	vm_offset_t	addr;

	size = round_page(size);

	for (;;) {
		/*
		 * To make this work for more than one map,
		 * use the map's lock to lock out sleepers/wakers.
		 */
		vm_map_lock(map);
		if (vm_map_findspace(map, 0, size, &addr) == 0)
			break;		/* leaves the loop with the map locked */
		/* no space now; see if we can ever get space */
		if (vm_map_max(map) - vm_map_min(map) < size) {
			vm_map_unlock(map);
			return (0);
		}
		/*
		 * Register the wait on the map address before unlocking;
		 * kmem_free_wakeup() issues thread_wakeup() on the same value.
		 */
		assert_wait((int)map, TRUE);
		vm_map_unlock(map);
		thread_block();
	}
	/* NOTE(review): the status returned by vm_map_insert is not examined. */
	vm_map_insert(map, NULL, (vm_offset_t)0, addr, addr + size);
	vm_map_unlock(map);
	return (addr);
}

/*
 *	kmem_free_wakeup
 *
 *	Returns memory to a submap of the kernel, and wakes up any threads
 *	waiting for memory in that map.  The deleted range is widened to
 *	page boundaries, and the wakeup is issued while the map lock is
 *	still held.
 */
void kmem_free_wakeup(map, addr, size)
	vm_map_t	map;
	vm_offset_t	addr;
	vm_size_t	size;
{
	vm_map_lock(map);
	(void) vm_map_delete(map, trunc_page(addr), round_page(addr + size));
	thread_wakeup((int)map);
	vm_map_unlock(map);
}

/*
 * Create the kernel map; insert a mapping covering kernel text, data, bss,
 * and all space allocated thus far (`bootstrap' data).  The new map will thus
 * map the range between VM_MIN_KERNEL_ADDRESS and `start' as allocated, and
 * the range between `start' and `end' as free.
 */
void kmem_init(start, end)
	vm_offset_t start, end;
{
	register vm_map_t m;

	m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS, end, FALSE);
	vm_map_lock(m);
	/* N.B.: cannot use kgdb to debug, starting with this assignment ... */
	kernel_map = m;
	(void) vm_map_insert(m, NULL, (vm_offset_t)0,
	    VM_MIN_KERNEL_ADDRESS, start);
	/* ... and ending with the completion of the above `insert' */
	vm_map_unlock(m);
}
diff --git a/sys/vm/vm_kern.h b/sys/vm/vm_kern.h new file mode 100644 index 00000000000..d0d2c358af0 --- /dev/null +++ b/sys/vm/vm_kern.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_kern.h 8.1 (Berkeley) 6/11/93 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. 
+ * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* Kernel memory management definitions. */ + +vm_map_t buffer_map; +vm_map_t exec_map; +vm_map_t kernel_map; +vm_map_t kmem_map; +vm_map_t mb_map; +vm_map_t phys_map; diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c new file mode 100644 index 00000000000..425fe0de432 --- /dev/null +++ b/sys/vm/vm_map.c @@ -0,0 +1,2626 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_map.c 8.3 (Berkeley) 1/12/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. 
+ * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Virtual memory mapping module. + */ + +#include +#include +#include + +#include +#include +#include + +/* + * Virtual memory maps provide for the mapping, protection, + * and sharing of virtual memory objects. In addition, + * this module provides for an efficient virtual copy of + * memory from one map to another. + * + * Synchronization is required prior to most operations. + * + * Maps consist of an ordered doubly-linked list of simple + * entries; a single hint is used to speed up lookups. + * + * In order to properly represent the sharing of virtual + * memory regions among maps, the map structure is bi-level. + * Top-level ("address") maps refer to regions of sharable + * virtual memory. These regions are implemented as + * ("sharing") maps, which then refer to the actual virtual + * memory objects. When two address maps "share" memory, + * their top-level maps both have references to the same + * sharing map. When memory is virtual-copied from one + * address map to another, the references in the sharing + * maps are actually copied -- no copying occurs at the + * virtual memory object level. + * + * Since portions of maps are specified by start/end addreses, + * which may not align with existing map entries, all + * routines merely "clip" entries to these start/end values. + * [That is, an entry is split into two, bordering at a + * start or end value.] 
Note that these clippings may not + * always be necessary (as the two resulting entries are then + * not changed); however, the clipping is done for convenience. + * No attempt is currently made to "glue back together" two + * abutting entries. + * + * As mentioned above, virtual copy operations are performed + * by copying VM object references from one sharing map to + * another, and then marking both regions as copy-on-write. + * It is important to note that only one writeable reference + * to a VM object region exists in any map -- this means that + * shadow object creation can be delayed until a write operation + * occurs. + */ + +/* + * vm_map_startup: + * + * Initialize the vm_map module. Must be called before + * any other vm_map routines. + * + * Map and entry structures are allocated from the general + * purpose memory pool with some exceptions: + * + * - The kernel map and kmem submap are allocated statically. + * - Kernel map entries are allocated out of a static pool. + * + * These restrictions are necessary since malloc() uses the + * maps and requires map entries. + */ + +vm_offset_t kentry_data; +vm_size_t kentry_data_size; +vm_map_entry_t kentry_free; +vm_map_t kmap_free; + +static void _vm_map_clip_end __P((vm_map_t, vm_map_entry_t, vm_offset_t)); +static void _vm_map_clip_start __P((vm_map_t, vm_map_entry_t, vm_offset_t)); + +void vm_map_startup() +{ + register int i; + register vm_map_entry_t mep; + vm_map_t mp; + + /* + * Static map structures for allocation before initialization of + * kernel map or kmem map. vm_map_create knows how to deal with them. + */ + kmap_free = mp = (vm_map_t) kentry_data; + i = MAX_KMAP; + while (--i > 0) { + mp->header.next = (vm_map_entry_t) (mp + 1); + mp++; + } + mp++->header.next = NULL; + + /* + * Form a free list of statically allocated kernel map entries + * with the rest. 
+ */ + kentry_free = mep = (vm_map_entry_t) mp; + i = (kentry_data_size - MAX_KMAP * sizeof *mp) / sizeof *mep; + while (--i > 0) { + mep->next = mep + 1; + mep++; + } + mep->next = NULL; +} + +/* + * Allocate a vmspace structure, including a vm_map and pmap, + * and initialize those structures. The refcnt is set to 1. + * The remaining fields must be initialized by the caller. + */ +struct vmspace * +vmspace_alloc(min, max, pageable) + vm_offset_t min, max; + int pageable; +{ + register struct vmspace *vm; + + MALLOC(vm, struct vmspace *, sizeof(struct vmspace), M_VMMAP, M_WAITOK); + bzero(vm, (caddr_t) &vm->vm_startcopy - (caddr_t) vm); + vm_map_init(&vm->vm_map, min, max, pageable); + pmap_pinit(&vm->vm_pmap); + vm->vm_map.pmap = &vm->vm_pmap; /* XXX */ + vm->vm_refcnt = 1; + return (vm); +} + +void +vmspace_free(vm) + register struct vmspace *vm; +{ + + if (--vm->vm_refcnt == 0) { + /* + * Lock the map, to wait out all other references to it. + * Delete all of the mappings and pages they hold, + * then call the pmap module to reclaim anything left. + */ + vm_map_lock(&vm->vm_map); + (void) vm_map_delete(&vm->vm_map, vm->vm_map.min_offset, + vm->vm_map.max_offset); + pmap_release(&vm->vm_pmap); + FREE(vm, M_VMMAP); + } +} + +/* + * vm_map_create: + * + * Creates and returns a new empty VM map with + * the given physical map structure, and having + * the given lower and upper address bounds. 
+ */ +vm_map_t vm_map_create(pmap, min, max, pageable) + pmap_t pmap; + vm_offset_t min, max; + boolean_t pageable; +{ + register vm_map_t result; + extern vm_map_t kmem_map; + + if (kmem_map == NULL) { + result = kmap_free; + kmap_free = (vm_map_t) result->header.next; + if (result == NULL) + panic("vm_map_create: out of maps"); + } else + MALLOC(result, vm_map_t, sizeof(struct vm_map), + M_VMMAP, M_WAITOK); + + vm_map_init(result, min, max, pageable); + result->pmap = pmap; + return(result); +} + +/* + * Initialize an existing vm_map structure + * such as that in the vmspace structure. + * The pmap is set elsewhere. + */ +void +vm_map_init(map, min, max, pageable) + register struct vm_map *map; + vm_offset_t min, max; + boolean_t pageable; +{ + map->header.next = map->header.prev = &map->header; + map->nentries = 0; + map->size = 0; + map->ref_count = 1; + map->is_main_map = TRUE; + map->min_offset = min; + map->max_offset = max; + map->entries_pageable = pageable; + map->first_free = &map->header; + map->hint = &map->header; + map->timestamp = 0; + lock_init(&map->lock, TRUE); + simple_lock_init(&map->ref_lock); + simple_lock_init(&map->hint_lock); +} + +/* + * vm_map_entry_create: [ internal use only ] + * + * Allocates a VM map entry for insertion. + * No entry fields are filled in. 
This routine is + */ +vm_map_entry_t vm_map_entry_create(map) + vm_map_t map; +{ + vm_map_entry_t entry; +#ifdef DEBUG + extern vm_map_t kernel_map, kmem_map, mb_map, pager_map; + boolean_t isspecial; + + isspecial = (map == kernel_map || map == kmem_map || + map == mb_map || map == pager_map); + if (isspecial && map->entries_pageable || + !isspecial && !map->entries_pageable) + panic("vm_map_entry_create: bogus map"); +#endif + if (map->entries_pageable) { + MALLOC(entry, vm_map_entry_t, sizeof(struct vm_map_entry), + M_VMMAPENT, M_WAITOK); + } else { + if (entry = kentry_free) + kentry_free = kentry_free->next; + } + if (entry == NULL) + panic("vm_map_entry_create: out of map entries"); + + return(entry); +} + +/* + * vm_map_entry_dispose: [ internal use only ] + * + * Inverse of vm_map_entry_create. + */ +void vm_map_entry_dispose(map, entry) + vm_map_t map; + vm_map_entry_t entry; +{ +#ifdef DEBUG + extern vm_map_t kernel_map, kmem_map, mb_map, pager_map; + boolean_t isspecial; + + isspecial = (map == kernel_map || map == kmem_map || + map == mb_map || map == pager_map); + if (isspecial && map->entries_pageable || + !isspecial && !map->entries_pageable) + panic("vm_map_entry_dispose: bogus map"); +#endif + if (map->entries_pageable) { + FREE(entry, M_VMMAPENT); + } else { + entry->next = kentry_free; + kentry_free = entry; + } +} + +/* + * vm_map_entry_{un,}link: + * + * Insert/remove entries from maps. + */ +#define vm_map_entry_link(map, after_where, entry) \ + { \ + (map)->nentries++; \ + (entry)->prev = (after_where); \ + (entry)->next = (after_where)->next; \ + (entry)->prev->next = (entry); \ + (entry)->next->prev = (entry); \ + } +#define vm_map_entry_unlink(map, entry) \ + { \ + (map)->nentries--; \ + (entry)->next->prev = (entry)->prev; \ + (entry)->prev->next = (entry)->next; \ + } + +/* + * vm_map_reference: + * + * Creates another valid reference to the given map. 
+ * + */ +void vm_map_reference(map) + register vm_map_t map; +{ + if (map == NULL) + return; + + simple_lock(&map->ref_lock); + map->ref_count++; + simple_unlock(&map->ref_lock); +} + +/* + * vm_map_deallocate: + * + * Removes a reference from the specified map, + * destroying it if no references remain. + * The map should not be locked. + */ +void vm_map_deallocate(map) + register vm_map_t map; +{ + register int c; + + if (map == NULL) + return; + + simple_lock(&map->ref_lock); + c = --map->ref_count; + simple_unlock(&map->ref_lock); + + if (c > 0) { + return; + } + + /* + * Lock the map, to wait out all other references + * to it. + */ + + vm_map_lock(map); + + (void) vm_map_delete(map, map->min_offset, map->max_offset); + + pmap_destroy(map->pmap); + + FREE(map, M_VMMAP); +} + +/* + * vm_map_insert: + * + * Inserts the given whole VM object into the target + * map at the specified address range. The object's + * size should match that of the address range. + * + * Requires that the map be locked, and leaves it so. + */ +int +vm_map_insert(map, object, offset, start, end) + vm_map_t map; + vm_object_t object; + vm_offset_t offset; + vm_offset_t start; + vm_offset_t end; +{ + register vm_map_entry_t new_entry; + register vm_map_entry_t prev_entry; + vm_map_entry_t temp_entry; + + /* + * Check that the start and end points are not bogus. + */ + + if ((start < map->min_offset) || (end > map->max_offset) || + (start >= end)) + return(KERN_INVALID_ADDRESS); + + /* + * Find the entry prior to the proposed + * starting address; if it's part of an + * existing entry, this range is bogus. + */ + + if (vm_map_lookup_entry(map, start, &temp_entry)) + return(KERN_NO_SPACE); + + prev_entry = temp_entry; + + /* + * Assert that the next entry doesn't overlap the + * end point. + */ + + if ((prev_entry->next != &map->header) && + (prev_entry->next->start < end)) + return(KERN_NO_SPACE); + + /* + * See if we can avoid creating a new entry by + * extending one of our neighbors. 
+ */ + + if (object == NULL) { + if ((prev_entry != &map->header) && + (prev_entry->end == start) && + (map->is_main_map) && + (prev_entry->is_a_map == FALSE) && + (prev_entry->is_sub_map == FALSE) && + (prev_entry->inheritance == VM_INHERIT_DEFAULT) && + (prev_entry->protection == VM_PROT_DEFAULT) && + (prev_entry->max_protection == VM_PROT_DEFAULT) && + (prev_entry->wired_count == 0)) { + + if (vm_object_coalesce(prev_entry->object.vm_object, + NULL, + prev_entry->offset, + (vm_offset_t) 0, + (vm_size_t)(prev_entry->end + - prev_entry->start), + (vm_size_t)(end - prev_entry->end))) { + /* + * Coalesced the two objects - can extend + * the previous map entry to include the + * new range. + */ + map->size += (end - prev_entry->end); + prev_entry->end = end; + return(KERN_SUCCESS); + } + } + } + + /* + * Create a new entry + */ + + new_entry = vm_map_entry_create(map); + new_entry->start = start; + new_entry->end = end; + + new_entry->is_a_map = FALSE; + new_entry->is_sub_map = FALSE; + new_entry->object.vm_object = object; + new_entry->offset = offset; + + new_entry->copy_on_write = FALSE; + new_entry->needs_copy = FALSE; + + if (map->is_main_map) { + new_entry->inheritance = VM_INHERIT_DEFAULT; + new_entry->protection = VM_PROT_DEFAULT; + new_entry->max_protection = VM_PROT_DEFAULT; + new_entry->wired_count = 0; + } + + /* + * Insert the new entry into the list + */ + + vm_map_entry_link(map, prev_entry, new_entry); + map->size += new_entry->end - new_entry->start; + + /* + * Update the free space hint + */ + + if ((map->first_free == prev_entry) && (prev_entry->end >= new_entry->start)) + map->first_free = new_entry; + + return(KERN_SUCCESS); +} + +/* + * SAVE_HINT: + * + * Saves the specified entry as the hint for + * future lookups. Performs necessary interlocks. 
+ */ +#define SAVE_HINT(map,value) \ + simple_lock(&(map)->hint_lock); \ + (map)->hint = (value); \ + simple_unlock(&(map)->hint_lock); + +/* + * vm_map_lookup_entry: [ internal use only ] + * + * Finds the map entry containing (or + * immediately preceding) the specified address + * in the given map; the entry is returned + * in the "entry" parameter. The boolean + * result indicates whether the address is + * actually contained in the map. + */ +boolean_t vm_map_lookup_entry(map, address, entry) + register vm_map_t map; + register vm_offset_t address; + vm_map_entry_t *entry; /* OUT */ +{ + register vm_map_entry_t cur; + register vm_map_entry_t last; + + /* + * Start looking either from the head of the + * list, or from the hint. + */ + + simple_lock(&map->hint_lock); + cur = map->hint; + simple_unlock(&map->hint_lock); + + if (cur == &map->header) + cur = cur->next; + + if (address >= cur->start) { + /* + * Go from hint to end of list. + * + * But first, make a quick check to see if + * we are already looking at the entry we + * want (which is usually the case). + * Note also that we don't need to save the hint + * here... it is the same hint (unless we are + * at the header, in which case the hint didn't + * buy us anything anyway). + */ + last = &map->header; + if ((cur != last) && (cur->end > address)) { + *entry = cur; + return(TRUE); + } + } + else { + /* + * Go from start to hint, *inclusively* + */ + last = cur->next; + cur = map->header.next; + } + + /* + * Search linearly + */ + + while (cur != last) { + if (cur->end > address) { + if (address >= cur->start) { + /* + * Save this lookup for future + * hints, and return + */ + + *entry = cur; + SAVE_HINT(map, cur); + return(TRUE); + } + break; + } + cur = cur->next; + } + *entry = cur->prev; + SAVE_HINT(map, *entry); + return(FALSE); +} + +/* + * Find sufficient space for `length' bytes in the given map, starting at + * `start'. The map must be locked. Returns 0 on success, 1 on no space. 
+ */ +int +vm_map_findspace(map, start, length, addr) + register vm_map_t map; + register vm_offset_t start; + vm_size_t length; + vm_offset_t *addr; +{ + register vm_map_entry_t entry, next; + register vm_offset_t end; + + if (start < map->min_offset) + start = map->min_offset; + if (start > map->max_offset) + return (1); + + /* + * Look for the first possible address; if there's already + * something at this address, we have to start after it. + */ + if (start == map->min_offset) { + if ((entry = map->first_free) != &map->header) + start = entry->end; + } else { + vm_map_entry_t tmp; + if (vm_map_lookup_entry(map, start, &tmp)) + start = tmp->end; + entry = tmp; + } + + /* + * Look through the rest of the map, trying to fit a new region in + * the gap between existing regions, or after the very last region. + */ + for (;; start = (entry = next)->end) { + /* + * Find the end of the proposed new region. Be sure we didn't + * go beyond the end of the map, or wrap around the address; + * if so, we lose. Otherwise, if this is the last entry, or + * if the proposed new region fits before the next entry, we + * win. + */ + end = start + length; + if (end > map->max_offset || end < start) + return (1); + next = entry->next; + if (next == &map->header || next->start >= end) + break; + } + SAVE_HINT(map, entry); + *addr = start; + return (0); +} + +/* + * vm_map_find finds an unallocated region in the target address + * map with the given length. The search is defined to be + * first-fit from the specified address; the region found is + * returned in the same parameter. 
+ * + */ +int +vm_map_find(map, object, offset, addr, length, find_space) + vm_map_t map; + vm_object_t object; + vm_offset_t offset; + vm_offset_t *addr; /* IN/OUT */ + vm_size_t length; + boolean_t find_space; +{ + register vm_offset_t start; + int result; + + start = *addr; + vm_map_lock(map); + if (find_space) { + if (vm_map_findspace(map, start, length, addr)) { + vm_map_unlock(map); + return (KERN_NO_SPACE); + } + start = *addr; + } + result = vm_map_insert(map, object, offset, start, start + length); + vm_map_unlock(map); + return (result); +} + +/* + * vm_map_simplify_entry: [ internal use only ] + * + * Simplify the given map entry by: + * removing extra sharing maps + * [XXX maybe later] merging with a neighbor + */ +void vm_map_simplify_entry(map, entry) + vm_map_t map; + vm_map_entry_t entry; +{ +#ifdef lint + map++; +#endif + + /* + * If this entry corresponds to a sharing map, then + * see if we can remove the level of indirection. + * If it's not a sharing map, then it points to + * a VM object, so see if we can merge with either + * of our neighbors. + */ + + if (entry->is_sub_map) + return; + if (entry->is_a_map) { +#if 0 + vm_map_t my_share_map; + int count; + + my_share_map = entry->object.share_map; + simple_lock(&my_share_map->ref_lock); + count = my_share_map->ref_count; + simple_unlock(&my_share_map->ref_lock); + + if (count == 1) { + /* Can move the region from + * entry->start to entry->end (+ entry->offset) + * in my_share_map into place of entry. + * Later. + */ + } +#endif + } + else { + /* + * Try to merge with our neighbors. + * + * Conditions for merge are: + * + * 1. entries are adjacent. + * 2. both entries point to objects + * with null pagers. + * + * If a merge is possible, we replace the two + * entries with a single entry, then merge + * the two objects into a single object. + * + * Now, all that is left to do is write the + * code! 
+ */ + } +} + +/* + * vm_map_clip_start: [ internal use only ] + * + * Asserts that the given entry begins at or after + * the specified address; if necessary, + * it splits the entry into two. + */ +#define vm_map_clip_start(map, entry, startaddr) \ +{ \ + if (startaddr > entry->start) \ + _vm_map_clip_start(map, entry, startaddr); \ +} + +/* + * This routine is called only when it is known that + * the entry must be split. + */ +static void _vm_map_clip_start(map, entry, start) + register vm_map_t map; + register vm_map_entry_t entry; + register vm_offset_t start; +{ + register vm_map_entry_t new_entry; + + /* + * See if we can simplify this entry first + */ + + vm_map_simplify_entry(map, entry); + + /* + * Split off the front portion -- + * note that we must insert the new + * entry BEFORE this one, so that + * this entry has the specified starting + * address. + */ + + new_entry = vm_map_entry_create(map); + *new_entry = *entry; + + new_entry->end = start; + entry->offset += (start - entry->start); + entry->start = start; + + vm_map_entry_link(map, entry->prev, new_entry); + + if (entry->is_a_map || entry->is_sub_map) + vm_map_reference(new_entry->object.share_map); + else + vm_object_reference(new_entry->object.vm_object); +} + +/* + * vm_map_clip_end: [ internal use only ] + * + * Asserts that the given entry ends at or before + * the specified address; if necessary, + * it splits the entry into two. + */ + +#define vm_map_clip_end(map, entry, endaddr) \ +{ \ + if (endaddr < entry->end) \ + _vm_map_clip_end(map, entry, endaddr); \ +} + +/* + * This routine is called only when it is known that + * the entry must be split. 
+ */ +static void _vm_map_clip_end(map, entry, end) + register vm_map_t map; + register vm_map_entry_t entry; + register vm_offset_t end; +{ + register vm_map_entry_t new_entry; + + /* + * Create a new entry and insert it + * AFTER the specified entry + */ + + new_entry = vm_map_entry_create(map); + *new_entry = *entry; + + new_entry->start = entry->end = end; + new_entry->offset += (end - entry->start); + + vm_map_entry_link(map, entry, new_entry); + + if (entry->is_a_map || entry->is_sub_map) + vm_map_reference(new_entry->object.share_map); + else + vm_object_reference(new_entry->object.vm_object); +} + +/* + * VM_MAP_RANGE_CHECK: [ internal use only ] + * + * Asserts that the starting and ending region + * addresses fall within the valid range of the map. + */ +#define VM_MAP_RANGE_CHECK(map, start, end) \ + { \ + if (start < vm_map_min(map)) \ + start = vm_map_min(map); \ + if (end > vm_map_max(map)) \ + end = vm_map_max(map); \ + if (start > end) \ + start = end; \ + } + +/* + * vm_map_submap: [ kernel use only ] + * + * Mark the given range as handled by a subordinate map. + * + * This range must have been created with vm_map_find, + * and no other operations may have been performed on this + * range prior to calling vm_map_submap. + * + * Only a limited number of operations can be performed + * within this rage after calling vm_map_submap: + * vm_fault + * [Don't try vm_map_copy!] + * + * To remove a submapping, one must first remove the + * range from the superior map, and then destroy the + * submap (if desired). [Better yet, don't try it.] 
+ */ +int +vm_map_submap(map, start, end, submap) + register vm_map_t map; + register vm_offset_t start; + register vm_offset_t end; + vm_map_t submap; +{ + vm_map_entry_t entry; + register int result = KERN_INVALID_ARGUMENT; + + vm_map_lock(map); + + VM_MAP_RANGE_CHECK(map, start, end); + + if (vm_map_lookup_entry(map, start, &entry)) { + vm_map_clip_start(map, entry, start); + } + else + entry = entry->next; + + vm_map_clip_end(map, entry, end); + + if ((entry->start == start) && (entry->end == end) && + (!entry->is_a_map) && + (entry->object.vm_object == NULL) && + (!entry->copy_on_write)) { + entry->is_a_map = FALSE; + entry->is_sub_map = TRUE; + vm_map_reference(entry->object.sub_map = submap); + result = KERN_SUCCESS; + } + vm_map_unlock(map); + + return(result); +} + +/* + * vm_map_protect: + * + * Sets the protection of the specified address + * region in the target map. If "set_max" is + * specified, the maximum protection is to be set; + * otherwise, only the current protection is affected. + */ +int +vm_map_protect(map, start, end, new_prot, set_max) + register vm_map_t map; + register vm_offset_t start; + register vm_offset_t end; + register vm_prot_t new_prot; + register boolean_t set_max; +{ + register vm_map_entry_t current; + vm_map_entry_t entry; + + vm_map_lock(map); + + VM_MAP_RANGE_CHECK(map, start, end); + + if (vm_map_lookup_entry(map, start, &entry)) { + vm_map_clip_start(map, entry, start); + } + else + entry = entry->next; + + /* + * Make a first pass to check for protection + * violations. + */ + + current = entry; + while ((current != &map->header) && (current->start < end)) { + if (current->is_sub_map) + return(KERN_INVALID_ARGUMENT); + if ((new_prot & current->max_protection) != new_prot) { + vm_map_unlock(map); + return(KERN_PROTECTION_FAILURE); + } + + current = current->next; + } + + /* + * Go back and fix up protections. + * [Note that clipping is not necessary the second time.] 
+ */ + + current = entry; + + while ((current != &map->header) && (current->start < end)) { + vm_prot_t old_prot; + + vm_map_clip_end(map, current, end); + + old_prot = current->protection; + if (set_max) + current->protection = + (current->max_protection = new_prot) & + old_prot; + else + current->protection = new_prot; + + /* + * Update physical map if necessary. + * Worry about copy-on-write here -- CHECK THIS XXX + */ + + if (current->protection != old_prot) { + +#define MASK(entry) ((entry)->copy_on_write ? ~VM_PROT_WRITE : \ + VM_PROT_ALL) +#define max(a,b) ((a) > (b) ? (a) : (b)) + + if (current->is_a_map) { + vm_map_entry_t share_entry; + vm_offset_t share_end; + + vm_map_lock(current->object.share_map); + (void) vm_map_lookup_entry( + current->object.share_map, + current->offset, + &share_entry); + share_end = current->offset + + (current->end - current->start); + while ((share_entry != + ¤t->object.share_map->header) && + (share_entry->start < share_end)) { + + pmap_protect(map->pmap, + (max(share_entry->start, + current->offset) - + current->offset + + current->start), + min(share_entry->end, + share_end) - + current->offset + + current->start, + current->protection & + MASK(share_entry)); + + share_entry = share_entry->next; + } + vm_map_unlock(current->object.share_map); + } + else + pmap_protect(map->pmap, current->start, + current->end, + current->protection & MASK(entry)); +#undef max +#undef MASK + } + current = current->next; + } + + vm_map_unlock(map); + return(KERN_SUCCESS); +} + +/* + * vm_map_inherit: + * + * Sets the inheritance of the specified address + * range in the target map. Inheritance + * affects how the map will be shared with + * child maps at the time of vm_map_fork. 
 *
 *	Returns KERN_INVALID_ARGUMENT for an unknown inheritance
 *	value, KERN_SUCCESS otherwise.  Entries straddling the
 *	ends of the range are clipped so that only the requested
 *	range is affected.
 */
int
vm_map_inherit(map, start, end, new_inheritance)
	register vm_map_t	map;
	register vm_offset_t	start;
	register vm_offset_t	end;
	register vm_inherit_t	new_inheritance;
{
	register vm_map_entry_t	entry;
	vm_map_entry_t	temp_entry;

	/* Validate the argument before taking the map lock. */
	switch (new_inheritance) {
	case VM_INHERIT_NONE:
	case VM_INHERIT_COPY:
	case VM_INHERIT_SHARE:
		break;
	default:
		return(KERN_INVALID_ARGUMENT);
	}

	vm_map_lock(map);

	VM_MAP_RANGE_CHECK(map, start, end);

	if (vm_map_lookup_entry(map, start, &temp_entry)) {
		entry = temp_entry;
		vm_map_clip_start(map, entry, start);
	}
	else
		entry = temp_entry->next;

	while ((entry != &map->header) && (entry->start < end)) {
		vm_map_clip_end(map, entry, end);

		entry->inheritance = new_inheritance;

		entry = entry->next;
	}

	vm_map_unlock(map);
	return(KERN_SUCCESS);
}

/*
 *	vm_map_pageable:
 *
 *	Sets the pageability of the specified address
 *	range in the target map.  Regions specified
 *	as not pageable require locked-down physical
 *	memory and physical page maps.
 *
 *	new_pageable TRUE unwires the range, FALSE wires it.
 *
 *	Returns KERN_INVALID_ADDRESS if "start" is not mapped,
 *	KERN_INVALID_ARGUMENT if the range contains a hole (or,
 *	when unwiring, an entry that is not wired), the error
 *	from vm_fault_wire if wiring fails, and KERN_SUCCESS
 *	otherwise.
 *
 *	The map must not be locked, but a reference
 *	must remain to the map throughout the call.
 */
int
vm_map_pageable(map, start, end, new_pageable)
	register vm_map_t	map;
	register vm_offset_t	start;
	register vm_offset_t	end;
	register boolean_t	new_pageable;
{
	register vm_map_entry_t	entry;
	vm_map_entry_t		start_entry;
	register vm_offset_t	failed;	/* set only when vm_fault_wire fails */
	int			rv;

	vm_map_lock(map);

	VM_MAP_RANGE_CHECK(map, start, end);

	/*
	 *	Only one pageability change may take place at one
	 *	time, since vm_fault assumes it will be called
	 *	only once for each wiring/unwiring.  Therefore, we
	 *	have to make sure we're actually changing the pageability
	 *	for the entire region.  We do so before making any changes.
	 */

	if (vm_map_lookup_entry(map, start, &start_entry) == FALSE) {
		vm_map_unlock(map);
		return(KERN_INVALID_ADDRESS);
	}
	entry = start_entry;

	/*
	 *	Actions are rather different for wiring and unwiring,
	 *	so we have two separate cases.
	 */

	if (new_pageable) {

		vm_map_clip_start(map, entry, start);

		/*
		 *	Unwiring.  First ensure that the range to be
		 *	unwired is really wired down and that there
		 *	are no holes.
		 */
		while ((entry != &map->header) && (entry->start < end)) {

		    if (entry->wired_count == 0 ||
			(entry->end < end &&
			 (entry->next == &map->header ||
			  entry->next->start > entry->end))) {
			vm_map_unlock(map);
			return(KERN_INVALID_ARGUMENT);
		    }
		    entry = entry->next;
		}

		/*
		 *	Now decrement the wiring count for each region.
		 *	If a region becomes completely unwired,
		 *	unwire its physical pages and mappings.
		 */
		lock_set_recursive(&map->lock);

		entry = start_entry;
		while ((entry != &map->header) && (entry->start < end)) {
		    vm_map_clip_end(map, entry, end);

		    entry->wired_count--;
		    if (entry->wired_count == 0)
			vm_fault_unwire(map, entry->start, entry->end);

		    entry = entry->next;
		}
		lock_clear_recursive(&map->lock);
	}

	else {
		/*
		 *	Wiring.  We must do this in two passes:
		 *
		 *	1.  Holding the write lock, we create any shadow
		 *	    or zero-fill objects that need to be created.
		 *	    Then we clip each map entry to the region to be
		 *	    wired and increment its wiring count.  We
		 *	    create objects before clipping the map entries
		 *	    to avoid object proliferation.
		 *
		 *	2.  We downgrade to a read lock, and call
		 *	    vm_fault_wire to fault in the pages for any
		 *	    newly wired area (wired_count is 1).
		 *
		 *	Downgrading to a read lock for vm_fault_wire avoids
		 *	a possible deadlock with another thread that may have
		 *	faulted on one of the pages to be wired (it would mark
		 *	the page busy, blocking us, then in turn block on the
		 *	map lock that we hold).  Because of problems in the
		 *	recursive lock package, we cannot upgrade to a write
		 *	lock in vm_map_lookup.  Thus, any actions that require
		 *	the write lock must be done beforehand.  Because we
		 *	keep the read lock on the map, the copy-on-write status
		 *	of the entries we modify here cannot change.
		 */

		/*
		 *	Pass 1.
		 */
		while ((entry != &map->header) && (entry->start < end)) {
		    if (entry->wired_count == 0) {

			/*
			 *	Perform actions of vm_map_lookup that need
			 *	the write lock on the map: create a shadow
			 *	object for a copy-on-write region, or an
			 *	object for a zero-fill region.
			 *
			 *	We don't have to do this for entries that
			 *	point to sharing maps, because we won't hold
			 *	the lock on the sharing map.
			 */
			if (!entry->is_a_map) {
			    if (entry->needs_copy &&
				((entry->protection & VM_PROT_WRITE) != 0)) {

				vm_object_shadow(&entry->object.vm_object,
						&entry->offset,
						(vm_size_t)(entry->end
							- entry->start));
				entry->needs_copy = FALSE;
			    }
			    else if (entry->object.vm_object == NULL) {
				entry->object.vm_object =
				    vm_object_allocate((vm_size_t)(entry->end
				    			- entry->start));
				entry->offset = (vm_offset_t)0;
			    }
			}
		    }
		    vm_map_clip_start(map, entry, start);
		    vm_map_clip_end(map, entry, end);
		    entry->wired_count++;

		    /*
		     * Check for holes
		     */
		    if (entry->end < end &&
			(entry->next == &map->header ||
			 entry->next->start > entry->end)) {
			/*
			 *	Found one.  Object creation actions
			 *	do not need to be undone, but the
			 *	wired counts need to be restored.
			 *	Walk backwards over every entry of the
			 *	range whose count was incremented above.
			 */
			while (entry != &map->header && entry->end > start) {
			    entry->wired_count--;
			    entry = entry->prev;
			}
			vm_map_unlock(map);
			return(KERN_INVALID_ARGUMENT);
		    }
		    entry = entry->next;
		}

		/*
		 *	Pass 2.
		 */

		/*
		 * HACK HACK HACK HACK
		 *
		 * If we are wiring in the kernel map or a submap of it,
		 * unlock the map to avoid deadlocks.  We trust that the
		 * kernel threads are well-behaved, and therefore will
		 * not do anything destructive to this region of the map
		 * while we have it unlocked.  We cannot trust user threads
		 * to do the same.
		 *
		 * HACK HACK HACK HACK
		 */
		if (vm_map_pmap(map) == kernel_pmap) {
		    vm_map_unlock(map);		/* trust me ... */
		}
		else {
		    lock_set_recursive(&map->lock);
		    lock_write_to_read(&map->lock);
		}

		rv = 0;
		entry = start_entry;
		while (entry != &map->header && entry->start < end) {
		    /*
		     * If vm_fault_wire fails for any page we need to
		     * undo what has been done.  We decrement the wiring
		     * count for those pages which have not yet been
		     * wired (now) and unwire those that have (later).
		     *
		     * XXX this violates the locking protocol on the map,
		     * needs to be fixed.
		     */
		    if (rv)
			entry->wired_count--;
		    else if (entry->wired_count == 1) {
			rv = vm_fault_wire(map, entry->start, entry->end);
			if (rv) {
			    /* Remember where wiring stopped for the undo. */
			    failed = entry->start;
			    entry->wired_count--;
			}
		    }
		    entry = entry->next;
		}

		/* Get back to holding the map lock the way pass 1 left it. */
		if (vm_map_pmap(map) == kernel_pmap) {
		    vm_map_lock(map);
		}
		else {
		    lock_clear_recursive(&map->lock);
		}
		if (rv) {
		    /*
		     * Unwire the part [start, failed) that was wired
		     * successfully; the recursive call takes the map
		     * lock itself, so drop it first.
		     */
		    vm_map_unlock(map);
		    (void) vm_map_pageable(map, start, failed, TRUE);
		    return(rv);
		}
	}

	vm_map_unlock(map);

	return(KERN_SUCCESS);
}

/*
 * vm_map_clean
 *
 * Push any dirty cached pages in the address range to their pager.
 * If syncio is TRUE, dirty pages are written synchronously.
 * If invalidate is TRUE, any cached pages are freed as well.
 *
 * Returns an error if any part of the specified range is not mapped.
 *
 * Returns KERN_INVALID_ADDRESS if "start" is unmapped or the range
 * has a hole, KERN_INVALID_ARGUMENT if the range touches a submap
 * entry, KERN_FAILURE if vm_object_page_clean reports failure, and
 * KERN_SUCCESS otherwise.  The map is read-locked during the call.
 */
int
vm_map_clean(map, start, end, syncio, invalidate)
	vm_map_t	map;
	vm_offset_t	start;
	vm_offset_t	end;
	boolean_t	syncio;
	boolean_t	invalidate;
{
	register vm_map_entry_t current;
	vm_map_entry_t entry;
	vm_size_t size;
	vm_object_t object;
	vm_offset_t offset;

	vm_map_lock_read(map);
	VM_MAP_RANGE_CHECK(map, start, end);
	if (!vm_map_lookup_entry(map, start, &entry)) {
		vm_map_unlock_read(map);
		return(KERN_INVALID_ADDRESS);
	}

	/*
	 * Make a first pass to check for holes.
	 */
	for (current = entry; current->start < end; current = current->next) {
		if (current->is_sub_map) {
			vm_map_unlock_read(map);
			return(KERN_INVALID_ARGUMENT);
		}
		/* More range remains, so the next entry must abut this one. */
		if (end > current->end &&
		    (current->next == &map->header ||
		     current->end != current->next->start)) {
			vm_map_unlock_read(map);
			return(KERN_INVALID_ADDRESS);
		}
	}

	/*
	 * Make a second pass, cleaning/uncaching pages from the indicated
	 * objects as we go.  "start" is advanced past each piece handled.
	 */
	for (current = entry; current->start < end; current = current->next) {
		/* Object offset and length of the part of this entry in range. */
		offset = current->offset + (start - current->start);
		size = (end <= current->end ? end : current->end) - start;
		if (current->is_a_map) {
			register vm_map_t smap;
			vm_map_entry_t tentry;
			vm_size_t tsize;

			/*
			 * The entry refers to a sharing map: translate the
			 * offset through it to reach the underlying object.
			 *
			 * NOTE(review): size is truncated to the remainder
			 * of the single share-map entry found, yet the loop
			 * still advances to the next top-level entry; it
			 * looks like the rest of this entry's range may be
			 * skipped -- confirm.
			 */
			smap = current->object.share_map;
			vm_map_lock_read(smap);
			(void) vm_map_lookup_entry(smap, offset, &tentry);
			tsize = tentry->end - offset;
			if (tsize < size)
				size = tsize;
			object = tentry->object.vm_object;
			offset = tentry->offset + (offset - tentry->start);
			vm_object_lock(object);
			vm_map_unlock_read(smap);
		} else {
			object = current->object.vm_object;
			vm_object_lock(object);
		}
		/*
		 * Flush pages if writing is allowed.
		 * XXX should we continue on an error?
		 */
		if ((current->protection & VM_PROT_WRITE) &&
		    !vm_object_page_clean(object, offset, offset+size,
					  syncio, FALSE)) {
			vm_object_unlock(object);
			vm_map_unlock_read(map);
			return(KERN_FAILURE);
		}
		if (invalidate)
			vm_object_page_remove(object, offset, offset+size);
		vm_object_unlock(object);
		start += size;
	}

	vm_map_unlock_read(map);
	return(KERN_SUCCESS);
}

/*
 *	vm_map_entry_unwire:	[ internal use only ]
 *
 *	Make the region specified by this entry pageable.
 *	The entry's wired count is reset to zero regardless
 *	of its previous value.
 *
 *	The map in question should be locked.
 *	[This is the reason for this routine's existence.]
 */
void vm_map_entry_unwire(map, entry)
	vm_map_t		map;
	register vm_map_entry_t	entry;
{
	vm_fault_unwire(map, entry->start, entry->end);
	entry->wired_count = 0;
}

/*
 *	vm_map_entry_delete:	[ internal use only ]
 *
 *	Deallocate the given entry from the target map:
 *	unwire it if needed, unlink it, drop its reference
 *	on the backing map or object, and dispose of it.
 */
void vm_map_entry_delete(map, entry)
	register vm_map_t	map;
	register vm_map_entry_t	entry;
{
	if (entry->wired_count != 0)
		vm_map_entry_unwire(map, entry);

	vm_map_entry_unlink(map, entry);
	map->size -= entry->end - entry->start;

	if (entry->is_a_map || entry->is_sub_map)
		vm_map_deallocate(entry->object.share_map);
	else
	 	vm_object_deallocate(entry->object.vm_object);

	vm_map_entry_dispose(map, entry);
}

/*
 *	vm_map_delete:	[ internal use only ]
 *
 *	Deallocates the given address range from the target
 *	map.
 *
 *	When called with a sharing map, removes pages from
 *	that region from all physical maps.
+ */ +int +vm_map_delete(map, start, end) + register vm_map_t map; + vm_offset_t start; + register vm_offset_t end; +{ + register vm_map_entry_t entry; + vm_map_entry_t first_entry; + + /* + * Find the start of the region, and clip it + */ + + if (!vm_map_lookup_entry(map, start, &first_entry)) + entry = first_entry->next; + else { + entry = first_entry; + vm_map_clip_start(map, entry, start); + + /* + * Fix the lookup hint now, rather than each + * time though the loop. + */ + + SAVE_HINT(map, entry->prev); + } + + /* + * Save the free space hint + */ + + if (map->first_free->start >= start) + map->first_free = entry->prev; + + /* + * Step through all entries in this region + */ + + while ((entry != &map->header) && (entry->start < end)) { + vm_map_entry_t next; + register vm_offset_t s, e; + register vm_object_t object; + + vm_map_clip_end(map, entry, end); + + next = entry->next; + s = entry->start; + e = entry->end; + + /* + * Unwire before removing addresses from the pmap; + * otherwise, unwiring will put the entries back in + * the pmap. + */ + + object = entry->object.vm_object; + if (entry->wired_count != 0) + vm_map_entry_unwire(map, entry); + + /* + * If this is a sharing map, we must remove + * *all* references to this data, since we can't + * find all of the physical maps which are sharing + * it. + */ + + if (object == kernel_object || object == kmem_object) + vm_object_page_remove(object, entry->offset, + entry->offset + (e - s)); + else if (!map->is_main_map) + vm_object_pmap_remove(object, + entry->offset, + entry->offset + (e - s)); + else + pmap_remove(map->pmap, s, e); + + /* + * Delete the entry (which may delete the object) + * only after removing all pmap entries pointing + * to its pages. (Otherwise, its page frames may + * be reallocated, and any modify bits will be + * set in the wrong object!) 
+ */ + + vm_map_entry_delete(map, entry); + entry = next; + } + return(KERN_SUCCESS); +} + +/* + * vm_map_remove: + * + * Remove the given address range from the target map. + * This is the exported form of vm_map_delete. + */ +int +vm_map_remove(map, start, end) + register vm_map_t map; + register vm_offset_t start; + register vm_offset_t end; +{ + register int result; + + vm_map_lock(map); + VM_MAP_RANGE_CHECK(map, start, end); + result = vm_map_delete(map, start, end); + vm_map_unlock(map); + + return(result); +} + +/* + * vm_map_check_protection: + * + * Assert that the target map allows the specified + * privilege on the entire address region given. + * The entire region must be allocated. + */ +boolean_t vm_map_check_protection(map, start, end, protection) + register vm_map_t map; + register vm_offset_t start; + register vm_offset_t end; + register vm_prot_t protection; +{ + register vm_map_entry_t entry; + vm_map_entry_t tmp_entry; + + if (!vm_map_lookup_entry(map, start, &tmp_entry)) { + return(FALSE); + } + + entry = tmp_entry; + + while (start < end) { + if (entry == &map->header) { + return(FALSE); + } + + /* + * No holes allowed! + */ + + if (start < entry->start) { + return(FALSE); + } + + /* + * Check protection associated with entry. + */ + + if ((entry->protection & protection) != protection) { + return(FALSE); + } + + /* go to next entry */ + + start = entry->end; + entry = entry->next; + } + return(TRUE); +} + +/* + * vm_map_copy_entry: + * + * Copies the contents of the source entry to the destination + * entry. The entries *must* be aligned properly. 
 *
 *	Nothing is done if either entry is a submap entry.
 *	An unwired source is copied copy-on-write through
 *	vm_object_copy; a wired source is copied through
 *	vm_fault_copy_entry instead.
 */
void vm_map_copy_entry(src_map, dst_map, src_entry, dst_entry)
	vm_map_t		src_map, dst_map;
	register vm_map_entry_t	src_entry, dst_entry;
{
	vm_object_t	temp_object;

	if (src_entry->is_sub_map || dst_entry->is_sub_map)
		return;

	/* Warn (but carry on) when the destination object is not internal. */
	if (dst_entry->object.vm_object != NULL &&
	    (dst_entry->object.vm_object->flags & OBJ_INTERNAL) == 0)
		printf("vm_map_copy_entry: copying over permanent data!\n");

	/*
	 *	If our destination map was wired down,
	 *	unwire it now.
	 */

	if (dst_entry->wired_count != 0)
		vm_map_entry_unwire(dst_map, dst_entry);

	/*
	 *	If we're dealing with a sharing map, we
	 *	must remove the destination pages from
	 *	all maps (since we cannot know which maps
	 *	this sharing map belongs in).
	 */

	if (dst_map->is_main_map)
		pmap_remove(dst_map->pmap, dst_entry->start, dst_entry->end);
	else
		vm_object_pmap_remove(dst_entry->object.vm_object,
			dst_entry->offset,
			dst_entry->offset +
				(dst_entry->end - dst_entry->start));

	if (src_entry->wired_count == 0) {

		boolean_t	src_needs_copy;

		/*
		 *	If the source entry is marked needs_copy,
		 *	it is already write-protected.
		 */
		if (!src_entry->needs_copy) {

			boolean_t	su;

			/*
			 *	If the source entry has only one mapping,
			 *	we can just protect the virtual address
			 *	range.  A main map counts as a single
			 *	mapping; a sharing map does only while it
			 *	has exactly one reference.
			 */
			if (!(su = src_map->is_main_map)) {
				simple_lock(&src_map->ref_lock);
				su = (src_map->ref_count == 1);
				simple_unlock(&src_map->ref_lock);
			}

			if (su) {
				pmap_protect(src_map->pmap,
					src_entry->start,
					src_entry->end,
					src_entry->protection & ~VM_PROT_WRITE);
			}
			else {
				vm_object_pmap_copy(src_entry->object.vm_object,
					src_entry->offset,
					src_entry->offset + (src_entry->end
							    -src_entry->start));
			}
		}

		/*
		 *	Make a copy of the object.  The old destination
		 *	object is remembered so it can be released below.
		 */
		temp_object = dst_entry->object.vm_object;
		vm_object_copy(src_entry->object.vm_object,
				src_entry->offset,
				(vm_size_t)(src_entry->end -
					    src_entry->start),
				&dst_entry->object.vm_object,
				&dst_entry->offset,
				&src_needs_copy);
		/*
		 *	If we didn't get a copy-object now, mark the
		 *	source map entry so that a shadow will be created
		 *	to hold its changed pages.
		 */
		if (src_needs_copy)
			src_entry->needs_copy = TRUE;

		/*
		 *	The destination always needs to have a shadow
		 *	created.
		 */
		dst_entry->needs_copy = TRUE;

		/*
		 *	Mark the entries copy-on-write, so that write-enabling
		 *	the entry won't make copy-on-write pages writable.
		 */
		src_entry->copy_on_write = TRUE;
		dst_entry->copy_on_write = TRUE;
		/*
		 *	Get rid of the old object.
		 */
		vm_object_deallocate(temp_object);

		pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
			dst_entry->end - dst_entry->start, src_entry->start);
	}
	else {
		/*
		 *	Of course, wired down pages can't be set copy-on-write.
		 *	Cause wired pages to be copied into the new
		 *	map by simulating faults (the new pages are
		 *	pageable)
		 */
		vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
	}
}

/*
 *	vm_map_copy:
 *
 *	Perform a virtual memory copy from the source
 *	address map/range to the destination map/range.
 *
 *	If src_destroy or dst_alloc is requested,
 *	the source and destination regions should be
 *	disjoint, not only in the top-level map, but
 *	in the sharing maps as well.  [The best way
 *	to guarantee this is to use a new intermediate
 *	map to make copies.  This also reduces map
 *	fragmentation.]
+ */ +int +vm_map_copy(dst_map, src_map, + dst_addr, len, src_addr, + dst_alloc, src_destroy) + vm_map_t dst_map; + vm_map_t src_map; + vm_offset_t dst_addr; + vm_size_t len; + vm_offset_t src_addr; + boolean_t dst_alloc; + boolean_t src_destroy; +{ + register + vm_map_entry_t src_entry; + register + vm_map_entry_t dst_entry; + vm_map_entry_t tmp_entry; + vm_offset_t src_start; + vm_offset_t src_end; + vm_offset_t dst_start; + vm_offset_t dst_end; + vm_offset_t src_clip; + vm_offset_t dst_clip; + int result; + boolean_t old_src_destroy; + + /* + * XXX While we figure out why src_destroy screws up, + * we'll do it by explicitly vm_map_delete'ing at the end. + */ + + old_src_destroy = src_destroy; + src_destroy = FALSE; + + /* + * Compute start and end of region in both maps + */ + + src_start = src_addr; + src_end = src_start + len; + dst_start = dst_addr; + dst_end = dst_start + len; + + /* + * Check that the region can exist in both source + * and destination. + */ + + if ((dst_end < dst_start) || (src_end < src_start)) + return(KERN_NO_SPACE); + + /* + * Lock the maps in question -- we avoid deadlock + * by ordering lock acquisition by map value + */ + + if (src_map == dst_map) { + vm_map_lock(src_map); + } + else if ((int) src_map < (int) dst_map) { + vm_map_lock(src_map); + vm_map_lock(dst_map); + } else { + vm_map_lock(dst_map); + vm_map_lock(src_map); + } + + result = KERN_SUCCESS; + + /* + * Check protections... source must be completely readable and + * destination must be completely writable. [Note that if we're + * allocating the destination region, we don't have to worry + * about protection, but instead about whether the region + * exists.] 
+ */ + + if (src_map->is_main_map && dst_map->is_main_map) { + if (!vm_map_check_protection(src_map, src_start, src_end, + VM_PROT_READ)) { + result = KERN_PROTECTION_FAILURE; + goto Return; + } + + if (dst_alloc) { + /* XXX Consider making this a vm_map_find instead */ + if ((result = vm_map_insert(dst_map, NULL, + (vm_offset_t) 0, dst_start, dst_end)) != KERN_SUCCESS) + goto Return; + } + else if (!vm_map_check_protection(dst_map, dst_start, dst_end, + VM_PROT_WRITE)) { + result = KERN_PROTECTION_FAILURE; + goto Return; + } + } + + /* + * Find the start entries and clip. + * + * Note that checking protection asserts that the + * lookup cannot fail. + * + * Also note that we wait to do the second lookup + * until we have done the first clip, as the clip + * may affect which entry we get! + */ + + (void) vm_map_lookup_entry(src_map, src_addr, &tmp_entry); + src_entry = tmp_entry; + vm_map_clip_start(src_map, src_entry, src_start); + + (void) vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry); + dst_entry = tmp_entry; + vm_map_clip_start(dst_map, dst_entry, dst_start); + + /* + * If both source and destination entries are the same, + * retry the first lookup, as it may have changed. + */ + + if (src_entry == dst_entry) { + (void) vm_map_lookup_entry(src_map, src_addr, &tmp_entry); + src_entry = tmp_entry; + } + + /* + * If source and destination entries are still the same, + * a null copy is being performed. + */ + + if (src_entry == dst_entry) + goto Return; + + /* + * Go through entries until we get to the end of the + * region. + */ + + while (src_start < src_end) { + /* + * Clip the entries to the endpoint of the entire region. + */ + + vm_map_clip_end(src_map, src_entry, src_end); + vm_map_clip_end(dst_map, dst_entry, dst_end); + + /* + * Clip each entry to the endpoint of the other entry. 
+ */ + + src_clip = src_entry->start + (dst_entry->end - dst_entry->start); + vm_map_clip_end(src_map, src_entry, src_clip); + + dst_clip = dst_entry->start + (src_entry->end - src_entry->start); + vm_map_clip_end(dst_map, dst_entry, dst_clip); + + /* + * Both entries now match in size and relative endpoints. + * + * If both entries refer to a VM object, we can + * deal with them now. + */ + + if (!src_entry->is_a_map && !dst_entry->is_a_map) { + vm_map_copy_entry(src_map, dst_map, src_entry, + dst_entry); + } + else { + register vm_map_t new_dst_map; + vm_offset_t new_dst_start; + vm_size_t new_size; + vm_map_t new_src_map; + vm_offset_t new_src_start; + + /* + * We have to follow at least one sharing map. + */ + + new_size = (dst_entry->end - dst_entry->start); + + if (src_entry->is_a_map) { + new_src_map = src_entry->object.share_map; + new_src_start = src_entry->offset; + } + else { + new_src_map = src_map; + new_src_start = src_entry->start; + lock_set_recursive(&src_map->lock); + } + + if (dst_entry->is_a_map) { + vm_offset_t new_dst_end; + + new_dst_map = dst_entry->object.share_map; + new_dst_start = dst_entry->offset; + + /* + * Since the destination sharing entries + * will be merely deallocated, we can + * do that now, and replace the region + * with a null object. [This prevents + * splitting the source map to match + * the form of the destination map.] + * Note that we can only do so if the + * source and destination do not overlap. + */ + + new_dst_end = new_dst_start + new_size; + + if (new_dst_map != new_src_map) { + vm_map_lock(new_dst_map); + (void) vm_map_delete(new_dst_map, + new_dst_start, + new_dst_end); + (void) vm_map_insert(new_dst_map, + NULL, + (vm_offset_t) 0, + new_dst_start, + new_dst_end); + vm_map_unlock(new_dst_map); + } + } + else { + new_dst_map = dst_map; + new_dst_start = dst_entry->start; + lock_set_recursive(&dst_map->lock); + } + + /* + * Recursively copy the sharing map. 
+ */ + + (void) vm_map_copy(new_dst_map, new_src_map, + new_dst_start, new_size, new_src_start, + FALSE, FALSE); + + if (dst_map == new_dst_map) + lock_clear_recursive(&dst_map->lock); + if (src_map == new_src_map) + lock_clear_recursive(&src_map->lock); + } + + /* + * Update variables for next pass through the loop. + */ + + src_start = src_entry->end; + src_entry = src_entry->next; + dst_start = dst_entry->end; + dst_entry = dst_entry->next; + + /* + * If the source is to be destroyed, here is the + * place to do it. + */ + + if (src_destroy && src_map->is_main_map && + dst_map->is_main_map) + vm_map_entry_delete(src_map, src_entry->prev); + } + + /* + * Update the physical maps as appropriate + */ + + if (src_map->is_main_map && dst_map->is_main_map) { + if (src_destroy) + pmap_remove(src_map->pmap, src_addr, src_addr + len); + } + + /* + * Unlock the maps + */ + + Return: ; + + if (old_src_destroy) + vm_map_delete(src_map, src_addr, src_addr + len); + + vm_map_unlock(src_map); + if (src_map != dst_map) + vm_map_unlock(dst_map); + + return(result); +} + +/* + * vmspace_fork: + * Create a new process vmspace structure and vm_map + * based on those of an existing process. The new map + * is based on the old map, according to the inheritance + * values on the regions in that map. + * + * The source map must not be locked. 
+ */
+struct vmspace *
+vmspace_fork(vm1)
+	register struct vmspace *vm1;
+{
+	register struct vmspace *vm2;
+	vm_map_t	old_map = &vm1->vm_map;
+	vm_map_t	new_map;
+	vm_map_entry_t	old_entry;
+	vm_map_entry_t	new_entry;
+	pmap_t		new_pmap;
+
+	vm_map_lock(old_map);
+
+	/*
+	 *	Allocate the child vmspace with the parent's address
+	 *	range, then duplicate everything from vm_startcopy to
+	 *	the end of the parent structure.
+	 */
+	vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset,
+	    old_map->entries_pageable);
+	bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
+	    (caddr_t) (vm1 + 1) - (caddr_t) &vm1->vm_startcopy);
+	new_pmap = &vm2->vm_pmap;		/* XXX */
+	new_map = &vm2->vm_map;			/* XXX */
+
+	/*
+	 *	Visit every entry of the parent map and act on its
+	 *	inheritance value.
+	 */
+	for (old_entry = old_map->header.next;
+	     old_entry != &old_map->header;
+	     old_entry = old_entry->next) {
+		if (old_entry->is_sub_map)
+			panic("vm_map_fork: encountered a submap");
+
+		switch (old_entry->inheritance) {
+		case VM_INHERIT_NONE:
+			/* Nothing appears in the child for this range. */
+			break;
+
+		case VM_INHERIT_SHARE:
+			/*
+			 *	The parent entry must refer to a sharing
+			 *	map; build one around it if it does not yet.
+			 */
+			if (!old_entry->is_a_map) {
+				vm_map_t	share_map;
+				vm_map_entry_t	share_entry;
+
+				/*
+				 *	New sharing map spanning exactly
+				 *	the range of the parent entry.
+				 */
+				share_map = vm_map_create(NULL,
+							old_entry->start,
+							old_entry->end,
+							TRUE);
+				share_map->is_main_map = FALSE;
+
+				/*
+				 *	Its single entry is a copy of the
+				 *	parent's task map entry, unwired.
+				 */
+				share_entry = vm_map_entry_create(share_map);
+				*share_entry = *old_entry;
+				share_entry->wired_count = 0;
+
+				/*
+				 *	Link it in at the tail of the
+				 *	sharing map.
+				 */
+				vm_map_entry_link(share_map,
+						share_map->header.prev,
+						share_entry);
+
+				/*
+				 *	Redirect the parent's entry to the
+				 *	sharing map.
+				 */
+				old_entry->is_a_map = TRUE;
+				old_entry->object.share_map = share_map;
+				old_entry->offset = old_entry->start;
+			}
+
+			/*
+			 *	The child gets a clone of the entry, which
+			 *	takes its own reference on the sharing map.
+			 */
+			new_entry = vm_map_entry_create(new_map);
+			*new_entry = *old_entry;
+			new_entry->wired_count = 0;
+			vm_map_reference(new_entry->object.share_map);
+
+			/*
+			 *	Entries are visited in order, so the clone
+			 *	always goes at the end of the new map.
+			 */
+			vm_map_entry_link(new_map, new_map->header.prev,
+						new_entry);
+
+			/*
+			 *	Update the physical map
+			 */
+			pmap_copy(new_map->pmap, old_map->pmap,
+				new_entry->start,
+				(old_entry->end - old_entry->start),
+				old_entry->start);
+			break;
+
+		case VM_INHERIT_COPY:
+			/*
+			 *	Clone the entry with no backing object yet
+			 *	and link it at the end of the new map; the
+			 *	copy below fills it in.
+			 */
+			new_entry = vm_map_entry_create(new_map);
+			*new_entry = *old_entry;
+			new_entry->wired_count = 0;
+			new_entry->object.vm_object = NULL;
+			new_entry->is_a_map = FALSE;
+			vm_map_entry_link(new_map, new_map->header.prev,
+							new_entry);
+
+			if (!old_entry->is_a_map) {
+				vm_map_copy_entry(old_map, new_map, old_entry,
+						new_entry);
+			}
+			else {
+				/*
+				 *	The parent range lives in a sharing
+				 *	map: copy out of that map instead.
+				 */
+				if (vm_map_copy(new_map,
+						old_entry->object.share_map,
+						new_entry->start,
+						(vm_size_t)(new_entry->end -
+							new_entry->start),
+						old_entry->offset,
+						FALSE, FALSE) != KERN_SUCCESS)
+					printf("vm_map_fork: copy in share_map region failed\n");
+			}
+			break;
+		}
+	}
+
+	new_map->size = old_map->size;
+	vm_map_unlock(old_map);
+
+	return(vm2);
+}
+
+/*
+ *	vm_map_lookup:
+ *
+ *	Finds the VM object, offset, and
+ *	protection for a given virtual address in the
+ *	specified map, assuming a page fault of the
+ *	type specified.
+ *
+ *	Leaves the map in question locked for read; return
+ *	values are guaranteed until a vm_map_lookup_done
+ *	call is performed.  Note that the map argument
+ *	is in/out; the returned map must be used in
+ *	the call to vm_map_lookup_done.
+ *
+ *	A handle (out_entry) is returned for use in
+ *	vm_map_lookup_done, to make that fast.
+ *
+ *	If a lookup is requested with "write protection"
+ *	specified, the map may be changed to perform virtual
+ *	copying operations, although the data referenced will
+ *	remain the same.
+ */
+int
+vm_map_lookup(var_map, vaddr, fault_type, out_entry,
+				object, offset, out_prot, wired, single_use)
+	vm_map_t		*var_map;	/* IN/OUT: map to search; replaced by the submap when one is followed */
+	register vm_offset_t	vaddr;		/* IN: address being looked up */
+	register vm_prot_t	fault_type;	/* IN: access being attempted */
+
+	vm_map_entry_t		*out_entry;	/* OUT: entry found in *var_map */
+	vm_object_t		*object;	/* OUT: object backing vaddr */
+	vm_offset_t		*offset;	/* OUT: offset of vaddr within *object */
+	vm_prot_t		*out_prot;	/* OUT: protection that may be granted */
+	boolean_t		*wired;		/* OUT: TRUE if the entry's wired_count is nonzero */
+	boolean_t		*single_use;	/* OUT: TRUE if no sharing map is involved, or it has one reference */
+{
+	vm_map_t			share_map;
+	vm_offset_t			share_offset;
+	register vm_map_entry_t		entry;
+	register vm_map_t		map = *var_map;
+	register vm_prot_t		prot;
+	register boolean_t		su;
+
+	RetryLookup: ;
+
+	/*
+	 *	Lookup the faulting address.
+	 *
+	 *	Control comes back to RetryLookup after descending
+	 *	into a submap and after a failed lock upgrade.
+	 *	Failure exits below use RETURN(), which drops the
+	 *	read lock on `map' before returning the error code.
+	 */
+
+	vm_map_lock_read(map);
+
+#define	RETURN(why) \
+		{ \
+		vm_map_unlock_read(map); \
+		return(why); \
+		}
+
+	/*
+	 *	If the map has an interesting hint, try it before calling
+	 *	full blown lookup routine.
+	 */
+
+	simple_lock(&map->hint_lock);
+	entry = map->hint;
+	simple_unlock(&map->hint_lock);
+
+	*out_entry = entry;
+
+	if ((entry == &map->header) ||
+	    (vaddr < entry->start) || (vaddr >= entry->end)) {
+		vm_map_entry_t	tmp_entry;
+
+		/*
+		 *	Entry was either not a valid hint, or the vaddr
+		 *	was not contained in the entry, so do a full lookup.
+		 *	An address that no entry contains is reported as
+		 *	KERN_INVALID_ADDRESS.
+		 */
+		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry))
+			RETURN(KERN_INVALID_ADDRESS);
+
+		entry = tmp_entry;
+		*out_entry = entry;
+	}
+
+	/*
+	 *	Handle submaps.
+	 *
+	 *	The caller's map pointer is switched to the submap,
+	 *	the read lock on the old map is dropped, and the
+	 *	whole lookup restarts in the submap.
+	 */
+
+	if (entry->is_sub_map) {
+		vm_map_t	old_map = map;
+
+		*var_map = map = entry->object.sub_map;
+		vm_map_unlock_read(old_map);
+		goto RetryLookup;
+	}
+
+	/*
+	 *	Check whether this task is allowed to have
+	 *	this page.  Every bit of fault_type must be
+	 *	present in the entry's protection.
+	 */
+
+	prot = entry->protection;
+	if ((fault_type & (prot)) != fault_type)
+		RETURN(KERN_PROTECTION_FAILURE);
+
+	/*
+	 *	If this page is not pageable, we have to get
+	 *	it for all possible accesses.  (The assignment
+	 *	inside the condition is intentional: *wired is
+	 *	set and tested in one step.)
+	 */
+
+	if (*wired = (entry->wired_count != 0))
+		prot = fault_type = entry->protection;
+
+	/*
+	 *	If we don't already have a VM object, track
+	 *	it down.  `su' starts out TRUE exactly when the
+	 *	entry is not a sharing map; in that case the map
+	 *	itself plays the role of share_map below.
+	 */
+
+	if (su = !entry->is_a_map) {
+	 	share_map = map;
+		share_offset = vaddr;
+	}
+	else {
+		vm_map_entry_t	share_entry;
+
+		/*
+		 *	Compute the sharing map, and offset into it.
+		 */
+
+		share_map = entry->object.share_map;
+		share_offset = (vaddr - entry->start) + entry->offset;
+
+		/*
+		 *	Look for the backing store object and offset.
+		 *	On failure both read locks are released: the
+		 *	sharing map's here, the outer map's by RETURN().
+		 */
+
+		vm_map_lock_read(share_map);
+
+		if (!vm_map_lookup_entry(share_map, share_offset,
+					&share_entry)) {
+			vm_map_unlock_read(share_map);
+			RETURN(KERN_INVALID_ADDRESS);
+		}
+		entry = share_entry;
+	}
+
+	/*
+	 *	If the entry was copy-on-write, we either ...
+	 */
+
+	if (entry->needs_copy) {
+	    	/*
+		 *	If we want to write the page, we may as well
+		 *	handle that now since we've got the sharing
+		 *	map locked.
+		 *
+		 *	If we don't need to write the page, we just
+		 *	demote the permissions allowed.
+		 */
+
+		if (fault_type & VM_PROT_WRITE) {
+			/*
+			 *	Make a new object, and place it in the
+			 *	object chain.  Note that no new references
+			 *	have appeared -- one just moved from the
+			 *	share map to the new object.
+			 *
+			 *	The entry is modified, so the read lock is
+			 *	upgraded first.  A nonzero return from
+			 *	lock_read_to_write restarts the lookup; only
+			 *	the outer map is unlocked explicitly here.
+			 *	NOTE(review): presumably a failed upgrade has
+			 *	already released the share_map lock -- confirm
+			 *	against the lock implementation.
+			 */
+
+			if (lock_read_to_write(&share_map->lock)) {
+				if (share_map != map)
+					vm_map_unlock_read(map);
+				goto RetryLookup;
+			}
+
+			vm_object_shadow(
+				&entry->object.vm_object,
+				&entry->offset,
+				(vm_size_t) (entry->end - entry->start));
+
+			entry->needs_copy = FALSE;
+
+			lock_write_to_read(&share_map->lock);
+		}
+		else {
+			/*
+			 *	We're attempting to read a copy-on-write
+			 *	page -- don't allow writes.
+			 */
+
+			prot &= (~VM_PROT_WRITE);
+		}
+	}
+
+	/*
+	 *	Create an object if necessary.  The entry is
+	 *	modified, so the same upgrade/retry protocol as
+	 *	above applies; the new object covers the whole
+	 *	entry and is used from offset 0.
+	 */
+	if (entry->object.vm_object == NULL) {
+
+		if (lock_read_to_write(&share_map->lock)) {
+			if (share_map != map)
+				vm_map_unlock_read(map);
+			goto RetryLookup;
+		}
+
+		entry->object.vm_object = vm_object_allocate(
+					(vm_size_t)(entry->end - entry->start));
+		entry->offset = 0;
+		lock_write_to_read(&share_map->lock);
+	}
+
+	/*
+	 *	Return the object/offset from this entry.  If the entry
+	 *	was copy-on-write or empty, it has been fixed up.
+	 */
+
+	*offset = (share_offset - entry->start) + entry->offset;
+	*object = entry->object.vm_object;
+
+	/*
+	 *	Return whether this is the only map sharing this data.
+	 *	When a sharing map is involved, that is the case only
+	 *	if the sharing map has a single reference.
+	 */
+
+	if (!su) {
+		simple_lock(&share_map->ref_lock);
+		su = (share_map->ref_count == 1);
+		simple_unlock(&share_map->ref_lock);
+	}
+
+	*out_prot = prot;
+	*single_use = su;
+
+	return(KERN_SUCCESS);
+
+#undef	RETURN
+}
+
+/*
+ *	vm_map_lookup_done:
+ *
+ *	Releases locks acquired by a vm_map_lookup
+ *	(according to the handle returned by that lookup).
+ *
+ *	`map' must be the map handed back through var_map and
+ *	`entry' the handle handed back through out_entry.
+ */
+
+void vm_map_lookup_done(map, entry)
+	register vm_map_t	map;
+	vm_map_entry_t		entry;
+{
+	/*
+	 *	If this entry references a map, unlock it first.
+	 *	(vm_map_lookup read-locked the sharing map after the
+	 *	main map, so it is released before it.)
+	 */
+
+	if (entry->is_a_map)
+		vm_map_unlock_read(entry->object.share_map);
+
+	/*
+	 *	Unlock the main-level map
+	 */
+
+	vm_map_unlock_read(map);
+}
+
+/*
+ *	Routine:	vm_map_simplify
+ *	Purpose:
+ *		Attempt to simplify the map representation in
+ *		the vicinity of the given starting address.
+ *	Note:
+ *		This routine is intended primarily to keep the
+ *		kernel maps more compact -- they generally don't
+ *		benefit from the "expand a map entry" technology
+ *		at allocation time because the adjacent entry
+ *		is often wired down.
+ */ +void vm_map_simplify(map, start) + vm_map_t map; + vm_offset_t start; +{ + vm_map_entry_t this_entry; + vm_map_entry_t prev_entry; + + vm_map_lock(map); + if ( + (vm_map_lookup_entry(map, start, &this_entry)) && + ((prev_entry = this_entry->prev) != &map->header) && + + (prev_entry->end == start) && + (map->is_main_map) && + + (prev_entry->is_a_map == FALSE) && + (prev_entry->is_sub_map == FALSE) && + + (this_entry->is_a_map == FALSE) && + (this_entry->is_sub_map == FALSE) && + + (prev_entry->inheritance == this_entry->inheritance) && + (prev_entry->protection == this_entry->protection) && + (prev_entry->max_protection == this_entry->max_protection) && + (prev_entry->wired_count == this_entry->wired_count) && + + (prev_entry->copy_on_write == this_entry->copy_on_write) && + (prev_entry->needs_copy == this_entry->needs_copy) && + + (prev_entry->object.vm_object == this_entry->object.vm_object) && + ((prev_entry->offset + (prev_entry->end - prev_entry->start)) + == this_entry->offset) + ) { + if (map->first_free == this_entry) + map->first_free = prev_entry; + + SAVE_HINT(map, prev_entry); + vm_map_entry_unlink(map, this_entry); + prev_entry->end = this_entry->end; + vm_object_deallocate(this_entry->object.vm_object); + vm_map_entry_dispose(map, this_entry); + } + vm_map_unlock(map); +} + +/* + * vm_map_print: [ debug ] + */ +void vm_map_print(map, full) + register vm_map_t map; + boolean_t full; +{ + register vm_map_entry_t entry; + extern int indent; + + iprintf("%s map 0x%x: pmap=0x%x,ref=%d,nentries=%d,version=%d\n", + (map->is_main_map ? 
"Task" : "Share"), + (int) map, (int) (map->pmap), map->ref_count, map->nentries, + map->timestamp); + + if (!full && indent) + return; + + indent += 2; + for (entry = map->header.next; entry != &map->header; + entry = entry->next) { + iprintf("map entry 0x%x: start=0x%x, end=0x%x, ", + (int) entry, (int) entry->start, (int) entry->end); + if (map->is_main_map) { + static char *inheritance_name[4] = + { "share", "copy", "none", "donate_copy"}; + printf("prot=%x/%x/%s, ", + entry->protection, + entry->max_protection, + inheritance_name[entry->inheritance]); + if (entry->wired_count != 0) + printf("wired, "); + } + + if (entry->is_a_map || entry->is_sub_map) { + printf("share=0x%x, offset=0x%x\n", + (int) entry->object.share_map, + (int) entry->offset); + if ((entry->prev == &map->header) || + (!entry->prev->is_a_map) || + (entry->prev->object.share_map != + entry->object.share_map)) { + indent += 2; + vm_map_print(entry->object.share_map, full); + indent -= 2; + } + + } + else { + printf("object=0x%x, offset=0x%x", + (int) entry->object.vm_object, + (int) entry->offset); + if (entry->copy_on_write) + printf(", copy (%s)", + entry->needs_copy ? "needed" : "done"); + printf("\n"); + + if ((entry->prev == &map->header) || + (entry->prev->is_a_map) || + (entry->prev->object.vm_object != + entry->object.vm_object)) { + indent += 2; + vm_object_print(entry->object.vm_object, full); + indent -= 2; + } + } + } + indent -= 2; +} diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h new file mode 100644 index 00000000000..d25b7a2d1bd --- /dev/null +++ b/sys/vm/vm_map.h @@ -0,0 +1,228 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_map.h 8.3 (Berkeley) 3/15/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. 
+ *
+ * Authors: Avadis Tevanian, Jr., Michael Wayne Young
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
+ *  School of Computer Science
+ *  Carnegie Mellon University
+ *  Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ *	Virtual memory map module definitions.
+ */
+
+#ifndef	_VM_MAP_
+#define	_VM_MAP_
+
+/*
+ *	Types defined:
+ *
+ *	vm_map_t		the high-level address map data structure.
+ *	vm_map_entry_t		an entry in an address map.
+ *	vm_map_version_t	a timestamp of a map, for use with vm_map_lookup
+ */
+
+/*
+ *	Objects which live in maps may be either VM objects, or
+ *	another map (called a "sharing map") which denotes read-write
+ *	sharing with other maps.
+ *
+ *	Which member is meaningful is recorded in the owning map
+ *	entry (is_a_map / is_sub_map), not in the union itself.
+ */
+
+union vm_map_object {
+	struct vm_object	*vm_object;	/* VM object (neither flag set) */
+	struct vm_map		*share_map;	/* sharing map (is_a_map) */
+	struct vm_map		*sub_map;	/* submap (is_sub_map); belongs to another map */
+};
+
+/*
+ *	Address map entries consist of start and end addresses,
+ *	a VM object (or sharing map) and offset into that object,
+ *	and user-exported inheritance and protection information.
+ *	Also included is control information for virtual copy operations.
+ */
+struct vm_map_entry {
+	struct vm_map_entry	*prev;		/* previous entry in the map's list */
+	struct vm_map_entry	*next;		/* next entry in the map's list */
+	vm_offset_t		start;		/* start address */
+	vm_offset_t		end;		/* end address */
+	union vm_map_object	object;		/* object, sharing map or submap I point to */
+	vm_offset_t		offset;		/* offset into object */
+	boolean_t		is_a_map;	/* Is "object" a (sharing) map? */
+	boolean_t		is_sub_map;	/* Is "object" a submap? */
+		/* Only in sharing maps: */
+	boolean_t		copy_on_write;	/* is data copy-on-write */
+	boolean_t		needs_copy;	/* does object need to be copied (shadowed) before a write */
+		/* Only in task maps: */
+	vm_prot_t		protection;	/* current protection code */
+	vm_prot_t		max_protection;	/* maximum protection */
+	vm_inherit_t		inheritance;	/* inheritance (VM_INHERIT_*) */
+	int			wired_count;	/* can be paged if = 0 */
+};
+
+/*
+ *	Maps are doubly-linked lists of map entries, kept sorted
+ *	by address.  A single hint is provided to start
+ *	searches again from the last successful search,
+ *	insertion, or removal.
+ *
+ *	The list is circular through `header', whose start and end
+ *	fields double as the map's address bounds (see min_offset
+ *	and max_offset below).
+ */
+struct vm_map {
+	struct pmap *		pmap;		/* Physical map */
+	lock_data_t		lock;		/* Lock for map data */
+	struct vm_map_entry	header;		/* List of entries (list head, not a real entry) */
+	int			nentries;	/* Number of entries */
+	vm_size_t		size;		/* virtual size */
+	boolean_t		is_main_map;	/* Am I a main map? (FALSE for a sharing map) */
+	int			ref_count;	/* Reference count */
+	simple_lock_data_t	ref_lock;	/* Lock for ref_count field */
+	vm_map_entry_t		hint;		/* hint for quick lookups */
+	simple_lock_data_t	hint_lock;	/* lock for hint storage */
+	vm_map_entry_t		first_free;	/* First free space hint */
+	boolean_t		entries_pageable; /* map entries pageable?? */
+	unsigned int		timestamp;	/* Version number; incremented by vm_map_lock */
+#define	min_offset		header.start
+#define max_offset		header.end
+};
+
+/*
+ *	Map versions are used to validate a previous lookup attempt.
+ *
+ *	Since lookup operations may involve both a main map and
+ *	a sharing map, it is necessary to have a timestamp from each.
+ *	[If the main map timestamp has changed, the share_map and
+ *	associated timestamp are no longer valid; the map version
+ *	does not include a reference for the embedded share_map.]
+ *
+ *	NOTE(review): the timestamps here are plain int while
+ *	vm_map.timestamp is unsigned int -- confirm the mismatch is
+ *	harmless before comparing them.
+ */
+typedef struct {
+	int		main_timestamp;
+	vm_map_t	share_map;
+	int		share_timestamp;
+} vm_map_version_t;
+
+/*
+ *	Macros:		vm_map_lock, etc.
+ *	Function:
+ *		Perform locking on the data portion of a map.
+ *
+ *		vm_map_lock takes the write lock and bumps the map's
+ *		timestamp.  It expands to a brace block rather than an
+ *		expression, so "vm_map_lock(m);" used as the unbraced
+ *		body of an if that has an else will not compile; wrap
+ *		such uses in braces.
+ */
+
+#define		vm_map_lock(map) { \
+	lock_write(&(map)->lock); \
+	(map)->timestamp++; \
+}
+#define		vm_map_unlock(map)	lock_write_done(&(map)->lock)
+#define		vm_map_lock_read(map)	lock_read(&(map)->lock)
+#define		vm_map_unlock_read(map)	lock_read_done(&(map)->lock)
+
+/*
+ *	Functions implemented as macros: accessors for a map's
+ *	lowest address, highest address and physical map.
+ */
+#define		vm_map_min(map)		((map)->min_offset)
+#define		vm_map_max(map)		((map)->max_offset)
+#define		vm_map_pmap(map)	((map)->pmap)
+
+/* XXX: number of kernel maps and entries to statically allocate */
+#define MAX_KMAP	10
+#define	MAX_KMAPENT	500
+
+#ifdef KERNEL
+boolean_t	 vm_map_check_protection __P((vm_map_t,
+		    vm_offset_t, vm_offset_t, vm_prot_t));
+int		 vm_map_copy __P((vm_map_t, vm_map_t, vm_offset_t,
+		    vm_size_t, vm_offset_t, boolean_t, boolean_t));
+void		 vm_map_copy_entry __P((vm_map_t,
+		    vm_map_t, vm_map_entry_t, vm_map_entry_t));
+struct pmap;
+vm_map_t	 vm_map_create __P((struct pmap *,
+		    vm_offset_t, vm_offset_t, boolean_t));
+void		 vm_map_deallocate __P((vm_map_t));
+int		 vm_map_delete __P((vm_map_t, vm_offset_t, vm_offset_t));
+vm_map_entry_t	 vm_map_entry_create __P((vm_map_t));
+void		 vm_map_entry_delete __P((vm_map_t, vm_map_entry_t));
+void		 vm_map_entry_dispose __P((vm_map_t, vm_map_entry_t));
+void		 vm_map_entry_unwire __P((vm_map_t, vm_map_entry_t));
+int		 vm_map_find __P((vm_map_t, vm_object_t,
+		    vm_offset_t, vm_offset_t *, vm_size_t, boolean_t));
+int		 vm_map_findspace __P((vm_map_t,
+		    vm_offset_t, vm_size_t, vm_offset_t *));
+int		 vm_map_inherit __P((vm_map_t,
+		    vm_offset_t, vm_offset_t, vm_inherit_t));
+void		 vm_map_init __P((struct vm_map *,
+		    vm_offset_t, vm_offset_t, boolean_t));
+int		 vm_map_insert __P((vm_map_t,
+		    vm_object_t, vm_offset_t, vm_offset_t, vm_offset_t));
+int		 vm_map_lookup __P((vm_map_t *, vm_offset_t, vm_prot_t,
+		    vm_map_entry_t *, vm_object_t *, vm_offset_t *, vm_prot_t *,
+		    boolean_t *, boolean_t *));
+void		 vm_map_lookup_done __P((vm_map_t, vm_map_entry_t));
+boolean_t	 vm_map_lookup_entry __P((vm_map_t,
+		    vm_offset_t, vm_map_entry_t *));
+int		 vm_map_pageable __P((vm_map_t,
+		    vm_offset_t, vm_offset_t, boolean_t));
+int		 vm_map_clean __P((vm_map_t,
+		    vm_offset_t, vm_offset_t, boolean_t, boolean_t));
+void		 vm_map_print __P((vm_map_t, boolean_t));
+int		 vm_map_protect __P((vm_map_t,
+		    vm_offset_t, vm_offset_t, vm_prot_t, boolean_t));
+void		 vm_map_reference __P((vm_map_t));
+int		 vm_map_remove __P((vm_map_t, vm_offset_t, vm_offset_t));
+void		 vm_map_simplify __P((vm_map_t, vm_offset_t));
+void		 vm_map_simplify_entry __P((vm_map_t, vm_map_entry_t));
+void		 vm_map_startup __P((void));
+int		 vm_map_submap __P((vm_map_t,
+		    vm_offset_t, vm_offset_t, vm_map_t));
+#endif /* KERNEL */
+#endif /* _VM_MAP_ */
diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c
new file mode 100644
index 00000000000..9db6f506c2a
--- /dev/null
+++ b/sys/vm/vm_meter.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. 
All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)vm_meter.c	8.4 (Berkeley) 1/4/94
+ */
+
+#include	/* NOTE(review): the header names of these six includes are missing (apparently lost in extraction) -- restore from the original file */
+#include
+#include
+#include
+#include
+#include
+
+struct	loadavg averunnable;		/* load average, of runnable procs */
+
+int	maxslp = MAXSLP;	/* sleep-time threshold used by vmmeter and vmtotal */
+int	saferss = SAFERSS;
+
+void
+vmmeter()
+{
+
+	if (time.tv_sec % 5 == 0)		/* seconds count is a multiple of 5: refresh the load average */
+		loadav(&averunnable);
+	if (proc0.p_slptime > maxslp/2)		/* proc0 has slept more than maxslp/2: wake it */
+		wakeup((caddr_t)&proc0);
+}
+
+/*
+ * Constants for averages over 1, 5, and 15 minutes
+ * when sampling at 5 second intervals.
+ *
+ * Each entry is a decay factor scaled by FSCALE; loadav uses
+ * cexp[i] as the weight of the previous average.
+ */
+fixpt_t	cexp[3] = {
+	0.9200444146293232 * FSCALE,	/* exp(-1/12): 1 minute = 12 samples */
+	0.9834714538216174 * FSCALE,	/* exp(-1/60): 5 minutes = 60 samples */
+	0.9944598480048967 * FSCALE,	/* exp(-1/180): 15 minutes = 180 samples */
+};
+
+/*
+ * Compute a tenex style load average of a quantity on
+ * 1, 5 and 15 minute intervals.
+ */ +void +loadav(avg) + register struct loadavg *avg; +{ + register int i, nrun; + register struct proc *p; + + for (nrun = 0, p = (struct proc *)allproc; p != NULL; p = p->p_next) { + switch (p->p_stat) { + case SSLEEP: + if (p->p_priority > PZERO || p->p_slptime != 0) + continue; + /* fall through */ + case SRUN: + case SIDL: + nrun++; + } + } + for (i = 0; i < 3; i++) + avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; +} + +/* + * Attributes associated with virtual memory. + */ +vm_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) + int *name; + u_int namelen; + void *oldp; + size_t *oldlenp; + void *newp; + size_t newlen; + struct proc *p; +{ + struct vmtotal vmtotals; + + /* all sysctl names at this level are terminal */ + if (namelen != 1) + return (ENOTDIR); /* overloaded */ + + switch (name[0]) { + case VM_LOADAVG: + averunnable.fscale = FSCALE; + return (sysctl_rdstruct(oldp, oldlenp, newp, &averunnable, + sizeof(averunnable))); + case VM_METER: + vmtotal(&vmtotals); + return (sysctl_rdstruct(oldp, oldlenp, newp, &vmtotals, + sizeof(vmtotals))); + default: + return (EOPNOTSUPP); + } + /* NOTREACHED */ +} + +/* + * Calculate the current state of the system. + * Done on demand from getkerninfo(). + */ +void +vmtotal(totalp) + register struct vmtotal *totalp; +{ + register struct proc *p; + register vm_map_entry_t entry; + register vm_object_t object; + register vm_map_t map; + int paging; + + bzero(totalp, sizeof *totalp); + /* + * Mark all objects as inactive. + */ + simple_lock(&vm_object_list_lock); + for (object = vm_object_list.tqh_first; + object != NULL; + object = object->object_list.tqe_next) + object->flags &= ~OBJ_ACTIVE; + simple_unlock(&vm_object_list_lock); + /* + * Calculate process statistics. 
+ */ + for (p = (struct proc *)allproc; p != NULL; p = p->p_next) { + if (p->p_flag & P_SYSTEM) + continue; + switch (p->p_stat) { + case 0: + continue; + + case SSLEEP: + case SSTOP: + if (p->p_flag & P_INMEM) { + if (p->p_priority <= PZERO) + totalp->t_dw++; + else if (p->p_slptime < maxslp) + totalp->t_sl++; + } else if (p->p_slptime < maxslp) + totalp->t_sw++; + if (p->p_slptime >= maxslp) + continue; + break; + + case SRUN: + case SIDL: + if (p->p_flag & P_INMEM) + totalp->t_rq++; + else + totalp->t_sw++; + if (p->p_stat == SIDL) + continue; + break; + } + /* + * Note active objects. + */ + paging = 0; + for (map = &p->p_vmspace->vm_map, entry = map->header.next; + entry != &map->header; entry = entry->next) { + if (entry->is_a_map || entry->is_sub_map || + entry->object.vm_object == NULL) + continue; + entry->object.vm_object->flags |= OBJ_ACTIVE; + paging |= entry->object.vm_object->paging_in_progress; + } + if (paging) + totalp->t_pw++; + } + /* + * Calculate object memory usage statistics. + */ + simple_lock(&vm_object_list_lock); + for (object = vm_object_list.tqh_first; + object != NULL; + object = object->object_list.tqe_next) { + totalp->t_vm += num_pages(object->size); + totalp->t_rm += object->resident_page_count; + if (object->flags & OBJ_ACTIVE) { + totalp->t_avm += num_pages(object->size); + totalp->t_arm += object->resident_page_count; + } + if (object->ref_count > 1) { + /* shared object */ + totalp->t_vmshr += num_pages(object->size); + totalp->t_rmshr += object->resident_page_count; + if (object->flags & OBJ_ACTIVE) { + totalp->t_avmshr += num_pages(object->size); + totalp->t_armshr += object->resident_page_count; + } + } + } + totalp->t_free = cnt.v_free_count; +} diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c new file mode 100644 index 00000000000..340cded1ba4 --- /dev/null +++ b/sys/vm/vm_mmap.c @@ -0,0 +1,832 @@ +/* + * Copyright (c) 1988 University of Utah. + * Copyright (c) 1991, 1993 + * The Regents of the University of California. 
All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ + * + * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 + */ + +/* + * Mapped file (mmap) interface to VM + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#ifdef DEBUG +int mmapdebug = 0; +#define MDB_FOLLOW 0x01 +#define MDB_SYNC 0x02 +#define MDB_MAPIT 0x04 +#endif + +struct sbrk_args { + int incr; +}; +/* ARGSUSED */ +int +sbrk(p, uap, retval) + struct proc *p; + struct sbrk_args *uap; + int *retval; +{ + + /* Not yet implemented */ + return (EOPNOTSUPP); +} + +struct sstk_args { + int incr; +}; +/* ARGSUSED */ +int +sstk(p, uap, retval) + struct proc *p; + struct sstk_args *uap; + int *retval; +{ + + /* Not yet implemented */ + return (EOPNOTSUPP); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +struct getpagesize_args { + int dummy; +}; +/* ARGSUSED */ +int +ogetpagesize(p, uap, retval) + struct proc *p; + struct getpagesize_args *uap; + int *retval; +{ + + *retval = PAGE_SIZE; + return (0); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +struct mmap_args { + caddr_t addr; + size_t len; + int prot; + int flags; + int fd; + long pad; + off_t pos; +}; + +#ifdef COMPAT_43 +struct ommap_args { + caddr_t addr; + int len; + int prot; + int flags; + int fd; + long pos; +}; +int +ommap(p, uap, retval) + struct proc *p; + register struct ommap_args *uap; + int *retval; +{ + struct mmap_args nargs; + static const char cvtbsdprot[8] = { + 0, + PROT_EXEC, + PROT_WRITE, + PROT_EXEC|PROT_WRITE, + PROT_READ, + PROT_EXEC|PROT_READ, + PROT_WRITE|PROT_READ, + PROT_EXEC|PROT_WRITE|PROT_READ, + }; +#define OMAP_ANON 0x0002 +#define OMAP_COPY 0x0020 +#define OMAP_SHARED 0x0010 +#define OMAP_FIXED 0x0100 +#define OMAP_INHERIT 0x0800 + + nargs.addr = uap->addr; + nargs.len = uap->len; + nargs.prot = cvtbsdprot[uap->prot&0x7]; + nargs.flags = 0; + if (uap->flags & OMAP_ANON) + nargs.flags |= MAP_ANON; + if (uap->flags & OMAP_COPY) + nargs.flags |= 
MAP_COPY; + if (uap->flags & OMAP_SHARED) + nargs.flags |= MAP_SHARED; + else + nargs.flags |= MAP_PRIVATE; + if (uap->flags & OMAP_FIXED) + nargs.flags |= MAP_FIXED; + if (uap->flags & OMAP_INHERIT) + nargs.flags |= MAP_INHERIT; + nargs.fd = uap->fd; + nargs.pos = uap->pos; + return (mmap(p, &nargs, retval)); +} +#endif + +int +mmap(p, uap, retval) + struct proc *p; + register struct mmap_args *uap; + int *retval; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct vnode *vp; + vm_offset_t addr; + vm_size_t size; + vm_prot_t prot, maxprot; + caddr_t handle; + int flags, error; + + prot = uap->prot & VM_PROT_ALL; + flags = uap->flags; +#ifdef DEBUG + if (mmapdebug & MDB_FOLLOW) + printf("mmap(%d): addr %x len %x pro %x flg %x fd %d pos %x\n", + p->p_pid, uap->addr, uap->len, prot, + flags, uap->fd, (vm_offset_t)uap->pos); +#endif + /* + * Address (if FIXED) must be page aligned. + * Size is implicitly rounded to a page boundary. + */ + addr = (vm_offset_t) uap->addr; + if (((flags & MAP_FIXED) && (addr & PAGE_MASK)) || + (ssize_t)uap->len < 0 || ((flags & MAP_ANON) && uap->fd != -1)) + return (EINVAL); + size = (vm_size_t) round_page(uap->len); + /* + * Check for illegal addresses. Watch out for address wrap... + * Note that VM_*_ADDRESS are not constants due to casts (argh). + */ + if (flags & MAP_FIXED) { + if (VM_MAXUSER_ADDRESS > 0 && addr + size >= VM_MAXUSER_ADDRESS) + return (EINVAL); + if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) + return (EINVAL); + if (addr > addr + size) + return (EINVAL); + } + /* + * XXX if no hint provided for a non-fixed mapping place it after + * the end of the largest possible heap. + * + * There should really be a pmap call to determine a reasonable + * location. + */ + if (addr == 0 && (flags & MAP_FIXED) == 0) + addr = round_page(p->p_vmspace->vm_daddr + MAXDSIZ); + if (flags & MAP_ANON) { + /* + * Mapping blank space is trivial. 
+ */ + handle = NULL; + maxprot = VM_PROT_ALL; + } else { + /* + * Mapping file, get fp for validation. + * Obtain vnode and make sure it is of appropriate type. + */ + if (((unsigned)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE) + return (EINVAL); + vp = (struct vnode *)fp->f_data; + if (vp->v_type != VREG && vp->v_type != VCHR) + return (EINVAL); + /* + * XXX hack to handle use of /dev/zero to map anon + * memory (ala SunOS). + */ + if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) { + handle = NULL; + maxprot = VM_PROT_ALL; + flags |= MAP_ANON; + } else { + /* + * Ensure that file and memory protections are + * compatible. Note that we only worry about + * writability if mapping is shared; in this case, + * current and max prot are dictated by the open file. + * XXX use the vnode instead? Problem is: what + * credentials do we use for determination? + * What if proc does a setuid? + */ + maxprot = VM_PROT_EXECUTE; /* ??? 
*/ + if (fp->f_flag & FREAD) + maxprot |= VM_PROT_READ; + else if (prot & PROT_READ) + return (EACCES); + if (flags & MAP_SHARED) { + if (fp->f_flag & FWRITE) + maxprot |= VM_PROT_WRITE; + else if (prot & PROT_WRITE) + return (EACCES); + } else + maxprot |= VM_PROT_WRITE; + handle = (caddr_t)vp; + } + } + error = vm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot, + flags, handle, (vm_offset_t)uap->pos); + if (error == 0) + *retval = (int)addr; + return (error); +} + +struct msync_args { + caddr_t addr; + int len; +}; +int +msync(p, uap, retval) + struct proc *p; + struct msync_args *uap; + int *retval; +{ + vm_offset_t addr; + vm_size_t size; + vm_map_t map; + int rv; + boolean_t syncio, invalidate; + +#ifdef DEBUG + if (mmapdebug & (MDB_FOLLOW|MDB_SYNC)) + printf("msync(%d): addr %x len %x\n", + p->p_pid, uap->addr, uap->len); +#endif + if (((int)uap->addr & PAGE_MASK) || uap->addr + uap->len < uap->addr) + return (EINVAL); + map = &p->p_vmspace->vm_map; + addr = (vm_offset_t)uap->addr; + size = (vm_size_t)uap->len; + /* + * XXX Gak! If size is zero we are supposed to sync "all modified + * pages with the region containing addr". Unfortunately, we + * don't really keep track of individual mmaps so we approximate + * by flushing the range of the map entry containing addr. + * This can be incorrect if the region splits or is coalesced + * with a neighbor. + */ + if (size == 0) { + vm_map_entry_t entry; + + vm_map_lock_read(map); + rv = vm_map_lookup_entry(map, addr, &entry); + vm_map_unlock_read(map); + if (rv) + return (EINVAL); + addr = entry->start; + size = entry->end - entry->start; + } +#ifdef DEBUG + if (mmapdebug & MDB_SYNC) + printf("msync: cleaning/flushing address range [%x-%x)\n", + addr, addr+size); +#endif + /* + * Could pass this in as a third flag argument to implement + * Sun's MS_ASYNC. + */ + syncio = TRUE; + /* + * XXX bummer, gotta flush all cached pages to ensure + * consistency with the file system cache. 
Otherwise, we could + * pass this in to implement Sun's MS_INVALIDATE. + */ + invalidate = TRUE; + /* + * Clean the pages and interpret the return value. + */ + rv = vm_map_clean(map, addr, addr+size, syncio, invalidate); + switch (rv) { + case KERN_SUCCESS: + break; + case KERN_INVALID_ADDRESS: + return (EINVAL); /* Sun returns ENOMEM? */ + case KERN_FAILURE: + return (EIO); + default: + return (EINVAL); + } + return (0); +} + +struct munmap_args { + caddr_t addr; + int len; +}; +int +munmap(p, uap, retval) + register struct proc *p; + register struct munmap_args *uap; + int *retval; +{ + vm_offset_t addr; + vm_size_t size; + vm_map_t map; + +#ifdef DEBUG + if (mmapdebug & MDB_FOLLOW) + printf("munmap(%d): addr %x len %x\n", + p->p_pid, uap->addr, uap->len); +#endif + + addr = (vm_offset_t) uap->addr; + if ((addr & PAGE_MASK) || uap->len < 0) + return(EINVAL); + size = (vm_size_t) round_page(uap->len); + if (size == 0) + return(0); + /* + * Check for illegal addresses. Watch out for address wrap... + * Note that VM_*_ADDRESS are not constants due to casts (argh). + */ + if (VM_MAXUSER_ADDRESS > 0 && addr + size >= VM_MAXUSER_ADDRESS) + return (EINVAL); + if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) + return (EINVAL); + if (addr > addr + size) + return (EINVAL); + map = &p->p_vmspace->vm_map; + /* + * Make sure entire range is allocated. 
+ */ + if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) + return(EINVAL); + /* returns nothing but KERN_SUCCESS anyway */ + (void) vm_map_remove(map, addr, addr+size); + return(0); +} + +void +munmapfd(fd) + int fd; +{ +#ifdef DEBUG + if (mmapdebug & MDB_FOLLOW) + printf("munmapfd(%d): fd %d\n", curproc->p_pid, fd); +#endif + + /* + * XXX should vm_deallocate any regions mapped to this file + */ + curproc->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED; +} + +struct mprotect_args { + caddr_t addr; + int len; + int prot; +}; +int +mprotect(p, uap, retval) + struct proc *p; + struct mprotect_args *uap; + int *retval; +{ + vm_offset_t addr; + vm_size_t size; + register vm_prot_t prot; + +#ifdef DEBUG + if (mmapdebug & MDB_FOLLOW) + printf("mprotect(%d): addr %x len %x prot %d\n", + p->p_pid, uap->addr, uap->len, uap->prot); +#endif + + addr = (vm_offset_t)uap->addr; + if ((addr & PAGE_MASK) || uap->len < 0) + return(EINVAL); + size = (vm_size_t)uap->len; + prot = uap->prot & VM_PROT_ALL; + + switch (vm_map_protect(&p->p_vmspace->vm_map, addr, addr+size, prot, + FALSE)) { + case KERN_SUCCESS: + return (0); + case KERN_PROTECTION_FAILURE: + return (EACCES); + } + return (EINVAL); +} + +struct madvise_args { + caddr_t addr; + int len; + int behav; +}; +/* ARGSUSED */ +int +madvise(p, uap, retval) + struct proc *p; + struct madvise_args *uap; + int *retval; +{ + + /* Not yet implemented */ + return (EOPNOTSUPP); +} + +struct mincore_args { + caddr_t addr; + int len; + char *vec; +}; +/* ARGSUSED */ +int +mincore(p, uap, retval) + struct proc *p; + struct mincore_args *uap; + int *retval; +{ + + /* Not yet implemented */ + return (EOPNOTSUPP); +} + +struct mlock_args { + caddr_t addr; + size_t len; +}; +int +mlock(p, uap, retval) + struct proc *p; + struct mlock_args *uap; + int *retval; +{ + vm_offset_t addr; + vm_size_t size; + int error; + extern int vm_page_max_wired; + +#ifdef DEBUG + if (mmapdebug & MDB_FOLLOW) + printf("mlock(%d): addr %x len %x\n", + 
p->p_pid, uap->addr, uap->len); +#endif + addr = (vm_offset_t)uap->addr; + if ((addr & PAGE_MASK) || uap->addr + uap->len < uap->addr) + return (EINVAL); + size = round_page((vm_size_t)uap->len); + if (atop(size) + cnt.v_wire_count > vm_page_max_wired) + return (EAGAIN); +#ifdef pmap_wired_count + if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) > + p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur) + return (EAGAIN); +#else + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); +#endif + + error = vm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, FALSE); + return (error == KERN_SUCCESS ? 0 : ENOMEM); +} + +struct munlock_args { + caddr_t addr; + size_t len; +}; +int +munlock(p, uap, retval) + struct proc *p; + struct munlock_args *uap; + int *retval; +{ + vm_offset_t addr; + vm_size_t size; + int error; + +#ifdef DEBUG + if (mmapdebug & MDB_FOLLOW) + printf("munlock(%d): addr %x len %x\n", + p->p_pid, uap->addr, uap->len); +#endif + addr = (vm_offset_t)uap->addr; + if ((addr & PAGE_MASK) || uap->addr + uap->len < uap->addr) + return (EINVAL); +#ifndef pmap_wired_count + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); +#endif + size = round_page((vm_size_t)uap->len); + + error = vm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, TRUE); + return (error == KERN_SUCCESS ? 0 : ENOMEM); +} + +/* + * Internal version of mmap. + * Currently used by mmap, exec, and sys5 shared memory. + * Handle is either a vnode pointer or NULL for MAP_ANON. 
+ */ +int +vm_mmap(map, addr, size, prot, maxprot, flags, handle, foff) + register vm_map_t map; + register vm_offset_t *addr; + register vm_size_t size; + vm_prot_t prot, maxprot; + register int flags; + caddr_t handle; /* XXX should be vp */ + vm_offset_t foff; +{ + register vm_pager_t pager; + boolean_t fitit; + vm_object_t object; + struct vnode *vp = NULL; + int type; + int rv = KERN_SUCCESS; + + if (size == 0) + return (0); + + if ((flags & MAP_FIXED) == 0) { + fitit = TRUE; + *addr = round_page(*addr); + } else { + fitit = FALSE; + (void)vm_deallocate(map, *addr, size); + } + + /* + * Lookup/allocate pager. All except an unnamed anonymous lookup + * gain a reference to ensure continued existence of the object. + * (XXX the exception is to appease the pageout daemon) + */ + if (flags & MAP_ANON) + type = PG_DFLT; + else { + vp = (struct vnode *)handle; + if (vp->v_type == VCHR) { + type = PG_DEVICE; + handle = (caddr_t)vp->v_rdev; + } else + type = PG_VNODE; + } + pager = vm_pager_allocate(type, handle, size, prot, foff); + if (pager == NULL) + return (type == PG_DEVICE ? EINVAL : ENOMEM); + /* + * Find object and release extra reference gained by lookup + */ + object = vm_object_lookup(pager); + vm_object_deallocate(object); + + /* + * Anonymous memory. + */ + if (flags & MAP_ANON) { + rv = vm_allocate_with_pager(map, addr, size, fitit, + pager, foff, TRUE); + if (rv != KERN_SUCCESS) { + if (handle == NULL) + vm_pager_deallocate(pager); + else + vm_object_deallocate(object); + goto out; + } + /* + * Don't cache anonymous objects. + * Loses the reference gained by vm_pager_allocate. + * Note that object will be NULL when handle == NULL, + * this is ok since vm_allocate_with_pager has made + * sure that these objects are uncached. + */ + (void) pager_cache(object, FALSE); +#ifdef DEBUG + if (mmapdebug & MDB_MAPIT) + printf("vm_mmap(%d): ANON *addr %x size %x pager %x\n", + curproc->p_pid, *addr, size, pager); +#endif + } + /* + * Must be a mapped file.
+ * Distinguish between character special and regular files. + */ + else if (vp->v_type == VCHR) { + rv = vm_allocate_with_pager(map, addr, size, fitit, + pager, foff, FALSE); + /* + * Uncache the object and lose the reference gained + * by vm_pager_allocate(). If the call to + * vm_allocate_with_pager() was successful, then we + * gained an additional reference ensuring the object + * will continue to exist. If the call failed then + * the deallocate call below will terminate the + * object which is fine. + */ + (void) pager_cache(object, FALSE); + if (rv != KERN_SUCCESS) + goto out; + } + /* + * A regular file + */ + else { +#ifdef DEBUG + if (object == NULL) + printf("vm_mmap: no object: vp %x, pager %x\n", + vp, pager); +#endif + /* + * Map it directly. + * Allows modifications to go out to the vnode. + */ + if (flags & MAP_SHARED) { + rv = vm_allocate_with_pager(map, addr, size, + fitit, pager, + foff, FALSE); + if (rv != KERN_SUCCESS) { + vm_object_deallocate(object); + goto out; + } + /* + * Don't cache the object. This is the easiest way + * of ensuring that data gets back to the filesystem + * because vnode_pager_deallocate() will fsync the + * vnode. pager_cache() will lose the extra ref. + */ + if (prot & VM_PROT_WRITE) + pager_cache(object, FALSE); + else + vm_object_deallocate(object); + } + /* + * Copy-on-write of file. Two flavors. + * MAP_COPY is true COW, you essentially get a snapshot of + * the region at the time of mapping. MAP_PRIVATE means only + * that your changes are not reflected back to the object. + * Changes made by others will be seen.
+ */ + else { + vm_map_t tmap; + vm_offset_t off; + + /* locate and allocate the target address space */ + rv = vm_map_find(map, NULL, (vm_offset_t)0, + addr, size, fitit); + if (rv != KERN_SUCCESS) { + vm_object_deallocate(object); + goto out; + } + tmap = vm_map_create(pmap_create(size), VM_MIN_ADDRESS, + VM_MIN_ADDRESS+size, TRUE); + off = VM_MIN_ADDRESS; + rv = vm_allocate_with_pager(tmap, &off, size, + TRUE, pager, + foff, FALSE); + if (rv != KERN_SUCCESS) { + vm_object_deallocate(object); + vm_map_deallocate(tmap); + goto out; + } + /* + * (XXX) + * MAP_PRIVATE implies that we see changes made by + * others. To ensure that we need to guarantee that + * no copy object is created (otherwise original + * pages would be pushed to the copy object and we + * would never see changes made by others). We + * totally sleaze it right now by marking the object + * internal temporarily. + */ + if ((flags & MAP_COPY) == 0) + object->flags |= OBJ_INTERNAL; + rv = vm_map_copy(map, tmap, *addr, size, off, + FALSE, FALSE); + object->flags &= ~OBJ_INTERNAL; + /* + * (XXX) + * My oh my, this only gets worse... + * Force creation of a shadow object so that + * vm_map_fork will do the right thing. + */ + if ((flags & MAP_COPY) == 0) { + vm_map_t tmap; + vm_map_entry_t tentry; + vm_object_t tobject; + vm_offset_t toffset; + vm_prot_t tprot; + boolean_t twired, tsu; + + tmap = map; + vm_map_lookup(&tmap, *addr, VM_PROT_WRITE, + &tentry, &tobject, &toffset, + &tprot, &twired, &tsu); + vm_map_lookup_done(tmap, tentry); + } + /* + * (XXX) + * Map copy code cannot detect sharing unless a + * sharing map is involved. So we cheat and write + * protect everything ourselves.
+ */ + vm_object_pmap_copy(object, foff, foff + size); + vm_object_deallocate(object); + vm_map_deallocate(tmap); + if (rv != KERN_SUCCESS) + goto out; + } +#ifdef DEBUG + if (mmapdebug & MDB_MAPIT) + printf("vm_mmap(%d): FILE *addr %x size %x pager %x\n", + curproc->p_pid, *addr, size, pager); +#endif + } + /* + * Correct protection (default is VM_PROT_ALL). + * If maxprot is different than prot, we must set both explicitly. + */ + rv = KERN_SUCCESS; + if (maxprot != VM_PROT_ALL) + rv = vm_map_protect(map, *addr, *addr+size, maxprot, TRUE); + if (rv == KERN_SUCCESS && prot != maxprot) + rv = vm_map_protect(map, *addr, *addr+size, prot, FALSE); + if (rv != KERN_SUCCESS) { + (void) vm_deallocate(map, *addr, size); + goto out; + } + /* + * Shared memory is also shared with children. + */ + if (flags & MAP_SHARED) { + rv = vm_map_inherit(map, *addr, *addr+size, VM_INHERIT_SHARE); + if (rv != KERN_SUCCESS) { + (void) vm_deallocate(map, *addr, size); + goto out; + } + } +out: +#ifdef DEBUG + if (mmapdebug & MDB_MAPIT) + printf("vm_mmap: rv %d\n", rv); +#endif + switch (rv) { + case KERN_SUCCESS: + return (0); + case KERN_INVALID_ADDRESS: + case KERN_NO_SPACE: + return (ENOMEM); + case KERN_PROTECTION_FAILURE: + return (EACCES); + default: + return (EINVAL); + } +} diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c new file mode 100644 index 00000000000..d11fa8be014 --- /dev/null +++ b/sys/vm/vm_object.c @@ -0,0 +1,1436 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_object.c 8.5 (Berkeley) 3/22/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. 
+ * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Virtual memory object module. + */ + +#include +#include +#include + +#include +#include + +/* + * Virtual memory objects maintain the actual data + * associated with allocated virtual memory. A given + * page of memory exists within exactly one object. + * + * An object is only deallocated when all "references" + * are given up. Only one "reference" to a given + * region of an object should be writeable. + * + * Associated with each object is a list of all resident + * memory pages belonging to that object; this list is + * maintained by the "vm_page" module, and locked by the object's + * lock. + * + * Each object also records a "pager" routine which is + * used to retrieve (and store) pages to the proper backing + * storage. In addition, objects may be backed by other + * objects from which they were virtual-copied. 
+ * + * The only items within the object structure which are + * modified after time of creation are: + * reference count locked by object's lock + * pager routine locked by object's lock + * + */ + +struct vm_object kernel_object_store; +struct vm_object kmem_object_store; + +#define VM_OBJECT_HASH_COUNT 157 + +int vm_cache_max = 100; /* can patch if necessary */ +struct vm_object_hash_head vm_object_hashtable[VM_OBJECT_HASH_COUNT]; + +long object_collapses = 0; +long object_bypasses = 0; + +static void _vm_object_allocate __P((vm_size_t, vm_object_t)); + +/* + * vm_object_init: + * + * Initialize the VM objects module. + */ +void vm_object_init(size) + vm_size_t size; +{ + register int i; + + TAILQ_INIT(&vm_object_cached_list); + TAILQ_INIT(&vm_object_list); + vm_object_count = 0; + simple_lock_init(&vm_cache_lock); + simple_lock_init(&vm_object_list_lock); + + for (i = 0; i < VM_OBJECT_HASH_COUNT; i++) + TAILQ_INIT(&vm_object_hashtable[i]); + + kernel_object = &kernel_object_store; + _vm_object_allocate(size, kernel_object); + + kmem_object = &kmem_object_store; + _vm_object_allocate(VM_KMEM_SIZE + VM_MBUF_SIZE, kmem_object); +} + +/* + * vm_object_allocate: + * + * Returns a new object with the given size. + */ + +vm_object_t vm_object_allocate(size) + vm_size_t size; +{ + register vm_object_t result; + + result = (vm_object_t) + malloc((u_long)sizeof *result, M_VMOBJ, M_WAITOK); + + _vm_object_allocate(size, result); + + return(result); +} + +static void +_vm_object_allocate(size, object) + vm_size_t size; + register vm_object_t object; +{ + TAILQ_INIT(&object->memq); + vm_object_lock_init(object); + object->ref_count = 1; + object->resident_page_count = 0; + object->size = size; + object->flags = OBJ_INTERNAL; /* vm_allocate_with_pager will reset */ + object->paging_in_progress = 0; + object->copy = NULL; + + /* + * Object starts out read-write, with no pager. 
+ */ + + object->pager = NULL; + object->paging_offset = 0; + object->shadow = NULL; + object->shadow_offset = (vm_offset_t) 0; + + simple_lock(&vm_object_list_lock); + TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); + vm_object_count++; + cnt.v_nzfod += atop(size); + simple_unlock(&vm_object_list_lock); +} + +/* + * vm_object_reference: + * + * Gets another reference to the given object. + */ +void vm_object_reference(object) + register vm_object_t object; +{ + if (object == NULL) + return; + + vm_object_lock(object); + object->ref_count++; + vm_object_unlock(object); +} + +/* + * vm_object_deallocate: + * + * Release a reference to the specified object, + * gained either through a vm_object_allocate + * or a vm_object_reference call. When all references + * are gone, storage associated with this object + * may be relinquished. + * + * No object may be locked. + */ +void vm_object_deallocate(object) + register vm_object_t object; +{ + vm_object_t temp; + + while (object != NULL) { + + /* + * The cache holds a reference (uncounted) to + * the object; we must lock it before removing + * the object. + */ + + vm_object_cache_lock(); + + /* + * Lose the reference + */ + vm_object_lock(object); + if (--(object->ref_count) != 0) { + + /* + * If there are still references, then + * we are done. + */ + vm_object_unlock(object); + vm_object_cache_unlock(); + return; + } + + /* + * See if this object can persist. If so, enter + * it in the cache, then deactivate all of its + * pages. + */ + + if (object->flags & OBJ_CANPERSIST) { + + TAILQ_INSERT_TAIL(&vm_object_cached_list, object, + cached_list); + vm_object_cached++; + vm_object_cache_unlock(); + + vm_object_deactivate_pages(object); + vm_object_unlock(object); + + vm_object_cache_trim(); + return; + } + + /* + * Make sure no one can look us up now. 
+ */ + vm_object_remove(object->pager); + vm_object_cache_unlock(); + + temp = object->shadow; + vm_object_terminate(object); + /* unlocks and deallocates object */ + object = temp; + } +} + + +/* + * vm_object_terminate actually destroys the specified object, freeing + * up all previously used resources. + * + * The object must be locked. + */ +void vm_object_terminate(object) + register vm_object_t object; +{ + register vm_page_t p; + vm_object_t shadow_object; + + /* + * Detach the object from its shadow if we are the shadow's + * copy. + */ + if ((shadow_object = object->shadow) != NULL) { + vm_object_lock(shadow_object); + if (shadow_object->copy == object) + shadow_object->copy = NULL; +#if 0 + else if (shadow_object->copy != NULL) + panic("vm_object_terminate: copy/shadow inconsistency"); +#endif + vm_object_unlock(shadow_object); + } + + /* + * Wait until the pageout daemon is through with the object. + */ + while (object->paging_in_progress) { + vm_object_sleep((int)object, object, FALSE); + vm_object_lock(object); + } + + /* + * If not an internal object clean all the pages, removing them + * from paging queues as we go. + * + * XXX need to do something in the event of a cleaning error. + */ + if ((object->flags & OBJ_INTERNAL) == 0) { + (void) vm_object_page_clean(object, 0, 0, TRUE, TRUE); + vm_object_unlock(object); + } + + /* + * Now free the pages. + * For internal objects, this also removes them from paging queues. + */ + while ((p = object->memq.tqh_first) != NULL) { + VM_PAGE_CHECK(p); + vm_page_lock_queues(); + vm_page_free(p); + cnt.v_pfree++; + vm_page_unlock_queues(); + } + if ((object->flags & OBJ_INTERNAL) == 0) + vm_object_unlock(object); + + /* + * Let the pager know object is dead. 
+ */ + if (object->pager != NULL) + vm_pager_deallocate(object->pager); + + simple_lock(&vm_object_list_lock); + TAILQ_REMOVE(&vm_object_list, object, object_list); + vm_object_count--; + simple_unlock(&vm_object_list_lock); + + /* + * Free the space for the object. + */ + free((caddr_t)object, M_VMOBJ); +} + +/* + * vm_object_page_clean + * + * Clean all dirty pages in the specified range of object. + * If syncio is TRUE, page cleaning is done synchronously. + * If de_queue is TRUE, pages are removed from any paging queue + * they were on, otherwise they are left on whatever queue they + * were on before the cleaning operation began. + * + * Odd semantics: if start == end, we clean everything. + * + * The object must be locked. + * + * Returns TRUE if all was well, FALSE if there was a pager error + * somewhere. We attempt to clean (and dequeue) all pages regardless + * of where an error occurs. + */ +boolean_t +vm_object_page_clean(object, start, end, syncio, de_queue) + register vm_object_t object; + register vm_offset_t start; + register vm_offset_t end; + boolean_t syncio; + boolean_t de_queue; +{ + register vm_page_t p; + int onqueue; + boolean_t noerror = TRUE; + + if (object == NULL) + return (TRUE); + + /* + * If it is an internal object and there is no pager, attempt to + * allocate one. Note that vm_object_collapse may relocate one + * from a collapsed object so we must recheck afterward. + */ + if ((object->flags & OBJ_INTERNAL) && object->pager == NULL) { + vm_object_collapse(object); + if (object->pager == NULL) { + vm_pager_t pager; + + vm_object_unlock(object); + pager = vm_pager_allocate(PG_DFLT, (caddr_t)0, + object->size, VM_PROT_ALL, + (vm_offset_t)0); + if (pager) + vm_object_setpager(object, pager, 0, FALSE); + vm_object_lock(object); + } + } + if (object->pager == NULL) + return (FALSE); + +again: + /* + * Wait until the pageout daemon is through with the object. 
+ */ + while (object->paging_in_progress) { + vm_object_sleep((int)object, object, FALSE); + vm_object_lock(object); + } + /* + * Loop through the object page list cleaning as necessary. + */ + for (p = object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) { + if ((start == end || p->offset >= start && p->offset < end) && + !(p->flags & PG_FICTITIOUS)) { + if ((p->flags & PG_CLEAN) && + pmap_is_modified(VM_PAGE_TO_PHYS(p))) + p->flags &= ~PG_CLEAN; + /* + * Remove the page from any paging queue. + * This needs to be done if either we have been + * explicitly asked to do so or it is about to + * be cleaned (see comment below). + */ + if (de_queue || !(p->flags & PG_CLEAN)) { + vm_page_lock_queues(); + if (p->flags & PG_ACTIVE) { + TAILQ_REMOVE(&vm_page_queue_active, + p, pageq); + p->flags &= ~PG_ACTIVE; + cnt.v_active_count--; + onqueue = 1; + } else if (p->flags & PG_INACTIVE) { + TAILQ_REMOVE(&vm_page_queue_inactive, + p, pageq); + p->flags &= ~PG_INACTIVE; + cnt.v_inactive_count--; + onqueue = -1; + } else + onqueue = 0; + vm_page_unlock_queues(); + } + /* + * To ensure the state of the page doesn't change + * during the clean operation we do two things. + * First we set the busy bit and write-protect all + * mappings to ensure that write accesses to the + * page block (in vm_fault). Second, we remove + * the page from any paging queue to foil the + * pageout daemon (vm_pageout_scan). + */ + pmap_page_protect(VM_PAGE_TO_PHYS(p), VM_PROT_READ); + if (!(p->flags & PG_CLEAN)) { + p->flags |= PG_BUSY; + object->paging_in_progress++; + vm_object_unlock(object); + /* + * XXX if put fails we mark the page as + * clean to avoid an infinite loop. + * Will loose changes to the page. 
+				 */
+				if (vm_pager_put(object->pager, p, syncio)) {
+					printf("%s: pager_put error\n",
+					       "vm_object_page_clean");
+					p->flags |= PG_CLEAN;
+					noerror = FALSE;
+				}
+				vm_object_lock(object);
+				object->paging_in_progress--;
+				if (!de_queue && onqueue) {
+					vm_page_lock_queues();
+					if (onqueue > 0)
+						vm_page_activate(p);
+					else
+						vm_page_deactivate(p);
+					vm_page_unlock_queues();
+				}
+				p->flags &= ~PG_BUSY;
+				PAGE_WAKEUP(p);
+				goto again;
+			}
+		}
+	}
+	return (noerror);
+}
+
+/*
+ *	vm_object_deactivate_pages
+ *
+ *	Deactivate all pages in the specified object.  (Keep its pages
+ *	in memory even though it is no longer referenced.)
+ *
+ *	The object must be locked.
+ */
+void
+vm_object_deactivate_pages(object)
+	register vm_object_t	object;
+{
+	register vm_page_t	p, next;
+
+	for (p = object->memq.tqh_first; p != NULL; p = next) {
+		next = p->listq.tqe_next;
+		vm_page_lock_queues();
+		vm_page_deactivate(p);
+		vm_page_unlock_queues();
+	}
+}
+
+/*
+ *	Trim the object cache to size.
+ *
+ *	While more than vm_cache_max objects are cached, take the object
+ *	at the head of the cached list, gain a reference to it with
+ *	vm_object_lookup and hand it to pager_cache(object, FALSE).
+ *	The cache lock is dropped around the lookup and retaken before
+ *	the count is tested again.
+ */
+void
+vm_object_cache_trim()
+{
+	register vm_object_t object;
+
+	vm_object_cache_lock();
+	while (vm_object_cached > vm_cache_max) {
+		object = vm_object_cached_list.tqh_first;
+		vm_object_cache_unlock();
+
+		/* The panic text must name this routine, not another one. */
+		if (object != vm_object_lookup(object->pager))
+			panic("vm_object_cache_trim: I'm sooo confused.");
+
+		pager_cache(object, FALSE);
+
+		vm_object_cache_lock();
+	}
+	vm_object_cache_unlock();
+}
+
+/*
+ *	vm_object_pmap_copy:
+ *
+ *	Makes all physical pages in the specified
+ *	object range copy-on-write.  No writeable
+ *	references to these pages should remain.
+ *
+ *	The object must *not* be locked.
+ */
+void vm_object_pmap_copy(object, start, end)
+	register vm_object_t	object;
+	register vm_offset_t	start;
+	register vm_offset_t	end;
+{
+	register vm_page_t	pg;
+
+	/* A null object has no pages to protect. */
+	if (object == NULL)
+		return;
+
+	vm_object_lock(object);
+	for (pg = object->memq.tqh_first; pg != NULL;
+	     pg = pg->listq.tqe_next) {
+		/* Skip pages outside the half-open range [start, end). */
+		if (pg->offset < start || pg->offset >= end)
+			continue;
+
+		/* Restrict mappings to read access and flag the page. */
+		pmap_page_protect(VM_PAGE_TO_PHYS(pg), VM_PROT_READ);
+		pg->flags |= PG_COPYONWRITE;
+	}
+	vm_object_unlock(object);
+}
+
+/*
+ *	vm_object_pmap_remove:
+ *
+ *	Walks the object's resident pages and, for every page whose
+ *	offset lies in [start, end), calls pmap_page_protect with
+ *	VM_PROT_NONE so that no physical map refers to it any more.
+ *
+ *	The object must *not* be locked; it is locked here for the
+ *	duration of the walk.
+ */
+void vm_object_pmap_remove(object, start, end)
+	register vm_object_t	object;
+	register vm_offset_t	start;
+	register vm_offset_t	end;
+{
+	register vm_page_t	pg;
+
+	if (object == NULL)
+		return;
+
+	vm_object_lock(object);
+	pg = object->memq.tqh_first;
+	while (pg != NULL) {
+		if (pg->offset >= start && pg->offset < end)
+			pmap_page_protect(VM_PAGE_TO_PHYS(pg), VM_PROT_NONE);
+		pg = pg->listq.tqe_next;
+	}
+	vm_object_unlock(object);
+}
+
+/*
+ *	vm_object_copy:
+ *
+ *	Create a new object which is a copy of an existing
+ *	object, and mark all of the pages in the existing
+ *	object 'copy-on-write'.  The new object has one reference.
+ *	Returns the new object.
+ *
+ *	May defer the copy until later if the object is not backed
+ *	up by a non-default pager.
+ */ +void vm_object_copy(src_object, src_offset, size, + dst_object, dst_offset, src_needs_copy) + register vm_object_t src_object; + vm_offset_t src_offset; + vm_size_t size; + vm_object_t *dst_object; /* OUT */ + vm_offset_t *dst_offset; /* OUT */ + boolean_t *src_needs_copy; /* OUT */ +{ + register vm_object_t new_copy; + register vm_object_t old_copy; + vm_offset_t new_start, new_end; + + register vm_page_t p; + + if (src_object == NULL) { + /* + * Nothing to copy + */ + *dst_object = NULL; + *dst_offset = 0; + *src_needs_copy = FALSE; + return; + } + + /* + * If the object's pager is null_pager or the + * default pager, we don't have to make a copy + * of it. Instead, we set the needs copy flag and + * make a shadow later. + */ + + vm_object_lock(src_object); + if (src_object->pager == NULL || + (src_object->flags & OBJ_INTERNAL)) { + + /* + * Make another reference to the object + */ + src_object->ref_count++; + + /* + * Mark all of the pages copy-on-write. + */ + for (p = src_object->memq.tqh_first; p; p = p->listq.tqe_next) + if (src_offset <= p->offset && + p->offset < src_offset + size) + p->flags |= PG_COPYONWRITE; + vm_object_unlock(src_object); + + *dst_object = src_object; + *dst_offset = src_offset; + + /* + * Must make a shadow when write is desired + */ + *src_needs_copy = TRUE; + return; + } + + /* + * Try to collapse the object before copying it. + */ + vm_object_collapse(src_object); + + /* + * If the object has a pager, the pager wants to + * see all of the changes. We need a copy-object + * for the changed pages. + * + * If there is a copy-object, and it is empty, + * no changes have been made to the object since the + * copy-object was made. We can use the same copy- + * object. + */ + + Retry1: + old_copy = src_object->copy; + if (old_copy != NULL) { + /* + * Try to get the locks (out of order) + */ + if (!vm_object_lock_try(old_copy)) { + vm_object_unlock(src_object); + + /* should spin a bit here... 
*/ + vm_object_lock(src_object); + goto Retry1; + } + + if (old_copy->resident_page_count == 0 && + old_copy->pager == NULL) { + /* + * Return another reference to + * the existing copy-object. + */ + old_copy->ref_count++; + vm_object_unlock(old_copy); + vm_object_unlock(src_object); + *dst_object = old_copy; + *dst_offset = src_offset; + *src_needs_copy = FALSE; + return; + } + vm_object_unlock(old_copy); + } + vm_object_unlock(src_object); + + /* + * If the object has a pager, the pager wants + * to see all of the changes. We must make + * a copy-object and put the changed pages there. + * + * The copy-object is always made large enough to + * completely shadow the original object, since + * it may have several users who want to shadow + * the original object at different points. + */ + + new_copy = vm_object_allocate(src_object->size); + + Retry2: + vm_object_lock(src_object); + /* + * Copy object may have changed while we were unlocked + */ + old_copy = src_object->copy; + if (old_copy != NULL) { + /* + * Try to get the locks (out of order) + */ + if (!vm_object_lock_try(old_copy)) { + vm_object_unlock(src_object); + goto Retry2; + } + + /* + * Consistency check + */ + if (old_copy->shadow != src_object || + old_copy->shadow_offset != (vm_offset_t) 0) + panic("vm_object_copy: copy/shadow inconsistency"); + + /* + * Make the old copy-object shadow the new one. + * It will receive no more pages from the original + * object. + */ + + src_object->ref_count--; /* remove ref. from old_copy */ + old_copy->shadow = new_copy; + new_copy->ref_count++; /* locking not needed - we + have the only pointer */ + vm_object_unlock(old_copy); /* done with old_copy */ + } + + new_start = (vm_offset_t) 0; /* always shadow original at 0 */ + new_end = (vm_offset_t) new_copy->size; /* for the whole object */ + + /* + * Point the new copy at the existing object. 
+ */ + + new_copy->shadow = src_object; + new_copy->shadow_offset = new_start; + src_object->ref_count++; + src_object->copy = new_copy; + + /* + * Mark all the affected pages of the existing object + * copy-on-write. + */ + for (p = src_object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) + if ((new_start <= p->offset) && (p->offset < new_end)) + p->flags |= PG_COPYONWRITE; + + vm_object_unlock(src_object); + + *dst_object = new_copy; + *dst_offset = src_offset - new_start; + *src_needs_copy = FALSE; +} + +/* + * vm_object_shadow: + * + * Create a new object which is backed by the + * specified existing object range. The source + * object reference is deallocated. + * + * The new object and offset into that object + * are returned in the source parameters. + */ + +void vm_object_shadow(object, offset, length) + vm_object_t *object; /* IN/OUT */ + vm_offset_t *offset; /* IN/OUT */ + vm_size_t length; +{ + register vm_object_t source; + register vm_object_t result; + + source = *object; + + /* + * Allocate a new object with the given length + */ + + if ((result = vm_object_allocate(length)) == NULL) + panic("vm_object_shadow: no object for shadowing"); + + /* + * The new object shadows the source object, adding + * a reference to it. Our caller changes his reference + * to point to the new object, removing a reference to + * the source object. Net result: no change of reference + * count. + */ + result->shadow = source; + + /* + * Store the offset into the source object, + * and fix up the offset into the new object. + */ + + result->shadow_offset = *offset; + + /* + * Return the new things + */ + + *offset = 0; + *object = result; +} + +/* + * Set the specified object's pager to the specified pager. + */ + +void vm_object_setpager(object, pager, paging_offset, + read_only) + vm_object_t object; + vm_pager_t pager; + vm_offset_t paging_offset; + boolean_t read_only; +{ +#ifdef lint + read_only++; /* No longer used */ +#endif + + vm_object_lock(object); /* XXX ? 
*/
+	object->pager = pager;
+	object->paging_offset = paging_offset;
+	vm_object_unlock(object);			/* XXX ? */
+}
+
+/*
+ *	vm_object_hash maps a pager handle to a bucket index in
+ *	vm_object_hashtable.  The argument is parenthesized inside the
+ *	cast so that any pointer-valued expression may be passed.
+ */
+
+#define vm_object_hash(pager) \
+	(((unsigned)(pager))%VM_OBJECT_HASH_COUNT)
+
+/*
+ *	vm_object_lookup looks in the object cache for an object with the
+ *	specified pager and paging id.
+ */
+
+vm_object_t vm_object_lookup(pager)
+	vm_pager_t	pager;
+{
+	register vm_object_hash_entry_t	entry;
+	vm_object_t			object;
+
+	vm_object_cache_lock();
+
+	for (entry = vm_object_hashtable[vm_object_hash(pager)].tqh_first;
+	     entry != NULL;
+	     entry = entry->hash_links.tqe_next) {
+		object = entry->object;
+		if (object->pager == pager) {
+			vm_object_lock(object);
+			if (object->ref_count == 0) {
+				TAILQ_REMOVE(&vm_object_cached_list, object,
+					cached_list);
+				vm_object_cached--;
+			}
+			object->ref_count++;
+			vm_object_unlock(object);
+			vm_object_cache_unlock();
+			return(object);
+		}
+	}
+
+	vm_object_cache_unlock();
+	return(NULL);
+}
+
+/*
+ *	vm_object_enter enters the specified object/pager/id into
+ *	the hash table.
+ */
+
+void vm_object_enter(object, pager)
+	vm_object_t	object;
+	vm_pager_t	pager;
+{
+	struct vm_object_hash_head	*bucket;
+	register vm_object_hash_entry_t	entry;
+
+	/*
+	 *	We don't cache null objects, and we can't cache
+	 *	objects with the null pager.
+	 */
+
+	if (object == NULL)
+		return;
+	if (pager == NULL)
+		return;
+
+	bucket = &vm_object_hashtable[vm_object_hash(pager)];
+	entry = (vm_object_hash_entry_t)
+		malloc((u_long)sizeof *entry, M_VMOBJHASH, M_WAITOK);
+	entry->object = object;
+	object->flags |= OBJ_CANPERSIST;
+
+	vm_object_cache_lock();
+	TAILQ_INSERT_TAIL(bucket, entry, hash_links);
+	vm_object_cache_unlock();
+}
+
+/*
+ *	vm_object_remove:
+ *
+ *	Remove the pager from the hash table.
+ *	Note:  This assumes that the object cache
+ *	is locked.  XXX this should be fixed
+ *	by reorganizing vm_object_deallocate.
+ */
+void
+vm_object_remove(pager)
+	register vm_pager_t	pager;
+{
+	struct vm_object_hash_head	*chain;
+	register vm_object_hash_entry_t	he;
+
+	chain = &vm_object_hashtable[vm_object_hash(pager)];
+
+	/*
+	 * Walk the bucket; unlink and free the first entry whose object
+	 * is backed by this pager, then stop.
+	 */
+	he = chain->tqh_first;
+	while (he != NULL) {
+		if (he->object->pager == pager) {
+			TAILQ_REMOVE(chain, he, hash_links);
+			free((caddr_t)he, M_VMOBJHASH);
+			return;
+		}
+		he = he->hash_links.tqe_next;
+	}
+}
+
+/*
+ *	vm_object_cache_clear empties the object cache.
+ *
+ *	Objects are taken from the head of the cached list one at a
+ *	time until the list is empty.
+ */
+
+void vm_object_cache_clear()
+{
+	register vm_object_t	victim;
+
+	vm_object_cache_lock();
+	for (;;) {
+		victim = vm_object_cached_list.tqh_first;
+		if (victim == NULL)
+			break;
+		vm_object_cache_unlock();
+
+		/*
+		 * The reference must come from vm_object_lookup rather
+		 * than vm_object_reference: lookup is where an object
+		 * gets taken off the cached list.
+		 */
+		if (vm_object_lookup(victim->pager) != victim)
+			panic("vm_object_cache_clear: I'm sooo confused.");
+		pager_cache(victim, FALSE);
+
+		vm_object_cache_lock();
+	}
+	vm_object_cache_unlock();
+}
+
+boolean_t	vm_object_collapse_allowed = TRUE;
+/*
+ *	vm_object_collapse:
+ *
+ *	Collapse an object with the object backing it.
+ *	Pages in the backing object are moved into the
+ *	parent, and the backing object is deallocated.
+ *
+ *	Requires that the object be locked and the page
+ *	queues be unlocked.
+ * + */ +void vm_object_collapse(object) + register vm_object_t object; + +{ + register vm_object_t backing_object; + register vm_offset_t backing_offset; + register vm_size_t size; + register vm_offset_t new_offset; + register vm_page_t p, pp; + + if (!vm_object_collapse_allowed) + return; + + while (TRUE) { + /* + * Verify that the conditions are right for collapse: + * + * The object exists and no pages in it are currently + * being paged out (or have ever been paged out). + */ + if (object == NULL || + object->paging_in_progress != 0 || + object->pager != NULL) + return; + + /* + * There is a backing object, and + */ + + if ((backing_object = object->shadow) == NULL) + return; + + vm_object_lock(backing_object); + /* + * ... + * The backing object is not read_only, + * and no pages in the backing object are + * currently being paged out. + * The backing object is internal. + */ + + if ((backing_object->flags & OBJ_INTERNAL) == 0 || + backing_object->paging_in_progress != 0) { + vm_object_unlock(backing_object); + return; + } + + /* + * The backing object can't be a copy-object: + * the shadow_offset for the copy-object must stay + * as 0. Furthermore (for the 'we have all the + * pages' case), if we bypass backing_object and + * just shadow the next object in the chain, old + * pages from that object would then have to be copied + * BOTH into the (former) backing_object and into the + * parent object. + */ + if (backing_object->shadow != NULL && + backing_object->shadow->copy != NULL) { + vm_object_unlock(backing_object); + return; + } + + /* + * We know that we can either collapse the backing + * object (if the parent is the only reference to + * it) or (perhaps) remove the parent's reference + * to it. + */ + + backing_offset = object->shadow_offset; + size = object->size; + + /* + * If there is exactly one reference to the backing + * object, we can collapse it into the parent. 
+ */ + + if (backing_object->ref_count == 1) { + + /* + * We can collapse the backing object. + * + * Move all in-memory pages from backing_object + * to the parent. Pages that have been paged out + * will be overwritten by any of the parent's + * pages that shadow them. + */ + + while ((p = backing_object->memq.tqh_first) != NULL) { + new_offset = (p->offset - backing_offset); + + /* + * If the parent has a page here, or if + * this page falls outside the parent, + * dispose of it. + * + * Otherwise, move it as planned. + */ + + if (p->offset < backing_offset || + new_offset >= size) { + vm_page_lock_queues(); + vm_page_free(p); + vm_page_unlock_queues(); + } else { + pp = vm_page_lookup(object, new_offset); + if (pp != NULL && !(pp->flags & PG_FAKE)) { + vm_page_lock_queues(); + vm_page_free(p); + vm_page_unlock_queues(); + } + else { + if (pp) { + /* may be someone waiting for it */ + PAGE_WAKEUP(pp); + vm_page_lock_queues(); + vm_page_free(pp); + vm_page_unlock_queues(); + } + vm_page_rename(p, object, new_offset); + } + } + } + + /* + * Move the pager from backing_object to object. + * + * XXX We're only using part of the paging space + * for keeps now... we ought to discard the + * unused portion. + */ + + if (backing_object->pager) { + object->pager = backing_object->pager; + object->paging_offset = backing_offset + + backing_object->paging_offset; + backing_object->pager = NULL; + } + + /* + * Object now shadows whatever backing_object did. + * Note that the reference to backing_object->shadow + * moves from within backing_object to within object. + */ + + object->shadow = backing_object->shadow; + object->shadow_offset += backing_object->shadow_offset; + if (object->shadow != NULL && + object->shadow->copy != NULL) { + panic("vm_object_collapse: we collapsed a copy-object!"); + } + /* + * Discard backing_object. 
+ * + * Since the backing object has no pages, no + * pager left, and no object references within it, + * all that is necessary is to dispose of it. + */ + + vm_object_unlock(backing_object); + + simple_lock(&vm_object_list_lock); + TAILQ_REMOVE(&vm_object_list, backing_object, + object_list); + vm_object_count--; + simple_unlock(&vm_object_list_lock); + + free((caddr_t)backing_object, M_VMOBJ); + + object_collapses++; + } + else { + /* + * If all of the pages in the backing object are + * shadowed by the parent object, the parent + * object no longer has to shadow the backing + * object; it can shadow the next one in the + * chain. + * + * The backing object must not be paged out - we'd + * have to check all of the paged-out pages, as + * well. + */ + + if (backing_object->pager != NULL) { + vm_object_unlock(backing_object); + return; + } + + /* + * Should have a check for a 'small' number + * of pages here. + */ + + for (p = backing_object->memq.tqh_first; + p != NULL; + p = p->listq.tqe_next) { + new_offset = (p->offset - backing_offset); + + /* + * If the parent has a page here, or if + * this page falls outside the parent, + * keep going. + * + * Otherwise, the backing_object must be + * left in the chain. + */ + + if (p->offset >= backing_offset && + new_offset < size && + ((pp = vm_page_lookup(object, new_offset)) + == NULL || + (pp->flags & PG_FAKE))) { + /* + * Page still needed. + * Can't go any further. + */ + vm_object_unlock(backing_object); + return; + } + } + + /* + * Make the parent shadow the next object + * in the chain. Deallocating backing_object + * will not remove it, since its reference + * count is at least 2. + */ + + object->shadow = backing_object->shadow; + vm_object_reference(object->shadow); + object->shadow_offset += backing_object->shadow_offset; + + /* + * Backing object might have had a copy pointer + * to us. If it did, clear it. 
+			 */
+			if (backing_object->copy == object) {
+				backing_object->copy = NULL;
+			}
+
+			/*	Drop the reference count on backing_object.
+			 *	Since its ref_count was at least 2, it
+			 *	will not vanish; so we don't need to call
+			 *	vm_object_deallocate.
+			 */
+			backing_object->ref_count--;
+			vm_object_unlock(backing_object);
+
+			object_bypasses ++;
+
+		}
+
+		/*
+		 *	Try again with this object's new backing object.
+		 */
+	}
+}
+
+/*
+ *	vm_object_page_remove: [internal]
+ *
+ *	Frees every resident page of the object whose offset lies in
+ *	[start, end), after revoking all of its mappings.
+ *
+ *	The object must be locked.
+ */
+void vm_object_page_remove(object, start, end)
+	register vm_object_t	object;
+	register vm_offset_t	start;
+	register vm_offset_t	end;
+{
+	register vm_page_t	pg, following;
+
+	if (object == NULL)
+		return;
+
+	pg = object->memq.tqh_first;
+	while (pg != NULL) {
+		/* Remember the successor before pg is possibly freed. */
+		following = pg->listq.tqe_next;
+
+		if (pg->offset >= start && pg->offset < end) {
+			pmap_page_protect(VM_PAGE_TO_PHYS(pg), VM_PROT_NONE);
+			vm_page_lock_queues();
+			vm_page_free(pg);
+			vm_page_unlock_queues();
+		}
+		pg = following;
+	}
+}
+
+/*
+ *	Routine:	vm_object_coalesce
+ *	Function:	Coalesces two objects backing up adjoining
+ *			regions of memory into a single object.
+ *
+ *	returns TRUE if objects were combined.
+ *
+ *	NOTE:	Only works at the moment if the second object is NULL -
+ *		if it's not, which object do we lock first?
+ *
+ *	Parameters:
+ *		prev_object	First object to coalesce
+ *		prev_offset	Offset into prev_object
+ *		next_object	Second object to coalesce
+ *		next_offset	Offset into next_object
+ *
+ *		prev_size	Size of reference to prev_object
+ *		next_size	Size of reference to next_object
+ *
+ *	Conditions:
+ *	The object must *not* be locked.
+ */ +boolean_t vm_object_coalesce(prev_object, next_object, + prev_offset, next_offset, + prev_size, next_size) + + register vm_object_t prev_object; + vm_object_t next_object; + vm_offset_t prev_offset, next_offset; + vm_size_t prev_size, next_size; +{ + vm_size_t newsize; + +#ifdef lint + next_offset++; +#endif + + if (next_object != NULL) { + return(FALSE); + } + + if (prev_object == NULL) { + return(TRUE); + } + + vm_object_lock(prev_object); + + /* + * Try to collapse the object first + */ + vm_object_collapse(prev_object); + + /* + * Can't coalesce if: + * . more than one reference + * . paged out + * . shadows another object + * . has a copy elsewhere + * (any of which mean that the pages not mapped to + * prev_entry may be in use anyway) + */ + + if (prev_object->ref_count > 1 || + prev_object->pager != NULL || + prev_object->shadow != NULL || + prev_object->copy != NULL) { + vm_object_unlock(prev_object); + return(FALSE); + } + + /* + * Remove any pages that may still be in the object from + * a previous deallocation. + */ + + vm_object_page_remove(prev_object, + prev_offset + prev_size, + prev_offset + prev_size + next_size); + + /* + * Extend the object if necessary. 
+	 */
+	newsize = prev_offset + prev_size + next_size;
+	if (newsize > prev_object->size)
+		prev_object->size = newsize;
+
+	vm_object_unlock(prev_object);
+	return(TRUE);
+}
+
+/*
+ *	vm_object_print:	[ debug ]
+ *
+ *	Prints a one-object summary (size, resident and reference counts,
+ *	pager, shadow and cache links).  When "full" is set it also lists
+ *	every resident page as (offset, physical address), six per row.
+ *	A null object prints nothing.
+ */
+void vm_object_print(object, full)
+	vm_object_t	object;
+	boolean_t	full;
+{
+	register vm_page_t	p;
+	extern int indent;	/* explicit type: implicit int is not valid C */
+
+	register int count;
+
+	if (object == NULL)
+		return;
+
+	iprintf("Object 0x%x: size=0x%x, res=%d, ref=%d, ",
+		(int) object, (int) object->size,
+		object->resident_page_count, object->ref_count);
+	printf("pager=0x%x+0x%x, shadow=(0x%x)+0x%x\n",
+	       (int) object->pager, (int) object->paging_offset,
+	       (int) object->shadow, (int) object->shadow_offset);
+	/* Cast like the calls above: %x takes an int, not a pointer. */
+	printf("cache: next=0x%x, prev=0x%x\n",
+	       (int) object->cached_list.tqe_next,
+	       (int) object->cached_list.tqe_prev);
+
+	if (!full)
+		return;
+
+	indent += 2;
+	count = 0;
+	for (p = object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) {
+		if (count == 0)
+			iprintf("memory:=");
+		else if (count == 6) {
+			/* Row is full: start a continuation line. */
+			printf("\n");
+			iprintf(" ...");
+			count = 0;
+		} else
+			printf(",");
+		count++;
+
+		printf("(off=0x%x,page=0x%x)",
+		       (int) p->offset, (int) VM_PAGE_TO_PHYS(p));
+	}
+	if (count != 0)
+		printf("\n");
+	indent -= 2;
+}
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
new file mode 100644
index 00000000000..5e220acd47c
--- /dev/null
+++ b/sys/vm/vm_object.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * The Mach Operating System project at Carnegie-Mellon University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_object.h 8.3 (Berkeley) 1/12/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. 
+ * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Virtual memory object module definitions. + */ + +#ifndef _VM_OBJECT_ +#define _VM_OBJECT_ + +#include +#include + +/* + * Types defined: + * + * vm_object_t Virtual memory object. + */ + +struct vm_object { + struct pglist memq; /* Resident memory */ + TAILQ_ENTRY(vm_object) object_list; /* list of all objects */ + u_short flags; /* see below */ + u_short paging_in_progress; /* Paging (in or out) so + don't collapse or destroy */ + simple_lock_data_t Lock; /* Synchronization */ + int ref_count; /* How many refs?? 
*/ + vm_size_t size; /* Object size */ + int resident_page_count; + /* number of resident pages */ + struct vm_object *copy; /* Object that holds copies of + my changed pages */ + vm_pager_t pager; /* Where to get data */ + vm_offset_t paging_offset; /* Offset into paging space */ + struct vm_object *shadow; /* My shadow */ + vm_offset_t shadow_offset; /* Offset in shadow */ + TAILQ_ENTRY(vm_object) cached_list; /* for persistence */ +}; +/* + * Flags + */ +#define OBJ_CANPERSIST 0x0001 /* allow to persist */ +#define OBJ_INTERNAL 0x0002 /* internally created object */ +#define OBJ_ACTIVE 0x0004 /* used to mark active objects */ + +TAILQ_HEAD(vm_object_hash_head, vm_object_hash_entry); + +struct vm_object_hash_entry { + TAILQ_ENTRY(vm_object_hash_entry) hash_links; /* hash chain links */ + vm_object_t object; /* object represened */ +}; + +typedef struct vm_object_hash_entry *vm_object_hash_entry_t; + +#ifdef KERNEL +TAILQ_HEAD(object_q, vm_object); + +struct object_q vm_object_cached_list; /* list of objects persisting */ +int vm_object_cached; /* size of cached list */ +simple_lock_data_t vm_cache_lock; /* lock for object cache */ + +struct object_q vm_object_list; /* list of allocated objects */ +long vm_object_count; /* count of all objects */ +simple_lock_data_t vm_object_list_lock; + /* lock for object list and count */ + +vm_object_t kernel_object; /* the single kernel object */ +vm_object_t kmem_object; + +#define vm_object_cache_lock() simple_lock(&vm_cache_lock) +#define vm_object_cache_unlock() simple_unlock(&vm_cache_lock) +#endif /* KERNEL */ + +#define vm_object_lock_init(object) simple_lock_init(&(object)->Lock) +#define vm_object_lock(object) simple_lock(&(object)->Lock) +#define vm_object_unlock(object) simple_unlock(&(object)->Lock) +#define vm_object_lock_try(object) simple_lock_try(&(object)->Lock) +#define vm_object_sleep(event, object, interruptible) \ + thread_sleep((event), &(object)->Lock, (interruptible)) + +#ifdef KERNEL +vm_object_t 
vm_object_allocate __P((vm_size_t));
+void		 vm_object_cache_clear __P((void));
+void		 vm_object_cache_trim __P((void));
+boolean_t	 vm_object_coalesce __P((vm_object_t, vm_object_t,
+		    vm_offset_t, vm_offset_t, vm_size_t, vm_size_t)); /* sizes are vm_size_t, as in the definition */
+void		 vm_object_collapse __P((vm_object_t));
+void		 vm_object_copy __P((vm_object_t, vm_offset_t, vm_size_t,
+		    vm_object_t *, vm_offset_t *, boolean_t *));
+void		 vm_object_deactivate_pages __P((vm_object_t));
+void		 vm_object_deallocate __P((vm_object_t));
+void		 vm_object_enter __P((vm_object_t, vm_pager_t));
+void		 vm_object_init __P((vm_size_t));
+vm_object_t	 vm_object_lookup __P((vm_pager_t));
+boolean_t	 vm_object_page_clean __P((vm_object_t,
+		    vm_offset_t, vm_offset_t, boolean_t, boolean_t));
+void		 vm_object_page_remove __P((vm_object_t,
+		    vm_offset_t, vm_offset_t));
+void		 vm_object_pmap_copy __P((vm_object_t,
+		    vm_offset_t, vm_offset_t));
+void		 vm_object_pmap_remove __P((vm_object_t,
+		    vm_offset_t, vm_offset_t));
+void		 vm_object_print __P((vm_object_t, boolean_t));
+void		 vm_object_reference __P((vm_object_t));
+void		 vm_object_remove __P((vm_pager_t));
+void		 vm_object_setpager __P((vm_object_t,
+		    vm_pager_t, vm_offset_t, boolean_t));
+void		 vm_object_shadow __P((vm_object_t *,
+		    vm_offset_t *, vm_size_t));
+void		 vm_object_terminate __P((vm_object_t));
+#endif
+#endif /* _VM_OBJECT_ */
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
new file mode 100644
index 00000000000..0cd9d875b69
--- /dev/null
+++ b/sys/vm/vm_page.c
@@ -0,0 +1,696 @@
+/*
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * The Mach Operating System project at Carnegie-Mellon University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_page.c 8.3 (Berkeley) 3/21/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. 
+ * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Resident memory management module. + */ + +#include +#include + +#include +#include +#include +#include + +/* + * Associated with each page of user-allocatable memory is a + * page structure. + */ + +struct pglist *vm_page_buckets; /* Array of buckets */ +int vm_page_bucket_count = 0; /* How big is array? */ +int vm_page_hash_mask; /* Mask for hash function (bucket count - 1) */ +simple_lock_data_t bucket_lock; /* lock for all buckets XXX */ + +struct pglist vm_page_queue_free; +struct pglist vm_page_queue_active; +struct pglist vm_page_queue_inactive; +simple_lock_data_t vm_page_queue_lock; +simple_lock_data_t vm_page_queue_free_lock; + +/* has physical page allocation been initialized? */ +boolean_t vm_page_startup_initialized; + +vm_page_t vm_page_array; +long first_page; +long last_page; +vm_offset_t first_phys_addr; +vm_offset_t last_phys_addr; +vm_size_t page_mask; +int page_shift; + +/* + * vm_set_page_size: + * + * Sets the page size, perhaps based upon the memory + * size.
Must be called before any use of page-size + * dependent functions. + * + * Sets page_shift and page_mask from cnt.v_page_size. + */ +void vm_set_page_size() +{ + + if (cnt.v_page_size == 0) + cnt.v_page_size = DEFAULT_PAGE_SIZE; + page_mask = cnt.v_page_size - 1; + if ((page_mask & cnt.v_page_size) != 0) + panic("vm_set_page_size: page size not a power of two"); + for (page_shift = 0; ; page_shift++) + if ((1 << page_shift) == cnt.v_page_size) + break; +} + + +/* + * vm_page_startup: + * + * Initializes the resident memory module. + * + * Allocates memory for the page cells, and + * for the object/offset-to-page hash table headers. + * Each page cell is initialized and placed on the free list; *end is truncated to a page boundary. + */ +void vm_page_startup(start, end) + vm_offset_t *start; + vm_offset_t *end; +{ + register vm_page_t m; + register struct pglist *bucket; + vm_size_t npages; + int i; + vm_offset_t pa; + extern vm_offset_t kentry_data; + extern vm_size_t kentry_data_size; + + + /* + * Initialize the locks + */ + + simple_lock_init(&vm_page_queue_free_lock); + simple_lock_init(&vm_page_queue_lock); + + /* + * Initialize the queue headers for the free queue, + * the active queue and the inactive queue. + */ + + TAILQ_INIT(&vm_page_queue_free); + TAILQ_INIT(&vm_page_queue_active); + TAILQ_INIT(&vm_page_queue_inactive); + + /* + * Calculate the number of hash table buckets. + * + * The number of buckets MUST BE a power of 2, and + * the actual value is the smallest power of 2 not less + * than the number of pages in [*start, *end); a nonzero preset vm_page_bucket_count is kept as is. + * + * Note: + * This computation can be tweaked if desired. + */ + + if (vm_page_bucket_count == 0) { + vm_page_bucket_count = 1; + while (vm_page_bucket_count < atop(*end - *start)) + vm_page_bucket_count <<= 1; + } + + vm_page_hash_mask = vm_page_bucket_count - 1; + + /* + * Allocate (and initialize) the hash table buckets.
+ */ + vm_page_buckets = (struct pglist *) + pmap_bootstrap_alloc(vm_page_bucket_count * sizeof(struct pglist)); + bucket = vm_page_buckets; + + for (i = vm_page_bucket_count; i--;) { + TAILQ_INIT(bucket); + bucket++; + } + + simple_lock_init(&bucket_lock); + + /* + * Truncate the remainder of physical memory to our page size. + */ + + *end = trunc_page(*end); + + /* + * Pre-allocate maps and map entries that cannot be dynamically + * allocated via malloc(). The maps include the kernel_map and + * kmem_map which must be initialized before malloc() will + * work (obviously). Also could include pager maps which would + * be allocated before kmeminit. + * + * Allow some kernel map entries... this should be plenty + * since people shouldn't be cluttering up the kernel + * map (they should use their own maps). + */ + + kentry_data_size = round_page(MAX_KMAP*sizeof(struct vm_map) + + MAX_KMAPENT*sizeof(struct vm_map_entry)); + kentry_data = (vm_offset_t) pmap_bootstrap_alloc(kentry_data_size); + + /* + * Compute the number of pages of memory that will be + * available for use (taking into account the overhead + * of a page structure per page). + */ + + cnt.v_free_count = npages = (*end - *start + sizeof(struct vm_page)) + / (PAGE_SIZE + sizeof(struct vm_page)); + + /* + * Record the extent of physical memory that the + * virtual memory system manages: the first page lies npages * sizeof(struct vm_page) bytes (page-rounded) past *start, and last_page is inclusive. + */ + + first_page = *start; + first_page += npages*sizeof(struct vm_page); + first_page = atop(round_page(first_page)); + last_page = first_page + npages - 1; + + first_phys_addr = ptoa(first_page); + last_phys_addr = ptoa(last_page) + PAGE_MASK; + + + /* + * Allocate and clear the mem entry structures. + */ + + m = vm_page_array = (vm_page_t) + pmap_bootstrap_alloc(npages * sizeof(struct vm_page)); + + /* + * Initialize the mem entry structures now, and + * put them in the free queue.
+ */ + + pa = first_phys_addr; + while (npages--) { + m->flags = 0; + m->object = NULL; + m->phys_addr = pa; +#ifdef i386 + if (pmap_isvalidphys(m->phys_addr)) { + TAILQ_INSERT_TAIL(&vm_page_queue_free, m, pageq); + } else { + /* perhaps iomem needs its own type, or dev pager? */ + m->flags |= PG_FICTITIOUS | PG_BUSY; + cnt.v_free_count--; + } +#else /* i386 */ + TAILQ_INSERT_TAIL(&vm_page_queue_free, m, pageq); +#endif /* i386 */ + m++; + pa += PAGE_SIZE; + } + + /* + * Initialize vm_pages_needed lock here - don't wait for pageout + * daemon XXX + */ + simple_lock_init(&vm_pages_needed_lock); + + /* from now on, pmap_bootstrap_alloc can't be used */ + vm_page_startup_initialized = TRUE; +} + +/* + * vm_page_hash: + * + * Distributes the object/offset key pair among hash buckets. + * + * NOTE: This macro depends on vm_page_bucket_count being a power of 2. NOTE(review): the object argument is not parenthesized in the expansion, so pass only simple expressions. + */ +#define vm_page_hash(object, offset) \ + (((unsigned)object+(unsigned)atop(offset))&vm_page_hash_mask) + +/* + * vm_page_insert: [ internal use only ] + * + * Inserts the given mem entry into the object/offset-page + * table and object list. + * + * The object and page must be locked; panics if the page is already PG_TABLED. + */ + +void vm_page_insert(mem, object, offset) + register vm_page_t mem; + register vm_object_t object; + register vm_offset_t offset; +{ + register struct pglist *bucket; + int spl; + + VM_PAGE_CHECK(mem); + + if (mem->flags & PG_TABLED) + panic("vm_page_insert: already inserted"); + + /* + * Record the object/offset pair in this page + */ + + mem->object = object; + mem->offset = offset; + + /* + * Insert it into the object/offset hash table + */ + + bucket = &vm_page_buckets[vm_page_hash(object, offset)]; + spl = splimp(); + simple_lock(&bucket_lock); + TAILQ_INSERT_TAIL(bucket, mem, hashq); + simple_unlock(&bucket_lock); + (void) splx(spl); + + /* + * Now link into the object's list of backed pages.
+ */ + + TAILQ_INSERT_TAIL(&object->memq, mem, listq); + mem->flags |= PG_TABLED; + + /* + * And show that the object has one more resident + * page. + */ + + object->resident_page_count++; +} + +/* + * vm_page_remove: [ internal use only ] + * NOTE: used by device pager as well -wfj + * + * Removes the given mem entry from the object/offset-page + * table and the object page list; does nothing if the page is not PG_TABLED. + * + * The object and page must be locked. + */ + +void vm_page_remove(mem) + register vm_page_t mem; +{ + register struct pglist *bucket; + int spl; + + VM_PAGE_CHECK(mem); + + if (!(mem->flags & PG_TABLED)) + return; + + /* + * Remove from the object/offset hash table + */ + + bucket = &vm_page_buckets[vm_page_hash(mem->object, mem->offset)]; + spl = splimp(); + simple_lock(&bucket_lock); + TAILQ_REMOVE(bucket, mem, hashq); + simple_unlock(&bucket_lock); + (void) splx(spl); + + /* + * Now remove from the object's list of backed pages. + */ + + TAILQ_REMOVE(&mem->object->memq, mem, listq); + + /* + * And show that the object has one fewer resident + * page (mem->object and mem->offset themselves are left unchanged). + */ + + mem->object->resident_page_count--; + + mem->flags &= ~PG_TABLED; +} + +/* + * vm_page_lookup: + * + * Returns the page associated with the object/offset + * pair specified; if none is found, NULL is returned. + * + * The object must be locked. No side effects.
+ */ + +vm_page_t vm_page_lookup(object, offset) + register vm_object_t object; + register vm_offset_t offset; +{ + register vm_page_t mem; + register struct pglist *bucket; + int spl; + + /* + * Search the hash table for this object/offset pair (under bucket_lock at splimp) + */ + + bucket = &vm_page_buckets[vm_page_hash(object, offset)]; + + spl = splimp(); + simple_lock(&bucket_lock); + for (mem = bucket->tqh_first; mem != NULL; mem = mem->hashq.tqe_next) { + VM_PAGE_CHECK(mem); + if ((mem->object == object) && (mem->offset == offset)) { + simple_unlock(&bucket_lock); + splx(spl); + return(mem); + } + } + + simple_unlock(&bucket_lock); + splx(spl); + return(NULL); +} + +/* + * vm_page_rename: + * + * Move the given memory entry from its + * current object to the specified target object/offset. NOTE(review): returns early when mem->object == new_object, so a different new_offset within the same object is ignored. + * + * The object must be locked. + */ +void vm_page_rename(mem, new_object, new_offset) + register vm_page_t mem; + register vm_object_t new_object; + vm_offset_t new_offset; +{ + if (mem->object == new_object) + return; + + vm_page_lock_queues(); /* keep page from moving out from + under pageout daemon */ + vm_page_remove(mem); + vm_page_insert(mem, new_object, new_offset); + vm_page_unlock_queues(); +} + +/* + * vm_page_alloc: + * + * Allocate and return a memory cell associated + * with this VM object/offset pair, or NULL if the free queue is empty. + * + * Object must be locked. + */ +vm_page_t vm_page_alloc(object, offset) + vm_object_t object; + vm_offset_t offset; +{ + register vm_page_t mem; + int spl; + + spl = splimp(); /* XXX */ + simple_lock(&vm_page_queue_free_lock); + if (vm_page_queue_free.tqh_first == NULL) { + simple_unlock(&vm_page_queue_free_lock); + splx(spl); + return(NULL); + } + + mem = vm_page_queue_free.tqh_first; + TAILQ_REMOVE(&vm_page_queue_free, mem, pageq); + + cnt.v_free_count--; + simple_unlock(&vm_page_queue_free_lock); + splx(spl); + + VM_PAGE_INIT(mem, object, offset); + + /* + * Decide if we should poke the pageout daemon.
+ * We do this if the free count is less than the low + * water mark, or if the free count is less than the high + * water mark (but above the low water mark) and the inactive + * count is less than its target. + * + * We don't have the counts locked ... if they change a little, + * it doesn't really matter. + */ + + if (cnt.v_free_count < cnt.v_free_min || + (cnt.v_free_count < cnt.v_free_target && + cnt.v_inactive_count < cnt.v_inactive_target)) + thread_wakeup((int)&vm_pages_needed); + return (mem); +} + +/* + * vm_page_free: + * + * Returns the given page to the free list, + * disassociating it from any VM object and from the active/inactive queue; PG_FICTITIOUS pages are not put on the free list. + * + * Object and page must be locked prior to entry. + */ +void vm_page_free(mem) + register vm_page_t mem; +{ + vm_page_remove(mem); + if (mem->flags & PG_ACTIVE) { + TAILQ_REMOVE(&vm_page_queue_active, mem, pageq); + mem->flags &= ~PG_ACTIVE; + cnt.v_active_count--; + } + + if (mem->flags & PG_INACTIVE) { + TAILQ_REMOVE(&vm_page_queue_inactive, mem, pageq); + mem->flags &= ~PG_INACTIVE; + cnt.v_inactive_count--; + } + + if (!(mem->flags & PG_FICTITIOUS)) { + int spl; + + spl = splimp(); + simple_lock(&vm_page_queue_free_lock); + TAILQ_INSERT_TAIL(&vm_page_queue_free, mem, pageq); + + cnt.v_free_count++; + simple_unlock(&vm_page_queue_free_lock); + splx(spl); + } +} + +/* + * vm_page_wire: + * + * Mark this page as wired down by yet + * another map, removing it from paging queues + * as necessary. + * + * The page queues must be locked.
+ */ +void vm_page_wire(mem) + register vm_page_t mem; +{ + VM_PAGE_CHECK(mem); + + if (mem->wire_count == 0) { + if (mem->flags & PG_ACTIVE) { + TAILQ_REMOVE(&vm_page_queue_active, mem, pageq); + cnt.v_active_count--; + mem->flags &= ~PG_ACTIVE; + } + if (mem->flags & PG_INACTIVE) { + TAILQ_REMOVE(&vm_page_queue_inactive, mem, pageq); + cnt.v_inactive_count--; + mem->flags &= ~PG_INACTIVE; + } + cnt.v_wire_count++; + } + mem->wire_count++; +} + +/* + * vm_page_unwire: + * + * Release one wiring of this page, potentially + * enabling it to be paged again: when the count drops to zero the page is appended to the active queue. NOTE(review): wire_count is decremented without first checking that it is nonzero. + * + * The page queues must be locked. + */ +void vm_page_unwire(mem) + register vm_page_t mem; +{ + VM_PAGE_CHECK(mem); + + mem->wire_count--; + if (mem->wire_count == 0) { + TAILQ_INSERT_TAIL(&vm_page_queue_active, mem, pageq); + cnt.v_active_count++; + mem->flags |= PG_ACTIVE; + cnt.v_wire_count--; + } +} + +/* + * vm_page_deactivate: + * + * Returns the given page to the inactive list, + * indicating that no physical maps have access + * to this page. [Used by the physical mapping system.] + * + * The page queues must be locked. + */ +void vm_page_deactivate(m) + register vm_page_t m; +{ + VM_PAGE_CHECK(m); + + /* + * Only move active pages -- ignore locked or already + * inactive ones. A moved page gets PG_LAUNDRY set if it is dirty and cleared if it is clean. + */ + + if (m->flags & PG_ACTIVE) { + pmap_clear_reference(VM_PAGE_TO_PHYS(m)); + TAILQ_REMOVE(&vm_page_queue_active, m, pageq); + TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); + m->flags &= ~PG_ACTIVE; + m->flags |= PG_INACTIVE; + cnt.v_active_count--; + cnt.v_inactive_count++; + if (pmap_is_modified(VM_PAGE_TO_PHYS(m))) + m->flags &= ~PG_CLEAN; + if (m->flags & PG_CLEAN) + m->flags &= ~PG_LAUNDRY; + else + m->flags |= PG_LAUNDRY; + } +} + +/* + * vm_page_activate: + * + * Put the specified page on the active list (if appropriate). + * + * The page queues must be locked.
+ */ + +void vm_page_activate(m) + register vm_page_t m; +{ + VM_PAGE_CHECK(m); + + if (m->flags & PG_INACTIVE) { + TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); + cnt.v_inactive_count--; + m->flags &= ~PG_INACTIVE; + } + if (m->wire_count == 0) { + if (m->flags & PG_ACTIVE) + panic("vm_page_activate: already active"); + + TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); + m->flags |= PG_ACTIVE; + cnt.v_active_count++; + } +} + +/* + * vm_page_zero_fill: + * + * Zero-fill the specified page and mark it modified (clears PG_CLEAN); always returns TRUE. + * Written as a standard pagein routine, to + * be used by the zero-fill object. + */ + +boolean_t vm_page_zero_fill(m) + vm_page_t m; +{ + VM_PAGE_CHECK(m); + + m->flags &= ~PG_CLEAN; + pmap_zero_page(VM_PAGE_TO_PHYS(m)); + return(TRUE); +} + +/* + * vm_page_copy: + * + * Copy the contents of src_m to dest_m and mark dest_m modified (clears PG_CLEAN). + */ + +void vm_page_copy(src_m, dest_m) + vm_page_t src_m; + vm_page_t dest_m; +{ + VM_PAGE_CHECK(src_m); + VM_PAGE_CHECK(dest_m); + + dest_m->flags &= ~PG_CLEAN; + pmap_copy_page(VM_PAGE_TO_PHYS(src_m), VM_PAGE_TO_PHYS(dest_m)); +} diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h new file mode 100644 index 00000000000..8bf51469a1f --- /dev/null +++ b/sys/vm/vm_page.h @@ -0,0 +1,242 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_page.h 8.2 (Berkeley) 12/13/93 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Resident memory system definitions. + */ + +#ifndef _VM_PAGE_ +#define _VM_PAGE_ + +/* + * Management of resident (logical) pages. + * + * A small structure is kept for each resident + * page, indexed by page number. Each structure + * is an element of several lists: + * + * A hash table bucket used to quickly + * perform object/offset lookups + * + * A list of all pages for a given object, + * so they can be quickly deactivated at + * time of deallocation. + * + * An ordered list of pages due for pageout. + * + * In addition, the structure contains the object + * and offset to which this page belongs (for pageout), + * and sundry status bits. + * + * Fields in this structure are locked either by the lock on the + * object that the page belongs to (O) or by the lock on the page + * queues (P). + */ + +TAILQ_HEAD(pglist, vm_page); + +struct vm_page { + TAILQ_ENTRY(vm_page) pageq; /* queue info for FIFO + * queue or free list (P) */ + TAILQ_ENTRY(vm_page) hashq; /* hash table links (O)*/ + TAILQ_ENTRY(vm_page) listq; /* pages in same object (O)*/ + + vm_object_t object; /* which object am I in (O,P)*/ + vm_offset_t offset; /* offset into object (O,P) */ + + u_short wire_count; /* wired down maps refs (P) */ + u_short flags; /* PG_* status bits, see below */ + + vm_offset_t phys_addr; /* physical address of page */ +}; + +/* + * These are the flags defined for vm_page. + * + * Note: PG_FILLED and PG_DIRTY are added for the filesystems.
+ */ +#define PG_INACTIVE 0x0001 /* page is in inactive list (P) */ +#define PG_ACTIVE 0x0002 /* page is in active list (P) */ +#define PG_LAUNDRY 0x0004 /* dirty page, pageout not yet started (P)*/ +#define PG_CLEAN 0x0008 /* page has not been modified */ +#define PG_BUSY 0x0010 /* page is in transit (O) */ +#define PG_WANTED 0x0020 /* someone is waiting for page (O) */ +#define PG_TABLED 0x0040 /* page is in object/offset hash table (O) */ +#define PG_COPYONWRITE 0x0080 /* must copy page before changing (O) */ +#define PG_FICTITIOUS 0x0100 /* physical page doesn't exist (O) */ +#define PG_FAKE 0x0200 /* page is placeholder for pagein (O) */ +#define PG_FILLED 0x0400 /* client flag to set when filled */ +#define PG_DIRTY 0x0800 /* client flag to set when dirty */ +#define PG_PAGEROWNED 0x4000 /* DEBUG: async paging op in progress */ +#define PG_PTPAGE 0x8000 /* DEBUG: is a user page table page */ + +#if VM_PAGE_DEBUG +#define VM_PAGE_CHECK(mem) { \ + if ((((unsigned int) mem) < ((unsigned int) &vm_page_array[0])) || \ + (((unsigned int) mem) > \ + ((unsigned int) &vm_page_array[last_page-first_page])) || \ + ((mem->flags & (PG_ACTIVE | PG_INACTIVE)) == \ + (PG_ACTIVE | PG_INACTIVE))) \ + panic("vm_page_check: not valid!"); \ +} +#else /* VM_PAGE_DEBUG */ +#define VM_PAGE_CHECK(mem) +#endif /* VM_PAGE_DEBUG */ + +#ifdef KERNEL +/* + * Each pageable resident page falls into one of three lists: + * + * free + * Available for allocation now. + * inactive + * Not referenced in any map, but still has an + * object/offset-page mapping, and may be dirty. + * This is the list of pages that should be + * paged out next. + * active + * A list of pages which have been placed in + * at least one physical map. This list is + * ordered, in LRU-like fashion.
+ */ + +extern +struct pglist vm_page_queue_free; /* memory free queue */ +extern +struct pglist vm_page_queue_active; /* active memory queue */ +extern +struct pglist vm_page_queue_inactive; /* inactive memory queue */ + +extern +vm_page_t vm_page_array; /* First resident page in table */ +extern +long first_page; /* first physical page number */ + /* ... represented in vm_page_array */ +extern +long last_page; /* last physical page number */ + /* ... represented in vm_page_array */ + /* [INCLUSIVE] */ +extern +vm_offset_t first_phys_addr; /* physical address for first_page */ +extern +vm_offset_t last_phys_addr; /* highest physical address in last_page (inclusive) */ + +#define VM_PAGE_TO_PHYS(entry) ((entry)->phys_addr) + +#define IS_VM_PHYSADDR(pa) \ + ((pa) >= first_phys_addr && (pa) <= last_phys_addr) + +#define PHYS_TO_VM_PAGE(pa) \ + (&vm_page_array[atop(pa) - first_page ]) + +extern +simple_lock_data_t vm_page_queue_lock; /* lock on active and inactive + page queues */ +extern /* lock on free page queue */ +simple_lock_data_t vm_page_queue_free_lock; + +/* + * Functions implemented as macros (each expands to a brace block, so do not use one as the unbraced body of an if that has an else) + */ + +#define PAGE_ASSERT_WAIT(m, interruptible) { \ + (m)->flags |= PG_WANTED; \ + assert_wait((int) (m), (interruptible)); \ + } + +#define PAGE_WAKEUP(m) { \ + (m)->flags &= ~PG_BUSY; \ + if ((m)->flags & PG_WANTED) { \ + (m)->flags &= ~PG_WANTED; \ + thread_wakeup((int) (m)); \ + } \ + } + +#define vm_page_lock_queues() simple_lock(&vm_page_queue_lock) +#define vm_page_unlock_queues() simple_unlock(&vm_page_queue_lock) + +#define vm_page_set_modified(m) { (m)->flags &= ~PG_CLEAN; } + +#define VM_PAGE_INIT(mem, object, offset) { \ + (mem)->flags = PG_BUSY | PG_CLEAN | PG_FAKE; \ + vm_page_insert((mem), (object), (offset)); \ + (mem)->wire_count = 0; \ +} + +void vm_page_activate __P((vm_page_t)); +vm_page_t vm_page_alloc __P((vm_object_t, vm_offset_t)); +void vm_page_copy __P((vm_page_t, vm_page_t)); +void vm_page_deactivate __P((vm_page_t)); +void vm_page_free __P((vm_page_t));
+void vm_page_insert __P((vm_page_t, vm_object_t, vm_offset_t)); +vm_page_t vm_page_lookup __P((vm_object_t, vm_offset_t)); +void vm_page_remove __P((vm_page_t)); +void vm_page_rename __P((vm_page_t, vm_object_t, vm_offset_t)); +void vm_page_startup __P((vm_offset_t *, vm_offset_t *)); +void vm_page_unwire __P((vm_page_t)); +void vm_page_wire __P((vm_page_t)); +boolean_t vm_page_zero_fill __P((vm_page_t)); + +#endif /* KERNEL */ +#endif /* !_VM_PAGE_ */ diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c new file mode 100644 index 00000000000..679540591e7 --- /dev/null +++ b/sys/vm/vm_pageout.c @@ -0,0 +1,567 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_pageout.c 8.5 (Berkeley) 2/14/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * The proverbial page-out daemon. 
+ */ + +#include + +#include +#include +#include + +#ifndef VM_PAGE_FREE_MIN +#define VM_PAGE_FREE_MIN (cnt.v_free_count / 20) +#endif + +#ifndef VM_PAGE_FREE_TARGET +#define VM_PAGE_FREE_TARGET ((cnt.v_free_min * 4) / 3) +#endif + +int vm_page_free_min_min = 16 * 1024; +int vm_page_free_min_max = 256 * 1024; + +int vm_pages_needed; /* Event on which pageout daemon sleeps */ + +int vm_page_max_wired = 0; /* XXX max # of wired pages system-wide */ + +#ifdef CLUSTERED_PAGEOUT +#define MAXPOCLUSTER (MAXPHYS/NBPG) /* XXX */ +int doclustered_pageout = 1; +#endif + +/* + * vm_pageout_scan does the dirty work for the pageout daemon: it frees clean inactive pages, starts pageout of dirty ones, and then moves pages from the active to the inactive queue. + */ +void +vm_pageout_scan() +{ + register vm_page_t m, next; + register int page_shortage; + register int s; + register int pages_freed; + int free; + vm_object_t object; + + /* + * Sample the free page count; threads are swapped out only when it is below the target + */ + + cnt.v_rev++; + + s = splimp(); + simple_lock(&vm_page_queue_free_lock); + free = cnt.v_free_count; + simple_unlock(&vm_page_queue_free_lock); + splx(s); + + if (free < cnt.v_free_target) { + swapout_threads(); + + /* + * Be sure the pmap system is updated so + * we can scan the inactive queue. + */ + + pmap_update(); + } + + /* + * Acquire the resident page system lock, + * as we may be changing what's resident quite a bit. + */ + vm_page_lock_queues(); + + /* + * Start scanning the inactive queue for pages we can free. + * We keep scanning until we have enough free pages or + * we have scanned through the entire queue. If we + * encounter dirty pages, we start cleaning them. + */ + + pages_freed = 0; + for (m = vm_page_queue_inactive.tqh_first; m != NULL; m = next) { + s = splimp(); + simple_lock(&vm_page_queue_free_lock); + free = cnt.v_free_count; + simple_unlock(&vm_page_queue_free_lock); + splx(s); + if (free >= cnt.v_free_target) + break; + + cnt.v_scan++; + next = m->pageq.tqe_next; + + /* + * If the page has been referenced, move it back to the + * active queue.
+ */ + if (pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { + vm_page_activate(m); + cnt.v_reactivated++; + continue; + } + + /* + * If the page is clean, free it up (only if vm_object_lock_try() succeeds). + */ + if (m->flags & PG_CLEAN) { + object = m->object; + if (vm_object_lock_try(object)) { + pmap_page_protect(VM_PAGE_TO_PHYS(m), + VM_PROT_NONE); + vm_page_free(m); + pages_freed++; + cnt.v_dfree++; + vm_object_unlock(object); + } + continue; + } + + /* + * If the page is dirty but already being washed, skip it. + */ + if ((m->flags & PG_LAUNDRY) == 0) + continue; + + /* + * Otherwise the page is dirty and still in the laundry, + * so we start the cleaning operation and remove it from + * the laundry. + */ + object = m->object; + if (!vm_object_lock_try(object)) + continue; + cnt.v_pageouts++; +#ifdef CLUSTERED_PAGEOUT + if (object->pager && + vm_pager_cancluster(object->pager, PG_CLUSTERPUT)) + vm_pageout_cluster(m, object); + else +#endif + vm_pageout_page(m, object); + thread_wakeup((int) object); + vm_object_unlock(object); + /* + * Former next page may no longer even be on the inactive + * queue (due to potential blocking in the pager with the + * queues unlocked). If it isn't, we just start over. + */ + if (next && (next->flags & PG_INACTIVE) == 0) + next = vm_page_queue_inactive.tqh_first; + } + + /* + * Compute the page shortage. If we are still very low on memory + * be sure that we will move a minimal amount of pages from active + * to inactive. + */ + + page_shortage = cnt.v_inactive_target - cnt.v_inactive_count; + if (page_shortage <= 0 && pages_freed == 0) + page_shortage = 1; + + while (page_shortage > 0) { + /* + * Move some more pages from active to inactive. + */ + + if ((m = vm_page_queue_active.tqh_first) == NULL) + break; + vm_page_deactivate(m); + page_shortage--; + } + + vm_page_unlock_queues(); +} + +/* + * Called with object and page queues locked; both are dropped around the pager call and retaken before returning. + * On VM_PAGER_FAIL or VM_PAGER_ERROR the page is + * put back on the active queue, otherwise it is left on the inactive queue.
+ */ +void +vm_pageout_page(m, object) + vm_page_t m; + vm_object_t object; +{ + vm_pager_t pager; + int pageout_status; + + /* + * We set the busy bit to cause potential page faults on + * this page to block. + * + * We also set pageout-in-progress to keep the object from + * disappearing during pageout. This guarantees that the + * page won't move from the inactive queue. (However, any + * other page on the inactive queue may move!) + */ + pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); + m->flags |= PG_BUSY; + + /* + * Try to collapse the object before making a pager for it. + * We must unlock the page queues first. + */ + vm_page_unlock_queues(); + if (object->pager == NULL) + vm_object_collapse(object); + + object->paging_in_progress++; + vm_object_unlock(object); + + /* + * Do a wakeup here in case the following operations block. + */ + thread_wakeup((int) &cnt.v_free_count); + + /* + * If there is no pager for the page, use the default pager. + * If there is no place to put the page at the moment, + * leave it in the laundry and hope that there will be + * paging space later. + */ + if ((pager = object->pager) == NULL) { + pager = vm_pager_allocate(PG_DFLT, (caddr_t)0, object->size, + VM_PROT_ALL, (vm_offset_t)0); + if (pager != NULL) + vm_object_setpager(object, pager, 0, FALSE); + } + pageout_status = pager ? vm_pager_put(pager, m, FALSE) : VM_PAGER_FAIL; + vm_object_lock(object); + vm_page_lock_queues(); + + switch (pageout_status) { + case VM_PAGER_OK: + case VM_PAGER_PEND: + cnt.v_pgpgout++; + m->flags &= ~PG_LAUNDRY; + break; + case VM_PAGER_BAD: + /* + * Page outside of range of object. Right now we + * essentially lose the changes by pretending it + * worked. + * + * XXX dubious, what should we do?
+ */ + m->flags &= ~PG_LAUNDRY; + m->flags |= PG_CLEAN; + pmap_clear_modify(VM_PAGE_TO_PHYS(m)); + break; + case VM_PAGER_AGAIN: + { + extern int lbolt; + + /* + * FAIL on a write is interpreted to mean a resource + * shortage, so we put pause for awhile and try again. + * XXX could get stuck here. + */ + (void) tsleep((caddr_t)&lbolt, PZERO|PCATCH, "pageout", 0); + break; + } + case VM_PAGER_FAIL: + case VM_PAGER_ERROR: + /* + * If page couldn't be paged out, then reactivate + * the page so it doesn't clog the inactive list. + * (We will try paging out it again later). + */ + vm_page_activate(m); + cnt.v_reactivated++; + break; + } + + pmap_clear_reference(VM_PAGE_TO_PHYS(m)); + + /* + * If the operation is still going, leave the page busy + * to block all other accesses. Also, leave the paging + * in progress indicator set so that we don't attempt an + * object collapse. + */ + if (pageout_status != VM_PAGER_PEND) { + m->flags &= ~PG_BUSY; + PAGE_WAKEUP(m); + object->paging_in_progress--; + } +} + +#ifdef CLUSTERED_PAGEOUT +#define PAGEOUTABLE(p) \ + ((((p)->flags & (PG_INACTIVE|PG_CLEAN|PG_LAUNDRY)) == \ + (PG_INACTIVE|PG_LAUNDRY)) && !pmap_is_referenced(VM_PAGE_TO_PHYS(p))) + +/* + * Attempt to pageout as many contiguous (to ``m'') dirty pages as possible + * from ``object''. Using information returned from the pager, we assemble + * a sorted list of contiguous dirty pages and feed them to the pager in one + * chunk. Called with paging queues and object locked. Also, object must + * already have a pager. + */ +void +vm_pageout_cluster(m, object) + vm_page_t m; + vm_object_t object; +{ + vm_offset_t offset, loff, hoff; + vm_page_t plist[MAXPOCLUSTER], *plistp, p; + int postatus, ix, count; + + /* + * Determine the range of pages that can be part of a cluster + * for this object/offset. If it is only our single page, just + * do it normally. 
+ */ + vm_pager_cluster(object->pager, m->offset, &loff, &hoff); + if (hoff - loff == PAGE_SIZE) { + vm_pageout_page(m, object); + return; + } + + plistp = plist; + + /* + * Target page is always part of the cluster. + */ + pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); + m->flags |= PG_BUSY; + plistp[atop(m->offset - loff)] = m; + count = 1; + + /* + * Backup from the given page til we find one not fulfilling + * the pageout criteria or we hit the lower bound for the + * cluster. For each page determined to be part of the + * cluster, unmap it and busy it out so it won't change. + */ + ix = atop(m->offset - loff); + offset = m->offset; + while (offset > loff && count < MAXPOCLUSTER-1) { + p = vm_page_lookup(object, offset - PAGE_SIZE); + if (p == NULL || !PAGEOUTABLE(p)) + break; + pmap_page_protect(VM_PAGE_TO_PHYS(p), VM_PROT_NONE); + p->flags |= PG_BUSY; + plistp[--ix] = p; + offset -= PAGE_SIZE; + count++; + } + plistp += atop(offset - loff); + loff = offset; + + /* + * Now do the same moving forward from the target. + */ + ix = atop(m->offset - loff) + 1; + offset = m->offset + PAGE_SIZE; + while (offset < hoff && count < MAXPOCLUSTER) { + p = vm_page_lookup(object, offset); + if (p == NULL || !PAGEOUTABLE(p)) + break; + pmap_page_protect(VM_PAGE_TO_PHYS(p), VM_PROT_NONE); + p->flags |= PG_BUSY; + plistp[ix++] = p; + offset += PAGE_SIZE; + count++; + } + hoff = offset; + + /* + * Pageout the page. + * Unlock everything and do a wakeup prior to the pager call + * in case it blocks. 
+ */ + vm_page_unlock_queues(); + object->paging_in_progress++; + vm_object_unlock(object); +again: + thread_wakeup((int) &cnt.v_free_count); + postatus = vm_pager_put_pages(object->pager, plistp, count, FALSE); + /* + * XXX rethink this + */ + if (postatus == VM_PAGER_AGAIN) { + extern int lbolt; + + (void) tsleep((caddr_t)&lbolt, PZERO|PCATCH, "pageout", 0); + goto again; + } else if (postatus == VM_PAGER_BAD) + panic("vm_pageout_cluster: VM_PAGER_BAD"); + vm_object_lock(object); + vm_page_lock_queues(); + + /* + * Loop through the affected pages, reflecting the outcome of + * the operation. + */ + for (ix = 0; ix < count; ix++) { + p = *plistp++; + switch (postatus) { + case VM_PAGER_OK: + case VM_PAGER_PEND: + cnt.v_pgpgout++; + p->flags &= ~PG_LAUNDRY; + break; + case VM_PAGER_FAIL: + case VM_PAGER_ERROR: + /* + * Pageout failed, reactivate the target page so it + * doesn't clog the inactive list. Other pages are + * left as they are. + */ + if (p == m) { + vm_page_activate(p); + cnt.v_reactivated++; + } + break; + } + pmap_clear_reference(VM_PAGE_TO_PHYS(p)); + /* + * If the operation is still going, leave the page busy + * to block all other accesses. + */ + if (postatus != VM_PAGER_PEND) { + p->flags &= ~PG_BUSY; + PAGE_WAKEUP(p); + + } + } + /* + * If the operation is still going, leave the paging in progress + * indicator set so that we don't attempt an object collapse. + */ + if (postatus != VM_PAGER_PEND) + object->paging_in_progress--; + +} +#endif + +/* + * vm_pageout is the high level pageout daemon. + */ + +void vm_pageout() +{ + (void) spl0(); + + /* + * Initialize some paging parameters. 
+ */ + + if (cnt.v_free_min == 0) { + cnt.v_free_min = VM_PAGE_FREE_MIN; + vm_page_free_min_min /= cnt.v_page_size; + vm_page_free_min_max /= cnt.v_page_size; + if (cnt.v_free_min < vm_page_free_min_min) + cnt.v_free_min = vm_page_free_min_min; + if (cnt.v_free_min > vm_page_free_min_max) + cnt.v_free_min = vm_page_free_min_max; + } + + if (cnt.v_free_target == 0) + cnt.v_free_target = VM_PAGE_FREE_TARGET; + + if (cnt.v_free_target <= cnt.v_free_min) + cnt.v_free_target = cnt.v_free_min + 1; + + /* XXX does not really belong here */ + if (vm_page_max_wired == 0) + vm_page_max_wired = cnt.v_free_count / 3; + + /* + * The pageout daemon is never done, so loop + * forever. + */ + + simple_lock(&vm_pages_needed_lock); + while (TRUE) { + thread_sleep((int) &vm_pages_needed, &vm_pages_needed_lock, + FALSE); + /* + * Compute the inactive target for this scan. + * We need to keep a reasonable amount of memory in the + * inactive list to better simulate LRU behavior. + */ + cnt.v_inactive_target = + (cnt.v_active_count + cnt.v_inactive_count) / 3; + if (cnt.v_inactive_target <= cnt.v_free_target) + cnt.v_inactive_target = cnt.v_free_target + 1; + + /* + * Only make a scan if we are likely to do something. + * Otherwise we might have been awakened by a pager + * to clean up async pageouts. + */ + if (cnt.v_free_count < cnt.v_free_target || + cnt.v_inactive_count < cnt.v_inactive_target) + vm_pageout_scan(); + vm_pager_sync(); + simple_lock(&vm_pages_needed_lock); + thread_wakeup((int) &cnt.v_free_count); + } +} diff --git a/sys/vm/vm_pageout.h b/sys/vm/vm_pageout.h new file mode 100644 index 00000000000..a82a0ea40ac --- /dev/null +++ b/sys/vm/vm_pageout.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_pageout.h 8.2 (Berkeley) 1/12/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Avadis Tevanian, Jr. 
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ *  Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ *  School of Computer Science
+ *  Carnegie Mellon University
+ *  Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * Header file for pageout daemon.
+ */
+
+/*
+ * Exported data structures.
+ *
+ * NOTE(review): vm_pages_needed_lock is declared below without
+ * `extern', so every file that includes this header emits its own
+ * tentative definition of it -- confirm that this is intended.
+ */
+
+extern int	vm_pages_needed;	/* should be some "event" structure */
+simple_lock_data_t	vm_pages_needed_lock;
+
+
+/*
+ * Exported routines.
+ */
+
+/*
+ * Signal pageout-daemon and wait for it.
+ *
+ * VM_WAIT takes vm_pages_needed_lock, wakes whatever sleeps on
+ * vm_pages_needed, then sleeps on cnt.v_free_count.
+ * It expands to a brace-enclosed block, not a single statement, so
+ * `if (c) VM_WAIT; else ...' does not parse; brace such uses.
+ */
+
+#define	VM_WAIT		{ \
+			simple_lock(&vm_pages_needed_lock); \
+			thread_wakeup((int)&vm_pages_needed); \
+			thread_sleep((int)&cnt.v_free_count, \
+				&vm_pages_needed_lock, FALSE); \
+			}
+#ifdef KERNEL
+void		 vm_pageout __P((void));
+void		 vm_pageout_scan __P((void));
+void		 vm_pageout_page __P((vm_page_t, vm_object_t));
+void		 vm_pageout_cluster __P((vm_page_t, vm_object_t));
+#endif
diff --git a/sys/vm/vm_pager.c b/sys/vm/vm_pager.c
new file mode 100644
index 00000000000..7123abb16ef
--- /dev/null
+++ b/sys/vm/vm_pager.c
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * The Mach Operating System project at Carnegie-Mellon University.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_pager.c 8.6 (Berkeley) 1/12/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. 
+ *
+ * Authors: Avadis Tevanian, Jr., Michael Wayne Young
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ *  Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ *  School of Computer Science
+ *  Carnegie Mellon University
+ *  Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * Paging space routine stubs. Emulates a matchmaker-like interface
+ * for builtin pagers.
+ *
+ * NOTE(review): the six #include directives below carry no header
+ * names in this copy of the patch; they look as if the <...> operands
+ * were stripped in transit -- restore them from the original source.
+ */
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#ifdef SWAPPAGER
+extern struct pagerops swappagerops;
+#endif
+
+#ifdef VNODEPAGER
+extern struct pagerops vnodepagerops;
+#endif
+
+#ifdef DEVPAGER
+extern struct pagerops devicepagerops;
+#endif
+
+struct pagerops *pagertab[] = {	/* one slot per pager type, NULL if not configured */
+#ifdef SWAPPAGER
+	&swappagerops,		/* PG_SWAP */
+#else
+	NULL,
+#endif
+#ifdef VNODEPAGER
+	&vnodepagerops,		/* PG_VNODE */
+#else
+	NULL,
+#endif
+#ifdef DEVPAGER
+	&devicepagerops,	/* PG_DEVICE */
+#else
+	NULL,
+#endif
+};
+int npagers = sizeof (pagertab) / sizeof (pagertab[0]);	/* number of slots, configured or not */
+
+struct pagerops *dfltpagerops = NULL;	/* default pager */
+
+/*
+ * Kernel address space for mapping pages.
+ * Used by pagers where KVAs are needed for IO.
+ *
+ * XXX needs to be large enough to support the number of pending async
+ * cleaning requests (NPENDINGIO == 64) * the maximum swap cluster size
+ * (MAXPHYS == 64k) if you want to get the most efficiency.
+ */
+#define PAGER_MAP_SIZE	(4 * 1024 * 1024)
+
+vm_map_t pager_map;		/* submap that pager I/O mappings come from */
+boolean_t pager_map_wanted;	/* set when a thread sleeps for pager_map space */
+vm_offset_t pager_sva, pager_eva;	/* bounds filled in by kmem_suballoc() */
+
+/*
+ * Create the pager submap and call the init hook of every configured
+ * pager. Panics if dfltpagerops is still NULL once the hooks have run
+ * (presumably one of the hooks is expected to set it).
+ */
+void
+vm_pager_init()
+{
+	struct pagerops **pgops;
+
+	/*
+	 * Allocate a kernel submap for tracking get/put page mappings
+	 */
+	pager_map = kmem_suballoc(kernel_map, &pager_sva, &pager_eva,
+				  PAGER_MAP_SIZE, FALSE);
+	/*
+	 * Initialize known pagers.
+	 * pagertab holds NULL for every pager that is not configured, so
+	 * test the slot contents (*pgops); the slot address (pgops) is
+	 * never NULL and testing it would let a NULL slot be dereferenced.
+	 */
+	for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++)
+		if (*pgops)
+			(*(*pgops)->pgo_init)();
+	if (dfltpagerops == NULL)
+		panic("no default pager");
+}
+
+/*
+ * Allocate an instance of a pager of the given type.
+ * Size, protection and offset parameters are passed in for pagers that
+ * need to perform page-level validation (e.g. the device pager).
+ *
+ * PG_DFLT selects dfltpagerops; any other type indexes pagertab.
+ * Returns NULL if the type is out of range or that pager is not
+ * configured; otherwise returns whatever the pager's pgo_alloc returns.
+ */
+vm_pager_t
+vm_pager_allocate(type, handle, size, prot, off)
+	int type;
+	caddr_t handle;
+	vm_size_t size;
+	vm_prot_t prot;
+	vm_offset_t off;
+{
+	struct pagerops *ops;
+
+	if (type == PG_DFLT)
+		ops = dfltpagerops;
+	else if (type >= 0 && type < npagers)
+		ops = pagertab[type];
+	else
+		ops = NULL;	/* never index outside pagertab */
+	if (ops)
+		return ((*ops->pgo_alloc)(handle, size, prot, off));
+	return (NULL);
+}
+
+/*
+ * Hand a pager back to its implementation (pgo_dealloc).
+ * A NULL pager is a caller bug and panics.
+ */
+void
+vm_pager_deallocate(pager)
+	vm_pager_t pager;
+{
+	if (pager == NULL)
+		panic("vm_pager_deallocate: null pager");
+
+	(*pager->pg_ops->pgo_dealloc)(pager);
+}
+
+/*
+ * Read npages pages, listed in mlist, through the pager.
+ * With a NULL pager the pages are zero-filled instead; the first
+ * vm_page_zero_fill() failure stops the loop and yields VM_PAGER_FAIL.
+ * Otherwise the result is whatever the pager's pgo_getpages returns.
+ */
+int
+vm_pager_get_pages(pager, mlist, npages, sync)
+	vm_pager_t pager;
+	vm_page_t *mlist;
+	int npages;
+	boolean_t sync;
+{
+	int rv;
+
+	if (pager == NULL) {
+		rv = VM_PAGER_OK;
+		while (npages--)
+			if (!vm_page_zero_fill(*mlist)) {
+				rv = VM_PAGER_FAIL;
+				break;
+			} else
+				mlist++;
+		return (rv);
+	}
+	return ((*pager->pg_ops->pgo_getpages)(pager, mlist, npages, sync));
+}
+
+/*
+ * Write npages pages, listed in mlist, through the pager's pgo_putpages.
+ * Unlike the get side there is no NULL-pager fallback: it panics.
+ */
+int
+vm_pager_put_pages(pager, mlist, npages, sync)
+	vm_pager_t pager;
+	vm_page_t *mlist;
+	int npages;
+	boolean_t sync;
+{
+	if (pager == NULL)
+		panic("vm_pager_put_pages: null pager");
+	return ((*pager->pg_ops->pgo_putpages)(pager, mlist, npages, sync));
+}
+
+/*
+ * Ask the pager (pgo_haspage) about the page at the given offset.
+ * A NULL pager panics.
+ */
+boolean_t
+vm_pager_has_page(pager, offset)
+	vm_pager_t pager;
+	vm_offset_t offset;
+{
+	if (pager == NULL)
+		panic("vm_pager_has_page: null pager");
+	return ((*pager->pg_ops->pgo_haspage)(pager, offset));
+}
+
+/*
+ * Called by pageout daemon before going back to sleep.
+ * Gives pagers a chance to clean up any completed async paging operations.
+ */
+void
+vm_pager_sync()
+{
+	struct pagerops **pgops;
+
+	/*
+	 * A putpages call with a NULL pager and no pages is the "sync"
+	 * request. Skip pagers that are not configured: their pagertab
+	 * slot is NULL, so test *pgops (pgops itself is never NULL).
+	 */
+	for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++)
+		if (*pgops)
+			(*(*pgops)->pgo_putpages)(NULL, NULL, 0, FALSE);
+}
+
+/*
+ * Ask the pager (pgo_cluster) for the cluster range around offset;
+ * the range is passed back through loff and hoff.
+ * A NULL pager panics.
+ */
+void
+vm_pager_cluster(pager, offset, loff, hoff)
+	vm_pager_t pager;
+	vm_offset_t offset;
+	vm_offset_t *loff;
+	vm_offset_t *hoff;
+{
+	if (pager == NULL)
+		panic("vm_pager_cluster: null pager");
+	/* void function: call the hook, do not `return' its (void) value */
+	(*pager->pg_ops->pgo_cluster)(pager, offset, loff, hoff);
+}
+
+/*
+ * pgo_cluster-compatible routine that must never be reached:
+ * it panics unconditionally.
+ */
+void
+vm_pager_clusternull(pager, offset, loff, hoff)
+	vm_pager_t pager;
+	vm_offset_t offset;
+	vm_offset_t *loff;
+	vm_offset_t *hoff;
+{
+	panic("vm_pager_clusternull called");
+}
+
+/*
+ * Map the npages pages listed in mlist at consecutive addresses in
+ * pager_map and return the first address. If the map is full, either
+ * sleep until space is released (canwait) or return 0.
+ */
+vm_offset_t
+vm_pager_map_pages(mlist, npages, canwait)
+	vm_page_t *mlist;
+	int npages;
+	boolean_t canwait;
+{
+	vm_offset_t kva, va;
+	vm_size_t size;
+	vm_page_t m;
+
+	/*
+	 * Allocate space in the pager map, if none available return 0.
+	 * This is basically an expansion of kmem_alloc_wait with optional
+	 * blocking on no space.
+	 */
+	size = npages * PAGE_SIZE;
+	vm_map_lock(pager_map);
+	while (vm_map_findspace(pager_map, 0, size, &kva)) {
+		if (!canwait) {
+			vm_map_unlock(pager_map);
+			return (0);
+		}
+		pager_map_wanted = TRUE;
+		vm_map_unlock(pager_map);
+		(void) tsleep(pager_map, PVM, "pager_map", 0);
+		vm_map_lock(pager_map);
+	}
+	vm_map_insert(pager_map, NULL, 0, kva, kva + size);
+	vm_map_unlock(pager_map);
+
+	for (va = kva; npages--; va += PAGE_SIZE) {
+		m = *mlist++;
+#ifdef DEBUG
+		if ((m->flags & PG_BUSY) == 0)
+			panic("vm_pager_map_pages: page not busy");
+		if (m->flags & PG_PAGEROWNED)
+			panic("vm_pager_map_pages: page already in pager");
+#endif
+#ifdef DEBUG
+		m->flags |= PG_PAGEROWNED;
+#endif
+		pmap_enter(vm_map_pmap(pager_map), va, VM_PAGE_TO_PHYS(m),
+			   VM_PROT_DEFAULT, TRUE);
+	}
+	return (kva);
+}
+
+/*
+ * Undo vm_pager_map_pages(): remove the npages mappings starting at kva
+ * from pager_map, release the address range, and wake any thread that
+ * is waiting for pager_map space.
+ */
+void
+vm_pager_unmap_pages(kva, npages)
+	vm_offset_t	kva;
+	int		npages;
+{
+	vm_size_t size = npages * PAGE_SIZE;
+
+#ifdef DEBUG
+	vm_offset_t va;
+	vm_page_t m;
+	int np = npages;
+
+	for (va = kva; np--; va += PAGE_SIZE) {
+		m = vm_pager_atop(va);
+		if (m->flags & PG_PAGEROWNED)
+			m->flags &= ~PG_PAGEROWNED;
+		else
+			printf("vm_pager_unmap_pages: %x(%x/%x) not owned\n",
+			       m, va, VM_PAGE_TO_PHYS(m));
+	}
+#endif
+	pmap_remove(vm_map_pmap(pager_map), kva, kva + size);
+	vm_map_lock(pager_map);
+	(void) vm_map_delete(pager_map, kva, kva + size);
+	/*
+	 * NOTE(review): pager_map_wanted is never reset in the code
+	 * visible here, so once set this wakeup is issued on every unmap.
+	 */
+	if (pager_map_wanted)
+		wakeup(pager_map);
+	vm_map_unlock(pager_map);
+}
+
+/*
+ * Translate a pager_map virtual address back to the vm_page mapped
+ * there. Panics if nothing is mapped at kva.
+ */
+vm_page_t
+vm_pager_atop(kva)
+	vm_offset_t	kva;
+{
+	vm_offset_t pa;
+
+	pa = pmap_extract(vm_map_pmap(pager_map), kva);
+	if (pa == 0)
+		panic("vm_pager_atop");
+	return (PHYS_TO_VM_PAGE(pa));
+}
+
+/*
+ * Linear search of a pager list for the pager whose pg_handle equals
+ * handle. Returns NULL if there is none.
+ */
+vm_pager_t
+vm_pager_lookup(pglist, handle)
+	register struct pagerlst *pglist;
+	caddr_t handle;
+{
+	register vm_pager_t pager;
+
+	for (pager = pglist->tqh_first; pager; pager = pager->pg_list.tqe_next)
+		if (pager->pg_handle == handle)
+			return (pager);
+	return (NULL);
+}
+
+/*
+ * Set or clear OBJ_CANPERSIST on the object according to should_cache,
+ * then call vm_object_deallocate() on it.
+ * Returns KERN_INVALID_ARGUMENT for a NULL object, else KERN_SUCCESS.
+ *
+ * NOTE(review): the historical comment here read "This routine gains a
+ * reference to the object. Explicit deallocation is necessary.", yet the
+ * body ends in vm_object_deallocate(); presumably the caller must hold a
+ * reference that this routine consumes -- confirm against the callers.
+ */
+int
+pager_cache(object, should_cache)
+	vm_object_t	object;
+	boolean_t	should_cache;
+{
+	if (object == NULL)
+		return (KERN_INVALID_ARGUMENT);
+
+	vm_object_cache_lock();
+	vm_object_lock(object);
+	if (should_cache)
+		object->flags |= OBJ_CANPERSIST;
+	else
+		object->flags &= ~OBJ_CANPERSIST;
+	vm_object_unlock(object);
+	vm_object_cache_unlock();
+
+	vm_object_deallocate(object);
+
+	return (KERN_SUCCESS);
+}
diff --git a/sys/vm/vm_pager.h b/sys/vm/vm_pager.h
new file mode 100644
index 00000000000..e4659c268c1
--- /dev/null
+++ b/sys/vm/vm_pager.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 1990 University of Utah.
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_pager.h 8.4 (Berkeley) 1/12/94 + */ + +/* + * Pager routine interface definition. + * For BSD we use a cleaner version of the internal pager interface. 
+ */
+
+#ifndef	_VM_PAGER_
+#define	_VM_PAGER_
+
+TAILQ_HEAD(pagerlst, pager_struct);
+
+struct	pager_struct {
+	TAILQ_ENTRY(pager_struct) pg_list;	/* links for list management */
+	caddr_t			  pg_handle;	/* ext. handle (vp, dev, fp) */
+	int			  pg_type;	/* type of pager */
+	int			  pg_flags;	/* flags */
+	struct pagerops		  *pg_ops;	/* pager operations */
+	void			  *pg_data;	/* private pager data */
+};
+
+/*
+ * pager types
+ * PG_DFLT is a request for the default pager (dfltpagerops); the
+ * non-negative values are used as indexes into pagertab[] by
+ * vm_pager_allocate().
+ */
+#define PG_DFLT		-1
+#define	PG_SWAP		0
+#define	PG_VNODE	1
+#define PG_DEVICE	2
+
+/*
+ * flags
+ * Bits for pg_flags, tested with vm_pager_cancluster().
+ */
+#define PG_CLUSTERGET	1
+#define PG_CLUSTERPUT	2
+
+struct	pagerops {
+	void		(*pgo_init)		/* Initialize pager. */
+			    __P((void));
+	vm_pager_t	(*pgo_alloc)		/* Allocate pager. */
+			    __P((caddr_t, vm_size_t, vm_prot_t, vm_offset_t));
+	void		(*pgo_dealloc)		/* Disassociate. */
+			    __P((vm_pager_t));
+	int		(*pgo_getpages)		/* Get (read) page. */
+			    __P((vm_pager_t, vm_page_t *, int, boolean_t));
+	int		(*pgo_putpages)		/* Put (write) page. */
+			    __P((vm_pager_t, vm_page_t *, int, boolean_t));
+	boolean_t  	(*pgo_haspage)		/* Does pager have page? */
+			    __P((vm_pager_t, vm_offset_t));
+	void		(*pgo_cluster)		/* Return range of cluster. */
+			    __P((vm_pager_t, vm_offset_t,
+				 vm_offset_t *, vm_offset_t *));
+};
+
+/*
+ * get/put return values
+ *	OK	 operation was successful
+ *	BAD	 specified data was out of the accepted range
+ *	FAIL	 specified data was in range, but doesn't exist
+ *	PEND	 operation was initiated but not completed
+ *	ERROR	 error while accessing data that is in range and exists
+ *	AGAIN	 temporary resource shortage prevented operation from happening
+ */
+#define	VM_PAGER_OK	0
+#define	VM_PAGER_BAD	1
+#define	VM_PAGER_FAIL	2
+#define	VM_PAGER_PEND	3
+#define	VM_PAGER_ERROR	4
+#define VM_PAGER_AGAIN	5
+
+#ifdef KERNEL
+extern struct pagerops *dfltpagerops;
+
+vm_pager_t	 vm_pager_allocate
+		    __P((int, caddr_t, vm_size_t, vm_prot_t, vm_offset_t));
+vm_page_t	 vm_pager_atop __P((vm_offset_t));
+void		 vm_pager_cluster
+		    __P((vm_pager_t, vm_offset_t,
+			 vm_offset_t *, vm_offset_t *));
+void		 vm_pager_clusternull
+		    __P((vm_pager_t, vm_offset_t,
+			 vm_offset_t *, vm_offset_t *));
+void		 vm_pager_deallocate __P((vm_pager_t));
+int		 vm_pager_get_pages
+		    __P((vm_pager_t, vm_page_t *, int, boolean_t));
+boolean_t	 vm_pager_has_page __P((vm_pager_t, vm_offset_t));
+void		 vm_pager_init __P((void));
+vm_pager_t	 vm_pager_lookup __P((struct pagerlst *, caddr_t));
+vm_offset_t	 vm_pager_map_pages __P((vm_page_t *, int, boolean_t));
+int		 vm_pager_put_pages
+		    __P((vm_pager_t, vm_page_t *, int, boolean_t));
+void		 vm_pager_sync __P((void));
+void		 vm_pager_unmap_pages __P((vm_offset_t, int));
+
+#define vm_pager_cancluster(p, b)	((p)->pg_flags & (b))
+
+/*
+ * XXX compat with old interface
+ *
+ * Single-page wrappers: each builds a one-element page list and calls
+ * the corresponding *_pages routine, yielding its return value.
+ * They are written as ({ ... }) statement expressions, which is a GNU C
+ * extension rather than ISO C.
+ */
+#define vm_pager_get(p, m, s) \
+({ \
+	vm_page_t ml[1]; \
+	ml[0] = (m); \
+	vm_pager_get_pages(p, ml, 1, s); \
+})
+#define vm_pager_put(p, m, s) \
+({ \
+	vm_page_t ml[1]; \
+	ml[0] = (m); \
+	vm_pager_put_pages(p, ml, 1, s); \
+})
+#endif
+
+#endif	/* _VM_PAGER_ */
diff --git a/sys/vm/vm_param.h b/sys/vm/vm_param.h
new file mode 100644
index 00000000000..2d2c71594ed
--- /dev/null
+++ b/sys/vm/vm_param.h
@@ -0,0 +1,159 @@
+/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ *	@(#)vm_param.h	8.1 (Berkeley) 6/11/93
+ *
+ *
+ * Copyright (c) 1987, 1990 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Avadis Tevanian, Jr., Michael Wayne Young
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ *  Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ *  School of Computer Science
+ *  Carnegie Mellon University
+ *  Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ *	Machine independent virtual memory parameters.
+ *
+ *	NOTE(review): the #include directive below has no header name in
+ *	this copy of the patch; its <...> operand appears to have been
+ *	stripped in transit -- restore it from the original source.
+ */
+
+#ifndef	_VM_PARAM_
+#define	_VM_PARAM_
+
+#include
+
+/*
+ * This belongs in types.h, but breaks too many existing programs.
+ */
+typedef int	boolean_t;
+#define	TRUE	1
+#define	FALSE	0
+
+/*
+ *	The machine independent pages are referred to as PAGES. A page
+ *	is some number of hardware pages, depending on the target machine.
+ */
+#define	DEFAULT_PAGE_SIZE	4096
+
+/*
+ *	All references to the size of a page should be done with PAGE_SIZE
+ *	or PAGE_SHIFT. The fact they are variables is hidden here so that
+ *	we can easily make them constant if we so desire.
+ *
+ *	As defined below, PAGE_SIZE reads cnt.v_page_size, and PAGE_MASK
+ *	and PAGE_SHIFT read the page_mask and page_shift variables.
+ */
+#define	PAGE_SIZE	cnt.v_page_size		/* size of page */
+#define	PAGE_MASK	page_mask		/* size of page - 1 */
+#define	PAGE_SHIFT	page_shift		/* bits to shift for pages */
+#ifdef KERNEL
+extern vm_size_t	page_mask;
+extern int		page_shift;
+#endif
+
+/*
+ * CTL_VM identifiers
+ */
+#define	VM_METER	1		/* struct vmmeter */
+#define	VM_LOADAVG	2		/* struct loadavg */
+#define	VM_MAXID	3		/* number of valid vm ids */
+
+#define	CTL_VM_NAMES { \
+	{ 0, 0 }, \
+	{ "vmmeter", CTLTYPE_STRUCT }, \
+	{ "loadavg", CTLTYPE_STRUCT }, \
+}
+
+/*
+ *	Return values from the VM routines.
+ */
+#define	KERN_SUCCESS		0
+#define	KERN_INVALID_ADDRESS	1
+#define	KERN_PROTECTION_FAILURE	2
+#define	KERN_NO_SPACE		3
+#define	KERN_INVALID_ARGUMENT	4
+#define	KERN_FAILURE		5
+#define	KERN_RESOURCE_SHORTAGE	6
+#define	KERN_NOT_RECEIVER	7
+#define	KERN_NO_ACCESS		8
+
+#ifndef ASSEMBLER
+/*
+ *	Convert addresses to pages and vice versa.
+ *	No rounding is used.
+ *
+ *	NOTE(review): atop() casts its argument to unsigned before
+ *	shifting, and ptoa() shifts in the argument's own type before
+ *	casting to vm_offset_t, so bits beyond those types are lost --
+ *	confirm this is acceptable for the target's address width.
+ */
+#ifdef KERNEL
+#define	atop(x)		(((unsigned)(x)) >> PAGE_SHIFT)
+#define	ptoa(x)		((vm_offset_t)((x) << PAGE_SHIFT))
+
+/*
+ * Round off or truncate to the nearest page. These will work
+ * for either addresses or counts (i.e., 1 byte rounds to 1 page).
+ * The mask arithmetic relies on PAGE_MASK being "size of page - 1".
+ */
+#define	round_page(x) \
+	((vm_offset_t)((((vm_offset_t)(x)) + PAGE_MASK) & ~PAGE_MASK))
+#define	trunc_page(x) \
+	((vm_offset_t)(((vm_offset_t)(x)) & ~PAGE_MASK))
+#define	num_pages(x) \
+	((vm_offset_t)((((vm_offset_t)(x)) + PAGE_MASK) >> PAGE_SHIFT))
+
+extern vm_size_t	mem_size;	/* size of physical memory (bytes) */
+extern vm_offset_t	first_addr;	/* first physical page */
+extern vm_offset_t	last_addr;	/* last physical page */
+
+#else
+/* out-of-kernel versions of round_page and trunc_page (divide/multiply by vm_page_size) */
+#define	round_page(x) \
+	((((vm_offset_t)(x) + (vm_page_size - 1)) / vm_page_size) * vm_page_size)
+#define	trunc_page(x) \
+	((((vm_offset_t)(x)) / vm_page_size) * vm_page_size)
+
+#endif /* KERNEL */
+#endif /* ASSEMBLER */
+#endif /* _VM_PARAM_ */
diff --git a/sys/vm/vm_prot.h b/sys/vm/vm_prot.h
new file mode 100644
index 00000000000..b3bae438631
--- /dev/null
+++ b/sys/vm/vm_prot.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * The Mach Operating System project at Carnegie-Mellon University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4.
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_prot.h 8.1 (Berkeley) 6/11/93 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Virtual memory protection definitions. + */ + +#ifndef _VM_PROT_ +#define _VM_PROT_ + +/* + * Types defined: + * + * vm_prot_t VM protection values. + */ + +typedef int vm_prot_t; + +/* + * Protection values, defined as bits within the vm_prot_t type + */ + +#define VM_PROT_NONE ((vm_prot_t) 0x00) + +#define VM_PROT_READ ((vm_prot_t) 0x01) /* read permission */ +#define VM_PROT_WRITE ((vm_prot_t) 0x02) /* write permission */ +#define VM_PROT_EXECUTE ((vm_prot_t) 0x04) /* execute permission */ + +/* + * The default protection for newly-created virtual memory + */ + +#define VM_PROT_DEFAULT (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE) + +/* + * The maximum privileges possible, for parameter checking. + */ + +#define VM_PROT_ALL (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE) + +#endif /* _VM_PROT_ */ diff --git a/sys/vm/vm_swap.c b/sys/vm/vm_swap.c new file mode 100644 index 00000000000..10b7523ae23 --- /dev/null +++ b/sys/vm/vm_swap.c @@ -0,0 +1,427 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 + */ + +#include +#include +#include +#include +#include +#include +#include /* XXX */ +#include +#include +#include + +#include + +/* + * Indirect driver for multi-controller paging. + */ + +int nswap, nswdev; +#ifdef SEQSWAP +int niswdev; /* number of interleaved swap devices */ +int niswap; /* size of interleaved swap area */ +#endif + +/* + * Set up swap devices. + * Initialize linked list of free swap + * headers. These do not actually point + * to buffers, but rather to pages that + * are being swapped in and out. 
+ */ +void +swapinit() +{ + register int i; + register struct buf *sp = swbuf; + register struct proc *p = &proc0; /* XXX */ + struct swdevt *swp; + int error; + + /* + * Count swap devices, and adjust total swap space available. + * Some of the space will not be countable until later (dynamically + * configurable devices) and some of the counted space will not be + * available until a swapon() system call is issued, both usually + * happen when the system goes multi-user. + * + * If using NFS for swap, swdevt[0] will already be bdevvp'd. XXX + */ +#ifdef SEQSWAP + nswdev = niswdev = 0; + nswap = niswap = 0; + /* + * All interleaved devices must come first + */ + for (swp = swdevt; swp->sw_dev != NODEV || swp->sw_vp != NULL; swp++) { + if (swp->sw_flags & SW_SEQUENTIAL) + break; + niswdev++; + if (swp->sw_nblks > niswap) + niswap = swp->sw_nblks; + } + niswap = roundup(niswap, dmmax); + niswap *= niswdev; + if (swdevt[0].sw_vp == NULL && + bdevvp(swdevt[0].sw_dev, &swdevt[0].sw_vp)) + panic("swapvp"); + /* + * The remainder must be sequential + */ + for ( ; swp->sw_dev != NODEV; swp++) { + if ((swp->sw_flags & SW_SEQUENTIAL) == 0) + panic("binit: mis-ordered swap devices"); + nswdev++; + if (swp->sw_nblks > 0) { + if (swp->sw_nblks % dmmax) + swp->sw_nblks -= (swp->sw_nblks % dmmax); + nswap += swp->sw_nblks; + } + } + nswdev += niswdev; + if (nswdev == 0) + panic("swapinit"); + nswap += niswap; +#else + nswdev = 0; + nswap = 0; + for (swp = swdevt; swp->sw_dev != NODEV || swp->sw_vp != NULL; swp++) { + nswdev++; + if (swp->sw_nblks > nswap) + nswap = swp->sw_nblks; + } + if (nswdev == 0) + panic("swapinit"); + if (nswdev > 1) + nswap = ((nswap + dmmax - 1) / dmmax) * dmmax; + nswap *= nswdev; + if (swdevt[0].sw_vp == NULL && + bdevvp(swdevt[0].sw_dev, &swdevt[0].sw_vp)) + panic("swapvp"); +#endif + if (nswap == 0) + printf("WARNING: no swap space found\n"); + else if (error = swfree(p, 0)) { + printf("swfree errno %d\n", error); /* XXX */ + panic("swapinit swfree 
0"); + } + + /* + * Now set up swap buffer headers. + */ + bswlist.b_actf = sp; + for (i = 0; i < nswbuf - 1; i++, sp++) { + sp->b_actf = sp + 1; + sp->b_rcred = sp->b_wcred = p->p_ucred; + sp->b_vnbufs.le_next = NOLIST; + } + sp->b_rcred = sp->b_wcred = p->p_ucred; + sp->b_vnbufs.le_next = NOLIST; + sp->b_actf = NULL; +} + +void +swstrategy(bp) + register struct buf *bp; +{ + int sz, off, seg, index; + register struct swdevt *sp; + struct vnode *vp; + +#ifdef GENERIC + /* + * A mini-root gets copied into the front of the swap + * and we run over top of the swap area just long + * enough for us to do a mkfs and restor of the real + * root (sure beats rewriting standalone restor). + */ +#define MINIROOTSIZE 4096 + if (rootdev == dumpdev) + bp->b_blkno += MINIROOTSIZE; +#endif + sz = howmany(bp->b_bcount, DEV_BSIZE); + if (bp->b_blkno + sz > nswap) { + bp->b_error = EINVAL; + bp->b_flags |= B_ERROR; + biodone(bp); + return; + } + if (nswdev > 1) { +#ifdef SEQSWAP + if (bp->b_blkno < niswap) { + if (niswdev > 1) { + off = bp->b_blkno % dmmax; + if (off+sz > dmmax) { + bp->b_error = EINVAL; + bp->b_flags |= B_ERROR; + biodone(bp); + return; + } + seg = bp->b_blkno / dmmax; + index = seg % niswdev; + seg /= niswdev; + bp->b_blkno = seg*dmmax + off; + } else + index = 0; + } else { + register struct swdevt *swp; + + bp->b_blkno -= niswap; + for (index = niswdev, swp = &swdevt[niswdev]; + swp->sw_dev != NODEV; + swp++, index++) { + if (bp->b_blkno < swp->sw_nblks) + break; + bp->b_blkno -= swp->sw_nblks; + } + if (swp->sw_dev == NODEV || + bp->b_blkno+sz > swp->sw_nblks) { + bp->b_error = swp->sw_dev == NODEV ? 
+ ENODEV : EINVAL; + bp->b_flags |= B_ERROR; + biodone(bp); + return; + } + } +#else + off = bp->b_blkno % dmmax; /* offset within the dmmax-sized chunk */ + if (off+sz > dmmax) { /* a transfer may not cross a chunk boundary */ + bp->b_error = EINVAL; + bp->b_flags |= B_ERROR; + biodone(bp); + return; + } + seg = bp->b_blkno / dmmax; + index = seg % nswdev; /* chunks rotate across the devices */ + seg /= nswdev; /* chunk number on the chosen device */ + bp->b_blkno = seg*dmmax + off; +#endif + } else + index = 0; + sp = &swdevt[index]; + if ((bp->b_dev = sp->sw_dev) == NODEV) + panic("swstrategy"); + if (sp->sw_vp == NULL) { /* no vnode recorded for this device */ + bp->b_error = ENODEV; + bp->b_flags |= B_ERROR; + biodone(bp); + return; + } + VHOLD(sp->sw_vp); + if ((bp->b_flags & B_READ) == 0) { /* write: move the output count to the swap vnode */ + if (vp = bp->b_vp) { + vp->v_numoutput--; + if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) { + vp->v_flag &= ~VBWAIT; + wakeup((caddr_t)&vp->v_numoutput); + } + } + sp->sw_vp->v_numoutput++; + } + if (bp->b_vp != NULL) + brelvp(bp); + bp->b_vp = sp->sw_vp; /* the buffer now refers to the swap device vnode */ + VOP_STRATEGY(bp); +} + +/* + * System call swapon(name) enables swapping on device name, + * which must be in the swdevt table. Return EBUSY + * if already swapping on this device.
+ */ +struct swapon_args { + char *name; +}; +/* ARGSUSED */ +int +swapon(p, uap, retval) + struct proc *p; + struct swapon_args *uap; + int *retval; +{ + register struct vnode *vp; + register struct swdevt *sp; + dev_t dev; + int error; + struct nameidata nd; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VBLK) { /* only block devices are accepted */ + vrele(vp); + return (ENOTBLK); + } + dev = (dev_t)vp->v_rdev; + if (major(dev) >= nblkdev) { + vrele(vp); + return (ENXIO); + } + for (sp = &swdevt[0]; sp->sw_dev != NODEV; sp++) { + if (sp->sw_dev == dev) { + if (sp->sw_flags & SW_FREED) { /* swapping already enabled here */ + vrele(vp); + return (EBUSY); + } + sp->sw_vp = vp; /* swfree() opens the device through sw_vp */ + if (error = swfree(p, sp - swdevt)) { + vrele(vp); /* NOTE(review): sw_vp still points at vp after this -- confirm */ + return (error); + } + return (0); /* success: vp is not released, sw_vp keeps it */ + } +#ifdef SEQSWAP + /* + * If we have reached a non-freed sequential device without + * finding what we are looking for, it is an error. + * That is because all interleaved devices must come first + * and sequential devices must be freed in order. + */ + if ((sp->sw_flags & (SW_SEQUENTIAL|SW_FREED)) == SW_SEQUENTIAL) + break; +#endif + } + vrele(vp); /* dev is not an eligible entry of swdevt */ + return (EINVAL); +} + +/* + * Swfree(index) frees the index'th portion of the swap map. + * Each of the nswdev devices provides 1/nswdev'th of the swap + * space, which is laid out with blocks of dmmax pages circularly + * among the devices. Returns the VOP_OPEN error, ENXIO when the device size cannot be obtained, and 0 otherwise. + */ +int +swfree(p, index) + struct proc *p; + int index; +{ + register struct swdevt *sp; + register swblk_t vsbase; + register long blk; + struct vnode *vp; + register swblk_t dvbase; + register int nblks; + int error; + + sp = &swdevt[index]; + vp = sp->sw_vp; + if (error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)) + return (error); + sp->sw_flags |= SW_FREED; /* cleared again below if the device proves unusable */ + nblks = sp->sw_nblks; + /* + * Some devices may not exist til after boot time. + * If so, their nblk count will be 0.
+ */ + if (nblks <= 0) { + int perdev; + dev_t dev = sp->sw_dev; + + if (bdevsw[major(dev)].d_psize == 0 || + (nblks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) { + (void) VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p); + sp->sw_flags &= ~SW_FREED; + return (ENXIO); + } +#ifdef SEQSWAP + if (index < niswdev) { + perdev = niswap / niswdev; /* share of one interleaved device */ + if (nblks > perdev) + nblks = perdev; + } else { + if (nblks % dmmax) + nblks -= (nblks % dmmax); /* round down to a multiple of dmmax */ + nswap += nblks; /* a sequential device sized here adds to the total */ + } +#else + perdev = nswap / nswdev; /* share of one device */ + if (nblks > perdev) + nblks = perdev; +#endif + sp->sw_nblks = nblks; + } + if (nblks == 0) { + (void) VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p); + sp->sw_flags &= ~SW_FREED; + return (0); /* XXX error? */ + } +#ifdef SEQSWAP + if (sp->sw_flags & SW_SEQUENTIAL) { + register struct swdevt *swp; + + blk = niswap; /* sequential devices are placed after the interleaved area */ + for (swp = &swdevt[niswdev]; swp != sp; swp++) + blk += swp->sw_nblks; + rmfree(swapmap, nblks, blk); + return (0); + } +#endif + for (dvbase = 0; dvbase < nblks; dvbase += dmmax) { + blk = nblks - dvbase; /* blocks of this device not yet handled */ +#ifdef SEQSWAP + if ((vsbase = index*dmmax + dvbase*niswdev) >= niswap) + panic("swfree"); +#else + if ((vsbase = index*dmmax + dvbase*nswdev) >= nswap) + panic("swfree"); +#endif + if (blk > dmmax) + blk = dmmax; /* at most one dmmax chunk per pass */ + if (vsbase == 0) { + /* + * First of all chunks... initialize the swapmap. + * Don't use the first cluster of the device + * in case it starts with a label or boot block. + */ + rminit(swapmap, blk - ctod(CLSIZE), + vsbase + ctod(CLSIZE), "swap", nswapmap); + } else if (dvbase == 0) { + /* + * Don't use the first cluster of the device + * in case it starts with a label or boot block. + */ + rmfree(swapmap, blk - ctod(CLSIZE), + vsbase + ctod(CLSIZE)); + } else + rmfree(swapmap, blk, vsbase); + } + return (0); +} diff --git a/sys/vm/vm_unix.c b/sys/vm/vm_unix.c new file mode 100644 index 00000000000..3d49ea71718 --- /dev/null +++ b/sys/vm/vm_unix.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: Utah $Hdr: vm_unix.c 1.1 89/11/07$ + * + * @(#)vm_unix.c 8.1 (Berkeley) 6/11/93 + */ + +/* + * Traditional sbrk/grow interface to VM + */ +#include +#include +#include +#include + +#include + +struct obreak_args { + char *nsiz; /* requested end address of the data area */ +}; +/* ARGSUSED */ +int +obreak(p, uap, retval) + struct proc *p; + struct obreak_args *uap; + int *retval; +{ + register struct vmspace *vm = p->p_vmspace; + vm_offset_t new, old; + int rv; + register int diff; + + old = (vm_offset_t)vm->vm_daddr; + new = round_page(uap->nsiz); /* requested end address, rounded up to a page */ + if ((int)(new - old) > p->p_rlimit[RLIMIT_DATA].rlim_cur) + return(ENOMEM); /* would exceed the current RLIMIT_DATA limit */ + old = round_page(old + ctob(vm->vm_dsize)); /* old becomes the current page-rounded end */ + diff = new - old; + if (diff > 0) { /* grow */ + rv = vm_allocate(&vm->vm_map, &old, diff, FALSE); + if (rv != KERN_SUCCESS) { + uprintf("sbrk: grow failed, return = %d\n", rv); + return(ENOMEM); + } + vm->vm_dsize += btoc(diff); + } else if (diff < 0) { /* shrink */ + diff = -diff; + rv = vm_deallocate(&vm->vm_map, new, diff); + if (rv != KERN_SUCCESS) { + uprintf("sbrk: shrink failed, return = %d\n", rv); + return(ENOMEM); + } + vm->vm_dsize -= btoc(diff); + } + return(0); +} + +/* + * Enlarge the "stack segment" to include the specified + * stack pointer for the process. Returns 1 when sp is already covered or the recorded stack size was raised to cover it, 0 when nothing was changed. + */ +int +grow(p, sp) + struct proc *p; + unsigned sp; +{ + register struct vmspace *vm = p->p_vmspace; + register int si; + + /* + * For user defined stacks (from sendsig). + */ + if (sp < (unsigned)vm->vm_maxsaddr) + return (0); + /* + * For common case of already allocated (from trap). + */ + if (sp >= USRSTACK - ctob(vm->vm_ssize)) + return (1); + /* + * Really need to check vs limit and increment stack size if ok.
+ */ + si = clrnd(btoc(USRSTACK-sp) - vm->vm_ssize); /* extra size needed to reach sp, in vm_ssize units */ + if (vm->vm_ssize + si > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur)) + return (0); /* would exceed RLIMIT_STACK */ + vm->vm_ssize += si; /* only the recorded size changes; no mapping call is made here */ + return (1); +} + +struct ovadvise_args { + int anom; +}; +/* ARGSUSED */ +int +ovadvise(p, uap, retval) + struct proc *p; + struct ovadvise_args *uap; + int *retval; +{ + + return (EINVAL); /* always fails; no argument is examined */ +} diff --git a/sys/vm/vm_user.c b/sys/vm/vm_user.c new file mode 100644 index 00000000000..20172c6c651 --- /dev/null +++ b/sys/vm/vm_user.c @@ -0,0 +1,312 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * The Mach Operating System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vm_user.c 8.2 (Berkeley) 1/12/94 + * + * + * Copyright (c) 1987, 1990 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Avadis Tevanian, Jr., Michael Wayne Young + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * User-exported virtual memory functions. + */ + +#include +#include +#include + +#include + +simple_lock_data_t vm_alloc_lock; /* XXX */ + +#ifdef MACHVMCOMPAT +/* + * BSD style syscall interfaces to MACH calls + * All return MACH return values. 
+ */ +struct svm_allocate_args { + vm_map_t map; /* overwritten by the handler */ + vm_offset_t *addr; /* user pointer: address in, result out */ + vm_size_t size; + boolean_t anywhere; +}; +/* ARGSUSED */ +int +svm_allocate(p, uap, retval) + struct proc *p; + struct svm_allocate_args *uap; + int *retval; +{ + vm_offset_t addr; + int rv; + + uap->map = p->p_map; /* XXX the map argument is ignored; p->p_map is always used */ + + if (copyin((caddr_t)uap->addr, (caddr_t)&addr, sizeof (addr))) + rv = KERN_INVALID_ARGUMENT; + else + rv = vm_allocate(uap->map, &addr, uap->size, uap->anywhere); + if (rv == KERN_SUCCESS) { + if (copyout((caddr_t)&addr, (caddr_t)uap->addr, sizeof(addr))) + rv = KERN_INVALID_ARGUMENT; /* NOTE(review): the allocation is not undone here -- confirm */ + } + return((int)rv); +} + +struct svm_deallocate_args { + vm_map_t map; + vm_offset_t addr; + vm_size_t size; +}; +/* ARGSUSED */ +int +svm_deallocate(p, uap, retval) + struct proc *p; + struct svm_deallocate_args *uap; + int *retval; +{ + int rv; + + uap->map = p->p_map; /* XXX the map argument is ignored; p->p_map is always used */ + rv = vm_deallocate(uap->map, uap->addr, uap->size); + return((int)rv); +} + +struct svm_inherit_args { + vm_map_t map; + vm_offset_t addr; + vm_size_t size; + vm_inherit_t inherit; +}; +/* ARGSUSED */ +int +svm_inherit(p, uap, retval) + struct proc *p; + struct svm_inherit_args *uap; + int *retval; +{ + int rv; + + uap->map = p->p_map; /* XXX the map argument is ignored; p->p_map is always used */ + rv = vm_inherit(uap->map, uap->addr, uap->size, uap->inherit); + return((int)rv); +} + +struct svm_protect_args { + vm_map_t map; + vm_offset_t addr; + vm_size_t size; + boolean_t setmax; + vm_prot_t prot; +}; +/* ARGSUSED */ +int +svm_protect(p, uap, retval) + struct proc *p; + struct svm_protect_args *uap; + int *retval; +{ + int rv; + + uap->map = p->p_map; /* XXX the map argument is ignored; p->p_map is always used */ + rv = vm_protect(uap->map, uap->addr, uap->size, uap->setmax, uap->prot); + return((int)rv); +} + +/* + * vm_inherit sets the inheritance of the specified range in the + * specified map. The range is widened outward to whole pages.
+ */ +int +vm_inherit(map, start, size, new_inheritance) + register vm_map_t map; + vm_offset_t start; + vm_size_t size; + vm_inherit_t new_inheritance; +{ + if (map == NULL) + return(KERN_INVALID_ARGUMENT); + + return(vm_map_inherit(map, trunc_page(start), round_page(start+size), new_inheritance)); +} + +/* + * vm_protect sets the protection of the specified range in the + * specified map. The range is widened outward to whole pages. + */ + +int +vm_protect(map, start, size, set_maximum, new_protection) + register vm_map_t map; + vm_offset_t start; + vm_size_t size; + boolean_t set_maximum; + vm_prot_t new_protection; +{ + if (map == NULL) + return(KERN_INVALID_ARGUMENT); + + return(vm_map_protect(map, trunc_page(start), round_page(start+size), new_protection, set_maximum)); /* note: vm_map_protect takes the protection before the set_maximum flag */ +} +#endif + +/* + * vm_allocate allocates "zero fill" memory in the specified + * map. A zero size succeeds at once with *addr set to 0. + */ +int +vm_allocate(map, addr, size, anywhere) + register vm_map_t map; + register vm_offset_t *addr; + register vm_size_t size; + boolean_t anywhere; +{ + int result; + + if (map == NULL) + return(KERN_INVALID_ARGUMENT); + if (size == 0) { + *addr = 0; + return(KERN_SUCCESS); + } + + if (anywhere) + *addr = vm_map_min(map); /* the incoming *addr is discarded */ + else + *addr = trunc_page(*addr); + size = round_page(size); + + result = vm_map_find(map, NULL, (vm_offset_t) 0, addr, size, anywhere); + + return(result); +} + +/* + * vm_deallocate deallocates the specified range of addresses in the + * specified address map. A zero size is a successful no-op; otherwise the range is widened outward to whole pages. + */ +int +vm_deallocate(map, start, size) + register vm_map_t map; + vm_offset_t start; + vm_size_t size; +{ + if (map == NULL) + return(KERN_INVALID_ARGUMENT); + + if (size == (vm_offset_t) 0) + return(KERN_SUCCESS); + + return(vm_map_remove(map, trunc_page(start), round_page(start+size))); +} + +/* + * Similar to vm_allocate but assigns an explicit pager.
+ */ +int +vm_allocate_with_pager(map, addr, size, anywhere, pager, poffset, internal) + register vm_map_t map; + register vm_offset_t *addr; + register vm_size_t size; + boolean_t anywhere; + vm_pager_t pager; + vm_offset_t poffset; + boolean_t internal; +{ + register vm_object_t object; + register int result; + + if (map == NULL) + return(KERN_INVALID_ARGUMENT); + + *addr = trunc_page(*addr); /* truncated whether or not anywhere is set */ + size = round_page(size); + + /* + * Lookup the pager/paging-space in the object cache. + * If it's not there, then create a new object and, unless it + * is internal, cache it. + */ + object = vm_object_lookup(pager); + cnt.v_lookups++; + if (object == NULL) { + object = vm_object_allocate(size); + /* + * From Mike Hibler: "unnamed anonymous objects should never + * be on the hash list ... For now you can just change + * vm_allocate_with_pager to not do vm_object_enter if this + * is an internal object ..." + */ + if (!internal) + vm_object_enter(object, pager); + } else + cnt.v_hits++; /* the lookup above found an existing object */ + if (internal) + object->flags |= OBJ_INTERNAL; + else { + object->flags &= ~OBJ_INTERNAL; + cnt.v_nzfod -= atop(size); /* NOTE(review): presumably these pages are not zero-fill -- confirm */ + } + + result = vm_map_find(map, object, poffset, addr, size, anywhere); + if (result != KERN_SUCCESS) + vm_object_deallocate(object); /* mapping failed: give the object back */ + else if (pager != NULL) + vm_object_setpager(object, pager, (vm_offset_t) 0, TRUE); + return(result); +} diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c new file mode 100644 index 00000000000..9c2f8260cfb --- /dev/null +++ b/sys/vm/vnode_pager.c @@ -0,0 +1,580 @@ +/* + * Copyright (c) 1990 University of Utah. + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vnode_pager.c 8.8 (Berkeley) 2/13/94 + */ + +/* + * Page to/from files (vnodes). 
+ * + * TODO: + * pageouts + * fix credential use (uses current process credentials now) + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +struct pagerlst vnode_pager_list; /* list of managed vnodes */ + +#ifdef DEBUG +int vpagerdebug = 0x00; /* bitmask of the VDB_* flags below; all off by default */ +#define VDB_FOLLOW 0x01 /* trace entry to the pager routines */ +#define VDB_INIT 0x02 +#define VDB_IO 0x04 +#define VDB_FAIL 0x08 +#define VDB_ALLOC 0x10 /* report pager allocation and lookup */ +#define VDB_SIZE 0x20 +#endif + +static vm_pager_t vnode_pager_alloc + __P((caddr_t, vm_size_t, vm_prot_t, vm_offset_t)); +static void vnode_pager_cluster + __P((vm_pager_t, vm_offset_t, + vm_offset_t *, vm_offset_t *)); +static void vnode_pager_dealloc __P((vm_pager_t)); +static int vnode_pager_getpage + __P((vm_pager_t, vm_page_t *, int, boolean_t)); +static boolean_t vnode_pager_haspage __P((vm_pager_t, vm_offset_t)); +static void vnode_pager_init __P((void)); +static int vnode_pager_io + __P((vn_pager_t, vm_page_t *, int, + boolean_t, enum uio_rw)); +static boolean_t vnode_pager_putpage + __P((vm_pager_t, vm_page_t *, int, boolean_t)); + +struct pagerops vnodepagerops = { + vnode_pager_init, + vnode_pager_alloc, + vnode_pager_dealloc, + vnode_pager_getpage, + vnode_pager_putpage, + vnode_pager_haspage, + vnode_pager_cluster +}; + +static void +vnode_pager_init() +{ +#ifdef DEBUG + if (vpagerdebug & VDB_FOLLOW) + printf("vnode_pager_init()\n"); +#endif + TAILQ_INIT(&vnode_pager_list); +} + +/* + * Allocate (or lookup) pager for a vnode. Returns NULL if the handle is NULL or the attributes of the vnode cannot be fetched. + * Handle is a vnode pointer; size and prot are only printed under DEBUG and foff is unused. + */ +static vm_pager_t +vnode_pager_alloc(handle, size, prot, foff) + caddr_t handle; + vm_size_t size; + vm_prot_t prot; + vm_offset_t foff; +{ + register vm_pager_t pager; + register vn_pager_t vnp; + vm_object_t object; + struct vattr vattr; + struct vnode *vp; + struct proc *p = curproc; /* XXX */ + +#ifdef DEBUG + if (vpagerdebug & (VDB_FOLLOW|VDB_ALLOC)) + printf("vnode_pager_alloc(%x, %x, %x)\n", handle, size, prot); +#endif + /* + * Pageout to vnode, no can do yet.
+ */ + if (handle == NULL) + return(NULL); + + /* + * Vnodes keep a pointer to any associated pager so no need to + * lookup with vm_pager_lookup. + */ + vp = (struct vnode *)handle; + pager = (vm_pager_t)vp->v_vmdata; + if (pager == NULL) { /* no pager recorded on this vnode yet: build one */ + /* + * Allocate pager structures + */ + pager = (vm_pager_t)malloc(sizeof *pager, M_VMPAGER, M_WAITOK); + if (pager == NULL) + return(NULL); + vnp = (vn_pager_t)malloc(sizeof *vnp, M_VMPGDATA, M_WAITOK); + if (vnp == NULL) { + free((caddr_t)pager, M_VMPAGER); + return(NULL); + } + /* + * And an object of the appropriate size + */ + if (VOP_GETATTR(vp, &vattr, p->p_ucred, p) == 0) { + object = vm_object_allocate(round_page(vattr.va_size)); + vm_object_enter(object, pager); + vm_object_setpager(object, pager, 0, TRUE); + } else { + free((caddr_t)vnp, M_VMPGDATA); + free((caddr_t)pager, M_VMPAGER); + return(NULL); + } + /* + * Hold a reference to the vnode and initialize pager data. + */ + VREF(vp); + vnp->vnp_flags = 0; + vnp->vnp_vp = vp; + vnp->vnp_size = vattr.va_size; /* unrounded size; the object above was sized to a page multiple */ + TAILQ_INSERT_TAIL(&vnode_pager_list, pager, pg_list); + pager->pg_handle = handle; + pager->pg_type = PG_VNODE; + pager->pg_flags = 0; + pager->pg_ops = &vnodepagerops; + pager->pg_data = vnp; + vp->v_vmdata = (caddr_t)pager; + } else { + /* + * vm_object_lookup() will remove the object from the + * cache if found and also gain a reference to the object.
+ */ + object = vm_object_lookup(pager); /* NOTE(review): in this branch the result is only read by the DEBUG printf below */ +#ifdef DEBUG + vnp = (vn_pager_t)pager->pg_data; +#endif + } +#ifdef DEBUG + if (vpagerdebug & VDB_ALLOC) + printf("vnode_pager_setup: vp %x sz %x pager %x object %x\n", + vp, vnp->vnp_size, pager, object); /* NOTE(review): the message names vnode_pager_setup, not this function */ +#endif + return(pager); +} + +static void +vnode_pager_dealloc(pager) + vm_pager_t pager; +{ + register vn_pager_t vnp = (vn_pager_t)pager->pg_data; + register struct vnode *vp; +#ifdef NOTDEF + struct proc *p = curproc; /* XXX only used by the disabled VOP_FSYNC call below */ +#endif + +#ifdef DEBUG + if (vpagerdebug & VDB_FOLLOW) + printf("vnode_pager_dealloc(%x)\n", pager); +#endif + if (vp = vnp->vnp_vp) { + vp->v_vmdata = NULL; /* the vnode no longer records a pager */ + vp->v_flag &= ~VTEXT; +#if NOTDEF + /* can hang if done at reboot on NFS FS; NOTE(review): tested with #if here but #ifdef above */ + (void) VOP_FSYNC(vp, p->p_ucred, p); +#endif + vrele(vp); /* pairs with the VREF() done in vnode_pager_alloc */ + } + TAILQ_REMOVE(&vnode_pager_list, pager, pg_list); + free((caddr_t)vnp, M_VMPGDATA); + free((caddr_t)pager, M_VMPAGER); +} + +static int +vnode_pager_getpage(pager, mlist, npages, sync) + vm_pager_t pager; + vm_page_t *mlist; + int npages; + boolean_t sync; +{ + +#ifdef DEBUG + if (vpagerdebug & VDB_FOLLOW) + printf("vnode_pager_getpage(%x, %x, %x, %x)\n", + pager, mlist, npages, sync); +#endif + return(vnode_pager_io((vn_pager_t)pager->pg_data, + mlist, npages, sync, UIO_READ)); /* pagein is a read through the shared I/O routine */ +} + +static boolean_t +vnode_pager_putpage(pager, mlist, npages, sync) + vm_pager_t pager; + vm_page_t *mlist; + int npages; + boolean_t sync; +{ + int err; + +#ifdef DEBUG + if (vpagerdebug & VDB_FOLLOW) + printf("vnode_pager_putpage(%x, %x, %x, %x)\n", + pager, mlist, npages, sync); +#endif + if (pager == NULL) + return (FALSE); /* ??? */ + err = vnode_pager_io((vn_pager_t)pager->pg_data, + mlist, npages, sync, UIO_WRITE); + /* + * If the operation was successful, mark the pages clean.
+ */ + if (err == VM_PAGER_OK) { + while (npages--) { + (*mlist)->flags |= PG_CLEAN; + pmap_clear_modify(VM_PAGE_TO_PHYS(*mlist)); + mlist++; + } + } + return(err); +} + +static boolean_t +vnode_pager_haspage(pager, offset) + vm_pager_t pager; + vm_offset_t offset; +{ + register vn_pager_t vnp = (vn_pager_t)pager->pg_data; + daddr_t bn; + int err; + +#ifdef DEBUG + if (vpagerdebug & VDB_FOLLOW) + printf("vnode_pager_haspage(%x, %x)\n", pager, offset); +#endif + + /* + * Offset beyond end of file, do not have the page + * Lock the vnode first to make sure we have the most recent + * version of the size. + */ + VOP_LOCK(vnp->vnp_vp); + if (offset >= vnp->vnp_size) { + VOP_UNLOCK(vnp->vnp_vp); +#ifdef DEBUG + if (vpagerdebug & (VDB_FAIL|VDB_SIZE)) + printf("vnode_pager_haspage: pg %x, off %x, size %x\n", + pager, offset, vnp->vnp_size); +#endif + return(FALSE); + } + + /* + * Read the index to find the disk block to read + * from. If there is no block, report that we don't + * have this data. + * + * Assumes that the vnode has whole page or nothing. + */ + err = VOP_BMAP(vnp->vnp_vp, + offset / vnp->vnp_vp->v_mount->mnt_stat.f_iosize, + (struct vnode **)0, &bn, NULL); + VOP_UNLOCK(vnp->vnp_vp); + if (err) { +#ifdef DEBUG + if (vpagerdebug & VDB_FAIL) + printf("vnode_pager_haspage: BMAP err %d, pg %x, off %x\n", + err, pager, offset); +#endif + return(TRUE); + } + return((long)bn < 0 ? 
FALSE : TRUE); +} + +static void +vnode_pager_cluster(pager, offset, loffset, hoffset) + vm_pager_t pager; + vm_offset_t offset; + vm_offset_t *loffset; + vm_offset_t *hoffset; +{ + vn_pager_t vnp = (vn_pager_t)pager->pg_data; + vm_offset_t loff, hoff; + +#ifdef DEBUG + if (vpagerdebug & VDB_FOLLOW) + printf("vnode_pager_cluster(%x, %x) ", pager, offset); +#endif + loff = offset; + if (loff >= vnp->vnp_size) + panic("vnode_pager_cluster: bad offset"); + /* + * XXX could use VOP_BMAP to get maxcontig value + */ + hoff = loff + MAXBSIZE; + if (hoff > round_page(vnp->vnp_size)) + hoff = round_page(vnp->vnp_size); + + *loffset = loff; + *hoffset = hoff; +#ifdef DEBUG + if (vpagerdebug & VDB_FOLLOW) + printf("returns [%x-%x]\n", loff, hoff); +#endif +} + +/* + * (XXX) + * Lets the VM system know about a change in size for a file. + * If this vnode is mapped into some address space (i.e. we have a pager + * for it) we adjust our own internal size and flush any cached pages in + * the associated object that are affected by the size change. + * + * Note: this routine may be invoked as a result of a pager put + * operation (possibly at object termination time), so we must be careful. + */ +void +vnode_pager_setsize(vp, nsize) + struct vnode *vp; + u_long nsize; +{ + register vn_pager_t vnp; + register vm_object_t object; + vm_pager_t pager; + + /* + * Not a mapped vnode + */ + if (vp == NULL || vp->v_type != VREG || vp->v_vmdata == NULL) + return; + /* + * Hasn't changed size + */ + pager = (vm_pager_t)vp->v_vmdata; + vnp = (vn_pager_t)pager->pg_data; + if (nsize == vnp->vnp_size) + return; + /* + * No object. + * This can happen during object termination since + * vm_object_page_clean is called after the object + * has been removed from the hash table, and clean + * may cause vnode write operations which can wind + * up back here. 
+ */ + object = vm_object_lookup(pager); + if (object == NULL) + return; + +#ifdef DEBUG + if (vpagerdebug & (VDB_FOLLOW|VDB_SIZE)) + printf("vnode_pager_setsize: vp %x obj %x osz %d nsz %d\n", + vp, object, vnp->vnp_size, nsize); +#endif + /* + * File has shrunk. + * Toss any cached pages beyond the new EOF. + */ + if (nsize < vnp->vnp_size) { + vm_object_lock(object); + vm_object_page_remove(object, + (vm_offset_t)nsize, vnp->vnp_size); + vm_object_unlock(object); + } + vnp->vnp_size = (vm_offset_t)nsize; + vm_object_deallocate(object); +} + +void +vnode_pager_umount(mp) + register struct mount *mp; +{ + register vm_pager_t pager, npager; + struct vnode *vp; + + for (pager = vnode_pager_list.tqh_first; pager != NULL; pager = npager){ + /* + * Save the next pointer now since uncaching may + * terminate the object and render pager invalid + */ + npager = pager->pg_list.tqe_next; + vp = ((vn_pager_t)pager->pg_data)->vnp_vp; + if (mp == (struct mount *)0 || vp->v_mount == mp) { + VOP_LOCK(vp); + (void) vnode_pager_uncache(vp); + VOP_UNLOCK(vp); + } + } +} + +/* + * Remove vnode associated object from the object cache. + * + * XXX unlock the vnode if it is currently locked. + * We must do this since uncaching the object may result in its + * destruction which may initiate paging activity which may necessitate + * re-locking the vnode. + */ +boolean_t +vnode_pager_uncache(vp) + register struct vnode *vp; +{ + register vm_object_t object; + boolean_t uncached; + vm_pager_t pager; + + /* + * Not a mapped vnode + */ + pager = (vm_pager_t)vp->v_vmdata; + if (pager == NULL) + return (TRUE); +#ifdef DEBUG + if (!VOP_ISLOCKED(vp)) { + extern int (**nfsv2_vnodeop_p)(); + + if (vp->v_op != nfsv2_vnodeop_p) + panic("vnode_pager_uncache: vnode not locked!"); + } +#endif + /* + * Must use vm_object_lookup() as it actually removes + * the object from the cache list. 
+ */ + object = vm_object_lookup(pager); + if (object) { + uncached = (object->ref_count <= 1); + VOP_UNLOCK(vp); + pager_cache(object, FALSE); + VOP_LOCK(vp); + } else + uncached = TRUE; + return(uncached); +} + +static int +vnode_pager_io(vnp, mlist, npages, sync, rw) + register vn_pager_t vnp; + vm_page_t *mlist; + int npages; + boolean_t sync; + enum uio_rw rw; +{ + struct uio auio; + struct iovec aiov; + vm_offset_t kva, foff; + int error, size; + struct proc *p = curproc; /* XXX */ + + /* XXX */ + vm_page_t m; + if (npages != 1) + panic("vnode_pager_io: cannot handle multiple pages"); + m = *mlist; + /* XXX */ + +#ifdef DEBUG + if (vpagerdebug & VDB_FOLLOW) + printf("vnode_pager_io(%x, %x, %c): vnode %x\n", + vnp, m, rw == UIO_READ ? 'R' : 'W', vnp->vnp_vp); +#endif + foff = m->offset + m->object->paging_offset; + /* + * Allocate a kernel virtual address and initialize so that + * we can use VOP_READ/WRITE routines. + */ + kva = vm_pager_map_pages(mlist, npages, sync); + if (kva == NULL) + return(VM_PAGER_AGAIN); + /* + * After all of the potentially blocking operations have been + * performed, we can do the size checks: + * read beyond EOF (returns error) + * short read + */ + VOP_LOCK(vnp->vnp_vp); + if (foff >= vnp->vnp_size) { + VOP_UNLOCK(vnp->vnp_vp); + vm_pager_unmap_pages(kva, npages); +#ifdef DEBUG + if (vpagerdebug & VDB_SIZE) + printf("vnode_pager_io: vp %x, off %d size %d\n", + vnp->vnp_vp, foff, vnp->vnp_size); +#endif + return(VM_PAGER_BAD); + } + if (foff + PAGE_SIZE > vnp->vnp_size) + size = vnp->vnp_size - foff; + else + size = PAGE_SIZE; + aiov.iov_base = (caddr_t)kva; + aiov.iov_len = size; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = foff; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = rw; + auio.uio_resid = size; + auio.uio_procp = (struct proc *)0; +#ifdef DEBUG + if (vpagerdebug & VDB_IO) + printf("vnode_pager_io: vp %x kva %x foff %x size %x", + vnp->vnp_vp, kva, foff, size); +#endif + if (rw == UIO_READ) + 
error = VOP_READ(vnp->vnp_vp, &auio, 0, p->p_ucred); + else + error = VOP_WRITE(vnp->vnp_vp, &auio, 0, p->p_ucred); + VOP_UNLOCK(vnp->vnp_vp); +#ifdef DEBUG + if (vpagerdebug & VDB_IO) { + if (error || auio.uio_resid) + printf(" returns error %x, resid %x", + error, auio.uio_resid); + printf("\n"); + } +#endif + if (!error) { + register int count = size - auio.uio_resid; + + if (count == 0) + error = EINVAL; + else if (count != PAGE_SIZE && rw == UIO_READ) + bzero((void *)(kva + count), PAGE_SIZE - count); + } + vm_pager_unmap_pages(kva, npages); + return (error ? VM_PAGER_ERROR : VM_PAGER_OK); +} diff --git a/sys/vm/vnode_pager.h b/sys/vm/vnode_pager.h new file mode 100644 index 00000000000..95c9545452a --- /dev/null +++ b/sys/vm/vnode_pager.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 1990 University of Utah. + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)vnode_pager.h	8.1 (Berkeley) 6/11/93
+ */
+
+#ifndef	_VNODE_PAGER_
+#define	_VNODE_PAGER_	1
+
+/*
+ * VNODE pager private data: one per vnode pager, hung off pager->pg_data.
+ */
+struct vnpager {
+	int		vnp_flags;	/* flags (presumably the VNP_* bits below) */
+	struct vnode	*vnp_vp;	/* vnode backing the pager */
+	vm_size_t	vnp_size;	/* file size last recorded for the vnode */
+};
+typedef struct vnpager	*vn_pager_t;
+
+#define VN_PAGER_NULL	((vn_pager_t)0)
+
+#define	VNP_PAGING	0x01		/* vnode used for pageout */
+#define	VNP_CACHED	0x02		/* vnode is cached */
+
+#endif	/* _VNODE_PAGER_ */