From 5ee847d3ac9530cc0132b3fe16fa8063cd77e3aa Mon Sep 17 00:00:00 2001 From: Robert Watson Date: Sun, 19 Jul 2009 14:20:53 +0000 Subject: [PATCH] Reimplement and/or implement vnet list locking by replacing a mostly unused custom mutex/condvar-based sleep locks with two locks: an rwlock (for non-sleeping use) and sxlock (for sleeping use). Either acquired for read is sufficient to stabilize the vnet list, but both must be acquired for write to modify the list. Replace previous no-op read locking macros, used in various places in the stack, with actual locking to prevent race conditions. Callers must declare when they may perform unbounded sleeps or not when selecting how to lock. Refactor vnet sysinits so that the vnet list and locks are initialized before kernel modules are linked, as the kernel linker will use them for modules loaded by the boot loader. Update various consumers of these KPIs based on whether they may sleep or not. Reviewed by: bz Approved by: re (kib) --- sys/kern/kern_vimage.c | 48 +++++++++++++++++++++++------------------ sys/net/if.c | 4 ++-- sys/netgraph/ng_gif.c | 4 ++-- sys/netinet/igmp.c | 8 +++---- sys/netinet/in_pcb.c | 4 ++-- sys/netinet/in_rmx.c | 4 ++-- sys/netinet/ip_input.c | 8 +++---- sys/netinet/tcp_subr.c | 8 +++---- sys/netinet/tcp_timer.c | 4 ++-- sys/netinet6/frag6.c | 12 ++++++----- sys/netinet6/mld6.c | 8 +++---- sys/netipsec/key.c | 4 ++-- sys/sys/kernel.h | 5 +++-- sys/sys/vimage.h | 42 ++++++++++++++++++++++++++++-------- 14 files changed, 98 insertions(+), 65 deletions(-) diff --git a/sys/kern/kern_vimage.c b/sys/kern/kern_vimage.c index 328b2bac1a1..f3d4a867d82 100644 --- a/sys/kern/kern_vimage.c +++ b/sys/kern/kern_vimage.c @@ -58,23 +58,22 @@ static void vnet_mod_complete_registration(struct vnet_modlink *); static int vnet_mod_constructor(struct vnet_modlink *); static int vnet_mod_destructor(struct vnet_modlink *); -#define VNET_LIST_WLOCK() \ - mtx_lock(&vnet_list_refc_mtx); \ - while (vnet_list_refc != 0) \ - cv_wait(&vnet_list_condvar, &vnet_list_refc_mtx); +struct rwlock vnet_rwlock; +struct sx vnet_sxlock; -#define VNET_LIST_WUNLOCK() \ - mtx_unlock(&vnet_list_refc_mtx); +#define VNET_LIST_WLOCK() do { \ + sx_xlock(&vnet_sxlock); \ + rw_wlock(&vnet_rwlock); \ +} while (0) + +#define VNET_LIST_WUNLOCK() do { \ + rw_wunlock(&vnet_rwlock); \ + sx_xunlock(&vnet_sxlock); \ +} while (0) struct vnet_list_head vnet_head; - -struct cv vnet_list_condvar; -struct mtx vnet_list_refc_mtx; -int vnet_list_refc = 0; - struct vnet *vnet0; - /* * Move an ifnet to or from another vnet, specified by the jail id. */ @@ -373,17 +372,23 @@ vnet_foreach(void (*vnet_foreach_fn)(struct vnet *, void *), void *arg) } static void -vi_init(void *unused) +vnet_init_prelink(void *arg) +{ + + rw_init(&vnet_rwlock, "vnet_rwlock"); + sx_init(&vnet_sxlock, "vnet_sxlock"); + LIST_INIT(&vnet_head); +} +SYSINIT(vnet_init_prelink, SI_SUB_VNET_PRELINK, SI_ORDER_FIRST, + vnet_init_prelink, NULL); + +static void +vnet0_init(void *arg) { TAILQ_INIT(&vnet_modlink_head); TAILQ_INIT(&vnet_modpending_head); - LIST_INIT(&vnet_head); - - mtx_init(&vnet_list_refc_mtx, "vnet_list_refc_mtx", NULL, MTX_DEF); - cv_init(&vnet_list_condvar, "vnet_list_condvar"); - /* * We MUST clear curvnet in vi_init_done() before going SMP, * otherwise CURVNET_SET() macros would scream about unnecessary @@ -391,9 +396,10 @@ vi_init(void *unused) */ curvnet = prison0.pr_vnet = vnet0 = vnet_alloc(); } +SYSINIT(vnet0_init, SI_SUB_VNET, SI_ORDER_FIRST, vnet0_init, NULL); static void -vi_init_done(void *unused) +vnet_init_done(void *unused) { struct vnet_modlink *vml_iter; @@ -411,8 +417,8 @@ vi_init_done(void *unused) panic("going nowhere without my vnet modules!"); } -SYSINIT(vimage, SI_SUB_VIMAGE, SI_ORDER_FIRST, vi_init, NULL); -SYSINIT(vimage_done, SI_SUB_VIMAGE_DONE, SI_ORDER_FIRST, vi_init_done, NULL); +SYSINIT(vnet_init_done, SI_SUB_VNET_DONE, SI_ORDER_FIRST, vnet_init_done, + NULL); #ifdef DDB DB_SHOW_COMMAND(vnets, db_show_vnets) diff --git a/sys/net/if.c b/sys/net/if.c index e779e60e5cf..a26774722bc 100644 --- a/sys/net/if.c +++ b/sys/net/if.c @@ -1793,8 +1793,8 @@ if_slowtimo(void *arg) struct ifnet *ifp; int s = splimp(); + VNET_LIST_RLOCK_NOSLEEP(); IFNET_RLOCK(); - VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { @@ -1805,8 +1805,8 @@ if_slowtimo(void *arg) } CURVNET_RESTORE(); } - VNET_LIST_RUNLOCK(); IFNET_RUNLOCK(); + VNET_LIST_RUNLOCK_NOSLEEP(); splx(s); timeout(if_slowtimo, (void *)0, hz / IFNET_SLOWHZ); } diff --git a/sys/netgraph/ng_gif.c b/sys/netgraph/ng_gif.c index f099d614d6a..de2ded3e294 100644 --- a/sys/netgraph/ng_gif.c +++ b/sys/netgraph/ng_gif.c @@ -561,8 +561,8 @@ ng_gif_mod_event(module_t mod, int event, void *data) ng_gif_input_orphan_p = ng_gif_input_orphan; /* Create nodes for any already-existing gif interfaces */ + VNET_LIST_RLOCK_NOSLEEP(); IFNET_RLOCK(); - VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET_QUIET(vnet_iter); /* XXX revisit quiet */ TAILQ_FOREACH(ifp, &V_ifnet, if_link) { @@ -571,8 +571,8 @@ ng_gif_mod_event(module_t mod, int event, void *data) } CURVNET_RESTORE(); } - VNET_LIST_RUNLOCK(); IFNET_RUNLOCK(); + VNET_LIST_RUNLOCK_NOSLEEP(); break; case MOD_UNLOAD: diff --git a/sys/netinet/igmp.c b/sys/netinet/igmp.c index 37cc1b3caf4..6a7cb83b430 100644 --- a/sys/netinet/igmp.c +++ b/sys/netinet/igmp.c @@ -1616,13 +1616,13 @@ igmp_fasttimo(void) { VNET_ITERATOR_DECL(vnet_iter); - VNET_LIST_RLOCK(); + VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_fasttimo_vnet(); CURVNET_RESTORE(); } - VNET_LIST_RUNLOCK(); + VNET_LIST_RUNLOCK_NOSLEEP(); } /* @@ -2159,13 +2159,13 @@ igmp_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); - VNET_LIST_RLOCK(); + VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_slowtimo_vnet(); CURVNET_RESTORE(); } - VNET_LIST_RUNLOCK(); + VNET_LIST_RUNLOCK_NOSLEEP(); } /* diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 94906ad2e6e..5d080ba867e 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -1570,7 +1570,7 @@ ipport_tick(void *xtp) { VNET_ITERATOR_DECL(vnet_iter); - VNET_LIST_RLOCK(); + VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */ if (V_ipport_tcpallocs <= @@ -1582,7 +1582,7 @@ ipport_tick(void *xtp) V_ipport_tcplastcount = V_ipport_tcpallocs; CURVNET_RESTORE(); } - VNET_LIST_RUNLOCK(); + VNET_LIST_RUNLOCK_NOSLEEP(); callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); } diff --git a/sys/netinet/in_rmx.c b/sys/netinet/in_rmx.c index 2ef4ed660e8..d3f9563d3ce 100644 --- a/sys/netinet/in_rmx.c +++ b/sys/netinet/in_rmx.c @@ -319,7 +319,7 @@ in_rtqdrain(void) struct rtqk_arg arg; int fibnum; - VNET_LIST_RLOCK(); + VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); @@ -336,7 +336,7 @@ in_rtqdrain(void) } CURVNET_RESTORE(); } - VNET_LIST_RUNLOCK(); + VNET_LIST_RUNLOCK_NOSLEEP(); } static int _in_rt_was_here; diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c index 8e84707ff1d..61fc8357ea0 100644 --- a/sys/netinet/ip_input.c +++ b/sys/netinet/ip_input.c @@ -1193,8 +1193,8 @@ ip_slowtimo(void) struct ipq *fp; int i; + VNET_LIST_RLOCK_NOSLEEP(); IPQ_LOCK(); - VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); for (i = 0; i < IPREASS_NHASH; i++) { @@ -1228,8 +1228,8 @@ ip_slowtimo(void) } CURVNET_RESTORE(); } - VNET_LIST_RUNLOCK(); IPQ_UNLOCK(); + VNET_LIST_RUNLOCK_NOSLEEP(); } /* @@ -1241,8 +1241,8 @@ ip_drain(void) VNET_ITERATOR_DECL(vnet_iter); int i; + VNET_LIST_RLOCK_NOSLEEP(); IPQ_LOCK(); - VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); for (i = 0; i < IPREASS_NHASH; i++) { @@ -1254,8 +1254,8 @@ ip_drain(void) } CURVNET_RESTORE(); } - VNET_LIST_RUNLOCK(); IPQ_UNLOCK(); + VNET_LIST_RUNLOCK_NOSLEEP(); in_rtqdrain(); } diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index df61a0047ef..8ebc889b74f 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -940,7 +940,7 @@ tcp_drain(void) if (!do_tcpdrain) return; - VNET_LIST_RLOCK(); + VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct inpcb *inpb; @@ -976,7 +976,7 @@ tcp_drain(void) INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } - VNET_LIST_RUNLOCK(); + VNET_LIST_RUNLOCK_NOSLEEP(); } /* @@ -1576,7 +1576,7 @@ tcp_isn_tick(void *xtp) VNET_ITERATOR_DECL(vnet_iter); u_int32_t projected_offset; - VNET_LIST_RLOCK(); + VNET_LIST_RLOCK_NOSLEEP(); ISN_LOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS */ @@ -1590,7 +1590,7 @@ tcp_isn_tick(void *xtp) CURVNET_RESTORE(); } ISN_UNLOCK(); - VNET_LIST_RUNLOCK(); + VNET_LIST_RUNLOCK_NOSLEEP(); callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL); } diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index 7006b70d674..3050324a2e7 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -127,7 +127,7 @@ tcp_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); - VNET_LIST_RLOCK(); + VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); tcp_maxidle = tcp_keepcnt * tcp_keepintvl; @@ -136,7 +136,7 @@ tcp_slowtimo(void) INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } - VNET_LIST_RUNLOCK(); + VNET_LIST_RUNLOCK_NOSLEEP(); } int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = diff --git a/sys/netinet6/frag6.c b/sys/netinet6/frag6.c index 3fd8605c77c..ee1f4a2a393 100644 --- a/sys/netinet6/frag6.c +++ b/sys/netinet6/frag6.c @@ -720,8 +720,8 @@ frag6_slowtimo(void) VNET_ITERATOR_DECL(vnet_iter); struct ip6q *q6; + VNET_LIST_RLOCK_NOSLEEP(); IP6Q_LOCK(); - VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); q6 = V_ip6q.ip6q_next; @@ -748,8 +748,8 @@ frag6_slowtimo(void) } CURVNET_RESTORE(); } - VNET_LIST_RUNLOCK(); IP6Q_UNLOCK(); + VNET_LIST_RUNLOCK_NOSLEEP(); } /* @@ -760,9 +760,11 @@ frag6_drain(void) { VNET_ITERATOR_DECL(vnet_iter); - if (IP6Q_TRYLOCK() == 0) + VNET_LIST_RLOCK_NOSLEEP(); + if (IP6Q_TRYLOCK() == 0) { + VNET_LIST_RUNLOCK_NOSLEEP(); return; - VNET_LIST_RLOCK(); + } VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); while (V_ip6q.ip6q_next != &V_ip6q) { @@ -772,6 +774,6 @@ frag6_drain(void) } CURVNET_RESTORE(); } - VNET_LIST_RUNLOCK(); IP6Q_UNLOCK(); + VNET_LIST_RUNLOCK_NOSLEEP(); } diff --git a/sys/netinet6/mld6.c b/sys/netinet6/mld6.c index 9d69e11a699..4b69da2e135 100644 --- a/sys/netinet6/mld6.c +++ b/sys/netinet6/mld6.c @@ -1306,13 +1306,13 @@ mld_fasttimo(void) { VNET_ITERATOR_DECL(vnet_iter); - VNET_LIST_RLOCK(); + VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); mld_fasttimo_vnet(); CURVNET_RESTORE(); } - VNET_LIST_RUNLOCK(); + VNET_LIST_RUNLOCK_NOSLEEP(); } /* @@ -1721,13 +1721,13 @@ mld_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); - VNET_LIST_RLOCK(); + VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); mld_slowtimo_vnet(); CURVNET_RESTORE(); } - VNET_LIST_RUNLOCK(); + VNET_LIST_RUNLOCK_NOSLEEP(); } /* diff --git a/sys/netipsec/key.c b/sys/netipsec/key.c index 4f2d27e6dc0..e90396b1f7a 100644 --- a/sys/netipsec/key.c +++ b/sys/netipsec/key.c @@ -4537,7 +4537,7 @@ key_timehandler(void) VNET_ITERATOR_DECL(vnet_iter); time_t now = time_second; - VNET_LIST_RLOCK(); + VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); key_flush_spd(now); @@ -4546,7 +4546,7 @@ key_timehandler(void) key_flush_spacq(now); CURVNET_RESTORE(); } - VNET_LIST_RUNLOCK(); + VNET_LIST_RUNLOCK_NOSLEEP(); #ifndef IPSEC_DEBUG2 /* do exchange to tick time !! */ diff --git a/sys/sys/kernel.h b/sys/sys/kernel.h index 478b0d88557..87a4bfea569 100644 --- a/sys/sys/kernel.h +++ b/sys/sys/kernel.h @@ -106,13 +106,14 @@ enum sysinit_sub_id { SI_SUB_MTX_POOL_DYNAMIC = 0x1AC0000, /* dynamic mutex pool */ SI_SUB_LOCK = 0x1B00000, /* various locks */ SI_SUB_EVENTHANDLER = 0x1C00000, /* eventhandler init */ + SI_SUB_VNET_PRELINK = 0x1E00000, /* vnet init before modules */ SI_SUB_KLD = 0x2000000, /* KLD and module setup */ SI_SUB_CPU = 0x2100000, /* CPU resource(s)*/ SI_SUB_KDTRACE = 0x2140000, /* Kernel dtrace hooks */ SI_SUB_MAC = 0x2180000, /* TrustedBSD MAC subsystem */ SI_SUB_MAC_POLICY = 0x21C0000, /* TrustedBSD MAC policies */ SI_SUB_MAC_LATE = 0x21D0000, /* TrustedBSD MAC subsystem */ - SI_SUB_VIMAGE = 0x21E0000, /* vimage infrastructure */ + SI_SUB_VNET = 0x21E0000, /* vnet 0 */ SI_SUB_INTRINSIC = 0x2200000, /* proc 0*/ SI_SUB_VM_CONF = 0x2300000, /* config VM, set limits*/ SI_SUB_DDB_SERVICES = 0x2380000, /* capture, scripting, etc. */ @@ -158,7 +159,7 @@ enum sysinit_sub_id { SI_SUB_SWAP = 0xc000000, /* swap */ SI_SUB_INTRINSIC_POST = 0xd000000, /* proc 0 cleanup*/ SI_SUB_SYSCALLS = 0xd800000, /* register system calls */ - SI_SUB_VIMAGE_DONE = 0xdc00000, /* vnet registration complete */ + SI_SUB_VNET_DONE = 0xdc00000, /* vnet registration complete */ SI_SUB_KTHREAD_INIT = 0xe000000, /* init process*/ SI_SUB_KTHREAD_PAGE = 0xe400000, /* pageout daemon*/ SI_SUB_KTHREAD_VM = 0xe800000, /* vm daemon*/ diff --git a/sys/sys/vimage.h b/sys/sys/vimage.h index be835160985..69ab3d63256 100644 --- a/sys/sys/vimage.h +++ b/sys/sys/vimage.h @@ -38,6 +38,9 @@ #ifdef _KERNEL +#include +#include + #ifdef INVARIANTS #define VNET_DEBUG #endif @@ -176,17 +179,42 @@ struct vnet { #endif /* !VIMAGE */ #ifdef VIMAGE +/* + * Global linked list of all virtual network stacks, along with read locks to + * access it. If a caller may sleep while accessing the list, it must use + * the sleepable lock macros. + */ LIST_HEAD(vnet_list_head, vnet); extern struct vnet_list_head vnet_head; -extern struct vnet *vnet0; -#define VNET_ITERATOR_DECL(arg) struct vnet *arg; -#define VNET_FOREACH(arg) LIST_FOREACH(arg, &vnet_head, vnet_le) -#else +extern struct rwlock vnet_rwlock; +extern struct sx vnet_sxlock; + +#define VNET_LIST_RLOCK() sx_slock(&vnet_sxlock) +#define VNET_LIST_RLOCK_NOSLEEP() rw_rlock(&vnet_rwlock) +#define VNET_LIST_RUNLOCK() sx_sunlock(&vnet_sxlock) +#define VNET_LIST_RUNLOCK_NOSLEEP() rw_runlock(&vnet_rwlock) + +/* + * Iteration macros to walk the global list of virtual network stacks. + */ +#define VNET_ITERATOR_DECL(arg) struct vnet *arg +#define VNET_FOREACH(arg) LIST_FOREACH((arg), &vnet_head, vnet_le) + +#else /* !VIMAGE */ +/* + * No-op macros for the !VIMAGE case. + */ +#define VNET_LIST_RLOCK() +#define VNET_LIST_RLOCK_NOSLEEP() +#define VNET_LIST_RUNLOCK() +#define VNET_LIST_RUNLOCK_NOSLEEP() #define VNET_ITERATOR_DECL(arg) #define VNET_FOREACH(arg) -#endif + +#endif /* VIMAGE */ #ifdef VIMAGE +extern struct vnet *vnet0; #define IS_DEFAULT_VNET(arg) ((arg) == vnet0) #else #define IS_DEFAULT_VNET(arg) 1 @@ -202,10 +230,6 @@ extern struct vnet *vnet0; #define P_TO_VNET(p) NULL #endif /* VIMAGE */ -/* Non-VIMAGE null-macros */ -#define VNET_LIST_RLOCK() -#define VNET_LIST_RUNLOCK() - #endif /* _KERNEL */ #endif /* !_SYS_VIMAGE_H_ */