diff --git a/sys/netinet/in.h b/sys/netinet/in.h index 5f6708de786..79c69953f04 100644 --- a/sys/netinet/in.h +++ b/sys/netinet/in.h @@ -432,6 +432,8 @@ __END_DECLS #define IP_ONESBCAST 23 /* bool: send all-ones broadcast */ #define IP_BINDANY 24 /* bool: allow bind to any address */ +#define IP_BINDMULTI 25 /* bool: allow multiple listeners on a tuple */ +#define IP_RSS_LISTEN_BUCKET 26 /* int; set RSS listen bucket */ /* * Options for controlling the firewall and dummynet. diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index e8f5bb764de..a0aec02df85 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -487,6 +487,36 @@ inp_so_options(const struct inpcb *inp) #endif /* INET || INET6 */ #ifdef INET +/* + * Check if a new BINDMULTI socket is allowed to be created. + * + * ni points to the new inp. + * oi points to the existing inp. + * + * This checks whether the existing inp also has BINDMULTI and + * whether the credentials match. + */ +static int +in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi) +{ + /* Check permissions match */ + if ((ni->inp_flags2 & INP_BINDMULTI) && + (ni->inp_cred->cr_uid != + oi->inp_cred->cr_uid)) + return (0); + + /* Check the existing inp has BINDMULTI set */ + if ((ni->inp_flags2 & INP_BINDMULTI) && + ((oi->inp_flags2 & INP_BINDMULTI) == 0)) + return (0); + + /* + * We're okay - either INP_BINDMULTI isn't set on ni, or + * it is and it matches the checks. + */ + return (1); +} + /* * Set up a bind operation on a PCB, performing port allocation * as required, but do not actually modify the PCB. Callers can @@ -589,6 +619,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, * This entire block sorely needs a rewrite. 
*/ if (t && + ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && @@ -598,6 +629,15 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); + + /* + * If the socket is a BINDMULTI socket, then + * the credentials need to match and the + * original socket also has to have been bound + * with BINDMULTI. + */ + if (t && (! in_pcbbind_check_bindmulti(inp, t))) + return (EADDRINUSE); } t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, lookupflags, cred); @@ -612,7 +652,9 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, if (tw == NULL || (reuseport & tw->tw_so_options) == 0) return (EADDRINUSE); - } else if (t && (reuseport & inp_so_options(t)) == 0) { + } else if (t && + ((inp->inp_flags2 & INP_BINDMULTI) == 0) && + (reuseport & inp_so_options(t)) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || @@ -622,6 +664,8 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, (t->inp_vflag & INP_IPV6PROTO) == 0) #endif return (EADDRINUSE); + if (t && (! in_pcbbind_check_bindmulti(inp, t))) + return (EADDRINUSE); } } } @@ -1556,6 +1600,88 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, goto found; } +#ifdef RSS + /* + * For incoming connections, we may wish to do a wildcard + * match for an RSS-local socket. + */ + if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { + struct inpcb *local_wild = NULL, *local_exact = NULL; +#ifdef INET6 + struct inpcb *local_wild_mapped = NULL; +#endif + struct inpcb *jail_wild = NULL; + struct inpcbhead *head; + int injail; + + /* + * Order of socket selection - we always prefer jails. + * 1. jailed, non-wild. + * 2. jailed, wild. + * 3. non-jailed, non-wild. + * 4. non-jailed, wild. 
+ */ + + head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY, + lport, 0, pcbgroup->ipg_hashmask)]; + LIST_FOREACH(inp, head, inp_pcbgrouphash) { +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr != INADDR_ANY || + inp->inp_lport != lport) + continue; + + /* XXX inp locking */ + if (ifp && ifp->if_type == IFT_FAITH && + (inp->inp_flags & INP_FAITH) == 0) + continue; + + injail = prison_flag(inp->inp_cred, PR_IP4); + if (injail) { + if (prison_check_ip4(inp->inp_cred, + &laddr) != 0) + continue; + } else { + if (local_exact != NULL) + continue; + } + + if (inp->inp_laddr.s_addr == laddr.s_addr) { + if (injail) + goto found; + else + local_exact = inp; + } else if (inp->inp_laddr.s_addr == INADDR_ANY) { +#ifdef INET6 + /* XXX inp locking, NULL check */ + if (inp->inp_vflag & INP_IPV6PROTO) + local_wild_mapped = inp; + else +#endif + if (injail) + jail_wild = inp; + else + local_wild = inp; + } + } /* LIST_FOREACH */ + + inp = jail_wild; + if (inp == NULL) + inp = local_exact; + if (inp == NULL) + inp = local_wild; +#ifdef INET6 + if (inp == NULL) + inp = local_wild_mapped; +#endif + if (inp != NULL) + goto found; + } +#endif + /* * Then look for a wildcard match, if requested. */ diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 7cfc72a4377..350b962f90d 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -181,7 +181,8 @@ struct inpcb { u_int inp_refcount; /* (i) refcount */ void *inp_pspare[5]; /* (x) route caching / general use */ uint32_t inp_flowtype; /* (x) M_HASHTYPE value */ - u_int inp_ispare[5]; /* (x) route caching / user cookie / + uint32_t inp_rss_listen_bucket; /* (x) overridden RSS listen bucket */ + u_int inp_ispare[4]; /* (x) route caching / user cookie / * general use */ /* Local and foreign ports, local and foreign addr. 
*/ @@ -546,6 +547,8 @@ short inp_so_options(const struct inpcb *inp); #define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */ #define INP_FREED 0x00000010 /* inp itself is not valid */ #define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */ +#define INP_BINDMULTI 0x00000040 /* IP_BINDMULTI option is set */ +#define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */ /* * Flags passed to in_pcblookup*() functions. diff --git a/sys/netinet/in_pcbgroup.c b/sys/netinet/in_pcbgroup.c index 22d07989c4d..8dd552946ec 100644 --- a/sys/netinet/in_pcbgroup.c +++ b/sys/netinet/in_pcbgroup.c @@ -297,6 +297,18 @@ in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr, struct inpcbgroup * in_pcbgroup_byinpcb(struct inpcb *inp) { +#ifdef RSS + /* + * Listen sockets with INP_RSS_BUCKET_SET set have a pre-determined + * RSS bucket and thus we should use this pcbgroup, rather than + * using a tuple or hash. + * + * XXX should verify that pcbgroups actually exist and that + * inp_rss_listen_bucket is a valid index into them! + */ + if (inp->inp_flags2 & INP_RSS_BUCKET_SET) + return (&inp->inp_pcbinfo->ipi_pcbgroups[inp->inp_rss_listen_bucket]); +#endif return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr, inp->inp_lport, inp->inp_faddr, inp->inp_fport)); @@ -346,6 +358,15 @@ in_pcbwild_remove(struct inpcb *inp) static __inline int in_pcbwild_needed(struct inpcb *inp) { +#ifdef RSS + /* + * If it's a listen socket and INP_RSS_BUCKET_SET is set, + * it's a wildcard socket _but_ it's in a specific pcbgroup. + * Thus we don't treat it as a pcbwild inp. 
+ */ + if (inp->inp_flags2 & INP_RSS_BUCKET_SET) + return (0); +#endif #ifdef INET6 if (inp->inp_vflag & INP_IPV6) @@ -398,9 +419,24 @@ in_pcbgroup_update_internal(struct inpcbinfo *pcbinfo, #endif hashkey_faddr = inp->inp_faddr.s_addr; INP_GROUP_LOCK(newpcbgroup); - pcbhash = &newpcbgroup->ipg_hashbase[ - INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, - newpcbgroup->ipg_hashmask)]; + /* + * If the inp is an RSS bucket wildcard entry, ensure + * that the PCB hash is calculated correctly. + * + * The wildcard hash calculation differs from the + * non-wildcard definition. The foreign address is + * INADDR_ANY and the foreign port is 0. + */ + if (inp->inp_flags2 & INP_RSS_BUCKET_SET) { + pcbhash = &newpcbgroup->ipg_hashbase[ + INP_PCBHASH(INADDR_ANY, inp->inp_lport, 0, + newpcbgroup->ipg_hashmask)]; + } else { + pcbhash = &newpcbgroup->ipg_hashbase[ + INP_PCBHASH(hashkey_faddr, inp->inp_lport, + inp->inp_fport, + newpcbgroup->ipg_hashmask)]; + } LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash); inp->inp_pcbgroup = newpcbgroup; INP_GROUP_UNLOCK(newpcbgroup); diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c index 7b5ebd3664f..4aea44fcdc2 100644 --- a/sys/netinet/ip_output.c +++ b/sys/netinet/ip_output.c @@ -1000,6 +1000,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) break; } /* FALLTHROUGH */ + case IP_BINDMULTI: +#ifdef RSS + case IP_RSS_LISTEN_BUCKET: +#endif case IP_TOS: case IP_TTL: case IP_MINTTL: @@ -1042,6 +1046,15 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) INP_WUNLOCK(inp); \ } while (0) +#define OPTSET2(bit, val) do { \ + INP_WLOCK(inp); \ + if (val) \ + inp->inp_flags2 |= bit; \ + else \ + inp->inp_flags2 &= ~bit; \ + INP_WUNLOCK(inp); \ +} while (0) + case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; @@ -1078,9 +1091,24 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_RECVTOS: OPTSET(INP_RECVTOS); break; + case IP_BINDMULTI: + OPTSET2(INP_BINDMULTI, optval); + break; +#ifdef RSS + case 
IP_RSS_LISTEN_BUCKET: + if ((optval >= 0) && + (optval < rss_getnumbuckets())) { + inp->inp_rss_listen_bucket = optval; + OPTSET2(INP_RSS_BUCKET_SET, 1); + } else { + error = EINVAL; + } + break; +#endif } break; #undef OPTSET +#undef OPTSET2 /* * Multicast socket options are processed by the in_mcast @@ -1188,8 +1216,12 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_DONTFRAG: case IP_BINDANY: case IP_RECVTOS: + case IP_BINDMULTI: case IP_FLOWID: case IP_FLOWTYPE: +#ifdef RSS + case IP_RSSBUCKETID: +#endif switch (sopt->sopt_name) { case IP_TOS: @@ -1205,6 +1237,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) +#define OPTBIT2(bit) (inp->inp_flags2 & bit ? 1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); @@ -1268,6 +1301,9 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) error = EINVAL; break; #endif + case IP_BINDMULTI: + optval = OPTBIT2(INP_BINDMULTI); + break; } error = sooptcopyout(sopt, &optval, sizeof optval); break;