diff --git a/share/man/man4/vxlan.4 b/share/man/man4/vxlan.4 index fcac8f8f837..1848897d97c 100644 --- a/share/man/man4/vxlan.4 +++ b/share/man/man4/vxlan.4 @@ -24,7 +24,7 @@ .\" .\" $FreeBSD$ .\" -.Dd December 31, 2017 +.Dd September 17, 2020 .Dt VXLAN 4 .Os .Sh NAME @@ -182,6 +182,39 @@ command may be used to reduce the MTU size on the .Nm interface to allow the encapsulated frame to fit in the current MTU of the physical network. +.Sh HARDWARE +The +.Nm +driver supports hardware checksum offload (receive and transmit) and TSO on the +encapsulated traffic over physical interfaces that support these features. +The +.Nm +interface examines the +.Cm vxlandev +interface, if one is specified, or the interface hosting the +.Cm vxlanlocal +address, and configures its capabilities based on the hardware offload +capabilities of that physical interface. +If multiple physical interfaces will transmit or receive traffic for the +.Nm +then they all must have the same hardware capabilities. +The transmit routine of a +.Nm +interface may fail with +.Er ENXIO +if an outbound physical interface does not support +an offload that the +.Nm +interface is requesting. +This can happen if there are multiple physical interfaces involved, with +different hardware capabilities, or an interface capability was disabled after +the +.Nm +interface had already started. +.Pp +At present, these devices are capable of generating checksums and performing TSO +on the inner frames in hardware: +.Xr cxgbe 4 . .Sh EXAMPLES Create a .Nm @@ -244,3 +277,7 @@ The .Nm driver was written by .An Bryan Venteicher Aq bryanv@freebsd.org . +Support for stateless hardware offloads was added by +.An Navdeep Parhar Aq np@freebsd.org +in +.Fx 13.0 . diff --git a/share/man/man9/EVENTHANDLER.9 b/share/man/man9/EVENTHANDLER.9 index 9fca3d414ea..c38bb3d6c93 100644 --- a/share/man/man9/EVENTHANDLER.9 +++ b/share/man/man9/EVENTHANDLER.9 @@ -23,7 +23,7 @@ .\" SUCH DAMAGE. .\" $FreeBSD$ .\" -.Dd October 21, 2018 +.Dd September 17, 2020 .Dt EVENTHANDLER 9 .Os .Sh NAME @@ -389,6 +389,10 @@ Callback invoked when the vlan configuration has changed. Callback invoked when a vlan is destroyed. .It Vt vm_lowmem Callbacks invoked when virtual memory is low. +.It Vt vxlan_start +Callback invoked when a vxlan interface starts. +.It Vt vxlan_stop +Callback invoked when a vxlan interface stops. .It Vt watchdog_list Callbacks invoked when the system watchdog timer is reinitialized. .El diff --git a/sys/net/if_vxlan.c b/sys/net/if_vxlan.c index b75cfc41ad1..fd2f1d94b29 100644 --- a/sys/net/if_vxlan.c +++ b/sys/net/if_vxlan.c @@ -1,6 +1,7 @@ /*- * Copyright (c) 2014, Bryan Venteicher * All rights reserved. + * Copyright (c) 2020, Chelsio Communications. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -60,6 +61,8 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include #include #include @@ -70,6 +73,8 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include #include #include @@ -77,6 +82,9 @@ __FBSDID("$FreeBSD$"); struct vxlan_softc; LIST_HEAD(vxlan_softc_head, vxlan_softc); +struct sx vxlan_sx; +SX_SYSINIT(vxlan, &vxlan_sx, "VXLAN global start/stop lock"); + struct vxlan_socket_mc_info { union vxlan_sockaddr vxlsomc_saddr; union vxlan_sockaddr vxlsomc_gaddr; @@ -92,6 +100,7 @@ struct vxlan_socket_mc_info { sizeof(struct udphdr) - \ sizeof(struct vxlan_header) - \ ETHER_HDR_LEN - ETHER_CRC_LEN - ETHER_VLAN_ENCAP_LEN) +#define VXLAN_BASIC_IFCAPS (IFCAP_LINKSTATE | IFCAP_JUMBO_MTU) #define VXLAN_SO_MC_MAX_GROUPS 32 @@ -146,10 +155,14 @@ LIST_HEAD(vxlan_ftable_head, vxlan_ftable_entry); struct vxlan_statistics { uint32_t ftable_nospace; uint32_t ftable_lock_upgrade_failed; + counter_u64_t txcsum; + counter_u64_t tso; + counter_u64_t rxcsum; }; struct vxlan_softc { struct ifnet *vxl_ifp; + int vxl_reqcap; struct vxlan_socket *vxl_sock; uint32_t vxl_vni; union vxlan_sockaddr vxl_src_addr; @@ -193,6 +206,10 @@ struct vxlan_softc { char vxl_mc_ifname[IFNAMSIZ]; LIST_ENTRY(vxlan_softc) vxl_entry; LIST_ENTRY(vxlan_softc) vxl_ifdetach_list; + + /* For rate limiting errors on the tx fast path. */ + struct timeval err_time; + int err_pps; }; #define VXLAN_RLOCK(_sc, _p) rm_rlock(&(_sc)->vxl_lock, (_p)) @@ -297,7 +314,10 @@ static int vxlan_setup_multicast_interface(struct vxlan_softc *); static int vxlan_setup_multicast(struct vxlan_softc *); static int vxlan_setup_socket(struct vxlan_softc *); -static void vxlan_setup_interface(struct vxlan_softc *); +#ifdef INET6 +static void vxlan_setup_zero_checksum_port(struct vxlan_softc *); +#endif +static void vxlan_setup_interface_hdrlen(struct vxlan_softc *); static int vxlan_valid_init_config(struct vxlan_softc *); static void vxlan_init_wait(struct vxlan_softc *); static void vxlan_init_complete(struct vxlan_softc *); @@ -347,9 +367,13 @@ static void vxlan_rcv_udp_packet(struct mbuf *, int, struct inpcb *, static int vxlan_input(struct vxlan_socket *, uint32_t, struct mbuf **, const struct sockaddr *); +static int vxlan_stats_alloc(struct vxlan_softc *); +static void vxlan_stats_free(struct vxlan_softc *); static void vxlan_set_default_config(struct vxlan_softc *); static int vxlan_set_user_config(struct vxlan_softc *, struct ifvxlanparam *); +static int vxlan_set_reqcap(struct vxlan_softc *, struct ifnet *, int); +static void vxlan_set_hwcaps(struct vxlan_softc *); static int vxlan_clone_create(struct if_clone *, int, caddr_t); static void vxlan_clone_destroy(struct ifnet *); @@ -1555,8 +1579,44 @@ out: return (error); } +#ifdef INET6 static void -vxlan_setup_interface(struct vxlan_softc *sc) +vxlan_setup_zero_checksum_port(struct vxlan_softc *sc) +{ + + if (!VXLAN_SOCKADDR_IS_IPV6(&sc->vxl_src_addr)) + return; + + MPASS(sc->vxl_src_addr.in6.sin6_port != 0); + MPASS(sc->vxl_dst_addr.in6.sin6_port != 0); + + if (sc->vxl_src_addr.in6.sin6_port != sc->vxl_dst_addr.in6.sin6_port) { + if_printf(sc->vxl_ifp, "port %d in src address does not match " + "port %d in dst address, rfc6935_port (%d) not updated.\n", + ntohs(sc->vxl_src_addr.in6.sin6_port), + ntohs(sc->vxl_dst_addr.in6.sin6_port), + V_zero_checksum_port); + return; + } + + if (V_zero_checksum_port != 0) { + if (V_zero_checksum_port != + ntohs(sc->vxl_src_addr.in6.sin6_port)) { + if_printf(sc->vxl_ifp, "rfc6935_port is already set to " + "%d, cannot set it to %d.\n", V_zero_checksum_port, + ntohs(sc->vxl_src_addr.in6.sin6_port)); + } + return; + } + + V_zero_checksum_port = ntohs(sc->vxl_src_addr.in6.sin6_port); + if_printf(sc->vxl_ifp, "rfc6935_port set to %d\n", + V_zero_checksum_port); +} +#endif + +static void +vxlan_setup_interface_hdrlen(struct vxlan_softc *sc) { struct ifnet *ifp; @@ -1655,9 +1715,11 @@ vxlan_init(void *xsc) sc = xsc; ifp = sc->vxl_ifp; + sx_xlock(&vxlan_sx); VXLAN_WLOCK(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { VXLAN_WUNLOCK(sc); + sx_xunlock(&vxlan_sx); return; } sc->vxl_flags |= VXLAN_FLAG_INIT; @@ -1666,11 +1728,13 @@ vxlan_init(void *xsc) if (vxlan_valid_init_config(sc) != 0) goto out; - vxlan_setup_interface(sc); - if (vxlan_setup_socket(sc) != 0) goto out; +#ifdef INET6 + vxlan_setup_zero_checksum_port(sc); +#endif + /* Initialize the default forwarding entry. */ vxlan_ftable_entry_init(sc, &sc->vxl_default_fe, empty_mac, &sc->vxl_dst_addr.sa, VXLAN_FE_FLAG_STATIC); @@ -1682,8 +1746,12 @@ vxlan_init(void *xsc) VXLAN_WUNLOCK(sc); if_link_state_change(ifp, LINK_STATE_UP); + + EVENTHANDLER_INVOKE(vxlan_start, ifp, sc->vxl_src_addr.in4.sin_family, + ntohs(sc->vxl_src_addr.in4.sin_port)); out: vxlan_init_complete(sc); + sx_xunlock(&vxlan_sx); } static void @@ -1725,11 +1793,11 @@ vxlan_teardown_locked(struct vxlan_softc *sc) struct ifnet *ifp; struct vxlan_socket *vso; - ifp = sc->vxl_ifp; - + sx_assert(&vxlan_sx, SA_XLOCKED); VXLAN_LOCK_WASSERT(sc); MPASS(sc->vxl_flags & VXLAN_FLAG_TEARDOWN); + ifp = sc->vxl_ifp; ifp->if_flags &= ~IFF_UP; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; callout_stop(&sc->vxl_callout); @@ -1738,6 +1806,8 @@ vxlan_teardown_locked(struct vxlan_softc *sc) VXLAN_WUNLOCK(sc); if_link_state_change(ifp, LINK_STATE_DOWN); + EVENTHANDLER_INVOKE(vxlan_stop, ifp, sc->vxl_src_addr.in4.sin_family, + ntohs(sc->vxl_src_addr.in4.sin_port)); if (vso != NULL) { vxlan_socket_remove_softc(vso, sc); @@ -1767,15 +1837,18 @@ static void vxlan_teardown(struct vxlan_softc *sc) { + sx_xlock(&vxlan_sx); VXLAN_WLOCK(sc); if (sc->vxl_flags & VXLAN_FLAG_TEARDOWN) { vxlan_teardown_wait(sc); VXLAN_WUNLOCK(sc); + sx_xunlock(&vxlan_sx); return; } sc->vxl_flags |= VXLAN_FLAG_TEARDOWN; vxlan_teardown_locked(sc); + sx_xunlock(&vxlan_sx); } static void @@ -1907,6 +1980,7 @@ vxlan_ctrl_set_local_addr(struct vxlan_softc *sc, void *arg) VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { vxlan_sockaddr_in_copy(&sc->vxl_src_addr, &vxlsa->sa); + vxlan_set_hwcaps(sc); error = 0; } else error = EBUSY; @@ -1936,6 +2010,7 @@ vxlan_ctrl_set_remote_addr(struct vxlan_softc *sc, void *arg) VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { vxlan_sockaddr_in_copy(&sc->vxl_dst_addr, &vxlsa->sa); + vxlan_setup_interface_hdrlen(sc); error = 0; } else error = EBUSY; @@ -2063,6 +2138,7 @@ vxlan_ctrl_set_multicast_if(struct vxlan_softc * sc, void *arg) VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { strlcpy(sc->vxl_mc_ifname, cmd->vxlcmd_ifname, IFNAMSIZ); + vxlan_set_hwcaps(sc); error = 0; } else error = EBUSY; @@ -2284,6 +2360,14 @@ vxlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) ifp->if_mtu = ifr->ifr_mtu; break; + case SIOCSIFCAP: + VXLAN_WLOCK(sc); + error = vxlan_set_reqcap(sc, ifp, ifr->ifr_reqcap); + if (error == 0) + vxlan_set_hwcaps(sc); + VXLAN_WUNLOCK(sc); + break; + default: error = ether_ioctl(ifp, cmd, data); break; @@ -2335,6 +2419,48 @@ vxlan_encap_header(struct vxlan_softc *sc, struct mbuf *m, int ipoff, } #endif +/* + * Return the CSUM_INNER_* equivalent of CSUM_* caps. + */ +static uint32_t +csum_flags_to_inner_flags(uint32_t csum_flags_in, uint32_t encap) +{ + uint32_t csum_flags = CSUM_ENCAP_VXLAN; + const uint32_t v4 = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP; + + /* + * csum_flags can request either v4 or v6 offload but not both. + * tcp_output always sets CSUM_TSO (both CSUM_IP_TSO and CSUM_IP6_TSO) + * so those bits are no good to detect the IP version. Other bits are + * always set with CSUM_TSO and we use those to figure out the IP + * version. + */ + if (csum_flags_in & v4) { + if (csum_flags_in & CSUM_IP) + csum_flags |= CSUM_INNER_IP; + if (csum_flags_in & CSUM_IP_UDP) + csum_flags |= CSUM_INNER_IP_UDP; + if (csum_flags_in & CSUM_IP_TCP) + csum_flags |= CSUM_INNER_IP_TCP; + if (csum_flags_in & CSUM_IP_TSO) + csum_flags |= CSUM_INNER_IP_TSO; + } else { +#ifdef INVARIANTS + const uint32_t v6 = CSUM_IP6_UDP | CSUM_IP6_TCP; + + MPASS((csum_flags_in & v6) != 0); +#endif + if (csum_flags_in & CSUM_IP6_UDP) + csum_flags |= CSUM_INNER_IP6_UDP; + if (csum_flags_in & CSUM_IP6_TCP) + csum_flags |= CSUM_INNER_IP6_TCP; + if (csum_flags_in & CSUM_IP6_TSO) + csum_flags |= CSUM_INNER_IP6_TSO; + } + + return (csum_flags); +} + static int vxlan_encap4(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa, struct mbuf *m) @@ -2345,6 +2471,11 @@ vxlan_encap4(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa, struct in_addr srcaddr, dstaddr; uint16_t srcport, dstport; int len, mcast, error; + struct route route, *ro; + struct sockaddr_in *sin; + uint32_t csum_flags; + + NET_EPOCH_ASSERT(); ifp = sc->vxl_ifp; srcaddr = sc->vxl_src_addr.in4.sin_addr; @@ -2376,7 +2507,57 @@ vxlan_encap4(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa, mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; m->m_flags &= ~(M_MCAST | M_BCAST); - error = ip_output(m, NULL, NULL, 0, sc->vxl_im4o, NULL); + m->m_pkthdr.csum_flags &= CSUM_FLAGS_TX; + if (m->m_pkthdr.csum_flags != 0) { + /* + * HW checksum (L3 and/or L4) or TSO has been requested. Look + * up the ifnet for the outbound route and verify that the + * outbound ifnet can perform the requested operation on the + * inner frame. + */ + bzero(&route, sizeof(route)); + ro = &route; + sin = (struct sockaddr_in *)&ro->ro_dst; + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = ip->ip_dst; + ro->ro_nh = fib4_lookup(RT_DEFAULT_FIB, ip->ip_dst, 0, NHR_NONE, + 0); + if (ro->ro_nh == NULL) { + m_freem(m); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + return (EHOSTUNREACH); + } + + csum_flags = csum_flags_to_inner_flags(m->m_pkthdr.csum_flags, + CSUM_ENCAP_VXLAN); + if ((csum_flags & ro->ro_nh->nh_ifp->if_hwassist) != + csum_flags) { + if (ppsratecheck(&sc->err_time, &sc->err_pps, 1)) { + const struct ifnet *nh_ifp = ro->ro_nh->nh_ifp; + + if_printf(ifp, "interface %s is missing hwcaps " + "0x%08x, csum_flags 0x%08x -> 0x%08x, " + "hwassist 0x%08x\n", nh_ifp->if_xname, + csum_flags & ~(uint32_t)nh_ifp->if_hwassist, + m->m_pkthdr.csum_flags, csum_flags, + (uint32_t)nh_ifp->if_hwassist); + } + m_freem(m); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + return (ENXIO); + } + m->m_pkthdr.csum_flags = csum_flags; + if (csum_flags & + (CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP6_UDP | + CSUM_INNER_IP_TCP | CSUM_INNER_IP6_TCP)) { + counter_u64_add(sc->vxl_stats.txcsum, 1); + if (csum_flags & CSUM_INNER_TSO) + counter_u64_add(sc->vxl_stats.tso, 1); + } + } else + ro = NULL; + error = ip_output(m, NULL, ro, 0, sc->vxl_im4o, NULL); if (error == 0) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); @@ -2402,6 +2583,11 @@ vxlan_encap6(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa, const struct in6_addr *srcaddr, *dstaddr; uint16_t srcport, dstport; int len, mcast, error; + struct route_in6 route, *ro; + struct sockaddr_in6 *sin6; + uint32_t csum_flags; + + NET_EPOCH_ASSERT(); ifp = sc->vxl_ifp; srcaddr = &sc->vxl_src_addr.in6.sin6_addr; @@ -2429,22 +2615,67 @@ vxlan_encap6(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa, vxlan_encap_header(sc, m, sizeof(struct ip6_hdr), srcport, dstport); - /* - * XXX BMV We need support for RFC6935 before we can send and - * receive IPv6 UDP packets with a zero checksum. - */ - { + mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; + m->m_flags &= ~(M_MCAST | M_BCAST); + + ro = NULL; + m->m_pkthdr.csum_flags &= CSUM_FLAGS_TX; + if (m->m_pkthdr.csum_flags != 0) { + /* + * HW checksum (L3 and/or L4) or TSO has been requested. Look + * up the ifnet for the outbound route and verify that the + * outbound ifnet can perform the requested operation on the + * inner frame. + */ + bzero(&route, sizeof(route)); + ro = &route; + sin6 = (struct sockaddr_in6 *)&ro->ro_dst; + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_addr = ip6->ip6_dst; + ro->ro_nh = fib6_lookup(RT_DEFAULT_FIB, &ip6->ip6_dst, 0, + NHR_NONE, 0); + if (ro->ro_nh == NULL) { + m_freem(m); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + return (EHOSTUNREACH); + } + + csum_flags = csum_flags_to_inner_flags(m->m_pkthdr.csum_flags, + CSUM_ENCAP_VXLAN); + if ((csum_flags & ro->ro_nh->nh_ifp->if_hwassist) != + csum_flags) { + if (ppsratecheck(&sc->err_time, &sc->err_pps, 1)) { + const struct ifnet *nh_ifp = ro->ro_nh->nh_ifp; + + if_printf(ifp, "interface %s is missing hwcaps " + "0x%08x, csum_flags 0x%08x -> 0x%08x, " + "hwassist 0x%08x\n", nh_ifp->if_xname, + csum_flags & ~(uint32_t)nh_ifp->if_hwassist, + m->m_pkthdr.csum_flags, csum_flags, + (uint32_t)nh_ifp->if_hwassist); + } + m_freem(m); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + return (ENXIO); + } + m->m_pkthdr.csum_flags = csum_flags; + if (csum_flags & + (CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP6_UDP | + CSUM_INNER_IP_TCP | CSUM_INNER_IP6_TCP)) { + counter_u64_add(sc->vxl_stats.txcsum, 1); + if (csum_flags & CSUM_INNER_TSO) + counter_u64_add(sc->vxl_stats.tso, 1); + } + } else if (ntohs(dstport) != V_zero_checksum_port) { struct udphdr *hdr = mtodo(m, sizeof(struct ip6_hdr)); + hdr->uh_sum = in6_cksum_pseudo(ip6, m->m_pkthdr.len - sizeof(struct ip6_hdr), IPPROTO_UDP, 0); m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } - - mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; - m->m_flags &= ~(M_MCAST | M_BCAST); - - error = ip6_output(m, NULL, NULL, 0, sc->vxl_im6o, NULL, NULL); + error = ip6_output(m, NULL, ro, 0, sc->vxl_im6o, NULL, NULL); if (error == 0) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); @@ -2593,8 +2824,30 @@ vxlan_input(struct vxlan_socket *vso, uint32_t vni, struct mbuf **m0, m_clrprotoflags(m); m->m_pkthdr.rcvif = ifp; M_SETFIB(m, ifp->if_fib); + if (m->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN && + ((ifp->if_capenable & IFCAP_RXCSUM && + m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC) || + (ifp->if_capenable & IFCAP_RXCSUM_IPV6 && + !(m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC)))) { + uint32_t csum_flags = 0; - error = netisr_queue_src(NETISR_ETHER, 0, m); + if (m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC) + csum_flags |= CSUM_L3_CALC; + if (m->m_pkthdr.csum_flags & CSUM_INNER_L3_VALID) + csum_flags |= CSUM_L3_VALID; + if (m->m_pkthdr.csum_flags & CSUM_INNER_L4_CALC) + csum_flags |= CSUM_L4_CALC; + if (m->m_pkthdr.csum_flags & CSUM_INNER_L4_VALID) + csum_flags |= CSUM_L4_VALID; + m->m_pkthdr.csum_flags = csum_flags; + counter_u64_add(sc->vxl_stats.rxcsum, 1); + } else { + /* clear everything */ + m->m_pkthdr.csum_flags = 0; + m->m_pkthdr.csum_data = 0; + } + + error = netisr_dispatch(NETISR_ETHER, m); *m0 = NULL; out: @@ -2602,6 +2855,48 @@ out: return (error); } +static int +vxlan_stats_alloc(struct vxlan_softc *sc) +{ + struct vxlan_statistics *stats = &sc->vxl_stats; + + stats->txcsum = counter_u64_alloc(M_WAITOK); + if (stats->txcsum == NULL) + goto failed; + + stats->tso = counter_u64_alloc(M_WAITOK); + if (stats->tso == NULL) + goto failed; + + stats->rxcsum = counter_u64_alloc(M_WAITOK); + if (stats->rxcsum == NULL) + goto failed; + + return (0); +failed: + vxlan_stats_free(sc); + return (ENOMEM); +} + +static void +vxlan_stats_free(struct vxlan_softc *sc) +{ + struct vxlan_statistics *stats = &sc->vxl_stats; + + if (stats->txcsum != NULL) { + counter_u64_free(stats->txcsum); + stats->txcsum = NULL; + } + if (stats->tso != NULL) { + counter_u64_free(stats->tso); + stats->tso = NULL; + } + if (stats->rxcsum != NULL) { + counter_u64_free(stats->rxcsum); + stats->rxcsum = NULL; + } +} + static void vxlan_set_default_config(struct vxlan_softc *sc) { @@ -2721,6 +3016,142 @@ vxlan_set_user_config(struct vxlan_softc *sc, struct ifvxlanparam *vxlp) return (0); } +static int +vxlan_set_reqcap(struct vxlan_softc *sc, struct ifnet *ifp, int reqcap) +{ + int mask = reqcap ^ ifp->if_capenable; + + /* Disable TSO if tx checksums are disabled. */ + if (mask & IFCAP_TXCSUM && !(reqcap & IFCAP_TXCSUM) && + reqcap & IFCAP_TSO4) { + reqcap &= ~IFCAP_TSO4; + if_printf(ifp, "tso4 disabled due to -txcsum.\n"); + } + if (mask & IFCAP_TXCSUM_IPV6 && !(reqcap & IFCAP_TXCSUM_IPV6) && + reqcap & IFCAP_TSO6) { + reqcap &= ~IFCAP_TSO6; + if_printf(ifp, "tso6 disabled due to -txcsum6.\n"); + } + + /* Do not enable TSO if tx checksums are disabled. */ + if (mask & IFCAP_TSO4 && reqcap & IFCAP_TSO4 && + !(reqcap & IFCAP_TXCSUM)) { + if_printf(ifp, "enable txcsum first.\n"); + return (EAGAIN); + } + if (mask & IFCAP_TSO6 && reqcap & IFCAP_TSO6 && + !(reqcap & IFCAP_TXCSUM_IPV6)) { + if_printf(ifp, "enable txcsum6 first.\n"); + return (EAGAIN); + } + + sc->vxl_reqcap = reqcap; + return (0); +} + +/* + * A VXLAN interface inherits the capabilities of the vxlandev or the interface + * hosting the vxlanlocal address. + */ +static void +vxlan_set_hwcaps(struct vxlan_softc *sc) +{ + struct epoch_tracker et; + struct ifnet *p; + struct ifaddr *ifa; + u_long hwa; + int cap, ena; + bool rel; + struct ifnet *ifp = sc->vxl_ifp; + + /* reset caps */ + ifp->if_capabilities &= VXLAN_BASIC_IFCAPS; + ifp->if_capenable &= VXLAN_BASIC_IFCAPS; + ifp->if_hwassist = 0; + + NET_EPOCH_ENTER(et); + CURVNET_SET(ifp->if_vnet); + + rel = false; + p = NULL; + if (sc->vxl_mc_ifname[0] != '\0') { + rel = true; + p = ifunit_ref(sc->vxl_mc_ifname); + } else if (vxlan_sockaddr_in_any(&sc->vxl_src_addr) == 0) { + if (sc->vxl_src_addr.sa.sa_family == AF_INET) { + struct sockaddr_in in4 = sc->vxl_src_addr.in4; + + in4.sin_port = 0; + ifa = ifa_ifwithaddr((struct sockaddr *)&in4); + if (ifa != NULL) + p = ifa->ifa_ifp; + } else if (sc->vxl_src_addr.sa.sa_family == AF_INET6) { + struct sockaddr_in6 in6 = sc->vxl_src_addr.in6; + + in6.sin6_port = 0; + ifa = ifa_ifwithaddr((struct sockaddr *)&in6); + if (ifa != NULL) + p = ifa->ifa_ifp; + } + } + if (p == NULL) + goto done; + + cap = ena = hwa = 0; + + /* checksum offload */ + if (p->if_capabilities & IFCAP_VXLAN_HWCSUM) + cap |= p->if_capabilities & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6); + if (p->if_capenable & IFCAP_VXLAN_HWCSUM) { + ena |= sc->vxl_reqcap & p->if_capenable & + (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6); + if (ena & IFCAP_TXCSUM) { + if (p->if_hwassist & CSUM_INNER_IP) + hwa |= CSUM_IP; + if (p->if_hwassist & CSUM_INNER_IP_UDP) + hwa |= CSUM_IP_UDP; + if (p->if_hwassist & CSUM_INNER_IP_TCP) + hwa |= CSUM_IP_TCP; + } + if (ena & IFCAP_TXCSUM_IPV6) { + if (p->if_hwassist & CSUM_INNER_IP6_UDP) + hwa |= CSUM_IP6_UDP; + if (p->if_hwassist & CSUM_INNER_IP6_TCP) + hwa |= CSUM_IP6_TCP; + } + } + + /* hardware TSO */ + if (p->if_capabilities & IFCAP_VXLAN_HWTSO) { + cap |= p->if_capabilities & IFCAP_TSO; + if (p->if_hw_tsomax > IP_MAXPACKET - ifp->if_hdrlen) + ifp->if_hw_tsomax = IP_MAXPACKET - ifp->if_hdrlen; + else + ifp->if_hw_tsomax = p->if_hw_tsomax; + /* XXX: tsomaxsegcount decrement is cxgbe specific */ + ifp->if_hw_tsomaxsegcount = p->if_hw_tsomaxsegcount - 1; + ifp->if_hw_tsomaxsegsize = p->if_hw_tsomaxsegsize; + } + if (p->if_capenable & IFCAP_VXLAN_HWTSO) { + ena |= sc->vxl_reqcap & p->if_capenable & IFCAP_TSO; + if (ena & IFCAP_TSO) { + if (p->if_hwassist & CSUM_INNER_IP_TSO) + hwa |= CSUM_IP_TSO; + if (p->if_hwassist & CSUM_INNER_IP6_TSO) + hwa |= CSUM_IP6_TSO; + } + } + + ifp->if_capabilities |= cap; + ifp->if_capenable |= ena; + ifp->if_hwassist |= hwa; + if (rel) + if_rele(p); +done: + CURVNET_RESTORE(); + NET_EPOCH_EXIT(et); +} + static int vxlan_clone_create(struct if_clone *ifc, int unit, caddr_t params) { @@ -2732,6 +3163,9 @@ vxlan_clone_create(struct if_clone *ifc, int unit, caddr_t params) sc = malloc(sizeof(struct vxlan_softc), M_VXLAN, M_WAITOK | M_ZERO); sc->vxl_unit = unit; vxlan_set_default_config(sc); + error = vxlan_stats_alloc(sc); + if (error != 0) + goto fail; if (params != 0) { error = copyin(params, &vxlp, sizeof(vxlp)); @@ -2764,8 +3198,10 @@ vxlan_clone_create(struct if_clone *ifc, int unit, caddr_t params) ifp->if_ioctl = vxlan_ioctl; ifp->if_transmit = vxlan_transmit; ifp->if_qflush = vxlan_qflush; - ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU; - ifp->if_capenable |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU; + ifp->if_capabilities = VXLAN_BASIC_IFCAPS; + ifp->if_capenable = VXLAN_BASIC_IFCAPS; + sc->vxl_reqcap = -1; + vxlan_set_hwcaps(sc); ifmedia_init(&sc->vxl_media, 0, vxlan_media_change, vxlan_media_status); ifmedia_add(&sc->vxl_media, IFM_ETHER | IFM_AUTO, 0, NULL); @@ -2775,7 +3211,7 @@ vxlan_clone_create(struct if_clone *ifc, int unit, caddr_t params) ether_ifattach(ifp, sc->vxl_hwaddr.octet); ifp->if_baudrate = 0; - ifp->if_hdrlen = 0; + vxlan_setup_interface_hdrlen(sc); return (0); @@ -2803,6 +3239,7 @@ vxlan_clone_destroy(struct ifnet *ifp) vxlan_sysctl_destroy(sc); rm_destroy(&sc->vxl_lock); + vxlan_stats_free(sc); free(sc, M_VXLAN); } @@ -3087,6 +3524,15 @@ vxlan_sysctl_setup(struct vxlan_softc *sc) "ftable_lock_upgrade_failed", CTLFLAG_RD, &stats->ftable_lock_upgrade_failed, 0, "Forwarding table update required lock upgrade"); + + SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "txcsum", + CTLFLAG_RD, &stats->txcsum, + "# of times hardware assisted with tx checksum"); + SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "tso", + CTLFLAG_RD, &stats->tso, "# of times hardware assisted with TSO"); + SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "rxcsum", + CTLFLAG_RD, &stats->rxcsum, + "# of times hardware assisted with rx checksum"); } static void @@ -3131,10 +3577,12 @@ vxlan_ifdetach_event(void *arg __unused, struct ifnet *ifp) LIST_FOREACH_SAFE(sc, &list, vxl_ifdetach_list, tsc) { LIST_REMOVE(sc, vxl_ifdetach_list); + sx_xlock(&vxlan_sx); VXLAN_WLOCK(sc); if (sc->vxl_flags & VXLAN_FLAG_INIT) vxlan_init_wait(sc); vxlan_teardown_locked(sc); + sx_xunlock(&vxlan_sx); } } diff --git a/sys/net/if_vxlan.h b/sys/net/if_vxlan.h index 98b4d902624..01c73a9307c 100644 --- a/sys/net/if_vxlan.h +++ b/sys/net/if_vxlan.h @@ -143,4 +143,11 @@ struct ifvxlancmd { char vxlcmd_ifname[IFNAMSIZ]; }; +#ifdef _KERNEL +typedef void (*vxlan_event_handler_t)(void *, struct ifnet *, sa_family_t, + u_int); +EVENTHANDLER_DECLARE(vxlan_start, vxlan_event_handler_t); +EVENTHANDLER_DECLARE(vxlan_stop, vxlan_event_handler_t); +#endif + #endif /* _NET_IF_VXLAN_H_ */ diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c index a37201ed237..c1dd6bad6c6 100644 --- a/sys/netinet/ip_output.c +++ b/sys/netinet/ip_output.c @@ -767,9 +767,13 @@ sendit: /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. + * Note that if_vxlan could have requested TSO even though the outer + * frame is UDP. It is correct to not fragment such datagrams and + * instead just pass them on to the driver. */ if (ip_len <= mtu || - (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) { + (m->m_pkthdr.csum_flags & ifp->if_hwassist & + (CSUM_TSO | CSUM_INNER_TSO)) != 0) { ip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) { ip->ip_sum = in_cksum(m, hlen); @@ -783,7 +787,8 @@ sendit: * once instead of for every generated packet. */ if (!(flags & IP_FORWARDING) && ia) { - if (m->m_pkthdr.csum_flags & CSUM_TSO) + if (m->m_pkthdr.csum_flags & + (CSUM_TSO | CSUM_INNER_TSO)) counter_u64_add(ia->ia_ifa.ifa_opackets, m->m_pkthdr.len / m->m_pkthdr.tso_segsz); else @@ -807,7 +812,8 @@ sendit: } /* Balk when DF bit is set or the interface didn't support TSO. */ - if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { + if ((ip_off & IP_DF) || + (m->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_INNER_TSO))) { error = EMSGSIZE; IPSTAT_INC(ips_cantfrag); goto bad; diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c index 9d37cf84bde..6be710085b8 100644 --- a/sys/netinet6/ip6_output.c +++ b/sys/netinet6/ip6_output.c @@ -1119,7 +1119,8 @@ passout: */ sw_csum = m->m_pkthdr.csum_flags; if (!hdrsplit) { - tso = ((sw_csum & ifp->if_hwassist & CSUM_TSO) != 0) ? 1 : 0; + tso = ((sw_csum & ifp->if_hwassist & + (CSUM_TSO | CSUM_INNER_TSO)) != 0) ? 1 : 0; sw_csum &= ~ifp->if_hwassist; } else tso = 0;