diff --git a/sys/conf/files b/sys/conf/files index 1f20111572f..c61030225e8 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1875,8 +1875,10 @@ dev/ncv/ncr53c500_pccard.c optional ncv pccard dev/netmap/netmap.c optional netmap dev/netmap/netmap_freebsd.c optional netmap dev/netmap/netmap_generic.c optional netmap -dev/netmap/netmap_mbq.c optional netmap +dev/netmap/netmap_mbq.c optional netmap dev/netmap/netmap_mem2.c optional netmap +dev/netmap/netmap_offloadings.c optional netmap +dev/netmap/netmap_pipe.c optional netmap dev/netmap/netmap_vale.c optional netmap # compile-with "${NORMAL_C} -Wconversion -Wextra" dev/nge/if_nge.c optional nge diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c index fdd368a346f..de88fb58fc8 100644 --- a/sys/dev/netmap/netmap.c +++ b/sys/dev/netmap/netmap.c @@ -156,9 +156,11 @@ ports attached to the switch) /* reduce conditional code */ -#define init_waitqueue_head(x) // only needed in linux - +// linux API, use for the knlist in FreeBSD +#define init_waitqueue_head(x) knlist_init_mtx(&(x)->si_note, NULL) +void freebsd_selwakeup(struct selinfo *si, int pri); +#define OS_selwakeup(a, b) freebsd_selwakeup(a, b) #elif defined(linux) @@ -231,6 +233,7 @@ static int netmap_admode = NETMAP_ADMODE_BEST; int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. */ int netmap_generic_ringsize = 1024; /* Generic ringsize. */ +int netmap_generic_rings = 1; /* number of queues in generic. 
*/ SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); @@ -238,6 +241,7 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , ""); NMG_LOCK_T netmap_global_lock; @@ -270,28 +274,30 @@ netmap_set_all_rings(struct ifnet *ifp, int stopped) { struct netmap_adapter *na; int i; + u_int ntx, nrx; if (!(ifp->if_capenable & IFCAP_NETMAP)) return; na = NA(ifp); - for (i = 0; i <= na->num_tx_rings; i++) { + ntx = netmap_real_tx_rings(na); + nrx = netmap_real_rx_rings(na); + + for (i = 0; i < ntx; i++) { if (stopped) netmap_disable_ring(na->tx_rings + i); else na->tx_rings[i].nkr_stopped = 0; - na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY | - (i == na->num_tx_rings ? NAF_GLOBAL_NOTIFY: 0)); + na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY); } - for (i = 0; i <= na->num_rx_rings; i++) { + for (i = 0; i < nrx; i++) { if (stopped) netmap_disable_ring(na->rx_rings + i); else na->rx_rings[i].nkr_stopped = 0; - na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY | - (i == na->num_rx_rings ? 
NAF_GLOBAL_NOTIFY: 0)); + na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY); } } @@ -426,14 +432,73 @@ netmap_update_config(struct netmap_adapter *na) return 1; } +static int +netmap_txsync_compat(struct netmap_kring *kring, int flags) +{ + struct netmap_adapter *na = kring->na; + return na->nm_txsync(na, kring->ring_id, flags); +} +static int +netmap_rxsync_compat(struct netmap_kring *kring, int flags) +{ + struct netmap_adapter *na = kring->na; + return na->nm_rxsync(na, kring->ring_id, flags); +} + +static int +netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags) +{ + (void)flags; + netmap_txsync_to_host(kring->na); + return 0; +} + +static int +netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags) +{ + (void)flags; + netmap_rxsync_from_host(kring->na, NULL, NULL); + return 0; +} + + + +/* create the krings array and initialize the fields common to all adapters. + * The array layout is this: + * + * +----------+ + * na->tx_rings ----->| | \ + * | | } na->num_tx_ring + * | | / + * +----------+ + * | | host tx kring + * na->rx_rings ----> +----------+ + * | | \ + * | | } na->num_rx_rings + * | | / + * +----------+ + * | | host rx kring + * +----------+ + * na->tailroom ----->| | \ + * | | } tailroom bytes + * | | / + * +----------+ + * + * Note: for compatibility, host krings are created even when not needed. + * The tailroom space is currently used by vale ports for allocating leases. + */ int -netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom) +netmap_krings_create(struct netmap_adapter *na, u_int tailroom) { u_int i, len, ndesc; struct netmap_kring *kring; + u_int ntx, nrx; + + /* account for the (possibly fake) host rings */ + ntx = na->num_tx_rings + 1; + nrx = na->num_rx_rings + 1; - // XXX additional space for extra rings ? 
len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom; na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); @@ -454,12 +519,19 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail kring->na = na; kring->ring_id = i; kring->nkr_num_slots = ndesc; + if (i < na->num_tx_rings) { + kring->nm_sync = netmap_txsync_compat; // XXX + } else if (i == na->num_tx_rings) { + kring->nm_sync = netmap_txsync_to_host_compat; + } /* * IMPORTANT: Always keep one slot empty. */ kring->rhead = kring->rcur = kring->nr_hwcur = 0; kring->rtail = kring->nr_hwtail = ndesc - 1; snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i); + ND("ktx %s h %d c %d t %d", + kring->name, kring->rhead, kring->rcur, kring->rtail); mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF); init_waitqueue_head(&kring->si); } @@ -471,9 +543,16 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail kring->na = na; kring->ring_id = i; kring->nkr_num_slots = ndesc; + if (i < na->num_rx_rings) { + kring->nm_sync = netmap_rxsync_compat; // XXX + } else if (i == na->num_rx_rings) { + kring->nm_sync = netmap_rxsync_from_host_compat; + } kring->rhead = kring->rcur = kring->nr_hwcur = 0; kring->rtail = kring->nr_hwtail = 0; snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i); + ND("krx %s h %d c %d t %d", + kring->name, kring->rhead, kring->rcur, kring->rtail); mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF); init_waitqueue_head(&kring->si); } @@ -486,17 +565,15 @@ netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tail } -/* XXX check boundaries */ +/* undo the actions performed by netmap_krings_create */ void netmap_krings_delete(struct netmap_adapter *na) { - int i; + struct netmap_kring *kring = na->tx_rings; - for (i = 0; i < na->num_tx_rings + 1; i++) { - mtx_destroy(&na->tx_rings[i].q_lock); - } - for (i = 0; i < na->num_rx_rings + 1; i++) { - 
mtx_destroy(&na->rx_rings[i].q_lock); + /* we rely on the krings layout described above */ + for ( ; kring != na->tailroom; kring++) { + mtx_destroy(&kring->q_lock); } free(na->tx_rings, M_DEVBUF); na->tx_rings = na->rx_rings = na->tailroom = NULL; @@ -677,6 +754,20 @@ netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) netmap_mem_if_delete(na, nifp); } +static __inline int +nm_tx_si_user(struct netmap_priv_d *priv) +{ + return (priv->np_na != NULL && + (priv->np_txqlast - priv->np_txqfirst > 1)); +} + +static __inline int +nm_rx_si_user(struct netmap_priv_d *priv) +{ + return (priv->np_na != NULL && + (priv->np_rxqlast - priv->np_rxqfirst > 1)); +} + /* * returns 1 if this is the last instance and we can free priv @@ -702,6 +793,10 @@ netmap_dtor_locked(struct netmap_priv_d *priv) priv->np_nifp = NULL; netmap_drop_memory_locked(priv); if (priv->np_na) { + if (nm_tx_si_user(priv)) + na->tx_si_users--; + if (nm_rx_si_user(priv)) + na->rx_si_users--; netmap_adapter_put(na); priv->np_na = NULL; } @@ -864,22 +959,8 @@ netmap_txsync_to_host(struct netmap_adapter *na) struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; struct netmap_ring *ring = kring->ring; u_int const lim = kring->nkr_num_slots - 1; - u_int const head = nm_txsync_prologue(kring); + u_int const head = kring->rhead; struct mbq q; - int error; - - error = nm_kr_tryget(kring); - if (error) { - if (error == NM_KR_BUSY) - D("ring %p busy (user error)", kring); - return; - } - if (head > lim) { - D("invalid ring index in stack TX kring %p", kring); - netmap_ring_reinit(kring); - nm_kr_put(kring); - return; - } /* Take packets from hwcur to head and pass them up. 
* force head = cur since netmap_grab_packets() stops at head @@ -896,7 +977,6 @@ netmap_txsync_to_host(struct netmap_adapter *na) kring->nr_hwtail -= lim + 1; nm_txsync_finalize(kring); - nm_kr_put(kring); netmap_send_up(na->ifp, &q); } @@ -921,27 +1001,15 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai struct netmap_ring *ring = kring->ring; u_int nm_i, n; u_int const lim = kring->nkr_num_slots - 1; - u_int const head = nm_rxsync_prologue(kring); + u_int const head = kring->rhead; int ret = 0; struct mbq *q = &kring->rx_queue; (void)pwait; /* disable unused warnings */ - - if (head > lim) { - netmap_ring_reinit(kring); - return EINVAL; - } - - if (kring->nkr_stopped) /* check a first time without lock */ - return EBUSY; + (void)td; mtx_lock(&q->lock); - if (kring->nkr_stopped) { /* check again with lock held */ - ret = EBUSY; - goto unlock_out; - } - /* First part: import newly received packets */ n = mbq_len(q); if (n) { /* grab packets from the queue */ @@ -982,8 +1050,6 @@ netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwai if (kring->rcur == kring->rtail && td) /* no bufs available */ selrecord(td, &kring->si); -unlock_out: - mtx_unlock(&q->lock); return ret; } @@ -1107,19 +1173,26 @@ netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) { - struct ifnet *ifp; + struct ifnet *ifp = NULL; int error = 0; - struct netmap_adapter *ret; + struct netmap_adapter *ret = NULL; *na = NULL; /* default return value */ /* first try to see if this is a bridge port. 
*/ NMG_LOCK_ASSERT(); - error = netmap_get_bdg_na(nmr, na, create); - if (error || *na != NULL) /* valid match in netmap_get_bdg_na() */ + error = netmap_get_pipe_na(nmr, na, create); + if (error || *na != NULL) return error; + error = netmap_get_bdg_na(nmr, na, create); + if (error) + return error; + + if (*na != NULL) /* valid match in netmap_get_bdg_na() */ + goto pipes; + ifp = ifunit_ref(nmr->nr_name); if (ifp == NULL) { return ENXIO; @@ -1129,18 +1202,23 @@ netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) if (error) goto out; - if (ret != NULL) { - /* Users cannot use the NIC attached to a bridge directly */ - if (NETMAP_OWNED_BY_KERN(ret)) { - error = EBUSY; - goto out; - } - error = 0; - *na = ret; - netmap_adapter_get(ret); + /* Users cannot use the NIC attached to a bridge directly */ + if (NETMAP_OWNED_BY_KERN(ret)) { + error = EBUSY; + goto out; } + *na = ret; + netmap_adapter_get(ret); + +pipes: + error = netmap_pipe_alloc(*na, nmr); + out: - if_rele(ifp); + if (error && ret != NULL) + netmap_adapter_put(ret); + + if (ifp) + if_rele(ifp); return error; } @@ -1365,45 +1443,88 @@ netmap_ring_reinit(struct netmap_kring *kring) * for all rings is the same as a single ring. 
*/ static int -netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) +netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags) { struct netmap_adapter *na = priv->np_na; - struct ifnet *ifp = na->ifp; - u_int i = ringid & NETMAP_RING_MASK; - /* initially (np_qfirst == np_qlast) we don't want to lock */ - u_int lim = na->num_rx_rings; + u_int j, i = ringid & NETMAP_RING_MASK; + u_int reg = flags & NR_REG_MASK; - if (na->num_tx_rings > lim) - lim = na->num_tx_rings; - if ( (ringid & NETMAP_HW_RING) && i >= lim) { - D("invalid ring id %d", i); - return (EINVAL); + if (reg == NR_REG_DEFAULT) { + /* convert from old ringid to flags */ + if (ringid & NETMAP_SW_RING) { + reg = NR_REG_SW; + } else if (ringid & NETMAP_HW_RING) { + reg = NR_REG_ONE_NIC; + } else { + reg = NR_REG_ALL_NIC; + } + D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg); } - priv->np_ringid = ringid; - if (ringid & NETMAP_SW_RING) { - priv->np_qfirst = NETMAP_SW_RING; - priv->np_qlast = 0; - } else if (ringid & NETMAP_HW_RING) { - priv->np_qfirst = i; - priv->np_qlast = i + 1; - } else { - priv->np_qfirst = 0; - priv->np_qlast = NETMAP_HW_RING ; + switch (reg) { + case NR_REG_ALL_NIC: + case NR_REG_PIPE_MASTER: + case NR_REG_PIPE_SLAVE: + priv->np_txqfirst = 0; + priv->np_txqlast = na->num_tx_rings; + priv->np_rxqfirst = 0; + priv->np_rxqlast = na->num_rx_rings; + ND("%s %d %d", "ALL/PIPE", + priv->np_rxqfirst, priv->np_rxqlast); + break; + case NR_REG_SW: + case NR_REG_NIC_SW: + if (!(na->na_flags & NAF_HOST_RINGS)) { + D("host rings not supported"); + return EINVAL; + } + priv->np_txqfirst = (reg == NR_REG_SW ? + na->num_tx_rings : 0); + priv->np_txqlast = na->num_tx_rings + 1; + priv->np_rxqfirst = (reg == NR_REG_SW ? + na->num_rx_rings : 0); + priv->np_rxqlast = na->num_rx_rings + 1; + ND("%s %d %d", reg == NR_REG_SW ? 
"SW" : "NIC+SW", + priv->np_rxqfirst, priv->np_rxqlast); + break; + case NR_REG_ONE_NIC: + if (i >= na->num_tx_rings && i >= na->num_rx_rings) { + D("invalid ring id %d", i); + return EINVAL; + } + /* if not enough rings, use the first one */ + j = i; + if (j >= na->num_tx_rings) + j = 0; + priv->np_txqfirst = j; + priv->np_txqlast = j + 1; + j = i; + if (j >= na->num_rx_rings) + j = 0; + priv->np_rxqfirst = j; + priv->np_rxqlast = j + 1; + break; + default: + D("invalid regif type %d", reg); + return EINVAL; } priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; - if (netmap_verbose) { - if (ringid & NETMAP_SW_RING) - D("ringid %s set to SW RING", NM_IFPNAME(ifp)); - else if (ringid & NETMAP_HW_RING) - D("ringid %s set to HW RING %d", NM_IFPNAME(ifp), - priv->np_qfirst); - else - D("ringid %s set to all %d HW RINGS", NM_IFPNAME(ifp), lim); - } + priv->np_flags = (flags & ~NR_REG_MASK) | reg; + if (nm_tx_si_user(priv)) + na->tx_si_users++; + if (nm_rx_si_user(priv)) + na->rx_si_users++; + if (netmap_verbose) { + D("%s: tx [%d,%d) rx [%d,%d) id %d", + NM_IFPNAME(na->ifp), + priv->np_txqfirst, + priv->np_txqlast, + priv->np_rxqfirst, + priv->np_rxqlast, + i); + } return 0; } - /* * possibly move the interface to netmap-mode. * If success it returns a pointer to netmap_if, otherwise NULL. 
@@ -1411,7 +1532,7 @@ netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) */ struct netmap_if * netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, - uint16_t ringid, int *err) + uint16_t ringid, uint32_t flags, int *err) { struct ifnet *ifp = na->ifp; struct netmap_if *nifp = NULL; @@ -1421,7 +1542,7 @@ netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, /* ring configuration may have changed, fetch from the card */ netmap_update_config(na); priv->np_na = na; /* store the reference */ - error = netmap_set_ringid(priv, ringid); + error = netmap_set_ringid(priv, ringid, flags); if (error) goto out; /* ensure allocators are ready */ @@ -1501,26 +1622,12 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, struct nmreq *nmr = (struct nmreq *) data; struct netmap_adapter *na = NULL; int error; - u_int i, lim; + u_int i, qfirst, qlast; struct netmap_if *nifp; struct netmap_kring *krings; (void)dev; /* UNUSED */ (void)fflag; /* UNUSED */ -#ifdef linux -#define devfs_get_cdevpriv(pp) \ - ({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; \ - (*pp ? 0 : ENOENT); }) - -/* devfs_set_cdevpriv cannot fail on linux */ -#define devfs_set_cdevpriv(p, fn) \ - ({ ((struct file *)td)->private_data = p; (p ? 
0 : EINVAL); }) - - -#define devfs_clear_cdevpriv() do { \ - netmap_dtor(priv); ((struct file *)td)->private_data = 0; \ - } while (0) -#endif /* linux */ if (cmd == NIOCGINFO || cmd == NIOCREGIF) { /* truncate name */ @@ -1530,6 +1637,9 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, nmr->nr_name, nmr->nr_version, NETMAP_API); nmr->nr_version = NETMAP_API; + } + if (nmr->nr_version < NETMAP_MIN_API || + nmr->nr_version > NETMAP_MAX_API) { return EINVAL; } } @@ -1564,7 +1674,8 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, nmd = na->nm_mem; /* get memory allocator */ } - error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags); + error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags, + &nmr->nr_arg2); if (error) break; if (na == NULL) /* only memory info */ @@ -1576,8 +1687,6 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, nmr->nr_tx_rings = na->num_tx_rings; nmr->nr_rx_slots = na->num_rx_desc; nmr->nr_tx_slots = na->num_tx_desc; - if (memflags & NETMAP_MEM_PRIVATE) - nmr->nr_ringid |= NETMAP_PRIV_MEM; netmap_adapter_put(na); } while (0); NMG_UNLOCK(); @@ -1587,7 +1696,7 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, /* possibly attach/detach NIC and VALE switch */ i = nmr->nr_cmd; if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH - || i == NETMAP_BDG_OFFSET) { + || i == NETMAP_BDG_VNET_HDR) { error = netmap_bdg_ctl(nmr, NULL); break; } else if (i != 0) { @@ -1602,7 +1711,7 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, u_int memflags; if (priv->np_na != NULL) { /* thread already registered */ - error = netmap_set_ringid(priv, nmr->nr_ringid); + error = EBUSY; break; } /* find the interface and a reference */ @@ -1615,27 +1724,39 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, error = EBUSY; break; } - nifp = netmap_do_regif(priv, na, nmr->nr_ringid, &error); + nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error); if (!nifp) { /* reg. 
failed, release priv and ref */ netmap_adapter_put(na); priv->np_nifp = NULL; break; } + priv->np_td = td; // XXX kqueue, debugging only /* return the offset of the netmap_if object */ nmr->nr_rx_rings = na->num_rx_rings; nmr->nr_tx_rings = na->num_tx_rings; nmr->nr_rx_slots = na->num_rx_desc; nmr->nr_tx_slots = na->num_tx_desc; - error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags); + error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags, + &nmr->nr_arg2); if (error) { netmap_adapter_put(na); break; } if (memflags & NETMAP_MEM_PRIVATE) { - nmr->nr_ringid |= NETMAP_PRIV_MEM; *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; } + priv->np_txsi = (priv->np_txqlast - priv->np_txqfirst > 1) ? + &na->tx_si : &na->tx_rings[priv->np_txqfirst].si; + priv->np_rxsi = (priv->np_rxqlast - priv->np_rxqfirst > 1) ? + &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si; + + if (nmr->nr_arg3) { + D("requested %d extra buffers", nmr->nr_arg3); + nmr->nr_arg3 = netmap_extra_alloc(na, + &nifp->ni_bufs_head, nmr->nr_arg3); + D("got %d extra buffers", nmr->nr_arg3); + } nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); } while (0); NMG_UNLOCK(); @@ -1666,21 +1787,17 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, break; } - if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */ - if (cmd == NIOCTXSYNC) - netmap_txsync_to_host(na); - else - netmap_rxsync_from_host(na, NULL, NULL); - break; + if (cmd == NIOCTXSYNC) { + krings = na->tx_rings; + qfirst = priv->np_txqfirst; + qlast = priv->np_txqlast; + } else { + krings = na->rx_rings; + qfirst = priv->np_rxqfirst; + qlast = priv->np_rxqlast; } - /* find the last ring to scan */ - lim = priv->np_qlast; - if (lim == NETMAP_HW_RING) - lim = (cmd == NIOCTXSYNC) ? - na->num_tx_rings : na->num_rx_rings; - krings = (cmd == NIOCTXSYNC) ? 
na->tx_rings : na->rx_rings; - for (i = priv->np_qfirst; i < lim; i++) { + for (i = qfirst; i < qlast; i++) { struct netmap_kring *kring = krings + i; if (nm_kr_tryget(kring)) { error = EBUSY; @@ -1694,14 +1811,14 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { netmap_ring_reinit(kring); } else { - na->nm_txsync(na, i, NAF_FORCE_RECLAIM); + kring->nm_sync(kring, NAF_FORCE_RECLAIM); } if (netmap_verbose & NM_VERB_TXSYNC) D("post txsync ring %d cur %d hwcur %d", i, kring->ring->cur, kring->nr_hwcur); } else { - na->nm_rxsync(na, i, NAF_FORCE_READ); + kring->nm_sync(kring, NAF_FORCE_READ); microtime(&na->rx_rings[i].ring->ts); } nm_kr_put(kring); @@ -1772,9 +1889,9 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) struct ifnet *ifp; struct netmap_kring *kring; u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0; - u_int lim_tx, lim_rx; struct mbq q; /* packets from hw queues to host stack */ void *pwait = dev; /* linux compatibility */ + int is_kevent = 0; /* * In order to avoid nested locks, we need to "double check" @@ -1786,7 +1903,19 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) (void)pwait; mbq_init(&q); - if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL) + /* + * XXX kevent has curthread->tp_fop == NULL, + * so devfs_get_cdevpriv() fails. We circumvent this by passing + * priv as the first argument, which is also useful to avoid + * the selrecord() which are not necessary in that case. 
+ */ + if (devfs_get_cdevpriv((void **)&priv) != 0) { + is_kevent = 1; + if (netmap_verbose) + D("called from kevent"); + priv = (struct netmap_priv_d *)dev; + } + if (priv == NULL) return POLLERR; if (priv->np_nifp == NULL) { @@ -1811,28 +1940,6 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) want_tx = events & (POLLOUT | POLLWRNORM); want_rx = events & (POLLIN | POLLRDNORM); - lim_tx = na->num_tx_rings; - lim_rx = na->num_rx_rings; - - if (priv->np_qfirst == NETMAP_SW_RING) { - // XXX locking ? - /* handle the host stack ring */ - if (priv->np_txpoll || want_tx) { - /* push any packets up, then we are always ready */ - netmap_txsync_to_host(na); - revents |= want_tx; - } - if (want_rx) { - kring = &na->rx_rings[lim_rx]; - /* XXX replace with rxprologue etc. */ - if (nm_ring_empty(kring->ring)) - netmap_rxsync_from_host(na, td, dev); - if (!nm_ring_empty(kring->ring)) - revents |= want_rx; - } - return (revents); - } - /* * check_all_{tx|rx} are set if the card has more than one queue AND @@ -1847,19 +1954,15 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) * there are pending packets to send. The latter can be disabled * passing NETMAP_NO_TX_POLL in the NIOCREG call. */ - check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1); - check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1); - - if (priv->np_qlast != NETMAP_HW_RING) { - lim_tx = lim_rx = priv->np_qlast; - } + check_all_tx = nm_tx_si_user(priv); + check_all_rx = nm_rx_si_user(priv); /* * We start with a lock free round which is cheap if we have * slots available. If this fails, then lock and call the sync * routines. 
*/ - for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) { + for (i = priv->np_rxqfirst; want_rx && i < priv->np_rxqlast; i++) { kring = &na->rx_rings[i]; /* XXX compare ring->cur and kring->tail */ if (!nm_ring_empty(kring->ring)) { @@ -1867,7 +1970,7 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) want_rx = 0; /* also breaks the loop */ } } - for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) { + for (i = priv->np_txqfirst; want_tx && i < priv->np_txqlast; i++) { kring = &na->tx_rings[i]; /* XXX compare ring->cur and kring->tail */ if (!nm_ring_empty(kring->ring)) { @@ -1891,7 +1994,7 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) * used to skip rings with no pending transmissions. */ flush_tx: - for (i = priv->np_qfirst; i < lim_tx; i++) { + for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) { int found = 0; kring = &na->tx_rings[i]; @@ -1906,7 +2009,7 @@ flush_tx: netmap_ring_reinit(kring); revents |= POLLERR; } else { - if (na->nm_txsync(na, i, 0)) + if (kring->nm_sync(kring, 0)) revents |= POLLERR; } @@ -1921,12 +2024,12 @@ flush_tx: if (found) { /* notify other listeners */ revents |= want_tx; want_tx = 0; - na->nm_notify(na, i, NR_TX, NAF_GLOBAL_NOTIFY); + na->nm_notify(na, i, NR_TX, 0); } } - if (want_tx && retry_tx) { + if (want_tx && retry_tx && !is_kevent) { selrecord(td, check_all_tx ? 
- &na->tx_si : &na->tx_rings[priv->np_qfirst].si); + &na->tx_si : &na->tx_rings[priv->np_txqfirst].si); retry_tx = 0; goto flush_tx; } @@ -1940,7 +2043,7 @@ flush_tx: int send_down = 0; /* transparent mode */ /* two rounds here to for race avoidance */ do_retry_rx: - for (i = priv->np_qfirst; i < lim_rx; i++) { + for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) { int found = 0; kring = &na->rx_rings[i]; @@ -1962,7 +2065,7 @@ do_retry_rx: netmap_grab_packets(kring, &q, netmap_fwd); } - if (na->nm_rxsync(na, i, 0)) + if (kring->nm_sync(kring, 0)) revents |= POLLERR; if (netmap_no_timestamp == 0 || kring->ring->flags & NR_TIMESTAMP) { @@ -1974,24 +2077,26 @@ do_retry_rx: if (found) { revents |= want_rx; retry_rx = 0; - na->nm_notify(na, i, NR_RX, NAF_GLOBAL_NOTIFY); + na->nm_notify(na, i, NR_RX, 0); } } /* transparent mode XXX only during first pass ? */ - kring = &na->rx_rings[lim_rx]; - if (check_all_rx - && (netmap_fwd || kring->ring->flags & NR_FORWARD)) { - /* XXX fix to use kring fields */ - if (nm_ring_empty(kring->ring)) - send_down = netmap_rxsync_from_host(na, td, dev); - if (!nm_ring_empty(kring->ring)) - revents |= want_rx; + if (na->na_flags & NAF_HOST_RINGS) { + kring = &na->rx_rings[na->num_rx_rings]; + if (check_all_rx + && (netmap_fwd || kring->ring->flags & NR_FORWARD)) { + /* XXX fix to use kring fields */ + if (nm_ring_empty(kring->ring)) + send_down = netmap_rxsync_from_host(na, td, dev); + if (!nm_ring_empty(kring->ring)) + revents |= want_rx; + } } - if (retry_rx) + if (retry_rx && !is_kevent) selrecord(td, check_all_rx ? 
- &na->rx_si : &na->rx_rings[priv->np_qfirst].si); + &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si); if (send_down > 0 || retry_rx) { retry_rx = 0; if (send_down) @@ -2032,14 +2137,14 @@ netmap_notify(struct netmap_adapter *na, u_int n_ring, if (tx == NR_TX) { kring = na->tx_rings + n_ring; - selwakeuppri(&kring->si, PI_NET); - if (flags & NAF_GLOBAL_NOTIFY) - selwakeuppri(&na->tx_si, PI_NET); + OS_selwakeup(&kring->si, PI_NET); + if (na->tx_si_users > 0) + OS_selwakeup(&na->tx_si, PI_NET); } else { kring = na->rx_rings + n_ring; - selwakeuppri(&kring->si, PI_NET); - if (flags & NAF_GLOBAL_NOTIFY) - selwakeuppri(&na->rx_si, PI_NET); + OS_selwakeup(&kring->si, PI_NET); + if (na->rx_si_users > 0) + OS_selwakeup(&na->rx_si, PI_NET); } return 0; } @@ -2090,6 +2195,7 @@ netmap_detach_common(struct netmap_adapter *na) D("freeing leftover tx_rings"); na->nm_krings_delete(na); } + netmap_pipe_dealloc(na); if (na->na_flags & NAF_MEM_OWNER) netmap_mem_private_delete(na->nm_mem); bzero(na, sizeof(*na)); @@ -2120,6 +2226,7 @@ netmap_attach(struct netmap_adapter *arg) if (hwna == NULL) goto fail; hwna->up = *arg; + hwna->up.na_flags |= NAF_HOST_RINGS; if (netmap_attach_common(&hwna->up)) { free(hwna, M_DEVBUF); goto fail; @@ -2177,12 +2284,10 @@ NM_DBG(netmap_adapter_put)(struct netmap_adapter *na) return 1; } - int netmap_hw_krings_create(struct netmap_adapter *na) { - int ret = netmap_krings_create(na, - na->num_tx_rings + 1, na->num_rx_rings + 1, 0); + int ret = netmap_krings_create(na, 0); if (ret == 0) { /* initialize the mbq for the sw rx ring */ mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue); @@ -2370,7 +2475,7 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, * We do the wakeup here, but the ring is not yet reconfigured. * However, we are under lock so there are no races. 
*/ - na->nm_notify(na, n, tx, NAF_GLOBAL_NOTIFY); + na->nm_notify(na, n, tx, 0); return kring->ring->slot; } @@ -2405,15 +2510,13 @@ netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) return; // not a physical queue kring = na->rx_rings + q; kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? - na->nm_notify(na, q, NR_RX, - (na->num_rx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0)); + na->nm_notify(na, q, NR_RX, 0); *work_done = 1; /* do not fire napi again */ } else { /* TX path */ if (q >= na->num_tx_rings) return; // not a physical queue kring = na->tx_rings + q; - na->nm_notify(na, q, NR_TX, - (na->num_tx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0)); + na->nm_notify(na, q, NR_TX, 0); } } diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c index 6716168526d..a8e287c6ddd 100644 --- a/sys/dev/netmap/netmap_freebsd.c +++ b/sys/dev/netmap/netmap_freebsd.c @@ -29,8 +29,10 @@ #include #include #include /* defines used in kernel.h */ +#include /* POLLIN, POLLOUT */ #include /* types used in module initialization */ #include /* DEV_MODULE */ +#include #include @@ -49,6 +51,8 @@ #include #include #include /* bus_dmamap_* */ +#include /* in6_cksum_pseudo() */ +#include /* in_pseudo(), in_cksum_hdr() */ #include #include @@ -57,6 +61,73 @@ /* ======================== FREEBSD-SPECIFIC ROUTINES ================== */ +rawsum_t nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) +{ + /* TODO XXX please use the FreeBSD implementation for this. */ + uint16_t *words = (uint16_t *)data; + int nw = len / 2; + int i; + + for (i = 0; i < nw; i++) + cur_sum += be16toh(words[i]); + + if (len & 1) + cur_sum += (data[len-1] << 8); + + return cur_sum; +} + +/* Fold a raw checksum: 'cur_sum' is in host byte order, while the + * return value is in network byte order. + */ +uint16_t nm_csum_fold(rawsum_t cur_sum) +{ + /* TODO XXX please use the FreeBSD implementation for this. 
*/ + while (cur_sum >> 16) + cur_sum = (cur_sum & 0xFFFF) + (cur_sum >> 16); + + return htobe16((~cur_sum) & 0xFFFF); +} + +uint16_t nm_csum_ipv4(struct nm_iphdr *iph) +{ +#if 0 + return in_cksum_hdr((void *)iph); +#else + return nm_csum_fold(nm_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0)); +#endif +} + +void nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, + size_t datalen, uint16_t *check) +{ + uint16_t pseudolen = datalen + iph->protocol; + + /* Compute and insert the pseudo-header cheksum. */ + *check = in_pseudo(iph->saddr, iph->daddr, + htobe16(pseudolen)); + /* Compute the checksum on TCP/UDP header + payload + * (includes the pseudo-header). + */ + *check = nm_csum_fold(nm_csum_raw(data, datalen, 0)); +} + +void nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, + size_t datalen, uint16_t *check) +{ +#ifdef INET6 + *check = in6_cksum_pseudo((void*)ip6h, datalen, ip6h->nexthdr, 0); + *check = nm_csum_fold(nm_csum_raw(data, datalen, 0)); +#else + static int notsupported = 0; + if (!notsupported) { + notsupported = 1; + D("inet6 segmentation not supported"); + } +#endif +} + + /* * Intercept the rx routine in the standard device driver. * Second argument is non-zero to intercept, 0 to restore @@ -91,10 +162,7 @@ netmap_catch_rx(struct netmap_adapter *na, int intercept) * Intercept the packet steering routine in the tx path, * so that we can decide which queue is used for an mbuf. * Second argument is non-zero to intercept, 0 to restore. - * - * actually we also need to redirect the if_transmit ? - * - * XXX see if FreeBSD has such a mechanism + * On freebsd we just intercept if_transmit. */ void netmap_catch_tx(struct netmap_generic_adapter *gna, int enable) @@ -111,7 +179,8 @@ netmap_catch_tx(struct netmap_generic_adapter *gna, int enable) } -/* Transmit routine used by generic_netmap_txsync(). Returns 0 on success +/* + * Transmit routine used by generic_netmap_txsync(). 
Returns 0 on success * and non-zero on error (which may be packet drops or other errors). * addr and len identify the netmap buffer, m is the (preallocated) * mbuf to use for transmissions. @@ -162,38 +231,39 @@ void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) { D("called"); - *txq = 1; - *rxq = 1; + *txq = netmap_generic_rings; + *rxq = netmap_generic_rings; } -void netmap_mitigation_init(struct netmap_generic_adapter *na) +void netmap_mitigation_init(struct nm_generic_mit *mit, struct netmap_adapter *na) { ND("called"); - na->mit_pending = 0; + mit->mit_pending = 0; + mit->mit_na = na; } -void netmap_mitigation_start(struct netmap_generic_adapter *na) +void netmap_mitigation_start(struct nm_generic_mit *mit) { ND("called"); } -void netmap_mitigation_restart(struct netmap_generic_adapter *na) +void netmap_mitigation_restart(struct nm_generic_mit *mit) { ND("called"); } -int netmap_mitigation_active(struct netmap_generic_adapter *na) +int netmap_mitigation_active(struct nm_generic_mit *mit) { ND("called"); return 0; } -void netmap_mitigation_cleanup(struct netmap_generic_adapter *na) +void netmap_mitigation_cleanup(struct nm_generic_mit *mit) { ND("called"); } @@ -216,8 +286,10 @@ netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) { struct netmap_vm_handle_t *vmh = handle; - D("handle %p size %jd prot %d foff %jd", - handle, (intmax_t)size, prot, (intmax_t)foff); + + if (netmap_verbose) + D("handle %p size %jd prot %d foff %jd", + handle, (intmax_t)size, prot, (intmax_t)foff); dev_ref(vmh->dev); return 0; } @@ -229,7 +301,9 @@ netmap_dev_pager_dtor(void *handle) struct netmap_vm_handle_t *vmh = handle; struct cdev *dev = vmh->dev; struct netmap_priv_d *priv = vmh->priv; - D("handle %p", handle); + + if (netmap_verbose) + D("handle %p", handle); netmap_dtor(priv); free(vmh, M_DEVBUF); dev_rel(dev); @@ -302,8 +376,9 @@ netmap_mmap_single(struct cdev *cdev, vm_ooffset_t 
*foff, struct netmap_priv_d *priv; vm_object_t obj; - D("cdev %p foff %jd size %jd objp %p prot %d", cdev, - (intmax_t )*foff, (intmax_t )objsize, objp, prot); + if (netmap_verbose) + D("cdev %p foff %jd size %jd objp %p prot %d", cdev, + (intmax_t )*foff, (intmax_t )objsize, objp, prot); vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF, M_NOWAIT | M_ZERO); @@ -383,6 +458,157 @@ netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) return 0; } +/******************** kqueue support ****************/ + +/* + * The OS_selwakeup also needs to issue a KNOTE_UNLOCKED. + * We use a non-zero argument to distinguish the call from the one + * in kevent_scan() which instead also needs to run netmap_poll(). + * The knote uses a global mutex for the time being. We might + * try to reuse the one in the si, but it is not allocated + * permanently so it might be a bit tricky. + * + * The *kqfilter function registers one or another f_event + * depending on read or write mode. + * In the call to f_event() td_fpop is NULL so any child function + * calling devfs_get_cdevpriv() would fail - and we need it in + * netmap_poll(). As a workaround we store priv into kn->kn_hook + * and pass it as first argument to netmap_poll(), which then + * uses the failure to tell that we are called from f_event() + * and do not need the selrecord(). 
+ */ + +void freebsd_selwakeup(struct selinfo *si, int pri); + +void +freebsd_selwakeup(struct selinfo *si, int pri) +{ + if (netmap_verbose) + D("on knote %p", &si->si_note); + selwakeuppri(si, pri); + /* use a non-zero hint to tell the notification from the + * call done in kqueue_scan() which uses 0 + */ + KNOTE_UNLOCKED(&si->si_note, 0x100 /* notification */); +} + +static void +netmap_knrdetach(struct knote *kn) +{ + struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; + struct selinfo *si = priv->np_rxsi; + + D("remove selinfo %p", si); + knlist_remove(&si->si_note, kn, 0); +} + +static void +netmap_knwdetach(struct knote *kn) +{ + struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; + struct selinfo *si = priv->np_txsi; + + D("remove selinfo %p", si); + knlist_remove(&si->si_note, kn, 0); +} + +/* + * callback from notifies (generated externally) and our + * calls to kevent(). The former we just return 1 (ready) + * since we do not know better. + * In the latter we call netmap_poll and return 0/1 accordingly. + */ +static int +netmap_knrw(struct knote *kn, long hint, int events) +{ + struct netmap_priv_d *priv; + int revents; + + if (hint != 0) { + ND(5, "call from notify"); + return 1; /* assume we are ready */ + } + priv = kn->kn_hook; + /* the notification may come from an external thread, + * in which case we do not want to run the netmap_poll + * This should be filtered above, but check just in case. + */ + if (curthread != priv->np_td) { /* should not happen */ + RD(5, "curthread changed %p %p", curthread, priv->np_td); + return 1; + } else { + revents = netmap_poll((void *)priv, events, curthread); + return (events & revents) ? 
1 : 0; + } +} + +static int +netmap_knread(struct knote *kn, long hint) +{ + return netmap_knrw(kn, hint, POLLIN); +} + +static int +netmap_knwrite(struct knote *kn, long hint) +{ + return netmap_knrw(kn, hint, POLLOUT); +} + +static struct filterops netmap_rfiltops = { + .f_isfd = 1, + .f_detach = netmap_knrdetach, + .f_event = netmap_knread, +}; + +static struct filterops netmap_wfiltops = { + .f_isfd = 1, + .f_detach = netmap_knwdetach, + .f_event = netmap_knwrite, +}; + + +/* + * This is called when a thread invokes kevent() to record + * a change in the configuration of the kqueue(). + * The 'priv' should be the same as in the netmap device. + */ +static int +netmap_kqfilter(struct cdev *dev, struct knote *kn) +{ + struct netmap_priv_d *priv; + int error; + struct netmap_adapter *na; + struct selinfo *si; + int ev = kn->kn_filter; + + if (ev != EVFILT_READ && ev != EVFILT_WRITE) { + D("bad filter request %d", ev); + return 1; + } + error = devfs_get_cdevpriv((void**)&priv); + if (error) { + D("device not yet setup"); + return 1; + } + na = priv->np_na; + if (na == NULL) { + D("no netmap adapter for this file descriptor"); + return 1; + } + /* the si is indicated in the priv */ + si = (ev == EVFILT_WRITE) ? priv->np_txsi : priv->np_rxsi; + // XXX lock(priv) ? + kn->kn_fop = (ev == EVFILT_WRITE) ? + &netmap_wfiltops : &netmap_rfiltops; + kn->kn_hook = priv; + knlist_add(&si->si_note, kn, 1); + // XXX unlock(priv) + ND("register %p %s td %p priv %p kn %p np_nifp %p kn_fp/fpop %s", + na, na->ifp->if_xname, curthread, priv, kn, + priv->np_nifp, + kn->kn_fp == curthread->td_fpop ? "match" : "MISMATCH"); + return 0; +} struct cdevsw netmap_cdevsw = { .d_version = D_VERSION, @@ -391,9 +617,10 @@ struct cdevsw netmap_cdevsw = { .d_mmap_single = netmap_mmap_single, .d_ioctl = netmap_ioctl, .d_poll = netmap_poll, + .d_kqfilter = netmap_kqfilter, .d_close = netmap_close, }; - +/*--- end of kqueue support ----*/ /* * Kernel entry point. 
diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c index e695fcbd29f..63253b6b069 100644 --- a/sys/dev/netmap/netmap_generic.c +++ b/sys/dev/netmap/netmap_generic.c @@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$"); #define rtnl_lock() D("rtnl_lock called"); #define rtnl_unlock() D("rtnl_unlock called"); #define MBUF_TXQ(m) ((m)->m_pkthdr.flowid) +#define MBUF_RXQ(m) ((m)->m_pkthdr.flowid) #define smp_mb() /* @@ -222,6 +223,17 @@ generic_netmap_register(struct netmap_adapter *na, int enable) #endif /* REG_RESET */ if (enable) { /* Enable netmap mode. */ + /* Init the mitigation support. */ + gna->mit = malloc(na->num_rx_rings * sizeof(struct nm_generic_mit), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (!gna->mit) { + D("mitigation allocation failed"); + error = ENOMEM; + goto out; + } + for (r=0; r<na->num_rx_rings; r++) + netmap_mitigation_init(&gna->mit[r], na); + /* Initialize the rx queue, as generic_rx_handler() can * be called as soon as netmap_catch_rx() returns. */ @@ -229,9 +241,6 @@ generic_netmap_register(struct netmap_adapter *na, int enable) mbq_safe_init(&na->rx_rings[r].rx_queue); } - /* Init the mitigation timer. */ - netmap_mitigation_init(gna); - /* * Preallocate packet buffers for the tx rings.
*/ @@ -306,7 +315,9 @@ generic_netmap_register(struct netmap_adapter *na, int enable) mbq_safe_destroy(&na->rx_rings[r].rx_queue); } - netmap_mitigation_cleanup(gna); + for (r=0; r<na->num_rx_rings; r++) + netmap_mitigation_cleanup(&gna->mit[r]); + free(gna->mit, M_DEVBUF); for (r=0; r<na->num_tx_rings; r++) { for (i=0; i<na->num_tx_desc; i++) { @@ -344,10 +355,12 @@ free_tx_pools: free(na->tx_rings[r].tx_pool, M_DEVBUF); na->tx_rings[r].tx_pool = NULL; } - netmap_mitigation_cleanup(gna); for (r=0; r<na->num_rx_rings; r++) { + netmap_mitigation_cleanup(&gna->mit[r]); mbq_safe_destroy(&na->rx_rings[r].rx_queue); } + free(gna->mit, M_DEVBUF); +out: return error; } @@ -557,12 +570,11 @@ generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) } slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); nm_i = nm_next(nm_i, lim); + IFRATE(rate_ctx.new.txpkt ++); } /* Update hwcur to the next slot to transmit. */ kring->nr_hwcur = nm_i; /* not head, we could break early */ - - IFRATE(rate_ctx.new.txpkt += ntx); } /* @@ -600,7 +612,11 @@ generic_rx_handler(struct ifnet *ifp, struct mbuf *m) struct netmap_adapter *na = NA(ifp); struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; u_int work_done; - u_int rr = 0; // receive ring number + u_int rr = MBUF_RXQ(m); // receive ring number + + if (rr >= na->num_rx_rings) { + rr = rr % na->num_rx_rings; // XXX expensive... + } /* limit the size of the queue */ if (unlikely(mbq_len(&na->rx_rings[rr].rx_queue) > 1024)) { @@ -617,13 +633,13 @@ generic_rx_handler(struct ifnet *ifp, struct mbuf *m) /* same as send combining, filter notification if there is a * pending timer, otherwise pass it up and start a timer. */ - if (likely(netmap_mitigation_active(gna))) { + if (likely(netmap_mitigation_active(&gna->mit[rr]))) { /* Record that there is some pending work.
*/ - gna->mit_pending = 1; + gna->mit[rr].mit_pending = 1; } else { netmap_generic_irq(na->ifp, rr, &work_done); IFRATE(rate_ctx.new.rxirq++); - netmap_mitigation_start(gna); + netmap_mitigation_start(&gna->mit[rr]); } } } @@ -682,7 +698,6 @@ generic_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) ring->slot[nm_i].flags = slot_flags; m_freem(m); nm_i = nm_next(nm_i, lim); - n++; } if (n) { kring->nr_hwtail = nm_i; @@ -772,7 +787,7 @@ generic_netmap_attach(struct ifnet *ifp) /* when using generic, IFCAP_NETMAP is set so we force * NAF_SKIP_INTR to use the regular interrupt handler */ - na->na_flags = NAF_SKIP_INTR; + na->na_flags = NAF_SKIP_INTR | NAF_HOST_RINGS; ND("[GNA] num_tx_queues(%d), real_num_tx_queues(%d), len(%lu)", ifp->num_tx_queues, ifp->real_num_tx_queues, diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h index 668e083e0b9..ddcb0e3185a 100644 --- a/sys/dev/netmap/netmap_kern.h +++ b/sys/dev/netmap/netmap_kern.h @@ -35,6 +35,7 @@ #define _NET_NETMAP_KERN_H_ #define WITH_VALE // comment out to disable VALE support +#define WITH_PIPES #if defined(__FreeBSD__) @@ -267,11 +268,11 @@ struct netmap_kring { volatile int nkr_stopped; // XXX what for ? - /* support for adapters without native netmap support. + /* Support for adapters without native netmap support. * On tx rings we preallocate an array of tx buffers * (same size as the netmap ring), on rx rings we - * store incoming packets in a queue. - * XXX who writes to the rx queue ? + * store incoming mbufs in a queue that is drained by + * a rxsync. */ struct mbuf **tx_pool; // u_int nr_ntc; /* Emulation of a next-to-clean RX ring pointer. 
*/ @@ -280,6 +281,13 @@ struct netmap_kring { uint32_t ring_id; /* debugging */ char name[64]; /* diagnostic */ + int (*nm_sync)(struct netmap_kring *kring, int flags); + +#ifdef WITH_PIPES + struct netmap_kring *pipe; + struct netmap_ring *save_ring; +#endif /* WITH_PIPES */ + } __attribute__((__aligned__(64))); @@ -388,6 +396,7 @@ struct netmap_adapter { * emulated. Where possible (e.g. FreeBSD) * IFCAP_NETMAP also mirrors this flag. */ +#define NAF_HOST_RINGS 64 /* the adapter supports the host rings */ int active_fds; /* number of user-space descriptors using this interface, which is equal to the number of struct netmap_if objs in the mapped region. */ @@ -411,6 +420,9 @@ struct netmap_adapter { NM_SELINFO_T tx_si, rx_si; /* global wait queues */ + /* count users of the global wait queues */ + int tx_si_users, rx_si_users; + /* copy of if_qflush and if_transmit pointers, to intercept * packets from the network stack when netmap is active. */ @@ -438,9 +450,11 @@ struct netmap_adapter { * * nm_config() returns configuration information from the OS * - * nm_krings_create() XXX + * nm_krings_create() create and init the krings array + * (the array layout must conform to the description + * found above the definition of netmap_krings_create) * - * nm_krings_delete() XXX + * nm_krings_delete() cleanup and delete the kring array * * nm_notify() is used to act after data have become available. 
* For hw devices this is typically a selwakeup(), @@ -464,7 +478,6 @@ struct netmap_adapter { void (*nm_krings_delete)(struct netmap_adapter *); int (*nm_notify)(struct netmap_adapter *, u_int ring, enum txrx, int flags); -#define NAF_GLOBAL_NOTIFY 4 #define NAF_DISABLE_NOTIFY 8 /* standard refcount to control the lifetime of the adapter @@ -484,6 +497,12 @@ struct netmap_adapter { * from userspace */ void *na_private; + +#ifdef WITH_PIPES + struct netmap_pipe_adapter **na_pipes; + int na_next_pipe; + int na_max_pipes; +#endif /* WITH_PIPES */ }; @@ -514,7 +533,10 @@ struct netmap_vp_adapter { /* VALE software port */ struct nm_bridge *na_bdg; int retry; - u_int offset; /* Offset of ethernet header for each packet. */ + /* Offset of ethernet header for each packet. */ + u_int virt_hdr_len; + /* Maximum Frame Size, used in bdg_mismatch_datapath() */ + u_int mfs; }; @@ -524,6 +546,12 @@ struct netmap_hw_adapter { /* physical device */ struct net_device_ops nm_ndo; // XXX linux only }; +/* Mitigation support. 
*/ +struct nm_generic_mit { + struct hrtimer mit_timer; + int mit_pending; + struct netmap_adapter *mit_na; /* backpointer */ +}; struct netmap_generic_adapter { /* emulated device */ struct netmap_hw_adapter up; @@ -534,18 +562,29 @@ struct netmap_generic_adapter { /* emulated device */ /* generic netmap adapters support: * a net_device_ops struct overrides ndo_select_queue(), * save_if_input saves the if_input hook (FreeBSD), - * mit_timer and mit_pending implement rx interrupt mitigation, + * mit implements rx interrupt mitigation, */ struct net_device_ops generic_ndo; void (*save_if_input)(struct ifnet *, struct mbuf *); - struct hrtimer mit_timer; - int mit_pending; + struct nm_generic_mit *mit; #ifdef linux netdev_tx_t (*save_start_xmit)(struct mbuf *, struct ifnet *); #endif }; +static __inline int +netmap_real_tx_rings(struct netmap_adapter *na) +{ + return na->num_tx_rings + !!(na->na_flags & NAF_HOST_RINGS); +} + +static __inline int +netmap_real_rx_rings(struct netmap_adapter *na) +{ + return na->num_rx_rings + !!(na->na_flags & NAF_HOST_RINGS); +} + #ifdef WITH_VALE /* @@ -614,6 +653,25 @@ struct netmap_bwrap_adapter { #endif /* WITH_VALE */ +#ifdef WITH_PIPES + +#define NM_MAXPIPES 64 /* max number of pipes per adapter */ + +struct netmap_pipe_adapter { + struct netmap_adapter up; + + u_int id; /* pipe identifier */ + int role; /* either NR_REG_PIPE_MASTER or NR_REG_PIPE_SLAVE */ + + struct netmap_adapter *parent; /* adapter that owns the memory */ + struct netmap_pipe_adapter *peer; /* the other end of the pipe */ + int peer_ref; /* 1 iff we are holding a ref to the peer */ + + u_int parent_slot; /* index in the parent pipe array */ +}; + +#endif /* WITH_PIPES */ + /* return slots reserved to rx clients; used in drivers */ static inline uint32_t @@ -767,9 +825,8 @@ uint32_t nm_rxsync_prologue(struct netmap_kring *); static inline void nm_txsync_finalize(struct netmap_kring *kring) { - /* update ring head/tail to what the kernel knows */ + /* update 
ring tail to what the kernel knows */ kring->ring->tail = kring->rtail = kring->nr_hwtail; - kring->ring->head = kring->rhead = kring->nr_hwcur; /* note, head/rhead/hwcur might be behind cur/rcur * if no carrier @@ -819,14 +876,14 @@ nm_rxsync_finalize(struct netmap_kring *kring) * Support routines to be used with the VALE switch */ int netmap_update_config(struct netmap_adapter *na); -int netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom); +int netmap_krings_create(struct netmap_adapter *na, u_int tailroom); void netmap_krings_delete(struct netmap_adapter *na); int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait); struct netmap_if * netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, - uint16_t ringid, int *err); + uint16_t ringid, uint32_t flags, int *err); @@ -868,6 +925,20 @@ int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func); #define netmap_bdg_ctl(_1, _2) EINVAL #endif /* !WITH_VALE */ +#ifdef WITH_PIPES +/* max number of pipes per device */ +#define NM_MAXPIPES 64 /* XXX how many? 
*/ +/* in case of no error, returns the actual number of pipes in nmr->nr_arg1 */ +int netmap_pipe_alloc(struct netmap_adapter *, struct nmreq *nmr); +void netmap_pipe_dealloc(struct netmap_adapter *); +int netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create); +#else /* !WITH_PIPES */ +#define NM_MAXPIPES 0 +#define netmap_pipe_alloc(_1, _2) EOPNOTSUPP +#define netmap_pipe_dealloc(_1) +#define netmap_get_pipe_na(_1, _2, _3) 0 +#endif + /* Various prototypes */ int netmap_poll(struct cdev *dev, int events, struct thread *td); int netmap_init(void); @@ -938,6 +1009,7 @@ enum { /* verbose flags */ extern int netmap_txsync_retry; extern int netmap_generic_mit; extern int netmap_generic_ringsize; +extern int netmap_generic_rings; /* * NA returns a pointer to the struct netmap adapter from the ifp, @@ -1160,13 +1232,21 @@ struct netmap_priv_d { struct netmap_if * volatile np_nifp; /* netmap if descriptor. */ struct netmap_adapter *np_na; - int np_ringid; /* from the ioctl */ - u_int np_qfirst, np_qlast; /* range of rings to scan */ - uint16_t np_txpoll; + uint32_t np_flags; /* from the ioctl */ + u_int np_txqfirst, np_txqlast; /* range of tx rings to scan */ + u_int np_rxqfirst, np_rxqlast; /* range of rx rings to scan */ + uint16_t np_txpoll; /* XXX and also np_rxpoll ? */ struct netmap_mem_d *np_mref; /* use with NMG_LOCK held */ /* np_refcount is only used on FreeBSD */ int np_refcount; /* use with NMG_LOCK held */ + + /* pointers to the selinfo to be used for selrecord. + * Either the local or the global one depending on the + * number of rings. + */ + NM_SELINFO_T *np_rxsi, *np_txsi; + struct thread *np_td; /* kqueue, just debugging */ }; @@ -1188,10 +1268,113 @@ void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq); * to reduce the number of interrupt requests/selwakeup * to clients on incoming packets. 
*/ -void netmap_mitigation_init(struct netmap_generic_adapter *na); -void netmap_mitigation_start(struct netmap_generic_adapter *na); -void netmap_mitigation_restart(struct netmap_generic_adapter *na); -int netmap_mitigation_active(struct netmap_generic_adapter *na); -void netmap_mitigation_cleanup(struct netmap_generic_adapter *na); +void netmap_mitigation_init(struct nm_generic_mit *mit, struct netmap_adapter *na); +void netmap_mitigation_start(struct nm_generic_mit *mit); +void netmap_mitigation_restart(struct nm_generic_mit *mit); +int netmap_mitigation_active(struct nm_generic_mit *mit); +void netmap_mitigation_cleanup(struct nm_generic_mit *mit); + + + +/* Shared declarations for the VALE switch. */ + +/* + * Each transmit queue accumulates a batch of packets into + * a structure before forwarding. Packets to the same + * destination are put in a list using ft_next as a link field. + * ft_frags and ft_next are valid only on the first fragment. + */ +struct nm_bdg_fwd { /* forwarding entry for a bridge */ + void *ft_buf; /* netmap or indirect buffer */ + uint8_t ft_frags; /* how many fragments (only on 1st frag) */ + uint8_t _ft_port; /* dst port (unused) */ + uint16_t ft_flags; /* flags, e.g. indirect */ + uint16_t ft_len; /* src fragment len */ + uint16_t ft_next; /* next packet to same destination */ +}; + +/* struct 'virtio_net_hdr' from linux. 
*/ +struct nm_vnet_hdr { +#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start, csum_offset */ +#define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ + uint8_t flags; +#define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */ +#define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */ +#define VIRTIO_NET_HDR_GSO_UDP 3 /* GSO frame, IPv4 UDP (UFO) */ +#define VIRTIO_NET_HDR_GSO_TCPV6 4 /* GSO frame, IPv6 TCP */ +#define VIRTIO_NET_HDR_GSO_ECN 0x80 /* TCP has ECN set */ + uint8_t gso_type; + uint16_t hdr_len; + uint16_t gso_size; + uint16_t csum_start; + uint16_t csum_offset; +}; + +#define WORST_CASE_GSO_HEADER (14+40+60) /* IPv6 + TCP */ + +/* Private definitions for IPv4, IPv6, UDP and TCP headers. */ + +struct nm_iphdr { + uint8_t version_ihl; + uint8_t tos; + uint16_t tot_len; + uint16_t id; + uint16_t frag_off; + uint8_t ttl; + uint8_t protocol; + uint16_t check; + uint32_t saddr; + uint32_t daddr; + /*The options start here. */ +}; + +struct nm_tcphdr { + uint16_t source; + uint16_t dest; + uint32_t seq; + uint32_t ack_seq; + uint8_t doff; /* Data offset + Reserved */ + uint8_t flags; + uint16_t window; + uint16_t check; + uint16_t urg_ptr; +}; + +struct nm_udphdr { + uint16_t source; + uint16_t dest; + uint16_t len; + uint16_t check; +}; + +struct nm_ipv6hdr { + uint8_t priority_version; + uint8_t flow_lbl[3]; + + uint16_t payload_len; + uint8_t nexthdr; + uint8_t hop_limit; + + uint8_t saddr[16]; + uint8_t daddr[16]; +}; + +/* Type used to store a checksum (in host byte order) that hasn't been + * folded yet. 
+ */ +#define rawsum_t uint32_t + +rawsum_t nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum); +uint16_t nm_csum_ipv4(struct nm_iphdr *iph); +void nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, + size_t datalen, uint16_t *check); +void nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, + size_t datalen, uint16_t *check); +uint16_t nm_csum_fold(rawsum_t cur_sum); + +void bdg_mismatch_datapath(struct netmap_vp_adapter *na, + struct netmap_vp_adapter *dst_na, + struct nm_bdg_fwd *ft_p, struct netmap_ring *ring, + u_int *j, u_int lim, u_int *howmany); #endif /* _NET_NETMAP_KERN_H_ */ diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c index 55f59851843..5491845090e 100644 --- a/sys/dev/netmap/netmap_mem2.c +++ b/sys/dev/netmap/netmap_mem2.c @@ -82,6 +82,21 @@ struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = { }, }; +struct netmap_obj_params netmap_min_priv_params[NETMAP_POOLS_NR] = { + [NETMAP_IF_POOL] = { + .size = 1024, + .num = 1, + }, + [NETMAP_RING_POOL] = { + .size = 5*PAGE_SIZE, + .num = 4, + }, + [NETMAP_BUF_POOL] = { + .size = 2048, + .num = 4098, + }, +}; + /* * nm_mem is the memory allocator used for all physical interfaces @@ -118,9 +133,16 @@ struct netmap_mem_d nm_mem = { /* Our memory allocator. 
*/ .config = netmap_mem_global_config, .finalize = netmap_mem_global_finalize, .deref = netmap_mem_global_deref, + + .nm_id = 1, + + .prev = &nm_mem, + .next = &nm_mem, }; +struct netmap_mem_d *netmap_last_mem_d = &nm_mem; + // XXX logically belongs to nm_mem struct lut_entry *netmap_buffer_lut; /* exported */ @@ -135,7 +157,7 @@ const struct netmap_mem_d nm_blueprint = { .objminsize = sizeof(struct netmap_if), .objmaxsize = 4096, .nummin = 1, - .nummax = 10, + .nummax = 100, }, [NETMAP_RING_POOL] = { .name = "%s_ring", @@ -172,13 +194,67 @@ const struct netmap_mem_d nm_blueprint = { SYSCTL_INT(_dev_netmap, OID_AUTO, name##_num, \ CTLFLAG_RW, &netmap_params[id].num, 0, "Requested number of netmap " STRINGIFY(name) "s"); \ SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_num, \ - CTLFLAG_RD, &nm_mem.pools[id].objtotal, 0, "Current number of netmap " STRINGIFY(name) "s") + CTLFLAG_RD, &nm_mem.pools[id].objtotal, 0, "Current number of netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, priv_##name##_size, \ + CTLFLAG_RW, &netmap_min_priv_params[id].size, 0, \ + "Default size of private netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, priv_##name##_num, \ + CTLFLAG_RW, &netmap_min_priv_params[id].num, 0, \ + "Default number of private netmap " STRINGIFY(name) "s") SYSCTL_DECL(_dev_netmap); DECLARE_SYSCTLS(NETMAP_IF_POOL, if); DECLARE_SYSCTLS(NETMAP_RING_POOL, ring); DECLARE_SYSCTLS(NETMAP_BUF_POOL, buf); +static int +nm_mem_assign_id(struct netmap_mem_d *nmd) +{ + nm_memid_t id; + struct netmap_mem_d *scan = netmap_last_mem_d; + int error = ENOMEM; + + NMA_LOCK(&nm_mem); + + do { + /* we rely on unsigned wrap around */ + id = scan->nm_id + 1; + if (id == 0) /* reserve 0 as error value */ + id = 1; + scan = scan->next; + if (id != scan->nm_id) { + nmd->nm_id = id; + nmd->prev = scan->prev; + nmd->next = scan; + scan->prev->next = nmd; + scan->prev = nmd; + netmap_last_mem_d = nmd; + error = 0; + break; + } + } while (scan != 
netmap_last_mem_d); + + NMA_UNLOCK(&nm_mem); + return error; +} + +static void +nm_mem_release_id(struct netmap_mem_d *nmd) +{ + NMA_LOCK(&nm_mem); + + nmd->prev->next = nmd->next; + nmd->next->prev = nmd->prev; + + if (netmap_last_mem_d == nmd) + netmap_last_mem_d = nmd->prev; + + nmd->prev = nmd->next = NULL; + + NMA_UNLOCK(&nm_mem); +} + + /* * First, find the allocator that contains the requested offset, * then locate the cluster through a lookup table. @@ -216,7 +292,8 @@ netmap_mem_ofstophys(struct netmap_mem_d* nmd, vm_ooffset_t offset) } int -netmap_mem_get_info(struct netmap_mem_d* nmd, u_int* size, u_int *memflags) +netmap_mem_get_info(struct netmap_mem_d* nmd, u_int* size, u_int *memflags, + nm_memid_t *id) { int error = 0; NMA_LOCK(nmd); @@ -234,6 +311,7 @@ netmap_mem_get_info(struct netmap_mem_d* nmd, u_int* size, u_int *memflags) } } *memflags = nmd->flags; + *id = nmd->nm_id; out: NMA_UNLOCK(nmd); return error; @@ -343,21 +421,34 @@ netmap_obj_malloc(struct netmap_obj_pool *p, u_int len, uint32_t *start, uint32_ /* - * free by index, not by address. This is slow, but is only used - * for a small number of objects (rings, nifp) + * free by index, not by address. + * XXX should we also cleanup the content ? */ -static void +static int netmap_obj_free(struct netmap_obj_pool *p, uint32_t j) { + uint32_t *ptr, mask; + if (j >= p->objtotal) { D("invalid index %u, max %u", j, p->objtotal); - return; + return 1; + } + ptr = &p->bitmap[j / 32]; + mask = (1 << (j % 32)); + if (*ptr & mask) { + D("ouch, double free on buffer %d", j); + return 1; + } else { + *ptr |= mask; + p->objfree++; + return 0; } - p->bitmap[j / 32] |= (1 << (j % 32)); - p->objfree++; - return; } +/* + * free by address. 
This is slow but is only used for a few + * objects (rings, nifp) + */ static void netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr) { @@ -388,9 +479,63 @@ netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr) netmap_obj_malloc(&(n)->pools[NETMAP_BUF_POOL], NETMAP_BDG_BUF_SIZE(n), _pos, _index) +#if 0 // XXX unused /* Return the index associated to the given packet buffer */ #define netmap_buf_index(n, v) \ (netmap_obj_offset(&(n)->pools[NETMAP_BUF_POOL], (v)) / NETMAP_BDG_BUF_SIZE(n)) +#endif + +/* + * allocate extra buffers in a linked list. + * returns the actual number. + */ +uint32_t +netmap_extra_alloc(struct netmap_adapter *na, uint32_t *head, uint32_t n) +{ + struct netmap_mem_d *nmd = na->nm_mem; + uint32_t i, pos = 0; /* opaque, scan position in the bitmap */ + + NMA_LOCK(nmd); + + *head = 0; /* default, 'null' index ie empty list */ + for (i = 0 ; i < n; i++) { + uint32_t cur = *head; /* save current head */ + uint32_t *p = netmap_buf_malloc(nmd, &pos, head); + if (p == NULL) { + D("no more buffers after %d of %d", i, n); + *head = cur; /* restore */ + break; + } + RD(5, "allocate buffer %d -> %d", *head, cur); + *p = cur; /* link to previous head */ + } + + NMA_UNLOCK(nmd); + + return i; +} + +static void +netmap_extra_free(struct netmap_adapter *na, uint32_t head) +{ + struct lut_entry *lut = na->na_lut; + struct netmap_mem_d *nmd = na->nm_mem; + struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; + uint32_t i, cur, *buf; + + D("freeing the extra list"); + for (i = 0; head >=2 && head < p->objtotal; i++) { + cur = head; + buf = lut[head].vaddr; + head = *buf; + *buf = 0; + if (netmap_obj_free(p, cur)) + break; + } + if (head != 0) + D("breaking with head %d", head); + D("freed %d buffers", i); +} /* Return nonzero on error */ @@ -425,6 +570,19 @@ cleanup: return (ENOMEM); } +static void +netmap_mem_set_ring(struct netmap_mem_d *nmd, struct netmap_slot *slot, u_int n, uint32_t index) +{ + struct netmap_obj_pool *p = 
&nmd->pools[NETMAP_BUF_POOL]; + u_int i; + + for (i = 0; i < n; i++) { + slot[i].buf_idx = index; + slot[i].len = p->_objsize; + slot[i].flags = 0; + } +} + static void netmap_free_buf(struct netmap_mem_d *nmd, uint32_t i) @@ -438,6 +596,18 @@ netmap_free_buf(struct netmap_mem_d *nmd, uint32_t i) netmap_obj_free(p, i); } + +static void +netmap_free_bufs(struct netmap_mem_d *nmd, struct netmap_slot *slot, u_int n) +{ + u_int i; + + for (i = 0; i < n; i++) { + if (slot[i].buf_idx > 2) + netmap_free_buf(nmd, slot[i].buf_idx); + } +} + static void netmap_reset_obj_allocator(struct netmap_obj_pool *p) { @@ -677,7 +847,9 @@ static void netmap_mem_reset_all(struct netmap_mem_d *nmd) { int i; - D("resetting %p", nmd); + + if (netmap_verbose) + D("resetting %p", nmd); for (i = 0; i < NETMAP_POOLS_NR; i++) { netmap_reset_obj_allocator(&nmd->pools[i]); } @@ -703,12 +875,14 @@ netmap_mem_finalize_all(struct netmap_mem_d *nmd) nmd->pools[NETMAP_BUF_POOL].bitmap[0] = ~3; nmd->flags |= NETMAP_MEM_FINALIZED; - D("Have %d KB for interfaces, %d KB for rings and %d MB for buffers", - nmd->pools[NETMAP_IF_POOL].memtotal >> 10, - nmd->pools[NETMAP_RING_POOL].memtotal >> 10, - nmd->pools[NETMAP_BUF_POOL].memtotal >> 20); + if (netmap_verbose) + D("interfaces %d KB, rings %d KB, buffers %d MB", + nmd->pools[NETMAP_IF_POOL].memtotal >> 10, + nmd->pools[NETMAP_RING_POOL].memtotal >> 10, + nmd->pools[NETMAP_BUF_POOL].memtotal >> 20); - D("Free buffers: %d", nmd->pools[NETMAP_BUF_POOL].objfree); + if (netmap_verbose) + D("Free buffers: %d", nmd->pools[NETMAP_BUF_POOL].objfree); return 0; @@ -724,10 +898,13 @@ netmap_mem_private_delete(struct netmap_mem_d *nmd) { if (nmd == NULL) return; - D("deleting %p", nmd); + if (netmap_verbose) + D("deleting %p", nmd); if (nmd->refcount > 0) D("bug: deleting mem allocator with refcount=%d!", nmd->refcount); - D("done deleting %p", nmd); + nm_mem_release_id(nmd); + if (netmap_verbose) + D("done deleting %p", nmd); NMA_LOCK_DESTROY(nmd); free(nmd, 
M_DEVBUF); } @@ -762,35 +939,70 @@ netmap_mem_private_deref(struct netmap_mem_d *nmd) NMA_UNLOCK(nmd); } + +/* + * allocator for private memory + */ struct netmap_mem_d * -netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int rxd) +netmap_mem_private_new(const char *name, u_int txr, u_int txd, + u_int rxr, u_int rxd, u_int extra_bufs, u_int npipes, int *perr) { struct netmap_mem_d *d = NULL; struct netmap_obj_params p[NETMAP_POOLS_NR]; - int i; - u_int maxd; + int i, err; + u_int v, maxd; d = malloc(sizeof(struct netmap_mem_d), M_DEVBUF, M_NOWAIT | M_ZERO); - if (d == NULL) - return NULL; + if (d == NULL) { + err = ENOMEM; + goto error; + } *d = nm_blueprint; - /* XXX the rest of the code assumes the stack rings are alwasy present */ + err = nm_mem_assign_id(d); + if (err) + goto error; + + /* account for the fake host rings */ txr++; rxr++; - p[NETMAP_IF_POOL].size = sizeof(struct netmap_if) + - sizeof(ssize_t) * (txr + rxr); - p[NETMAP_IF_POOL].num = 2; - maxd = (txd > rxd) ? txd : rxd; - p[NETMAP_RING_POOL].size = sizeof(struct netmap_ring) + - sizeof(struct netmap_slot) * maxd; - p[NETMAP_RING_POOL].num = txr + rxr; - p[NETMAP_BUF_POOL].size = 2048; /* XXX find a way to let the user choose this */ - p[NETMAP_BUF_POOL].num = rxr * (rxd + 2) + txr * (txd + 2); - D("req if %d*%d ring %d*%d buf %d*%d", + /* copy the min values */ + for (i = 0; i < NETMAP_POOLS_NR; i++) { + p[i] = netmap_min_priv_params[i]; + } + + /* possibly increase them to fit user request */ + v = sizeof(struct netmap_if) + sizeof(ssize_t) * (txr + rxr); + if (p[NETMAP_IF_POOL].size < v) + p[NETMAP_IF_POOL].size = v; + v = 2 + 4 * npipes; + if (p[NETMAP_IF_POOL].num < v) + p[NETMAP_IF_POOL].num = v; + maxd = (txd > rxd) ? 
txd : rxd; + v = sizeof(struct netmap_ring) + sizeof(struct netmap_slot) * maxd; + if (p[NETMAP_RING_POOL].size < v) + p[NETMAP_RING_POOL].size = v; + /* each pipe endpoint needs two tx rings (1 normal + 1 host, fake) + * and two rx rings (again, 1 normal and 1 fake host) + */ + v = txr + rxr + 8 * npipes; + if (p[NETMAP_RING_POOL].num < v) + p[NETMAP_RING_POOL].num = v; + /* for each pipe we only need the buffers for the 4 "real" rings. + * On the other end, the pipe ring dimension may be different from + * the parent port ring dimension. As a compromise, we allocate twice the + * space actually needed if the pipe rings were the same size as the parent rings + */ + v = (4 * npipes + rxr) * rxd + (4 * npipes + txr) * txd + 2 + extra_bufs; + /* the +2 is for the tx and rx fake buffers (indices 0 and 1) */ + if (p[NETMAP_BUF_POOL].num < v) + p[NETMAP_BUF_POOL].num = v; + + if (netmap_verbose) + D("req if %d*%d ring %d*%d buf %d*%d", p[NETMAP_IF_POOL].num, p[NETMAP_IF_POOL].size, p[NETMAP_RING_POOL].num, @@ -802,8 +1014,9 @@ netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int snprintf(d->pools[i].name, NETMAP_POOL_MAX_NAMSZ, nm_blueprint.pools[i].name, name); - if (netmap_config_obj_allocator(&d->pools[i], - p[i].num, p[i].size)) + err = netmap_config_obj_allocator(&d->pools[i], + p[i].num, p[i].size); + if (err) goto error; } @@ -814,6 +1027,8 @@ netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int return d; error: netmap_mem_private_delete(d); + if (perr) + *perr = err; return NULL; } @@ -917,20 +1132,25 @@ netmap_mem_fini(void) static void netmap_free_rings(struct netmap_adapter *na) { - u_int i; + struct netmap_kring *kring; + struct netmap_ring *ring; if (!na->tx_rings) return; - for (i = 0; i < na->num_tx_rings + 1; i++) { - if (na->tx_rings[i].ring) { - netmap_ring_free(na->nm_mem, na->tx_rings[i].ring); - na->tx_rings[i].ring = NULL; - } + for (kring = na->tx_rings; kring != na->rx_rings; kring++) { + ring 
= kring->ring; + if (ring == NULL) + continue; + netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots); + netmap_ring_free(na->nm_mem, ring); + kring->ring = NULL; } - for (i = 0; i < na->num_rx_rings + 1; i++) { - if (na->rx_rings[i].ring) { - netmap_ring_free(na->nm_mem, na->rx_rings[i].ring); - na->rx_rings[i].ring = NULL; - } + for (/* cont'd from above */; kring != na->tailroom; kring++) { + ring = kring->ring; + if (ring == NULL) + continue; + netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots); + netmap_ring_free(na->nm_mem, ring); + kring->ring = NULL; } } @@ -938,6 +1158,8 @@ netmap_free_rings(struct netmap_adapter *na) * * Allocate netmap rings and buffers for this card * The rings are contiguous, but have variable size. + * The kring array must follow the layout described + * in netmap_krings_create(). */ int netmap_mem_rings_create(struct netmap_adapter *na) @@ -945,10 +1167,16 @@ netmap_mem_rings_create(struct netmap_adapter *na) struct netmap_ring *ring; u_int len, ndesc; struct netmap_kring *kring; + u_int i; NMA_LOCK(na->nm_mem); - for (kring = na->tx_rings; kring != na->rx_rings; kring++) { /* Transmit rings */ + /* transmit rings */ + for (i =0, kring = na->tx_rings; kring != na->rx_rings; kring++, i++) { + if (kring->ring) { + ND("%s %ld already created", kring->name, kring - na->tx_rings); + continue; /* already created by somebody else */ + } ndesc = kring->nkr_num_slots; len = sizeof(struct netmap_ring) + ndesc * sizeof(struct netmap_slot); @@ -971,14 +1199,27 @@ netmap_mem_rings_create(struct netmap_adapter *na) ring->tail = kring->rtail; *(uint16_t *)(uintptr_t)&ring->nr_buf_size = NETMAP_BDG_BUF_SIZE(na->nm_mem); + ND("%s h %d c %d t %d", kring->name, + ring->head, ring->cur, ring->tail); ND("initializing slots for txring"); - if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) { - D("Cannot allocate buffers for tx_ring"); - goto cleanup; + if (i != na->num_tx_rings || (na->na_flags & NAF_HOST_RINGS)) { + /* this is a 
real ring */ + if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) { + D("Cannot allocate buffers for tx_ring"); + goto cleanup; + } + } else { + /* this is a fake tx ring, set all indices to 0 */ + netmap_mem_set_ring(na->nm_mem, ring->slot, ndesc, 0); } } - for ( ; kring != na->tailroom; kring++) { /* Receive rings */ + /* receive rings */ + for ( i = 0 /* kring cont'd from above */ ; kring != na->tailroom; kring++, i++) { + if (kring->ring) { + ND("%s %ld already created", kring->name, kring - na->rx_rings); + continue; /* already created by somebody else */ + } ndesc = kring->nkr_num_slots; len = sizeof(struct netmap_ring) + ndesc * sizeof(struct netmap_slot); @@ -1001,10 +1242,18 @@ netmap_mem_rings_create(struct netmap_adapter *na) ring->tail = kring->rtail; *(int *)(uintptr_t)&ring->nr_buf_size = NETMAP_BDG_BUF_SIZE(na->nm_mem); + ND("%s h %d c %d t %d", kring->name, + ring->head, ring->cur, ring->tail); ND("initializing slots for rxring %p", ring); - if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) { - D("Cannot allocate buffers for rx_ring"); - goto cleanup; + if (i != na->num_rx_rings || (na->na_flags & NAF_HOST_RINGS)) { + /* this is a real ring */ + if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) { + D("Cannot allocate buffers for rx_ring"); + goto cleanup; + } + } else { + /* this is a fake rx ring, set all indices to 1 */ + netmap_mem_set_ring(na->nm_mem, ring->slot, ndesc, 1); } } @@ -1024,20 +1273,8 @@ void netmap_mem_rings_delete(struct netmap_adapter *na) { /* last instance, release bufs and rings */ - u_int i, lim; - struct netmap_kring *kring; - struct netmap_ring *ring; - NMA_LOCK(na->nm_mem); - for (kring = na->tx_rings; kring != na->tailroom; kring++) { - ring = kring->ring; - if (ring == NULL) - continue; - lim = kring->nkr_num_slots; - for (i = 0; i < lim; i++) - netmap_free_buf(na->nm_mem, ring->slot[i].buf_idx); - } netmap_free_rings(na); NMA_UNLOCK(na->nm_mem); @@ -1059,16 +1296,12 @@ netmap_mem_if_new(const char *ifname, struct 
netmap_adapter *na) ssize_t base; /* handy for relative offsets between rings and nifp */ u_int i, len, ntx, nrx; - /* - * verify whether virtual port need the stack ring - */ - ntx = na->num_tx_rings + 1; /* shorthand, include stack ring */ - nrx = na->num_rx_rings + 1; /* shorthand, include stack ring */ + /* account for the (eventually fake) host rings */ + ntx = na->num_tx_rings + 1; + nrx = na->num_rx_rings + 1; /* * the descriptor is followed inline by an array of offsets * to the tx and rx rings in the shared memory region. - * For virtual rx rings we also allocate an array of - * pointers to assign to nkr_leases. */ NMA_LOCK(na->nm_mem); @@ -1112,7 +1345,8 @@ netmap_mem_if_delete(struct netmap_adapter *na, struct netmap_if *nifp) /* nothing to do */ return; NMA_LOCK(na->nm_mem); - + if (nifp->ni_bufs_head) + netmap_extra_free(na, nifp->ni_bufs_head); netmap_if_free(na->nm_mem, nifp); NMA_UNLOCK(na->nm_mem); diff --git a/sys/dev/netmap/netmap_mem2.h b/sys/dev/netmap/netmap_mem2.h index 8e6c58cbc4e..e83616a5195 100644 --- a/sys/dev/netmap/netmap_mem2.h +++ b/sys/dev/netmap/netmap_mem2.h @@ -160,6 +160,7 @@ typedef int (*netmap_mem_config_t)(struct netmap_mem_d*); typedef int (*netmap_mem_finalize_t)(struct netmap_mem_d*); typedef void (*netmap_mem_deref_t)(struct netmap_mem_d*); +typedef uint16_t nm_memid_t; /* We implement two kinds of netmap_mem_d structures: * @@ -192,6 +193,11 @@ struct netmap_mem_d { netmap_mem_config_t config; netmap_mem_finalize_t finalize; netmap_mem_deref_t deref; + + nm_memid_t nm_id; /* allocator identifier */ + + /* list of all existing allocators, sorted by nm_id */ + struct netmap_mem_d *prev, *next; }; extern struct netmap_mem_d nm_mem; @@ -206,14 +212,16 @@ void netmap_mem_if_delete(struct netmap_adapter *, struct netmap_if *); int netmap_mem_rings_create(struct netmap_adapter *); void netmap_mem_rings_delete(struct netmap_adapter *); void netmap_mem_deref(struct netmap_mem_d *); -int netmap_mem_get_info(struct netmap_mem_d *, 
u_int *size, u_int *memflags); +int netmap_mem_get_info(struct netmap_mem_d *, u_int *size, u_int *memflags, uint16_t *id); ssize_t netmap_mem_if_offset(struct netmap_mem_d *, const void *vaddr); -struct netmap_mem_d* - netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int rxd); +struct netmap_mem_d* netmap_mem_private_new(const char *name, + u_int txr, u_int txd, u_int rxr, u_int rxd, u_int extra_bufs, u_int npipes, + int* error); void netmap_mem_private_delete(struct netmap_mem_d *); #define NETMAP_BDG_BUF_SIZE(n) ((n)->pools[NETMAP_BUF_POOL]._objsize) +uint32_t netmap_extra_alloc(struct netmap_adapter *, uint32_t *, uint32_t n); #endif diff --git a/sys/dev/netmap/netmap_offloadings.c b/sys/dev/netmap/netmap_offloadings.c new file mode 100644 index 00000000000..a776a242457 --- /dev/null +++ b/sys/dev/netmap/netmap_offloadings.c @@ -0,0 +1,401 @@ +/* + * Copyright (C) 2014 Vincenzo Maffione. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* $FreeBSD$ */ + +#if defined(__FreeBSD__) +#include /* prerequisite */ + +#include +#include +#include /* defines used in kernel.h */ +#include /* types used in module initialization */ +#include +#include /* struct socket */ +#include /* sockaddrs */ +#include +#include +#include /* bus_dmamap_* */ +#include + +#elif defined(linux) + +#include "bsd_glue.h" + +#elif defined(__APPLE__) + +#warning OSX support is only partial +#include "osx_glue.h" + +#else + +#error Unsupported platform + +#endif /* unsupported */ + +#include +#include + + + +/* This routine is called by bdg_mismatch_datapath() when it finishes + * accumulating bytes for a segment, in order to fix some fields in the + * segment headers (which still contain the same content as the header + * of the original GSO packet). 'buf' points to the beginning (e.g. + * the ethernet header) of the segment, and 'len' is its length. + */ +static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx, + u_int segmented_bytes, u_int last_segment, + u_int tcp, u_int iphlen) +{ + struct nm_iphdr *iph = (struct nm_iphdr *)(buf + 14); + struct nm_ipv6hdr *ip6h = (struct nm_ipv6hdr *)(buf + 14); + uint16_t *check = NULL; + uint8_t *check_data = NULL; + + if (iphlen == 20) { + /* Set the IPv4 "Total Length" field. */ + iph->tot_len = htobe16(len-14); + ND("ip total length %u", be16toh(iph->tot_len)); + + /* Set the IPv4 "Identification" field. 
*/ + iph->id = htobe16(be16toh(iph->id) + idx); + ND("ip identification %u", be16toh(iph->id)); + + /* Compute and insert the IPv4 header checksum. */ + iph->check = 0; + iph->check = nm_csum_ipv4(iph); + ND("IP csum %x", be16toh(iph->check)); + } else {/* if (iphlen == 40) */ + /* Set the IPv6 "Payload Len" field. */ + ip6h->payload_len = htobe16(len-14-iphlen); + } + + if (tcp) { + struct nm_tcphdr *tcph = (struct nm_tcphdr *)(buf + 14 + iphlen); + + /* Set the TCP sequence number. */ + tcph->seq = htobe32(be32toh(tcph->seq) + segmented_bytes); + ND("tcp seq %u", be32toh(tcph->seq)); + + /* Zero the PSH and FIN TCP flags if this is not the last + segment. */ + if (!last_segment) + tcph->flags &= ~(0x8 | 0x1); + ND("last_segment %u", last_segment); + + check = &tcph->check; + check_data = (uint8_t *)tcph; + } else { /* UDP */ + struct nm_udphdr *udph = (struct nm_udphdr *)(buf + 14 + iphlen); + + /* Set the UDP 'Length' field. */ + udph->len = htobe16(len-14-iphlen); + + check = &udph->check; + check_data = (uint8_t *)udph; + } + + /* Compute and insert TCP/UDP checksum. */ + *check = 0; + if (iphlen == 20) + nm_csum_tcpudp_ipv4(iph, check_data, len-14-iphlen, check); + else + nm_csum_tcpudp_ipv6(ip6h, check_data, len-14-iphlen, check); + + ND("TCP/UDP csum %x", be16toh(*check)); +} + + +/* The VALE mismatch datapath implementation. */ +void bdg_mismatch_datapath(struct netmap_vp_adapter *na, + struct netmap_vp_adapter *dst_na, + struct nm_bdg_fwd *ft_p, struct netmap_ring *ring, + u_int *j, u_int lim, u_int *howmany) +{ + struct netmap_slot *slot = NULL; + struct nm_vnet_hdr *vh = NULL; + /* Number of source slots to process. */ + u_int frags = ft_p->ft_frags; + struct nm_bdg_fwd *ft_end = ft_p + frags; + + /* Source and destination pointers. 
*/ + uint8_t *dst, *src; + size_t src_len, dst_len; + + u_int j_start = *j; + u_int dst_slots = 0; + + /* If the source port uses the offloadings, while destination doesn't, + * we grab the source virtio-net header and do the offloadings here. + */ + if (na->virt_hdr_len && !dst_na->virt_hdr_len) { + vh = (struct nm_vnet_hdr *)ft_p->ft_buf; + } + + /* Init source and dest pointers. */ + src = ft_p->ft_buf; + src_len = ft_p->ft_len; + slot = &ring->slot[*j]; + dst = BDG_NMB(&dst_na->up, slot); + dst_len = src_len; + + /* We are processing the first input slot and there is a mismatch + * between source and destination virt_hdr_len (SHL and DHL). + * When a client is using virtio-net headers, the header length + * can be: + * - 10: the header corresponds to the struct nm_vnet_hdr + * - 12: the first 10 bytes correspond to the struct + * virtio_net_hdr, and the last 2 bytes store the + * "mergeable buffers" info, which is an optional + * hint that can be zeroed for compatibility + * + * The destination header is therefore built according to the + * following table: + * + * SHL | DHL | destination header + * ----------------------------- + * 0 | 10 | zero + * 0 | 12 | zero + * 10 | 0 | doesn't exist + * 10 | 12 | first 10 bytes are copied from source header, last 2 are zero + * 12 | 0 | doesn't exist + * 12 | 10 | copied from the first 10 bytes of source header + */ + bzero(dst, dst_na->virt_hdr_len); + if (na->virt_hdr_len && dst_na->virt_hdr_len) + memcpy(dst, src, sizeof(struct nm_vnet_hdr)); + /* Skip the virtio-net headers. */ + src += na->virt_hdr_len; + src_len -= na->virt_hdr_len; + dst += dst_na->virt_hdr_len; + dst_len = dst_na->virt_hdr_len + src_len; + + /* Here it could be dst_len == 0 (which implies src_len == 0), + * so we avoid passing a zero length fragment. 
+ */ + if (dst_len == 0) { + ft_p++; + src = ft_p->ft_buf; + src_len = ft_p->ft_len; + dst_len = src_len; + } + + if (vh && vh->gso_type != VIRTIO_NET_HDR_GSO_NONE) { + u_int gso_bytes = 0; + /* Length of the GSO packet header. */ + u_int gso_hdr_len = 0; + /* Pointer to the GSO packet header. Assume it is in a single fragment. */ + uint8_t *gso_hdr = NULL; + /* Index of the current segment. */ + u_int gso_idx = 0; + /* Payload data bytes segmented so far (e.g. TCP data bytes). */ + u_int segmented_bytes = 0; + /* Length of the IP header (20 if IPv4, 40 if IPv6). */ + u_int iphlen = 0; + /* Is this a TCP or an UDP GSO packet? */ + u_int tcp = ((vh->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) + == VIRTIO_NET_HDR_GSO_UDP) ? 0 : 1; + + /* Segment the GSO packet contained into the input slots (frags). */ + while (ft_p != ft_end) { + size_t copy; + + /* Grab the GSO header if we don't have it. */ + if (!gso_hdr) { + uint16_t ethertype; + + gso_hdr = src; + + /* Look at the 'Ethertype' field to see if this packet + * is IPv4 or IPv6. + */ + ethertype = be16toh(*((uint16_t *)(gso_hdr + 12))); + if (ethertype == 0x0800) + iphlen = 20; + else /* if (ethertype == 0x86DD) */ + iphlen = 40; + ND(3, "type=%04x", ethertype); + + /* Compute gso_hdr_len. For TCP we need to read the + * content of the 'Data Offset' field. + */ + if (tcp) { + struct nm_tcphdr *tcph = + (struct nm_tcphdr *)&gso_hdr[14+iphlen]; + + gso_hdr_len = 14 + iphlen + 4*(tcph->doff >> 4); + } else + gso_hdr_len = 14 + iphlen + 8; /* UDP */ + + ND(3, "gso_hdr_len %u gso_mtu %d", gso_hdr_len, + dst_na->mfs); + + /* Advance source pointers. */ + src += gso_hdr_len; + src_len -= gso_hdr_len; + if (src_len == 0) { + ft_p++; + if (ft_p == ft_end) + break; + src = ft_p->ft_buf; + src_len = ft_p->ft_len; + continue; + } + } + + /* Fill in the header of the current segment. */ + if (gso_bytes == 0) { + memcpy(dst, gso_hdr, gso_hdr_len); + gso_bytes = gso_hdr_len; + } + + /* Fill in data and update source and dest pointers. 
*/ + copy = src_len; + if (gso_bytes + copy > dst_na->mfs) + copy = dst_na->mfs - gso_bytes; + memcpy(dst + gso_bytes, src, copy); + gso_bytes += copy; + src += copy; + src_len -= copy; + + /* A segment is complete or we have processed all the + the GSO payload bytes. */ + if (gso_bytes >= dst_na->mfs || + (src_len == 0 && ft_p + 1 == ft_end)) { + /* After raw segmentation, we must fix some header + * fields and compute checksums, in a protocol dependent + * way. */ + gso_fix_segment(dst, gso_bytes, gso_idx, + segmented_bytes, + src_len == 0 && ft_p + 1 == ft_end, + tcp, iphlen); + + ND("frame %u completed with %d bytes", gso_idx, (int)gso_bytes); + slot->len = gso_bytes; + slot->flags = 0; + segmented_bytes += gso_bytes - gso_hdr_len; + + dst_slots++; + + /* Next destination slot. */ + *j = nm_next(*j, lim); + slot = &ring->slot[*j]; + dst = BDG_NMB(&dst_na->up, slot); + + gso_bytes = 0; + gso_idx++; + } + + /* Next input slot. */ + if (src_len == 0) { + ft_p++; + if (ft_p == ft_end) + break; + src = ft_p->ft_buf; + src_len = ft_p->ft_len; + } + } + ND(3, "%d bytes segmented", segmented_bytes); + + } else { + /* Address of a checksum field into a destination slot. */ + uint16_t *check = NULL; + /* Accumulator for an unfolded checksum. */ + rawsum_t csum = 0; + + /* Process a non-GSO packet. */ + + /* Init 'check' if necessary. */ + if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { + if (unlikely(vh->csum_offset + vh->csum_start > src_len)) + D("invalid checksum request"); + else + check = (uint16_t *)(dst + vh->csum_start + + vh->csum_offset); + } + + while (ft_p != ft_end) { + /* Init/update the packet checksum if needed. 
*/ + if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { + if (!dst_slots) + csum = nm_csum_raw(src + vh->csum_start, + src_len - vh->csum_start, 0); + else + csum = nm_csum_raw(src, src_len, csum); + } + + /* Round to a multiple of 64 */ + src_len = (src_len + 63) & ~63; + + if (ft_p->ft_flags & NS_INDIRECT) { + if (copyin(src, dst, src_len)) { + /* Invalid user pointer, pretend len is 0. */ + dst_len = 0; + } + } else { + memcpy(dst, src, (int)src_len); + } + slot->len = dst_len; + + dst_slots++; + + /* Next destination slot. */ + *j = nm_next(*j, lim); + slot = &ring->slot[*j]; + dst = BDG_NMB(&dst_na->up, slot); + + /* Next source slot. */ + ft_p++; + src = ft_p->ft_buf; + dst_len = src_len = ft_p->ft_len; + + } + + /* Finalize (fold) the checksum if needed. */ + if (check && vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { + *check = nm_csum_fold(csum); + } + ND(3, "using %u dst_slots", dst_slots); + + /* A second pass on the desitations slots to set the slot flags, + * using the right number of destination slots. + */ + while (j_start != *j) { + slot = &ring->slot[j_start]; + slot->flags = (dst_slots << 8)| NS_MOREFRAG; + j_start = nm_next(j_start, lim); + } + /* Clear NS_MOREFRAG flag on last entry. */ + slot->flags = (dst_slots << 8); + } + + /* Update howmany. */ + if (unlikely(dst_slots > *howmany)) { + dst_slots = *howmany; + D("Slot allocation error: Should never happen"); + } + *howmany -= dst_slots; +} diff --git a/sys/dev/netmap/netmap_pipe.c b/sys/dev/netmap/netmap_pipe.c new file mode 100644 index 00000000000..f8f29fa1770 --- /dev/null +++ b/sys/dev/netmap/netmap_pipe.c @@ -0,0 +1,711 @@ +/* + * Copyright (C) 2014 Giuseppe Lettieri. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* $FreeBSD$ */ + +#if defined(__FreeBSD__) +#include /* prerequisite */ + +#include +#include +#include /* defines used in kernel.h */ +#include /* types used in module initialization */ +#include +#include +#include +#include +#include +#include +#include /* sockaddrs */ +#include +#include +#include /* bus_dmamap_* */ +#include + + +#elif defined(linux) + +#include "bsd_glue.h" + +#elif defined(__APPLE__) + +#warning OSX support is only partial +#include "osx_glue.h" + +#else + +#error Unsupported platform + +#endif /* unsupported */ + +/* + * common headers + */ + +#include +#include +#include + +#ifdef WITH_PIPES + +#define NM_PIPE_MAXSLOTS 4096 + +int netmap_default_pipes = 0; /* default number of pipes for each nic */ +SYSCTL_DECL(_dev_netmap); +SYSCTL_INT(_dev_netmap, OID_AUTO, default_pipes, CTLFLAG_RW, &netmap_default_pipes, 0 , ""); + +/* allocate the pipe array in the parent adapter */ +int +netmap_pipe_alloc(struct netmap_adapter *na, struct nmreq *nmr) +{ + size_t len; + int mode = nmr->nr_flags & NR_REG_MASK; + u_int npipes; + + if (mode == NR_REG_PIPE_MASTER || mode == NR_REG_PIPE_SLAVE) { + /* this is for our parent, not for us */ + return 0; + } + + /* TODO: we can resize the array if the new + * request can accommodate the already existing pipes + */ + if (na->na_pipes) { + nmr->nr_arg1 = na->na_max_pipes; + return 0; + } + + npipes = nmr->nr_arg1; + if (npipes == 0) + npipes = netmap_default_pipes; + nm_bound_var(&npipes, 0, 0, NM_MAXPIPES, NULL); + + if (npipes == 0) { + /* really zero, nothing to alloc */ + goto out; + } + + len = sizeof(struct netmap_pipe_adapter *) * npipes; + na->na_pipes = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO); + if (na->na_pipes == NULL) + return ENOMEM; + + na->na_max_pipes = npipes; + na->na_next_pipe = 0; + +out: + nmr->nr_arg1 = npipes; + + return 0; +} + +/* deallocate the pipe array in the parent adapter */ +void +netmap_pipe_dealloc(struct netmap_adapter *na) +{ + if (na->na_pipes) { + ND("freeing pipes 
for %s", NM_IFPNAME(na->ifp)); + free(na->na_pipes, M_DEVBUF); + na->na_pipes = NULL; + na->na_max_pipes = 0; + na->na_next_pipe = 0; + } +} + +/* find a pipe endpoint with the given id among the parent's pipes */ +static struct netmap_pipe_adapter * +netmap_pipe_find(struct netmap_adapter *parent, u_int pipe_id) +{ + int i; + struct netmap_pipe_adapter *na; + + for (i = 0; i < parent->na_next_pipe; i++) { + na = parent->na_pipes[i]; + if (na->id == pipe_id) { + return na; + } + } + return NULL; +} + +/* add a new pipe endpoint to the parent array */ +static int +netmap_pipe_add(struct netmap_adapter *parent, struct netmap_pipe_adapter *na) +{ + if (parent->na_next_pipe >= parent->na_max_pipes) { + D("%s: no space left for pipes", NM_IFPNAME(parent->ifp)); + return ENOMEM; + } + + parent->na_pipes[parent->na_next_pipe] = na; + na->parent_slot = parent->na_next_pipe; + parent->na_next_pipe++; + return 0; +} + +/* remove the given pipe endpoint from the parent array */ +static void +netmap_pipe_remove(struct netmap_adapter *parent, struct netmap_pipe_adapter *na) +{ + u_int n; + n = --parent->na_next_pipe; + if (n != na->parent_slot) { + parent->na_pipes[na->parent_slot] = + parent->na_pipes[n]; + } + parent->na_pipes[n] = NULL; +} + +static int +netmap_pipe_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *txkring = na->tx_rings + ring_nr, + *rxkring = txkring->pipe; + u_int limit; /* slots to transfer */ + u_int j, k, lim_tx = txkring->nkr_num_slots - 1, + lim_rx = rxkring->nkr_num_slots - 1; + int m, busy; + + ND("%p: %s %x -> %s", txkring, txkring->name, flags, rxkring->name); + ND(2, "before: hwcur %d hwtail %d cur %d head %d tail %d", txkring->nr_hwcur, txkring->nr_hwtail, + txkring->rcur, txkring->rhead, txkring->rtail); + + j = rxkring->nr_hwtail; /* RX */ + k = txkring->nr_hwcur; /* TX */ + m = txkring->rhead - txkring->nr_hwcur; /* new slots */ + if (m < 0) + m += txkring->nkr_num_slots; + limit = m; + m = 
rxkring->nkr_num_slots - 1; /* max avail space on destination */ + busy = j - rxkring->nr_hwcur; /* busy slots */ + if (busy < 0) + busy += rxkring->nkr_num_slots; /* wrap on the rx ring, whose size may differ from the tx one */ + m -= busy; /* subtract busy slots */ + ND(2, "m %d limit %d", m, limit); + if (m < limit) + limit = m; + + if (limit == 0) { + /* either the rxring is full, or nothing to send */ + nm_txsync_finalize(txkring); /* actually useless */ + return 0; + } + + while (limit-- > 0) { + struct netmap_slot *rs = &rxkring->save_ring->slot[j]; + struct netmap_slot *ts = &txkring->ring->slot[k]; + struct netmap_slot tmp; + + /* swap the slots */ + tmp = *rs; + *rs = *ts; + *ts = tmp; + + /* no need to report the buffer change */ + + j = nm_next(j, lim_rx); + k = nm_next(k, lim_tx); + } + + wmb(); /* make sure the slots are updated before publishing them */ + rxkring->nr_hwtail = j; + txkring->nr_hwcur = k; + txkring->nr_hwtail = nm_prev(k, lim_tx); + + nm_txsync_finalize(txkring); + ND(2, "after: hwcur %d hwtail %d cur %d head %d tail %d j %d", txkring->nr_hwcur, txkring->nr_hwtail, + txkring->rcur, txkring->rhead, txkring->rtail, j); + + wmb(); /* make sure rxkring->nr_hwtail is updated before notifying */ + rxkring->na->nm_notify(rxkring->na, rxkring->ring_id, NR_RX, 0); + + return 0; +} + +static int +netmap_pipe_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *rxkring = na->rx_rings + ring_nr, + *txkring = rxkring->pipe; + uint32_t oldhwcur = rxkring->nr_hwcur; + + ND("%s %x <- %s", rxkring->name, flags, txkring->name); + rxkring->nr_hwcur = rxkring->rhead; /* recover user-released slots */ + ND(5, "hwcur %d hwtail %d cur %d head %d tail %d", rxkring->nr_hwcur, rxkring->nr_hwtail, + rxkring->rcur, rxkring->rhead, rxkring->rtail); + rmb(); /* paired with the first wmb() in txsync */ + nm_rxsync_finalize(rxkring); + + if (oldhwcur != rxkring->nr_hwcur) { + /* we have released some slots, notify the other end */ + wmb(); /* make sure nr_hwcur is updated before notifying */ + 
txkring->na->nm_notify(txkring->na, txkring->ring_id, NR_TX, 0); + } + return 0; +} + +/* Pipe endpoints are created and destroyed together, so that endpoints do not + * have to check for the existence of their peer at each ?xsync. + * + * To play well with the existing netmap infrastructure (refcounts etc.), we + * adopt the following strategy: + * + * 1) The first endpoint that is created also creates the other endpoint and + * grabs a reference to it. + * + * state A) user1 --> endpoint1 --> endpoint2 + * + * 2) If, starting from state A, endpoint2 is then registered, endpoint1 gives + * its reference to the user: + * + * state B) user1 --> endpoint1 endpoint2 <--- user2 + * + * 3) Assume that, starting from state B endpoint2 is closed. In the unregister + * callback endpoint2 notes that endpoint1 is still active and adds a reference + * from endpoint1 to itself. When user2 then releases her own reference, + * endpoint2 is not destroyed and we are back to state A. A symmetrical state + * would be reached if endpoint1 were released instead. + * + * 4) If, starting from state A, endpoint1 is closed, the destructor notes that + * it owns a reference to endpoint2 and releases it. + * + * Something similar goes on for the creation and destruction of the krings. + */ + + +/* netmap_pipe_krings_create. + * + * There are two cases: + * + * 1) state is + * + * usr1 --> e1 --> e2 + * + * and we are e1. We have to create both sets + * of krings. + * + * 2) state is + * + * usr1 --> e1 --> e2 + * + * and we are e2. e1 is certainly registered and our + * krings already exist, but they may be hidden. 
+ */ +static int +netmap_pipe_krings_create(struct netmap_adapter *na) +{ + struct netmap_pipe_adapter *pna = + (struct netmap_pipe_adapter *)na; + struct netmap_adapter *ona = &pna->peer->up; + int error = 0; + if (pna->peer_ref) { + int i; + + /* case 1) above */ + D("%p: case 1, create everything", na); + error = netmap_krings_create(na, 0); + if (error) + goto err; + + /* we also create all the rings, since we need to + * update the save_ring pointers. + * netmap_mem_rings_create (called by our caller) + * will not create the rings again + */ + + error = netmap_mem_rings_create(na); + if (error) + goto del_krings1; + + /* update our hidden ring pointers */ + for (i = 0; i < na->num_tx_rings + 1; i++) + na->tx_rings[i].save_ring = na->tx_rings[i].ring; + for (i = 0; i < na->num_rx_rings + 1; i++) + na->rx_rings[i].save_ring = na->rx_rings[i].ring; + + /* now, create krings and rings of the other end */ + error = netmap_krings_create(ona, 0); + if (error) + goto del_rings1; + + error = netmap_mem_rings_create(ona); + if (error) + goto del_krings2; + + for (i = 0; i < ona->num_tx_rings + 1; i++) + ona->tx_rings[i].save_ring = ona->tx_rings[i].ring; + for (i = 0; i < ona->num_rx_rings + 1; i++) + ona->rx_rings[i].save_ring = ona->rx_rings[i].ring; + + /* cross link the krings */ + for (i = 0; i < na->num_tx_rings; i++) { + na->tx_rings[i].pipe = pna->peer->up.rx_rings + i; + na->rx_rings[i].pipe = pna->peer->up.tx_rings + i; + pna->peer->up.tx_rings[i].pipe = na->rx_rings + i; + pna->peer->up.rx_rings[i].pipe = na->tx_rings + i; + } + } else { + int i; + /* case 2) above */ + /* recover the hidden rings */ + ND("%p: case 2, hidden rings", na); + for (i = 0; i < na->num_tx_rings + 1; i++) + na->tx_rings[i].ring = na->tx_rings[i].save_ring; + for (i = 0; i < na->num_rx_rings + 1; i++) + na->rx_rings[i].ring = na->rx_rings[i].save_ring; + } + return 0; + +del_krings2: + netmap_krings_delete(ona); +del_rings1: + netmap_mem_rings_delete(na); +del_krings1: + 
netmap_krings_delete(na); +err: + return error; +} + +/* netmap_pipe_reg. + * + * There are two cases on registration (onoff==1) + * + * 1.a) state is + * + * usr1 --> e1 --> e2 + * + * and we are e1. Nothing special to do. + * + * 1.b) state is + * + * usr1 --> e1 --> e2 <-- usr2 + * + * and we are e2. Drop the ref e1 is holding. + * + * There are two additional cases on unregister (onoff==0) + * + * 2.a) state is + * + * usr1 --> e1 --> e2 + * + * and we are e1. Nothing special to do, e2 will + * be cleaned up by the destructor of e1. + * + * 2.b) state is + * + * usr1 --> e1 e2 <-- usr2 + * + * and we are either e1 or e2. Add a ref from the + * other end and hide our rings. + */ +static int +netmap_pipe_reg(struct netmap_adapter *na, int onoff) +{ + struct netmap_pipe_adapter *pna = + (struct netmap_pipe_adapter *)na; + struct ifnet *ifp = na->ifp; + ND("%p: onoff %d", na, onoff); + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + } else { + ifp->if_capenable &= ~IFCAP_NETMAP; + } + if (pna->peer_ref) { + ND("%p: case 1.a or 2.a, nothing to do", na); + return 0; + } + if (onoff) { + ND("%p: case 1.b, drop peer", na); + pna->peer->peer_ref = 0; + netmap_adapter_put(na); + } else { + int i; + ND("%p: case 2.b, grab peer", na); + netmap_adapter_get(na); + pna->peer->peer_ref = 1; + /* hide our rings from netmap_mem_rings_delete */ + for (i = 0; i < na->num_tx_rings + 1; i++) { + na->tx_rings[i].ring = NULL; + } + for (i = 0; i < na->num_rx_rings + 1; i++) { + na->rx_rings[i].ring = NULL; + } + } + return 0; +} + +/* netmap_pipe_krings_delete. + * + * There are two cases: + * + * 1) state is + * + * usr1 --> e1 --> e2 + * + * and we are e1 (e2 is not registered, so krings_delete cannot be + * called on it); + * + * 2) state is + * + * usr1 --> e1 e2 <-- usr2 + * + * and we are either e1 or e2. 
+ *
+ * In the former case we have to also delete the krings of e2;
+ * in the latter case we do nothing (note that our krings
+ * have already been hidden in the unregister callback).
+ */
+static void
+netmap_pipe_krings_delete(struct netmap_adapter *na)
+{
+	struct netmap_pipe_adapter *pna =
+		(struct netmap_pipe_adapter *)na;
+	struct netmap_adapter *ona; /* na of the other end */
+	int i;
+
+	if (!pna->peer_ref) {
+		ND("%p: case 2, kept alive by peer", na);
+		return;
+	}
+	/* case 1) above */
+	ND("%p: case 1, deleting everyhing", na);
+	netmap_krings_delete(na); /* also zeroes tx_rings etc. */
+	/* restore the ring to be deleted on the peer */
+	ona = &pna->peer->up;
+	if (ona->tx_rings == NULL) {
+		/* already deleted, we must be on a
+		 * cleanup-after-error path */
+		return;
+	}
+	/* NOTE(review): the "+ 1" presumably covers the extra kring
+	 * allocated past the last regular ring -- confirm against
+	 * netmap_krings_create().
+	 */
+	for (i = 0; i < ona->num_tx_rings + 1; i++)
+		ona->tx_rings[i].ring = ona->tx_rings[i].save_ring;
+	for (i = 0; i < ona->num_rx_rings + 1; i++)
+		ona->rx_rings[i].ring = ona->rx_rings[i].save_ring;
+	netmap_mem_rings_delete(ona);
+	netmap_krings_delete(ona);
+}
+
+
+/*
+ * Destructor of a pipe endpoint: drop the reference held on the peer
+ * (when this endpoint owns one), unlink a master endpoint from its
+ * parent, release the reference on the parent and free the privately
+ * allocated ifnet.
+ */
+static void
+netmap_pipe_dtor(struct netmap_adapter *na)
+{
+	struct netmap_pipe_adapter *pna =
+		(struct netmap_pipe_adapter *)na;
+	ND("%p", na);
+	if (pna->peer_ref) {
+		ND("%p: clean up peer", na);
+		pna->peer_ref = 0;
+		netmap_adapter_put(&pna->peer->up);
+	}
+	if (pna->role == NR_REG_PIPE_MASTER)
+		netmap_pipe_remove(pna->parent, pna);
+	netmap_adapter_put(pna->parent);
+	free(na->ifp, M_DEVBUF);
+	na->ifp = NULL;
+	pna->parent = NULL;
+}
+
+/*
+ * Look up, or create when 'create' is non-zero, the pipe endpoint
+ * described by nmr: parent name in nr_name, pipe id in the low bits
+ * of nr_ringid, master/slave role in nr_flags.
+ *
+ * Returns 0 without touching *na when nr_flags does not ask for a pipe.
+ * On success *na points to the requested endpoint, with a reference
+ * taken on behalf of the caller, and the ring configuration is written
+ * back into nmr. On failure an errno value is returned and any
+ * reference on the parent obtained here is dropped.
+ */
+int
+netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
+{
+	struct nmreq pnmr;
+	struct netmap_adapter *pna; /* parent adapter */
+	struct netmap_pipe_adapter *mna, *sna, *req;
+	struct ifnet *ifp, *ifp2;
+	u_int pipe_id;
+	int role = nmr->nr_flags & NR_REG_MASK;
+	int error;
+
+	ND("flags %x", nmr->nr_flags);
+
+	if (role != NR_REG_PIPE_MASTER && role != NR_REG_PIPE_SLAVE) {
+		ND("not a pipe");
+		return 0;
+	}
+
+	/* first, try to find the parent adapter */
+	bzero(&pnmr, sizeof(pnmr));
+	memcpy(&pnmr.nr_name, nmr->nr_name, IFNAMSIZ);
+	/* pass to parent the requested number of pipes */
+	pnmr.nr_arg1 = nmr->nr_arg1;
+	error = netmap_get_na(&pnmr, &pna, create);
+	if (error) {
+		ND("parent lookup failed: %d", error);
+		return error;
+	}
+	ND("found parent: %s", NM_IFPNAME(pna->ifp));
+
+	if (NETMAP_OWNED_BY_KERN(pna)) {
+		ND("parent busy");
+		error = EBUSY;
+		goto put_out;
+	}
+
+	/* next, lookup the pipe id in the parent list */
+	req = NULL;
+	pipe_id = nmr->nr_ringid & NETMAP_RING_MASK;
+	mna = netmap_pipe_find(pna, pipe_id);
+	if (mna) {
+		if (mna->role == role) {
+			ND("found %d directly at %d", pipe_id, mna->parent_slot);
+			req = mna;
+		} else {
+			ND("found %d indirectly at %d", pipe_id, mna->parent_slot);
+			req = mna->peer;
+		}
+		/* the pipe we have found already holds a ref to the parent,
+		 * so we need to drop the one we got from netmap_get_na()
+		 */
+		netmap_adapter_put(pna);
+		goto found;
+	}
+	ND("pipe %d not found, create %d", pipe_id, create);
+	if (!create) {
+		error = ENODEV;
+		goto put_out;
+	}
+	/* we create both master and slave.
+	 * The endpoint we were asked for holds a reference to
+	 * the other one.
+	 */
+	ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (!ifp) {
+		error = ENOMEM;
+		goto put_out;
+	}
+	strcpy(ifp->if_xname, NM_IFPNAME(pna->ifp));
+
+	mna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (mna == NULL) {
+		error = ENOMEM;
+		goto free_ifp;
+	}
+	mna->up.ifp = ifp;
+
+	mna->id = pipe_id;
+	mna->role = NR_REG_PIPE_MASTER;
+	mna->parent = pna;
+
+	mna->up.nm_txsync = netmap_pipe_txsync;
+	mna->up.nm_rxsync = netmap_pipe_rxsync;
+	mna->up.nm_register = netmap_pipe_reg;
+	mna->up.nm_dtor = netmap_pipe_dtor;
+	mna->up.nm_krings_create = netmap_pipe_krings_create;
+	mna->up.nm_krings_delete = netmap_pipe_krings_delete;
+	mna->up.nm_mem = pna->nm_mem;
+	mna->up.na_lut = pna->na_lut;
+	mna->up.na_lut_objtotal = pna->na_lut_objtotal;
+
+	mna->up.num_tx_rings = 1;
+	mna->up.num_rx_rings = 1;
+	mna->up.num_tx_desc = nmr->nr_tx_slots;
+	nm_bound_var(&mna->up.num_tx_desc, pna->num_tx_desc,
+			1, NM_PIPE_MAXSLOTS, NULL);
+	mna->up.num_rx_desc = nmr->nr_rx_slots;
+	nm_bound_var(&mna->up.num_rx_desc, pna->num_rx_desc,
+			1, NM_PIPE_MAXSLOTS, NULL);
+	error = netmap_attach_common(&mna->up);
+	if (error)
+		goto free_mna;	/* not registered yet, but must be freed */
+	/* register the master with the parent */
+	error = netmap_pipe_add(pna, mna);
+	if (error)
+		goto free_mna;
+
+	/* create the slave; from here on every failure must also
+	 * unregister the master from the parent
+	 */
+	ifp2 = malloc(sizeof(*ifp2), M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (!ifp2) {
+		error = ENOMEM;
+		goto unregister_mna;
+	}
+	strcpy(ifp2->if_xname, NM_IFPNAME(pna->ifp));
+
+	sna = malloc(sizeof(*sna), M_DEVBUF, M_NOWAIT | M_ZERO);
+	if (sna == NULL) {
+		error = ENOMEM;
+		goto free_ifp2;
+	}
+	/* most fields are the same, copy from master and then fix */
+	*sna = *mna;
+	sna->up.ifp = ifp2;
+	sna->role = NR_REG_PIPE_SLAVE;
+	error = netmap_attach_common(&sna->up);
+	if (error)
+		goto free_sna;
+
+	/* join the two endpoints */
+	mna->peer = sna;
+	sna->peer = mna;
+
+	/* we already have a reference to the parent, but we
+	 * need another one for the other endpoint we created
+	 */
+	netmap_adapter_get(pna);
+
+	if (role == NR_REG_PIPE_MASTER) {
+		req = mna;
+		mna->peer_ref = 1;
+		netmap_adapter_get(&sna->up);
+	} else {
+		req = sna;
+		sna->peer_ref = 1;
+		netmap_adapter_get(&mna->up);
+	}
+	ND("created master %p and slave %p", mna, sna);
+found:
+
+	ND("pipe %d %s at %p", pipe_id,
+		(req->role == NR_REG_PIPE_MASTER ? "master" : "slave"), req);
+	*na = &req->up;
+	netmap_adapter_get(*na);
+
+	/* write the configuration back */
+	nmr->nr_tx_rings = req->up.num_tx_rings;
+	nmr->nr_rx_rings = req->up.num_rx_rings;
+	nmr->nr_tx_slots = req->up.num_tx_desc;
+	nmr->nr_rx_slots = req->up.num_rx_desc;
+
+	/* keep the reference to the parent.
+	 * It will be released by the req destructor
+	 */
+
+	return 0;
+
+free_sna:
+	free(sna, M_DEVBUF);
+free_ifp2:
+	free(ifp2, M_DEVBUF);
+unregister_mna:
+	/* the master is already in the parent's pipe list: take it out
+	 * before freeing it, so that no stale pointer is left behind
+	 */
+	netmap_pipe_remove(pna, mna);
+free_mna:
+	free(mna, M_DEVBUF);
+free_ifp:
+	free(ifp, M_DEVBUF);
+put_out:
+	netmap_adapter_put(pna);
+	return error;
+}
+
+
+#endif /* WITH_PIPES */
diff --git a/sys/dev/netmap/netmap_vale.c b/sys/dev/netmap/netmap_vale.c index 13a725378c2..34e39126e52 100644 --- a/sys/dev/netmap/netmap_vale.c +++ b/sys/dev/netmap/netmap_vale.c @@ -163,21 +163,6 @@ static int netmap_bwrap_attach(struct ifnet *, struct ifnet *); static int netmap_bwrap_register(struct netmap_adapter *, int onoff); int kern_netmap_regif(struct nmreq *nmr); -/* - * Each transmit queue accumulates a batch of packets into - * a structure before forwarding. Packets to the same - * destination are put in a list using ft_next as a link field. - * ft_frags and ft_next are valid only on the first fragment. - */ -struct nm_bdg_fwd { /* forwarding entry for a bridge */ - void *ft_buf; /* netmap or indirect buffer */ - uint8_t ft_frags; /* how many fragments (only on 1st frag) */ - uint8_t _ft_port; /* dst port (unused) */ - uint16_t ft_flags; /* flags, e.g. indirect */ - uint16_t ft_len; /* src fragment len */ - uint16_t ft_next; /* next packet to same destination */ -}; - /* * For each output interface, nm_bdg_q is used to construct a list.
* bq_len is the number of output buffers (we can have coalescing @@ -381,7 +366,7 @@ nm_alloc_bdgfwd(struct netmap_adapter *na) l += sizeof(struct nm_bdg_q) * num_dstq; l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; - nrings = na->num_tx_rings + 1; + nrings = netmap_real_tx_rings(na); kring = na->tx_rings; for (i = 0; i < nrings; i++) { struct nm_bdg_fwd *ft; @@ -421,7 +406,8 @@ netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw) acquire BDG_WLOCK() and copy back the array. */ - D("detach %d and %d (lim %d)", hw, sw, lim); + if (netmap_verbose) + D("detach %d and %d (lim %d)", hw, sw, lim); /* make a copy of the list of active ports, update it, * and then copy back within BDG_WLOCK(). */ @@ -675,7 +661,7 @@ nm_bdg_attach(struct nmreq *nmr) goto unref_exit; } - nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, &error); + nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags, &error); if (!nifp) { goto unref_exit; } @@ -855,15 +841,23 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) NMG_UNLOCK(); break; - case NETMAP_BDG_OFFSET: + case NETMAP_BDG_VNET_HDR: + /* Valid lengths for the virtio-net header are 0 (no header), + 10 and 12. 
*/ + if (nmr->nr_arg1 != 0 && + nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) && + nmr->nr_arg1 != 12) { + error = EINVAL; + break; + } NMG_LOCK(); error = netmap_get_bdg_na(nmr, &na, 0); if (na && !error) { vpna = (struct netmap_vp_adapter *)na; - if (nmr->nr_arg1 > NETMAP_BDG_MAX_OFFSET) - nmr->nr_arg1 = NETMAP_BDG_MAX_OFFSET; - vpna->offset = nmr->nr_arg1; - D("Using offset %d for %p", vpna->offset, vpna); + vpna->virt_hdr_len = nmr->nr_arg1; + if (vpna->virt_hdr_len) + vpna->mfs = NETMAP_BDG_BUF_SIZE(na->nm_mem); + D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna); netmap_adapter_put(na); } NMG_UNLOCK(); @@ -877,26 +871,20 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) return error; } - static int netmap_vp_krings_create(struct netmap_adapter *na) { - u_int ntx, nrx, tailroom; + u_int tailroom; int error, i; uint32_t *leases; - - /* XXX vps do not need host rings, - * but we crash if we don't have one - */ - ntx = na->num_tx_rings + 1; - nrx = na->num_rx_rings + 1; + u_int nrx = netmap_real_rx_rings(na); /* * Leases are attached to RX rings on vale ports */ tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx; - error = netmap_krings_create(na, ntx, nrx, tailroom); + error = netmap_krings_create(na, tailroom); if (error) return error; @@ -1212,16 +1200,16 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, u_int len = ft[i].ft_len; ND("slot %d frags %d", i, ft[i].ft_frags); - /* Drop the packet if the offset is not into the first + /* Drop the packet if the virtio-net header is not into the first fragment nor at the very beginning of the second. 
*/ + if (unlikely(na->virt_hdr_len > len)) continue; - if (len == na->offset) { + if (len == na->virt_hdr_len) { buf = ft[i+1].ft_buf; len = ft[i+1].ft_len; } else { - buf += na->offset; - len -= na->offset; + buf += na->virt_hdr_len; + len -= na->virt_hdr_len; } dst_port = b->nm_bdg_lookup(buf, len, &dst_ring, na); if (netmap_verbose > 255) @@ -1280,13 +1268,13 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, struct netmap_vp_adapter *dst_na; struct netmap_kring *kring; struct netmap_ring *ring; - u_int dst_nr, lim, j, sent = 0, d_i, next, brd_next; + u_int dst_nr, lim, j, d_i, next, brd_next; u_int needed, howmany; int retry = netmap_txsync_retry; struct nm_bdg_q *d; uint32_t my_start = 0, lease_idx = 0; int nrings; - int offset_mismatch; + int virt_hdr_mismatch = 0; d_i = dsts[i]; ND("second pass %d port %d", i, d_i); @@ -1311,8 +1299,6 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, goto cleanup; } - offset_mismatch = (dst_na->offset != na->offset); - /* there is at least one either unicast or broadcast packet */ brd_next = brddst->bq_head; next = d->bq_head; @@ -1325,6 +1311,29 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, */ needed = d->bq_len + brddst->bq_len; + if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) { + /* There is a virtio-net header/offloadings mismatch between + * source and destination. The slower mismatch datapath will + * be used to cope with all the mismatches. + */ + virt_hdr_mismatch = 1; + if (dst_na->mfs < na->mfs) { + /* We may need to do segmentation offloadings, and so + * we may need a number of destination slots greater + * than the number of input slots ('needed'). + * We look for the smallest integer 'x' which satisfies: + * needed * na->mfs + x * H <= x * dst_na->mfs + * where 'H' is the length of the longest header that may + * be replicated in the segmentation process (e.g.
for + * TCPv4 we must account for ethernet header, IP header + * and TCPv4 header). + */ + needed = (needed * na->mfs) / + (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1; + ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed); + } + } + ND(5, "pass 2 dst %d is %x %s", i, d_i, is_vp ? "virtual" : "nic/host"); dst_nr = d_i & (NM_BDG_MAXRINGS-1); @@ -1337,6 +1346,10 @@ nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, retry: + if (dst_na->retry && retry) { + /* try to get some free slot from the previous run */ + dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); + } /* reserve the buffers in the queue and an entry * to report completion, and drop lock. * XXX this might become a helper function. @@ -1346,9 +1359,6 @@ retry: mtx_unlock(&kring->q_lock); goto cleanup; } - if (dst_na->retry) { - dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); - } my_start = j = kring->nkr_hwlease; howmany = nm_kr_space(kring, 1); if (needed < howmany) @@ -1365,7 +1375,6 @@ retry: struct netmap_slot *slot; struct nm_bdg_fwd *ft_p, *ft_end; u_int cnt; - int fix_mismatch = offset_mismatch; /* find the queue from which we pick next packet. 
* NM_FT_NULL is always higher than valid indexes @@ -1383,58 +1392,43 @@ retry: cnt = ft_p->ft_frags; // cnt > 0 if (unlikely(cnt > howmany)) break; /* no more space */ - howmany -= cnt; if (netmap_verbose && cnt > 1) RD(5, "rx %d frags to %d", cnt, j); ft_end = ft_p + cnt; - do { - char *dst, *src = ft_p->ft_buf; - size_t copy_len = ft_p->ft_len, dst_len = copy_len; + if (unlikely(virt_hdr_mismatch)) { + bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany); + } else { + howmany -= cnt; + do { + char *dst, *src = ft_p->ft_buf; + size_t copy_len = ft_p->ft_len, dst_len = copy_len; - slot = &ring->slot[j]; - dst = BDG_NMB(&dst_na->up, slot); + slot = &ring->slot[j]; + dst = BDG_NMB(&dst_na->up, slot); - if (unlikely(fix_mismatch)) { - /* We are processing the first fragment - * and there is a mismatch between source - * and destination offsets. Create a zeroed - * header for the destination, independently - * of the source header length and content. - */ - src += na->offset; - copy_len -= na->offset; - bzero(dst, dst_na->offset); - dst += dst_na->offset; - dst_len = dst_na->offset + copy_len; - /* fix the first fragment only */ - fix_mismatch = 0; - /* Here it could be copy_len == dst_len == 0, - * and so a zero length fragment is passed. 
- */ - } + ND("send [%d] %d(%d) bytes at %s:%d", + i, (int)copy_len, (int)dst_len, + NM_IFPNAME(dst_ifp), j); + /* round to a multiple of 64 */ + copy_len = (copy_len + 63) & ~63; - ND("send [%d] %d(%d) bytes at %s:%d", - i, (int)copy_len, (int)dst_len, - NM_IFPNAME(dst_ifp), j); - /* round to a multiple of 64 */ - copy_len = (copy_len + 63) & ~63; - - if (ft_p->ft_flags & NS_INDIRECT) { - if (copyin(src, dst, copy_len)) { - // invalid user pointer, pretend len is 0 - dst_len = 0; - } - } else { - //memcpy(dst, src, copy_len); - pkt_copy(src, dst, (int)copy_len); - } - slot->len = dst_len; - slot->flags = (cnt << 8)| NS_MOREFRAG; - j = nm_next(j, lim); - ft_p++; - sent++; - } while (ft_p != ft_end); - slot->flags = (cnt << 8); /* clear flag on last entry */ + if (ft_p->ft_flags & NS_INDIRECT) { + if (copyin(src, dst, copy_len)) { + // invalid user pointer, pretend len is 0 + dst_len = 0; + } + } else { + //memcpy(dst, src, copy_len); + pkt_copy(src, dst, (int)copy_len); + } + slot->len = dst_len; + slot->flags = (cnt << 8)| NS_MOREFRAG; + j = nm_next(j, lim); + needed--; + ft_p++; + } while (ft_p != ft_end); + slot->flags = (cnt << 8); /* clear flag on last entry */ + } /* are we done ? 
*/ if (next == NM_FT_NULL && brd_next == NM_FT_NULL) break; @@ -1484,9 +1478,9 @@ retry: */ if (likely(j != my_start)) { kring->nr_hwtail = j; - dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); still_locked = 0; mtx_unlock(&kring->q_lock); + dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); if (dst_na->retry && retry--) goto retry; } @@ -1615,6 +1609,7 @@ bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp) struct netmap_vp_adapter *vpna; struct netmap_adapter *na; int error; + u_int npipes = 0; vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO); if (vpna == NULL) @@ -1636,8 +1631,23 @@ bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp) na->num_tx_desc = nmr->nr_tx_slots; nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE, 1, NM_BDG_MAXSLOTS, NULL); + /* validate number of pipes. We want at least 1, + * but probably can do with some more. + * So let's use 2 as default (when 0 is supplied) + */ + npipes = nmr->nr_arg1; + nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL); + nmr->nr_arg1 = npipes; /* write back */ + /* validate extra bufs */ + nm_bound_var(&nmr->nr_arg3, 0, 0, + 128*NM_BDG_MAXSLOTS, NULL); na->num_rx_desc = nmr->nr_rx_slots; - vpna->offset = 0; + vpna->virt_hdr_len = 0; + vpna->mfs = 1514; + /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero?? 
+ vpna->mfs = netmap_buf_size; */ + if (netmap_verbose) + D("max frame size %u", vpna->mfs); na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER; na->nm_txsync = bdg_netmap_txsync; @@ -1648,14 +1658,21 @@ bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp) na->nm_krings_delete = netmap_vp_krings_delete; na->nm_mem = netmap_mem_private_new(NM_IFPNAME(na->ifp), na->num_tx_rings, na->num_tx_desc, - na->num_rx_rings, na->num_rx_desc); + na->num_rx_rings, na->num_rx_desc, + nmr->nr_arg3, npipes, &error); + if (na->nm_mem == NULL) + goto err; /* other nmd fields are set in the common routine */ error = netmap_attach_common(na); - if (error) { - free(vpna, M_DEVBUF); - return error; - } + if (error) + goto err; return 0; + +err: + if (na->nm_mem != NULL) + netmap_mem_private_delete(na->nm_mem); + free(vpna, M_DEVBUF); + return error; } @@ -1763,19 +1780,17 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, ring->cur = kring->rcur; ring->tail = kring->rtail; - /* simulate a user wakeup on the rx ring */ if (is_host_ring) { - netmap_rxsync_from_host(na, NULL, NULL); vpna = hostna; ring_nr = 0; - } else { - /* fetch packets that have arrived. - * XXX maybe do this in a loop ? - */ - error = na->nm_rxsync(na, ring_nr, 0); - if (error) - goto put_out; - } + } + /* simulate a user wakeup on the rx ring */ + /* fetch packets that have arrived. + * XXX maybe do this in a loop ? 
+ */ + error = kring->nm_sync(kring, 0); + if (error) + goto put_out; if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) { D("how strange, interrupt with no packets on %s", NM_IFPNAME(ifp)); @@ -1801,7 +1816,7 @@ netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, ring->tail = kring->rtail; /* another call to actually release the buffers */ if (!is_host_ring) { - error = na->nm_rxsync(na, ring_nr, 0); + error = kring->nm_sync(kring, 0); } else { /* mark all packets as released, as in the * second part of netmap_rxsync_from_host() @@ -1842,11 +1857,11 @@ netmap_bwrap_register(struct netmap_adapter *na, int onoff) * The original number of rings comes from hwna, * rx rings on one side equals tx rings on the other. */ - for (i = 0; i <= na->num_rx_rings; i++) { + for (i = 0; i < na->num_rx_rings + 1; i++) { hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots; hwna->tx_rings[i].ring = na->rx_rings[i].ring; } - for (i = 0; i <= na->num_tx_rings; i++) { + for (i = 0; i < na->num_tx_rings + 1; i++) { hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots; hwna->rx_rings[i].ring = na->tx_rings[i].ring; } @@ -1914,8 +1929,10 @@ netmap_bwrap_krings_create(struct netmap_adapter *na) return error; } - hostna->tx_rings = na->tx_rings + na->num_tx_rings; - hostna->rx_rings = na->rx_rings + na->num_rx_rings; + if (na->na_flags & NAF_HOST_RINGS) { + hostna->tx_rings = na->tx_rings + na->num_tx_rings; + hostna->rx_rings = na->rx_rings + na->num_rx_rings; + } return 0; } @@ -1957,6 +1974,7 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP)) return 0; + mtx_lock(&kring->q_lock); /* first step: simulate a user wakeup on the rx ring */ netmap_vp_rxsync(na, ring_n, flags); ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", @@ -1972,12 +1990,8 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, 
enum txrx tx, int f */ /* set tail to what the hw expects */ ring->tail = hw_kring->rtail; - if (ring_n == na->num_rx_rings) { - netmap_txsync_to_host(hwna); - } else { - nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ? - error = hwna->nm_txsync(hwna, ring_n, flags); - } + nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ? + error = hw_kring->nm_sync(hw_kring, flags); /* fourth step: now we are back the rx ring */ /* claim ownership on all hw owned bufs */ @@ -1991,7 +2005,7 @@ netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int f kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, ring->head, ring->cur, ring->tail, hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail); - + mtx_unlock(&kring->q_lock); return error; } @@ -2047,18 +2061,21 @@ netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real) bna->hwna = hwna; netmap_adapter_get(hwna); hwna->na_private = bna; /* weak reference */ - - hostna = &bna->host.up; - hostna->ifp = hwna->ifp; - hostna->num_tx_rings = 1; - hostna->num_tx_desc = hwna->num_rx_desc; - hostna->num_rx_rings = 1; - hostna->num_rx_desc = hwna->num_tx_desc; - // hostna->nm_txsync = netmap_bwrap_host_txsync; - // hostna->nm_rxsync = netmap_bwrap_host_rxsync; - hostna->nm_notify = netmap_bwrap_host_notify; - hostna->nm_mem = na->nm_mem; - hostna->na_private = bna; + + if (hwna->na_flags & NAF_HOST_RINGS) { + na->na_flags |= NAF_HOST_RINGS; + hostna = &bna->host.up; + hostna->ifp = hwna->ifp; + hostna->num_tx_rings = 1; + hostna->num_tx_desc = hwna->num_rx_desc; + hostna->num_rx_rings = 1; + hostna->num_rx_desc = hwna->num_tx_desc; + // hostna->nm_txsync = netmap_bwrap_host_txsync; + // hostna->nm_rxsync = netmap_bwrap_host_rxsync; + hostna->nm_notify = netmap_bwrap_host_notify; + hostna->nm_mem = na->nm_mem; + hostna->na_private = bna; + } ND("%s<->%s txr %d txd %d rxr %d rxd %d", fake->if_xname, real->if_xname, diff --git a/sys/modules/netmap/Makefile 
b/sys/modules/netmap/Makefile index aea844bde1c..647cd103600 100644 --- a/sys/modules/netmap/Makefile +++ b/sys/modules/netmap/Makefile @@ -14,5 +14,7 @@ SRCS += netmap_generic.c SRCS += netmap_mbq.c netmap_mbq.h SRCS += netmap_vale.c SRCS += netmap_freebsd.c +SRCS += netmap_offloadings.c +SRCS += netmap_pipe.c .include diff --git a/sys/net/netmap.h b/sys/net/netmap.h index a5ee9b55edc..f0b4c56d4e3 100644 --- a/sys/net/netmap.h +++ b/sys/net/netmap.h @@ -39,8 +39,10 @@ #ifndef _NET_NETMAP_H_ #define _NET_NETMAP_H_ -#define NETMAP_API 10 /* current API version */ +#define NETMAP_API 11 /* current API version */ +#define NETMAP_MIN_API 11 /* min and max versions accepted */ +#define NETMAP_MAX_API 15 /* * Some fields should be cache-aligned to reduce contention. * The alignment is architecture and OS dependent, but rather than @@ -73,20 +75,21 @@ +===============+ / | buf_idx, len | slot[1] | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | | txring_ofs[1] | +---------------+ - (tx+1+extra_tx entries) (num_slots entries) + (tx+1 entries) (num_slots entries) | txring_ofs[t] | | buf_idx, len | slot[n-1] +---------------+ | flags, ptr | | rxring_ofs[0] | +---------------+ | rxring_ofs[1] | - (rx+1+extra_rx entries) + (rx+1 entries) | rxring_ofs[r] | +---------------+ - * For each "interface" (NIC, host stack, VALE switch port) attached to a - * file descriptor, the mmap()ed region contains a (logically readonly) + * For each "interface" (NIC, host stack, PIPE, VALE switch port) bound to + * a file descriptor, the mmap()ed region contains a (logically readonly) * struct netmap_if pointing to struct netmap_ring's. + * * There is one netmap_ring per physical NIC ring, plus one tx/rx ring - * pair attached to the host stack (this pair is unused for VALE ports). + * pair attached to the host stack (this pair is unused for non-NIC ports). * * All physical/host stack ports share the same memory region, * so that zero-copy can be implemented between them. 
@@ -98,7 +101,42 @@ * is provided for user-supplied buffers in the tx path. * * In user space, the buffer address is computed as - * (char *)ring + buf_ofs + index*NETMAP_BUF_SIZE + * (char *)ring + buf_ofs + index * NETMAP_BUF_SIZE + * + * Added in NETMAP_API 11: + * + * + NIOCREGIF can request the allocation of extra spare buffers from + * the same memory pool. The desired number of buffers must be in + * nr_arg3. The ioctl may return fewer buffers, depending on memory + * availability. nr_arg3 will return the actual value, and, once + * mapped, nifp->ni_bufs_head will be the index of the first buffer. + * + * The buffers are linked to each other using the first uint32_t + * as the index. On close, ni_bufs_head must point to the list of + * buffers to be released. + * + * + NIOCREGIF can request space for extra rings (and buffers) + * allocated in the same memory space. The number of extra rings + * is in nr_arg1, and is advisory. This is a no-op on NICs where + * the size of the memory space is fixed. + * + * + NIOCREGIF can attach to PIPE rings sharing the same memory + * space with a parent device. The ifname indicates the parent device, + * which must already exist. Flags in nr_flags indicate if we want to + * bind the master or slave side, the index (from nr_ringid) + * is just a cookie and does not need to be sequential. + * + * + NIOCREGIF can also attach to 'monitor' rings that replicate + * the content of specific rings, also from the same memory space. + * + * Extra flags in nr_flags support the above functions.
+ * Application libraries may use the following naming scheme: + * netmap:foo all NIC ring pairs + * netmap:foo^ only host ring pair + * netmap:foo+ all NIC ring + host ring pairs + * netmap:foo-k the k-th NIC ring pair + * netmap:foo{k PIPE ring pair k, master side + * netmap:foo}k PIPE ring pair k, slave side */ /* @@ -284,8 +322,8 @@ struct netmap_if { const uint32_t ni_tx_rings; /* number of HW tx rings */ const uint32_t ni_rx_rings; /* number of HW rx rings */ - const uint32_t ni_extra_tx_rings; - const uint32_t ni_extra_rx_rings; + uint32_t ni_bufs_head; /* head index for extra bufs */ + uint32_t ni_spare1[5]; /* * The following array contains the offset of each netmap ring * from this structure, in the following order: @@ -321,6 +359,7 @@ struct netmap_if { * * The actual argument (struct nmreq) has a number of options to request * different functions. + * The following are used in NIOCREGIF when nr_cmd == 0: * * nr_name (in) * The name of the port (em0, valeXXX:YYY, etc.) @@ -337,6 +376,13 @@ struct netmap_if { * * nr_ringid (in) * Indicates how rings should be bound to the file descriptors. + * If nr_flags != 0, then the low bits (in NETMAP_RING_MASK) + * are used to indicate the ring number, and nr_flags specifies + * the actual rings to bind. NETMAP_NO_TX_POLL is unaffected. + * + * NOTE: THE FOLLOWING (nr_flags == 0) IS DEPRECATED: + * If nr_flags == 0, NETMAP_HW_RING and NETMAP_SW_RING control + * the binding as follows: * 0 (default) binds all physical rings * NETMAP_HW_RING | ring number binds a single ring pair * NETMAP_SW_RING binds only the host tx/rx rings @@ -345,8 +391,41 @@ struct netmap_if { * packets on tx rings only if POLLOUT is set. * The default is to push any pending packet. * - * NETMAP_PRIV_MEM is set on return for ports that use private - * memory regions and cannot use buffer swapping. + * NETMAP_DO_RX_POLL can be OR-ed to make select()/poll() release + * packets on rx rings also when POLLIN is NOT set. 
+ * The default is to touch the rx ring only with POLLIN. + * Note that this is the opposite of TX because it + * reflects the common usage. + * + * NOTE: NETMAP_PRIV_MEM IS DEPRECATED, use nr_arg2 instead. + * NETMAP_PRIV_MEM is set on return for ports that do not use + * the global memory allocator. + * This information is not significant and applications + * should look at the region id in nr_arg2 + * + * nr_flags is the recommended mode to indicate which rings should + * be bound to a file descriptor. Values are NR_REG_* + * + * nr_arg1 (in) The number of extra rings to be reserved. + * Especially when allocating a VALE port the system only + * allocates the amount of memory needed for the port. + * If more shared memory rings are desired (e.g. for pipes), + * the first invocation for the same basename/allocator + * should specify a suitable number. Memory cannot be + * extended after the first allocation without closing + * all ports on the same region. + * + * nr_arg2 (in/out) The identity of the memory region used. + * On input, 0 means the system decides autonomously, + * other values may try to select a specific region. + * On return the actual value is reported. + * Region '1' is the global allocator, normally shared + * by all interfaces. Other values are private regions. + * If two ports use the same region, zero-copy is possible. + * + * nr_arg3 (in/out) number of extra buffers to be allocated. + * + * * * nr_cmd (in) if non-zero indicates a special command: * NETMAP_BDG_ATTACH and nr_name = vale*:ifname @@ -362,17 +441,33 @@ struct netmap_if { * NETMAP_BDG_LIST * list the configuration of VALE switches. * - * NETMAP_BDG_OFFSET XXX ? - * Set the offset of data in packets. Used with VALE - * switches where the clients use the vhost header. + * NETMAP_BDG_VNET_HDR + * Set the virtio-net header length used by the client + * of a VALE switch port.
* - * nr_arg1, nr_arg2 (in/out) command specific + * nr_arg1, nr_arg2, nr_arg3 (in/out) command specific + * + * * */ /* - * struct nmreq overlays a struct ifreq + * struct nmreq overlays a struct ifreq (just the name) + * + * On input, nr_ringid indicates which rings we are requesting, + * with the low bits for the specific ring number. + * selection FLAGS RING INDEX + * + * all the NIC rings 0x0000 - + * only HOST ring 0x2000 - + * single NIC ring 0x4000 ring index + * all the NIC+HOST rings 0x6000 - + * one pipe ring, master 0x8000 ring index + * *** INVALID 0xA000 + * one pipe ring, slave 0xC000 ring index + * *** INVALID 0xE000 + * */ struct nmreq { char nr_name[IFNAMSIZ]; @@ -383,28 +478,48 @@ struct nmreq { uint32_t nr_rx_slots; /* slots in rx rings */ uint16_t nr_tx_rings; /* number of tx rings */ uint16_t nr_rx_rings; /* number of rx rings */ + uint16_t nr_ringid; /* ring(s) we care about */ -#define NETMAP_PRIV_MEM 0x8000 /* rings use private memory */ -#define NETMAP_HW_RING 0x4000 /* low bits indicate one hw ring */ -#define NETMAP_SW_RING 0x2000 /* process the sw ring */ +#define NETMAP_HW_RING 0x4000 /* single NIC ring pair */ +#define NETMAP_SW_RING 0x2000 /* only host ring pair */ + +#define NETMAP_RING_MASK 0x0fff /* the ring number */ + #define NETMAP_NO_TX_POLL 0x1000 /* no automatic txsync on poll */ -#define NETMAP_RING_MASK 0xfff /* the ring number */ + +#define NETMAP_DO_RX_POLL 0x8000 /* DO automatic rxsync on poll */ uint16_t nr_cmd; #define NETMAP_BDG_ATTACH 1 /* attach the NIC */ #define NETMAP_BDG_DETACH 2 /* detach the NIC */ #define NETMAP_BDG_LOOKUP_REG 3 /* register lookup function */ #define NETMAP_BDG_LIST 4 /* get bridge's info */ -#define NETMAP_BDG_OFFSET 5 /* set the port offset */ +#define NETMAP_BDG_VNET_HDR 5 /* set the port virtio-net-hdr length */ +#define NETMAP_BDG_OFFSET NETMAP_BDG_VNET_HDR /* deprecated alias */ - uint16_t nr_arg1; + uint16_t nr_arg1; /* reserve extra rings in NIOCREGIF */ #define NETMAP_BDG_HOST 1 /*
attach the host stack on ATTACH */ -#define NETMAP_BDG_MAX_OFFSET 12 uint16_t nr_arg2; - uint32_t spare2[3]; + uint32_t nr_arg3; /* req. extra buffers in NIOCREGIF */ + uint32_t nr_flags; + /* various modes, extends nr_ringid */ + uint32_t spare2[1]; }; +#define NR_REG_MASK 0xf /* values for nr_flags */ +enum { NR_REG_DEFAULT = 0, /* backward compat, should not be used. */ + NR_REG_ALL_NIC = 1, + NR_REG_SW = 2, + NR_REG_NIC_SW = 3, + NR_REG_ONE_NIC = 4, + NR_REG_PIPE_MASTER = 5, + NR_REG_PIPE_SLAVE = 6, +}; +/* monitor uses the NR_REG to select the rings to monitor */ +#define NR_MONITOR_TX 0x100 +#define NR_MONITOR_RX 0x200 + /* * FreeBSD uses the size value embedded in the _IOWR to determine diff --git a/sys/net/netmap_user.h b/sys/net/netmap_user.h index 1bb337cf0ef..9c3a4c1e594 100644 --- a/sys/net/netmap_user.h +++ b/sys/net/netmap_user.h @@ -66,6 +66,7 @@ #define _NET_NETMAP_USER_H_ #include +#include /* apple needs sockaddr */ #include /* IFNAMSIZ */ #ifndef likely @@ -104,12 +105,12 @@ nm_ring_next(struct netmap_ring *r, uint32_t i) /* * Return 1 if we have pending transmissions in the tx ring. - * When everything is complete ring->cur = ring->tail + 1 (modulo ring size) + * When everything is complete ring->head = ring->tail + 1 (modulo ring size) */ static inline int nm_tx_pending(struct netmap_ring *r) { - return nm_ring_next(r, r->tail) != r->cur; + return nm_ring_next(r, r->tail) != r->head; } @@ -142,13 +143,41 @@ nm_ring_space(struct netmap_ring *ring) #include #include -struct nm_hdr_t { /* same as pcap_pkthdr */ +#ifndef ND /* debug macros */ +/* debug support */ +#define ND(_fmt, ...) do {} while(0) +#define D(_fmt, ...) \ + do { \ + struct timeval t0; \ + gettimeofday(&t0, NULL); \ + fprintf(stderr, "%03d.%06d %s [%d] " _fmt "\n", \ + (int)(t0.tv_sec % 1000), (int)t0.tv_usec, \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + } while (0) + +/* Rate limited version of "D", lps indicates how many per second */ +#define RD(lps, format, ...) 
\ + do { \ + static int t0, __cnt; \ + struct timeval __xxts; \ + gettimeofday(&__xxts, NULL); \ + if (t0 != __xxts.tv_sec) { \ + t0 = __xxts.tv_sec; \ + __cnt = 0; \ + } \ + if (__cnt++ < lps) { \ + D(format, ##__VA_ARGS__); \ + } \ + } while (0) +#endif + +struct nm_pkthdr { /* same as pcap_pkthdr */ struct timeval ts; uint32_t caplen; uint32_t len; }; -struct nm_stat_t { // pcap_stat +struct nm_stat { /* same as pcap_stat */ u_int ps_recv; u_int ps_drop; u_int ps_ifdrop; @@ -159,19 +188,29 @@ struct nm_stat_t { // pcap_stat #define NM_ERRBUF_SIZE 512 -struct nm_desc_t { - struct nm_desc_t *self; +struct nm_desc { + struct nm_desc *self; /* point to self if netmap. */ int fd; void *mem; int memsize; - struct netmap_if *nifp; + int done_mmap; /* set if mem is the result of mmap */ + struct netmap_if * const nifp; uint16_t first_tx_ring, last_tx_ring, cur_tx_ring; uint16_t first_rx_ring, last_rx_ring, cur_rx_ring; struct nmreq req; /* also contains the nr_name = ifname */ - struct nm_hdr_t hdr; - - struct netmap_ring *tx, *rx; /* shortcuts to base hw/sw rings */ + struct nm_pkthdr hdr; + /* + * The memory contains netmap_if, rings and then buffers. + * Given a pointer (e.g. to nm_inject) we can compare with + * mem/buf_start/buf_end to tell if it is a buffer or + * some other descriptor in our region. + * We also store a pointer to some ring as it helps in the + * translation from buffer indexes to addresses. + */ + struct netmap_ring * const some_ring; + void * const buf_start; + void * const buf_end; /* parameters from pcap_open_live */ int snaplen; int promisc; @@ -183,7 +222,7 @@ struct nm_desc_t { uint32_t if_reqcap; uint32_t if_curcap; - struct nm_stat_t st; + struct nm_stat st; char msg[NM_ERRBUF_SIZE]; }; @@ -191,8 +230,8 @@ struct nm_desc_t { * when the descriptor is open correctly, d->self == d * Eventually we should also use some magic number. 
*/ -#define P2NMD(p) ((struct nm_desc_t *)(p)) -#define IS_NETMAP_DESC(d) (P2NMD(d)->self == P2NMD(d)) +#define P2NMD(p) ((struct nm_desc *)(p)) +#define IS_NETMAP_DESC(d) ((d) && P2NMD(d)->self == P2NMD(d)) #define NETMAP_FD(d) (P2NMD(d)->fd) @@ -205,7 +244,7 @@ struct nm_desc_t { * XXX only for multiples of 64 bytes, non overlapped. */ static inline void -pkt_copy(const void *_src, void *_dst, int l) +nm_pkt_copy(const void *_src, void *_dst, int l) { const uint64_t *src = (const uint64_t *)_src; uint64_t *dst = (uint64_t *)_dst; @@ -230,7 +269,7 @@ pkt_copy(const void *_src, void *_dst, int l) /* * The callback, invoked on each received packet. Same as libpcap */ -typedef void (*nm_cb_t)(u_char *, const struct nm_hdr_t *, const u_char *d); +typedef void (*nm_cb_t)(u_char *, const struct nm_pkthdr *, const u_char *d); /* *--- the pcap-like API --- @@ -238,21 +277,49 @@ typedef void (*nm_cb_t)(u_char *, const struct nm_hdr_t *, const u_char *d); * nm_open() opens a file descriptor, binds to a port and maps memory. * * ifname (netmap:foo or vale:foo) is the port name - * flags can be NETMAP_SW_RING or NETMAP_HW_RING etc. - * ring_no only used if NETMAP_HW_RING is specified, is interpreted - * as a string or integer indicating the ring number - * ring_flags is stored in all ring flags (e.g. for transparent mode) - * to open. If successful, t opens the fd and maps the memory. + * a suffix can indicate the following: + * ^ bind the host (sw) ring pair + * * bind host and NIC ring pairs (transparent) + * -NN bind individual NIC ring pair + * {NN bind master side of pipe NN + * }NN bind slave side of pipe NN + * + * req provides the initial values of nmreq before parsing ifname.
+ * Remember that the ifname parsing will override the ring + * number in nr_ringid, and part of nr_flags; + * flags special functions, normally 0 + * indicates which fields of *arg are significant + * arg special functions, normally NULL + * if passed a nm_desc with mem != NULL, + * use that memory instead of mmap. */ -static struct nm_desc_t *nm_open(const char *ifname, - const char *ring_no, int flags, int ring_flags); +static struct nm_desc *nm_open(const char *ifname, const struct nmreq *req, + uint64_t flags, const struct nm_desc *arg); + +/* + * nm_open can import some fields from the parent descriptor. + * These flags control which ones. + * Also in flags you can specify NETMAP_NO_TX_POLL and NETMAP_DO_RX_POLL, + * which set the initial value for these flags. + * Note that the 16 low bits of the flags are reserved for data + * that may go into the nmreq. + */ +enum { + NM_OPEN_NO_MMAP = 0x040000, /* reuse mmap from parent */ + NM_OPEN_IFNAME = 0x080000, /* nr_name, nr_ringid, nr_flags */ + NM_OPEN_ARG1 = 0x100000, + NM_OPEN_ARG2 = 0x200000, + NM_OPEN_ARG3 = 0x400000, + NM_OPEN_RING_CFG = 0x800000, /* tx|rx rings|slots */ +}; + /* * nm_close() closes and restores the port to its previous state */ -static int nm_close(struct nm_desc_t *); +static int nm_close(struct nm_desc *); /* * nm_inject() is the same as pcap_inject() @@ -260,111 +327,226 @@ static int nm_close(struct nm_desc_t *); * nm_nextpkt() is the same as pcap_next() */ -static int nm_inject(struct nm_desc_t *, const void *, size_t); -static int nm_dispatch(struct nm_desc_t *, int, nm_cb_t, u_char *); -static u_char *nm_nextpkt(struct nm_desc_t *, struct nm_hdr_t *); +static int nm_inject(struct nm_desc *, const void *, size_t); +static int nm_dispatch(struct nm_desc *, int, nm_cb_t, u_char *); +static u_char *nm_nextpkt(struct nm_desc *, struct nm_pkthdr *); /* * Try to open, return descriptor if successful, NULL otherwise.
* An invalid netmap name will return errno = 0; + * You can pass a pointer to a pre-filled nm_desc to add special + * parameters. Flags is used as follows + * NM_OPEN_NO_MMAP use the memory from arg, only + * if the nr_arg2 (memory block) matches. + * NM_OPEN_ARG1 use req.nr_arg1 from arg + * NM_OPEN_ARG2 use req.nr_arg2 from arg + * NM_OPEN_RING_CFG use ring config from arg */ -static struct nm_desc_t * -nm_open(const char *ifname, const char *ring_name, int flags, int ring_flags) +static struct nm_desc * +nm_open(const char *ifname, const struct nmreq *req, + uint64_t new_flags, const struct nm_desc *arg) { - struct nm_desc_t *d; - u_int n, namelen; - char *port = NULL; + struct nm_desc *d = NULL; + const struct nm_desc *parent = arg; + u_int namelen; + uint32_t nr_ringid = 0, nr_flags; + const char *port = NULL; + const char *errmsg = NULL; if (strncmp(ifname, "netmap:", 7) && strncmp(ifname, "vale", 4)) { - errno = 0; /* name not recognised */ + errno = 0; /* name not recognised, not an error */ return NULL; } if (ifname[0] == 'n') ifname += 7; - port = strchr(ifname, '-'); - if (!port) { - namelen = strlen(ifname); - } else { - namelen = port - ifname; - flags &= ~(NETMAP_SW_RING | NETMAP_HW_RING | NETMAP_RING_MASK); - if (port[1] == 's') - flags |= NETMAP_SW_RING; - else - ring_name = port; + /* scan for a separator */ + for (port = ifname; *port && !index("-*^{}", *port); port++) + ; + namelen = port - ifname; + if (namelen >= sizeof(d->req.nr_name)) { + errmsg = "name too long"; + goto fail; + } + switch (*port) { + default: /* '\0', no suffix */ + nr_flags = NR_REG_ALL_NIC; + break; + case '-': /* one NIC */ + nr_flags = NR_REG_ONE_NIC; + nr_ringid = atoi(port + 1); + break; + case '*': /* NIC and SW, ignore port */ + nr_flags = NR_REG_NIC_SW; + if (port[1]) { + errmsg = "invalid port for nic+sw"; + goto fail; + } + break; + case '^': /* only sw ring */ + nr_flags = NR_REG_SW; + if (port[1]) { + errmsg = "invalid port for sw ring"; + goto fail; + } +
break; + case '{': + nr_flags = NR_REG_PIPE_MASTER; + nr_ringid = atoi(port + 1); + break; + case '}': + nr_flags = NR_REG_PIPE_SLAVE; + nr_ringid = atoi(port + 1); + break; } - if (namelen >= sizeof(d->req.nr_name)) - namelen = sizeof(d->req.nr_name) - 1; - d = (struct nm_desc_t *)calloc(1, sizeof(*d)); + if (nr_ringid >= NETMAP_RING_MASK) { + errmsg = "invalid ringid"; + goto fail; + } + /* add the *XPOLL flags */ + nr_ringid |= new_flags & (NETMAP_NO_TX_POLL | NETMAP_DO_RX_POLL); + + d = (struct nm_desc *)calloc(1, sizeof(*d)); if (d == NULL) { + errmsg = "nm_desc alloc failure"; errno = ENOMEM; return NULL; } d->self = d; /* set this early so nm_close() works */ d->fd = open("/dev/netmap", O_RDWR); - if (d->fd < 0) + if (d->fd < 0) { + errmsg = "cannot open /dev/netmap"; goto fail; - - if (flags & NETMAP_SW_RING) { - d->req.nr_ringid = NETMAP_SW_RING; - } else { - u_int r; - if (flags & NETMAP_HW_RING) /* interpret ring as int */ - r = (uintptr_t)ring_name; - else /* interpret ring as numeric string */ - r = ring_name ? atoi(ring_name) : ~0; - r = (r < NETMAP_RING_MASK) ? (r | NETMAP_HW_RING) : 0; - d->req.nr_ringid = r; /* set the ring */ } - d->req.nr_ringid |= (flags & ~NETMAP_RING_MASK); + + if (req) + d->req = *req; d->req.nr_version = NETMAP_API; + d->req.nr_ringid &= ~NETMAP_RING_MASK; + + /* these fields are overridden by ifname and flags processing */ + d->req.nr_ringid |= nr_ringid; + d->req.nr_flags = nr_flags; memcpy(d->req.nr_name, ifname, namelen); d->req.nr_name[namelen] = '\0'; + /* optionally import info from parent */ + if (IS_NETMAP_DESC(parent) && new_flags) { + if (new_flags & NM_OPEN_ARG1) + D("overriding ARG1 %d", parent->req.nr_arg1); + d->req.nr_arg1 = new_flags & NM_OPEN_ARG1 ? + parent->req.nr_arg1 : 4; + if (new_flags & NM_OPEN_ARG2) + D("overriding ARG2 %d", parent->req.nr_arg2); + d->req.nr_arg2 = new_flags & NM_OPEN_ARG2 ? 
+ parent->req.nr_arg2 : 0; + if (new_flags & NM_OPEN_ARG3) + D("overriding ARG3 %d", parent->req.nr_arg3); + d->req.nr_arg3 = new_flags & NM_OPEN_ARG3 ? + parent->req.nr_arg3 : 0; + if (new_flags & NM_OPEN_RING_CFG) { + D("overriding RING_CFG"); + d->req.nr_tx_slots = parent->req.nr_tx_slots; + d->req.nr_rx_slots = parent->req.nr_rx_slots; + d->req.nr_tx_rings = parent->req.nr_tx_rings; + d->req.nr_rx_rings = parent->req.nr_rx_rings; + } + if (new_flags & NM_OPEN_IFNAME) { + D("overriding ifname %s ringid 0x%x flags 0x%x", + parent->req.nr_name, parent->req.nr_ringid, + parent->req.nr_flags); + memcpy(d->req.nr_name, parent->req.nr_name, + sizeof(d->req.nr_name)); + d->req.nr_ringid = parent->req.nr_ringid; + d->req.nr_flags = parent->req.nr_flags; + } + } if (ioctl(d->fd, NIOCREGIF, &d->req)) { + errmsg = "NIOCREGIF failed"; goto fail; } - d->memsize = d->req.nr_memsize; - d->mem = mmap(0, d->memsize, PROT_WRITE | PROT_READ, MAP_SHARED, - d->fd, 0); - if (d->mem == NULL) - goto fail; - d->nifp = NETMAP_IF(d->mem, d->req.nr_offset); - if (d->req.nr_ringid & NETMAP_SW_RING) { + if (IS_NETMAP_DESC(parent) && parent->mem && + parent->req.nr_arg2 == d->req.nr_arg2) { + /* do not mmap, inherit from parent */ + d->memsize = parent->memsize; + d->mem = parent->mem; + } else { + d->memsize = d->req.nr_memsize; + d->mem = mmap(0, d->memsize, PROT_WRITE | PROT_READ, MAP_SHARED, + d->fd, 0); + if (d->mem == NULL) { + errmsg = "mmap failed"; + goto fail; + } + d->done_mmap = 1; + } + { + struct netmap_if *nifp = NETMAP_IF(d->mem, d->req.nr_offset); + struct netmap_ring *r = NETMAP_RXRING(nifp, ); + + *(struct netmap_if **)(uintptr_t)&(d->nifp) = nifp; + *(struct netmap_ring **)(uintptr_t)&d->some_ring = r; + *(void **)(uintptr_t)&d->buf_start = NETMAP_BUF(r, 0); + *(void **)(uintptr_t)&d->buf_end = + (char *)d->mem + d->memsize; + } + + if (nr_flags == NR_REG_SW) { /* host stack */ d->first_tx_ring = d->last_tx_ring = d->req.nr_tx_rings; d->first_rx_ring = d->last_rx_ring = 
d->req.nr_rx_rings; - } else if (d->req.nr_ringid & NETMAP_HW_RING) { - /* XXX check validity */ - d->first_tx_ring = d->last_tx_ring = - d->first_rx_ring = d->last_rx_ring = - d->req.nr_ringid & NETMAP_RING_MASK; - } else { - d->first_tx_ring = d->last_rx_ring = 0; + } else if (nr_flags == NR_REG_ALL_NIC) { /* only nic */ + d->first_tx_ring = 0; + d->first_rx_ring = 0; d->last_tx_ring = d->req.nr_tx_rings - 1; d->last_rx_ring = d->req.nr_rx_rings - 1; + } else if (nr_flags == NR_REG_NIC_SW) { + d->first_tx_ring = 0; + d->first_rx_ring = 0; + d->last_tx_ring = d->req.nr_tx_rings; + d->last_rx_ring = d->req.nr_rx_rings; + } else if (nr_flags == NR_REG_ONE_NIC) { + /* XXX check validity */ + d->first_tx_ring = d->last_tx_ring = + d->first_rx_ring = d->last_rx_ring = nr_ringid; + } else { /* pipes */ + d->first_tx_ring = d->last_tx_ring = 0; + d->first_rx_ring = d->last_rx_ring = 0; } - d->tx = NETMAP_TXRING(d->nifp, 0); - d->rx = NETMAP_RXRING(d->nifp, 0); + +#ifdef DEBUG_NETMAP_USER + { /* debugging code */ + int i; + + D("%s tx %d .. %d %d rx %d .. 
%d %d", ifname, + d->first_tx_ring, d->last_tx_ring, d->req.nr_tx_rings, + d->first_rx_ring, d->last_rx_ring, d->req.nr_rx_rings); + for (i = 0; i <= d->req.nr_tx_rings; i++) { + struct netmap_ring *r = NETMAP_TXRING(d->nifp, i); + D("TX%d %p h %d c %d t %d", i, r, r->head, r->cur, r->tail); + } + for (i = 0; i <= d->req.nr_rx_rings; i++) { + struct netmap_ring *r = NETMAP_RXRING(d->nifp, i); + D("RX%d %p h %d c %d t %d", i, r, r->head, r->cur, r->tail); + } + } +#endif /* debugging */ + d->cur_tx_ring = d->first_tx_ring; d->cur_rx_ring = d->first_rx_ring; - for (n = d->first_tx_ring; n <= d->last_tx_ring; n++) { - d->tx[n].flags |= ring_flags; - } - for (n = d->first_rx_ring; n <= d->last_rx_ring; n++) { - d->rx[n].flags |= ring_flags; - } return d; fail: nm_close(d); + if (errmsg) + D("%s %s", errmsg, ifname); errno = EINVAL; return NULL; } static int -nm_close(struct nm_desc_t *d) +nm_close(struct nm_desc *d) { /* * ugly trick to avoid unused warnings @@ -375,7 +557,7 @@ nm_close(struct nm_desc_t *d) if (d == NULL || d->self != d) return EINVAL; - if (d->mem) + if (d->done_mmap && d->mem) munmap(d->mem, d->memsize); if (d->fd != -1) close(d->fd); @@ -389,7 +571,7 @@ nm_close(struct nm_desc_t *d) * Same prototype as pcap_inject(), only need to cast. */ static int -nm_inject(struct nm_desc_t *d, const void *buf, size_t size) +nm_inject(struct nm_desc *d, const void *buf, size_t size) { u_int c, n = d->last_tx_ring - d->first_tx_ring + 1; @@ -408,7 +590,7 @@ nm_inject(struct nm_desc_t *d, const void *buf, size_t size) i = ring->cur; idx = ring->slot[i].buf_idx; ring->slot[i].len = size; - pkt_copy(buf, NETMAP_BUF(ring, idx), size); + nm_pkt_copy(buf, NETMAP_BUF(ring, idx), size); d->cur_tx_ring = ri; ring->head = ring->cur = nm_ring_next(ring, i); return size; @@ -421,7 +603,7 @@ nm_inject(struct nm_desc_t *d, const void *buf, size_t size) * Same prototype as pcap_dispatch(), only need to cast. 
*/ static int -nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg) +nm_dispatch(struct nm_desc *d, int cnt, nm_cb_t cb, u_char *arg) { int n = d->last_rx_ring - d->first_rx_ring + 1; int c, got = 0, ri = d->cur_rx_ring; @@ -457,7 +639,7 @@ nm_dispatch(struct nm_desc_t *d, int cnt, nm_cb_t cb, u_char *arg) } static u_char * -nm_nextpkt(struct nm_desc_t *d, struct nm_hdr_t *hdr) +nm_nextpkt(struct nm_desc *d, struct nm_pkthdr *hdr) { int ri = d->cur_rx_ring; diff --git a/tools/tools/netmap/Makefile b/tools/tools/netmap/Makefile index e873389c717..c50247366b5 100644 --- a/tools/tools/netmap/Makefile +++ b/tools/tools/netmap/Makefile @@ -3,11 +3,11 @@ # # For multiple programs using a single source file each, # we can just define 'progs' and create custom targets. -PROGS = pkt-gen bridge vale-ctl testpcap libnetmap.so +PROGS = pkt-gen bridge vale-ctl -CLEANFILES = $(PROGS) pcap.o nm_util.o +CLEANFILES = $(PROGS) *.o NO_MAN= -CFLAGS += -Werror -Wall -nostdinc -I/usr/include -I../../../sys +CFLAGS += -Werror -Wall # -nostdinc -I/usr/include -I../../../sys CFLAGS += -Wextra LDFLAGS += -lpthread @@ -22,12 +22,11 @@ LDFLAGS += -lpcap all: $(PROGS) -pkt-gen bridge: nm_util.o - $(CC) $(CFLAGS) -o ${.TARGET} ${.TARGET:=.c} nm_util.o $(LDFLAGS) +pkt-gen: pkt-gen.o + $(CC) $(CFLAGS) -o pkt-gen pkt-gen.o $(LDFLAGS) -testpcap: pcap.c libnetmap.so - $(CC) $(CFLAGS) -DTEST -L. 
-lnetmap -o ${.TARGET} pcap.c - -libnetmap.so: pcap.c nm_util.c - $(CC) $(CFLAGS) -fpic -c ${.ALLSRC} - $(CC) -shared -o ${.TARGET} ${.ALLSRC:.c=.o} +bridge: bridge.o + $(CC) $(CFLAGS) -o bridge bridge.o + +vale-ctl: vale-ctl.o + $(CC) $(CFLAGS) -o vale-ctl vale-ctl.o diff --git a/tools/tools/netmap/README b/tools/tools/netmap/README index 2bde6f2ab4d..40378e62bbe 100644 --- a/tools/tools/netmap/README +++ b/tools/tools/netmap/README @@ -6,19 +6,4 @@ This directory contains examples that use netmap bridge a two-port jumper wire, also using the native API - testpcap a jumper wire using libnetmap (or libpcap) - - click* various click examples - ------------------------------------------------------------- -Some performance data as of may 2012 for applications using libpcap. -Throughput is generally in Mpps computed with the 64-byte frames, -using 1 core on a 2.9GHz CPU and 10Gbit/s interface - -Libpcap version -- Application --------------------- -BSD netmap ---------------------------------------------------- - 0.77 3.82 ports/trafshow (version 5) - 0.94 7.7 net-mgmt/ipcad (ip accounting daemon) - 0.9 5.0 net-mgmt/darkstat (ip accounting + graphing) - 0.83 2.45 net-mgmt/iftop (curses traffic display) + vale-ctl the program to control VALE bridges diff --git a/tools/tools/netmap/bridge.c b/tools/tools/netmap/bridge.c index cab545bfc91..0895d4ede67 100644 --- a/tools/tools/netmap/bridge.c +++ b/tools/tools/netmap/bridge.c @@ -9,14 +9,15 @@ * $FreeBSD$ */ -#include "nm_util.h" - +#include +#define NETMAP_WITH_LIBS +#include +#include int verbose = 0; -char *version = "$Id$"; - static int do_abort = 0; +static int zerocopy = 1; /* enable zerocopy if possible */ static void sigint_h(int sig) @@ -27,6 +28,26 @@ sigint_h(int sig) } +/* + * how many packets on this set of queues ? 
+ */ +int +pkt_queued(struct nm_desc *d, int tx) +{ + u_int i, tot = 0; + + if (tx) { + for (i = d->first_tx_ring; i <= d->last_tx_ring; i++) { + tot += nm_ring_space(NETMAP_TXRING(d->nifp, i)); + } + } else { + for (i = d->first_rx_ring; i <= d->last_rx_ring; i++) { + tot += nm_ring_space(NETMAP_RXRING(d->nifp, i)); + } + } + return tot; +} + /* * move up to 'limit' pkts from rxring to txring swapping buffers. */ @@ -52,12 +73,6 @@ process_rings(struct netmap_ring *rxring, struct netmap_ring *txring, while (limit-- > 0) { struct netmap_slot *rs = &rxring->slot[j]; struct netmap_slot *ts = &txring->slot[k]; -#ifdef NO_SWAP - char *rxbuf = NETMAP_BUF(rxring, rs->buf_idx); - char *txbuf = NETMAP_BUF(txring, ts->buf_idx); -#else - uint32_t pkt; -#endif /* swap packets */ if (ts->buf_idx < 2 || rs->buf_idx < 2) { @@ -65,24 +80,26 @@ process_rings(struct netmap_ring *rxring, struct netmap_ring *txring, j, rs->buf_idx, k, ts->buf_idx); sleep(2); } -#ifndef NO_SWAP - pkt = ts->buf_idx; - ts->buf_idx = rs->buf_idx; - rs->buf_idx = pkt; -#endif /* copy the packet length. */ - if (rs->len < 14 || rs->len > 2048) + if (rs->len > 2048) { D("wrong len %d rx[%d] -> tx[%d]", rs->len, j, k); - else if (verbose > 1) + rs->len = 0; + } else if (verbose > 1) { D("%s send len %d rx[%d] -> tx[%d]", msg, rs->len, j, k); + } ts->len = rs->len; -#ifdef NO_SWAP - pkt_copy(rxbuf, txbuf, ts->len); -#else - /* report the buffer change. */ - ts->flags |= NS_BUF_CHANGED; - rs->flags |= NS_BUF_CHANGED; -#endif /* NO_SWAP */ + if (zerocopy) { + uint32_t pkt = ts->buf_idx; + ts->buf_idx = rs->buf_idx; + rs->buf_idx = pkt; + /* report the buffer change. 
*/ + ts->flags |= NS_BUF_CHANGED; + rs->flags |= NS_BUF_CHANGED; + } else { + char *rxbuf = NETMAP_BUF(rxring, rs->buf_idx); + char *txbuf = NETMAP_BUF(txring, ts->buf_idx); + nm_pkt_copy(rxbuf, txbuf, ts->len); + } j = nm_ring_next(rxring, j); k = nm_ring_next(txring, k); } @@ -96,7 +113,7 @@ process_rings(struct netmap_ring *rxring, struct netmap_ring *txring, /* move packts from src to destination */ static int -move(struct nm_desc_t *src, struct nm_desc_t *dst, u_int limit) +move(struct nm_desc *src, struct nm_desc *dst, u_int limit) { struct netmap_ring *txring, *rxring; u_int m = 0, si = src->first_rx_ring, di = dst->first_tx_ring; @@ -104,8 +121,8 @@ move(struct nm_desc_t *src, struct nm_desc_t *dst, u_int limit) "host->net" : "net->host"; while (si <= src->last_rx_ring && di <= dst->last_tx_ring) { - rxring = src->tx + si; - txring = dst->tx + di; + rxring = NETMAP_RXRING(src->nifp, si); + txring = NETMAP_TXRING(dst->nifp, di); ND("txring %p rxring %p", txring, rxring); if (nm_ring_empty(rxring)) { si++; @@ -141,15 +158,16 @@ int main(int argc, char **argv) { struct pollfd pollfd[2]; - int i, ch; + int ch; u_int burst = 1024, wait_link = 4; - struct nm_desc_t *pa = NULL, *pb = NULL; + struct nm_desc *pa = NULL, *pb = NULL; char *ifa = NULL, *ifb = NULL; + char ifabuf[64] = { 0 }; - fprintf(stderr, "%s %s built %s %s\n", - argv[0], version, __DATE__, __TIME__); + fprintf(stderr, "%s built %s %s\n", + argv[0], __DATE__, __TIME__); - while ( (ch = getopt(argc, argv, "b:i:vw:")) != -1) { + while ( (ch = getopt(argc, argv, "b:ci:vw:")) != -1) { switch (ch) { default: D("bad option %c %s", ch, optarg); @@ -167,6 +185,9 @@ main(int argc, char **argv) D("%s ignored, already have 2 interfaces", optarg); break; + case 'c': + zerocopy = 0; /* do not zerocopy */ + break; case 'v': verbose++; break; @@ -202,20 +223,25 @@ main(int argc, char **argv) } if (!strcmp(ifa, ifb)) { D("same interface, endpoint 0 goes to host"); - i = NETMAP_SW_RING; + snprintf(ifabuf, 
sizeof(ifabuf) - 1, "%s^", ifa); + ifa = ifabuf; } else { /* two different interfaces. Take all rings on if1 */ - i = 0; // all hw rings } - pa = netmap_open(ifa, i, 1); - if (pa == NULL) + pa = nm_open(ifa, NULL, 0, NULL); + if (pa == NULL) { + D("cannot open %s", ifa); return (1); + } // XXX use a single mmap ? - pb = netmap_open(ifb, 0, 1); + pb = nm_open(ifb, NULL, NM_OPEN_NO_MMAP, pa); if (pb == NULL) { + D("cannot open %s", ifb); nm_close(pa); return (1); } + zerocopy = zerocopy && (pa->mem == pb->mem); + D("------- zerocopy %ssupported", zerocopy ? "" : "NOT "); /* setup poll(2) variables. */ memset(pollfd, 0, sizeof(pollfd)); @@ -252,23 +278,25 @@ main(int argc, char **argv) pollfd[0].events, pollfd[0].revents, pkt_queued(pa, 0), - pa->rx->cur, + NETMAP_RXRING(pa->nifp, pa->cur_rx_ring)->cur, pkt_queued(pa, 1), pollfd[1].events, pollfd[1].revents, pkt_queued(pb, 0), - pb->rx->cur, + NETMAP_RXRING(pb->nifp, pb->cur_rx_ring)->cur, pkt_queued(pb, 1) ); if (ret < 0) continue; if (pollfd[0].revents & POLLERR) { - D("error on fd0, rx [%d,%d)", - pa->rx->cur, pa->rx->tail); + struct netmap_ring *rx = NETMAP_RXRING(pa->nifp, pa->cur_rx_ring); + D("error on fd0, rx [%d,%d,%d)", + rx->head, rx->cur, rx->tail); } if (pollfd[1].revents & POLLERR) { - D("error on fd1, rx [%d,%d)", - pb->rx->cur, pb->rx->tail); + struct netmap_ring *rx = NETMAP_RXRING(pb->nifp, pb->cur_rx_ring); + D("error on fd1, rx [%d,%d,%d)", + rx->head, rx->cur, rx->tail); } if (pollfd[0].revents & POLLOUT) { move(pb, pa, burst); diff --git a/tools/tools/netmap/click-test.cfg b/tools/tools/netmap/click-test.cfg deleted file mode 100644 index fc5759f88b1..00000000000 --- a/tools/tools/netmap/click-test.cfg +++ /dev/null @@ -1,19 +0,0 @@ -// -// $FreeBSD$ -// -// A sample test configuration for click -// -// -// create a switch - -myswitch :: EtherSwitch; - -// two input devices - -c0 :: FromDevice(ix0, PROMISC true); -c1 :: FromDevice(ix1, PROMISC true); - -// and now pass packets around - -c0[0] -> 
[0]sw[0] -> Queue(10000) -> ToDevice(ix0); -c1[0] -> [1]sw[1] -> Queue(10000) -> ToDevice(ix1); diff --git a/tools/tools/netmap/nm_util.c b/tools/tools/netmap/nm_util.c deleted file mode 100644 index deb52bbc87e..00000000000 --- a/tools/tools/netmap/nm_util.c +++ /dev/null @@ -1,278 +0,0 @@ -/* - * Copyright (C) 2012-2014 Luigi Rizzo. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * $FreeBSD$ - * $Id$ - * - * utilities to use netmap devices. 
- * This does the basic functions of opening a device and issuing - * ioctls() - */ - -#include "nm_util.h" - -extern int verbose; - -int -nm_do_ioctl(struct nm_desc_t *me, u_long what, int subcmd) -{ - struct ifreq ifr; - int error; - int fd; - -#if defined( __FreeBSD__ ) || defined (__APPLE__) - (void)subcmd; // only used on Linux - fd = me->fd; -#endif - -#ifdef linux - struct ethtool_value eval; - - bzero(&eval, sizeof(eval)); - fd = socket(AF_INET, SOCK_DGRAM, 0); - if (fd < 0) { - printf("Error: cannot get device control socket.\n"); - return -1; - } -#endif /* linux */ - - bzero(&ifr, sizeof(ifr)); - strncpy(ifr.ifr_name, me->req.nr_name, sizeof(ifr.ifr_name)); - switch (what) { - case SIOCSIFFLAGS: -#ifndef __APPLE__ - ifr.ifr_flagshigh = me->if_flags >> 16; -#endif - ifr.ifr_flags = me->if_flags & 0xffff; - break; - -#if defined( __FreeBSD__ ) - case SIOCSIFCAP: - ifr.ifr_reqcap = me->if_reqcap; - ifr.ifr_curcap = me->if_curcap; - break; -#endif - -#ifdef linux - case SIOCETHTOOL: - eval.cmd = subcmd; - eval.data = 0; - ifr.ifr_data = (caddr_t)&eval; - break; -#endif /* linux */ - } - error = ioctl(fd, what, &ifr); - if (error) - goto done; - switch (what) { - case SIOCGIFFLAGS: -#ifndef __APPLE__ - me->if_flags = (ifr.ifr_flagshigh << 16) | - (0xffff & ifr.ifr_flags); -#endif - if (verbose) - D("flags are 0x%x", me->if_flags); - break; - -#if defined( __FreeBSD__ ) - case SIOCGIFCAP: - me->if_reqcap = ifr.ifr_reqcap; - me->if_curcap = ifr.ifr_curcap; - if (verbose) - D("curcap are 0x%x", me->if_curcap); - break; -#endif /* __FreeBSD__ */ - } -done: -#ifdef linux - close(fd); -#endif - if (error) - D("ioctl error %d %lu", error, what); - return error; -} - -/* - * open a device. if me->mem is null then do an mmap. - * Returns the file descriptor. - * The extra flag checks configures promisc mode. 
- */ -struct nm_desc_t * -netmap_open(const char *name, int ringid, int promisc) -{ - struct nm_desc_t *d = nm_open(name, NULL, ringid, 0); - - if (d == NULL) - return d; - - if (verbose) - D("memsize is %d MB", d->req.nr_memsize>>20); - - /* Set the operating mode. */ - if (ringid != NETMAP_SW_RING) { - nm_do_ioctl(d, SIOCGIFFLAGS, 0); - if ((d->if_flags & IFF_UP) == 0) { - D("%s is down, bringing up...", name); - d->if_flags |= IFF_UP; - } - if (promisc) { - d->if_flags |= IFF_PPROMISC; - nm_do_ioctl(d, SIOCSIFFLAGS, 0); - } - - /* disable GSO, TSO, RXCSUM, TXCSUM... - * TODO: set them back when done. - */ -#ifdef __FreeBSD__ - nm_do_ioctl(d, SIOCGIFCAP, 0); - d->if_reqcap = d->if_curcap; - d->if_reqcap &= ~(IFCAP_HWCSUM | IFCAP_TSO | IFCAP_TOE); - nm_do_ioctl(d, SIOCSIFCAP, 0); -#endif -#ifdef linux - nm_do_ioctl(d, SIOCETHTOOL, ETHTOOL_SGSO); - nm_do_ioctl(d, SIOCETHTOOL, ETHTOOL_STSO); - nm_do_ioctl(d, SIOCETHTOOL, ETHTOOL_SRXCSUM); - nm_do_ioctl(d, SIOCETHTOOL, ETHTOOL_STXCSUM); -#endif /* linux */ - } - - return d; -} - - -/* - * how many packets on this set of queues ? - */ -int -pkt_queued(struct nm_desc_t *d, int tx) -{ - u_int i, tot = 0; - - ND("me %p begin %d end %d", me, me->begin, me->end); - if (tx) { - for (i = d->first_tx_ring; i <= d->last_tx_ring; i++) - tot += nm_ring_space(d->tx + i); - } else { - for (i = d->first_rx_ring; i <= d->last_rx_ring; i++) - tot += nm_ring_space(d->rx + i); - } - return tot; -} - -#if 0 - -/* - * - -Helper routines for multiple readers from the same queue - -- all readers open the device in 'passive' mode (NETMAP_PRIV_RING set). - In this mode a thread that loses the race on a poll() just continues - without calling *xsync() - -- all readers share an extra 'ring' which contains the sync information. 
- In particular we have a shared head+tail pointers that work - together with cur and available - ON RETURN FROM THE SYSCALL: - shadow->cur = ring->cur - shadow->tail = ring->tail - shadow->link[i] = i for all slots // mark invalid - - */ - -struct nm_q_arg { - u_int want; /* Input */ - u_int have; /* Output, 0 on error */ - u_int cur; - u_int tail; - struct netmap_ring *ring; -}; - -/* - * grab a number of slots from the queue. - */ -struct nm_q_arg -my_grab(struct nm_q_arg q) -{ - const u_int ns = q.ring->num_slots; - - // lock(ring); - for (;;) { - - q.cur = (volatile u_int)q.ring->head; - q.have = ns + q.head - (volatile u_int)q.ring->tail; - if (q.have >= ns) - q.have -= ns; - if (q.have == 0) /* no space; caller may ioctl/retry */ - break; - if (q.want < q.have) - q.have = q.want; - q.tail = q.cur + q.have; - if (q.tail >= ns) - q.tail -= ns; - if (atomic_cmpset_int(&q.ring->cur, q.cur, q.tail) - break; /* success */ - } - // unlock(ring); - D("returns %d out of %d at %d,%d", - q.have, q.want, q.cur, q.tail); - /* the last one can clear avail ? */ - return q; -} - - -int -my_release(struct nm_q_arg q) -{ - u_int cur = q.cur, tail = q.tail, i; - struct netmap_ring *r = q.ring; - - /* link the block to the next one. - * there is no race here because the location is mine. - */ - r->slot[cur].ptr = tail; /* this is mine */ - r->slot[cur].flags |= NM_SLOT_PTR; // points to next block - // memory barrier - // lock(ring); - if (r->head != cur) - goto done; - for (;;) { - // advance head - r->head = head = r->slot[head].ptr; - // barrier ? - if (head == r->slot[head].ptr) - break; // stop here - } - /* we have advanced from q.head to head (r.head might be - * further down. - */ - // do an ioctl/poll to flush. 
-done: - // unlock(ring); - return; /* not my turn to release */ -} -#endif /* unused */ diff --git a/tools/tools/netmap/nm_util.h b/tools/tools/netmap/nm_util.h deleted file mode 100644 index 0ab2e2e8198..00000000000 --- a/tools/tools/netmap/nm_util.h +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (C) 2012-2014 Luigi Rizzo. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * $FreeBSD$ - * $Id$ - * - * Some utilities to build netmap-based programs. 
- */ - -#ifndef _NM_UTIL_H -#define _NM_UTIL_H - -#define _GNU_SOURCE /* for CPU_SET() */ - -#include /* fprintf */ -#include /* POLLIN */ -#include /* PRI* macros */ -#include /* u_char */ - -#include /* ntohs */ -#include /* sysctl */ -#include /* getifaddrs */ -#include /* ETHERTYPE_IP */ -#include /* IPPROTO_* */ -#include /* struct ip */ -#include /* struct udp */ - - -#define NETMAP_WITH_LIBS -#include - -#include /* pthread_* */ - -#ifdef linux - -#define cpuset_t cpu_set_t - -#define ifr_flagshigh ifr_flags /* only the low 16 bits here */ -#define IFF_PPROMISC IFF_PROMISC /* IFF_PPROMISC does not exist */ -#include -#include - -#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME -#include /* ether_aton */ -#include /* sockaddr_ll */ -#endif /* linux */ - -#ifdef __FreeBSD__ -#include /* le64toh */ -#include - -#include /* pthread w/ affinity */ -#include /* cpu_set */ -#include /* LLADDR */ -#endif /* __FreeBSD__ */ - -#ifdef __APPLE__ - -#define cpuset_t uint64_t // XXX -static inline void CPU_ZERO(cpuset_t *p) -{ - *p = 0; -} - -static inline void CPU_SET(uint32_t i, cpuset_t *p) -{ - *p |= 1<< (i & 0x3f); -} - -#define pthread_setaffinity_np(a, b, c) ((void)a, 0) - -#define ifr_flagshigh ifr_flags // XXX -#define IFF_PPROMISC IFF_PROMISC -#include /* LLADDR */ -#define clock_gettime(a,b) \ - do {struct timespec t0 = {0,0}; *(b) = t0; } while (0) -#endif /* __APPLE__ */ - -static inline int min(int a, int b) { return a < b ? a : b; } -extern int time_second; - -/* debug support */ -#define ND(format, ...) do {} while(0) -#define D(format, ...) \ - fprintf(stderr, "%s [%d] " format "\n", \ - __FUNCTION__, __LINE__, ##__VA_ARGS__) - -#define RD(lps, format, ...) 
\ - do { \ - static int t0, cnt; \ - if (t0 != time_second) { \ - t0 = time_second; \ - cnt = 0; \ - } \ - if (cnt++ < lps) \ - D(format, ##__VA_ARGS__); \ - } while (0) - - - -struct nm_desc_t * netmap_open(const char *name, int ringid, int promisc); -int nm_do_ioctl(struct nm_desc_t *me, u_long what, int subcmd); -int pkt_queued(struct nm_desc_t *d, int tx); -#endif /* _NM_UTIL_H */ diff --git a/tools/tools/netmap/pcap.c b/tools/tools/netmap/pcap.c deleted file mode 100644 index b3c2be5d23f..00000000000 --- a/tools/tools/netmap/pcap.c +++ /dev/null @@ -1,528 +0,0 @@ -/* - * (C) 2011-2014 Luigi Rizzo - * - * BSD license - * - * A simple library that maps some pcap functions onto netmap - * This is not 100% complete but enough to let tcpdump, trafshow - * and other apps work. - * - * $FreeBSD$ - */ - -#define MY_PCAP -#include "nm_util.h" - -char *version = "$Id$"; -int verbose = 0; - -/* - * We redefine here a number of structures that are in pcap.h - * so we can compile this file without the system header. - */ -#ifndef PCAP_ERRBUF_SIZE -#define PCAP_ERRBUF_SIZE 128 -/* - * Each packet is accompanied by a header including the timestamp, - * captured size and actual size. - */ -struct pcap_pkthdr { - struct timeval ts; /* time stamp */ - uint32_t caplen; /* length of portion present */ - uint32_t len; /* length this packet (off wire) */ -}; - -typedef struct pcap_if pcap_if_t; - -/* - * Representation of an interface address. - */ -struct pcap_addr { - struct pcap_addr *next; - struct sockaddr *addr; /* address */ - struct sockaddr *netmask; /* netmask for the above */ - struct sockaddr *broadaddr; /* broadcast addr for the above */ - struct sockaddr *dstaddr; /* P2P dest. 
address for the above */ -}; - -struct pcap_if { - struct pcap_if *next; - char *name; /* name to hand to "pcap_open_live()" */ - char *description; /* textual description of interface, or NULL */ - struct pcap_addr *addresses; - uint32_t flags; /* PCAP_IF_ interface flags */ -}; - -/* - * We do not support stats (yet) - */ -struct pcap_stat { - u_int ps_recv; /* number of packets received */ - u_int ps_drop; /* number of packets dropped */ - u_int ps_ifdrop; /* drops by interface XXX not yet supported */ -#ifdef WIN32 - u_int bs_capt; /* number of packets that reach the app. */ -#endif /* WIN32 */ -}; - -typedef struct nm_desc_t pcap_t; -typedef enum { - PCAP_D_INOUT = 0, - PCAP_D_IN, - PCAP_D_OUT -} pcap_direction_t; - - - -typedef void (*pcap_handler)(u_char *user, - const struct pcap_pkthdr *h, const u_char *bytes); - -char errbuf[PCAP_ERRBUF_SIZE]; - -pcap_t *pcap_open_live(const char *device, int snaplen, - int promisc, int to_ms, char *errbuf); - -int pcap_findalldevs(pcap_if_t **alldevsp, char *errbuf); -void pcap_close(pcap_t *p); -int pcap_get_selectable_fd(pcap_t *p); -int pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user); -int pcap_setnonblock(pcap_t *p, int nonblock, char *errbuf); -int pcap_setdirection(pcap_t *p, pcap_direction_t d); -char *pcap_lookupdev(char *errbuf); -int pcap_inject(pcap_t *p, const void *buf, size_t size); -int pcap_fileno(pcap_t *p); -const char *pcap_lib_version(void); - - -struct eproto { - const char *s; - u_short p; -}; -#endif /* !PCAP_ERRBUF_SIZE */ - -#ifndef TEST -/* - * build as a shared library - */ - -char pcap_version[] = "libnetmap version 0.3"; - - -/* - * There is a set of functions that tcpdump expects even if probably - * not used - */ -struct eproto eproto_db[] = { - { "ip", ETHERTYPE_IP }, - { "arp", ETHERTYPE_ARP }, - { (char *)0, 0 } -}; - - -const char *pcap_lib_version(void) -{ - return pcap_version; -} - -int -pcap_findalldevs(pcap_if_t **alldevsp, char *errbuf) -{ - pcap_if_t *top = 
NULL; -#ifndef linux - struct ifaddrs *i_head, *i; - pcap_if_t *cur; - struct pcap_addr *tail = NULL; - int l; - - D("listing all devs"); - *alldevsp = NULL; - i_head = NULL; - - if (getifaddrs(&i_head)) { - D("cannot get if addresses"); - return -1; - } - for (i = i_head; i; i = i->ifa_next) { - //struct ifaddrs *ifa; - struct pcap_addr *pca; - //struct sockaddr *sa; - - D("got interface %s", i->ifa_name); - if (!top || strcmp(top->name, i->ifa_name)) { - /* new interface */ - l = sizeof(*top) + strlen(i->ifa_name) + 1; - cur = calloc(1, l); - if (cur == NULL) { - D("no space for if descriptor"); - continue; - } - cur->name = (char *)(cur + 1); - //cur->flags = i->ifa_flags; - strcpy(cur->name, i->ifa_name); - cur->description = NULL; - cur->next = top; - top = cur; - tail = NULL; - } - /* now deal with addresses */ - D("%s addr family %d len %d %s %s", - top->name, - i->ifa_addr->sa_family, i->ifa_addr->sa_len, - i->ifa_netmask ? "Netmask" : "", - i->ifa_broadaddr ? "Broadcast" : ""); - l = sizeof(struct pcap_addr) + - (i->ifa_addr ? i->ifa_addr->sa_len:0) + - (i->ifa_netmask ? i->ifa_netmask->sa_len:0) + - (i->ifa_broadaddr? 
i->ifa_broadaddr->sa_len:0); - pca = calloc(1, l); - if (pca == NULL) { - D("no space for if addr"); - continue; - } -#define SA_NEXT(x) ((struct sockaddr *)((char *)(x) + (x)->sa_len)) - pca->addr = (struct sockaddr *)(pca + 1); - pkt_copy(i->ifa_addr, pca->addr, i->ifa_addr->sa_len); - if (i->ifa_netmask) { - pca->netmask = SA_NEXT(pca->addr); - bcopy(i->ifa_netmask, pca->netmask, i->ifa_netmask->sa_len); - if (i->ifa_broadaddr) { - pca->broadaddr = SA_NEXT(pca->netmask); - bcopy(i->ifa_broadaddr, pca->broadaddr, i->ifa_broadaddr->sa_len); - } - } - if (tail == NULL) { - top->addresses = pca; - } else { - tail->next = pca; - } - tail = pca; - - } - freeifaddrs(i_head); -#endif /* !linux */ - (void)errbuf; /* UNUSED */ - *alldevsp = top; - return 0; -} - -void pcap_freealldevs(pcap_if_t *alldevs) -{ - (void)alldevs; /* UNUSED */ - D("unimplemented"); -} - -char * -pcap_lookupdev(char *buf) -{ - D("%s", buf); - strcpy(buf, "/dev/netmap"); - return buf; -} - -pcap_t * -pcap_create(const char *source, char *errbuf) -{ - D("src %s (call open liveted)", source); - return pcap_open_live(source, 0, 1, 100, errbuf); -} - -int -pcap_activate(pcap_t *p) -{ - D("pcap %p running", p); - return 0; -} - -int -pcap_can_set_rfmon(pcap_t *p) -{ - (void)p; /* UNUSED */ - D(""); - return 0; /* no we can't */ -} - -int -pcap_set_snaplen(pcap_t *p, int snaplen) -{ - struct nm_desc_t *me = p; - - D("len %d", snaplen); - me->snaplen = snaplen; - return 0; -} - -int -pcap_snapshot(pcap_t *p) -{ - struct nm_desc_t *me = p; - - D("len %d", me->snaplen); - return me->snaplen; -} - -int -pcap_lookupnet(const char *device, uint32_t *netp, - uint32_t *maskp, char *errbuf) -{ - - (void)errbuf; /* UNUSED */ - D("device %s", device); - inet_aton("10.0.0.255", (struct in_addr *)netp); - inet_aton("255.255.255.0",(struct in_addr *) maskp); - return 0; -} - -int -pcap_set_promisc(pcap_t *p, int promisc) -{ - D("promisc %d", promisc); - if (nm_do_ioctl(p, SIOCGIFFLAGS, 0)) - D("SIOCGIFFLAGS failed"); 
- if (promisc) { - p->if_flags |= IFF_PPROMISC; - } else { - p->if_flags &= ~IFF_PPROMISC; - } - if (nm_do_ioctl(p, SIOCSIFFLAGS, 0)) - D("SIOCSIFFLAGS failed"); - return 0; -} - -int -pcap_set_timeout(pcap_t *p, int to_ms) -{ - D("%d ms", to_ms); - p->to_ms = to_ms; - return 0; -} - -struct bpf_program; - -int -pcap_compile(pcap_t *p, struct bpf_program *fp, - const char *str, int optimize, uint32_t netmask) -{ - (void)p; /* UNUSED */ - (void)fp; /* UNUSED */ - (void)optimize; /* UNUSED */ - (void)netmask; /* UNUSED */ - D("%s", str); - return 0; -} - -int -pcap_setfilter(pcap_t *p, struct bpf_program *fp) -{ - (void)p; /* UNUSED */ - (void)fp; /* UNUSED */ - D(""); - return 0; -} - -int -pcap_datalink(pcap_t *p) -{ - (void)p; /* UNUSED */ - D("returns 1"); - return 1; // ethernet -} - -const char * -pcap_datalink_val_to_name(int dlt) -{ - D("%d returns DLT_EN10MB", dlt); - return "DLT_EN10MB"; -} - -const char * -pcap_datalink_val_to_description(int dlt) -{ - D("%d returns Ethernet link", dlt); - return "Ethernet link"; -} - -struct pcap_stat; -int -pcap_stats(pcap_t *p, struct pcap_stat *ps) -{ - *ps = *(struct pcap_stat *)(void *)&(p->st); - return 0; /* accumulate from pcap_dispatch() */ -}; - -char * -pcap_geterr(pcap_t *p) -{ - D(""); - return p->msg; -} - -pcap_t * -pcap_open_live(const char *device, int snaplen, - int promisc, int to_ms, char *errbuf) -{ - struct nm_desc_t *d; - int l; - - if (!device) { - D("missing device name"); - return NULL; - } - - l = strlen(device) + 1; - D("request to open %s snaplen %d promisc %d timeout %dms", - device, snaplen, promisc, to_ms); - d = nm_open(device, NULL, 0, 0); - if (d == NULL) { - D("error opening %s", device); - return NULL; - } - d->to_ms = to_ms; - d->snaplen = snaplen; - d->errbuf = errbuf; - d->promisc = promisc; - - return d; -} - -void -pcap_close(pcap_t *p) -{ - nm_close(p); - /* restore original flags ? 
*/ -} - -int -pcap_fileno(pcap_t *p) -{ - struct nm_desc_t *d = p; - D("returns %d", d->fd); - return d->fd; -} - -int -pcap_get_selectable_fd(pcap_t *p) -{ - struct nm_desc_t *d = p; - - return d->fd; -} - -int -pcap_setnonblock(pcap_t *p, int nonblock, char *errbuf) -{ - (void)p; /* UNUSED */ - (void)errbuf; /* UNUSED */ - D("mode is %d", nonblock); - return 0; /* ignore */ -} - -int -pcap_setdirection(pcap_t *p, pcap_direction_t d) -{ - (void)p; /* UNUSED */ - (void)d; /* UNUSED */ - D(""); - return 0; /* ignore */ -}; - -int -pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user) -{ - return nm_dispatch(p, cnt, (void *)callback, user); -} - -int -pcap_inject(pcap_t *p, const void *buf, size_t size) -{ - return nm_inject(p, buf, size); -} - -int -pcap_loop(pcap_t *p, int cnt, pcap_handler callback, u_char *user) -{ - struct pollfd fds[1]; - int i; - - ND("cnt %d", cnt); - memset(fds, 0, sizeof(fds)); - fds[0].fd = p->fd; - fds[0].events = (POLLIN); - - while (cnt == -1 || cnt > 0) { - if (poll(fds, 1, p->to_ms) <= 0) { - D("poll error/timeout"); - continue; - } - i = nm_dispatch(p, cnt, (void *)callback, user); - if (cnt > 0) - cnt -= i; - } - return 0; -} - -#endif /* !TEST */ - -#ifdef TEST /* build test code */ -void do_send(u_char *user, const struct pcap_pkthdr *h, const u_char *buf) -{ - pcap_inject((pcap_t *)user, buf, h->caplen); -} - -/* - * a simple pcap test program, bridge between two interfaces. 
- */ -int -main(int argc, char **argv) -{ - pcap_t *p0, *p1; - int burst = 1024; - struct pollfd pollfd[2]; - - fprintf(stderr, "%s %s built %s %s\n", - argv[0], version, __DATE__, __TIME__); - - while (argc > 1 && !strcmp(argv[1], "-v")) { - verbose++; - argv++; - argc--; - } - - if (argc < 3 || argc > 4 || !strcmp(argv[1], argv[2])) { - D("Usage: %s IFNAME1 IFNAME2 [BURST]", argv[0]); - return (1); - } - if (argc > 3) - burst = atoi(argv[3]); - - p0 = pcap_open_live(argv[1], 0, 1, 100, NULL); - p1 = pcap_open_live(argv[2], 0, 1, 100, NULL); - D("%s", version); - D("open returns %p %p", p0, p1); - if (!p0 || !p1) - return(1); - bzero(pollfd, sizeof(pollfd)); - pollfd[0].fd = pcap_fileno(p0); - pollfd[1].fd = pcap_fileno(p1); - pollfd[0].events = pollfd[1].events = POLLIN; - for (;;) { - /* do i need to reset ? */ - pollfd[0].revents = pollfd[1].revents = 0; - int ret = poll(pollfd, 2, 1000); - if (ret <= 0 || verbose) - D("poll %s [0] ev %x %x [1] ev %x %x", - ret <= 0 ? "timeout" : "ok", - pollfd[0].events, - pollfd[0].revents, - pollfd[1].events, - pollfd[1].revents); - if (ret < 0) - continue; - if (pollfd[0].revents & POLLIN) - pcap_dispatch(p0, burst, do_send, (void *)p1); - if (pollfd[1].revents & POLLIN) - pcap_dispatch(p1, burst, do_send, (void *)p0); - } - - return (0); -} -#endif /* TEST */ diff --git a/tools/tools/netmap/pkt-gen.c b/tools/tools/netmap/pkt-gen.c index 3fb7702083f..8e78fa8e24e 100644 --- a/tools/tools/netmap/pkt-gen.c +++ b/tools/tools/netmap/pkt-gen.c @@ -37,26 +37,83 @@ * */ -#define MY_PCAP -#include "nm_util.h" -// #include +#define _GNU_SOURCE /* for CPU_SET() */ +#include +#define NETMAP_WITH_LIBS +#include + #include // isprint() +#include // sysconf() +#include +#include /* ntohs */ +#include /* sysctl */ +#include /* getifaddrs */ +#include +#include +#include +#include + +#include #ifndef NO_PCAP #include #endif + +#ifdef linux + +#define cpuset_t cpu_set_t + +#define ifr_flagshigh ifr_flags /* only the low 16 bits here */ 
+#define IFF_PPROMISC IFF_PROMISC /* IFF_PPROMISC does not exist */ +#include +#include + +#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME +#include /* ether_aton */ +#include /* sockaddr_ll */ +#endif /* linux */ + +#ifdef __FreeBSD__ +#include /* le64toh */ +#include + +#include /* pthread w/ affinity */ +#include /* cpu_set */ +#include /* LLADDR */ +#endif /* __FreeBSD__ */ + +#ifdef __APPLE__ + +#define cpuset_t uint64_t // XXX +static inline void CPU_ZERO(cpuset_t *p) +{ + *p = 0; +} + +static inline void CPU_SET(uint32_t i, cpuset_t *p) +{ + *p |= 1<< (i & 0x3f); +} + +#define pthread_setaffinity_np(a, b, c) ((void)a, 0) + +#define ifr_flagshigh ifr_flags // XXX +#define IFF_PPROMISC IFF_PROMISC +#include /* LLADDR */ +#define clock_gettime(a,b) \ + do {struct timespec t0 = {0,0}; *(b) = t0; } while (0) +#endif /* __APPLE__ */ + const char *default_payload="netmap pkt-gen DIRECT payload\n" "http://info.iet.unipi.it/~luigi/netmap/ "; const char *indirect_payload="netmap pkt-gen indirect payload\n" "http://info.iet.unipi.it/~luigi/netmap/ "; -int time_second; // support for RD() debugging macro - int verbose = 0; -#define SKIP_PAYLOAD 1 /* do not check payload. */ +#define SKIP_PAYLOAD 1 /* do not check payload. 
XXX unused */ #define VIRT_HDR_1 10 /* length of a base vnet-hdr */ @@ -85,6 +142,8 @@ struct mac_range { struct ether_addr start, end; }; +/* ifname can be netmap:foo-xxxx */ +#define MAX_IFNAMELEN 64 /* our buffer for ifname */ /* * global arguments for all threads */ @@ -119,15 +178,16 @@ struct glob_arg { int affinity; int main_fd; + struct nm_desc *nmd; + uint64_t nmd_flags; int report_interval; /* milliseconds between prints */ void *(*td_body)(void *); void *mmap_addr; - int mmap_size; - char *ifname; + char ifname[MAX_IFNAMELEN]; char *nmr_config; int dummy_send; int virt_header; /* send also the virt_header */ - int host_ring; + int extra_bufs; /* goes in nr_arg3 */ }; enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP }; @@ -142,9 +202,7 @@ struct targ { int completed; int cancel; int fd; - struct nmreq nmr; - struct netmap_if *nifp; - uint16_t qfirst, qlast; /* range of queues to scan */ + struct nm_desc *nmd; volatile uint64_t count; struct timespec tic, toc; int me; @@ -187,7 +245,7 @@ extract_ip_range(struct ip_range *r) pp = index(ap, ':'); if (pp) { *pp++ = '\0'; - if (*pp) + if (*pp) r->port1 = strtol(pp, NULL, 0); } if (*ap) { @@ -261,19 +319,17 @@ sigint_h(int sig) static int system_ncpus(void) { -#ifdef __FreeBSD__ - int mib[2], ncpus; - size_t len; - - mib[0] = CTL_HW; - mib[1] = HW_NCPU; - len = sizeof(mib); + int ncpus; +#if defined (__FreeBSD__) + int mib[2] = { CTL_HW, HW_NCPU }; + size_t len = sizeof(mib); sysctl(mib, 2, &ncpus, &len, NULL, 0); - +#elif defined(linux) + ncpus = sysconf(_SC_NPROCESSORS_ONLN); +#else /* others */ + ncpus = 1; +#endif /* others */ return (ncpus); -#else - return 1; -#endif /* !__FreeBSD__ */ } #ifdef __linux__ @@ -299,15 +355,17 @@ system_ncpus(void) /* * parse the vale configuration in conf and put it in nmr. + * Return the flag set if necessary. * The configuration may consist of 0 to 4 numbers separated * by commas: #tx-slots,#rx-slots,#tx-rings,#rx-rings. 
* Missing numbers or zeroes stand for default values. * As an additional convenience, if exactly one number * is specified, then this is assigned to both #tx-slots and #rx-slots. - * If there is no 4th number, then the 3rd is assigned to both #tx-rings + * If there is no 4th number, then the 3rd is assigned to both #tx-rings * and #rx-rings. */ -void parse_nmr_config(const char* conf, struct nmreq *nmr) +int +parse_nmr_config(const char* conf, struct nmreq *nmr) { char *w, *tok; int i, v; @@ -315,7 +373,7 @@ void parse_nmr_config(const char* conf, struct nmreq *nmr) nmr->nr_tx_rings = nmr->nr_rx_rings = 0; nmr->nr_tx_slots = nmr->nr_rx_slots = 0; if (conf == NULL || ! *conf) - return; + return 0; w = strdup(conf); for (i = 0, tok = strtok(w, ","); tok; i++, tok = strtok(NULL, ",")) { v = atoi(tok); @@ -341,6 +399,9 @@ void parse_nmr_config(const char* conf, struct nmreq *nmr) nmr->nr_tx_rings, nmr->nr_tx_slots, nmr->nr_rx_rings, nmr->nr_rx_slots); free(w); + return (nmr->nr_tx_rings || nmr->nr_tx_slots || + nmr->nr_rx_rings || nmr->nr_rx_slots) ? + NM_OPEN_RING_CFG : 0; } @@ -385,7 +446,6 @@ source_hwaddr(const char *ifname, char *buf) static int setaffinity(pthread_t me, int i) { -#if 1 // def __FreeBSD__ cpuset_t cpumask; if (i == -1) @@ -399,10 +459,6 @@ setaffinity(pthread_t me, int i) D("Unable to set affinity: %s", strerror(errno)); return 1; } -#else - (void)me; /* suppress 'unused' warnings */ - (void)i; -#endif /* __FreeBSD__ */ return 0; } @@ -449,7 +505,7 @@ dump_payload(char *p, int len, struct netmap_ring *ring, int cur) int i, j, i0; /* get the length in ASCII of the length of the packet. 
*/ - + printf("ring %p cur %5d [buf %6d flags 0x%04x len %5d]\n", ring, cur, ring->slot[cur].buf_idx, ring->slot[cur].flags, len); @@ -632,7 +688,7 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame, slot->flags |= NS_INDIRECT; slot->ptr = (uint64_t)frame; } else if (options & OPT_COPY) { - pkt_copy(frame, p, size); + nm_pkt_copy(frame, p, size); if (fcnt == nfrags) update_addresses(pkt, g); } else if (options & OPT_MEMCPY) { @@ -671,21 +727,19 @@ static void * pinger_body(void *data) { struct targ *targ = (struct targ *) data; - struct pollfd fds[1]; - struct netmap_if *nifp = targ->nifp; + struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; + struct netmap_if *nifp = targ->nmd->nifp; int i, rx = 0, n = targ->g->npackets; void *frame; int size; + uint32_t sent = 0; + struct timespec ts, now, last_print; + uint32_t count = 0, min = 1000000000, av = 0; frame = &targ->pkt; frame += sizeof(targ->pkt.vh) - targ->g->virt_header; size = targ->g->pkt_size + targ->g->virt_header; - fds[0].fd = targ->fd; - fds[0].events = (POLLIN); - static uint32_t sent; - struct timespec ts, now, last_print; - uint32_t count = 0, min = 1000000000, av = 0; if (targ->g->nthreads > 1) { D("can only ping with 1 thread"); @@ -706,7 +760,7 @@ pinger_body(void *data) if (nm_ring_empty(ring)) { D("-- ouch, cannot send"); } else { - pkt_copy(frame, p, size); + nm_pkt_copy(frame, p, size); clock_gettime(CLOCK_REALTIME_PRECISE, &ts); bcopy(&sent, p+42, sizeof(sent)); bcopy(&ts, p+46, sizeof(ts)); @@ -715,13 +769,14 @@ pinger_body(void *data) } } /* should use a parameter to decide how often to send */ - if (poll(fds, 1, 3000) <= 0) { + if (poll(&pfd, 1, 3000) <= 0) { D("poll error/timeout on queue %d: %s", targ->me, strerror(errno)); continue; } /* see what we got back */ - for (i = targ->qfirst; i < targ->qlast; i++) { + for (i = targ->nmd->first_tx_ring; + i <= targ->nmd->last_tx_ring; i++) { ring = NETMAP_RXRING(nifp, i); while (!nm_ring_empty(ring)) { uint32_t seq; @@ 
-775,12 +830,10 @@ static void * ponger_body(void *data) { struct targ *targ = (struct targ *) data; - struct pollfd fds[1]; - struct netmap_if *nifp = targ->nifp; + struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; + struct netmap_if *nifp = targ->nmd->nifp; struct netmap_ring *txring, *rxring; int i, rx = 0, sent = 0, n = targ->g->npackets; - fds[0].fd = targ->fd; - fds[0].events = (POLLIN); if (targ->g->nthreads > 1) { D("can only reply ping with 1 thread"); @@ -791,9 +844,9 @@ ponger_body(void *data) uint32_t txcur, txavail; //#define BUSYWAIT #ifdef BUSYWAIT - ioctl(fds[0].fd, NIOCRXSYNC, NULL); + ioctl(pfd.fd, NIOCRXSYNC, NULL); #else - if (poll(fds, 1, 1000) <= 0) { + if (poll(&pfd, 1, 1000) <= 0) { D("poll error/timeout on queue %d: %s", targ->me, strerror(errno)); continue; @@ -803,7 +856,7 @@ ponger_body(void *data) txcur = txring->cur; txavail = nm_ring_space(txring); /* see what we got back */ - for (i = targ->qfirst; i < targ->qlast; i++) { + for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) { rxring = NETMAP_RXRING(nifp, i); while (!nm_ring_empty(rxring)) { uint16_t *spkt, *dpkt; @@ -821,7 +874,7 @@ ponger_body(void *data) /* copy... 
*/ dpkt = (uint16_t *)dst; spkt = (uint16_t *)src; - pkt_copy(src, dst, slot->len); + nm_pkt_copy(src, dst, slot->len); dpkt[0] = spkt[3]; dpkt[1] = spkt[4]; dpkt[2] = spkt[5]; @@ -838,7 +891,7 @@ ponger_body(void *data) txring->head = txring->cur = txcur; targ->count = sent; #ifdef BUSYWAIT - ioctl(fds[0].fd, NIOCTXSYNC, NULL); + ioctl(pfd.fd, NIOCTXSYNC, NULL); #endif //D("tx %d rx %d", sent, rx); } @@ -924,11 +977,11 @@ static void * sender_body(void *data) { struct targ *targ = (struct targ *) data; - - struct pollfd fds[1]; - struct netmap_if *nifp = targ->nifp; + struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT }; + struct netmap_if *nifp = targ->nmd->nifp; struct netmap_ring *txring; - int i, n = targ->g->npackets / targ->g->nthreads, sent = 0; + int i, n = targ->g->npackets / targ->g->nthreads; + int64_t sent = 0; int options = targ->g->options | OPT_COPY; struct timespec nexttime = { 0, 0}; // XXX silence compiler int rate_limit = targ->g->tx_rate; @@ -943,10 +996,6 @@ sender_body(void *data) D("start"); if (setaffinity(targ->thread, targ->affinity)) goto quit; - /* setup poll(2) mechanism. 
*/ - memset(fds, 0, sizeof(fds)); - fds[0].fd = targ->fd; - fds[0].events = (POLLOUT); /* main loop.*/ clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); @@ -956,7 +1005,7 @@ sender_body(void *data) wait_time(targ->tic); nexttime = targ->tic; } - if (targ->g->dev_type == DEV_TAP) { + if (targ->g->dev_type == DEV_TAP) { D("writing to file desc %d", targ->g->main_fd); for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) { @@ -997,14 +1046,14 @@ sender_body(void *data) /* * wait for available room in the send queue(s) */ - if (poll(fds, 1, 2000) <= 0) { + if (poll(&pfd, 1, 2000) <= 0) { if (targ->cancel) break; D("poll error/timeout on queue %d: %s", targ->me, strerror(errno)); - goto quit; + // goto quit; } - if (fds[0].revents & POLLERR) { + if (pfd.revents & POLLERR) { D("poll error"); goto quit; } @@ -1015,7 +1064,7 @@ sender_body(void *data) D("drop copy"); options &= ~OPT_COPY; } - for (i = targ->qfirst; i < targ->qlast; i++) { + for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) { int m, limit = rate_limit ? tosend : targ->g->burst; if (n > 0 && n - sent < limit) limit = n - sent; @@ -1024,10 +1073,10 @@ sender_body(void *data) continue; if (frags > 1) limit = ((limit + frags - 1) / frags) * frags; - + m = send_packets(txring, pkt, frame, size, targ->g, limit, options, frags); - ND("limit %d tail %d frags %d m %d", + ND("limit %d tail %d frags %d m %d", limit, txring->tail, frags, m); sent += m; targ->count = sent; @@ -1039,13 +1088,13 @@ sender_body(void *data) } } /* flush any remaining packets */ - ioctl(fds[0].fd, NIOCTXSYNC, NULL); + ioctl(pfd.fd, NIOCTXSYNC, NULL); /* final part: wait all the TX queues to be empty. 
*/ - for (i = targ->qfirst; i < targ->qlast; i++) { + for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) { txring = NETMAP_TXRING(nifp, i); while (nm_tx_pending(txring)) { - ioctl(fds[0].fd, NIOCTXSYNC, NULL); + ioctl(pfd.fd, NIOCTXSYNC, NULL); usleep(1); /* wait 1 tick */ } } @@ -1102,8 +1151,8 @@ static void * receiver_body(void *data) { struct targ *targ = (struct targ *) data; - struct pollfd fds[1]; - struct netmap_if *nifp = targ->nifp; + struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; + struct netmap_if *nifp = targ->nmd->nifp; struct netmap_ring *rxring; int i; uint64_t received = 0; @@ -1111,17 +1160,13 @@ receiver_body(void *data) if (setaffinity(targ->thread, targ->affinity)) goto quit; - /* setup poll(2) mechanism. */ - memset(fds, 0, sizeof(fds)); - fds[0].fd = targ->fd; - fds[0].events = (POLLIN); - /* unbounded wait for the first packet. */ for (;;) { - i = poll(fds, 1, 1000); - if (i > 0 && !(fds[0].revents & POLLERR)) + i = poll(&pfd, 1, 1000); + if (i > 0 && !(pfd.revents & POLLERR)) break; - RD(1, "waiting for initial packets, poll returns %d %d", i, fds[0].revents); + RD(1, "waiting for initial packets, poll returns %d %d", + i, pfd.revents); } /* main loop, exit after 1s silence */ @@ -1146,18 +1191,18 @@ receiver_body(void *data) while (!targ->cancel) { /* Once we started to receive packets, wait at most 1 seconds before quitting. */ - if (poll(fds, 1, 1 * 1000) <= 0 && !targ->g->forever) { + if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) { clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); targ->toc.tv_sec -= 1; /* Subtract timeout time. 
*/ - break; + goto out; } - if (fds[0].revents & POLLERR) { + if (pfd.revents & POLLERR) { D("poll err"); goto quit; } - for (i = targ->qfirst; i < targ->qlast; i++) { + for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) { int m; rxring = NETMAP_RXRING(nifp, i); @@ -1168,12 +1213,12 @@ receiver_body(void *data) received += m; } targ->count = received; - - // tell the card we have read the data - //ioctl(fds[0].fd, NIOCRXSYNC, NULL); } } + clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); + +out: targ->completed = 1; targ->count = received; @@ -1190,10 +1235,10 @@ quit: static const char * norm(char *buf, double val) { - char *units[] = { "", "K", "M", "G" }; + char *units[] = { "", "K", "M", "G", "T" }; u_int i; - for (i = 0; val >=1000 && i < sizeof(units)/sizeof(char *); i++) + for (i = 0; val >=1000 && i < sizeof(units)/sizeof(char *) - 1; i++) val /= 1000; sprintf(buf, "%.2f %s", val, units[i]); return buf; @@ -1205,8 +1250,8 @@ tx_output(uint64_t sent, int size, double delta) double bw, raw_bw, pps; char b1[40], b2[80], b3[80]; - printf("Sent %" PRIu64 " packets, %d bytes each, in %.2f seconds.\n", - sent, size, delta); + printf("Sent %llu packets, %d bytes each, in %.2f seconds.\n", + (unsigned long long)sent, size, delta); if (delta == 0) delta = 1e-6; if (size < 60) /* correct for min packet size */ @@ -1227,7 +1272,8 @@ rx_output(uint64_t received, double delta) double pps; char b1[40]; - printf("Received %" PRIu64 " packets, in %.2f seconds.\n", received, delta); + printf("Received %llu packets, in %.2f seconds.\n", + (unsigned long long) received, delta); if (delta == 0) delta = 1e-6; @@ -1262,7 +1308,6 @@ usage(void) "\t-R rate in packets per second\n" "\t-X dump payload\n" "\t-H len add empty virtio-net-header with size 'len'\n" - "\t-h use host ring\n" "", cmd); @@ -1280,77 +1325,57 @@ start_threads(struct glob_arg *g) * using a single descriptor. 
*/ for (i = 0; i < g->nthreads; i++) { - bzero(&targs[i], sizeof(targs[i])); - targs[i].fd = -1; /* default, with pcap */ - targs[i].g = g; + struct targ *t = &targs[i]; + + bzero(t, sizeof(*t)); + t->fd = -1; /* default, with pcap */ + t->g = g; if (g->dev_type == DEV_NETMAP) { - struct nmreq tifreq; - int tfd; + struct nm_desc nmd = *g->nmd; /* copy, we overwrite ringid */ - /* register interface. */ - tfd = open("/dev/netmap", O_RDWR); - if (tfd == -1) { - D("Unable to open /dev/netmap: %s", strerror(errno)); + if (g->nthreads > 1) { + if (nmd.req.nr_flags != NR_REG_ALL_NIC) { + D("invalid nthreads mode %d", nmd.req.nr_flags); + continue; + } + nmd.req.nr_flags = NR_REG_ONE_NIC; + nmd.req.nr_ringid = i; + } + /* Only touch one of the rings (rx is already ok) */ + if (g->td_body == receiver_body) + nmd.req.nr_ringid |= NETMAP_NO_TX_POLL; + + /* register interface. Override ifname and ringid etc. */ + + t->nmd = nm_open(t->g->ifname, NULL, g->nmd_flags | + NM_OPEN_IFNAME | NM_OPEN_NO_MMAP, g->nmd); + if (t->nmd == NULL) { + D("Unable to open %s: %s", + t->g->ifname, strerror(errno)); continue; } - targs[i].fd = tfd; + t->fd = t->nmd->fd; - bzero(&tifreq, sizeof(tifreq)); - strncpy(tifreq.nr_name, g->ifname, sizeof(tifreq.nr_name)); - tifreq.nr_version = NETMAP_API; - if (g->host_ring) { - tifreq.nr_ringid = NETMAP_SW_RING; - } else { - tifreq.nr_ringid = (g->nthreads > 1) ? (i | NETMAP_HW_RING) : 0; - } - parse_nmr_config(g->nmr_config, &tifreq); - - /* - * if we are acting as a receiver only, do not touch the transmit ring. - * This is not the default because many apps may use the interface - * in both directions, but a pure receiver does not. 
- */ - if (g->td_body == receiver_body) { - tifreq.nr_ringid |= NETMAP_NO_TX_POLL; - } - - if ((ioctl(tfd, NIOCREGIF, &tifreq)) == -1) { - D("Unable to register %s: %s", g->ifname, strerror(errno)); - continue; - } - D("memsize is %d MB", tifreq.nr_memsize >> 20); - targs[i].nmr = tifreq; - targs[i].nifp = NETMAP_IF(g->mmap_addr, tifreq.nr_offset); - D("nifp flags 0x%x", targs[i].nifp->ni_flags); - /* start threads. */ - if (g->host_ring) { - targs[i].qfirst = (g->td_body == receiver_body ? tifreq.nr_rx_rings : tifreq.nr_tx_rings); - targs[i].qlast = targs[i].qfirst + 1; - } else { - targs[i].qfirst = (g->nthreads > 1) ? i : 0; - targs[i].qlast = (g->nthreads > 1) ? i+1 : - (g->td_body == receiver_body ? tifreq.nr_rx_rings : tifreq.nr_tx_rings); - } } else { targs[i].fd = g->main_fd; } - targs[i].used = 1; - targs[i].me = i; + t->used = 1; + t->me = i; if (g->affinity >= 0) { if (g->affinity < g->cpus) - targs[i].affinity = g->affinity; + t->affinity = g->affinity; else - targs[i].affinity = i % g->cpus; - } else - targs[i].affinity = -1; + t->affinity = i % g->cpus; + } else { + t->affinity = -1; + } /* default, init packets */ - initialize_packet(&targs[i]); + initialize_packet(t); - if (pthread_create(&targs[i].thread, NULL, g->td_body, - &targs[i]) == -1) { + if (pthread_create(&t->thread, NULL, g->td_body, t) == -1) { D("Unable to create thread %d: %s", i, strerror(errno)); - targs[i].used = 0; + t->used = 0; } } } @@ -1375,7 +1400,6 @@ main_thread(struct glob_arg *g) delta.tv_usec = (g->report_interval%1000)*1000; select(0, NULL, NULL, NULL, &delta); gettimeofday(&now, NULL); - time_second = now.tv_sec; timersub(&now, &toc, &toc); my_count = 0; for (i = 0; i < g->nthreads; i++) { @@ -1388,8 +1412,10 @@ main_thread(struct glob_arg *g) continue; npkts = my_count - prev; pps = (npkts*1000000 + usec/2) / usec; - D("%" PRIu64 " pps (%" PRIu64 " pkts in %" PRIu64 " usec)", - pps, npkts, usec); + D("%llu pps (%llu pkts in %llu usec)", + (unsigned long long)pps, + 
(unsigned long long)npkts, + (unsigned long long)usec); prev = my_count; toc = now; if (done == g->nthreads) @@ -1433,7 +1459,7 @@ main_thread(struct glob_arg *g) rx_output(count, delta_t); if (g->dev_type == DEV_NETMAP) { - munmap(g->mmap_addr, g->mmap_size); + munmap(g->nmd->mem, g->nmd->req.nr_memsize); close(g->main_fd); } } @@ -1521,7 +1547,6 @@ main(int arc, char **argv) struct glob_arg g; - struct nmreq nmr; int ch; int wait_link = 2; int devqueues = 1; /* how many device queues */ @@ -1548,7 +1573,7 @@ main(int arc, char **argv) g.virt_header = 0; while ( (ch = getopt(arc, argv, - "a:f:F:n:i:It:r:l:d:s:D:S:b:c:o:p:T:w:WvR:XC:H:h")) != -1) { + "a:f:F:n:i:Il:d:s:D:S:b:c:o:p:T:w:WvR:XC:H:e:")) != -1) { struct sf *fn; switch(ch) { @@ -1594,23 +1619,28 @@ main(int arc, char **argv) * otherwise we guess */ D("interface is %s", optarg); - g.ifname = optarg; + if (strlen(optarg) > MAX_IFNAMELEN - 8) { + D("ifname too long %s", optarg); + break; + } + strcpy(g.ifname, optarg); if (!strcmp(optarg, "null")) { g.dev_type = DEV_NETMAP; g.dummy_send = 1; } else if (!strncmp(optarg, "tap:", 4)) { g.dev_type = DEV_TAP; - g.ifname = optarg + 4; + strcpy(g.ifname, optarg + 4); } else if (!strncmp(optarg, "pcap:", 5)) { g.dev_type = DEV_PCAP; - g.ifname = optarg + 5; - } else if (!strncmp(optarg, "netmap:", 7)) { + strcpy(g.ifname, optarg + 5); + } else if (!strncmp(optarg, "netmap:", 7) || + !strncmp(optarg, "vale", 4)) { g.dev_type = DEV_NETMAP; - g.ifname = optarg + 7; } else if (!strncmp(optarg, "tap", 3)) { g.dev_type = DEV_TAP; - } else { + } else { /* prepend netmap: */ g.dev_type = DEV_NETMAP; + sprintf(g.ifname, "netmap:%s", optarg); } break; @@ -1618,18 +1648,6 @@ main(int arc, char **argv) g.options |= OPT_INDIRECT; /* XXX use indirect buffer */ break; - case 't': /* send, deprecated */ - D("-t deprecated, please use -f tx -n %s", optarg); - g.td_body = sender_body; - g.npackets = atoi(optarg); - break; - - case 'r': /* receive */ - D("-r deprecated, please use -f 
rx -n %s", optarg); - g.td_body = receiver_body; - g.npackets = atoi(optarg); - break; - case 'l': /* pkt_size */ g.pkt_size = atoi(optarg); break; @@ -1686,8 +1704,8 @@ main(int arc, char **argv) case 'H': g.virt_header = atoi(optarg); break; - case 'h': - g.host_ring = 1; + case 'e': /* extra bufs */ + g.extra_bufs = atoi(optarg); break; } } @@ -1759,42 +1777,33 @@ main(int arc, char **argv) } else if (g.dummy_send) { /* but DEV_NETMAP */ D("using a dummy send routine"); } else { - bzero(&nmr, sizeof(nmr)); - nmr.nr_version = NETMAP_API; + struct nm_desc base_nmd; + + bzero(&base_nmd, sizeof(base_nmd)); + + g.nmd_flags = 0; + g.nmd_flags |= parse_nmr_config(g.nmr_config, &base_nmd.req); + if (g.extra_bufs) { + base_nmd.req.nr_arg3 = g.extra_bufs; + g.nmd_flags |= NM_OPEN_ARG3; + } + /* - * Open the netmap device to fetch the number of queues of our - * interface. + * Open the netmap device using nm_open(). * - * The first NIOCREGIF also detaches the card from the * protocol stack and may cause a reset of the card, * which in turn may take some time for the PHY to - * reconfigure. + * reconfigure. We do the open here to have time to reset. */ - g.main_fd = open("/dev/netmap", O_RDWR); - if (g.main_fd == -1) { - D("Unable to open /dev/netmap: %s", strerror(errno)); - // fail later + g.nmd = nm_open(g.ifname, NULL, g.nmd_flags, &base_nmd); + if (g.nmd == NULL) { + D("Unable to open %s: %s", g.ifname, strerror(errno)); + goto out; } - /* - * Register the interface on the netmap device: from now on, - * we can operate on the network interface without any - * interference from the legacy network stack. - * - * We decide to put the first interface registration here to - * give time to cards that take a long time to reset the PHY. 
- */ - bzero(&nmr, sizeof(nmr)); - nmr.nr_version = NETMAP_API; - strncpy(nmr.nr_name, g.ifname, sizeof(nmr.nr_name)); - parse_nmr_config(g.nmr_config, &nmr); - if (ioctl(g.main_fd, NIOCREGIF, &nmr) == -1) { - D("Unable to register interface %s: %s", g.ifname, strerror(errno)); - //continue, fail later - } - ND("%s: txr %d txd %d rxr %d rxd %d", g.ifname, - nmr.nr_tx_rings, nmr.nr_tx_slots, - nmr.nr_rx_rings, nmr.nr_rx_slots); - devqueues = nmr.nr_rx_rings; + g.main_fd = g.nmd->fd; + D("mapped %dKB at %p", g.nmd->req.nr_memsize>>10, g.nmd->mem); + + devqueues = g.nmd->req.nr_rx_rings; /* validate provided nthreads. */ if (g.nthreads < 1 || g.nthreads > devqueues) { @@ -1802,32 +1811,18 @@ main(int arc, char **argv) // continue, fail later } - /* - * Map the netmap shared memory: instead of issuing mmap() - * inside the body of the threads, we prefer to keep this - * operation here to simplify the thread logic. - */ - D("mapping %d Kbytes", nmr.nr_memsize>>10); - g.mmap_size = nmr.nr_memsize; - g.mmap_addr = (struct netmap_d *) mmap(0, nmr.nr_memsize, - PROT_WRITE | PROT_READ, - MAP_SHARED, g.main_fd, 0); - if (g.mmap_addr == MAP_FAILED) { - D("Unable to mmap %d KB: %s", nmr.nr_memsize >> 10, strerror(errno)); - // continue, fail later - } - if (verbose) { - struct netmap_if *nifp = NETMAP_IF(g.mmap_addr, nmr.nr_offset); + struct netmap_if *nifp = g.nmd->nifp; + struct nmreq *req = &g.nmd->req; - D("nifp at offset %d, %d tx %d rx rings %s", - nmr.nr_offset, nmr.nr_tx_rings, nmr.nr_rx_rings, - nmr.nr_ringid & NETMAP_PRIV_MEM ? 
"PRIVATE" : "common" ); - for (i = 0; i <= nmr.nr_tx_rings; i++) { + D("nifp at offset %d, %d tx %d rx region %d", + req->nr_offset, req->nr_tx_rings, req->nr_rx_rings, + req->nr_arg2); + for (i = 0; i <= req->nr_tx_rings; i++) { D(" TX%d at 0x%lx", i, (char *)NETMAP_TXRING(nifp, i) - (char *)nifp); } - for (i = 0; i <= nmr.nr_rx_rings; i++) { + for (i = 0; i <= req->nr_rx_rings; i++) { D(" RX%d at 0x%lx", i, (char *)NETMAP_RXRING(nifp, i) - (char *)nifp); } @@ -1846,7 +1841,8 @@ main(int arc, char **argv) g.src_ip.name, g.dst_ip.name, g.src_mac.name, g.dst_mac.name); } - + +out: /* Exit if something went wrong. */ if (g.main_fd < 0) { D("aborting"); @@ -1854,7 +1850,7 @@ main(int arc, char **argv) } } - + if (g.options) { D("--- SPECIAL OPTIONS:%s%s%s%s%s\n", g.options & OPT_PREFETCH ? " prefetch" : "", diff --git a/tools/tools/netmap/vale-ctl.c b/tools/tools/netmap/vale-ctl.c index eb6c48d15a0..e1d8da56806 100644 --- a/tools/tools/netmap/vale-ctl.c +++ b/tools/tools/netmap/vale-ctl.c @@ -33,6 +33,7 @@ #include /* close */ #include /* ioctl */ #include +#include /* apple needs sockaddr */ #include /* ifreq */ #include #include