diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c index 1cd0a92d0a3..deb550b7978 100644 --- a/sys/dev/netmap/netmap.c +++ b/sys/dev/netmap/netmap.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2012 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,6 +23,8 @@ * SUCH DAMAGE. */ +#define NM_BRIDGE + /* * This module supports memory mapped access to network devices, * see netmap(4). @@ -52,6 +54,14 @@ * transmit or receive queues (or all queues for a given interface). */ +#ifdef linux +#include "bsd_glue.h" +static netdev_tx_t netmap_start_linux(struct sk_buff *skb, struct net_device *dev); +#endif /* linux */ +#ifdef __APPLE__ +#include "osx_glue.h" +#endif +#ifdef __FreeBSD__ #include /* prerequisite */ __FBSDID("$FreeBSD$"); @@ -83,6 +93,7 @@ __FBSDID("$FreeBSD$"); #include /* bus_dmamap_* */ MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map"); +#endif /* __FreeBSD__ */ /* * lock and unlock for the netmap memory allocator @@ -115,6 +126,173 @@ int netmap_no_pendintr = 1; SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets."); +int netmap_drop = 0; /* debugging */ +int netmap_flags = 0; /* debug flags */ +int netmap_copy = 0; /* debugging, copy content */ + +SYSCTL_INT(_dev_netmap, OID_AUTO, drop, CTLFLAG_RW, &netmap_drop, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, copy, CTLFLAG_RW, &netmap_copy, 0 , ""); + +#ifdef NM_BRIDGE /* support for netmap bridge */ + +/* + * system parameters. + * + * All switched ports have prefix NM_NAME. + * The switch has a max of NM_BDG_MAXPORTS ports (often stored in a bitmap, + * so a practical upper bound is 64). 
+ * Each tx ring is read-write, whereas rx rings are readonly (XXX not done yet). + * The virtual interfaces use per-queue lock instead of core lock. + * In the tx loop, we aggregate traffic in batches to make all operations + * faster. The batch size is NM_BDG_BATCH + */ +#define NM_NAME "vale" /* prefix for the interface */ +#define NM_BDG_MAXPORTS 16 /* up to 64 ? */ +#define NM_BRIDGE_RINGSIZE 1024 /* in the device */ +#define NM_BDG_HASH 1024 /* forwarding table entries */ +#define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ +#define NM_BRIDGES 4 /* number of bridges */ +int netmap_bridge = NM_BDG_BATCH; /* bridge batch size */ +SYSCTL_INT(_dev_netmap, OID_AUTO, bridge, CTLFLAG_RW, &netmap_bridge, 0 , ""); +#ifdef linux +#define ADD_BDG_REF(ifp) (NA(ifp)->if_refcount++) +#define DROP_BDG_REF(ifp) (NA(ifp)->if_refcount-- <= 1) +#else /* !linux */ +#define ADD_BDG_REF(ifp) (ifp)->if_refcount++ +#define DROP_BDG_REF(ifp) refcount_release(&(ifp)->if_refcount) +#ifdef __FreeBSD__ +#include +#include +#endif /* __FreeBSD__ */ +#endif /* !linux */ + +static void bdg_netmap_attach(struct ifnet *ifp); +static int bdg_netmap_reg(struct ifnet *ifp, int onoff); +/* per-tx-queue entry */ +struct nm_bdg_fwd { /* forwarding entry for a bridge */ + void *buf; + uint64_t dst; /* dst mask */ + uint32_t src; /* src index ? */ + uint16_t len; /* src len */ +#if 0 + uint64_t src_mac; /* ignore 2 MSBytes */ + uint64_t dst_mac; /* ignore 2 MSBytes */ + uint32_t dst_idx; /* dst index in fwd table */ + uint32_t dst_buf; /* where we copy to */ +#endif +}; + +struct nm_hash_ent { + uint64_t mac; /* the top 2 bytes are the epoch */ + uint64_t ports; +}; + +/* + * Interfaces for a bridge are all in ports[]. + * The array has fixed size, an empty entry does not terminate + * the search. 
+ */ +struct nm_bridge { + struct ifnet *bdg_ports[NM_BDG_MAXPORTS]; + int n_ports; + uint64_t act_ports; + int freelist; /* first buffer index */ + NM_SELINFO_T si; /* poll/select wait queue */ + NM_LOCK_T bdg_lock; /* protect the selinfo ? */ + + /* the forwarding table, MAC+ports */ + struct nm_hash_ent ht[NM_BDG_HASH]; + + int namelen; /* 0 means free */ + char basename[IFNAMSIZ]; +}; + +struct nm_bridge nm_bridges[NM_BRIDGES]; + +#define BDG_LOCK(b) mtx_lock(&(b)->bdg_lock) +#define BDG_UNLOCK(b) mtx_unlock(&(b)->bdg_lock) + +/* + * NA(ifp)->bdg_port port index + */ + +#ifndef linux +static inline void prefetch (const void *x) +{ + __asm volatile("prefetcht0 %0" :: "m" (*(const unsigned long *)x)); +} +#endif /* !linux */ + +// XXX only for multiples of 64 bytes, non overlapped. +static inline void +pkt_copy(void *_src, void *_dst, int l) +{ + uint64_t *src = _src; + uint64_t *dst = _dst; + if (unlikely(l >= 1024)) { + bcopy(src, dst, l); + return; + } + for (; likely(l > 0); l-=64) { + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + } +} + +/* + * locate a bridge among the existing ones. + * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME. + * We assume that this is called with a name of at least NM_NAME chars. 
+ *
+ * Slot 0 of nm_bridges[] is never handed out: its lock serializes the
+ * table scan below. Returns the bridge whose stored prefix is exactly
+ * the prefix of 'name', otherwise a newly claimed free slot, otherwise
+ * NULL when every slot is in use.
+ */
+static struct nm_bridge *
+nm_find_bridge(const char *name)
+{
+	int i, l, namelen, e;
+	struct nm_bridge *b = NULL;
+
+	namelen = strlen(NM_NAME);	/* base length */
+	l = strlen(name);		/* actual length */
+	for (i = namelen + 1; i < l; i++) {
+		if (name[i] == ':') {
+			namelen = i;
+			break;
+		}
+	}
+	/* leave room for the terminating NUL in basename[] */
+	if (namelen >= IFNAMSIZ)
+		namelen = IFNAMSIZ - 1;
+	ND("--- prefix is '%.*s' ---", namelen, name);
+
+	/* use the first entry for locking */
+	BDG_LOCK(nm_bridges); // XXX do better
+	for (e = -1, i = 1; i < NM_BRIDGES; i++) {
+		b = nm_bridges + i;
+		if (b->namelen == 0)
+			e = i;	/* record empty slot */
+		else if (b->namelen == namelen &&
+		    strncmp(name, b->basename, namelen) == 0) {
+			/*
+			 * The lengths must agree as well, otherwise the
+			 * prefix "vale1" would match a bridge that was
+			 * registered as "vale10".
+			 */
+			ND("found '%.*s' at %d", namelen, name, i);
+			break;
+		}
+	}
+	if (i == NM_BRIDGES) { /* no matching bridge */
+		if (e == -1) { /* no empty slot */
+			b = NULL;
+		} else {
+			b = nm_bridges + e;
+			/* strncpy() does not terminate here: do it by hand */
+			strncpy(b->basename, name, namelen);
+			b->basename[namelen] = '\0';
+			b->namelen = namelen;
+		}
+	}
+	BDG_UNLOCK(nm_bridges);
+	return b;
+}
+#endif /* NM_BRIDGE */
 /*------------- memory allocator -----------------*/
 #ifdef NETMAP_MEM2
@@ -200,6 +378,46 @@ netmap_dtor_locked(void *data)
 	netmap_if_free(nifp);
 }
+/*
+ * Drop one reference to ifp.
+ *
+ * Interfaces whose name does not start with NM_NAME are handed to
+ * if_rele(). For a bridge port, when DROP_BDG_REF() reports the last
+ * reference, the port is unlinked from its bridge and its memory is
+ * released; the bridge slot itself is marked free (namelen = 0) only
+ * when no other port is still attached.
+ */
+static void
+nm_if_rele(struct ifnet *ifp)
+{
+#ifndef NM_BRIDGE
+	if_rele(ifp);
+#else /* NM_BRIDGE */
+	int i, found = -1, others = 0;
+	struct nm_bridge *b;
+
+	if (strncmp(ifp->if_xname, NM_NAME, sizeof(NM_NAME) - 1)) {
+		if_rele(ifp);
+		return;
+	}
+	if (!DROP_BDG_REF(ifp))
+		return;
+	b = ifp->if_bridge;
+	BDG_LOCK(nm_bridges);
+	BDG_LOCK(b);
+	ND("want to disconnect %s from the bridge", ifp->if_xname);
+	/*
+	 * Scan every slot: stopping at the first match would miss ports
+	 * stored at higher indexes and free a bridge that is still in use.
+	 */
+	for (i = 0; i < NM_BDG_MAXPORTS; i++) {
+		if (b->bdg_ports[i] == ifp)
+			found = i;
+		else if (b->bdg_ports[i] != NULL)
+			others = 1;
+	}
+	if (found != -1)
+		b->bdg_ports[found] = NULL;
+	BDG_UNLOCK(b);
+	if (others == 0) {
+		ND("freeing bridge %d", (int)(b - nm_bridges));
+		b->namelen = 0;
+	}
+	BDG_UNLOCK(nm_bridges);
+	if (found == -1) {
+		D("ouch, cannot find ifp to remove");
+		return;
+	}
+	/* the port is unlinked, nobody can look it up any more */
+	bzero(ifp, sizeof(*ifp));
+	free(ifp, M_DEVBUF);
+#endif /* NM_BRIDGE */
+}
 static void
netmap_dtor(void *data) @@ -212,7 +430,7 @@ netmap_dtor(void *data) netmap_dtor_locked(data); na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0); - if_rele(ifp); + nm_if_rele(ifp); bzero(priv, sizeof(*priv)); /* XXX for safety */ free(priv, M_DEVBUF); } @@ -228,6 +446,7 @@ netmap_dtor(void *data) * Return 0 on success, -1 otherwise. */ +#ifdef __FreeBSD__ static int netmap_mmap(__unused struct cdev *dev, #if __FreeBSD_version < 900000 @@ -246,6 +465,7 @@ netmap_mmap(__unused struct cdev *dev, return (0); } +#endif /* __FreeBSD__ */ /* @@ -363,6 +583,64 @@ netmap_sync_from_host(struct netmap_adapter *na, struct thread *td) static int get_ifp(const char *name, struct ifnet **ifp) { +#ifdef NM_BRIDGE + struct ifnet *iter = NULL; + + do { + struct nm_bridge *b; + int i, l, cand = -1; + + if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) + break; + b = nm_find_bridge(name); + if (b == NULL) { + D("no bridges available for '%s'", name); + return (ENXIO); + } + /* XXX locking */ + BDG_LOCK(b); + /* lookup in the local list of ports */ + for (i = 0; i < NM_BDG_MAXPORTS; i++) { + iter = b->bdg_ports[i]; + if (iter == NULL) { + if (cand == -1) + cand = i; /* potential insert point */ + continue; + } + if (!strcmp(iter->if_xname, name)) { + ADD_BDG_REF(iter); + ND("found existing interface"); + BDG_UNLOCK(b); + break; + } + } + if (i < NM_BDG_MAXPORTS) /* already unlocked */ + break; + if (cand == -1) { + D("bridge full, cannot create new port"); +no_port: + BDG_UNLOCK(b); + *ifp = NULL; + return EINVAL; + } + ND("create new bridge port %s", name); + /* space for forwarding list after the ifnet */ + l = sizeof(*iter) + + sizeof(struct nm_bdg_fwd)*NM_BDG_BATCH ; + iter = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO); + if (!iter) + goto no_port; + strcpy(iter->if_xname, name); + bdg_netmap_attach(iter); + b->bdg_ports[cand] = iter; + iter->if_bridge = b; + ADD_BDG_REF(iter); + BDG_UNLOCK(b); + ND("attaching virtual bridge %p", b); + } while (0); + *ifp = iter; + if (! 
*ifp) +#endif /* NM_BRIDGE */ *ifp = ifunit_ref(name); if (*ifp == NULL) return (ENXIO); @@ -371,7 +649,7 @@ get_ifp(const char *name, struct ifnet **ifp) */ if ((*ifp)->if_capabilities & IFCAP_NETMAP && NA(*ifp)) return 0; /* valid pointer, we hold the refcount */ - if_rele(*ifp); + nm_if_rele(*ifp); return EINVAL; // not NETMAP capable } @@ -502,6 +780,21 @@ netmap_ioctl(__unused struct cdev *dev, u_long cmd, caddr_t data, u_int i, lim; struct netmap_if *nifp; +#ifdef linux +#define devfs_get_cdevpriv(pp) \ + ({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; \ + (*pp ? 0 : ENOENT); }) + +/* devfs_set_cdevpriv cannot fail on linux */ +#define devfs_set_cdevpriv(p, fn) \ + ({ ((struct file *)td)->private_data = p; (p ? 0 : EINVAL); }) + + +#define devfs_clear_cdevpriv() do { \ + netmap_dtor(priv); ((struct file *)td)->private_data = 0; \ + } while (0) +#endif /* linux */ + CURVNET_SET(TD_TO_VNET(td)); error = devfs_get_cdevpriv((void **)&priv); @@ -511,6 +804,7 @@ netmap_ioctl(__unused struct cdev *dev, u_long cmd, caddr_t data, } error = 0; /* Could be ENOENT */ + nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; /* truncate name */ switch (cmd) { case NIOCGINFO: /* return capabilities etc */ /* memsize is always valid */ @@ -535,7 +829,7 @@ netmap_ioctl(__unused struct cdev *dev, u_long cmd, caddr_t data, nmr->nr_tx_rings = na->num_tx_rings; nmr->nr_rx_slots = na->num_rx_desc; nmr->nr_tx_slots = na->num_tx_desc; - if_rele(ifp); /* return the refcount */ + nm_if_rele(ifp); /* return the refcount */ break; case NIOCREGIF: @@ -561,7 +855,7 @@ netmap_ioctl(__unused struct cdev *dev, u_long cmd, caddr_t data, M_NOWAIT | M_ZERO); if (priv == NULL) { error = ENOMEM; - if_rele(ifp); /* return the refcount */ + nm_if_rele(ifp); /* return the refcount */ break; } @@ -576,7 +870,7 @@ netmap_ioctl(__unused struct cdev *dev, u_long cmd, caddr_t data, D("too many NIOCREGIF attempts, give up"); error = EINVAL; free(priv, M_DEVBUF); - if_rele(ifp); /* return the 
refcount */ + nm_if_rele(ifp); /* return the refcount */ break; } @@ -593,6 +887,11 @@ netmap_ioctl(__unused struct cdev *dev, u_long cmd, caddr_t data, /* Otherwise set the card in netmap mode * and make it use the shared buffers. */ + for (i = 0 ; i < na->num_tx_rings + 1; i++) + mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock", MTX_NETWORK_LOCK, MTX_DEF); + for (i = 0 ; i < na->num_rx_rings + 1; i++) { + mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock", MTX_NETWORK_LOCK, MTX_DEF); + } error = na->nm_register(ifp, 1); /* mode on */ if (error) netmap_dtor_locked(priv); @@ -601,7 +900,7 @@ netmap_ioctl(__unused struct cdev *dev, u_long cmd, caddr_t data, if (error) { /* reg. failed, release priv and ref */ error: na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0); - if_rele(ifp); /* return the refcount */ + nm_if_rele(ifp); /* return the refcount */ bzero(priv, sizeof(*priv)); free(priv, M_DEVBUF); break; @@ -679,6 +978,7 @@ error: break; +#ifdef __FreeBSD__ case BIOCIMMEDIATE: case BIOCGHDRCMPLT: case BIOCSHDRCMPLT: @@ -696,9 +996,14 @@ error: so.so_vnet = ifp->if_vnet; // so->so_proto not null. error = ifioctl(&so, cmd, data, td); - if_rele(ifp); + nm_if_rele(ifp); break; } + +#else /* linux */ + default: + error = EOPNOTSUPP; +#endif /* linux */ } CURVNET_RESTORE(); @@ -715,6 +1020,11 @@ error: * selfd or on the global one. * Device-dependent parts (locking and sync of tx/rx rings) * are done through callbacks. + * + * On linux, pwait is the poll table. + * If pwait == NULL someone else already woke up before. We can report + * events but they are filtered upstream. + * If pwait != NULL, then pwait->key contains the list of events. */ static int netmap_poll(__unused struct cdev *dev, int events, struct thread *td) @@ -801,6 +1111,13 @@ netmap_poll(__unused struct cdev *dev, int events, struct thread *td) * LOCKED_CL core lock is set, so we need to release it. */ core_lock = (check_all || !na->separate_locks) ? 
NEED_CL : NO_CL; +#ifdef NM_BRIDGE + /* the bridge uses separate locks */ + if (na->nm_register == bdg_netmap_reg) { + ND("not using core lock for %s", ifp->if_xname); + core_lock = NO_CL; + } +#endif /* NM_BRIDGE */ if (priv->np_qlast != NETMAP_HW_RING) { lim_tx = lim_rx = priv->np_qlast; } @@ -970,7 +1287,7 @@ netmap_lock_wrapper(struct ifnet *dev, int what, u_int queueid) int netmap_attach(struct netmap_adapter *na, int num_queues) { - int i, n, size; + int n, size; void *buf; struct ifnet *ifp = na->ifp; @@ -999,18 +1316,21 @@ netmap_attach(struct netmap_adapter *na, int num_queues) ifp->if_capabilities |= IFCAP_NETMAP; na = buf; - if (na->nm_lock == NULL) + if (na->nm_lock == NULL) { + ND("using default locks for %s", ifp->if_xname); na->nm_lock = netmap_lock_wrapper; - mtx_init(&na->core_lock, "netmap core lock", NULL, MTX_DEF); - for (i = 0 ; i < na->num_tx_rings + 1; i++) - mtx_init(&na->tx_rings[i].q_lock, "netmap txq lock", NULL, MTX_DEF); - for (i = 0 ; i < na->num_rx_rings + 1; i++) - mtx_init(&na->rx_rings[i].q_lock, "netmap rxq lock", NULL, MTX_DEF); + /* core lock initialized here. + * others initialized after netmap_if_new + */ + mtx_init(&na->core_lock, "netmap core lock", MTX_NETWORK_LOCK, MTX_DEF); + } } #ifdef linux - D("netdev_ops %p", ifp->netdev_ops); - /* prepare a clone of the netdev ops */ - na->nm_ndo = *ifp->netdev_ops; + if (ifp->netdev_ops) { + D("netdev_ops %p", ifp->netdev_ops); + /* prepare a clone of the netdev ops */ + na->nm_ndo = *ifp->netdev_ops; + } na->nm_ndo.ndo_start_xmit = netmap_start_linux; #endif D("%s for %s", buf ? "ok" : "failed", ifp->if_xname); @@ -1137,6 +1457,16 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, int n, kring->nkr_hwofs, na->ifp->if_xname, tx == NR_TX ? 
"TX" : "RX", n); +#if 0 // def linux + /* XXX check that the mappings are correct */ + /* need ring_nr, adapter->pdev, direction */ + buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE); + if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) { + D("error mapping rx netmap buffer %d", i); + // XXX fix error handling + } + +#endif /* linux */ /* * Wakeup on the individual and global lock * We do the wakeup here, but the ring is not yet reconfigured. @@ -1209,6 +1539,343 @@ static struct cdevsw netmap_cdevsw = { .d_poll = netmap_poll, }; +#ifdef NM_BRIDGE +/* + *---- support for virtual bridge ----- + */ + +/* ----- FreeBSD if_bridge hash function ------- */ + +/* + * The following hash function is adapted from "Hash Functions" by Bob Jenkins + * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). + * + * http://www.burtleburtle.net/bob/hash/spooky.html + */ +#define mix(a, b, c) \ +do { \ + a -= b; a -= c; a ^= (c >> 13); \ + b -= c; b -= a; b ^= (a << 8); \ + c -= a; c -= b; c ^= (b >> 13); \ + a -= b; a -= c; a ^= (c >> 12); \ + b -= c; b -= a; b ^= (a << 16); \ + c -= a; c -= b; c ^= (b >> 5); \ + a -= b; a -= c; a ^= (c >> 3); \ + b -= c; b -= a; b ^= (a << 10); \ + c -= a; c -= b; c ^= (b >> 15); \ +} while (/*CONSTCOND*/0) + +static __inline uint32_t +nm_bridge_rthash(const uint8_t *addr) +{ + uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key + + b += addr[5] << 8; + b += addr[4]; + a += addr[3] << 24; + a += addr[2] << 16; + a += addr[1] << 8; + a += addr[0]; + + mix(a, b, c); +#define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) + return (c & BRIDGE_RTHASH_MASK); +} + +#undef mix + + +static int +bdg_netmap_reg(struct ifnet *ifp, int onoff) +{ + int i, err = 0; + struct nm_bridge *b = ifp->if_bridge; + + BDG_LOCK(b); + if (onoff) { + /* the interface must be already in the list. 
+		 * only need to mark the port as active
+		 */
+		ND("should attach %s to the bridge", ifp->if_xname);
+		for (i=0; i < NM_BDG_MAXPORTS; i++)
+			if (b->bdg_ports[i] == ifp)
+				break;
+		if (i == NM_BDG_MAXPORTS) {
+			D("no more ports available");
+			err = EINVAL;
+			goto done;
+		}
+		ND("setting %s in netmap mode", ifp->if_xname);
+		ifp->if_capenable |= IFCAP_NETMAP;
+		NA(ifp)->bdg_port = i;
+		/* act_ports is 64 bits wide: shift a 64-bit one, not an int */
+		b->act_ports |= ((uint64_t)1 << i);
+		b->bdg_ports[i] = ifp;
+	} else {
+		/* should be in the list, too -- remove from the mask */
+		ND("removing %s from netmap mode", ifp->if_xname);
+		ifp->if_capenable &= ~IFCAP_NETMAP;
+		i = NA(ifp)->bdg_port;
+		b->act_ports &= ~((uint64_t)1 << i);
+	}
+done:
+	BDG_UNLOCK(b);
+	return err;
+}
+
+/*
+ * Forward the n packets described by ft[0..n-1], received on port ifp.
+ *
+ * The first pass records the source MAC of each packet in the bridge
+ * hash table, looks up the destination MAC and stores the resulting
+ * destination port bitmap in ft[i].dst; the second pass walks the
+ * destination ports and copies the packets.
+ *
+ * NOTE(review): the tail of bdg_netmap_reg() and the signature and first
+ * declarations of this function were reconstructed from their uses
+ * (goto done / err, the nm_bdg_flush(ft, ft_i, ifp) call sites and the
+ * locals referenced below) -- confirm against the upstream commit.
+ */
+static int
+nm_bdg_flush(struct nm_bdg_fwd *ft, int n, struct ifnet *ifp)
+{
+	int i, ifn;
+	uint64_t all_dst, dst;
+	uint32_t sh, dh;
+	uint64_t mysrc = (uint64_t)1 << NA(ifp)->bdg_port;
+	uint64_t smac, dmac;
+	struct netmap_slot *slot;
+	struct nm_bridge *b = ifp->if_bridge;
+
+	ND("prepare to send %d packets, act_ports 0x%x", n, b->act_ports);
+	/* only consider valid destinations */
+	all_dst = (b->act_ports & ~mysrc);
+	/* first pass: hash and find destinations */
+	for (i = 0; likely(i < n); i++) {
+		uint8_t *buf = ft[i].buf;
+		/* bytes 0-5 are the destination MAC, bytes 6-11 the source */
+		dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
+		smac = le64toh(*(uint64_t *)(buf + 4));
+		smac >>= 16;
+		if (unlikely(netmap_verbose)) {
+			uint8_t *s = buf+6, *d = buf;
+			D("%d len %4d %02x:%02x:%02x:%02x:%02x:%02x -> %02x:%02x:%02x:%02x:%02x:%02x",
+				i,
+				ft[i].len,
+				s[0], s[1], s[2], s[3], s[4], s[5],
+				d[0], d[1], d[2], d[3], d[4], d[5]);
+		}
+		/*
+		 * The hash is somewhat expensive, there might be some
+		 * worthwhile optimizations here.
+		 */
+		if ((buf[6] & 1) == 0) { /* valid src */
+			uint8_t *s = buf+6;
+			sh = nm_bridge_rthash(buf+6); // XXX hash of source
+			/* update source port forwarding entry */
+			b->ht[sh].mac = smac;	/* XXX expire ?
*/ + b->ht[sh].ports = mysrc; + if (netmap_verbose) + D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", + s[0], s[1], s[2], s[3], s[4], s[5], NA(ifp)->bdg_port); + } + dst = 0; + if ( (buf[0] & 1) == 0) { /* unicast */ + uint8_t *d = buf; + dh = nm_bridge_rthash(buf); // XXX hash of dst + if (b->ht[dh].mac == dmac) { /* found dst */ + dst = b->ht[dh].ports; + if (netmap_verbose) + D("dst %02x:%02x:%02x:%02x:%02x:%02x to port %x", + d[0], d[1], d[2], d[3], d[4], d[5], (uint32_t)(dst >> 16)); + } + } + if (dst == 0) + dst = all_dst; + dst &= all_dst; /* only consider valid ports */ + if (unlikely(netmap_verbose)) + D("pkt goes to ports 0x%x", (uint32_t)dst); + ft[i].dst = dst; + } + + /* second pass, scan interfaces and forward */ + all_dst = (b->act_ports & ~mysrc); + for (ifn = 0; all_dst; ifn++) { + struct ifnet *dst_ifp = b->bdg_ports[ifn]; + struct netmap_adapter *na; + struct netmap_kring *kring; + struct netmap_ring *ring; + int j, lim, sent, locked; + + if (!dst_ifp) + continue; + ND("scan port %d %s", ifn, dst_ifp->if_xname); + dst = 1 << ifn; + if ((dst & all_dst) == 0) /* skip if not set */ + continue; + all_dst &= ~dst; /* clear current node */ + na = NA(dst_ifp); + + ring = NULL; + kring = NULL; + lim = sent = locked = 0; + /* inside, scan slots */ + for (i = 0; likely(i < n); i++) { + if ((ft[i].dst & dst) == 0) + continue; /* not here */ + if (!locked) { + kring = &na->rx_rings[0]; + ring = kring->ring; + lim = kring->nkr_num_slots - 1; + na->nm_lock(dst_ifp, NETMAP_RX_LOCK, 0); + locked = 1; + } + if (unlikely(kring->nr_hwavail >= lim)) { + if (netmap_verbose) + D("rx ring full on %s", ifp->if_xname); + break; + } + j = kring->nr_hwcur + kring->nr_hwavail; + if (j > lim) + j -= kring->nkr_num_slots; + slot = &ring->slot[j]; + ND("send %d %d bytes at %s:%d", i, ft[i].len, dst_ifp->if_xname, j); + pkt_copy(ft[i].buf, NMB(slot), ft[i].len); + slot->len = ft[i].len; + kring->nr_hwavail++; + sent++; + } + if (locked) { + ND("sent %d on %s", sent, 
dst_ifp->if_xname); + if (sent) + selwakeuppri(&kring->si, PI_NET); + na->nm_lock(dst_ifp, NETMAP_RX_UNLOCK, 0); + } + } + return 0; +} + +/* + * main dispatch routine + */ +static int +bdg_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +{ + struct netmap_adapter *na = NA(ifp); + struct netmap_kring *kring = &na->tx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int i, j, k, lim = kring->nkr_num_slots - 1; + struct nm_bdg_fwd *ft = (struct nm_bdg_fwd *)(ifp + 1); + int ft_i; /* position in the forwarding table */ + + k = ring->cur; + if (k > lim) + return netmap_ring_reinit(kring); + if (do_lock) + na->nm_lock(ifp, NETMAP_TX_LOCK, ring_nr); + + if (netmap_bridge <= 0) { /* testing only */ + j = k; // used all + goto done; + } + if (netmap_bridge > NM_BDG_BATCH) + netmap_bridge = NM_BDG_BATCH; + + ft_i = 0; /* start from 0 */ + for (j = kring->nr_hwcur; likely(j != k); j = unlikely(j == lim) ? 0 : j+1) { + struct netmap_slot *slot = &ring->slot[j]; + int len = ft[ft_i].len = slot->len; + char *buf = ft[ft_i].buf = NMB(slot); + + prefetch(buf); + if (unlikely(len < 14)) + continue; + if (unlikely(++ft_i == netmap_bridge)) + ft_i = nm_bdg_flush(ft, ft_i, ifp); + } + if (ft_i) + ft_i = nm_bdg_flush(ft, ft_i, ifp); + /* count how many packets we sent */ + i = k - j; + if (i < 0) + i += kring->nkr_num_slots; + kring->nr_hwavail = kring->nkr_num_slots - 1 - i; + if (j != k) + D("early break at %d/ %d, avail %d", j, k, kring->nr_hwavail); + +done: + kring->nr_hwcur = j; + ring->avail = kring->nr_hwavail; + if (do_lock) + na->nm_lock(ifp, NETMAP_TX_UNLOCK, ring_nr); + + if (netmap_verbose) + D("%s ring %d lock %d", ifp->if_xname, ring_nr, do_lock); + return 0; +} + +static int +bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int do_lock) +{ + struct netmap_adapter *na = NA(ifp); + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + int j, n, lim = kring->nkr_num_slots - 1; + u_int k = ring->cur, 
resvd = ring->reserved; + + ND("%s ring %d lock %d avail %d", + ifp->if_xname, ring_nr, do_lock, kring->nr_hwavail); + + if (k > lim) + return netmap_ring_reinit(kring); + if (do_lock) + na->nm_lock(ifp, NETMAP_RX_LOCK, ring_nr); + + /* skip past packets that userspace has released */ + j = kring->nr_hwcur; /* netmap ring index */ + if (resvd > 0) { + if (resvd + ring->avail >= lim + 1) { + D("XXX invalid reserve/avail %d %d", resvd, ring->avail); + ring->reserved = resvd = 0; // XXX panic... + } + k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; + } + + if (j != k) { /* userspace has released some packets. */ + n = k - j; + if (n < 0) + n += kring->nkr_num_slots; + ND("userspace releases %d packets", n); + for (n = 0; likely(j != k); n++) { + struct netmap_slot *slot = &ring->slot[j]; + void *addr = NMB(slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + if (do_lock) + na->nm_lock(ifp, NETMAP_RX_UNLOCK, ring_nr); + return netmap_ring_reinit(kring); + } + /* decrease refcount for buffer */ + + slot->flags &= ~NS_BUF_CHANGED; + j = unlikely(j == lim) ? 0 : j + 1; + } + kring->nr_hwavail -= n; + kring->nr_hwcur = k; + } + /* tell userspace that there are new packets */ + ring->avail = kring->nr_hwavail - resvd; + + if (do_lock) + na->nm_lock(ifp, NETMAP_RX_UNLOCK, ring_nr); + return 0; +} + +static void +bdg_netmap_attach(struct ifnet *ifp) +{ + struct netmap_adapter na; + + ND("attaching virtual bridge"); + bzero(&na, sizeof(na)); + + na.ifp = ifp; + na.separate_locks = 1; + na.num_tx_desc = NM_BRIDGE_RINGSIZE; + na.num_rx_desc = NM_BRIDGE_RINGSIZE; + na.nm_txsync = bdg_netmap_txsync; + na.nm_rxsync = bdg_netmap_rxsync; + na.nm_register = bdg_netmap_reg; + netmap_attach(&na, 1); +} + +#endif /* NM_BRIDGE */ static struct cdev *netmap_dev; /* /dev/netmap character device. 
*/ @@ -1235,6 +1902,14 @@ netmap_init(void) (int)(nm_mem->nm_totalsize >> 20)); netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660, "netmap"); + +#ifdef NM_BRIDGE + { + int i; + for (i = 0; i < NM_BRIDGES; i++) + mtx_init(&nm_bridges[i].bdg_lock, "bdg lock", "bdg_lock", MTX_DEF); + } +#endif return (error); } @@ -1253,6 +1928,7 @@ netmap_fini(void) } +#ifdef __FreeBSD__ /* * Kernel entry point. * @@ -1284,3 +1960,4 @@ netmap_loader(__unused struct module *module, int event, __unused void *arg) DEV_MODULE(netmap, netmap_loader, NULL); +#endif /* __FreeBSD__ */ diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h index a1ac925b1c5..60ef7356cc4 100644 --- a/sys/dev/netmap/netmap_kern.h +++ b/sys/dev/netmap/netmap_kern.h @@ -25,7 +25,7 @@ /* * $FreeBSD$ - * $Id: netmap_kern.h 10602 2012-02-21 16:47:55Z luigi $ + * $Id: netmap_kern.h 11343 2012-07-03 09:08:38Z luigi $ * * The header contains the definitions of constants and function * prototypes used only in kernelspace. @@ -37,6 +37,9 @@ #define NETMAP_MEM2 // use the new memory allocator #if defined(__FreeBSD__) +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + #define NM_LOCK_T struct mtx #define NM_SELINFO_T struct selinfo #define MBUF_LEN(m) ((m)->m_pkthdr.len) @@ -46,6 +49,33 @@ #define NM_SELINFO_T wait_queue_head_t #define MBUF_LEN(m) ((m)->len) #define NM_SEND_UP(ifp, m) netif_rx(m) + +#ifndef DEV_NETMAP +#define DEV_NETMAP +#endif + +/* + * IFCAP_NETMAP goes into net_device's flags (if_capabilities) + * and priv_flags (if_capenable). The latter used to be 16 bits + * up to linux 2.6.36, so we need to use a 16 bit value on older + * platforms and tolerate the clash with IFF_DYNAMIC and IFF_BRIDGE_PORT. 
+ * For the 32-bit value, 0x100000 (bit 20) has no clashes up to 3.3.1 + */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) +#define IFCAP_NETMAP 0x8000 +#else +#define IFCAP_NETMAP 0x100000 +#endif + +#elif defined (__APPLE__) +#warning apple support is experimental +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#define NM_LOCK_T IOLock * +#define NM_SELINFO_T struct selinfo +#define MBUF_LEN(m) ((m)->m_pkthdr.len) +#define NM_SEND_UP(ifp, m) ((ifp)->if_input)(ifp, m) + #else #error unsupported platform #endif @@ -64,6 +94,9 @@ MALLOC_DECLARE(M_NETMAP); __FUNCTION__, __LINE__, ##__VA_ARGS__); \ } while (0) +#ifndef IFF_NETMAP /* XXX is it really needed ? */ +#define IFF_NETMAP 0x20000 +#endif struct netmap_adapter; /* @@ -150,8 +183,11 @@ struct netmap_adapter { void (*nm_lock)(struct ifnet *, int what, u_int ringid); int (*nm_txsync)(struct ifnet *, u_int ring, int lock); int (*nm_rxsync)(struct ifnet *, u_int ring, int lock); + + int bdg_port; #ifdef linux struct net_device_ops nm_ndo; + int if_refcount; // XXX additions for bridge #endif /* linux */ }; @@ -240,6 +276,7 @@ enum { /* verbose flags */ #define NA(_ifp) ((struct netmap_adapter *)WNA(_ifp)) +#ifdef __FreeBSD__ /* Callback invoked by the dma machinery after a successfull dmamap_load */ static void netmap_dmamap_cb(__unused void *arg, __unused bus_dma_segment_t * segs, __unused int nseg, __unused int error) @@ -267,6 +304,48 @@ netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map, void *buf) netmap_dmamap_cb, NULL, BUS_DMA_NOWAIT); } } +#else /* linux */ + +/* + * XXX How do we redefine these functions: + * + * on linux we need + * dma_map_single(&pdev->dev, virt_addr, len, direction) + * dma_unmap_single(&adapter->pdev->dev, phys_addr, len, direction + * The len can be implicit (on netmap it is NETMAP_BUF_SIZE) + * unfortunately the direction is not, so we need to change + * something to have a cross API + */ +#define netmap_load_map(_t, _m, _b) 
+#define netmap_reload_map(_t, _m, _b) +#if 0 + struct e1000_buffer *buffer_info = &tx_ring->buffer_info[l]; + /* set time_stamp *before* dma to help avoid a possible race */ + buffer_info->time_stamp = jiffies; + buffer_info->mapped_as_page = false; + buffer_info->length = len; + //buffer_info->next_to_watch = l; + /* reload dma map */ + dma_unmap_single(&adapter->pdev->dev, buffer_info->dma, + NETMAP_BUF_SIZE, DMA_TO_DEVICE); + buffer_info->dma = dma_map_single(&adapter->pdev->dev, + addr, NETMAP_BUF_SIZE, DMA_TO_DEVICE); + + if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) { + D("dma mapping error"); + /* goto dma_error; See e1000_put_txbuf() */ + /* XXX reset */ + } + tx_desc->buffer_addr = htole64(buffer_info->dma); //XXX + +#endif + +/* + * The bus_dmamap_sync() can be one of wmb() or rmb() depending on direction. + */ +#define bus_dmamap_sync(_a, _b, _c) + +#endif /* linux */ /* * functions to map NIC to KRING indexes (n2k) and vice versa (k2n) @@ -322,7 +401,7 @@ static inline void * NMB(struct netmap_slot *slot) { uint32_t i = slot->buf_idx; - return (i >= netmap_total_buffers) ? NMB_VA(0) : NMB_VA(i); + return (unlikely(i >= netmap_total_buffers)) ? NMB_VA(0) : NMB_VA(i); } static inline void * @@ -341,4 +420,6 @@ PNMB(struct netmap_slot *slot, uint64_t *pp) /* default functions to handle rx/tx interrupts */ int netmap_rx_irq(struct ifnet *, int, int *); #define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL) + +extern int netmap_copy; #endif /* _NET_NETMAP_KERN_H_ */ diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c index 76ef62e4ee7..b44d9f48718 100644 --- a/sys/dev/netmap/netmap_mem2.c +++ b/sys/dev/netmap/netmap_mem2.c @@ -679,11 +679,11 @@ netmap_if_new(const char *ifname, struct netmap_adapter *na) #ifdef linux // XXX initialize the selrecord structs. 
for (i = 0; i < ntx; i++) - init_waitqueue_head(&na->rx_rings[i].si); - for (i = 0; i < nrx; i++) init_waitqueue_head(&na->tx_rings[i].si); - init_waitqueue_head(&na->rx_si); + for (i = 0; i < nrx; i++) + init_waitqueue_head(&na->rx_rings[i].si); init_waitqueue_head(&na->tx_si); + init_waitqueue_head(&na->rx_si); #endif final: /*