diff --git a/share/man/man4/cxgbe.4 b/share/man/man4/cxgbe.4 index a3bb9690dec..91c59fe96a8 100644 --- a/share/man/man4/cxgbe.4 +++ b/share/man/man4/cxgbe.4 @@ -351,6 +351,17 @@ This tunable is for specialized applications only and should not be used in normal operation. The capabilities for which hardware resources have been reserved are listed in dev..X.*caps sysctls. +.It Va hw.cxgbe.tx_vm_wr +Setting this to 1 instructs the driver to use VM work requests to transmit data. +This lets PF interfaces transmit frames to VF interfaces over the internal +switch in the ASIC. +Note that the +.Xr cxgbev 4 +VF driver always uses VM work requests and is not affected by this tunable. +The default value is 0 and should be changed only if PF and VF interfaces need +to communicate with each other. +Different interfaces can be assigned different values using the +dev..X.tx_vm_wr sysctl when the interface is administratively down. .El .Sh SUPPORT For general information and support, diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h index 1e70d7b674d..b5dead62aaf 100644 --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -118,6 +118,8 @@ enum { SGE_MAX_WR_NDESC = SGE_MAX_WR_LEN / EQ_ESIZE, /* max WR size in desc */ TX_SGL_SEGS = 39, TX_SGL_SEGS_TSO = 38, + TX_SGL_SEGS_VM = 38, + TX_SGL_SEGS_VM_TSO = 37, TX_SGL_SEGS_EO_TSO = 30, /* XXX: lower for IPv6. */ TX_SGL_SEGS_VXLAN_TSO = 37, TX_WR_FLITS = SGE_MAX_WR_LEN / 8 @@ -173,6 +175,7 @@ enum { DOOMED = (1 << 0), VI_INIT_DONE = (1 << 1), VI_SYSCTL_CTX = (1 << 2), + TX_USES_VM_WR = (1 << 3), /* adapter debug_flags */ DF_DUMP_MBOX = (1 << 0), /* Log all mbox cmd/rpl. */ @@ -1267,7 +1270,7 @@ void t4_intr_evt(void *); void t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct wrqe *); void t4_update_fl_bufsize(struct ifnet *); struct mbuf *alloc_wr_mbuf(int, int); -int parse_pkt(struct adapter *, struct mbuf **); +int parse_pkt(struct mbuf **, bool); void *start_wrq_wr(struct sge_wrq *, int, struct wrq_cookie *); void commit_wrq_wr(struct sge_wrq *, void *, struct wrq_cookie *); int tnl_cong(struct port_info *, int); diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c index 042eadfbe2a..676111374d9 100644 --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -591,6 +591,10 @@ static int t4_panic_on_fatal_err = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, panic_on_fatal_err, CTLFLAG_RDTUN, &t4_panic_on_fatal_err, 0, "panic on fatal errors"); +static int t4_tx_vm_wr = 0; +SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_vm_wr, CTLFLAG_RWTUN, &t4_tx_vm_wr, 0, + "Use VM work requests to transmit packets."); + #ifdef TCP_OFFLOAD /* * TOE tunables. @@ -695,6 +699,7 @@ static int sysctl_bitfield_8b(SYSCTL_HANDLER_ARGS); static int sysctl_bitfield_16b(SYSCTL_HANDLER_ARGS); static int sysctl_btphy(SYSCTL_HANDLER_ARGS); static int sysctl_noflowq(SYSCTL_HANDLER_ARGS); +static int sysctl_tx_vm_wr(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS); static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS); @@ -1723,6 +1728,8 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi) vi->xact_addr_filt = -1; callout_init(&vi->tick, 1); + if (sc->flags & IS_VF || t4_tx_vm_wr != 0) + vi->flags |= TX_USES_VM_WR; /* Allocate an ifnet and set it up */ ifp = if_alloc_dev(IFT_ETHER, dev); @@ -1775,7 +1782,10 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi) #endif ifp->if_hw_tsomax = IP_MAXPACKET; - ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_TSO; + if (vi->flags & TX_USES_VM_WR) + ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_VM_TSO; + else + ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_TSO; #ifdef RATELIMIT if (is_ethoffload(sc) && vi->nofldtxq != 0) ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_EO_TSO; @@ -2174,7 +2184,7 @@ cxgbe_transmit(struct ifnet *ifp, struct mbuf *m) { struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; - struct adapter *sc = pi->adapter; + struct adapter *sc; struct sge_txq *txq; #ifdef RATELIMIT struct cxgbe_snd_tag *cst; @@ -2194,7 +2204,7 @@ cxgbe_transmit(struct ifnet *ifp, struct mbuf *m) return (ENETDOWN); } - rc = parse_pkt(sc, &m); + rc = parse_pkt(&m, vi->flags & TX_USES_VM_WR); if (__predict_false(rc != 0)) { MPASS(m == NULL); /* was freed already */ atomic_add_int(&pi->tx_parse_error, 1); /* rare, atomic is ok */ @@ -2209,6 +2219,7 @@ cxgbe_transmit(struct ifnet *ifp, struct mbuf *m) #endif /* Select a txq. */ + sc = vi->adapter; txq = &sc->sge.txq[vi->first_txq]; if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) txq += ((m->m_pkthdr.flowid % (vi->ntxq - vi->rsrv_noflowq)) + @@ -6818,6 +6829,16 @@ vi_sysctls(struct vi_info *vi) "Reserve queue 0 for non-flowid packets"); } + if (vi->adapter->flags & IS_VF) { + MPASS(vi->flags & TX_USES_VM_WR); + SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "tx_vm_wr", CTLFLAG_RD, + NULL, 1, "use VM work requests for transmit"); + } else { + SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tx_vm_wr", + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, vi, 0, + sysctl_tx_vm_wr, "I", "use VM work requestes for transmit"); + } + #ifdef TCP_OFFLOAD if (vi->nofldrxq != 0) { SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldrxq", CTLFLAG_RD, @@ -7248,6 +7269,63 @@ sysctl_noflowq(SYSCTL_HANDLER_ARGS) return (rc); } +static int +sysctl_tx_vm_wr(SYSCTL_HANDLER_ARGS) +{ + struct vi_info *vi = arg1; + struct adapter *sc = vi->adapter; + int rc, val, i; + + MPASS(!(sc->flags & IS_VF)); + + val = vi->flags & TX_USES_VM_WR ? 1 : 0; + rc = sysctl_handle_int(oidp, &val, 0, req); + if (rc != 0 || req->newptr == NULL) + return (rc); + + if (val != 0 && val != 1) + return (EINVAL); + + rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, + "t4txvm"); + if (rc) + return (rc); + if (vi->ifp->if_drv_flags & IFF_DRV_RUNNING) { + /* + * We don't want parse_pkt to run with one setting (VF or PF) + * and then eth_tx to see a different setting but still use + * stale information calculated by parse_pkt. + */ + rc = EBUSY; + } else { + struct port_info *pi = vi->pi; + struct sge_txq *txq; + uint32_t ctrl0; + uint8_t npkt = sc->params.max_pkts_per_eth_tx_pkts_wr; + + if (val) { + vi->flags |= TX_USES_VM_WR; + vi->ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_VM_TSO; + ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) | + V_TXPKT_INTF(pi->tx_chan)); + if (!(sc->flags & IS_VF)) + npkt--; + } else { + vi->flags &= ~TX_USES_VM_WR; + vi->ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_TSO; + ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) | + V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) | + V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld)); + } + for_each_txq(vi, i, txq) { + txq->cpl_ctrl0 = ctrl0; + txq->txp.max_npkt = npkt; + } + } + end_synchronized_op(sc, LOCK_HELD); + return (rc); +} + static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS) { diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c index b45243aefdc..a1e5f28df63 100644 --- a/sys/dev/cxgbe/t4_sge.c +++ b/sys/dev/cxgbe/t4_sge.c @@ -278,7 +278,7 @@ static void add_fl_to_sfl(struct adapter *, struct sge_fl *); static inline void get_pkt_gl(struct mbuf *, struct sglist *); static inline u_int txpkt_len16(u_int, const u_int); static inline u_int txpkt_vm_len16(u_int, const u_int); -static inline void calculate_mbuf_len16(struct adapter *, struct mbuf *); +static inline void calculate_mbuf_len16(struct mbuf *, bool); static inline u_int txpkts0_len16(u_int); static inline u_int txpkts1_len16(void); static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int); @@ -2323,6 +2323,8 @@ set_mbuf_len16(struct mbuf *m, uint8_t len16) { M_ASSERTPKTHDR(m); + if (!(mbuf_cflags(m) & MC_TLS)) + MPASS(len16 > 0 && len16 <= SGE_MAX_WR_LEN / 16); m->m_pkthdr.PH_loc.eight[0] = len16; } @@ -2657,9 +2659,15 @@ count_mbuf_nsegs(struct mbuf *m, int skip, uint8_t *cflags) * The maximum number of segments that can fit in a WR. */ static int -max_nsegs_allowed(struct mbuf *m) +max_nsegs_allowed(struct mbuf *m, bool vm_wr) { + if (vm_wr) { + if (needs_tso(m)) + return (TX_SGL_SEGS_VM_TSO); + return (TX_SGL_SEGS_VM); + } + if (needs_tso(m)) { if (needs_vxlan_tso(m)) return (TX_SGL_SEGS_VXLAN_TSO); @@ -2676,7 +2684,7 @@ max_nsegs_allowed(struct mbuf *m) * b) it may get defragged up if the gather list is too long for the hardware. */ int -parse_pkt(struct adapter *sc, struct mbuf **mp) +parse_pkt(struct mbuf **mp, bool vm_wr) { struct mbuf *m0 = *mp, *m; int rc, nsegs, defragged = 0, offset; @@ -2728,7 +2736,7 @@ restart: return (0); } #endif - if (nsegs > max_nsegs_allowed(m0)) { + if (nsegs > max_nsegs_allowed(m0, vm_wr)) { if (defragged++ > 0) { rc = EFBIG; goto fail; @@ -2756,7 +2764,7 @@ restart: } set_mbuf_nsegs(m0, nsegs); set_mbuf_cflags(m0, cflags); - calculate_mbuf_len16(sc, m0); + calculate_mbuf_len16(m0, vm_wr); #ifdef RATELIMIT /* @@ -3168,7 +3176,7 @@ eth_tx(struct mp_ring *r, u_int cidx, u_int pidx, bool *coalescing) if (txp->npkt > 0 || remaining > 1 || txp->score > 3 || atomic_load_int(&txq->eq.equiq) != 0) { - if (sc->flags & IS_VF) + if (vi->flags & TX_USES_VM_WR) rc = add_to_txpkts_vf(sc, txq, m0, avail, &snd); else rc = add_to_txpkts_pf(sc, txq, m0, avail, &snd); @@ -3184,14 +3192,14 @@ eth_tx(struct mp_ring *r, u_int cidx, u_int pidx, bool *coalescing) if (txp->score++ >= 10) txp->score = 10; MPASS(avail >= tx_len16_to_desc(txp->len16)); - if (sc->flags & IS_VF) + if (vi->flags & TX_USES_VM_WR) n = write_txpkts_vm_wr(sc, txq); else n = write_txpkts_wr(sc, txq); } else { MPASS(avail >= tx_len16_to_desc(mbuf_len16(txp->mb[0]))); - if (sc->flags & IS_VF) + if (vi->flags & TX_USES_VM_WR) n = write_txpkt_vm_wr(sc, txq, txp->mb[0]); else @@ -3241,7 +3249,7 @@ eth_tx(struct mp_ring *r, u_int cidx, u_int pidx, bool *coalescing) #endif } else { ETHER_BPF_MTAP(ifp, m0); - if (sc->flags & IS_VF) + if (vi->flags & TX_USES_VM_WR) n = write_txpkt_vm_wr(sc, txq, m0); else n = write_txpkt_wr(sc, txq, m0, avail); @@ -3285,14 +3293,14 @@ send_txpkts: ETHER_BPF_MTAP(ifp, txp->mb[i]); if (txp->npkt > 1) { MPASS(avail >= tx_len16_to_desc(txp->len16)); - if (sc->flags & IS_VF) + if (vi->flags & TX_USES_VM_WR) n = write_txpkts_vm_wr(sc, txq); else n = write_txpkts_wr(sc, txq); } else { MPASS(avail >= tx_len16_to_desc(mbuf_len16(txp->mb[0]))); - if (sc->flags & IS_VF) + if (vi->flags & TX_USES_VM_WR) n = write_txpkt_vm_wr(sc, txq, txp->mb[0]); else n = write_txpkt_wr(sc, txq, txp->mb[0], avail); @@ -4431,7 +4439,7 @@ alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx, TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq); txq->ifp = vi->ifp; txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK); - if (sc->flags & IS_VF) + if (vi->flags & TX_USES_VM_WR) txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) | V_TXPKT_INTF(pi->tx_chan)); else @@ -4447,6 +4455,8 @@ alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx, MPASS(nitems(txp->mb) >= sc->params.max_pkts_per_eth_tx_pkts_wr); txq->txp.max_npkt = min(nitems(txp->mb), sc->params.max_pkts_per_eth_tx_pkts_wr); + if (vi->flags & TX_USES_VM_WR && !(sc->flags & IS_VF)) + txq->txp.max_npkt--; snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, @@ -4796,9 +4806,11 @@ get_pkt_gl(struct mbuf *m, struct sglist *gl) KASSERT(gl->sg_nseg == mbuf_nsegs(m), ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m, mbuf_nsegs(m), gl->sg_nseg)); - KASSERT(gl->sg_nseg > 0 && gl->sg_nseg <= max_nsegs_allowed(m), +#if 0 /* vm_wr not readily available here. */ + KASSERT(gl->sg_nseg > 0 && gl->sg_nseg <= max_nsegs_allowed(m, vm_wr), ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__, - gl->sg_nseg, max_nsegs_allowed(m))); + gl->sg_nseg, max_nsegs_allowed(m, vm_wr))); +#endif } /* @@ -4839,12 +4851,12 @@ txpkt_vm_len16(u_int nsegs, const u_int extra) } static inline void -calculate_mbuf_len16(struct adapter *sc, struct mbuf *m) +calculate_mbuf_len16(struct mbuf *m, bool vm_wr) { const int lso = sizeof(struct cpl_tx_pkt_lso_core); const int tnl_lso = sizeof(struct cpl_tx_tnl_lso); - if (sc->flags & IS_VF) { + if (vm_wr) { if (needs_tso(m)) set_mbuf_len16(m, txpkt_vm_len16(mbuf_nsegs(m), lso)); else @@ -5348,8 +5360,6 @@ add_to_txpkts_vf(struct adapter *sc, struct sge_txq *txq, struct mbuf *m, { struct txpkts *txp = &txq->txp; - MPASS(sc->flags & IS_VF); - /* Cannot have TSO and coalesce at the same time. */ if (cannot_use_txpkts(m)) { cannot_coalesce: