From e682d02e1273fc088c0827ba67a5414befed321f Mon Sep 17 00:00:00 2001 From: Navdeep Parhar Date: Fri, 17 Aug 2012 00:49:29 +0000 Subject: [PATCH] Support for TCP DDP (Direct Data Placement) in the T4 TOE module. Basically, this is automatic rx zero copy when feasible. TCP payload is DMA'd directly into the userspace buffer described by the uio submitted in soreceive by an application. - Works with sockets that are being handled by the TCP offload engine of a T4 chip (you need t4_tom.ko module loaded after cxgbe, and an "ifconfig +toe" on the cxgbe interface). - Does not require any modification to the application. - Not enabled by default. Use hw.t4nex.<X>.toe.ddp="1" to enable it. --- sys/dev/cxgbe/common/t4_hw.h | 2 + sys/dev/cxgbe/common/t4_msg.h | 13 + sys/dev/cxgbe/offload.h | 11 +- sys/dev/cxgbe/tom/t4_connect.c | 11 +- sys/dev/cxgbe/tom/t4_cpl_io.c | 163 ++++- sys/dev/cxgbe/tom/t4_ddp.c | 1223 ++++++++++++++++++++++++++++++++ sys/dev/cxgbe/tom/t4_listen.c | 24 +- sys/dev/cxgbe/tom/t4_tom.c | 22 + sys/dev/cxgbe/tom/t4_tom.h | 69 +- sys/modules/cxgbe/tom/Makefile | 2 +- 10 files changed, 1487 insertions(+), 53 deletions(-) create mode 100644 sys/dev/cxgbe/tom/t4_ddp.c diff --git a/sys/dev/cxgbe/common/t4_hw.h b/sys/dev/cxgbe/common/t4_hw.h index b93734e8de3..256a7d5e988 100644 --- a/sys/dev/cxgbe/common/t4_hw.h +++ b/sys/dev/cxgbe/common/t4_hw.h @@ -161,10 +161,12 @@ struct pagepod { #define S_PPOD_TAG 6 #define M_PPOD_TAG 0xFFFFFF #define V_PPOD_TAG(x) ((x) << S_PPOD_TAG) +#define G_PPOD_TAG(x) (((x) >> S_PPOD_TAG) & M_PPOD_TAG) #define S_PPOD_PGSZ 30 #define M_PPOD_PGSZ 0x3 #define V_PPOD_PGSZ(x) ((x) << S_PPOD_PGSZ) +#define G_PPOD_PGSZ(x) (((x) >> S_PPOD_PGSZ) & M_PPOD_PGSZ) #define S_PPOD_TID 32 #define M_PPOD_TID 0xFFFFFF diff --git a/sys/dev/cxgbe/common/t4_msg.h b/sys/dev/cxgbe/common/t4_msg.h index d6c01e2c65e..5bd3cef6d65 100644 --- a/sys/dev/cxgbe/common/t4_msg.h +++ b/sys/dev/cxgbe/common/t4_msg.h @@ -792,6 +792,14 @@ struct cpl_set_tcb_field { 
__be64 val; }; +struct cpl_set_tcb_field_core { + union opcode_tid ot; + __be16 reply_ctrl; + __be16 word_cookie; + __be64 mask; + __be64 val; +}; + /* cpl_set_tcb_field.word_cookie fields */ #define S_WORD 0 #define M_WORD 0x1F @@ -1376,6 +1384,11 @@ struct cpl_rx_data_ack { __be32 credit_dack; }; +struct cpl_rx_data_ack_core { + union opcode_tid ot; + __be32 credit_dack; +}; + /* cpl_rx_data_ack.ack_seq fields */ #define S_RX_CREDITS 0 #define M_RX_CREDITS 0x3FFFFFF diff --git a/sys/dev/cxgbe/offload.h b/sys/dev/cxgbe/offload.h index 1ae9f1fa53c..ced15a62388 100644 --- a/sys/dev/cxgbe/offload.h +++ b/sys/dev/cxgbe/offload.h @@ -31,13 +31,16 @@ #ifndef __T4_OFFLOAD_H__ #define __T4_OFFLOAD_H__ -#define INIT_ULPTX_WR(w, wrlen, atomic, tid) do { \ - (w)->wr.wr_hi = htonl(V_FW_WR_OP(FW_ULPTX_WR) | V_FW_WR_ATOMIC(atomic)); \ - (w)->wr.wr_mid = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(wrlen, 16)) | \ +#define INIT_ULPTX_WRH(w, wrlen, atomic, tid) do { \ + (w)->wr_hi = htonl(V_FW_WR_OP(FW_ULPTX_WR) | V_FW_WR_ATOMIC(atomic)); \ + (w)->wr_mid = htonl(V_FW_WR_LEN16(DIV_ROUND_UP(wrlen, 16)) | \ V_FW_WR_FLOWID(tid)); \ - (w)->wr.wr_lo = cpu_to_be64(0); \ + (w)->wr_lo = cpu_to_be64(0); \ } while (0) +#define INIT_ULPTX_WR(w, wrlen, atomic, tid) \ + INIT_ULPTX_WRH(&((w)->wr), wrlen, atomic, tid) + #define INIT_TP_WR(w, tid) do { \ (w)->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) | \ V_FW_WR_IMMDLEN(sizeof(*w) - sizeof(w->wr))); \ diff --git a/sys/dev/cxgbe/tom/t4_connect.c b/sys/dev/cxgbe/tom/t4_connect.c index bc591716aea..b48a971cec4 100644 --- a/sys/dev/cxgbe/tom/t4_connect.c +++ b/sys/dev/cxgbe/tom/t4_connect.c @@ -247,10 +247,14 @@ calc_opt2a(struct socket *so) opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(M_RX_COALESCE); opt2 |= F_RSS_QUEUE_VALID | V_RSS_QUEUE(toep->ofld_rxq->iq.abs_id); +#ifdef USE_DDP_RX_FLOW_CONTROL + if (toep->ulp_mode == ULP_MODE_TCPDDP) + opt2 |= F_RX_FC_VALID | F_RX_FC_DDP; +#endif + return (htobe32(opt2)); } - void t4_init_connect_cpl_handlers(struct adapter 
*sc) { @@ -320,7 +324,10 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, toep->tid = atid; toep->l2te = e; - toep->ulp_mode = ULP_MODE_NONE; + if (sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0) + set_tcpddp_ulp_mode(toep); + else + toep->ulp_mode = ULP_MODE_NONE; SOCKBUF_LOCK(&so->so_rcv); /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ toep->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ); diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c index 384fc85aa03..829649ca1f3 100644 --- a/sys/dev/cxgbe/tom/t4_cpl_io.c +++ b/sys/dev/cxgbe/tom/t4_cpl_io.c @@ -53,6 +53,7 @@ __FBSDID("$FreeBSD$"); #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" +#include "common/t4_tcb.h" #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" @@ -299,12 +300,14 @@ make_established(struct toepcb *toep, uint32_t snd_isn, uint32_t rcv_isn, } static int -send_rx_credits(struct adapter *sc, struct toepcb *toep, uint32_t credits) +send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits) { struct wrqe *wr; struct cpl_rx_data_ack *req; uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); + KASSERT(credits >= 0, ("%s: %d credits", __func__, credits)); + wr = alloc_wrqe(sizeof(*req), toep->ctrlq); if (wr == NULL) return (0); @@ -323,25 +326,28 @@ t4_rcvd(struct toedev *tod, struct tcpcb *tp) struct adapter *sc = tod->tod_softc; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; - struct sockbuf *so_rcv = &so->so_rcv; + struct sockbuf *sb = &so->so_rcv; struct toepcb *toep = tp->t_toe; - int must_send; + int credits; INP_WLOCK_ASSERT(inp); - SOCKBUF_LOCK(so_rcv); - KASSERT(toep->enqueued >= so_rcv->sb_cc, - ("%s: so_rcv->sb_cc > enqueued", __func__)); - toep->rx_credits += toep->enqueued - so_rcv->sb_cc; - toep->enqueued = so_rcv->sb_cc; - SOCKBUF_UNLOCK(so_rcv); + SOCKBUF_LOCK(sb); + KASSERT(toep->sb_cc >= sb->sb_cc, + ("%s: sb %p has more data (%d) than last 
time (%d).", + __func__, sb, sb->sb_cc, toep->sb_cc)); + toep->rx_credits += toep->sb_cc - sb->sb_cc; + toep->sb_cc = sb->sb_cc; + credits = toep->rx_credits; + SOCKBUF_UNLOCK(sb); - must_send = toep->rx_credits + 16384 >= tp->rcv_wnd; - if (must_send || toep->rx_credits >= 15 * 1024) { - int credits; + if (credits > 0 && + (credits + 16384 >= tp->rcv_wnd || credits >= 15 * 1024)) { - credits = send_rx_credits(sc, toep, toep->rx_credits); + credits = send_rx_credits(sc, toep, credits); + SOCKBUF_LOCK(sb); toep->rx_credits -= credits; + SOCKBUF_UNLOCK(sb); tp->rcv_wnd += credits; tp->rcv_adv += credits; } @@ -537,7 +543,8 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep) KASSERT(toepcb_flag(toep, TPF_FLOWC_WR_SENT), ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid)); - if (toep->ulp_mode != ULP_MODE_NONE) + if (__predict_false(toep->ulp_mode != ULP_MODE_NONE && + toep->ulp_mode != ULP_MODE_TCPDDP)) CXGBE_UNIMPLEMENTED("ulp_mode"); /* @@ -765,7 +772,8 @@ do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) struct toepcb *toep = lookup_tid(sc, tid); struct inpcb *inp = toep->inp; struct tcpcb *tp = NULL; - struct socket *so = NULL; + struct socket *so; + struct sockbuf *sb; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif @@ -785,10 +793,35 @@ do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) if (toepcb_flag(toep, TPF_ABORT_SHUTDOWN)) goto done; - so = inp->inp_socket; - - socantrcvmore(so); tp->rcv_nxt++; /* FIN */ + + so = inp->inp_socket; + sb = &so->so_rcv; + SOCKBUF_LOCK(sb); + if (__predict_false(toep->ddp_flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE))) { + m = m_get(M_NOWAIT, MT_DATA); + if (m == NULL) + CXGBE_UNIMPLEMENTED("mbuf alloc failure"); + + m->m_len = be32toh(cpl->rcv_nxt) - tp->rcv_nxt; + m->m_flags |= M_DDP; /* Data is already where it should be */ + m->m_data = "nothing to see here"; + tp->rcv_nxt = be32toh(cpl->rcv_nxt); + + toep->ddp_flags 
&= ~(DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE); + + KASSERT(toep->sb_cc >= sb->sb_cc, + ("%s: sb %p has more data (%d) than last time (%d).", + __func__, sb, sb->sb_cc, toep->sb_cc)); + toep->rx_credits += toep->sb_cc - sb->sb_cc; +#ifdef USE_DDP_RX_FLOW_CONTROL + toep->rx_credits -= m->m_len; /* adjust for F_RX_FC_DDP */ +#endif + sbappendstream_locked(sb, m); + toep->sb_cc = sb->sb_cc; + } + socantrcvmore_locked(so); /* unlocks the sockbuf */ + KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt), ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt, be32toh(cpl->rcv_nxt))); @@ -1046,7 +1079,8 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) struct inpcb *inp = toep->inp; struct tcpcb *tp; struct socket *so; - struct sockbuf *so_rcv; + struct sockbuf *sb; + int len; if (__predict_false(toepcb_flag(toep, TPF_SYNQE))) { /* @@ -1064,11 +1098,12 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) /* strip off CPL header */ m_adj(m, sizeof(*cpl)); + len = m->m_pkthdr.len; INP_WLOCK(inp); if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x", - __func__, tid, m->m_pkthdr.len, inp->inp_flags); + __func__, tid, len, inp->inp_flags); INP_WUNLOCK(inp); m_freem(m); return (0); @@ -1084,21 +1119,20 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) } #endif - tp->rcv_nxt += m->m_pkthdr.len; - KASSERT(tp->rcv_wnd >= m->m_pkthdr.len, - ("%s: negative window size", __func__)); - tp->rcv_wnd -= m->m_pkthdr.len; + tp->rcv_nxt += len; + KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__)); + tp->rcv_wnd -= len; tp->t_rcvtime = ticks; so = inp_inpcbtosocket(inp); - so_rcv = &so->so_rcv; - SOCKBUF_LOCK(so_rcv); + sb = &so->so_rcv; + SOCKBUF_LOCK(sb); - if (__predict_false(so_rcv->sb_state & SBS_CANTRCVMORE)) { + if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) { CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)", - __func__, tid, 
m->m_pkthdr.len); + __func__, tid, len); m_freem(m); - SOCKBUF_UNLOCK(so_rcv); + SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); INP_INFO_WLOCK(&V_tcbinfo); @@ -1112,23 +1146,76 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) } /* receive buffer autosize */ - if (so_rcv->sb_flags & SB_AUTOSIZE && + if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autorcvbuf && - so_rcv->sb_hiwat < V_tcp_autorcvbuf_max && - m->m_pkthdr.len > (sbspace(so_rcv) / 8 * 7)) { - unsigned int hiwat = so_rcv->sb_hiwat; + sb->sb_hiwat < V_tcp_autorcvbuf_max && + len > (sbspace(sb) / 8 * 7)) { + unsigned int hiwat = sb->sb_hiwat; unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max); - if (!sbreserve_locked(so_rcv, newsize, so, NULL)) - so_rcv->sb_flags &= ~SB_AUTOSIZE; + if (!sbreserve_locked(sb, newsize, so, NULL)) + sb->sb_flags &= ~SB_AUTOSIZE; else toep->rx_credits += newsize - hiwat; } - toep->enqueued += m->m_pkthdr.len; - sbappendstream_locked(so_rcv, m); + + if (toep->ulp_mode == ULP_MODE_TCPDDP) { + int changed = !(toep->ddp_flags & DDP_ON) ^ cpl->ddp_off; + + if (changed) { + if (__predict_false(!(toep->ddp_flags & DDP_SC_REQ))) { + /* XXX: handle this if legitimate */ + panic("%s: unexpected DDP state change %d", + __func__, cpl->ddp_off); + } + toep->ddp_flags ^= DDP_ON | DDP_SC_REQ; + } + + if ((toep->ddp_flags & DDP_OK) == 0 && + time_uptime >= toep->ddp_disabled + DDP_RETRY_WAIT) { + toep->ddp_score = DDP_LOW_SCORE; + toep->ddp_flags |= DDP_OK; + CTR3(KTR_CXGBE, "%s: tid %u DDP_OK @ %u", + __func__, tid, time_uptime); + } + + if (toep->ddp_flags & DDP_ON) { + + /* + * CPL_RX_DATA with DDP on can only be an indicate. Ask + * soreceive to post a buffer or disable DDP. The + * payload that arrived in this indicate is appended to + * the socket buffer as usual. 
+ */ + +#if 0 + CTR5(KTR_CXGBE, + "%s: tid %u (0x%x) DDP indicate (seq 0x%x, len %d)", + __func__, tid, toep->flags, be32toh(cpl->seq), len); +#endif + sb->sb_flags |= SB_DDP_INDICATE; + } else if ((toep->ddp_flags & (DDP_OK|DDP_SC_REQ)) == DDP_OK && + tp->rcv_wnd > DDP_RSVD_WIN && len >= sc->tt.ddp_thres) { + + /* + * DDP allowed but isn't on (and a request to switch it + * on isn't pending either), and conditions are ripe for + * it to work. Switch it on. + */ + + enable_ddp(sc, toep); + } + } + + KASSERT(toep->sb_cc >= sb->sb_cc, + ("%s: sb %p has more data (%d) than last time (%d).", + __func__, sb, sb->sb_cc, toep->sb_cc)); + toep->rx_credits += toep->sb_cc - sb->sb_cc; + sbappendstream_locked(sb, m); + toep->sb_cc = sb->sb_cc; sorwakeup_locked(so); - SOCKBUF_UNLOCK_ASSERT(so_rcv); + SOCKBUF_UNLOCK_ASSERT(sb); INP_WUNLOCK(inp); return (0); diff --git a/sys/dev/cxgbe/tom/t4_ddp.c b/sys/dev/cxgbe/tom/t4_ddp.c new file mode 100644 index 00000000000..38607a30e40 --- /dev/null +++ b/sys/dev/cxgbe/tom/t4_ddp.c @@ -0,0 +1,1223 @@ +/*- + * Copyright (c) 2012 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define TCPSTATES +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef TCP_OFFLOAD +#include "common/common.h" +#include "common/t4_msg.h" +#include "common/t4_regs.h" +#include "common/t4_tcb.h" +#include "tom/t4_tom.h" + +#define PPOD_SZ(n) ((n) * sizeof(struct pagepod)) +#define PPOD_SIZE (PPOD_SZ(1)) + +/* XXX: must match A_ULP_RX_TDDP_PSZ */ +static int t4_ddp_pgsz[] = {4096, 4096 << 2, 4096 << 4, 4096 << 6}; + +#if 0 +static void +t4_dump_tcb(struct adapter *sc, int tid) +{ + uint32_t tcb_base, off, i, j; + + /* Dump TCB for the tid */ + tcb_base = t4_read_reg(sc, A_TP_CMM_TCB_BASE); + t4_write_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2), + tcb_base + tid * TCB_SIZE); + t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2)); + off = 0; + printf("\n"); + for (i = 0; i < 4; i++) { + uint32_t buf[8]; + for (j = 0; j < 8; j++, off += 4) + buf[j] = htonl(t4_read_reg(sc, MEMWIN2_BASE + off)); + + printf("%08x %08x %08x %08x %08x %08x %08x %08x\n", + buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], + buf[7]); + } +} +#endif + +#define MAX_DDP_BUFFER_SIZE (M_TCB_RX_DDP_BUF0_LEN) +static int +alloc_ppods(struct tom_data *td, int n, struct 
ppod_region *pr) +{ + int ppod; + + KASSERT(n > 0, ("%s: nonsense allocation (%d)", __func__, n)); + + mtx_lock(&td->ppod_lock); + if (n > td->nppods_free) { + mtx_unlock(&td->ppod_lock); + return (-1); + } + + if (td->nppods_free_head >= n) { + td->nppods_free_head -= n; + ppod = td->nppods_free_head; + TAILQ_INSERT_HEAD(&td->ppods, pr, link); + } else { + struct ppod_region *p; + + ppod = td->nppods_free_head; + TAILQ_FOREACH(p, &td->ppods, link) { + ppod += p->used + p->free; + if (n <= p->free) { + ppod -= n; + p->free -= n; + TAILQ_INSERT_AFTER(&td->ppods, p, pr, link); + goto allocated; + } + } + + if (__predict_false(ppod != td->nppods)) { + panic("%s: ppods TAILQ (%p) corrupt." + " At %d instead of %d at the end of the queue.", + __func__, &td->ppods, ppod, td->nppods); + } + + mtx_unlock(&td->ppod_lock); + return (-1); + } + +allocated: + pr->used = n; + pr->free = 0; + td->nppods_free -= n; + mtx_unlock(&td->ppod_lock); + + return (ppod); +} + +static void +free_ppods(struct tom_data *td, struct ppod_region *pr) +{ + struct ppod_region *p; + + KASSERT(pr->used > 0, ("%s: nonsense free (%d)", __func__, pr->used)); + + mtx_lock(&td->ppod_lock); + p = TAILQ_PREV(pr, ppod_head, link); + if (p != NULL) + p->free += pr->used + pr->free; + else + td->nppods_free_head += pr->used + pr->free; + td->nppods_free += pr->used; + KASSERT(td->nppods_free <= td->nppods, + ("%s: nppods_free (%d) > nppods (%d). 
%d freed this time.", + __func__, td->nppods_free, td->nppods, pr->used)); + TAILQ_REMOVE(&td->ppods, pr, link); + mtx_unlock(&td->ppod_lock); +} + +static inline int +pages_to_nppods(int npages, int ddp_pgsz) +{ + int nsegs = npages * PAGE_SIZE / ddp_pgsz; + + return (howmany(nsegs, PPOD_PAGES)); +} + +static void +free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db) +{ + + if (db == NULL) + return; + + if (db->pages) + free(db->pages, M_CXGBE); + + if (db->nppods > 0) + free_ppods(td, &db->ppod_region); + + free(db, M_CXGBE); +} + +void +release_ddp_resources(struct toepcb *toep) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(toep->db); i++) { + if (toep->db[i] != NULL) { + free_ddp_buffer(toep->td, toep->db[i]); + toep->db[i] = NULL; + } + } +} + +/* SET_TCB_FIELD sent as a ULP command looks like this */ +#define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \ + sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core)) + +/* RX_DATA_ACK sent as a ULP command looks like this */ +#define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \ + sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core)) + +static inline void * +mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep, + uint64_t word, uint64_t mask, uint64_t val) +{ + struct ulptx_idata *ulpsc; + struct cpl_set_tcb_field_core *req; + + ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); + ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16)); + + ulpsc = (struct ulptx_idata *)(ulpmc + 1); + ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); + ulpsc->len = htobe32(sizeof(*req)); + + req = (struct cpl_set_tcb_field_core *)(ulpsc + 1); + OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid)); + req->reply_ctrl = htobe16(V_NO_REPLY(1) | + V_QUEUENO(toep->ofld_rxq->iq.abs_id)); + req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0)); + req->mask = htobe64(mask); + req->val = htobe64(val); + + ulpsc = (struct ulptx_idata *)(req + 
1); + if (LEN__SET_TCB_FIELD_ULP % 16) { + ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); + ulpsc->len = htobe32(0); + return (ulpsc + 1); + } + return (ulpsc); +} + +static inline void * +mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep) +{ + struct ulptx_idata *ulpsc; + struct cpl_rx_data_ack_core *req; + + ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); + ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16)); + + ulpsc = (struct ulptx_idata *)(ulpmc + 1); + ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); + ulpsc->len = htobe32(sizeof(*req)); + + req = (struct cpl_rx_data_ack_core *)(ulpsc + 1); + OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid)); + req->credit_dack = htobe32(F_RX_MODULATE_RX); + + ulpsc = (struct ulptx_idata *)(req + 1); + if (LEN__RX_DATA_ACK_ULP % 16) { + ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); + ulpsc->len = htobe32(0); + return (ulpsc + 1); + } + return (ulpsc); +} + +static inline uint64_t +select_ddp_flags(struct socket *so, int flags, int db_idx) +{ + uint64_t ddp_flags = V_TF_DDP_INDICATE_OUT(0); + int waitall = flags & MSG_WAITALL; + int nb = so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO); + + KASSERT(db_idx == 0 || db_idx == 1, + ("%s: bad DDP buffer index %d", __func__, db_idx)); + + if (db_idx == 0) { + ddp_flags |= V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0); + if (waitall) + ddp_flags |= V_TF_DDP_PUSH_DISABLE_0(1); + else if (nb) + ddp_flags |= V_TF_DDP_BUF0_FLUSH(1); + else + ddp_flags |= V_TF_DDP_BUF0_FLUSH(0); + } else { + ddp_flags |= V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1); + if (waitall) + ddp_flags |= V_TF_DDP_PUSH_DISABLE_1(1); + else if (nb) + ddp_flags |= V_TF_DDP_BUF1_FLUSH(1); + else + ddp_flags |= V_TF_DDP_BUF1_FLUSH(0); + } + + return (ddp_flags); +} + +static struct wrqe * +mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx, + int offset, uint64_t ddp_flags) +{ + struct 
ddp_buffer *db = toep->db[db_idx]; + struct wrqe *wr; + struct work_request_hdr *wrh; + struct ulp_txpkt *ulpmc; + int len; + + KASSERT(db_idx == 0 || db_idx == 1, + ("%s: bad DDP buffer index %d", __func__, db_idx)); + + /* + * We'll send a compound work request that has 3 SET_TCB_FIELDs and an + * RX_DATA_ACK (with RX_MODULATE to speed up delivery). + * + * The work request header is 16B and always ends at a 16B boundary. + * The ULPTX master commands that follow must all end at 16B boundaries + * too so we round up the size to 16. + */ + len = sizeof(*wrh) + 3 * roundup(LEN__SET_TCB_FIELD_ULP, 16) + + roundup(LEN__RX_DATA_ACK_ULP, 16); + + wr = alloc_wrqe(len, toep->ctrlq); + if (wr == NULL) + return (NULL); + wrh = wrtod(wr); + INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */ + ulpmc = (struct ulp_txpkt *)(wrh + 1); + + /* Write the buffer's tag */ + ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, + W_TCB_RX_DDP_BUF0_TAG + db_idx, + V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG), + V_TCB_RX_DDP_BUF0_TAG(db->tag)); + + /* Update the current offset in the DDP buffer and its total length */ + if (db_idx == 0) + ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, + W_TCB_RX_DDP_BUF0_OFFSET, + V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | + V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), + V_TCB_RX_DDP_BUF0_OFFSET(offset) | + V_TCB_RX_DDP_BUF0_LEN(db->len)); + else + ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, + W_TCB_RX_DDP_BUF1_OFFSET, + V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | + V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32), + V_TCB_RX_DDP_BUF1_OFFSET(offset) | + V_TCB_RX_DDP_BUF1_LEN((u64)db->len << 32)); + + /* Update DDP flags */ + ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF1_FLUSH(1) | + V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PUSH_DISABLE_1(1) | + V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1) | + V_TF_DDP_ACTIVE_BUF(1) | V_TF_DDP_INDICATE_OUT(1), ddp_flags); + + /* Gratuitous RX_DATA_ACK with 
RX_MODULATE set to speed up delivery. */ + ulpmc = mk_rx_data_ack_ulp(ulpmc, toep); + + return (wr); +} + +static void +discourage_ddp(struct toepcb *toep) +{ + + if (toep->ddp_score && --toep->ddp_score == 0) { + toep->ddp_flags &= ~DDP_OK; + toep->ddp_disabled = time_uptime; + CTR3(KTR_CXGBE, "%s: tid %u !DDP_OK @ %u", + __func__, toep->tid, time_uptime); + } +} + +static int +handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len) +{ + uint32_t report = be32toh(ddp_report); + unsigned int db_flag; + struct inpcb *inp = toep->inp; + struct tcpcb *tp; + struct socket *so; + struct sockbuf *sb; + struct mbuf *m; + + db_flag = report & F_DDP_BUF_IDX ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE; + + if (__predict_false(!(report & F_DDP_INV))) + CXGBE_UNIMPLEMENTED("DDP buffer still valid"); + + INP_WLOCK(inp); + so = inp_inpcbtosocket(inp); + sb = &so->so_rcv; + if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) { + + /* + * XXX: think a bit more. + * tcpcb probably gone, but socket should still be around + * because we always wait for DDP completion in soreceive no + * matter what. Just wake it up and let it clean up. 
+ */ + + CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x", + __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags); + SOCKBUF_LOCK(sb); + goto wakeup; + } + + tp = intotcpcb(inp); + len += be32toh(rcv_nxt) - tp->rcv_nxt; + tp->rcv_nxt += len; + tp->t_rcvtime = ticks; +#ifndef USE_DDP_RX_FLOW_CONTROL + KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__)); + tp->rcv_wnd -= len; +#endif + + m = m_get(M_NOWAIT, MT_DATA); + if (m == NULL) + CXGBE_UNIMPLEMENTED("mbuf alloc failure"); + m->m_len = len; + m->m_flags |= M_DDP; /* Data is already where it should be */ + m->m_data = "nothing to see here"; + + SOCKBUF_LOCK(sb); + if (report & F_DDP_BUF_COMPLETE) + toep->ddp_score = DDP_HIGH_SCORE; + else + discourage_ddp(toep); + + KASSERT(toep->sb_cc >= sb->sb_cc, + ("%s: sb %p has more data (%d) than last time (%d).", + __func__, sb, sb->sb_cc, toep->sb_cc)); + toep->rx_credits += toep->sb_cc - sb->sb_cc; +#ifdef USE_DDP_RX_FLOW_CONTROL + toep->rx_credits -= len; /* adjust for F_RX_FC_DDP */ +#endif + sbappendstream_locked(sb, m); + toep->sb_cc = sb->sb_cc; +wakeup: + KASSERT(toep->ddp_flags & db_flag, + ("%s: DDP buffer not active. 
toep %p, ddp_flags 0x%x, report 0x%x", + __func__, toep, toep->ddp_flags, report)); + toep->ddp_flags &= ~db_flag; + sorwakeup_locked(so); + SOCKBUF_UNLOCK_ASSERT(sb); + + INP_WUNLOCK(inp); + return (0); +} + +#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ + F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\ + F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\ + F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR) + +static int +do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(cpl); + uint32_t vld; + struct toepcb *toep = lookup_tid(sc, tid); + + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); + KASSERT(!toepcb_flag(toep, TPF_SYNQE), + ("%s: toep %p claims to be a synq entry", __func__, toep)); + + vld = be32toh(cpl->ddpvld); + if (__predict_false(vld & DDP_ERR)) { + panic("%s: DDP error 0x%x (tid %d, toep %p)", + __func__, vld, tid, toep); + } + + handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len)); + + return (0); +} + +static int +do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss, + struct mbuf *m) +{ + struct adapter *sc = iq->adapter; + const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1); + unsigned int tid = GET_TID(cpl); + struct toepcb *toep = lookup_tid(sc, tid); + + KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); + KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); + KASSERT(!toepcb_flag(toep, TPF_SYNQE), + ("%s: toep %p claims to be a synq entry", __func__, toep)); + + handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0); + + return (0); +} + +void +enable_ddp(struct adapter *sc, struct toepcb *toep) +{ + + KASSERT((toep->ddp_flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == 
DDP_OK, + ("%s: toep %p has bad ddp_flags 0x%x", + __func__, toep, toep->ddp_flags)); + + CTR3(KTR_CXGBE, "%s: tid %u (time %u)", + __func__, toep->tid, time_uptime); + + toep->ddp_flags |= DDP_SC_REQ; + t4_set_tcb_field(sc, toep, W_TCB_RX_DDP_FLAGS, + V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) | + V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) | + V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1), + V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1)); + t4_set_tcb_field(sc, toep, W_TCB_T_FLAGS, + V_TF_RCV_COALESCE_ENABLE(1), 0); +} + +static inline void +disable_ddp(struct adapter *sc, struct toepcb *toep) +{ + + KASSERT((toep->ddp_flags & (DDP_ON | DDP_SC_REQ)) == DDP_ON, + ("%s: toep %p has bad ddp_flags 0x%x", + __func__, toep, toep->ddp_flags)); + + CTR3(KTR_CXGBE, "%s: tid %u (time %u)", + __func__, toep->tid, time_uptime); + + toep->ddp_flags |= DDP_SC_REQ; + t4_set_tcb_field(sc, toep, W_TCB_T_FLAGS, + V_TF_RCV_COALESCE_ENABLE(1), V_TF_RCV_COALESCE_ENABLE(1)); + t4_set_tcb_field(sc, toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1), + V_TF_DDP_OFF(1)); +} + +static int +hold_uio(struct uio *uio, vm_page_t **ppages, int *pnpages) +{ + struct vm_map *map; + struct iovec *iov; + vm_offset_t start, end; + vm_page_t *pp; + int n; + + KASSERT(uio->uio_iovcnt == 1, + ("%s: uio_iovcnt %d", __func__, uio->uio_iovcnt)); + KASSERT(uio->uio_td->td_proc == curproc, + ("%s: uio proc (%p) is not curproc (%p)", + __func__, uio->uio_td->td_proc, curproc)); + + map = &curproc->p_vmspace->vm_map; + iov = &uio->uio_iov[0]; + start = trunc_page((uintptr_t)iov->iov_base); + end = round_page((vm_offset_t)iov->iov_base + iov->iov_len); + n = howmany(end - start, PAGE_SIZE); + + if (end - start > MAX_DDP_BUFFER_SIZE) + return (E2BIG); + + pp = malloc(n * sizeof(vm_page_t), M_CXGBE, M_NOWAIT); + if (pp == NULL) + return (ENOMEM); + + if (vm_fault_quick_hold_pages(map, (vm_offset_t)iov->iov_base, + iov->iov_len, VM_PROT_WRITE, pp, n) < 0) { + free(pp, M_CXGBE); + return (EFAULT); + } 
+
+	*ppages = pp;
+	*pnpages = n;
+
+	return (0);
+}
+
+/*
+ * Compare an existing DDP buffer with a candidate page list. Returns 0 when
+ * db is not NULL, its page count, offset and length equal the arguments, and
+ * every page has the same physical address as its counterpart in pages[].
+ * Returns 1 in every other case.
+ */
+static int
+bufcmp(struct ddp_buffer *db, vm_page_t *pages, int npages, int offset, int len)
+{
+	int pg;
+
+	/* No buffer, or a different geometry: not the same buffer. */
+	if (db == NULL)
+		return (1);
+	if (db->npages != npages || db->offset != offset || db->len != len)
+		return (1);
+
+	/* Same geometry; the physical pages must match one for one. */
+	for (pg = 0; pg < npages; pg++)
+		if (pages[pg]->phys_addr != db->pages[pg]->phys_addr)
+			return (1);
+
+	return (0);
+}
+
+/*
+ * Highest common factor of n1 and n2 (Euclid's algorithm). If one of the
+ * arguments is 0 the other one is returned.
+ */
+static int
+calculate_hcf(int n1, int n2)
+{
+	int lo, hi, rem;
+
+	/* Start with lo <= hi. */
+	lo = n1 <= n2 ? n1 : n2;
+	hi = n1 <= n2 ? n2 : n1;
+
+	while (lo != 0) {
+		rem = hi % lo;
+		hi = lo;
+		lo = rem;
+	}
+
+	return (hi);
+}
+
+/*
+ * Allocate a ddp_buffer describing pages[0..npages-1] and reserve page pods
+ * for it. Returns NULL if the segment sizes are not a multiple of
+ * t4_ddp_pgsz[0], if the M_NOWAIT allocation fails, or if alloc_ppods() cannot
+ * provide the pods. On success the buffer points at the caller's pages array
+ * (db->pages = pages); the array is not copied.
+ */
+static struct ddp_buffer *
+alloc_ddp_buffer(struct tom_data *td, vm_page_t *pages, int npages, int offset,
+    int len)
+{
+	struct ddp_buffer *db;
+	int pg, run, hcf, idx, ppod, nppods;
+
+	/*
+	 * The DDP page size has nothing to do with the VM page size. Runs of
+	 * physically contiguous pages are merged into segments, and the HCF of
+	 * all segment sizes is computed. The DDP page size used is the
+	 * highest-indexed entry of t4_ddp_pgsz (the four sizes in
+	 * A_ULP_RX_TDDP_PSZ) that evenly divides that HCF.
+	 */
+	hcf = 0;
+	pg = 0;
+	while (pg < npages) {
+		/* Measure the contiguous run that starts at page pg. */
+		run = PAGE_SIZE;
+		for (; pg < npages - 1 &&
+		    pages[pg]->phys_addr + PAGE_SIZE ==
+		    pages[pg + 1]->phys_addr; pg++)
+			run += PAGE_SIZE;
+
+		hcf = calculate_hcf(hcf, run);
+		if (hcf < t4_ddp_pgsz[1]) {
+			/* Give up on anything but index 0, short circuit. */
+			idx = 0;
+			goto have_pgsz;
+		}
+		pg++;	/* step past the last page of this run */
+	}
+
+	if (hcf % t4_ddp_pgsz[0] != 0) {
+		/* Only expected when PAGE_SIZE < 4K. */
+		KASSERT(PAGE_SIZE < 4096,
+		    ("%s: PAGE_SIZE %d, hcf %d", __func__, PAGE_SIZE, hcf));
+		CTR3(KTR_CXGBE, "%s: PAGE_SIZE %d, hcf %d",
+		    __func__, PAGE_SIZE, hcf);
+		return (NULL);
+	}
+
+	/* Walk down from the last table entry; index 0 is the fallback. */
+	idx = ARRAY_SIZE(t4_ddp_pgsz) - 1;
+	while (idx > 0 && hcf % t4_ddp_pgsz[idx] != 0)
+		idx--;
+have_pgsz:
+
+	db = malloc(sizeof(*db), M_CXGBE, M_NOWAIT);
+	if (db == NULL) {
+		CTR1(KTR_CXGBE, "%s: malloc failed.", __func__);
+		return (NULL);
+	}
+
+	/* Reserve the page pods needed for npages at the chosen page size. */
+	nppods = pages_to_nppods(npages, t4_ddp_pgsz[idx]);
+	ppod = alloc_ppods(td, nppods, &db->ppod_region);
+	if (ppod < 0) {
+		free(db, M_CXGBE);
+		CTR4(KTR_CXGBE, "%s: no pods, nppods %d, resid %d, pgsz %d",
+		    __func__, nppods, len, t4_ddp_pgsz[idx]);
+		return (NULL);
+	}
+
+	KASSERT(idx <= M_PPOD_PGSZ && ppod <= M_PPOD_TAG,
+	    ("%s: DDP pgsz_idx = %d, ppod = %d", __func__, idx, ppod));
+
+	/* The tag encodes both the page size index and the first pod. */
+	db->pages = pages;
+	db->npages = npages;
+	db->nppods = nppods;
+	db->offset = offset;
+	db->len = len;
+	db->tag = V_PPOD_PGSZ(idx) | V_PPOD_TAG(ppod);
+
+	CTR6(KTR_CXGBE, "New DDP buffer. "
+	    "ddp_pgsz %d, ppod 0x%x, npages %d, nppods %d, offset %d, len %d",
+	    t4_ddp_pgsz[idx], ppod, db->npages, db->nppods, db->offset,
+	    db->len);
+
+	return (db);
+}
+
+#define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE)
+
+static int
+write_page_pods(struct adapter *sc, struct toepcb *toep, struct ddp_buffer *db)
+{
+	struct wrqe *wr;
+	struct ulp_mem_io *ulpmc;
+	struct ulptx_idata *ulpsc;
+	struct pagepod *ppod;
+	int i, j, k, n, chunk, len, ddp_pgsz, idx, ppod_addr;
+
+	ddp_pgsz = t4_ddp_pgsz[G_PPOD_PGSZ(db->tag)];
+	ppod_addr = sc->vres.ddp.start + G_PPOD_TAG(db->tag) * PPOD_SIZE;
+	for (i = 0; i < db->nppods; ppod_addr += chunk) {
+
+		/* How many page pods are we writing in this cycle */
+		n = min(db->nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
+		chunk = PPOD_SZ(n);
+		len = roundup(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
+
+		wr = alloc_wrqe(len, toep->ctrlq);
+		if (wr == NULL)
+			return (ENOMEM);	/* ok to just bail out */
+		ulpmc = wrtod(wr);
+
+		INIT_ULPTX_WR(ulpmc, len, 0, 0);
+		ulpmc->cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE) |
+		    F_ULP_MEMIO_ORDER);
+		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
+		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
+		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
+
+		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
+		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
+		ulpsc->len = htobe32(chunk);
+
+		ppod = (struct pagepod *)(ulpsc + 1);
+		for (j = 0; j < n; i++, j++, ppod++) {
+			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
+			    V_PPOD_TID(toep->tid) | db->tag);
+			ppod->len_offset = htobe64(V_PPOD_LEN(db->len) |
+			    V_PPOD_OFST(db->offset));
+			ppod->rsvd = 0;
+			idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
+			for (k = 0; k < ARRAY_SIZE(ppod->addr); k++) {
+				if (idx < db->npages) {
+					ppod->addr[k] =
+					    htobe64(db->pages[idx]->phys_addr);
+					idx += ddp_pgsz / PAGE_SIZE;
+				} else
+					ppod->addr[k] = 0;
+#if 0
+				CTR5(KTR_CXGBE,
+				    "%s: tid %d ppod[%d]->addr[%d] = %p",
+				    __func__, toep->tid, i,
k,
+				    htobe64(ppod->addr[k]));
+#endif
+			}
+
+		}
+
+		t4_wrq_tx(sc, wr);
+	}
+
+	return (0);
+}
+
+/*
+ * Reuse, or allocate (and program the page pods for) a new DDP buffer.
+ *
+ * Returns the index into toep->db[] of the buffer to use, or -1 on failure.
+ *
+ * What happens to the caller's pages array:
+ * - reuse: the array is freed here; the matching buffer keeps its own array.
+ * - alloc_ddp_buffer() failure: the pages are unheld and the array is freed.
+ * - write_page_pods() failure: the pages are unheld and the new buffer is
+ *   released with free_ddp_buffer(). NOTE(review): the array is not freed
+ *   explicitly on this path; presumably free_ddp_buffer() releases db->pages
+ *   -- confirm.
+ * - success: the array is referenced by the new buffer.
+ */
+static int
+select_ddp_buffer(struct adapter *sc, struct toepcb *toep, vm_page_t *pages,
+    int npages, int db_off, int db_len)
+{
+	struct ddp_buffer *db;
+	struct tom_data *td = sc->tom_softc;
+	int i, empty_slot = -1;
+
+	/* Try to reuse; remember the first empty slot along the way. */
+	for (i = 0; i < ARRAY_SIZE(toep->db); i++) {
+		if (bufcmp(toep->db[i], pages, npages, db_off, db_len) == 0) {
+			free(pages, M_CXGBE);
+			return (i);	/* pages still held */
+		} else if (toep->db[i] == NULL && empty_slot < 0)
+			empty_slot = i;
+	}
+
+	/* Allocate new buffer, write its page pods. */
+	db = alloc_ddp_buffer(td, pages, npages, db_off, db_len);
+	if (db == NULL) {
+		vm_page_unhold_pages(pages, npages);
+		free(pages, M_CXGBE);
+		return (-1);
+	}
+	if (write_page_pods(sc, toep, db) != 0) {
+		vm_page_unhold_pages(pages, npages);
+		free_ddp_buffer(td, db);
+		return (-1);
+	}
+
+	/* No empty slot: evict one of the existing buffers, chosen at random. */
+	i = empty_slot;
+	if (i < 0) {
+		i = arc4random() % ARRAY_SIZE(toep->db);
+		free_ddp_buffer(td, toep->db[i]);
+	}
+	toep->db[i] = db;
+
+	CTR5(KTR_CXGBE, "%s: tid %d, DDP buffer[%d] = %p (tag 0x%x)",
+	    __func__, toep->tid, i, db, db->tag);
+
+	return (i);
+}
+
+/*
+ * For every page of the buffer, with the page lock taken: wire the page and
+ * then drop one hold on it.
+ */
+static void
+wire_ddp_buffer(struct ddp_buffer *db)
+{
+	int i;
+	vm_page_t p;
+
+	for (i = 0; i < db->npages; i++) {
+		p = db->pages[i];
+		vm_page_lock(p);
+		vm_page_wire(p);
+		vm_page_unhold(p);
+		vm_page_unlock(p);
+	}
+}
+
+/*
+ * Unwire every page of the buffer, taking each page's lock around the call.
+ */
+static void
+unwire_ddp_buffer(struct ddp_buffer *db)
+{
+	int i;
+	vm_page_t p;
+
+	for (i = 0; i < db->npages; i++) {
+		p = db->pages[i];
+		vm_page_lock(p);
+		vm_page_unwire(p, 0);
+		vm_page_unlock(p);
+	}
+}
+
+/*
+ * Drop the hold on every page of the buffer.
+ */
+static inline void
+unhold_ddp_buffer(struct ddp_buffer *db)
+{
+
+	vm_page_unhold_pages(db->pages, db->npages);
+}
+
+/*
+ * Try to receive directly into the buffer described by uio using DDP.
+ *
+ * Must be called with the so_rcv sockbuf lock held (asserted below). A
+ * non-zero 'error' argument is one of the conditions that rule DDP out.
+ *
+ * If DDP is started, the function sleeps until the buffer's ACTIVE flag in
+ * toep->ddp_flags is cleared and returns the result of the initial sbwait().
+ * If DDP is not attempted (or setup fails) it calls disable_ddp() and
+ * discourage_ddp() and returns 0. SB_DDP_INDICATE is cleared either way.
+ */
+static int
+handle_ddp(struct socket *so, struct uio *uio, int flags, int error)
+{
+	struct sockbuf *sb = &so->so_rcv;
+	struct tcpcb *tp = so_sototcpcb(so);
+	struct toepcb *toep = tp->t_toe;
+	struct adapter *sc = td_adapter(toep->td);
+	vm_page_t *pages;
+	int npages, db_idx, rc, buf_flag;
+	struct ddp_buffer *db;
+	struct wrqe *wr;
+	uint64_t ddp_flags;
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+#if 0
+	if (sb->sb_cc + sc->tt.ddp_thres > uio->uio_resid) {
+		CTR4(KTR_CXGBE, "%s: sb_cc %d, threshold %d, resid %d",
+		    __func__, sb->sb_cc, sc->tt.ddp_thres, uio->uio_resid);
+	}
+#endif
+
+	/*
+	 * No DDP if the sockbuf already holds enough data to satisfy the
+	 * request, the request is below the DDP threshold or above the maximum
+	 * DDP buffer size, the uio has more than one iovec, the receive is
+	 * non-blocking, or an error / end of stream has already been noted.
+	 */
+	/* XXX: too eager to disable DDP, could handle NBIO better than this. */
+	if (sb->sb_cc >= uio->uio_resid || uio->uio_resid < sc->tt.ddp_thres ||
+	    uio->uio_resid > MAX_DDP_BUFFER_SIZE || uio->uio_iovcnt > 1 ||
+	    so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO) ||
+	    error || so->so_error || sb->sb_state & SBS_CANTRCVMORE)
+		goto no_ddp;
+
+	/*
+	 * Fault in and then hold the pages of the uio buffers. We'll wire them
+	 * a bit later if everything else works out.
+	 */
+	if (hold_uio(uio, &pages, &npages) != 0)
+		goto no_ddp;
+
+	/*
+	 * Figure out which one of the two DDP buffers to use this time.
+	 */
+	db_idx = select_ddp_buffer(sc, toep, pages, npages,
+	    (uintptr_t)uio->uio_iov->iov_base & PAGE_MASK, uio->uio_resid);
+	pages = NULL;	/* pages either in use elsewhere or unheld + freed */
+	if (db_idx < 0)
+		goto no_ddp;
+	db = toep->db[db_idx];
+	buf_flag = db_idx == 0 ? DDP_BUF0_ACTIVE : DDP_BUF1_ACTIVE;
+
+	/*
+	 * Build the compound work request that tells the chip where to DMA the
+	 * payload.
+	 */
+	ddp_flags = select_ddp_flags(so, flags, db_idx);
+	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, sb->sb_cc, ddp_flags);
+	if (wr == NULL) {
+		/* The buffer stays in toep->db[]; only the page holds go. */
+		unhold_ddp_buffer(db);
+		goto no_ddp;
+	}
+
+	/* Wire the pages and give the chip the go-ahead. */
+	wire_ddp_buffer(db);
+	t4_wrq_tx(sc, wr);
+	sb->sb_flags &= ~SB_DDP_INDICATE;
+	toep->ddp_flags |= buf_flag;
+
+	/*
+	 * Wait for the DDP operation to complete and then unwire the pages.
+	 * The return code from the sbwait will be the final return code of this
+	 * function. But we do need to wait for DDP no matter what.
+	 */
+	rc = sbwait(sb);
+	/* Sleep (no timeout) until buf_flag is cleared in toep->ddp_flags. */
+	while (toep->ddp_flags & buf_flag) {
+		sb->sb_flags |= SB_WAIT;
+		msleep(&sb->sb_cc, &sb->sb_mtx, PSOCK , "sbwait", 0);
+	}
+	unwire_ddp_buffer(db);
+	return (rc);
+no_ddp:
+	disable_ddp(sc, toep);
+	discourage_ddp(toep);
+	sb->sb_flags &= ~SB_DDP_INDICATE;
+	return (0);
+}
+
+/*
+ * Set up the page pod accounting in td and register the DDP CPL handlers.
+ * The number of page pods is the size of the adapter's DDP region
+ * (sc->vres.ddp.size) divided by PPOD_SIZE; all of them start out free.
+ */
+void
+t4_init_ddp(struct adapter *sc, struct tom_data *td)
+{
+	int nppods = sc->vres.ddp.size / PPOD_SIZE;
+
+	td->nppods = nppods;
+	td->nppods_free = nppods;
+	td->nppods_free_head = nppods;
+	TAILQ_INIT(&td->ppods);
+	mtx_init(&td->ppod_lock, "page pods", NULL, MTX_DEF);
+
+	t4_register_cpl_handler(sc, CPL_RX_DATA_DDP, do_rx_data_ddp);
+	t4_register_cpl_handler(sc, CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
+}
+
+/*
+ * Counterpart of t4_init_ddp(): asserts that every page pod is free again and
+ * destroys ppod_lock if it was initialized. The CPL handlers registered by
+ * t4_init_ddp() are not touched here.
+ */
+void
+t4_uninit_ddp(struct adapter *sc __unused, struct tom_data *td)
+{
+
+	KASSERT(td->nppods == td->nppods_free,
+	    ("%s: page pods still in use, nppods = %d, free = %d",
+	    __func__, td->nppods, td->nppods_free));
+
+	if (mtx_initialized(&td->ppod_lock))
+		mtx_destroy(&td->ppod_lock);
+}
+
+/* Helper macros used by t4_soreceive_ddp() below. */
+#define	VNET_SO_ASSERT(so)						\
+	VNET_ASSERT(curvnet != NULL,					\
+	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
+#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
+/*
+ * MSG_OOB receive is not implemented here; the body is only
+ * CXGBE_UNIMPLEMENTED. NOTE(review): there is no return statement, so
+ * CXGBE_UNIMPLEMENTED presumably does not return -- confirm.
+ */
+static int
+soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
+{
+
+	CXGBE_UNIMPLEMENTED(__func__);
+}
+
+/*
+ * Copy an mbuf chain into a uio limited by len if set.
+ *
+ * With len > 0 at most min(uio->uio_resid, len) bytes are moved, otherwise at
+ * most uio->uio_resid bytes. Returns 0 or the error from uiomove().
+ */
+static int
+m_mbuftouio_ddp(struct uio *uio, struct mbuf *m, int len)
+{
+	int error, length, total;
+	int progress = 0;
+
+	if (len > 0)
+		total = min(uio->uio_resid, len);
+	else
+		total = uio->uio_resid;
+
+	/* Fill the uio with data from the mbufs.
*/
+	for (; m != NULL; m = m->m_next) {
+		length = min(m->m_len, total - progress);
+
+		if (m->m_flags & M_DDP) {
+			enum uio_seg segflag = uio->uio_segflg;
+
+			/*
+			 * M_DDP mbuf: run uiomove with UIO_NOCOPY, then put
+			 * the caller's segment flag back. Presumably the
+			 * payload is already in the user buffer and only the
+			 * uio needs to advance -- confirm.
+			 */
+			uio->uio_segflg	= UIO_NOCOPY;
+			error = uiomove(mtod(m, void *), length, uio);
+			uio->uio_segflg	= segflag;
+		} else
+			error = uiomove(mtod(m, void *), length, uio);
+		if (error)
+			return (error);
+
+		progress += length;
+	}
+
+	return (0);
+}
+
+/*
+ * Receive routine for stream sockets that may use DDP. Based on
+ * soreceive_stream() in uipc_socket.c.
+ *
+ * so       - socket to read from; must be SOCK_STREAM (EINVAL otherwise).
+ * psa      - if not NULL, *psa is set to NULL.
+ * uio      - destination of the data; a uio_resid of 0 fails with EINVAL.
+ * mp0      - if not NULL, the data is handed back as an mbuf chain in *mp0
+ *            instead of being copied out through the uio.
+ * controlp - must be NULL (EINVAL otherwise).
+ * flagsp   - optional MSG_* flags; MSG_EOR is masked off. MSG_OOB requests
+ *            are passed to soreceive_rcvoob().
+ *
+ * While the sockbuf has SB_DDP_INDICATE set, handle_ddp() is given one chance
+ * per call (ddp_handled) before data is delivered from the sockbuf.
+ *
+ * Returns 0 or an errno value (EINVAL, ENOTCONN, EAGAIN, ENOBUFS, a pending
+ * so_error, or an error from sblock/sbwait/handle_ddp/uiomove).
+ */
+int
+t4_soreceive_ddp(struct socket *so, struct sockaddr **psa, struct uio *uio,
+    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+	int len = 0, error = 0, flags, oresid, ddp_handled = 0;
+	struct sockbuf *sb;
+	struct mbuf *m, *n = NULL;
+
+	/* We only do stream sockets. */
+	if (so->so_type != SOCK_STREAM)
+		return (EINVAL);
+	if (psa != NULL)
+		*psa = NULL;
+	if (controlp != NULL)
+		return (EINVAL);
+	if (flagsp != NULL)
+		flags = *flagsp &~ MSG_EOR;
+	else
+		flags = 0;
+	if (flags & MSG_OOB)
+		return (soreceive_rcvoob(so, uio, flags));
+	if (mp0 != NULL)
+		*mp0 = NULL;
+
+	sb = &so->so_rcv;
+
+	/*
+	 * Prevent other readers from entering the socket. If sblock() fails
+	 * nothing is locked yet, so return directly: the 'out' path unlocks
+	 * the sockbuf mutex and calls sbunlock(), neither of which is held.
+	 */
+	error = sblock(sb, SBLOCKWAIT(flags));
+	if (error)
+		return (error);
+	SOCKBUF_LOCK(sb);
+
+	/* Easy one, no space to copyout anything. */
+	if (uio->uio_resid == 0) {
+		error = EINVAL;
+		goto out;
+	}
+	oresid = uio->uio_resid;
+
+	/* We will never ever get anything unless we are or were connected. */
+	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
+		error = ENOTCONN;
+		goto out;
+	}
+
+restart:
+	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+
+	if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {
+
+		/* uio should be just as it was at entry */
+		KASSERT(oresid == uio->uio_resid,
+		    ("%s: oresid = %d, uio_resid = %zd, sb_cc = %d",
+		    __func__, oresid, uio->uio_resid, sb->sb_cc));
+
+		error = handle_ddp(so, uio, flags, 0);
+		ddp_handled = 1;
+		if (error)
+			goto out;
+	}
+
+	/* Abort if socket has reported problems. */
+	if (so->so_error) {
+		if (sb->sb_cc > 0)
+			goto deliver;
+		if (oresid > uio->uio_resid)
+			goto out;
+		error = so->so_error;
+		if (!(flags & MSG_PEEK))
+			so->so_error = 0;
+		goto out;
+	}
+
+	/* Door is closed. Deliver what is left, if any. */
+	if (sb->sb_state & SBS_CANTRCVMORE) {
+		if (sb->sb_cc > 0)
+			goto deliver;
+		else
+			goto out;
+	}
+
+	/* Socket buffer is empty and we shall not block. */
+	if (sb->sb_cc == 0 &&
+	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
+		error = EAGAIN;
+		goto out;
+	}
+
+	/*
+	 * Socket buffer got some data that we shall deliver now.
+	 * SS_NBIO is a so_state bit, so it is tested on so->so_state (as in
+	 * the empty-buffer check above) and not on sb->sb_flags.
+	 */
+	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
+	    ((so->so_state & SS_NBIO) ||
+	    (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
+	    sb->sb_cc >= sb->sb_lowat ||
+	    sb->sb_cc >= uio->uio_resid ||
+	    sb->sb_cc >= sb->sb_hiwat) ) {
+		goto deliver;
+	}
+
+	/* On MSG_WAITALL we must wait until all data or error arrives. */
+	if ((flags & MSG_WAITALL) &&
+	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
+		goto deliver;
+
+	/*
+	 * Wait and block until (more) data comes in.
+	 * NB: Drops the sockbuf lock during wait.
+	 */
+	error = sbwait(sb);
+	if (error) {
+		/* Let handle_ddp() see the indication even on this error path. */
+		if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {
+			(void) handle_ddp(so, uio, flags, 1);
+			ddp_handled = 1;
+		}
+		goto out;
+	}
+	goto restart;
+
+deliver:
+	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
+	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
+
+	/* A DDP indication that has not been looked at yet comes first. */
+	if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled)
+		goto restart;
+
+	/* Statistics. */
+	if (uio->uio_td)
+		uio->uio_td->td_ru.ru_msgrcv++;
+
+	/* Fill uio until full or current end of socket buffer is reached. */
+	len = min(uio->uio_resid, sb->sb_cc);
+	if (mp0 != NULL) {
+		/* Dequeue as many mbufs as possible. */
+		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
+			for (*mp0 = m = sb->sb_mb;
+			     m != NULL && m->m_len <= len;
+			     m = m->m_next) {
+				len -= m->m_len;
+				uio->uio_resid -= m->m_len;
+				sbfree(sb, m);
+				n = m;
+			}
+			sb->sb_mb = m;
+			if (sb->sb_mb == NULL)
+				SB_EMPTY_FIXUP(sb);
+			n->m_next = NULL;
+		}
+		/* Copy the remainder. */
+		if (len > 0) {
+			KASSERT(sb->sb_mb != NULL,
+			    ("%s: len > 0 && sb->sb_mb empty", __func__));
+
+			/*
+			 * The copy covers len bytes and may be a chain of
+			 * several mbufs, so account for len and not just the
+			 * length of the first mbuf of the copy.
+			 */
+			m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT);
+			if (m == NULL)
+				len = 0;	/* Don't flush data from sockbuf. */
+			else
+				uio->uio_resid -= len;
+			if (*mp0 != NULL)
+				n->m_next = m;
+			else
+				*mp0 = m;
+			if (*mp0 == NULL) {
+				error = ENOBUFS;
+				goto out;
+			}
+		}
+	} else {
+		/* NB: Must unlock socket buffer as uiomove may sleep. */
+		SOCKBUF_UNLOCK(sb);
+		error = m_mbuftouio_ddp(uio, sb->sb_mb, len);
+		SOCKBUF_LOCK(sb);
+		if (error)
+			goto out;
+	}
+	SBLASTRECORDCHK(sb);
+	SBLASTMBUFCHK(sb);
+
+	/*
+	 * Remove the delivered data from the socket buffer unless we
+	 * were only peeking.
+	 */
+	if (!(flags & MSG_PEEK)) {
+		if (len > 0)
+			sbdrop_locked(sb, len);
+
+		/* Notify protocol that we drained some data. */
+		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
+		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
+		     !(flags & MSG_SOCALLBCK))) {
+			SOCKBUF_UNLOCK(sb);
+			VNET_SO_ASSERT(so);
+			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
+			SOCKBUF_LOCK(sb);
+		}
+	}
+
+	/*
+	 * For MSG_WAITALL we may have to loop again and wait for
+	 * more data to come in.
+	 */
+	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
+		goto restart;
+out:
+	/* Reached only with both the sockbuf mutex and the sblock held. */
+	SOCKBUF_LOCK_ASSERT(sb);
+	SBLASTRECORDCHK(sb);
+	SBLASTMBUFCHK(sb);
+	SOCKBUF_UNLOCK(sb);
+	sbunlock(sb);
+	return (error);
+}
+
+#endif
diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c index 895e57ad073..3e088cfa0c1 100644 --- a/sys/dev/cxgbe/tom/t4_listen.c +++ b/sys/dev/cxgbe/tom/t4_listen.c @@ -881,7 +881,7 @@ t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to) */ static uint32_t calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid, - const struct tcp_options *tcpopt, struct tcphdr *th) + const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode) { uint32_t opt2 = 0; struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid]; @@ -902,6 +902,11 @@ calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid, opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(M_RX_COALESCE); opt2 |= F_RSS_QUEUE_VALID | V_RSS_QUEUE(ofld_rxq->iq.abs_id); +#ifdef USE_DDP_RX_FLOW_CONTROL + if (ulp_mode == ULP_MODE_TCPDDP) + opt2 |= F_RX_FC_VALID | F_RX_FC_DDP; +#endif + return htobe32(opt2); } @@ -985,7 +990,7 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, struct l2t_entry *e = NULL; struct rtentry *rt; struct sockaddr_in nam; - int rscale, mtu_idx, rx_credits, rxqid; + int rscale, mtu_idx, rx_credits, rxqid, ulp_mode; struct synq_entry *synqe = NULL; int reject_reason; uint16_t vid; @@ -1108,9 +1113,13 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, get_qids_from_mbuf(m, NULL, &rxqid); INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid); - rpl->opt0 = calc_opt0(so, pi, e, mtu_idx, rscale, rx_credits, - ULP_MODE_NONE); - rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th); + if (sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0) { + ulp_mode = ULP_MODE_TCPDDP; + synqe_set_flag(synqe, TPF_SYNQE_TCPDDP); + } else + ulp_mode = ULP_MODE_NONE; + rpl->opt0 = calc_opt0(so, pi, e, mtu_idx, rscale, rx_credits,
ulp_mode); + rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode); synqe->tid = tid; synqe->lctx = lctx; @@ -1313,7 +1322,10 @@ reset: } toep->tid = tid; toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx]; - toep->ulp_mode = ULP_MODE_NONE; + if (synqe_flag(synqe, TPF_SYNQE_TCPDDP)) + set_tcpddp_ulp_mode(toep); + else + toep->ulp_mode = ULP_MODE_NONE; /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ toep->rx_credits = synqe->rcv_bufsize; diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c index c6e9a1ffb2e..43b8e4812c5 100644 --- a/sys/dev/cxgbe/tom/t4_tom.c +++ b/sys/dev/cxgbe/tom/t4_tom.c @@ -55,6 +55,9 @@ __FBSDID("$FreeBSD$"); #include "tom/t4_tom_l2t.h" #include "tom/t4_tom.h" +static struct protosw ddp_protosw; +static struct pr_usrreqs ddp_usrreqs; + /* Module ops */ static int t4_tom_mod_load(void); static int t4_tom_mod_unload(void); @@ -167,6 +170,8 @@ offload_socket(struct socket *so, struct toepcb *toep) sb = &so->so_rcv; SOCKBUF_LOCK(sb); sb->sb_flags |= SB_NOCOALESCE; + if (toep->ulp_mode == ULP_MODE_TCPDDP) + so->so_proto = &ddp_protosw; SOCKBUF_UNLOCK(sb); /* Update TCP PCB */ @@ -235,6 +240,9 @@ release_offload_resources(struct toepcb *toep) CTR4(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p)", __func__, toep, tid, toep->l2te); + if (toep->ulp_mode == ULP_MODE_TCPDDP) + release_ddp_resources(toep); + if (toep->l2te) t4_l2t_release(toep->l2te); @@ -568,6 +576,8 @@ free_tom_data(struct adapter *sc, struct tom_data *td) ("%s: lctx hash table is not empty.", __func__)); t4_uninit_l2t_cpl_handlers(sc); + t4_uninit_cpl_io_handlers(sc); + t4_uninit_ddp(sc, td); if (td->listen_mask != 0) hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask); @@ -613,6 +623,8 @@ t4_tom_activate(struct adapter *sc) if (rc != 0) goto done; + t4_init_ddp(sc, td); + /* CPL handlers */ t4_init_connect_cpl_handlers(sc); t4_init_l2t_cpl_handlers(sc); @@ -688,6 +700,16 @@ static int t4_tom_mod_load(void) { int rc; + struct protosw *tcp_protosw; 
+ + tcp_protosw = pffindproto(PF_INET, IPPROTO_TCP, SOCK_STREAM); + if (tcp_protosw == NULL) + return (ENOPROTOOPT); + + bcopy(tcp_protosw, &ddp_protosw, sizeof(ddp_protosw)); + bcopy(tcp_protosw->pr_usrreqs, &ddp_usrreqs, sizeof(ddp_usrreqs)); + ddp_usrreqs.pru_soreceive = t4_soreceive_ddp; + ddp_protosw.pr_usrreqs = &ddp_usrreqs; rc = t4_register_uld(&tom_uld_info); if (rc != 0) diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h index 0078ada3f82..98c9bfa8f30 100644 --- a/sys/dev/cxgbe/tom/t4_tom.h +++ b/sys/dev/cxgbe/tom/t4_tom.h @@ -46,6 +46,13 @@ */ #define MAX_RCV_WND ((1U << 27) - 1) +#define DDP_RSVD_WIN (16 * 1024U) +#define SB_DDP_INDICATE SB_IN_TOE /* soreceive must respond to indicate */ + +#define M_DDP M_PROTO1 + +#define USE_DDP_RX_FLOW_CONTROL + /* TOE PCB flags */ enum { TPF_ATTACHED, /* a tcpcb refers to this toepcb */ @@ -58,6 +65,15 @@ enum { TPF_CPL_PENDING, /* haven't received the last CPL */ TPF_SYNQE, /* synq_entry, not really a toepcb */ TPF_SYNQE_NEEDFREE, /* synq_entry was allocated externally */ + TPF_SYNQE_TCPDDP, /* ulp_mode TCPDDP when toepcb is allocated */ +}; + +enum { + DDP_OK = (1 << 0), /* OK to turn on DDP */ + DDP_SC_REQ = (1 << 1), /* state change (on/off) requested */ + DDP_ON = (1 << 2), /* DDP is turned on */ + DDP_BUF0_ACTIVE = (1 << 3), /* buffer 0 in use (not invalidated) */ + DDP_BUF1_ACTIVE = (1 << 4), /* buffer 1 in use (not invalidated) */ }; struct ofld_tx_sdesc { @@ -65,6 +81,22 @@ struct ofld_tx_sdesc { uint8_t tx_credits; /* firmware tx credits (unit is 16B) */ }; +struct ppod_region { + TAILQ_ENTRY(ppod_region) link; + int used; /* # of pods used by this region */ + int free; /* # of contiguous pods free right after this region */ +}; + +struct ddp_buffer { + uint32_t tag; /* includes color, page pod addr, and DDP page size */ + int nppods; + int offset; + int len; + struct ppod_region ppod_region; + int npages; + vm_page_t *pages; +}; + struct toepcb { TAILQ_ENTRY(toepcb) link; /* toep_list 
*/ unsigned int flags; /* miscellaneous flags */ @@ -77,11 +109,16 @@ struct toepcb { struct l2t_entry *l2te; /* L2 table entry used by this connection */ int tid; /* Connection identifier */ unsigned int tx_credits;/* tx WR credits (in 16 byte units) remaining */ - unsigned int enqueued; /* # of bytes added to so_rcv (not yet read) */ + unsigned int sb_cc; /* last noted value of so_rcv->sb_cc */ int rx_credits; /* rx credits (in bytes) to be returned to hw */ unsigned int ulp_mode; /* ULP mode */ + unsigned int ddp_flags; + struct ddp_buffer *db[2]; + time_t ddp_disabled; + uint8_t ddp_score; + /* Tx software descriptor */ uint8_t txsd_total; uint8_t txsd_pidx; @@ -118,6 +155,19 @@ toepcb_clr_flag(struct toepcb *toep, int flag) clrbit(&toep->flags, flag); } +#define DDP_RETRY_WAIT 5 /* seconds to wait before re-enabling DDP */ +#define DDP_LOW_SCORE 1 +#define DDP_HIGH_SCORE 3 + +static inline void +set_tcpddp_ulp_mode(struct toepcb *toep) +{ + + toep->ulp_mode = ULP_MODE_TCPDDP; + toep->ddp_flags = DDP_OK; + toep->ddp_score = DDP_LOW_SCORE; +} + /* * Compressed state for embryonic connections for a listener. Barely fits in * 64B, try not to grow it further. 
@@ -171,6 +221,8 @@ struct listen_ctx { TAILQ_HEAD(, synq_entry) synq; }; +TAILQ_HEAD(ppod_head, ppod_region); + struct tom_data { struct toedev tod; @@ -178,10 +230,16 @@ struct tom_data { struct mtx toep_list_lock; TAILQ_HEAD(, toepcb) toep_list; + struct mtx lctx_hash_lock; LIST_HEAD(, listen_ctx) *listen_hash; u_long listen_mask; int lctx_count; /* # of lctx in the hash table */ - struct mtx lctx_hash_lock; + + struct mtx ppod_lock; + int nppods; + int nppods_free; /* # of available ppods */ + int nppods_free_head; /* # of available ppods at the beginning */ + struct ppod_head ppods; }; static inline struct tom_data * @@ -248,4 +306,11 @@ int t4_send_rst(struct toedev *, struct tcpcb *); void t4_set_tcb_field(struct adapter *, struct toepcb *, uint16_t, uint64_t, uint64_t); +/* t4_ddp.c */ +void t4_init_ddp(struct adapter *, struct tom_data *); +void t4_uninit_ddp(struct adapter *, struct tom_data *); +int t4_soreceive_ddp(struct socket *, struct sockaddr **, struct uio *, + struct mbuf **, struct mbuf **, int *); +void enable_ddp(struct adapter *, struct toepcb *toep); +void release_ddp_resources(struct toepcb *toep); #endif diff --git a/sys/modules/cxgbe/tom/Makefile b/sys/modules/cxgbe/tom/Makefile index d17ce6f2800..72721be6412 100644 --- a/sys/modules/cxgbe/tom/Makefile +++ b/sys/modules/cxgbe/tom/Makefile @@ -8,7 +8,7 @@ CXGBE = ${.CURDIR}/../../../dev/cxgbe .PATH: ${CXGBE}/tom KMOD = t4_tom -SRCS = t4_tom.c t4_connect.c t4_listen.c t4_cpl_io.c t4_tom_l2t.c +SRCS = t4_tom.c t4_connect.c t4_listen.c t4_cpl_io.c t4_tom_l2t.c t4_ddp.c SRCS+= device_if.h bus_if.h pci_if.h SRCS+= opt_inet.h