From e44c1887fdb21e349a778007fcd6d72d9cf45552 Mon Sep 17 00:00:00 2001
From: Steven Hartland <smh@FreeBSD.org>
Date: Mon, 10 Apr 2017 08:19:35 +0000
Subject: [PATCH] Use estimated RTT for receive buffer auto resizing instead of
 timestamps

Switched from using timestamps to RTT estimates when performing TCP receive
buffer auto resizing, as not all hosts support / enable TCP timestamps.

Disabled reset of receive buffer auto scaling when not in bulk receive mode,
which gives an extra 20% performance increase.

Also extracted auto resizing to a common method shared between standard and
fastpath modules.

With this change, AWS S3 downloads at ~17ms latency on a 1Gbps connection
jump from ~3MB/s to ~100MB/s using the default settings.

Reviewed by:    lstewart, gnn
MFC after:      2 weeks
Relnotes:       Yes
Sponsored by:   Multiplay
Differential Revision:  https://reviews.freebsd.org/D9668
---
 sys/netinet/in_kdtrace.c          |   8 ++
 sys/netinet/in_kdtrace.h          |   1 +
 sys/netinet/tcp_input.c           | 123 +++++++++++++++---------------
 sys/netinet/tcp_output.c          |  10 ++-
 sys/netinet/tcp_stacks/fastpath.c |  62 +--------------
 sys/netinet/tcp_var.h             |   2 +
 6 files changed, 82 insertions(+), 124 deletions(-)

diff --git a/sys/netinet/in_kdtrace.c b/sys/netinet/in_kdtrace.c
index df217ee6ab4..79ae2599bc2 100644
--- a/sys/netinet/in_kdtrace.c
+++ b/sys/netinet/in_kdtrace.c
@@ -132,6 +132,14 @@ SDT_PROBE_DEFINE6_XLATE(tcp, , , state__change,
     "void *", "void *",
     "int", "tcplsinfo_t *");
 
+SDT_PROBE_DEFINE6_XLATE(tcp, , , receive__autoresize,
+    "void *", "void *",			/* arg0: NULL in tcp_autorcvbuf() */
+    "struct tcpcb *", "csinfo_t *",	/* arg1: tp */
+    "struct mbuf *", "ipinfo_t *",	/* arg2: m */
+    "struct tcpcb *", "tcpsinfo_t *" ,	/* arg3: tp */
+    "struct tcphdr *", "tcpinfoh_t *",	/* arg4: th */
+    "int", "int");			/* arg5: newsize (0 = no resize) */
+
 SDT_PROBE_DEFINE5_XLATE(udp, , , receive,
     "void *", "pktinfo_t *",
     "struct inpcb *", "csinfo_t *",
diff --git a/sys/netinet/in_kdtrace.h b/sys/netinet/in_kdtrace.h
index a36991ef14c..0825c7dff35 100644
--- a/sys/netinet/in_kdtrace.h
+++ b/sys/netinet/in_kdtrace.h
@@ -65,6 +65,7 @@ SDT_PROBE_DECLARE(tcp, , , debug__input);
 SDT_PROBE_DECLARE(tcp, , , debug__output);
 SDT_PROBE_DECLARE(tcp, , , debug__user);
 SDT_PROBE_DECLARE(tcp, , , debug__drop);
+SDT_PROBE_DECLARE(tcp, , , receive__autoresize);
 
 SDT_PROBE_DECLARE(udp, , , receive);
 SDT_PROBE_DECLARE(udp, , , send);
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 85410b67948..367e45ed1ac 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -1486,6 +1486,68 @@ drop:
 	return (IPPROTO_DONE);
 }
 
+/*
+ * Automatic sizing of receive socket buffer.  Often the receive
+ * buffer size is not optimally adjusted to the actual network
+ * conditions at hand (delay bandwidth product).  Setting the
+ * buffer size too small limits throughput on links with high
+ * bandwidth and high delay (eg. trans-continental/oceanic links).
+ *
+ * On the receive side the socket buffer memory is only rarely
+ * used to any significant extent, so we can scale it much more
+ * aggressively.  If the space is actually used and we run out of
+ * kernel memory we can simply drop the new segments; the sender
+ * will retransmit.  An oversized buffer only costs memory if the
+ * application doesn't read() from the socket or packet loss or
+ * reordering makes use of the reassembly queue.
+ *
+ * The criteria to step up the receive buffer one notch are:
+ *  1. Application has not set receive buffer size with
+ *     SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
+ *  2. an RTT estimate (t_srtt) exists and more than one such RTT
+ *     has elapsed since rfbuf_ts was armed on the output path;
+ *  3. bytes counted in rfbuf_cnt during that interval exceed
+ *     seven eighths of the current socket buffer size;
+ *  4. receive buffer size has not hit maximal automatic size;
+ *
+ * This algorithm does one step per RTT at most and only if
+ * we receive a bulk stream w/o packet losses or reorderings.
+ * Shrinking the buffer during idle times is not necessary as
+ * it doesn't consume any memory when idle.
+ *
+ * Returns the proposed new size, or 0 for no change; so_rcv is not
+ * resized here.  Until the interval expires tlen is only counted.
+ *
+ * TODO: Only step up if the application is actually serving
+ * the buffer to better manage the socket buffer resources.
+ */
+int
+tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
+    struct tcpcb *tp, int tlen)
+{
+	int newsize = 0;
+
+	if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) &&
+	    tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
+	    TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) >
+	    (tp->t_srtt >> TCP_RTT_SHIFT)) {
+		if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) &&
+		    so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) {
+			newsize = min(so->so_rcv.sb_hiwat +
+			    V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max);
+		}
+		TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize);
+
+		/* Interval done: clear the stamp and byte count. */
+		tp->rfbuf_ts = 0;
+		tp->rfbuf_cnt = 0;
+	} else {
+		tp->rfbuf_cnt += tlen;	/* still measuring: add up */
+	}
+
+	return (newsize);
+}
+
 void
 tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
@@ -1849,62 +1911,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 #endif
 			TCP_PROBE3(debug__input, tp, th, m);
 
-		/*
-		 * Automatic sizing of receive socket buffer.  Often the send
-		 * buffer size is not optimally adjusted to the actual network
-		 * conditions at hand (delay bandwidth product).  Setting the
-		 * buffer size too small limits throughput on links with high
-		 * bandwidth and high delay (eg. trans-continental/oceanic links).
-		 *
-		 * On the receive side the socket buffer memory is only rarely
-		 * used to any significant extent.  This allows us to be much
-		 * more aggressive in scaling the receive socket buffer.  For
-		 * the case that the buffer space is actually used to a large
-		 * extent and we run out of kernel memory we can simply drop
-		 * the new segments; TCP on the sender will just retransmit it
-		 * later.  Setting the buffer size too big may only consume too
-		 * much kernel memory if the application doesn't read() from
-		 * the socket or packet loss or reordering makes use of the
-		 * reassembly queue.
-		 *
-		 * The criteria to step up the receive buffer one notch are:
-		 *  1. Application has not set receive buffer size with
-		 *     SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
-		 *  2. the number of bytes received during the time it takes
-		 *     one timestamp to be reflected back to us (the RTT);
-		 *  3. received bytes per RTT is within seven eighth of the
-		 *     current socket buffer size;
-		 *  4. receive buffer size has not hit maximal automatic size;
-		 *
-		 * This algorithm does one step per RTT at most and only if
-		 * we receive a bulk stream w/o packet losses or reorderings.
-		 * Shrinking the buffer during idle times is not necessary as
-		 * it doesn't consume any memory when idle.
-		 *
-		 * TODO: Only step up if the application is actually serving
-		 * the buffer to better manage the socket buffer resources.
-		 */
-			if (V_tcp_do_autorcvbuf &&
-			    (to.to_flags & TOF_TS) &&
-			    to.to_tsecr &&
-			    (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
-				if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) &&
-				    to.to_tsecr - tp->rfbuf_ts < hz) {
-					if (tp->rfbuf_cnt >
-					    (so->so_rcv.sb_hiwat / 8 * 7) &&
-					    so->so_rcv.sb_hiwat <
-					    V_tcp_autorcvbuf_max) {
-						newsize =
-						    min(so->so_rcv.sb_hiwat +
-						    V_tcp_autorcvbuf_inc,
-						    V_tcp_autorcvbuf_max);
-					}
-					/* Start over with next RTT. */
-					tp->rfbuf_ts = 0;
-					tp->rfbuf_cnt = 0;
-				} else
-					tp->rfbuf_cnt += tlen;	/* add up */
-			}
+			newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
 
 			/* Add data to socket buffer. */
 			SOCKBUF_LOCK(&so->so_rcv);
@@ -1945,10 +1952,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 		win = 0;
 	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 
-	/* Reset receive buffer auto scaling when not in bulk receive mode. */
-	tp->rfbuf_ts = 0;
-	tp->rfbuf_cnt = 0;
-
 	switch (tp->t_state) {
 
 	/*
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index f5c8c835e80..d997b271ea3 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -831,11 +831,13 @@ send:
 			to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
 			to.to_tsecr = tp->ts_recent;
 			to.to_flags |= TOF_TS;
-			/* Set receive buffer autosizing timestamp. */
-			if (tp->rfbuf_ts == 0 &&
-			    (so->so_rcv.sb_flags & SB_AUTOSIZE))
-				tp->rfbuf_ts = tcp_ts_getticks();
 		}
+
+		/* Set receive buffer autosizing timestamp. */
+		if (tp->rfbuf_ts == 0 &&
+		    (so->so_rcv.sb_flags & SB_AUTOSIZE))
+			tp->rfbuf_ts = tcp_ts_getticks();
+
 		/* Selective ACK's. */
 		if (tp->t_flags & TF_SACK_PERMIT) {
 			if (flags & TH_SYN)
diff --git a/sys/netinet/tcp_stacks/fastpath.c b/sys/netinet/tcp_stacks/fastpath.c
index 7a3961f8da1..aadd9b17238 100644
--- a/sys/netinet/tcp_stacks/fastpath.c
+++ b/sys/netinet/tcp_stacks/fastpath.c
@@ -399,62 +399,8 @@ tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
 			  (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
 	TCP_PROBE3(debug__input, tp, th, m);
-	/*
-	 * Automatic sizing of receive socket buffer.  Often the send
-	 * buffer size is not optimally adjusted to the actual network
-	 * conditions at hand (delay bandwidth product).  Setting the
-	 * buffer size too small limits throughput on links with high
-	 * bandwidth and high delay (eg. trans-continental/oceanic links).
-	 *
-	 * On the receive side the socket buffer memory is only rarely
-	 * used to any significant extent.  This allows us to be much
-	 * more aggressive in scaling the receive socket buffer.  For
-	 * the case that the buffer space is actually used to a large
-	 * extent and we run out of kernel memory we can simply drop
-	 * the new segments; TCP on the sender will just retransmit it
-	 * later.  Setting the buffer size too big may only consume too
-	 * much kernel memory if the application doesn't read() from
-	 * the socket or packet loss or reordering makes use of the
-	 * reassembly queue.
-	 *
-	 * The criteria to step up the receive buffer one notch are:
-	 *  1. Application has not set receive buffer size with
-	 *     SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
-	 *  2. the number of bytes received during the time it takes
-	 *     one timestamp to be reflected back to us (the RTT);
-	 *  3. received bytes per RTT is within seven eighth of the
-	 *     current socket buffer size;
-	 *  4. receive buffer size has not hit maximal automatic size;
-	 *
-	 * This algorithm does one step per RTT at most and only if
-	 * we receive a bulk stream w/o packet losses or reorderings.
-	 * Shrinking the buffer during idle times is not necessary as
-	 * it doesn't consume any memory when idle.
-	 *
-	 * TODO: Only step up if the application is actually serving
-	 * the buffer to better manage the socket buffer resources.
-	 */
-	if (V_tcp_do_autorcvbuf &&
-	    (to->to_flags & TOF_TS) &&
-	    to->to_tsecr &&
-	    (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
-		if (TSTMP_GT(to->to_tsecr, tp->rfbuf_ts) &&
-		    to->to_tsecr - tp->rfbuf_ts < hz) {
-			if (tp->rfbuf_cnt >
-			    (so->so_rcv.sb_hiwat / 8 * 7) &&
-			    so->so_rcv.sb_hiwat <
-			    V_tcp_autorcvbuf_max) {
-				newsize =
-					min(so->so_rcv.sb_hiwat +
-					    V_tcp_autorcvbuf_inc,
-					    V_tcp_autorcvbuf_max);
-			}
-			/* Start over with next RTT. */
-			tp->rfbuf_ts = 0;
-			tp->rfbuf_cnt = 0;
-		} else
-			tp->rfbuf_cnt += tlen;	/* add up */
-	}
+
+	newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
 
 	/* Add data to socket buffer. */
 	SOCKBUF_LOCK(&so->so_rcv);
@@ -532,10 +478,6 @@ tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so,
 		win = 0;
 	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 
-	/* Reset receive buffer auto scaling when not in bulk receive mode. */
-	tp->rfbuf_ts = 0;
-	tp->rfbuf_cnt = 0;
-
 	switch (tp->t_state) {
 
 	/*
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 5705e553d1d..d298c9dd6c6 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -778,6 +778,8 @@ void	hhook_run_tcp_est_in(struct tcpcb *tp,
 #endif
 
 int	 tcp_input(struct mbuf **, int *, int);
+int	 tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *,
+	    struct tcpcb *, int);
 void	 tcp_do_segment(struct mbuf *, struct tcphdr *,
 			struct socket *, struct tcpcb *, int, int, uint8_t,
 			int);