[9.20] chg: usr: Optimize the TCP source port selection on Linux

Enable a socket option on the outgoing TCP sockets to allow faster selection of the source <address,port> tuple for different destination <address,port> tuples when source port utilization approaches or exceeds roughly 70-80%.

Backport of MR !11569

Merge branch 'backport-improve-selection-of-outgoing-TCP-port-9.20' into 'bind-9.20'

See merge request isc-projects/bind9!11573
This commit is contained in:
Ondřej Surý 2026-02-20 17:56:37 +01:00
commit d4426f85b3
14 changed files with 202 additions and 63 deletions

View file

@ -8419,7 +8419,7 @@ load_configuration(const char *filename, named_server_t *server,
dns_view_t *view_next = NULL;
dns_viewlist_t tmpviewlist;
dns_viewlist_t viewlist, builtin_viewlist;
in_port_t listen_port, udpport_low, udpport_high;
in_port_t listen_port, port_low, port_high;
int i, backlog;
isc_interval_t interval;
isc_logconfig_t *logc = NULL;
@ -8849,28 +8849,18 @@ load_configuration(const char *filename, named_server_t *server,
if (usev4ports != NULL) {
portset_fromconf(v4portset, usev4ports, true);
} else {
result = isc_net_getudpportrange(AF_INET, &udpport_low,
&udpport_high);
if (result != ISC_R_SUCCESS) {
isc_log_write(named_g_lctx, NAMED_LOGCATEGORY_GENERAL,
NAMED_LOGMODULE_SERVER, ISC_LOG_ERROR,
"get the default UDP/IPv4 port range: %s",
isc_result_totext(result));
goto cleanup_v6portset;
}
if (udpport_low == udpport_high) {
isc_portset_add(v4portset, udpport_low);
isc_net_getportrange(AF_INET, &port_low, &port_high);
if (port_low == port_high) {
isc_portset_add(v4portset, port_low);
} else {
isc_portset_addrange(v4portset, udpport_low,
udpport_high);
isc_portset_addrange(v4portset, port_low, port_high);
}
if (!ns_server_getoption(server->sctx, NS_SERVER_DISABLE4)) {
isc_log_write(named_g_lctx, NAMED_LOGCATEGORY_GENERAL,
NAMED_LOGMODULE_SERVER, ISC_LOG_INFO,
"using default UDP/IPv4 port range: "
"[%d, %d]",
udpport_low, udpport_high);
port_low, port_high);
}
}
(void)named_config_get(maps, "avoid-v4-udp-ports", &avoidv4ports);
@ -8882,27 +8872,18 @@ load_configuration(const char *filename, named_server_t *server,
if (usev6ports != NULL) {
portset_fromconf(v6portset, usev6ports, true);
} else {
result = isc_net_getudpportrange(AF_INET6, &udpport_low,
&udpport_high);
if (result != ISC_R_SUCCESS) {
isc_log_write(named_g_lctx, NAMED_LOGCATEGORY_GENERAL,
NAMED_LOGMODULE_SERVER, ISC_LOG_ERROR,
"get the default UDP/IPv6 port range: %s",
isc_result_totext(result));
goto cleanup_v6portset;
}
if (udpport_low == udpport_high) {
isc_portset_add(v6portset, udpport_low);
isc_net_getportrange(AF_INET6, &port_low, &port_high);
if (port_low == port_high) {
isc_portset_add(v6portset, port_low);
} else {
isc_portset_addrange(v6portset, udpport_low,
udpport_high);
isc_portset_addrange(v6portset, port_low, port_high);
}
if (!ns_server_getoption(server->sctx, NS_SERVER_DISABLE6)) {
isc_log_write(named_g_lctx, NAMED_LOGCATEGORY_GENERAL,
NAMED_LOGMODULE_SERVER, ISC_LOG_INFO,
"using default UDP/IPv6 port range: "
"[%d, %d]",
udpport_low, udpport_high);
port_low, port_high);
}
}
(void)named_config_get(maps, "avoid-v6-udp-ports", &avoidv6ports);

View file

@ -771,14 +771,12 @@ set_source_ports(dns_dispatchmgr_t *manager) {
result = isc_portset_create(gmctx, &v4portset);
check_result(result, "isc_portset_create (v4)");
result = isc_net_getudpportrange(AF_INET, &udpport_low, &udpport_high);
check_result(result, "isc_net_getudpportrange (v4)");
isc_net_getportrange(AF_INET, &udpport_low, &udpport_high);
isc_portset_addrange(v4portset, udpport_low, udpport_high);
result = isc_portset_create(gmctx, &v6portset);
check_result(result, "isc_portset_create (v6)");
result = isc_net_getudpportrange(AF_INET6, &udpport_low, &udpport_high);
check_result(result, "isc_net_getudpportrange (v6)");
isc_net_getportrange(AF_INET6, &udpport_low, &udpport_high);
isc_portset_addrange(v6portset, udpport_low, udpport_high);
result = dns_dispatchmgr_setavailports(manager, v4portset, v6portset);

View file

@ -2046,10 +2046,7 @@ set_source_ports(dns_dispatchmgr_t *manager) {
fatal("isc_portset_create (v4) failed");
}
result = isc_net_getudpportrange(AF_INET, &udpport_low, &udpport_high);
if (result != ISC_R_SUCCESS) {
fatal("isc_net_getudpportrange (v4) failed");
}
isc_net_getportrange(AF_INET, &udpport_low, &udpport_high);
isc_portset_addrange(v4portset, udpport_low, udpport_high);
@ -2057,10 +2054,7 @@ set_source_ports(dns_dispatchmgr_t *manager) {
if (result != ISC_R_SUCCESS) {
fatal("isc_portset_create (v6) failed");
}
result = isc_net_getudpportrange(AF_INET6, &udpport_low, &udpport_high);
if (result != ISC_R_SUCCESS) {
fatal("isc_net_getudpportrange (v6) failed");
}
isc_net_getportrange(AF_INET6, &udpport_low, &udpport_high);
isc_portset_addrange(v6portset, udpport_low, udpport_high);

View file

@ -146,20 +146,14 @@ setsourceports(isc_mem_t *mctx, dns_dispatchmgr_t *manager) {
if (result != ISC_R_SUCCESS) {
goto cleanup;
}
result = isc_net_getudpportrange(AF_INET, &udpport_low, &udpport_high);
if (result != ISC_R_SUCCESS) {
goto cleanup;
}
isc_net_getportrange(AF_INET, &udpport_low, &udpport_high);
isc_portset_addrange(v4portset, udpport_low, udpport_high);
result = isc_portset_create(mctx, &v6portset);
if (result != ISC_R_SUCCESS) {
goto cleanup;
}
result = isc_net_getudpportrange(AF_INET6, &udpport_low, &udpport_high);
if (result != ISC_R_SUCCESS) {
goto cleanup;
}
isc_net_getportrange(AF_INET6, &udpport_low, &udpport_high);
isc_portset_addrange(v6portset, udpport_low, udpport_high);
result = dns_dispatchmgr_setavailports(manager, v4portset, v6portset);

View file

@ -917,7 +917,7 @@ static void
create_default_portset(isc_mem_t *mctx, int family, isc_portset_t **portsetp) {
in_port_t low, high;
isc_net_getudpportrange(family, &low, &high);
isc_net_getportrange(family, &low, &high);
isc_portset_create(mctx, portsetp);
isc_portset_addrange(*portsetp, low, high);

View file

@ -248,8 +248,8 @@ isc_net_enableipv4(void);
void
isc_net_enableipv6(void);
isc_result_t
isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high);
void
isc_net_getportrange(int af, in_port_t *low, in_port_t *high);
/*%<
* Returns system's default range of ephemeral ports, if defined.
* If the range is not available or unknown, ISC_NET_PORTRANGELOW and

View file

@ -897,3 +897,10 @@ isc_nmsocket_getaddr(isc_nmsocket_t *sock);
/*%<
* Return the local address of 'sock'.
*/
void
isc_netmgr_portrange(isc_nm_t *netmgr, sa_family_t af, in_port_t low,
in_port_t high);
/*%<
* Set the ephemeral port range <low, high> for 'af' family.
*/

View file

@ -49,4 +49,11 @@ isc_os_umask(void);
* Return umask of the current process as initialized at the program start
*/
void
isc_os_kernel(char **name, int *major, int *minor, int *patch);
/*%<
* Fill the running kernel name and version into name, major, minor and patch.
* Arguments that are NULL are skipped. Version components that are not
* available are set to -1.
*/
ISC_LANG_ENDDECLS

View file

@ -333,7 +333,7 @@ isc_net_probe_ipv6pktinfo(void) {
#if defined(USE_SYSCTL_PORTRANGE)
#if defined(HAVE_SYSCTLBYNAME)
static isc_result_t
getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) {
getportrange_sysctl(int af, in_port_t *low, in_port_t *high) {
int port_low, port_high;
size_t portlen;
const char *sysctlname_lowport, *sysctlname_hiport;
@ -366,7 +366,7 @@ getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) {
}
#else /* !HAVE_SYSCTLBYNAME */
static isc_result_t
getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) {
getportrange_sysctl(int af, in_port_t *low, in_port_t *high) {
int mib_lo4[4] = SYSCTL_V4PORTRANGE_LOW;
int mib_hi4[4] = SYSCTL_V4PORTRANGE_HIGH;
int mib_lo6[4] = SYSCTL_V6PORTRANGE_LOW;
@ -407,18 +407,18 @@ getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) {
#endif /* HAVE_SYSCTLBYNAME */
#endif /* USE_SYSCTL_PORTRANGE */
isc_result_t
isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high) {
void
isc_net_getportrange(int af, in_port_t *low, in_port_t *high) {
int result = ISC_R_FAILURE;
#if !defined(USE_SYSCTL_PORTRANGE) && defined(__linux)
#if !defined(USE_SYSCTL_PORTRANGE) && defined(__linux__)
FILE *fp;
#endif /* if !defined(USE_SYSCTL_PORTRANGE) && defined(__linux) */
#endif /* if !defined(USE_SYSCTL_PORTRANGE) && defined(__linux__) */
REQUIRE(low != NULL && high != NULL);
#if defined(USE_SYSCTL_PORTRANGE)
result = getudpportrange_sysctl(af, low, high);
#elif defined(__linux)
result = getportrange_sysctl(af, low, high);
#elif defined(__linux__)
UNUSED(af);
@ -446,8 +446,6 @@ isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high) {
*low = ISC_NET_PORTRANGELOW;
*high = ISC_NET_PORTRANGEHIGH;
}
return ISC_R_SUCCESS; /* we currently never fail in this function */
}
void

View file

@ -366,6 +366,11 @@ struct isc_nm {
atomic_int_fast32_t send_udp_buffer_size;
atomic_int_fast32_t recv_tcp_buffer_size;
atomic_int_fast32_t send_tcp_buffer_size;
_Atomic(in_port_t) port_low4;
_Atomic(in_port_t) port_high4;
_Atomic(in_port_t) port_low6;
_Atomic(in_port_t) port_high6;
};
/*%
@ -1373,6 +1378,15 @@ isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family);
* Use minimum MTU on IPv6 sockets
*/
isc_result_t
isc__nm_socket_max_port_range(uv_os_sock_t fd ISC_ATTR_UNUSED,
sa_family_t sa_family ISC_ATTR_UNUSED,
in_port_t port_low, in_port_t port_high);
/*%<
* Set IP_BIND_ADDRESS_NO_PORT and IP_LOCAL_PORT_RANGE on the socket
* (Linux only).
*/
void
isc__nm_set_network_buffers(isc_nm_t *nm, uv_handle_t *handle);
/*%>

View file

@ -155,6 +155,7 @@ netmgr_teardown(void *arg) {
void
isc_netmgr_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, isc_nm_t **netmgrp) {
isc_nm_t *netmgr = NULL;
in_port_t port_low, port_high;
#ifdef MAXIMAL_UV_VERSION
if (uv_version() > MAXIMAL_UV_VERSION) {
@ -186,6 +187,11 @@ isc_netmgr_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, isc_nm_t **netmgrp) {
atomic_init(&netmgr->send_tcp_buffer_size, 0);
atomic_init(&netmgr->recv_udp_buffer_size, 0);
atomic_init(&netmgr->send_udp_buffer_size, 0);
atomic_init(&netmgr->port_low4, 0);
atomic_init(&netmgr->port_high4, 65535);
atomic_init(&netmgr->port_low6, 0);
atomic_init(&netmgr->port_high6, 65535);
#if HAVE_SO_REUSEPORT_LB
netmgr->load_balance_sockets = true;
#else
@ -237,6 +243,15 @@ isc_netmgr_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, isc_nm_t **netmgrp) {
}
*netmgrp = netmgr;
/*
* Set the initial port range for IP_LOCAL_PORT_RANGE.
*/
isc_net_getportrange(AF_INET, &port_low, &port_high);
isc_netmgr_portrange(netmgr, AF_INET, port_low, port_high);
isc_net_getportrange(AF_INET6, &port_low, &port_high);
isc_netmgr_portrange(netmgr, AF_INET6, port_low, port_high);
}
/*
@ -2838,6 +2853,24 @@ isc_nm_proxyheader_info_init_complete(isc_nm_proxyheader_info_t *restrict info,
.complete_header = *header_data };
}
/*
 * Record the ephemeral port range <low, high> that 'netmgr' should use
 * for sockets of address family 'af'.
 *
 * 'af' must be AF_INET or AF_INET6; any other value is a programming
 * error.  The two bounds are written as two independent relaxed atomic
 * stores.
 */
void
isc_netmgr_portrange(isc_nm_t *netmgr, sa_family_t af, in_port_t low,
		     in_port_t high) {
	REQUIRE(VALID_NM(netmgr));

	if (af == AF_INET) {
		atomic_store_relaxed(&netmgr->port_low4, low);
		atomic_store_relaxed(&netmgr->port_high4, high);
		return;
	}

	if (af == AF_INET6) {
		atomic_store_relaxed(&netmgr->port_low6, low);
		atomic_store_relaxed(&netmgr->port_high6, high);
		return;
	}

	UNREACHABLE();
}
#if ISC_NETMGR_TRACE
/*
* Dump all active sockets in netmgr. We output to stderr

View file

@ -11,7 +11,10 @@
* information regarding copyright ownership.
*/
#include <netinet/in.h>
#include <isc/errno.h>
#include <isc/result.h>
#include <isc/uv.h>
#include "netmgr-int.h"
@ -369,3 +372,72 @@ isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family) {
return ISC_R_SUCCESS;
}
/*
* See
* https://blog.cloudflare.com/linux-transport-protocol-port-selection-performance/#kernel
* for the rationale.
*/
#define PORT_RANGE 1000
/*
 * Prepare an outgoing socket 'fd' for faster source port selection.
 *
 * When IP_BIND_ADDRESS_NO_PORT is available at build time, it is enabled
 * on the socket.  When IP_LOCAL_PORT_RANGE is available on Linux, the
 * socket is additionally restricted to a source port range derived from
 * <port_low, port_high>:
 *
 * - on kernels older than 6.8 (a version reported as -1 also takes this
 *   path), a randomly placed window of PORT_RANGE ports inside the given
 *   range is used;
 * - otherwise the given range is used unchanged.
 *
 * 'sa_family' is currently not used.
 *
 * Returns:
 * - ISC_R_SUCCESS  every option available at build time was set (this
 *                  includes the case where none is available);
 * - ISC_R_RANGE    the port range is unusable: too narrow to be
 *                  partitioned, empty, inverted, or starting at port 0;
 * - ISC_R_FAILURE  setting one of the socket options failed.
 */
isc_result_t
isc__nm_socket_max_port_range(uv_os_sock_t fd ISC_ATTR_UNUSED,
			      sa_family_t sa_family ISC_ATTR_UNUSED,
			      in_port_t port_low ISC_ATTR_UNUSED,
			      in_port_t port_high ISC_ATTR_UNUSED) {
#ifdef IP_BIND_ADDRESS_NO_PORT
	if (setsockopt_on(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT) == -1) {
		return ISC_R_FAILURE;
	}
#endif

#if defined(IP_LOCAL_PORT_RANGE) && defined(__linux__)
	/*
	 * The option takes an uint32_t value with the high 16 bits
	 * set to the upper range bound, and the low 16 bits set to
	 * the lower range bound.  Range bounds are inclusive.  The
	 * 16-bit values should be in host byte order.
	 */
	uint32_t port_range;
	int major, minor;

	isc_os_kernel(NULL, &major, &minor, NULL);

	/*
	 * Linux 6.8 implemented the following change:
	 *
	 *   If IP_LOCAL_PORT_RANGE is set on a socket before accept(),
	 *   port selection no longer favors even ports.
	 *
	 * This means that connect() can find a suitable source port
	 * faster, and applications can use a different split between
	 * connect() and bind() users.
	 */
	if (major < 6 || (major == 6 && minor < 8)) {
		/*
		 * On Linux older than 6.8, use IP_LOCAL_PORT_RANGE to
		 * partition the ephemeral port range randomly to help
		 * with the port selection.
		 *
		 * The operands are promoted to int, so an inverted range
		 * yields a negative difference and is rejected here too.
		 */
		if (port_high - port_low <= PORT_RANGE) {
			return ISC_R_RANGE;
		}

		/*
		 * port_low <= N < port_high - PORT_RANGE
		 */
		port_high -= PORT_RANGE;
		port_low += isc_random_uniform(port_high - port_low);
		port_high = port_low + PORT_RANGE;
	}

	/*
	 * The bounds originate from system or user configuration and are
	 * read as two independent atomics by the caller, so a single-port,
	 * inverted, or zero-based range can legitimately reach this point.
	 * Report it to the caller rather than aborting the process.
	 */
	if (port_low == 0 || port_low >= port_high) {
		return ISC_R_RANGE;
	}

	port_range = (uint32_t)port_low | ((uint32_t)port_high << 16);

	if (setsockopt(fd, IPPROTO_IP, IP_LOCAL_PORT_RANGE, &port_range,
		       sizeof(port_range)) == -1)
	{
		return ISC_R_FAILURE;
	}
#endif

	return ISC_R_SUCCESS;
}

View file

@ -12,6 +12,7 @@
*/
#include <libgen.h>
#include <string.h>
#include <unistd.h>
#include <isc/async.h>
@ -225,6 +226,7 @@ isc_nm_tcpconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer,
sa_family_t sa_family;
isc__networker_t *worker = NULL;
uv_os_sock_t fd = -1;
in_port_t port_low, port_high;
REQUIRE(VALID_NM(mgr));
REQUIRE(local != NULL);
@ -261,6 +263,18 @@ isc_nm_tcpconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer,
(void)isc__nm_socket_min_mtu(sock->fd, sa_family);
(void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG);
port_low = (sa_family == AF_INET) ? mgr->port_low4 : mgr->port_low6;
port_high = (sa_family == AF_INET) ? mgr->port_high4 : mgr->port_high6;
result = isc__nm_socket_max_port_range(sock->fd, sa_family, port_low,
port_high);
if (result != ISC_R_SUCCESS) {
isc__nmsocket_log(sock, ISC_LOG_DEBUG(99),
"setting up IP_BIND_ADDRESS_NO_PORT or "
"IP_LOCAL_PORT_RANGE failed: %s\n",
result == ISC_R_RANGE
? isc_result_totext(result)
: strerror(errno));
}
sock->active = true;

View file

@ -11,10 +11,13 @@
* information regarding copyright ownership.
*/
#include <ctype.h>
#include <inttypes.h>
#include <sys/stat.h>
#include <sys/utsname.h>
#include <isc/os.h>
#include <isc/string.h>
#include <isc/types.h>
#include <isc/util.h>
@ -23,6 +26,8 @@
static unsigned int isc__os_ncpus = 0;
static unsigned long isc__os_cacheline = ISC_OS_CACHELINE_SIZE;
static mode_t isc__os_umask = 0;
static int kernel_major = -1, kernel_minor = -1, kernel_patch = -1;
static char kernel_name[64];
#ifdef HAVE_SYSCONF
@ -159,6 +164,19 @@ umask_initialize(void) {
(void)umask(isc__os_umask);
}
/*
 * Cache the name and version of the running kernel, as reported by
 * uname(), in the file-scope kernel_name and kernel_major/minor/patch
 * variables.
 *
 * If uname() fails nothing is modified.  Version components that cannot
 * be parsed from the release string keep their previous values.
 */
static void
kernel_initialize(void) {
	struct utsname uts;

	if (uname(&uts) != -1) {
		(void)strlcpy(kernel_name, uts.sysname, sizeof(kernel_name));
		(void)sscanf(uts.release, "%d.%d.%d", &kernel_major,
			     &kernel_minor, &kernel_patch);
	}
}
unsigned int
isc_os_ncpus(void) {
return isc__os_ncpus;
@ -174,10 +192,19 @@ isc_os_umask(void) {
return isc__os_umask;
}
void
isc_os_kernel(char **name, int *major, int *minor, int *patch) {
SET_IF_NOT_NULL(name, kernel_name)
SET_IF_NOT_NULL(major, kernel_major);
SET_IF_NOT_NULL(minor, kernel_minor);
SET_IF_NOT_NULL(patch, kernel_patch);
}
void
isc__os_initialize(void) {
umask_initialize();
ncpus_initialize();
kernel_initialize();
#if defined(HAVE_SYSCONF) && defined(_SC_LEVEL1_DCACHE_LINESIZE)
long s = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
if (s > 0 && (unsigned long)s > isc__os_cacheline) {