diff --git a/bin/named/server.c b/bin/named/server.c index 6e919d872a..796b4cc97e 100644 --- a/bin/named/server.c +++ b/bin/named/server.c @@ -8419,7 +8419,7 @@ load_configuration(const char *filename, named_server_t *server, dns_view_t *view_next = NULL; dns_viewlist_t tmpviewlist; dns_viewlist_t viewlist, builtin_viewlist; - in_port_t listen_port, udpport_low, udpport_high; + in_port_t listen_port, port_low, port_high; int i, backlog; isc_interval_t interval; isc_logconfig_t *logc = NULL; @@ -8849,28 +8849,18 @@ load_configuration(const char *filename, named_server_t *server, if (usev4ports != NULL) { portset_fromconf(v4portset, usev4ports, true); } else { - result = isc_net_getudpportrange(AF_INET, &udpport_low, - &udpport_high); - if (result != ISC_R_SUCCESS) { - isc_log_write(named_g_lctx, NAMED_LOGCATEGORY_GENERAL, - NAMED_LOGMODULE_SERVER, ISC_LOG_ERROR, - "get the default UDP/IPv4 port range: %s", - isc_result_totext(result)); - goto cleanup_v6portset; - } - - if (udpport_low == udpport_high) { - isc_portset_add(v4portset, udpport_low); + isc_net_getportrange(AF_INET, &port_low, &port_high); + if (port_low == port_high) { + isc_portset_add(v4portset, port_low); } else { - isc_portset_addrange(v4portset, udpport_low, - udpport_high); + isc_portset_addrange(v4portset, port_low, port_high); } if (!ns_server_getoption(server->sctx, NS_SERVER_DISABLE4)) { isc_log_write(named_g_lctx, NAMED_LOGCATEGORY_GENERAL, NAMED_LOGMODULE_SERVER, ISC_LOG_INFO, "using default UDP/IPv4 port range: " "[%d, %d]", - udpport_low, udpport_high); + port_low, port_high); } } (void)named_config_get(maps, "avoid-v4-udp-ports", &avoidv4ports); @@ -8882,27 +8872,18 @@ load_configuration(const char *filename, named_server_t *server, if (usev6ports != NULL) { portset_fromconf(v6portset, usev6ports, true); } else { - result = isc_net_getudpportrange(AF_INET6, &udpport_low, - &udpport_high); - if (result != ISC_R_SUCCESS) { - isc_log_write(named_g_lctx, NAMED_LOGCATEGORY_GENERAL, - 
NAMED_LOGMODULE_SERVER, ISC_LOG_ERROR, - "get the default UDP/IPv6 port range: %s", - isc_result_totext(result)); - goto cleanup_v6portset; - } - if (udpport_low == udpport_high) { - isc_portset_add(v6portset, udpport_low); + isc_net_getportrange(AF_INET6, &port_low, &port_high); + if (port_low == port_high) { + isc_portset_add(v6portset, port_low); } else { - isc_portset_addrange(v6portset, udpport_low, - udpport_high); + isc_portset_addrange(v6portset, port_low, port_high); } if (!ns_server_getoption(server->sctx, NS_SERVER_DISABLE6)) { isc_log_write(named_g_lctx, NAMED_LOGCATEGORY_GENERAL, NAMED_LOGMODULE_SERVER, ISC_LOG_INFO, "using default UDP/IPv6 port range: " "[%d, %d]", - udpport_low, udpport_high); + port_low, port_high); } } (void)named_config_get(maps, "avoid-v6-udp-ports", &avoidv6ports); diff --git a/bin/nsupdate/nsupdate.c b/bin/nsupdate/nsupdate.c index 76013feaa6..a33693d512 100644 --- a/bin/nsupdate/nsupdate.c +++ b/bin/nsupdate/nsupdate.c @@ -771,14 +771,12 @@ set_source_ports(dns_dispatchmgr_t *manager) { result = isc_portset_create(gmctx, &v4portset); check_result(result, "isc_portset_create (v4)"); - result = isc_net_getudpportrange(AF_INET, &udpport_low, &udpport_high); - check_result(result, "isc_net_getudpportrange (v4)"); + isc_net_getportrange(AF_INET, &udpport_low, &udpport_high); isc_portset_addrange(v4portset, udpport_low, udpport_high); result = isc_portset_create(gmctx, &v6portset); check_result(result, "isc_portset_create (v6)"); - result = isc_net_getudpportrange(AF_INET6, &udpport_low, &udpport_high); - check_result(result, "isc_net_getudpportrange (v6)"); + isc_net_getportrange(AF_INET6, &udpport_low, &udpport_high); isc_portset_addrange(v6portset, udpport_low, udpport_high); result = dns_dispatchmgr_setavailports(manager, v4portset, v6portset); diff --git a/bin/tools/mdig.c b/bin/tools/mdig.c index 94291d218e..3da295db3f 100644 --- a/bin/tools/mdig.c +++ b/bin/tools/mdig.c @@ -2046,10 +2046,7 @@ 
set_source_ports(dns_dispatchmgr_t *manager) { fatal("isc_portset_create (v4) failed"); } - result = isc_net_getudpportrange(AF_INET, &udpport_low, &udpport_high); - if (result != ISC_R_SUCCESS) { - fatal("isc_net_getudpportrange (v4) failed"); - } + isc_net_getportrange(AF_INET, &udpport_low, &udpport_high); isc_portset_addrange(v4portset, udpport_low, udpport_high); @@ -2057,10 +2054,7 @@ set_source_ports(dns_dispatchmgr_t *manager) { if (result != ISC_R_SUCCESS) { fatal("isc_portset_create (v6) failed"); } - result = isc_net_getudpportrange(AF_INET6, &udpport_low, &udpport_high); - if (result != ISC_R_SUCCESS) { - fatal("isc_net_getudpportrange (v6) failed"); - } + isc_net_getportrange(AF_INET6, &udpport_low, &udpport_high); isc_portset_addrange(v6portset, udpport_low, udpport_high); diff --git a/lib/dns/client.c b/lib/dns/client.c index 25a0106661..5f9e5d0557 100644 --- a/lib/dns/client.c +++ b/lib/dns/client.c @@ -146,20 +146,14 @@ setsourceports(isc_mem_t *mctx, dns_dispatchmgr_t *manager) { if (result != ISC_R_SUCCESS) { goto cleanup; } - result = isc_net_getudpportrange(AF_INET, &udpport_low, &udpport_high); - if (result != ISC_R_SUCCESS) { - goto cleanup; - } + isc_net_getportrange(AF_INET, &udpport_low, &udpport_high); isc_portset_addrange(v4portset, udpport_low, udpport_high); result = isc_portset_create(mctx, &v6portset); if (result != ISC_R_SUCCESS) { goto cleanup; } - result = isc_net_getudpportrange(AF_INET6, &udpport_low, &udpport_high); - if (result != ISC_R_SUCCESS) { - goto cleanup; - } + isc_net_getportrange(AF_INET6, &udpport_low, &udpport_high); isc_portset_addrange(v6portset, udpport_low, udpport_high); result = dns_dispatchmgr_setavailports(manager, v4portset, v6portset); diff --git a/lib/dns/dispatch.c b/lib/dns/dispatch.c index a31ad8833e..6113423659 100644 --- a/lib/dns/dispatch.c +++ b/lib/dns/dispatch.c @@ -917,7 +917,7 @@ static void create_default_portset(isc_mem_t *mctx, int family, isc_portset_t **portsetp) { in_port_t low, high; - 
isc_net_getudpportrange(family, &low, &high); + isc_net_getportrange(family, &low, &high); isc_portset_create(mctx, portsetp); isc_portset_addrange(*portsetp, low, high); diff --git a/lib/isc/include/isc/net.h b/lib/isc/include/isc/net.h index eda799c60d..4cd4269641 100644 --- a/lib/isc/include/isc/net.h +++ b/lib/isc/include/isc/net.h @@ -248,8 +248,8 @@ isc_net_enableipv4(void); void isc_net_enableipv6(void); -isc_result_t -isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high); +void +isc_net_getportrange(int af, in_port_t *low, in_port_t *high); /*%< * Returns system's default range of ephemeral UDP ports, if defined. * If the range is not available or unknown, ISC_NET_PORTRANGELOW and diff --git a/lib/isc/include/isc/netmgr.h b/lib/isc/include/isc/netmgr.h index f6d656fff8..1f574ea403 100644 --- a/lib/isc/include/isc/netmgr.h +++ b/lib/isc/include/isc/netmgr.h @@ -897,3 +897,10 @@ isc_nmsocket_getaddr(isc_nmsocket_t *sock); /*%< * Return the local address of 'sock'. */ + +void +isc_netmgr_portrange(isc_nm_t *netmgr, sa_family_t af, in_port_t low, + in_port_t high); +/*%< + * Set the ephemeral port range for 'af' family. + */ diff --git a/lib/isc/include/isc/os.h b/lib/isc/include/isc/os.h index 32770b992f..60b32d1c06 100644 --- a/lib/isc/include/isc/os.h +++ b/lib/isc/include/isc/os.h @@ -49,4 +49,11 @@ isc_os_umask(void); * Return umask of the current process as initialized at the program start */ +void +isc_os_kernel(char **name, int *major, int *minor, int *patch); +/*%< + * Store the running kernel's name and version in the non-NULL arguments. + * Version components that are not available are set to -1.
+ */ + ISC_LANG_ENDDECLS diff --git a/lib/isc/net.c b/lib/isc/net.c index 988c242147..42d86548d6 100644 --- a/lib/isc/net.c +++ b/lib/isc/net.c @@ -333,7 +333,7 @@ isc_net_probe_ipv6pktinfo(void) { #if defined(USE_SYSCTL_PORTRANGE) #if defined(HAVE_SYSCTLBYNAME) static isc_result_t -getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) { +getportrange_sysctl(int af, in_port_t *low, in_port_t *high) { int port_low, port_high; size_t portlen; const char *sysctlname_lowport, *sysctlname_hiport; @@ -366,7 +366,7 @@ getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) { } #else /* !HAVE_SYSCTLBYNAME */ static isc_result_t -getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) { +getportrange_sysctl(int af, in_port_t *low, in_port_t *high) { int mib_lo4[4] = SYSCTL_V4PORTRANGE_LOW; int mib_hi4[4] = SYSCTL_V4PORTRANGE_HIGH; int mib_lo6[4] = SYSCTL_V6PORTRANGE_LOW; @@ -407,18 +407,18 @@ getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) { #endif /* HAVE_SYSCTLBYNAME */ #endif /* USE_SYSCTL_PORTRANGE */ -isc_result_t -isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high) { +void +isc_net_getportrange(int af, in_port_t *low, in_port_t *high) { int result = ISC_R_FAILURE; -#if !defined(USE_SYSCTL_PORTRANGE) && defined(__linux) +#if !defined(USE_SYSCTL_PORTRANGE) && defined(__linux__) FILE *fp; -#endif /* if !defined(USE_SYSCTL_PORTRANGE) && defined(__linux) */ +#endif /* if !defined(USE_SYSCTL_PORTRANGE) && defined(__linux__) */ REQUIRE(low != NULL && high != NULL); #if defined(USE_SYSCTL_PORTRANGE) - result = getudpportrange_sysctl(af, low, high); -#elif defined(__linux) + result = getportrange_sysctl(af, low, high); +#elif defined(__linux__) UNUSED(af); @@ -446,8 +446,6 @@ isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high) { *low = ISC_NET_PORTRANGELOW; *high = ISC_NET_PORTRANGEHIGH; } - - return ISC_R_SUCCESS; /* we currently never fail in this function */ } void diff --git a/lib/isc/netmgr/netmgr-int.h 
b/lib/isc/netmgr/netmgr-int.h index e6c6e82830..8b36afa2e9 100644 --- a/lib/isc/netmgr/netmgr-int.h +++ b/lib/isc/netmgr/netmgr-int.h @@ -366,6 +366,11 @@ struct isc_nm { atomic_int_fast32_t send_udp_buffer_size; atomic_int_fast32_t recv_tcp_buffer_size; atomic_int_fast32_t send_tcp_buffer_size; + + _Atomic(in_port_t) port_low4; + _Atomic(in_port_t) port_high4; + _Atomic(in_port_t) port_low6; + _Atomic(in_port_t) port_high6; }; /*% @@ -1373,6 +1378,15 @@ isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family); * Use minimum MTU on IPv6 sockets */ +isc_result_t +isc__nm_socket_max_port_range(uv_os_sock_t fd ISC_ATTR_UNUSED, + sa_family_t sa_family ISC_ATTR_UNUSED, + in_port_t port_low, in_port_t port_high); +/*%< + * Set IP_BIND_ADDRESS_NO_PORT and IP_LOCAL_PORT_RANGE on the socket + * (Linux only). + */ + void isc__nm_set_network_buffers(isc_nm_t *nm, uv_handle_t *handle); /*%> diff --git a/lib/isc/netmgr/netmgr.c b/lib/isc/netmgr/netmgr.c index f8c3643e79..3f3be464bb 100644 --- a/lib/isc/netmgr/netmgr.c +++ b/lib/isc/netmgr/netmgr.c @@ -155,6 +155,7 @@ netmgr_teardown(void *arg) { void isc_netmgr_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, isc_nm_t **netmgrp) { isc_nm_t *netmgr = NULL; + in_port_t port_low, port_high; #ifdef MAXIMAL_UV_VERSION if (uv_version() > MAXIMAL_UV_VERSION) { @@ -186,6 +187,11 @@ isc_netmgr_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, isc_nm_t **netmgrp) { atomic_init(&netmgr->send_tcp_buffer_size, 0); atomic_init(&netmgr->recv_udp_buffer_size, 0); atomic_init(&netmgr->send_udp_buffer_size, 0); + atomic_init(&netmgr->port_low4, 0); + atomic_init(&netmgr->port_high4, 65535); + atomic_init(&netmgr->port_low6, 0); + atomic_init(&netmgr->port_high6, 65535); + #if HAVE_SO_REUSEPORT_LB netmgr->load_balance_sockets = true; #else @@ -237,6 +243,15 @@ isc_netmgr_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, isc_nm_t **netmgrp) { } *netmgrp = netmgr; + + /* + * Set the initial port range for IP_LOCAL_PORT_RANGE. 
+ */ + isc_net_getportrange(AF_INET, &port_low, &port_high); + isc_netmgr_portrange(netmgr, AF_INET, port_low, port_high); + + isc_net_getportrange(AF_INET6, &port_low, &port_high); + isc_netmgr_portrange(netmgr, AF_INET6, port_low, port_high); } /* @@ -2838,6 +2853,24 @@ isc_nm_proxyheader_info_init_complete(isc_nm_proxyheader_info_t *restrict info, .complete_header = *header_data }; } +void +isc_netmgr_portrange(isc_nm_t *netmgr, sa_family_t af, in_port_t low, + in_port_t high) { + REQUIRE(VALID_NM(netmgr)); + switch (af) { + case AF_INET: + atomic_store_relaxed(&netmgr->port_low4, low); + atomic_store_relaxed(&netmgr->port_high4, high); + break; + case AF_INET6: + atomic_store_relaxed(&netmgr->port_low6, low); + atomic_store_relaxed(&netmgr->port_high6, high); + break; + default: + UNREACHABLE(); + } +} + #if ISC_NETMGR_TRACE /* * Dump all active sockets in netmgr. We output to stderr diff --git a/lib/isc/netmgr/socket.c b/lib/isc/netmgr/socket.c index 4b71e9dbfd..6f546edc94 100644 --- a/lib/isc/netmgr/socket.c +++ b/lib/isc/netmgr/socket.c @@ -11,7 +11,10 @@ * information regarding copyright ownership. */ +#include + #include +#include #include #include "netmgr-int.h" @@ -369,3 +372,72 @@ isc__nm_socket_min_mtu(uv_os_sock_t fd, sa_family_t sa_family) { return ISC_R_SUCCESS; } + +/* + * See + * https://blog.cloudflare.com/linux-transport-protocol-port-selection-performance/#kernel + * for rationale.
+ */ +#define PORT_RANGE 1000 + +isc_result_t +isc__nm_socket_max_port_range(uv_os_sock_t fd ISC_ATTR_UNUSED, + sa_family_t sa_family ISC_ATTR_UNUSED, + in_port_t port_low ISC_ATTR_UNUSED, + in_port_t port_high ISC_ATTR_UNUSED) { +#ifdef IP_BIND_ADDRESS_NO_PORT + if (setsockopt_on(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT) == -1) { + return ISC_R_FAILURE; + } +#endif + +#if defined(IP_LOCAL_PORT_RANGE) && defined(__linux__) + /* + * The option takes a uint32_t value with the high 16 bits + * set to the upper range bound, and the low 16 bits set to + * the lower range bound. Range bounds are inclusive. The + * 16-bit values should be in host byte order. + */ + uint32_t port_range; + int major, minor; + isc_os_kernel(NULL, &major, &minor, NULL); + + /* + * Linux 6.8 implemented the following patch: + * + * If IP_LOCAL_PORT_RANGE is set on a socket before accept(), + * port selection no longer favors even ports. + * + * This means that connect() can find a suitable source port + * faster, and applications can use a different split between + * connect() and bind() users. + */ + if (major < 6 || (major == 6 && minor < 8)) { + /* + * On Linux < 6.8, use IP_LOCAL_PORT_RANGE to + * partition the ephemeral port range randomly to help + * with the port selection.
+ */ + if (port_high - port_low <= PORT_RANGE) { + return ISC_R_RANGE; + } + + /* + * port_low <= N < port_high - PORT_RANGE + */ + port_high -= PORT_RANGE; + port_low += isc_random_uniform(port_high - port_low); + port_high = port_low + PORT_RANGE; + } + INSIST(port_low > 0); + INSIST(port_low < port_high); + + port_range = (uint32_t)port_low | ((uint32_t)port_high << 16); + if (setsockopt(fd, IPPROTO_IP, IP_LOCAL_PORT_RANGE, &port_range, + sizeof(port_range)) == -1) + { + return ISC_R_FAILURE; + } +#endif + return ISC_R_SUCCESS; +} diff --git a/lib/isc/netmgr/tcp.c b/lib/isc/netmgr/tcp.c index 0ddbef56e2..7d70d54623 100644 --- a/lib/isc/netmgr/tcp.c +++ b/lib/isc/netmgr/tcp.c @@ -12,6 +12,7 @@ */ #include +#include #include #include @@ -225,6 +226,7 @@ isc_nm_tcpconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer, sa_family_t sa_family; isc__networker_t *worker = NULL; uv_os_sock_t fd = -1; + in_port_t port_low, port_high; REQUIRE(VALID_NM(mgr)); REQUIRE(local != NULL); @@ -261,6 +263,18 @@ isc_nm_tcpconnect(isc_nm_t *mgr, isc_sockaddr_t *local, isc_sockaddr_t *peer, (void)isc__nm_socket_min_mtu(sock->fd, sa_family); (void)isc__nm_socket_tcp_maxseg(sock->fd, NM_MAXSEG); + port_low = (sa_family == AF_INET) ? mgr->port_low4 : mgr->port_low6; + port_high = (sa_family == AF_INET) ? mgr->port_high4 : mgr->port_high6; + result = isc__nm_socket_max_port_range(sock->fd, sa_family, port_low, + port_high); + if (result != ISC_R_SUCCESS) { + isc__nmsocket_log(sock, ISC_LOG_DEBUG(99), + "setting up IP_BIND_ADDRESS_NO_PORT or " + "IP_LOCAL_PORT_RANGE failed: %s\n", + result == ISC_R_RANGE + ? isc_result_totext(result) + : strerror(errno)); + } sock->active = true; diff --git a/lib/isc/os.c b/lib/isc/os.c index aa48c31791..0c5f2c8037 100644 --- a/lib/isc/os.c +++ b/lib/isc/os.c @@ -11,10 +11,13 @@ * information regarding copyright ownership. 
*/ +#include #include #include +#include #include +#include #include #include @@ -23,6 +26,8 @@ static unsigned int isc__os_ncpus = 0; static unsigned long isc__os_cacheline = ISC_OS_CACHELINE_SIZE; static mode_t isc__os_umask = 0; +static int kernel_major = -1, kernel_minor = -1, kernel_patch = -1; +static char kernel_name[64]; #ifdef HAVE_SYSCONF @@ -159,6 +164,19 @@ umask_initialize(void) { (void)umask(isc__os_umask); } +static void +kernel_initialize(void) { + struct utsname buffer; + + if (uname(&buffer) == -1) { + return; + } + + (void)sscanf(buffer.release, "%d.%d.%d", &kernel_major, &kernel_minor, + &kernel_patch); + (void)strlcpy(kernel_name, buffer.sysname, sizeof(kernel_name)); +} + unsigned int isc_os_ncpus(void) { return isc__os_ncpus; @@ -174,10 +192,19 @@ isc_os_umask(void) { return isc__os_umask; } +void +isc_os_kernel(char **name, int *major, int *minor, int *patch) { + SET_IF_NOT_NULL(name, kernel_name) + SET_IF_NOT_NULL(major, kernel_major); + SET_IF_NOT_NULL(minor, kernel_minor); + SET_IF_NOT_NULL(patch, kernel_patch); +} + void isc__os_initialize(void) { umask_initialize(); ncpus_initialize(); + kernel_initialize(); #if defined(HAVE_SYSCONF) && defined(_SC_LEVEL1_DCACHE_LINESIZE) long s = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); if (s > 0 && (unsigned long)s > isc__os_cacheline) {