From fffedd81a4bf86b1f77fc4ba0d170e7ef73d552c Mon Sep 17 00:00:00 2001
From: Kristof Provost <kp@FreeBSD.org>
Date: Mon, 3 Feb 2025 22:39:35 +0100
Subject: [PATCH] pf: send ICMP destination unreachable fragmentation needed
 when appropriate

Just like we do for IPv6, generate an ICMP fragmentation needed packet if an
IPv4 packet would need fragmentation but has DF set. Do so before full
processing, so we generate it with pre-NAT addresses, just as we do for IPv6.

Sponsored by:	Rubicon Communications, LLC ("Netgate")
Differential Revision:	https://reviews.freebsd.org/D48805
---
 sys/net/pfvar.h              |  1 +
 sys/netpfil/pf/pf.c          | 41 +++++++++++++++++++-----------
 tests/sys/netpfil/pf/icmp.py | 48 ++++++++++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+), 15 deletions(-)

diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h
index e50fbc96a8b..6f10a55b64a 100644
--- a/sys/net/pfvar.h
+++ b/sys/net/pfvar.h
@@ -1625,6 +1625,7 @@ struct pf_pdesc {
 	struct pf_rule_actions	act;
 
 	u_int32_t	 off;		/* protocol header offset */
+	bool		 df;		/* IPv4 Don't fragment flag. */
 	u_int32_t	 hdrlen;	/* protocol header length */
 	u_int32_t	 p_len;		/* total length of protocol payload */
 	u_int32_t	 extoff;	/* extentsion header offset */
diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c
index 0ebc813756f..d78978a7531 100644
--- a/sys/netpfil/pf/pf.c
+++ b/sys/netpfil/pf/pf.c
@@ -9990,6 +9990,7 @@ pf_setup_pdesc(sa_family_t af, int dir, struct pf_pdesc *pd, struct mbuf **m0,
 		pd->ttl = h->ip_ttl;
 		pd->tot_len = ntohs(h->ip_len);
 		pd->act.rtableid = -1;
+		pd->df = h->ip_off & htons(IP_DF);
 
 		if (h->ip_hl > 5)	/* has options */
 			pd->badopts++;
@@ -10317,21 +10318,6 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0
 		return (PF_PASS);
 	}
 
-#ifdef INET6
-	/*
-	 * If we end up changing IP addresses (e.g. binat) the stack may get
-	 * confused and fail to send the icmp6 packet too big error. Just send
-	 * it here, before we do any NAT.
-	 */
-	if (af == AF_INET6 && dir == PF_OUT && pflags & PFIL_FWD &&
-	    IN6_LINKMTU(ifp) < pf_max_frag_size(*m0)) {
-		PF_RULES_RUNLOCK();
-		icmp6_error(*m0, ICMP6_PACKET_TOO_BIG, 0, IN6_LINKMTU(ifp));
-		*m0 = NULL;
-		return (PF_DROP);
-	}
-#endif
-
 	if (__predict_false(! M_WRITABLE(*m0))) {
 		*m0 = m_unshare(*m0, M_NOWAIT);
 		if (*m0 == NULL) {
@@ -10380,6 +10366,31 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0
 		goto done;
 	}
 
+#ifdef INET
+	if (af == AF_INET && dir == PF_OUT && pflags & PFIL_FWD &&
+	    pd.df && (*m0)->m_pkthdr.len > ifp->if_mtu) {
+		PF_RULES_RUNLOCK();
+		icmp_error(*m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
+			0, ifp->if_mtu);
+		*m0 = NULL;
+		return (PF_DROP);
+	}
+#endif
+#ifdef INET6
+	/*
+	 * If we end up changing IP addresses (e.g. binat) the stack may get
+	 * confused and fail to send the icmp6 packet too big error. Just send
+	 * it here, before we do any NAT.
+	 */
+	if (af == AF_INET6 && dir == PF_OUT && pflags & PFIL_FWD &&
+	    IN6_LINKMTU(ifp) < pf_max_frag_size(*m0)) {
+		PF_RULES_RUNLOCK();
+		icmp6_error(*m0, ICMP6_PACKET_TOO_BIG, 0, IN6_LINKMTU(ifp));
+		*m0 = NULL;
+		return (PF_DROP);
+	}
+#endif
+
 	if (__predict_false(ip_divert_ptr != NULL) &&
 	    ((mtag = m_tag_locate(pd.m, MTAG_PF_DIVERT, 0, NULL)) != NULL)) {
 		struct pf_divert_mtag *dt = (struct pf_divert_mtag *)(mtag+1);
diff --git a/tests/sys/netpfil/pf/icmp.py b/tests/sys/netpfil/pf/icmp.py
index cb9de2bf0f3..e54f9f20a05 100644
--- a/tests/sys/netpfil/pf/icmp.py
+++ b/tests/sys/netpfil/pf/icmp.py
@@ -48,6 +48,7 @@ class TestICMP(VnetTestTemplate):
 
     def vnet2_handler(self, vnet):
         ifname = vnet.iface_alias_map["if1"].name
+        if2name = vnet.iface_alias_map["if2"].name
 
         ToolsHelper.print_output("/sbin/pfctl -e")
         ToolsHelper.pf_rules([
@@ -59,6 +60,8 @@ class TestICMP(VnetTestTemplate):
         ToolsHelper.print_output("/sbin/sysctl net.inet.ip.forwarding=1")
         ToolsHelper.print_output("/sbin/pfctl -x loud")
 
+        ToolsHelper.print_output("/sbin/ifconfig %s mtu 1492" % if2name)
+
     def vnet3_handler(self, vnet):
         # Import in the correct vnet, so at to not confuse Scapy
         import scapy.all as sp
@@ -66,6 +69,7 @@ class TestICMP(VnetTestTemplate):
         ifname = vnet.iface_alias_map["if2"].name
         ToolsHelper.print_output("/sbin/route add default 198.51.100.1")
         ToolsHelper.print_output("/sbin/ifconfig %s inet alias 198.51.100.3/24" % ifname)
+        ToolsHelper.print_output("/sbin/ifconfig %s mtu 1492" % ifname)
 
         def checkfn(packet):
             icmp = packet.getlayer(sp.ICMP)
@@ -124,3 +128,47 @@ class TestICMP(VnetTestTemplate):
             # We expect the timeout here. It means we didn't get the destination
             # unreachable packet in vnet3
             pass
+
+    def check_icmp_echo(self, sp, payload_size):
+        """Send a DF echo request to 198.51.100.2 and check what comes back."""
+        packet = sp.IP(dst="198.51.100.2", flags="DF") \
+            / sp.ICMP(type='echo-request') \
+            / sp.raw(bytes.fromhex('f0') * payload_size)
+
+        p = sp.sr1(packet, iface=self.vnet.iface_alias_map["if1"].name,
+            timeout=3)
+        assert p is not None, "no reply for payload size %d" % payload_size
+        p.show()
+
+        ip = p.getlayer(sp.IP)
+        icmp = p.getlayer(sp.ICMP)
+        assert ip
+        assert icmp
+
+        if payload_size > 1464:
+            # Expect ICMP destination unreachable, fragmentation needed
+            assert ip.src == "192.0.2.1"
+            assert ip.dst == "192.0.2.2"
+            assert icmp.type == 3 # dest-unreach
+            assert icmp.code == 4
+            assert icmp.nexthopmtu == 1492
+        else:
+            # Expect echo reply
+            assert ip.src == "198.51.100.2"
+            assert ip.dst == "192.0.2.2"
+            assert icmp.type == 0 # "echo-reply"
+            assert icmp.code == 0
+
+    @pytest.mark.require_user("root")
+    def test_fragmentation_needed(self):
+        # Set the default route, then run two ping(8) probes before Scapy
+        for cmd in ("/sbin/route add default 192.0.2.1",
+                    "/sbin/ping -c 1 198.51.100.2",
+                    "/sbin/ping -c 1 -D -s 1472 198.51.100.2"):
+            ToolsHelper.print_output(cmd)
+
+        # Import in the correct vnet, so as to not confuse Scapy
+        import scapy.all as sp
+
+        for payload_size in (128, 1464, 1468):
+            self.check_icmp_echo(sp, payload_size)