From: Felix Fietkau Only handles packets where the pppoe header length field matches the exact packet length. Significantly improves rx throughput. When running NAT traffic through a MediaTek MT7621 devices from a host behind PPPoE to a host directly connected via ethernet, the TCP throughput that the device is able to handle improves from ~130 Mbit/s to ~630 Mbit/s, using fraglist GRO. Signed-off-by: Felix Fietkau Signed-off-by: Qingfang Deng Tested-by: Pablo Neira Ayuso --- v10: use NETIF_F_GSO_SOFTWARE instead of NETIF_F_TSO|TSO6 v9: https://lore.kernel.org/netdev/20260509030507.387050-1-qingfang.deng@linux.dev - clear TSO features to force software segmentation - remove skb_gso_error_unwind(), which is intented for tunnel skbs v8: https://lore.kernel.org/netdev/20260501035102.293031-1-qingfang.deng@linux.dev - no changes v7: https://lore.kernel.org/netdev/20260428064717.74794-1-qingfang.deng@linux.dev - use PPPOE_SES_HLEN macro instead of +2 magic number v6: https://lore.kernel.org/netdev/20260326081127.61229-1-dqfext@gmail.com - avoid phdr->length field overflow - restore skb_is_gso() check - do not register GRO if INET=n - do not check for PPP_IPV6 if IPV6=n - tail call gro_complete drivers/net/ppp/pppoe.c | 161 +++++++++++++++++++++++++++++++++++++++- net/ipv4/af_inet.c | 2 + net/ipv6/ip6_offload.c | 2 + 3 files changed, 164 insertions(+), 1 deletion(-) diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index ad5bb98c1579..4a018acb5262 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -77,6 +77,7 @@ #include #include #include +#include #include @@ -409,7 +410,7 @@ static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev, if (ppp_skb_is_compressed_proto(skb)) goto drop; - if (pskb_trim_rcsum(skb, len)) + if (!skb_is_gso(skb) && pskb_trim_rcsum(skb, len)) goto drop; ph = pppoe_hdr(skb); @@ -1103,6 +1104,160 @@ static struct pernet_operations pppoe_net_ops = { .size = sizeof(struct pppoe_net), }; +static u16 +compare_pppoe_header(const struct pppoe_hdr *phdr, + const struct pppoe_hdr *phdr2) +{ + __be16 proto = *(const __be16 *)(phdr + 1); + __be16 proto2 = *(const __be16 *)(phdr2 + 1); + + return (__force u16)((phdr->sid ^ phdr2->sid) | (proto ^ proto2)); +} + +static __be16 pppoe_hdr_proto(const struct pppoe_hdr *phdr) +{ + __be16 proto = *(const __be16 *)(phdr + 1); + + switch (proto) { + case cpu_to_be16(PPP_IP): + return cpu_to_be16(ETH_P_IP); +#if IS_ENABLED(CONFIG_IPV6) + case cpu_to_be16(PPP_IPV6): + return cpu_to_be16(ETH_P_IPV6); +#endif + default: + return 0; + } +} + +static struct sk_buff *pppoe_gro_receive(struct list_head *head, + struct sk_buff *skb) +{ + const struct packet_offload *ptype; + unsigned int hlen, off_pppoe; + const struct pppoe_hdr *phdr; + struct sk_buff *pp = NULL; + struct sk_buff *p; + int flush = 1; + __be16 type; + + off_pppoe = skb_gro_offset(skb); + hlen = off_pppoe + PPPOE_SES_HLEN; + phdr = skb_gro_header(skb, hlen, off_pppoe); + if (unlikely(!phdr)) + goto out; + + /* filter for session packets (type:1, ver:1, code:0) */ + if (*(const __be16 *)phdr != cpu_to_be16(0x1100)) + goto out; + + /* ignore packets with padding or invalid length */ + if (skb_gro_len(skb) != be16_to_cpu(phdr->length) + sizeof(*phdr)) + goto out; + + type = pppoe_hdr_proto(phdr); + ptype = gro_find_receive_by_type(type); + if (!ptype) + goto out; + + flush = 0; + + list_for_each_entry(p, head, list) { + const struct pppoe_hdr *phdr2; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + phdr2 = (const struct pppoe_hdr *)(p->data + off_pppoe); + if (compare_pppoe_header(phdr, phdr2)) + NAPI_GRO_CB(p)->same_flow = 0; + } + + skb_gro_pull(skb, PPPOE_SES_HLEN); + skb_gro_postpull_rcsum(skb, phdr, PPPOE_SES_HLEN); + + pp = indirect_call_gro_receive_inet(ptype->callbacks.gro_receive, + ipv6_gro_receive, inet_gro_receive, + head, skb); + +out: + skb_gro_flush_final(skb, pp, flush); + + return pp; +} + +static int pppoe_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct pppoe_hdr *phdr = (struct pppoe_hdr *)(skb->data + nhoff); + __be16 type = pppoe_hdr_proto(phdr); + struct packet_offload *ptype; + unsigned int len; + + ptype = gro_find_complete_by_type(type); + if (!ptype) + return -ENOENT; + + len = skb->len - (nhoff + sizeof(*phdr)); + len = min(len, 0xFFFFU); + phdr->length = cpu_to_be16(len); + + return INDIRECT_CALL_INET(ptype->callbacks.gro_complete, + ipv6_gro_complete, inet_gro_complete, + skb, nhoff + PPPOE_SES_HLEN); +} + +static struct sk_buff *pppoe_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct packet_offload *ptype; + struct pppoe_hdr *phdr; + __be16 orig_type, type; + int len, nhoff; + + skb_reset_network_header(skb); + nhoff = skb_network_header(skb) - skb_mac_header(skb); + + if (unlikely(!pskb_may_pull(skb, PPPOE_SES_HLEN))) + goto out; + + phdr = (struct pppoe_hdr *)skb_network_header(skb); + type = pppoe_hdr_proto(phdr); + ptype = gro_find_complete_by_type(type); + if (!ptype) + goto out; + + orig_type = skb->protocol; + __skb_pull(skb, PPPOE_SES_HLEN); + features &= ~NETIF_F_GSO_SOFTWARE; + segs = ptype->callbacks.gso_segment(skb, features); + if (IS_ERR_OR_NULL(segs)) + goto out; + + skb = segs; + do { + phdr = (struct pppoe_hdr *)(skb_mac_header(skb) + nhoff); + len = skb->len - (nhoff + sizeof(*phdr)); + phdr->length = cpu_to_be16(len); + skb->network_header = (u8 *)phdr - skb->head; + skb->protocol = orig_type; + skb_reset_mac_len(skb); + } while ((skb = skb->next)); + +out: + return segs; +} + +static struct packet_offload pppoe_packet_offload __read_mostly = { + .type = cpu_to_be16(ETH_P_PPP_SES), + .priority = 20, + .callbacks = { + .gro_receive = pppoe_gro_receive, + .gro_complete = pppoe_gro_complete, + .gso_segment = pppoe_gso_segment, + }, +}; + static int __init pppoe_init(void) { int err; @@ -1119,6 +1274,8 @@ static int __init pppoe_init(void) if (err) goto out_unregister_pppoe_proto; + if (IS_ENABLED(CONFIG_INET)) + dev_add_offload(&pppoe_packet_offload); dev_add_pack(&pppoes_ptype); dev_add_pack(&pppoed_ptype); register_netdevice_notifier(&pppoe_notifier); @@ -1138,6 +1295,8 @@ static void __exit pppoe_exit(void) unregister_netdevice_notifier(&pppoe_notifier); dev_remove_pack(&pppoed_ptype); dev_remove_pack(&pppoes_ptype); + if (IS_ENABLED(CONFIG_INET)) + dev_remove_offload(&pppoe_packet_offload); unregister_pppox_proto(PX_PROTO_OE); proto_unregister(&pppoe_sk_proto); unregister_pernet_device(&pppoe_net_ops); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0e62032e76b1..cbac072633bb 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1540,6 +1540,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) return pp; } +EXPORT_INDIRECT_CALLABLE(inet_gro_receive); static struct sk_buff *ipip_gro_receive(struct list_head *head, struct sk_buff *skb) @@ -1625,6 +1626,7 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff) out: return err; } +EXPORT_INDIRECT_CALLABLE(inet_gro_complete); static int ipip_gro_complete(struct sk_buff *skb, int nhoff) { diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index d8072ad6b8c4..78f50c93c536 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -297,6 +297,7 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, return pp; } +EXPORT_INDIRECT_CALLABLE(ipv6_gro_receive); static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head, struct sk_buff *skb) @@ -359,6 +360,7 @@ INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff) out: return err; } +EXPORT_INDIRECT_CALLABLE(ipv6_gro_complete); static int sit_gro_complete(struct sk_buff *skb, int nhoff) { -- 2.43.0 Add PPPoE test-cases to the GRO selftest. Only run a subset of common_tests to avoid changing the hardcoded L3 offsets everywhere. Add a new "pppoe_sid" test case to verify that packets with different PPPoE session IDs are correctly identified as separate flows and not coalesced. Signed-off-by: Qingfang Deng --- v10: no changes v9: https://lore.kernel.org/netdev/20260509030507.387050-2-qingfang.deng@linux.dev - fix buffer length in send_changed_pppoe_sid() v8: https://lore.kernel.org/netdev/20260501035102.293031-2-qingfang.deng@linux.dev - add a new macro L2_HLEN_MAX for PPPoE v7: https://lore.kernel.org/netdev/20260428064717.74794-2-qingfang.deng@linux.dev - Do not run all the tests for PPPoE - Add a new test for PPPoE v6: https://lore.kernel.org/netdev/20260326081127.61229-2-dqfext@gmail.com tools/testing/selftests/drivers/net/config | 2 + tools/testing/selftests/drivers/net/gro.py | 11 +++ tools/testing/selftests/net/lib/gro.c | 100 +++++++++++++++++---- 3 files changed, 97 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/drivers/net/config b/tools/testing/selftests/drivers/net/config index 2309109a94ec..617de8aaf551 100644 --- a/tools/testing/selftests/drivers/net/config +++ b/tools/testing/selftests/drivers/net/config @@ -10,5 +10,7 @@ CONFIG_NETCONSOLE_EXTENDED_LOG=y CONFIG_NETDEVSIM=m CONFIG_NET_SCH_ETF=m CONFIG_NET_SCH_FQ=m +CONFIG_PPP=y +CONFIG_PPPOE=y CONFIG_VLAN_8021Q=m CONFIG_XDP_SOCKETS=y diff --git a/tools/testing/selftests/drivers/net/gro.py b/tools/testing/selftests/drivers/net/gro.py index 5ffaa7bdbff4..fd158c775b1c 100755 --- a/tools/testing/selftests/drivers/net/gro.py +++ b/tools/testing/selftests/drivers/net/gro.py @@ -323,6 +323,12 @@ def _gro_variants(): "ip_frag6", "ip_v6ext_same", "ip_v6ext_diff", ] + # Tests specific to PPPoE + pppoe_tests = [ + "data_same", "data_lrg_sml", "data_sml_lrg", "data_lrg_1byte", + "data_burst", "pppoe_sid", + ] + for mode in ["sw", "hw", "lro"]: for protocol in ["ipv4", "ipv6", "ipip", "ip6ip6"]: for test_name in common_tests: @@ -335,6 +341,11 @@ def _gro_variants(): for test_name in ipv6_tests: yield mode, protocol, test_name + for mode in ["sw"]: + for protocol in ["pppoev4", "pppoev6"]: + for test_name in pppoe_tests: + yield mode, protocol, test_name + @ksft_variants(_gro_variants()) def test(cfg, mode, protocol, test_name): diff --git a/tools/testing/selftests/net/lib/gro.c b/tools/testing/selftests/net/lib/gro.c index 11b16ae5f0e8..fa35dfc8e790 100644 --- a/tools/testing/selftests/net/lib/gro.c +++ b/tools/testing/selftests/net/lib/gro.c @@ -67,12 +67,14 @@ #include #include #include +#include +#include #include #include +#include #include #include -#include -#include +#include #include #include #include @@ -102,6 +104,7 @@ #define MAX_LARGE_PKT_CNT ((IP_MAXPACKET - (MAX_HDR_LEN - ETH_HLEN)) / \ (ASSUMED_MTU - (MAX_HDR_LEN - ETH_HLEN))) #define MIN_EXTHDR_SIZE 8 +#define L2_HLEN_MAX (ETH_HLEN + PPPOE_SES_HLEN) #define EXT_PAYLOAD_1 "\x00\x00\x00\x00\x00\x00" #define EXT_PAYLOAD_2 "\x11\x11\x11\x11\x11\x11" @@ -134,6 +137,7 @@ static int total_hdr_len = -1; static int ethhdr_proto = -1; static bool ipip; static bool ip6ip6; +static bool pppoe; static uint64_t txtime_ns; static int num_flows = 4; static bool order_check; @@ -171,6 +175,22 @@ static void vlog(const char *fmt, ...) } } +static void fill_pppoelayer(void *buf, int payload_len, uint16_t sid) +{ + struct pppoe_ppp_hdr { + struct pppoe_hdr eh; + __be16 proto; + } *ph = buf; + + payload_len += sizeof(struct tcphdr); + ph->eh.type = 1; + ph->eh.ver = 1; + ph->eh.code = 0; + ph->eh.sid = htons(sid); + ph->eh.length = htons(payload_len + sizeof(ph->proto)); + ph->proto = htons(proto == PF_INET ? PPP_IP : PPP_IPV6); +} + static void setup_sock_filter(int fd) { const int dport_off = tcp_offset + offsetof(struct tcphdr, dest); @@ -412,11 +432,15 @@ static void create_packet(void *buf, int seq_offset, int ack_offset, fill_networklayer(buf + inner_ip_off, payload_len, IPPROTO_TCP); if (inner_ip_off > ETH_HLEN) { - int encap_proto = (proto == PF_INET) ? - IPPROTO_IPIP : IPPROTO_IPV6; + if (pppoe) { + fill_pppoelayer(buf + ETH_HLEN, payload_len + ip_hdr_len, 0x1234); + } else { + int encap_proto = (proto == PF_INET) ? + IPPROTO_IPIP : IPPROTO_IPV6; - fill_networklayer(buf + ETH_HLEN, - payload_len + ip_hdr_len, encap_proto); + fill_networklayer(buf + ETH_HLEN, + payload_len + ip_hdr_len, encap_proto); + } } fill_datalinklayer(buf); @@ -526,7 +550,7 @@ static void send_flags(int fd, struct sockaddr_ll *daddr, int psh, int syn, static void send_data_pkts(int fd, struct sockaddr_ll *daddr, int payload_len1, int payload_len2) { - static char buf[ETH_HLEN + IP_MAXPACKET]; + static char buf[L2_HLEN_MAX + IP_MAXPACKET]; create_packet(buf, 0, 0, payload_len1, 0); write_packet(fd, buf, total_hdr_len + payload_len1, daddr); @@ -1071,6 +1095,20 @@ static void send_fragment6(int fd, struct sockaddr_ll *daddr) write_packet(fd, buf, bufpkt_len, daddr); } +static void send_changed_pppoe_sid(int fd, struct sockaddr_ll *daddr) +{ + static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; + int pkt_size = total_hdr_len + PAYLOAD_LEN; + struct pppoe_hdr *hdr = (struct pppoe_hdr *)(buf + ETH_HLEN); + + create_packet(buf, 0, 0, PAYLOAD_LEN, 0); + write_packet(fd, buf, pkt_size, daddr); + + create_packet(buf, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + hdr->sid = htons(0x4321); + write_packet(fd, buf, pkt_size, daddr); +} + static void bind_packetsocket(int fd) { struct sockaddr_ll daddr = {}; @@ -1121,9 +1159,10 @@ static void recv_error(int fd, int rcv_errno) static void check_recv_pkts(int fd, int *correct_payload, int correct_num_pkts) { - static char buffer[IP_MAXPACKET + ETH_HLEN + 1]; - struct iphdr *iph = (struct iphdr *)(buffer + ETH_HLEN); - struct ipv6hdr *ip6h = (struct ipv6hdr *)(buffer + ETH_HLEN); + static char buffer[IP_MAXPACKET + L2_HLEN_MAX + 1]; + int nhoff = ETH_HLEN + (pppoe ? PPPOE_SES_HLEN : 0); + struct iphdr *iph = (struct iphdr *)(buffer + nhoff); + struct ipv6hdr *ip6h = (struct ipv6hdr *)(buffer + nhoff); struct tcphdr *tcph; bool bad_packet = false; int tcp_ext_len = 0; @@ -1140,7 +1179,7 @@ static void check_recv_pkts(int fd, int *correct_payload, while (1) { ip_ext_len = 0; - pkt_size = recv(fd, buffer, IP_MAXPACKET + ETH_HLEN + 1, 0); + pkt_size = recv(fd, buffer, sizeof(buffer), 0); if (pkt_size < 0) recv_error(fd, errno); @@ -1183,9 +1222,10 @@ static void check_recv_pkts(int fd, int *correct_payload, static void check_capacity_pkts(int fd) { - static char buffer[IP_MAXPACKET + ETH_HLEN + 1]; - struct iphdr *iph = (struct iphdr *)(buffer + ETH_HLEN); - struct ipv6hdr *ip6h = (struct ipv6hdr *)(buffer + ETH_HLEN); + static char buffer[IP_MAXPACKET + L2_HLEN_MAX + 1]; + int nhoff = ETH_HLEN + (pppoe ? PPPOE_SES_HLEN : 0); + struct iphdr *iph = (struct iphdr *)(buffer + nhoff); + struct ipv6hdr *ip6h = (struct ipv6hdr *)(buffer + nhoff); int num_pkt = 0, num_coal = 0, pkt_idx; const char *fail_reason = NULL; int flow_order[num_flows * 2]; @@ -1203,7 +1243,7 @@ static void check_capacity_pkts(int fd) while (1) { ip_ext_len = 0; - pkt_size = recv(fd, buffer, IP_MAXPACKET + ETH_HLEN + 1, 0); + pkt_size = recv(fd, buffer, sizeof(buffer), 0); if (pkt_size < 0) recv_error(fd, errno); @@ -1499,6 +1539,12 @@ static void gro_sender(void) usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + /* PPPoE sub-tests */ + } else if (strcmp(testname, "pppoe_sid") == 0) { + send_changed_pppoe_sid(txfd, &daddr); + usleep(fin_delay_us); + write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + } else { error(1, 0, "Unknown testcase: %s", testname); } @@ -1716,6 +1762,12 @@ static void gro_receiver(void) } else if (strcmp(testname, "capacity") == 0) { check_capacity_pkts(rxfd); + } else if (strcmp(testname, "pppoe_sid") == 0) { + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + printf("different PPPoE session ID doesn't coalesce: "); + check_recv_pkts(rxfd, correct_payload, 2); + } else { error(1, 0, "Test case error: unknown testname %s", testname); } @@ -1734,6 +1786,8 @@ static void parse_args(int argc, char **argv) { "ipv6", no_argument, NULL, '6' }, { "ipip", no_argument, NULL, 'e' }, { "ip6ip6", no_argument, NULL, 'E' }, + { "pppoev4", no_argument, NULL, 'p' }, + { "pppoev6", no_argument, NULL, 'P' }, { "num-flows", required_argument, NULL, 'n' }, { "rx", no_argument, NULL, 'r' }, { "saddr", required_argument, NULL, 's' }, @@ -1745,7 +1799,7 @@ static void parse_args(int argc, char **argv) }; int c; - while ((c = getopt_long(argc, argv, "46d:D:eEi:n:rs:S:t:ov", opts, NULL)) != -1) { + while ((c = getopt_long(argc, argv, "46d:D:eEi:n:pPrs:S:t:ov", opts, NULL)) != -1) { switch (c) { case '4': proto = PF_INET; @@ -1765,6 +1819,16 @@ static void parse_args(int argc, char **argv) proto = PF_INET6; ethhdr_proto = htons(ETH_P_IPV6); break; + case 'p': + pppoe = true; + proto = PF_INET; + ethhdr_proto = htons(ETH_P_PPP_SES); + break; + case 'P': + pppoe = true; + proto = PF_INET6; + ethhdr_proto = htons(ETH_P_PPP_SES); + break; case 'd': addr4_dst = addr6_dst = optarg; break; @@ -1812,6 +1876,10 @@ int main(int argc, char **argv) } else if (ip6ip6) { tcp_offset = ETH_HLEN + sizeof(struct ipv6hdr) * 2; total_hdr_len = tcp_offset + sizeof(struct tcphdr); + } else if (pppoe) { + tcp_offset = ETH_HLEN + PPPOE_SES_HLEN + + (proto == PF_INET ? sizeof(struct iphdr) : sizeof(struct ipv6hdr)); + total_hdr_len = tcp_offset + sizeof(struct tcphdr); } else if (proto == PF_INET) { tcp_offset = ETH_HLEN + sizeof(struct iphdr); total_hdr_len = tcp_offset + sizeof(struct tcphdr); -- 2.43.0