From 8b283b7a539227eda844f8354d01542c00f69c68 Mon Sep 17 00:00:00 2001
From: KaneGreen <737445366KG@Gmail.com>
Date: Sun, 31 Mar 2024 15:41:34 +0800
Subject: [PATCH] add TCP backports

---
 ...p-avoid-indirect-calls-to-sock_rfree.patch |  77 +++++++
 ...reeing-after-socket-lock-is-released.patch | 203 ++++++++++++++++++
 SCRIPTS/02_prepare_package.sh                 |  10 +-
 3 files changed, 286 insertions(+), 4 deletions(-)
 create mode 100644 PATCH/backport/TCP/680-01-v5.17-tcp-avoid-indirect-calls-to-sock_rfree.patch
 create mode 100644 PATCH/backport/TCP/680-02-v5.17-tcp-defer-skb-freeing-after-socket-lock-is-released.patch

diff --git a/PATCH/backport/TCP/680-01-v5.17-tcp-avoid-indirect-calls-to-sock_rfree.patch b/PATCH/backport/TCP/680-01-v5.17-tcp-avoid-indirect-calls-to-sock_rfree.patch
new file mode 100644
index 0000000..2330355
--- /dev/null
+++ b/PATCH/backport/TCP/680-01-v5.17-tcp-avoid-indirect-calls-to-sock_rfree.patch
@@ -0,0 +1,77 @@
+From 4ece2cb09bde0bdb5a678bf6c6d0fc4b82a57076 Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Mon, 15 Nov 2021 11:02:45 -0800
+Subject: tcp: avoid indirect calls to sock_rfree
+
+TCP uses sk_eat_skb() when skbs can be removed from receive queue.
+However, the call to skb_orphan() from __kfree_skb() incurs
+an indirect call to sock_rfree(), which is more expensive than
+a direct call, especially for CONFIG_RETPOLINE=y.
+
+Add tcp_eat_recv_skb() function to make the call before
+__kfree_skb().
+ +Signed-off-by: Eric Dumazet +--- + net/ipv4/tcp.c | 20 +++++++++++++++----- + 1 file changed, 15 insertions(+), 5 deletions(-) + +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -1596,6 +1596,16 @@ void tcp_cleanup_rbuf(struct sock *sk, i + tcp_send_ack(sk); + } + ++static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) ++{ ++ if (likely(skb->destructor == sock_rfree)) { ++ sock_rfree(skb); ++ skb->destructor = NULL; ++ skb->sk = NULL; ++ } ++ sk_eat_skb(sk, skb); ++} ++ + static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) + { + struct sk_buff *skb; +@@ -1615,7 +1625,7 @@ static struct sk_buff *tcp_recv_skb(stru + * splitted a fat GRO packet, while we released socket lock + * in skb_splice_bits() + */ +- sk_eat_skb(sk, skb); ++ tcp_eat_recv_skb(sk, skb); + } + return NULL; + } +@@ -1683,11 +1693,11 @@ int tcp_read_sock(struct sock *sk, read_ + continue; + } + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { +- sk_eat_skb(sk, skb); ++ tcp_eat_recv_skb(sk, skb); + ++seq; + break; + } +- sk_eat_skb(sk, skb); ++ tcp_eat_recv_skb(sk, skb); + if (!desc->count) + break; + WRITE_ONCE(tp->copied_seq, seq); +@@ -2513,14 +2523,14 @@ skip_copy: + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + goto found_fin_ok; + if (!(flags & MSG_PEEK)) +- sk_eat_skb(sk, skb); ++ tcp_eat_recv_skb(sk, skb); + continue; + + found_fin_ok: + /* Process the FIN. 
*/ + WRITE_ONCE(*seq, *seq + 1); + if (!(flags & MSG_PEEK)) +- sk_eat_skb(sk, skb); ++ tcp_eat_recv_skb(sk, skb); + break; + } while (len > 0); + diff --git a/PATCH/backport/TCP/680-02-v5.17-tcp-defer-skb-freeing-after-socket-lock-is-released.patch b/PATCH/backport/TCP/680-02-v5.17-tcp-defer-skb-freeing-after-socket-lock-is-released.patch new file mode 100644 index 0000000..ce83a04 --- /dev/null +++ b/PATCH/backport/TCP/680-02-v5.17-tcp-defer-skb-freeing-after-socket-lock-is-released.patch @@ -0,0 +1,203 @@ +From f35f821935d8df76f9c92e2431a225bdff938169 Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Mon, 15 Nov 2021 11:02:46 -0800 +Subject: tcp: defer skb freeing after socket lock is released + +tcp recvmsg() (or rx zerocopy) spends a fair amount of time +freeing skbs after their payload has been consumed. + +A typical ~64KB GRO packet has to release ~45 page +references, eventually going to page allocator +for each of them. + +Currently, this freeing is performed while socket lock +is held, meaning that there is a high chance that +BH handler has to queue incoming packets to tcp socket backlog. + +This can cause additional latencies, because the user +thread has to process the backlog at release_sock() time, +and while doing so, additional frames can be added +by BH handler. + +This patch adds logic to defer these frees after socket +lock is released, or directly from BH handler if possible. + +Being able to free these skbs from BH handler helps a lot, +because this avoids the usual alloc/free assymetry, +when BH handler and user thread do not run on same cpu or +NUMA node. 
+ +One cpu can now be fully utilized for the kernel->user copy, +and another cpu is handling BH processing and skb/page +allocs/frees (assuming RFS is not forcing use of a single CPU) + +Tested: + 100Gbit NIC + Max throughput for one TCP_STREAM flow, over 10 runs + +MTU : 1500 +Before: 55 Gbit +After: 66 Gbit + +MTU : 4096+(headers) +Before: 82 Gbit +After: 95 Gbit + +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +--- + include/linux/skbuff.h | 2 ++ + include/net/sock.h | 3 +++ + include/net/tcp.h | 10 ++++++++++ + net/ipv4/tcp.c | 27 +++++++++++++++++++++++++-- + net/ipv4/tcp_ipv4.c | 1 + + net/ipv6/tcp_ipv6.c | 1 + + 6 files changed, 42 insertions(+), 2 deletions(-) + +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + #include + #include + #if IS_ENABLED(CONFIG_NF_CONNTRACK) +@@ -777,6 +778,7 @@ struct sk_buff { + }; + struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ + struct list_head list; ++ struct llist_node ll_node; + }; + + union { +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -63,6 +63,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -410,6 +411,8 @@ struct sock { + struct sk_buff *head; + struct sk_buff *tail; + } sk_backlog; ++ struct llist_head defer_list; ++ + #define sk_rmem_alloc sk_backlog.rmem_alloc + + int sk_forward_alloc; +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1444,6 +1444,16 @@ static inline bool tcp_checksum_complete + } + + bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb); ++ ++void __sk_defer_free_flush(struct sock *sk); ++ ++static inline void sk_defer_free_flush(struct sock *sk) ++{ ++ if (llist_empty(&sk->defer_list)) ++ return; ++ __sk_defer_free_flush(sk); ++} ++ + int tcp_filter(struct sock *sk, struct sk_buff *skb); + void tcp_set_state(struct sock *sk, int state); + void tcp_done(struct sock *sk); +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -1596,14 
+1596,34 @@ void tcp_cleanup_rbuf(struct sock *sk, i + tcp_send_ack(sk); + } + ++void __sk_defer_free_flush(struct sock *sk) ++{ ++ struct llist_node *head; ++ struct sk_buff *skb, *n; ++ ++ head = llist_del_all(&sk->defer_list); ++ llist_for_each_entry_safe(skb, n, head, ll_node) { ++ prefetch(n); ++ skb_mark_not_on_list(skb); ++ __kfree_skb(skb); ++ } ++} ++EXPORT_SYMBOL(__sk_defer_free_flush); ++ + static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) + { ++ __skb_unlink(skb, &sk->sk_receive_queue); + if (likely(skb->destructor == sock_rfree)) { + sock_rfree(skb); + skb->destructor = NULL; + skb->sk = NULL; ++ if (!skb_queue_empty(&sk->sk_receive_queue) || ++ !llist_empty(&sk->defer_list)) { ++ llist_add(&skb->ll_node, &sk->defer_list); ++ return; ++ } + } +- sk_eat_skb(sk, skb); ++ __kfree_skb(skb); + } + + static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) +@@ -2454,6 +2474,7 @@ static int tcp_recvmsg_locked(struct soc + release_sock(sk); + lock_sock(sk); + } else { ++ sk_defer_free_flush(sk); + sk_wait_data(sk, &timeo, last); + } + +@@ -2572,6 +2593,7 @@ int tcp_recvmsg(struct sock *sk, struct + ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss, + &cmsg_flags); + release_sock(sk); ++ sk_defer_free_flush(sk); + + if (cmsg_flags && ret >= 0) { + if (cmsg_flags & TCP_CMSG_TS) +@@ -3116,7 +3138,7 @@ int tcp_disconnect(struct sock *sk, int + sk->sk_frag.page = NULL; + sk->sk_frag.offset = 0; + } +- ++ sk_defer_free_flush(sk); + sk_error_report(sk); + return 0; + } +@@ -4251,6 +4273,7 @@ static int do_tcp_getsockopt(struct sock + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, + &zc, &len, err); + release_sock(sk); ++ sk_defer_free_flush(sk); + if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) + goto zerocopy_rcv_cmsg; + switch (len) { +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -2121,6 +2121,7 @@ process: + + sk_incoming_cpu_update(sk); + ++ sk_defer_free_flush(sk); + 
bh_lock_sock_nested(sk); + tcp_segs_in(tcp_sk(sk), skb); + ret = 0; +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -1758,6 +1758,7 @@ process: + + sk_incoming_cpu_update(sk); + ++ sk_defer_free_flush(sk); + bh_lock_sock_nested(sk); + tcp_segs_in(tcp_sk(sk), skb); + ret = 0; diff --git a/SCRIPTS/02_prepare_package.sh b/SCRIPTS/02_prepare_package.sh index 4282570..853be6e 100644 --- a/SCRIPTS/02_prepare_package.sh +++ b/SCRIPTS/02_prepare_package.sh @@ -17,9 +17,9 @@ mkdir -p package/new ### 2. 补丁 ### # BBR v3 -mv -f ../PATCH/BBRv3/kernel/* ./target/linux/generic/backport-5.15/ +mv -f ../PATCH/BBRv3/kernel/*.patch ./target/linux/generic/backport-5.15/ # # LRNG -# mv -f ../PATCH/LRNG/* ./target/linux/generic/hack-5.15/ +# mv -f ../PATCH/LRNG/*.patch ./target/linux/generic/hack-5.15/ # echo ' # # CONFIG_RANDOM_DEFAULT_IMPL is not set # CONFIG_LRNG=y @@ -55,6 +55,8 @@ git clone --depth 1 https://github.com/sbwml/feeds_packages_lang_node-prebuilt.g # hotplug 配置 mkdir -p files/etc/hotplug.d/net mv ../PATCH/hotplug_conf/01-maximize_nic_rx_tx_buffers ./files/etc/hotplug.d/net/ +# TCP optimizations +mv -f ../PATCH/backport/TCP/*.patch ./target/linux/generic/backport-5.15/ # 根据体系调整 case ${MYOPENWRTTARGET} in R2S) @@ -70,14 +72,14 @@ case ${MYOPENWRTTARGET} in cp -r ../Immortalwrt_2305/package/boot/arm-trusted-firmware-rockchip/ ./package/boot/arm-trusted-firmware-rockchip/ cp -r ../Immortalwrt_2305/package/boot/uboot-rockchip/ ./package/boot/uboot-rockchip/ mv -f ../Immortalwrt_2305/target/linux/rockchip/ ./target/linux/rockchip/ - mv -f ../PATCH/rockchip-5.15/* ./target/linux/rockchip/patches-5.15/ + mv -f ../PATCH/rockchip-5.15/*.patch ./target/linux/rockchip/patches-5.15/ sed -i '/REQUIRE_IMAGE_METADATA/d' target/linux/rockchip/armv8/base-files/lib/upgrade/platform.sh ;; x86) # 平台优化,不再考虑过于老旧的平台 sed -i 's/-Os/-O2 -march=x86-64-v2/g' ./include/target.mk # x86 csum - mv -f ../PATCH/backport/x86_csum/* ./target/linux/generic/backport-5.15/ + mv -f 
../PATCH/backport/x86_csum/*.patch ./target/linux/generic/backport-5.15/ # Enable SMP echo ' CONFIG_X86_INTEL_PSTATE=y