add TCP backports
KaneGreen committed Mar 31, 2024
1 parent 9ebd60f commit 8b283b7
Showing 3 changed files with 286 additions and 4 deletions.
@@ -0,0 +1,77 @@
From 4ece2cb09bde0bdb5a678bf6c6d0fc4b82a57076 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Nov 2021 11:02:45 -0800
Subject: tcp: avoid indirect calls to sock_rfree

TCP uses sk_eat_skb() when skbs can be removed from receive queue.
However, the call to skb_orphan() from __kfree_skb() incurs
an indirect call to sock_rfree(), which is more expensive than
a direct call, especially for CONFIG_RETPOLINE=y.

Add tcp_eat_recv_skb() function to make the call before
__kfree_skb().

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/ipv4/tcp.c | 20 +++++++++++++++-----
1 file changed, 15 insertions(+), 5 deletions(-)

--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1596,6 +1596,16 @@ void tcp_cleanup_rbuf(struct sock *sk, i
tcp_send_ack(sk);
}

+static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ if (likely(skb->destructor == sock_rfree)) {
+ sock_rfree(skb);
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ }
+ sk_eat_skb(sk, skb);
+}
+
static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
@@ -1615,7 +1625,7 @@ static struct sk_buff *tcp_recv_skb(stru
* splitted a fat GRO packet, while we released socket lock
* in skb_splice_bits()
*/
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
}
return NULL;
}
@@ -1683,11 +1693,11 @@ int tcp_read_sock(struct sock *sk, read_
continue;
}
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
++seq;
break;
}
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
if (!desc->count)
break;
WRITE_ONCE(tp->copied_seq, seq);
@@ -2513,14 +2523,14 @@ skip_copy:
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
continue;

found_fin_ok:
/* Process the FIN. */
WRITE_ONCE(*seq, *seq + 1);
if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
break;
} while (len > 0);
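
The heart of this first backport is manual devirtualization: almost every skb sitting on a TCP receive queue has sock_rfree() as its destructor, so comparing the function pointer against that one expected target and calling it directly lets the hot path skip the indirect branch (and its retpoline thunk when CONFIG_RETPOLINE=y). A minimal userspace sketch of the same pattern, with illustrative names rather than kernel API:

#include <stdio.h>

/* Stands in for sock_rfree(): the destructor nearly every receive-queue
 * buffer carries. Illustrative only, not kernel code. */
static void common_free(void *obj)
{
	printf("common_free(%p)\n", obj);
}

struct buf {
	void (*destructor)(void *);	/* usually common_free, but not always */
	void *obj;
};

static void release_buf(struct buf *b)
{
	/* Devirtualize: if the pointer holds the expected target, make a
	 * direct call; the indirect call survives only on the rare path. */
	if (b->destructor == common_free) {
		common_free(b->obj);		/* direct call, no thunk */
		b->destructor = NULL;
	} else if (b->destructor) {
		b->destructor(b->obj);		/* genuinely indirect */
		b->destructor = NULL;
	}
}

int main(void)
{
	int payload = 42;
	struct buf b = { .destructor = common_free, .obj = &payload };
	release_buf(&b);
	return 0;
}

The likely() in the kernel version additionally hints the compiler to lay the direct-call branch out as the hot path.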

@@ -0,0 +1,203 @@
From f35f821935d8df76f9c92e2431a225bdff938169 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Nov 2021 11:02:46 -0800
Subject: tcp: defer skb freeing after socket lock is released

tcp recvmsg() (or rx zerocopy) spends a fair amount of time
freeing skbs after their payload has been consumed.

A typical ~64KB GRO packet has to release ~45 page
references, eventually going to page allocator
for each of them.

Currently, this freeing is performed while socket lock
is held, meaning that there is a high chance that
BH handler has to queue incoming packets to tcp socket backlog.

This can cause additional latencies, because the user
thread has to process the backlog at release_sock() time,
and while doing so, additional frames can be added
by BH handler.

This patch adds logic to defer these frees after socket
lock is released, or directly from BH handler if possible.

Being able to free these skbs from BH handler helps a lot,
because this avoids the usual alloc/free asymmetry,
when BH handler and user thread do not run on the same cpu or
NUMA node.

One cpu can now be fully utilized for the kernel->user copy,
and another cpu is handling BH processing and skb/page
allocs/frees (assuming RFS is not forcing use of a single CPU).

Tested:
100Gbit NIC
Max throughput for one TCP_STREAM flow, over 10 runs

MTU : 1500
Before: 55 Gbit
After: 66 Gbit

MTU : 4096+(headers)
Before: 82 Gbit
After: 95 Gbit

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/linux/skbuff.h | 2 ++
include/net/sock.h | 3 +++
include/net/tcp.h | 10 ++++++++++
net/ipv4/tcp.c | 27 +++++++++++++++++++++++++--
net/ipv4/tcp_ipv4.c | 1 +
net/ipv6/tcp_ipv6.c | 1 +
6 files changed, 42 insertions(+), 2 deletions(-)

--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -36,6 +36,7 @@
#include <linux/splice.h>
#include <linux/in6.h>
#include <linux/if_packet.h>
+#include <linux/llist.h>
#include <net/flow.h>
#include <net/page_pool.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
@@ -777,6 +778,7 @@ struct sk_buff {
};
struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */
struct list_head list;
+ struct llist_node ll_node;
};

union {
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -63,6 +63,7 @@
#include <linux/indirect_call_wrapper.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
+#include <linux/llist.h>
#include <net/dst.h>
#include <net/checksum.h>
#include <net/tcp_states.h>
@@ -410,6 +411,8 @@ struct sock {
struct sk_buff *head;
struct sk_buff *tail;
} sk_backlog;
+ struct llist_head defer_list;
+
#define sk_rmem_alloc sk_backlog.rmem_alloc

int sk_forward_alloc;
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1444,6 +1444,16 @@ static inline bool tcp_checksum_complete
}

bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb);
+
+void __sk_defer_free_flush(struct sock *sk);
+
+static inline void sk_defer_free_flush(struct sock *sk)
+{
+ if (llist_empty(&sk->defer_list))
+ return;
+ __sk_defer_free_flush(sk);
+}
+
int tcp_filter(struct sock *sk, struct sk_buff *skb);
void tcp_set_state(struct sock *sk, int state);
void tcp_done(struct sock *sk);
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1596,14 +1596,34 @@ void tcp_cleanup_rbuf(struct sock *sk, i
tcp_send_ack(sk);
}

+void __sk_defer_free_flush(struct sock *sk)
+{
+ struct llist_node *head;
+ struct sk_buff *skb, *n;
+
+ head = llist_del_all(&sk->defer_list);
+ llist_for_each_entry_safe(skb, n, head, ll_node) {
+ prefetch(n);
+ skb_mark_not_on_list(skb);
+ __kfree_skb(skb);
+ }
+}
+EXPORT_SYMBOL(__sk_defer_free_flush);
+
static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
{
+ __skb_unlink(skb, &sk->sk_receive_queue);
if (likely(skb->destructor == sock_rfree)) {
sock_rfree(skb);
skb->destructor = NULL;
skb->sk = NULL;
+ if (!skb_queue_empty(&sk->sk_receive_queue) ||
+ !llist_empty(&sk->defer_list)) {
+ llist_add(&skb->ll_node, &sk->defer_list);
+ return;
+ }
}
- sk_eat_skb(sk, skb);
+ __kfree_skb(skb);
}

static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
@@ -2454,6 +2474,7 @@ static int tcp_recvmsg_locked(struct soc
release_sock(sk);
lock_sock(sk);
} else {
+ sk_defer_free_flush(sk);
sk_wait_data(sk, &timeo, last);
}

@@ -2572,6 +2593,7 @@ int tcp_recvmsg(struct sock *sk, struct
ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss,
&cmsg_flags);
release_sock(sk);
+ sk_defer_free_flush(sk);

if (cmsg_flags && ret >= 0) {
if (cmsg_flags & TCP_CMSG_TS)
@@ -3116,7 +3138,7 @@ int tcp_disconnect(struct sock *sk, int
sk->sk_frag.page = NULL;
sk->sk_frag.offset = 0;
}
-
+ sk_defer_free_flush(sk);
sk_error_report(sk);
return 0;
}
@@ -4251,6 +4273,7 @@ static int do_tcp_getsockopt(struct sock
err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
&zc, &len, err);
release_sock(sk);
+ sk_defer_free_flush(sk);
if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
goto zerocopy_rcv_cmsg;
switch (len) {
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2121,6 +2121,7 @@ process:

sk_incoming_cpu_update(sk);

+ sk_defer_free_flush(sk);
bh_lock_sock_nested(sk);
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1758,6 +1758,7 @@ process:

sk_incoming_cpu_update(sk);

+ sk_defer_free_flush(sk);
bh_lock_sock_nested(sk);
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
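
The deferral machinery above leans on the kernel's llist, a lock-free singly linked list: tcp_eat_recv_skb() pushes consumed skbs onto sk->defer_list with llist_add() (a compare-and-swap loop on the head pointer), and __sk_defer_free_flush() detaches the whole chain in one atomic exchange via llist_del_all(), then frees each entry without the socket lock held. A rough userspace analogue of that push/pop-all handoff, written with C11 atomics and hypothetical names:

#include <stdatomic.h>
#include <stdio.h>

/* Node embedded in the deferred object, like skb->ll_node. */
struct node {
	struct node *next;
};

/* Lock-free list head, like struct llist_head. */
static _Atomic(struct node *) defer_list;

/* Producer side (kernel llist_add()): push one node with a CAS loop. */
static void defer_push(struct node *n)
{
	struct node *head = atomic_load(&defer_list);
	do {
		n->next = head;
	} while (!atomic_compare_exchange_weak(&defer_list, &head, n));
}

/* Consumer side (kernel llist_del_all()): take the entire chain in a
 * single atomic exchange, then walk it privately, no lock required. */
static struct node *defer_pop_all(void)
{
	return atomic_exchange(&defer_list, NULL);
}

int main(void)
{
	struct node a, b;
	defer_push(&a);
	defer_push(&b);
	for (struct node *n = defer_pop_all(); n; n = n->next)
		printf("flushing %p\n", (void *)n);
	return 0;
}

Because the consumer detaches the whole chain at once, producers never contend with the flush walk, which is what allows sk_defer_free_flush() to be called from both process context and the BH paths in tcp_ipv4.c and tcp_ipv6.c.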
SCRIPTS/02_prepare_package.sh: 10 changes (6 additions & 4 deletions)
@@ -17,9 +17,9 @@ mkdir -p package/new

### 2. Patches ###
# BBR v3
-mv -f ../PATCH/BBRv3/kernel/* ./target/linux/generic/backport-5.15/
+mv -f ../PATCH/BBRv3/kernel/*.patch ./target/linux/generic/backport-5.15/
# # LRNG
-# mv -f ../PATCH/LRNG/* ./target/linux/generic/hack-5.15/
+# mv -f ../PATCH/LRNG/*.patch ./target/linux/generic/hack-5.15/
# echo '
# # CONFIG_RANDOM_DEFAULT_IMPL is not set
# CONFIG_LRNG=y
@@ -55,6 +55,8 @@ git clone --depth 1 https://github.com/sbwml/feeds_packages_lang_node-prebuilt.g
# hotplug configuration
mkdir -p files/etc/hotplug.d/net
mv ../PATCH/hotplug_conf/01-maximize_nic_rx_tx_buffers ./files/etc/hotplug.d/net/
+# TCP optimizations
+mv -f ../PATCH/backport/TCP/*.patch ./target/linux/generic/backport-5.15/
# Adjust per target
case ${MYOPENWRTTARGET} in
R2S)
@@ -70,14 +72,14 @@ case ${MYOPENWRTTARGET} in
cp -r ../Immortalwrt_2305/package/boot/arm-trusted-firmware-rockchip/ ./package/boot/arm-trusted-firmware-rockchip/
cp -r ../Immortalwrt_2305/package/boot/uboot-rockchip/ ./package/boot/uboot-rockchip/
mv -f ../Immortalwrt_2305/target/linux/rockchip/ ./target/linux/rockchip/
-mv -f ../PATCH/rockchip-5.15/* ./target/linux/rockchip/patches-5.15/
+mv -f ../PATCH/rockchip-5.15/*.patch ./target/linux/rockchip/patches-5.15/
sed -i '/REQUIRE_IMAGE_METADATA/d' target/linux/rockchip/armv8/base-files/lib/upgrade/platform.sh
;;
x86)
# Platform optimization: no longer targeting very old platforms
sed -i 's/-Os/-O2 -march=x86-64-v2/g' ./include/target.mk
# x86 csum
-mv -f ../PATCH/backport/x86_csum/* ./target/linux/generic/backport-5.15/
+mv -f ../PATCH/backport/x86_csum/*.patch ./target/linux/generic/backport-5.15/
# Enable SMP
echo '
CONFIG_X86_INTEL_PSTATE=y
