Showing 3 changed files with 286 additions and 4 deletions.
PATCH/backport/TCP/680-01-v5.17-tcp-avoid-indirect-calls-to-sock_rfree.patch (77 additions, 0 deletions)
@@ -0,0 +1,77 @@
From 4ece2cb09bde0bdb5a678bf6c6d0fc4b82a57076 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Nov 2021 11:02:45 -0800
Subject: tcp: avoid indirect calls to sock_rfree

TCP uses sk_eat_skb() when skbs can be removed from receive queue.
However, the call to skb_orphan() from __kfree_skb() incurs
an indirect call to sock_rfree(), which is more expensive than
a direct call, especially for CONFIG_RETPOLINE=y.

Add tcp_eat_recv_skb() function to make the call before
__kfree_skb().

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/tcp.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1596,6 +1596,16 @@ void tcp_cleanup_rbuf(struct sock *sk, i
 		tcp_send_ack(sk);
 }
 
+static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	if (likely(skb->destructor == sock_rfree)) {
+		sock_rfree(skb);
+		skb->destructor = NULL;
+		skb->sk = NULL;
+	}
+	sk_eat_skb(sk, skb);
+}
+
 static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
 {
 	struct sk_buff *skb;
@@ -1615,7 +1625,7 @@ static struct sk_buff *tcp_recv_skb(stru
 		 * splitted a fat GRO packet, while we released socket lock
 		 * in skb_splice_bits()
 		 */
-		sk_eat_skb(sk, skb);
+		tcp_eat_recv_skb(sk, skb);
 	}
 	return NULL;
 }
@@ -1683,11 +1693,11 @@ int tcp_read_sock(struct sock *sk, read_
 			continue;
 		}
 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
-			sk_eat_skb(sk, skb);
+			tcp_eat_recv_skb(sk, skb);
 			++seq;
 			break;
 		}
-		sk_eat_skb(sk, skb);
+		tcp_eat_recv_skb(sk, skb);
 		if (!desc->count)
 			break;
 		WRITE_ONCE(tp->copied_seq, seq);
@@ -2513,14 +2523,14 @@ skip_copy:
 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 			goto found_fin_ok;
 		if (!(flags & MSG_PEEK))
-			sk_eat_skb(sk, skb);
+			tcp_eat_recv_skb(sk, skb);
 		continue;
 
 found_fin_ok:
 		/* Process the FIN. */
 		WRITE_ONCE(*seq, *seq + 1);
 		if (!(flags & MSG_PEEK))
-			sk_eat_skb(sk, skb);
+			tcp_eat_recv_skb(sk, skb);
 		break;
 	} while (len > 0);
 
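The core trick in this first patch is devirtualization by comparison: for skbs on the receive queue, skb->destructor almost always points at sock_rfree, so comparing the pointer against the expected target and making a direct call avoids the indirect branch that retpolines make expensive. The following is a minimal userspace sketch of the same pattern, not kernel code; the names (struct buf, pool_release, free_buf) are hypothetical stand-ins for sk_buff, sock_rfree and sk_eat_skb.

#include <stdio.h>

struct buf {
	void (*destructor)(struct buf *b);	/* plays the role of skb->destructor */
	int id;
};

/* The overwhelmingly common destructor, standing in for sock_rfree(). */
static void pool_release(struct buf *b)
{
	printf("pool_release(%d)\n", b->id);
}

static void free_buf(struct buf *b)
{
	/*
	 * Guarded direct call: a well-predicted compare-and-branch is far
	 * cheaper than calling through the pointer, especially under
	 * CONFIG_RETPOLINE-style mitigations.
	 */
	if (b->destructor == pool_release)
		pool_release(b);	/* direct call, no indirect branch */
	else if (b->destructor)
		b->destructor(b);	/* rare fallback: indirect call */
	b->destructor = NULL;
}

int main(void)
{
	struct buf b = { .destructor = pool_release, .id = 1 };

	free_buf(&b);
	return 0;
}

The guard also lets the caller clear skb->destructor and skb->sk before __kfree_skb() runs, which is exactly why tcp_eat_recv_skb() above leaves nothing for skb_orphan() to call later.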
PATCH/backport/TCP/680-02-v5.17-tcp-defer-skb-freeing-after-socket-lock-is-released.patch (203 additions, 0 deletions)
@@ -0,0 +1,203 @@
From f35f821935d8df76f9c92e2431a225bdff938169 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Nov 2021 11:02:46 -0800
Subject: tcp: defer skb freeing after socket lock is released

tcp recvmsg() (or rx zerocopy) spends a fair amount of time
freeing skbs after their payload has been consumed.

A typical ~64KB GRO packet has to release ~45 page
references, eventually going to page allocator
for each of them.

Currently, this freeing is performed while socket lock
is held, meaning that there is a high chance that
BH handler has to queue incoming packets to tcp socket backlog.

This can cause additional latencies, because the user
thread has to process the backlog at release_sock() time,
and while doing so, additional frames can be added
by BH handler.

This patch adds logic to defer these frees after socket
lock is released, or directly from BH handler if possible.

Being able to free these skbs from BH handler helps a lot,
because this avoids the usual alloc/free asymmetry,
when BH handler and user thread do not run on same cpu or
NUMA node.

One cpu can now be fully utilized for the kernel->user copy,
and another cpu is handling BH processing and skb/page
allocs/frees (assuming RFS is not forcing use of a single CPU)

Tested:
100Gbit NIC
Max throughput for one TCP_STREAM flow, over 10 runs

MTU : 1500
Before: 55 Gbit
After: 66 Gbit

MTU : 4096+(headers)
Before: 82 Gbit
After: 95 Gbit

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  2 ++
 include/net/sock.h     |  3 +++
 include/net/tcp.h      | 10 ++++++++++
 net/ipv4/tcp.c         | 27 +++++++++++++++++++++++++--
 net/ipv4/tcp_ipv4.c    |  1 +
 net/ipv6/tcp_ipv6.c    |  1 +
 6 files changed, 42 insertions(+), 2 deletions(-)

--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -36,6 +36,7 @@
 #include <linux/splice.h>
 #include <linux/in6.h>
 #include <linux/if_packet.h>
+#include <linux/llist.h>
 #include <net/flow.h>
 #include <net/page_pool.h>
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
@@ -777,6 +778,7 @@ struct sk_buff {
 		};
 		struct rb_node		rbnode; /* used in netem, ip4 defrag, and tcp stack */
 		struct list_head	list;
+		struct llist_node	ll_node;
 	};
 
 	union {
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -63,6 +63,7 @@
 #include <linux/indirect_call_wrapper.h>
 #include <linux/atomic.h>
 #include <linux/refcount.h>
+#include <linux/llist.h>
 #include <net/dst.h>
 #include <net/checksum.h>
 #include <net/tcp_states.h>
@@ -410,6 +411,8 @@ struct sock {
 		struct sk_buff	*head;
 		struct sk_buff	*tail;
 	} sk_backlog;
+	struct llist_head defer_list;
+
 #define sk_rmem_alloc sk_backlog.rmem_alloc
 
 	int			sk_forward_alloc;
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1444,6 +1444,16 @@ static inline bool tcp_checksum_complete
 }
 
 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb);
+
+void __sk_defer_free_flush(struct sock *sk);
+
+static inline void sk_defer_free_flush(struct sock *sk)
+{
+	if (llist_empty(&sk->defer_list))
+		return;
+	__sk_defer_free_flush(sk);
+}
+
 int tcp_filter(struct sock *sk, struct sk_buff *skb);
 void tcp_set_state(struct sock *sk, int state);
 void tcp_done(struct sock *sk);
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1596,14 +1596,34 @@ void tcp_cleanup_rbuf(struct sock *sk, i
 		tcp_send_ack(sk);
 }
 
+void __sk_defer_free_flush(struct sock *sk)
+{
+	struct llist_node *head;
+	struct sk_buff *skb, *n;
+
+	head = llist_del_all(&sk->defer_list);
+	llist_for_each_entry_safe(skb, n, head, ll_node) {
+		prefetch(n);
+		skb_mark_not_on_list(skb);
+		__kfree_skb(skb);
+	}
+}
+EXPORT_SYMBOL(__sk_defer_free_flush);
+
 static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
 {
+	__skb_unlink(skb, &sk->sk_receive_queue);
 	if (likely(skb->destructor == sock_rfree)) {
 		sock_rfree(skb);
 		skb->destructor = NULL;
 		skb->sk = NULL;
+		if (!skb_queue_empty(&sk->sk_receive_queue) ||
+		    !llist_empty(&sk->defer_list)) {
+			llist_add(&skb->ll_node, &sk->defer_list);
+			return;
+		}
 	}
-	sk_eat_skb(sk, skb);
+	__kfree_skb(skb);
 }
 
 static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
@@ -2454,6 +2474,7 @@ static int tcp_recvmsg_locked(struct soc
 			release_sock(sk);
 			lock_sock(sk);
 		} else {
+			sk_defer_free_flush(sk);
 			sk_wait_data(sk, &timeo, last);
 		}
 
@@ -2572,6 +2593,7 @@ int tcp_recvmsg(struct sock *sk, struct
 	ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss,
 				 &cmsg_flags);
 	release_sock(sk);
+	sk_defer_free_flush(sk);
 
 	if (cmsg_flags && ret >= 0) {
 		if (cmsg_flags & TCP_CMSG_TS)
@@ -3116,7 +3138,7 @@ int tcp_disconnect(struct sock *sk, int
 		sk->sk_frag.page = NULL;
 		sk->sk_frag.offset = 0;
 	}
-
+	sk_defer_free_flush(sk);
 	sk_error_report(sk);
 	return 0;
 }
@@ -4251,6 +4273,7 @@ static int do_tcp_getsockopt(struct sock
 		err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
 							  &zc, &len, err);
 		release_sock(sk);
+		sk_defer_free_flush(sk);
 		if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
 			goto zerocopy_rcv_cmsg;
 		switch (len) {
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2121,6 +2121,7 @@ process:
 
 	sk_incoming_cpu_update(sk);
 
+	sk_defer_free_flush(sk);
 	bh_lock_sock_nested(sk);
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1758,6 +1758,7 @@ process:
 
 	sk_incoming_cpu_update(sk);
 
+	sk_defer_free_flush(sk);
 	bh_lock_sock_nested(sk);
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
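The second patch layers a lock-free deferral scheme on top: instead of freeing each consumed skb while the socket lock is held, tcp_eat_recv_skb() pushes it onto sk->defer_list (an llist, safe for concurrent producers), and sk_defer_free_flush() later detaches the whole list with one llist_del_all() and frees the entries outside the lock, possibly on the CPU running the BH handler. Below is a standalone sketch of that push/detach-all idiom using C11 atomics; it is an illustration under simplifying assumptions, not the kernel implementation, and all names (struct node, defer_free, defer_flush) are hypothetical.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;	/* plays the role of skb->ll_node */
	int id;
};

static _Atomic(struct node *) defer_list;	/* like sk->defer_list */

/* Producer side: lock-free push onto the list head, like llist_add(). */
static void defer_free(struct node *n)
{
	n->next = atomic_load(&defer_list);
	while (!atomic_compare_exchange_weak(&defer_list, &n->next, n))
		;	/* on failure, n->next was reloaded; retry */
}

/* Consumer side: detach everything in one step, like llist_del_all(). */
static void defer_flush(void)
{
	struct node *n = atomic_exchange(&defer_list, NULL);

	while (n) {
		struct node *next = n->next;

		printf("freeing node %d\n", n->id);
		free(n);	/* stands in for __kfree_skb() */
		n = next;
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));

		n->id = i;
		defer_free(n);
	}
	defer_flush();	/* analogous to sk_defer_free_flush() after release_sock() */
	return 0;
}

One design point visible in the patch itself: tcp_eat_recv_skb() only defers when the receive queue or defer_list is non-empty, so the last skb of a burst is freed inline and the list always reaches an eventual flush point.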