Skip to content

Commit 28044fc

Browse files
joannekoong authored and kuba-moo committed
net: Add a bhash2 table hashed by port and address
The current bind hashtable (bhash) is hashed by port only. In the socket bind path, we have to check for bind conflicts by traversing the specified port's inet_bind_bucket while holding the hashbucket's spinlock (see inet_csk_get_port() and inet_csk_bind_conflict()). In instances where there are tons of sockets hashed to the same port at different addresses, the bind conflict check is time-intensive and can cause softirq CPU lockups, as well as stop new TCP connections, since __inet_inherit_port() also contends for the spinlock. This patch adds a second bind table, bhash2, that hashes by port and sk->sk_rcv_saddr (ipv4) or sk->sk_v6_rcv_saddr (ipv6). Searching the bhash2 table leads to significantly faster conflict resolution and less time holding the hashbucket spinlock. Please note a few things: * There can be the case where a socket's address changes after it has been bound. There are two cases where this happens: 1) The case where there is a bind() call on INADDR_ANY (ipv4) or IPV6_ADDR_ANY (ipv6) and then a connect() call. The kernel will assign the socket an address when it handles the connect(). 2) In inet_sk_reselect_saddr(), which is called when rebuilding the sk header and a few pre-conditions are met (e.g. rerouting fails). In these two cases, we need to update the bhash2 table by removing the entry for the old address, and adding a new entry reflecting the updated address. * The bhash2 table must have its own lock, even though concurrent accesses on the same port are protected by the bhash lock. Bhash2 must have its own lock to protect against cases where sockets on different ports hash to different bhash hashbuckets but to the same bhash2 hashbucket. This brings up a few stipulations: 1) When acquiring both the bhash and the bhash2 lock, the bhash2 lock will always be acquired after the bhash lock and released before the bhash lock is released. 2) There are no nested bhash2 hashbucket locks. 
A bhash2 lock is always acquired+released before another bhash2 lock is acquired+released. * The bhash table cannot be superseded by the bhash2 table because for bind requests on INADDR_ANY (ipv4) or IPV6_ADDR_ANY (ipv6), every socket bound to that port must be checked for a potential conflict. The bhash table is the only source of port->socket associations. Signed-off-by: Joanne Koong <[email protected]> Signed-off-by: Jakub Kicinski <[email protected]>
1 parent 0bf7325 commit 28044fc

12 files changed

+700
-94
lines changed

include/net/inet_connection_sock.h

+3
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#undef INET_CSK_CLEAR_TIMERS
2626

2727
struct inet_bind_bucket;
28+
struct inet_bind2_bucket;
2829
struct tcp_congestion_ops;
2930

3031
/*
@@ -57,6 +58,7 @@ struct inet_connection_sock_af_ops {
5758
*
5859
* @icsk_accept_queue: FIFO of established children
5960
* @icsk_bind_hash: Bind node
61+
* @icsk_bind2_hash: Bind node in the bhash2 table
6062
* @icsk_timeout: Timeout
6163
* @icsk_retransmit_timer: Resend (no ack)
6264
* @icsk_rto: Retransmit timeout
@@ -83,6 +85,7 @@ struct inet_connection_sock {
8385
struct inet_sock icsk_inet;
8486
struct request_sock_queue icsk_accept_queue;
8587
struct inet_bind_bucket *icsk_bind_hash;
88+
struct inet_bind2_bucket *icsk_bind2_hash;
8689
unsigned long icsk_timeout;
8790
struct timer_list icsk_retransmit_timer;
8891
struct timer_list icsk_delack_timer;

include/net/inet_hashtables.h

+78-2
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
#include <net/inet_connection_sock.h>
2525
#include <net/inet_sock.h>
26+
#include <net/ip.h>
2627
#include <net/sock.h>
2728
#include <net/route.h>
2829
#include <net/tcp_states.h>
@@ -90,7 +91,28 @@ struct inet_bind_bucket {
9091
struct hlist_head owners;
9192
};
9293

93-
static inline struct net *ib_net(struct inet_bind_bucket *ib)
94+
struct inet_bind2_bucket {
95+
possible_net_t ib_net;
96+
int l3mdev;
97+
unsigned short port;
98+
union {
99+
#if IS_ENABLED(CONFIG_IPV6)
100+
struct in6_addr v6_rcv_saddr;
101+
#endif
102+
__be32 rcv_saddr;
103+
};
104+
/* Node in the bhash2 inet_bind_hashbucket chain */
105+
struct hlist_node node;
106+
/* List of sockets hashed to this bucket */
107+
struct hlist_head owners;
108+
};
109+
110+
static inline struct net *ib_net(const struct inet_bind_bucket *ib)
111+
{
112+
return read_pnet(&ib->ib_net);
113+
}
114+
115+
static inline struct net *ib2_net(const struct inet_bind2_bucket *ib)
94116
{
95117
return read_pnet(&ib->ib_net);
96118
}
@@ -133,7 +155,14 @@ struct inet_hashinfo {
133155
* TCP hash as well as the others for fast bind/connect.
134156
*/
135157
struct kmem_cache *bind_bucket_cachep;
158+
/* This bind table is hashed by local port */
136159
struct inet_bind_hashbucket *bhash;
160+
struct kmem_cache *bind2_bucket_cachep;
161+
/* This bind table is hashed by local port and sk->sk_rcv_saddr (ipv4)
162+
* or sk->sk_v6_rcv_saddr (ipv6). This 2nd bind table is used
163+
* primarily for expediting bind conflict resolution.
164+
*/
165+
struct inet_bind_hashbucket *bhash2;
137166
unsigned int bhash_size;
138167

139168
/* The 2nd listener table hashed by local port and address */
@@ -182,14 +211,61 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
182211
void inet_bind_bucket_destroy(struct kmem_cache *cachep,
183212
struct inet_bind_bucket *tb);
184213

214+
bool inet_bind_bucket_match(const struct inet_bind_bucket *tb,
215+
const struct net *net, unsigned short port,
216+
int l3mdev);
217+
218+
struct inet_bind2_bucket *
219+
inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net,
220+
struct inet_bind_hashbucket *head,
221+
unsigned short port, int l3mdev,
222+
const struct sock *sk);
223+
224+
void inet_bind2_bucket_destroy(struct kmem_cache *cachep,
225+
struct inet_bind2_bucket *tb);
226+
227+
struct inet_bind2_bucket *
228+
inet_bind2_bucket_find(const struct inet_bind_hashbucket *head,
229+
const struct net *net,
230+
unsigned short port, int l3mdev,
231+
const struct sock *sk);
232+
233+
bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb,
234+
const struct net *net, unsigned short port,
235+
int l3mdev, const struct sock *sk);
236+
185237
static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
186238
const u32 bhash_size)
187239
{
188240
return (lport + net_hash_mix(net)) & (bhash_size - 1);
189241
}
190242

243+
static inline struct inet_bind_hashbucket *
244+
inet_bhashfn_portaddr(const struct inet_hashinfo *hinfo, const struct sock *sk,
245+
const struct net *net, unsigned short port)
246+
{
247+
u32 hash;
248+
249+
#if IS_ENABLED(CONFIG_IPV6)
250+
if (sk->sk_family == AF_INET6)
251+
hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port);
252+
else
253+
#endif
254+
hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port);
255+
return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
256+
}
257+
258+
struct inet_bind_hashbucket *
259+
inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port);
260+
261+
/* This should be called whenever a socket's sk_rcv_saddr (ipv4) or
262+
* sk_v6_rcv_saddr (ipv6) changes after it has been bound. The socket's
263+
* rcv_saddr field should already have been updated when this is called.
264+
*/
265+
int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk);
266+
191267
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
192-
const unsigned short snum);
268+
struct inet_bind2_bucket *tb2, unsigned short port);
193269

194270
/* Caller must disable local BH processing. */
195271
int __inet_inherit_port(const struct sock *sk, struct sock *child);

include/net/sock.h

+14
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,7 @@ struct sk_filter;
348348
* @sk_txtime_report_errors: set report errors mode for SO_TXTIME
349349
* @sk_txtime_unused: unused txtime flags
350350
* @ns_tracker: tracker for netns reference
351+
* @sk_bind2_node: bind node in the bhash2 table
351352
*/
352353
struct sock {
353354
/*
@@ -537,6 +538,7 @@ struct sock {
537538
#endif
538539
struct rcu_head sk_rcu;
539540
netns_tracker ns_tracker;
541+
struct hlist_node sk_bind2_node;
540542
};
541543

542544
enum sk_pacing {
@@ -870,6 +872,16 @@ static inline void sk_add_bind_node(struct sock *sk,
870872
hlist_add_head(&sk->sk_bind_node, list);
871873
}
872874

875+
static inline void __sk_del_bind2_node(struct sock *sk)
876+
{
877+
__hlist_del(&sk->sk_bind2_node);
878+
}
879+
880+
static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
881+
{
882+
hlist_add_head(&sk->sk_bind2_node, list);
883+
}
884+
873885
#define sk_for_each(__sk, list) \
874886
hlist_for_each_entry(__sk, list, sk_node)
875887
#define sk_for_each_rcu(__sk, list) \
@@ -887,6 +899,8 @@ static inline void sk_add_bind_node(struct sock *sk,
887899
hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
888900
#define sk_for_each_bound(__sk, list) \
889901
hlist_for_each_entry(__sk, list, sk_bind_node)
902+
#define sk_for_each_bound_bhash2(__sk, list) \
903+
hlist_for_each_entry(__sk, list, sk_bind2_node)
890904

891905
/**
892906
* sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset

net/dccp/ipv4.c

+23-2
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,11 @@ static unsigned int dccp_v4_pernet_id __read_mostly;
4545
int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
4646
{
4747
const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
48+
struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
49+
__be32 daddr, nexthop, prev_sk_rcv_saddr;
4850
struct inet_sock *inet = inet_sk(sk);
4951
struct dccp_sock *dp = dccp_sk(sk);
5052
__be16 orig_sport, orig_dport;
51-
__be32 daddr, nexthop;
5253
struct flowi4 *fl4;
5354
struct rtable *rt;
5455
int err;
@@ -89,9 +90,29 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
8990
if (inet_opt == NULL || !inet_opt->opt.srr)
9091
daddr = fl4->daddr;
9192

92-
if (inet->inet_saddr == 0)
93+
if (inet->inet_saddr == 0) {
94+
if (inet_csk(sk)->icsk_bind2_hash) {
95+
prev_addr_hashbucket =
96+
inet_bhashfn_portaddr(&dccp_hashinfo, sk,
97+
sock_net(sk),
98+
inet->inet_num);
99+
prev_sk_rcv_saddr = sk->sk_rcv_saddr;
100+
}
93101
inet->inet_saddr = fl4->saddr;
102+
}
103+
94104
sk_rcv_saddr_set(sk, inet->inet_saddr);
105+
106+
if (prev_addr_hashbucket) {
107+
err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
108+
if (err) {
109+
inet->inet_saddr = 0;
110+
sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
111+
ip_rt_put(rt);
112+
return err;
113+
}
114+
}
115+
95116
inet->inet_dport = usin->sin_port;
96117
sk_daddr_set(sk, daddr);
97118

net/dccp/ipv6.c

+18
Original file line numberDiff line numberDiff line change
@@ -934,8 +934,26 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
934934
}
935935

936936
if (saddr == NULL) {
937+
struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
938+
struct in6_addr prev_v6_rcv_saddr;
939+
940+
if (icsk->icsk_bind2_hash) {
941+
prev_addr_hashbucket = inet_bhashfn_portaddr(&dccp_hashinfo,
942+
sk, sock_net(sk),
943+
inet->inet_num);
944+
prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
945+
}
946+
937947
saddr = &fl6.saddr;
938948
sk->sk_v6_rcv_saddr = *saddr;
949+
950+
if (prev_addr_hashbucket) {
951+
err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
952+
if (err) {
953+
sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr;
954+
goto failure;
955+
}
956+
}
939957
}
940958

941959
/* set the source address */

net/dccp/proto.c

+29-5
Original file line numberDiff line numberDiff line change
@@ -1120,6 +1120,12 @@ static int __init dccp_init(void)
11201120
SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
11211121
if (!dccp_hashinfo.bind_bucket_cachep)
11221122
goto out_free_hashinfo2;
1123+
dccp_hashinfo.bind2_bucket_cachep =
1124+
kmem_cache_create("dccp_bind2_bucket",
1125+
sizeof(struct inet_bind2_bucket), 0,
1126+
SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
1127+
if (!dccp_hashinfo.bind2_bucket_cachep)
1128+
goto out_free_bind_bucket_cachep;
11231129

11241130
/*
11251131
* Size and allocate the main established and bind bucket
@@ -1150,7 +1156,7 @@ static int __init dccp_init(void)
11501156

11511157
if (!dccp_hashinfo.ehash) {
11521158
DCCP_CRIT("Failed to allocate DCCP established hash table");
1153-
goto out_free_bind_bucket_cachep;
1159+
goto out_free_bind2_bucket_cachep;
11541160
}
11551161

11561162
for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
@@ -1176,14 +1182,24 @@ static int __init dccp_init(void)
11761182
goto out_free_dccp_locks;
11771183
}
11781184

1185+
dccp_hashinfo.bhash2 = (struct inet_bind_hashbucket *)
1186+
__get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order);
1187+
1188+
if (!dccp_hashinfo.bhash2) {
1189+
DCCP_CRIT("Failed to allocate DCCP bind2 hash table");
1190+
goto out_free_dccp_bhash;
1191+
}
1192+
11791193
for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
11801194
spin_lock_init(&dccp_hashinfo.bhash[i].lock);
11811195
INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
1196+
spin_lock_init(&dccp_hashinfo.bhash2[i].lock);
1197+
INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain);
11821198
}
11831199

11841200
rc = dccp_mib_init();
11851201
if (rc)
1186-
goto out_free_dccp_bhash;
1202+
goto out_free_dccp_bhash2;
11871203

11881204
rc = dccp_ackvec_init();
11891205
if (rc)
@@ -1207,30 +1223,38 @@ static int __init dccp_init(void)
12071223
dccp_ackvec_exit();
12081224
out_free_dccp_mib:
12091225
dccp_mib_exit();
1226+
out_free_dccp_bhash2:
1227+
free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
12101228
out_free_dccp_bhash:
12111229
free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
12121230
out_free_dccp_locks:
12131231
inet_ehash_locks_free(&dccp_hashinfo);
12141232
out_free_dccp_ehash:
12151233
free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
1234+
out_free_bind2_bucket_cachep:
1235+
kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep);
12161236
out_free_bind_bucket_cachep:
12171237
kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
12181238
out_free_hashinfo2:
12191239
inet_hashinfo2_free_mod(&dccp_hashinfo);
12201240
out_fail:
12211241
dccp_hashinfo.bhash = NULL;
1242+
dccp_hashinfo.bhash2 = NULL;
12221243
dccp_hashinfo.ehash = NULL;
12231244
dccp_hashinfo.bind_bucket_cachep = NULL;
1245+
dccp_hashinfo.bind2_bucket_cachep = NULL;
12241246
return rc;
12251247
}
12261248

12271249
static void __exit dccp_fini(void)
12281250
{
1251+
int bhash_order = get_order(dccp_hashinfo.bhash_size *
1252+
sizeof(struct inet_bind_hashbucket));
1253+
12291254
ccid_cleanup_builtins();
12301255
dccp_mib_exit();
1231-
free_pages((unsigned long)dccp_hashinfo.bhash,
1232-
get_order(dccp_hashinfo.bhash_size *
1233-
sizeof(struct inet_bind_hashbucket)));
1256+
free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
1257+
free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
12341258
free_pages((unsigned long)dccp_hashinfo.ehash,
12351259
get_order((dccp_hashinfo.ehash_mask + 1) *
12361260
sizeof(struct inet_ehash_bucket)));

net/ipv4/af_inet.c

+21-5
Original file line numberDiff line numberDiff line change
@@ -1219,13 +1219,15 @@ EXPORT_SYMBOL(inet_unregister_protosw);
12191219

12201220
static int inet_sk_reselect_saddr(struct sock *sk)
12211221
{
1222+
struct inet_bind_hashbucket *prev_addr_hashbucket;
12221223
struct inet_sock *inet = inet_sk(sk);
12231224
__be32 old_saddr = inet->inet_saddr;
12241225
__be32 daddr = inet->inet_daddr;
12251226
struct flowi4 *fl4;
12261227
struct rtable *rt;
12271228
__be32 new_saddr;
12281229
struct ip_options_rcu *inet_opt;
1230+
int err;
12291231

12301232
inet_opt = rcu_dereference_protected(inet->inet_opt,
12311233
lockdep_sock_is_held(sk));
@@ -1240,20 +1242,34 @@ static int inet_sk_reselect_saddr(struct sock *sk)
12401242
if (IS_ERR(rt))
12411243
return PTR_ERR(rt);
12421244

1243-
sk_setup_caps(sk, &rt->dst);
1244-
12451245
new_saddr = fl4->saddr;
12461246

1247-
if (new_saddr == old_saddr)
1247+
if (new_saddr == old_saddr) {
1248+
sk_setup_caps(sk, &rt->dst);
12481249
return 0;
1250+
}
1251+
1252+
prev_addr_hashbucket =
1253+
inet_bhashfn_portaddr(sk->sk_prot->h.hashinfo, sk,
1254+
sock_net(sk), inet->inet_num);
1255+
1256+
inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
1257+
1258+
err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
1259+
if (err) {
1260+
inet->inet_saddr = old_saddr;
1261+
inet->inet_rcv_saddr = old_saddr;
1262+
ip_rt_put(rt);
1263+
return err;
1264+
}
1265+
1266+
sk_setup_caps(sk, &rt->dst);
12491267

12501268
if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) {
12511269
pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
12521270
__func__, &old_saddr, &new_saddr);
12531271
}
12541272

1255-
inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
1256-
12571273
/*
12581274
* XXX The only one ugly spot where we need to
12591275
* XXX really change the sockets identity after

0 commit comments

Comments
 (0)