From aa1039e73cc2cf834e99c09d2033d5d2675357b9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Jun 2010 08:23:14 +0000 Subject: inetpeer: RCU conversion inetpeer currently uses an AVL tree protected by an rwlock. It's possible to make most lookups use RCU 1) Add a struct rcu_head to struct inet_peer 2) add a lookup_rcu_bh() helper to perform lockless and opportunistic lookup. This is a normal function, not a macro like lookup(). 3) Add a limit to number of links followed by lookup_rcu_bh(). This is needed in case we fall in a loop. 4) add an smp_wmb() in link_to_pool() right before node insert. 5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take last reference to an inet_peer, since lockless readers could increase refcount, even while we hold peers.lock. 6) Delay struct inet_peer freeing after rcu grace period so that lookup_rcu_bh() cannot crash. 7) inet_getpeer() first attempts lockless lookup. Note this lookup can fail even if target is in AVL tree, but a concurrent writer can let tree in a non correct form. If this attemps fails, lock is taken a regular lookup is performed again. 8) convert peers.lock from rwlock to a spinlock 9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 -> 128 bytes) In a future patch, this is probably possible to revert this part, if rcu field is put in an union to share space with rid, ip_id_count, tcp_ts & tcp_ts_stamp. These fields being manipulated only with refcnt > 0. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inetpeer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net/inetpeer.h') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 87b1df0d4d8c..617404730422 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -26,6 +26,7 @@ struct inet_peer { atomic_t ip_id_count; /* IP ID for the next packet */ __u32 tcp_ts; __u32 tcp_ts_stamp; + struct rcu_head rcu; }; void inet_initpeers(void) __init; -- cgit v1.2.3 From 317fe0e6c5dc9448bcef41a2e31fecfd3dba7f55 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Jun 2010 04:52:13 +0000 Subject: inetpeer: restore small inet_peer structures Addition of rcu_head to struct inet_peer added 16bytes on 64bit arches. Thats a bit unfortunate, since old size was exactly 64 bytes. This can be solved, using an union between this rcu_head an four fields, that are normally used only when a refcount is taken on inet_peer. rcu_head is used only when refcnt=-1, right before structure freeing. Add a inet_peer_refcheck() function to check this assertion for a while. We can bring back SLAB_HWCACHE_ALIGN qualifier in kmem cache creation. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inetpeer.h | 31 ++++++++++++++++++++++++++----- net/ipv4/inetpeer.c | 4 ++-- net/ipv4/route.c | 1 + net/ipv4/tcp_ipv4.c | 11 +++++++---- 4 files changed, 36 insertions(+), 11 deletions(-) (limited to 'include/net/inetpeer.h') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 617404730422..417d0c894f29 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -22,11 +22,21 @@ struct inet_peer { __u32 dtime; /* the time of last use of not * referenced entries */ atomic_t refcnt; - atomic_t rid; /* Frag reception counter */ - atomic_t ip_id_count; /* IP ID for the next packet */ - __u32 tcp_ts; - __u32 tcp_ts_stamp; - struct rcu_head rcu; + /* + * Once inet_peer is queued for deletion (refcnt == -1), following fields + * are not available: rid, ip_id_count, tcp_ts, tcp_ts_stamp + * We can share memory with rcu_head to keep inet_peer small + * (less then 64 bytes) + */ + union { + struct { + atomic_t rid; /* Frag reception counter */ + atomic_t ip_id_count; /* IP ID for the next packet */ + __u32 tcp_ts; + __u32 tcp_ts_stamp; + }; + struct rcu_head rcu; + }; }; void inet_initpeers(void) __init; @@ -37,10 +47,21 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create); /* can be called from BH context or outside */ extern void inet_putpeer(struct inet_peer *p); +/* + * temporary check to make sure we dont access rid, ip_id_count, tcp_ts, + * tcp_ts_stamp if no refcount is taken on inet_peer + */ +static inline void inet_peer_refcheck(const struct inet_peer *p) +{ + WARN_ON_ONCE(atomic_read(&p->refcnt) <= 0); +} + + /* can be called with or without local BH being disabled */ static inline __u16 inet_getid(struct inet_peer *p, int more) { more++; + inet_peer_refcheck(p); return atomic_add_return(more, &p->ip_id_count) - more; } diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 349249fad2db..9ffa24b9a804 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -64,7 +64,7 @@ * usually under some other lock to prevent node disappearing * dtime: unused node list lock * v4daddr: unchangeable - * ip_id_count: idlock + * ip_id_count: atomic value (no lock needed) */ static struct kmem_cache *peer_cachep __read_mostly; @@ -129,7 +129,7 @@ void __init inet_initpeers(void) peer_cachep = kmem_cache_create("inet_peer_cache", sizeof(struct inet_peer), - 0, SLAB_PANIC, + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); /* All the timers, started at system startup tend diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a291edbbc97f..03430de46166 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2881,6 +2881,7 @@ static int rt_fill_info(struct net *net, error = rt->dst.error; expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; if (rt->peer) { + inet_peer_refcheck(rt->peer); id = atomic_read(&rt->peer->ip_id_count) & 0xffff; if (rt->peer->tcp_ts_stamp) { ts = rt->peer->tcp_ts; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7f9515c0379f..2e41e6f92968 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -204,10 +204,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * TIME-WAIT * and initialize rx_opt.ts_recent from it, * when trying new connection. */ - if (peer != NULL && - (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { - tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; - tp->rx_opt.ts_recent = peer->tcp_ts; + if (peer) { + inet_peer_refcheck(peer); + if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { + tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; + tp->rx_opt.ts_recent = peer->tcp_ts; + } } } @@ -1351,6 +1353,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && peer->v4daddr == saddr) { + inet_peer_refcheck(peer); if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { -- cgit v1.2.3