From 1ddf9916ac09313128e40d6581cef889c0b4ce84 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 23 Oct 2024 12:53:42 +0200 Subject: xfrm: Add support for per cpu xfrm state handling. Currently all flows for a certain SA must be processed by the same cpu to avoid packet reordering and lock contention of the xfrm state lock. To get rid of this limitation, the IETF standardized per cpu SAs in RFC 9611. This patch implements the xfrm part of it. We add the cpu as a lookup key for xfrm states and a config option to generate acquire messages for each cpu. With that, we can have on each cpu a SA with identical traffic selector so that flows can be processed in parallel on all cpus. Signed-off-by: Steffen Klassert Tested-by: Antony Antony Tested-by: Tobias Brunner --- include/net/xfrm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index a0bdd58f401c..f5275618e744 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -188,6 +188,7 @@ struct xfrm_state { refcount_t refcnt; spinlock_t lock; + u32 pcpu_num; struct xfrm_id id; struct xfrm_selector sel; struct xfrm_mark mark; @@ -1684,7 +1685,7 @@ struct xfrmk_spdinfo { u32 spdhmcnt; }; -struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); +struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); int xfrm_state_delete(struct xfrm_state *x); int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); @@ -1796,7 +1797,7 @@ int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack); int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi, struct netlink_ext_ack *extack); struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, - u8 mode, u32 reqid, u32 if_id, u8 proto, + u8 mode, u32 reqid, u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family); -- cgit v1.2.3 From 0045e3d80613cc7174dc15f189ee6fc4e73b9365 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 23 Oct 2024 12:53:43 +0200 Subject: xfrm: Cache used outbound xfrm states at the policy. Now that we can have percpu xfrm states, the number of active states might increase. To get a better lookup performance, we cache the used xfrm states at the policy for outbound IPsec traffic. Signed-off-by: Steffen Klassert Tested-by: Antony Antony Tested-by: Tobias Brunner --- include/net/xfrm.h | 4 ++++ net/xfrm/xfrm_policy.c | 12 +++++++++++ net/xfrm/xfrm_state.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index f5275618e744..0b394c5fb5f3 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -184,6 +184,7 @@ struct xfrm_state { }; struct hlist_node byspi; struct hlist_node byseq; + struct hlist_node state_cache; refcount_t refcnt; spinlock_t lock; @@ -537,6 +538,7 @@ struct xfrm_policy_queue { * @xp_net: network namespace the policy lives in * @bydst: hlist node for SPD hash table or rbtree list * @byidx: hlist node for index hash table + * @state_cache_list: hlist head for policy cached xfrm states * @lock: serialize changes to policy structure members * @refcnt: reference count, freed once it reaches 0 * @pos: kernel internal tie-breaker to determine age of policy @@ -567,6 +569,8 @@ struct xfrm_policy { struct hlist_node bydst; struct hlist_node byidx; + struct hlist_head state_cache_list; + /* This lock only affects elements except for entry. */ rwlock_t lock; refcount_t refcnt; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index a2ea9dbac90b..8a1b83191a6c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -434,6 +434,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) if (policy) { write_pnet(&policy->xp_net, net); INIT_LIST_HEAD(&policy->walk.all); + INIT_HLIST_HEAD(&policy->state_cache_list); INIT_HLIST_NODE(&policy->bydst); INIT_HLIST_NODE(&policy->byidx); rwlock_init(&policy->lock); @@ -475,6 +476,9 @@ EXPORT_SYMBOL(xfrm_policy_destroy); static void xfrm_policy_kill(struct xfrm_policy *policy) { + struct net *net = xp_net(policy); + struct xfrm_state *x; + xfrm_dev_policy_delete(policy); write_lock_bh(&policy->lock); @@ -490,6 +494,13 @@ static void xfrm_policy_kill(struct xfrm_policy *policy) if (del_timer(&policy->timer)) xfrm_pol_put(policy); + /* XXX: Flush state cache */ + spin_lock_bh(&net->xfrm.xfrm_state_lock); + hlist_for_each_entry_rcu(x, &policy->state_cache_list, state_cache) { + hlist_del_init_rcu(&x->state_cache); + } + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + xfrm_pol_put(policy); } @@ -3275,6 +3286,7 @@ no_transform: dst_release(dst); dst = dst_orig; } + ok: xfrm_pols_put(pols, drop_pols); if (dst && dst->xfrm && diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index ebef07b80afa..a2047825f6c8 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -665,6 +665,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) refcount_set(&x->refcnt, 1); atomic_set(&x->tunnel_users, 0); INIT_LIST_HEAD(&x->km.all); + INIT_HLIST_NODE(&x->state_cache); INIT_HLIST_NODE(&x->bydst); INIT_HLIST_NODE(&x->bysrc); INIT_HLIST_NODE(&x->byspi); @@ -744,12 +745,15 @@ int __xfrm_state_delete(struct xfrm_state *x) if (x->km.state != XFRM_STATE_DEAD) { x->km.state = XFRM_STATE_DEAD; + spin_lock(&net->xfrm.xfrm_state_lock); list_del(&x->km.all); hlist_del_rcu(&x->bydst); hlist_del_rcu(&x->bysrc); if (x->km.seq) hlist_del_rcu(&x->byseq); + if (!hlist_unhashed(&x->state_cache)) + hlist_del_rcu(&x->state_cache); if (x->id.spi) hlist_del_rcu(&x->byspi); net->xfrm.state_num--; @@ -1222,6 +1226,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned int sequence; struct km_event c; unsigned int pcpu_id; + bool cached = false; /* We need the cpu id just as a lookup key, * we don't require it to be stable. @@ -1234,6 +1239,46 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, sequence = read_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); rcu_read_lock(); + hlist_for_each_entry_rcu(x, &pol->state_cache_list, state_cache) { + if (x->props.family == encap_family && + x->props.reqid == tmpl->reqid && + (mark & x->mark.m) == x->mark.v && + x->if_id == if_id && + !(x->props.flags & XFRM_STATE_WILDRECV) && + xfrm_state_addr_check(x, daddr, saddr, encap_family) && + tmpl->mode == x->props.mode && + tmpl->id.proto == x->id.proto && + (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) + xfrm_state_look_at(pol, x, fl, encap_family, + &best, &acquire_in_progress, &error); + } + + if (best) + goto cached; + + hlist_for_each_entry_rcu(x, &pol->state_cache_list, state_cache) { + if (x->props.family == encap_family && + x->props.reqid == tmpl->reqid && + (mark & x->mark.m) == x->mark.v && + x->if_id == if_id && + !(x->props.flags & XFRM_STATE_WILDRECV) && + xfrm_addr_equal(&x->id.daddr, daddr, encap_family) && + tmpl->mode == x->props.mode && + tmpl->id.proto == x->id.proto && + (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) + xfrm_state_look_at(pol, x, fl, family, + &best, &acquire_in_progress, &error); + } + +cached: + cached = true; + if (best) + goto found; + else if (error) + best = NULL; + else if (acquire_in_progress) /* XXX: acquire_in_progress should not happen */ + WARN_ON(1); + h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) { #ifdef CONFIG_XFRM_OFFLOAD @@ -1383,6 +1428,7 @@ found: XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h, x->xso.type); + INIT_HLIST_NODE(&x->state_cache); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family); XFRM_STATE_INSERT(byspi, &x->byspi, @@ -1431,6 +1477,15 @@ out: } else { *err = acquire_in_progress ? -EAGAIN : error; } + + if (x && x->km.state == XFRM_STATE_VALID && !cached && + (!(pol->flags & XFRM_POLICY_CPU_ACQUIRE) || x->pcpu_num == pcpu_id)) { + spin_lock_bh(&net->xfrm.xfrm_state_lock); + if (hlist_unhashed(&x->state_cache)) + hlist_add_head_rcu(&x->state_cache, &pol->state_cache_list); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + } + rcu_read_unlock(); if (to_put) xfrm_state_put(to_put); -- cgit v1.2.3 From 81a331a0e72ddc2f75092603d9577bd1a0ca23ad Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 23 Oct 2024 12:53:44 +0200 Subject: xfrm: Add an inbound percpu state cache. Now that we can have percpu xfrm states, the number of active states might increase. To get a better lookup performance, we add a percpu cache to cache the used inbound xfrm states. Signed-off-by: Steffen Klassert Tested-by: Antony Antony Tested-by: Tobias Brunner --- include/net/netns/xfrm.h | 1 + include/net/xfrm.h | 5 +++++ net/ipv4/esp4_offload.c | 6 ++--- net/ipv6/esp6_offload.c | 6 ++--- net/xfrm/xfrm_input.c | 2 +- net/xfrm/xfrm_state.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 70 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index ae60d6664095..23dd647fe024 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -43,6 +43,7 @@ struct netns_xfrm { struct hlist_head __rcu *state_bysrc; struct hlist_head __rcu *state_byspi; struct hlist_head __rcu *state_byseq; + struct hlist_head __percpu *state_cache_input; unsigned int state_hmask; unsigned int state_num; struct work_struct state_hash_work; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 0b394c5fb5f3..2b87999bd5aa 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -185,6 +185,7 @@ struct xfrm_state { struct hlist_node byspi; struct hlist_node byseq; struct hlist_node state_cache; + struct hlist_node state_cache_input; refcount_t refcnt; spinlock_t lock; @@ -1650,6 +1651,10 @@ int xfrm_state_update(struct xfrm_state *x); struct xfrm_state *xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family); +struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, + const xfrm_address_t *daddr, + __be32 spi, u8 proto, + unsigned short family); struct xfrm_state *xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 80c4ea0e12f4..e0d94270da28 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -53,9 +53,9 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, if (sp->len == XFRM_MAX_DEPTH) goto out_reset; - x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, - (xfrm_address_t *)&ip_hdr(skb)->daddr, - spi, IPPROTO_ESP, AF_INET); + x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark, + (xfrm_address_t *)&ip_hdr(skb)->daddr, + spi, IPPROTO_ESP, AF_INET); if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) { /* non-offload path will record the error and audit log */ diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 919ebfabbe4e..7b41fb4f00b5 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -80,9 +80,9 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, if (sp->len == XFRM_MAX_DEPTH) goto out_reset; - x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, - (xfrm_address_t *)&ipv6_hdr(skb)->daddr, - spi, IPPROTO_ESP, AF_INET6); + x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark, + (xfrm_address_t *)&ipv6_hdr(skb)->daddr, + spi, IPPROTO_ESP, AF_INET6); if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) { /* non-offload path will record the error and audit log */ diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 749e7eea99e4..841a60a6fbfe 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -572,7 +572,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop; } - x = xfrm_state_lookup(net, mark, daddr, spi, nexthdr, family); + x = xfrm_input_state_lookup(net, mark, daddr, spi, nexthdr, family); if (x == NULL) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index a2047825f6c8..e3266a5d4f90 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -754,6 +754,9 @@ int __xfrm_state_delete(struct xfrm_state *x) hlist_del_rcu(&x->byseq); if (!hlist_unhashed(&x->state_cache)) hlist_del_rcu(&x->state_cache); + if (!hlist_unhashed(&x->state_cache_input)) + hlist_del_rcu(&x->state_cache_input); + if (x->id.spi) hlist_del_rcu(&x->byspi); net->xfrm.state_num--; @@ -1106,6 +1109,52 @@ static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, return NULL; } +struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, + const xfrm_address_t *daddr, + __be32 spi, u8 proto, + unsigned short family) +{ + struct hlist_head *state_cache_input; + struct xfrm_state *x = NULL; + int cpu = get_cpu(); + + state_cache_input = per_cpu_ptr(net->xfrm.state_cache_input, cpu); + + rcu_read_lock(); + hlist_for_each_entry_rcu(x, state_cache_input, state_cache_input) { + if (x->props.family != family || + x->id.spi != spi || + x->id.proto != proto || + !xfrm_addr_equal(&x->id.daddr, daddr, family)) + continue; + + if ((mark & x->mark.m) != x->mark.v) + continue; + if (!xfrm_state_hold_rcu(x)) + continue; + goto out; + } + + x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family); + + if (x && x->km.state == XFRM_STATE_VALID) { + spin_lock_bh(&net->xfrm.xfrm_state_lock); + if (hlist_unhashed(&x->state_cache_input)) { + hlist_add_head_rcu(&x->state_cache_input, state_cache_input); + } else { + hlist_del_rcu(&x->state_cache_input); + hlist_add_head_rcu(&x->state_cache_input, state_cache_input); + } + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + } + +out: + rcu_read_unlock(); + put_cpu(); + return x; +} +EXPORT_SYMBOL(xfrm_input_state_lookup); + static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, @@ -3079,6 +3128,11 @@ int __net_init xfrm_state_init(struct net *net) net->xfrm.state_byseq = xfrm_hash_alloc(sz); if (!net->xfrm.state_byseq) goto out_byseq; + + net->xfrm.state_cache_input = alloc_percpu(struct hlist_head); + if (!net->xfrm.state_cache_input) + goto out_state_cache_input; + net->xfrm.state_hmask = ((sz / sizeof(struct hlist_head)) - 1); net->xfrm.state_num = 0; @@ -3088,6 +3142,8 @@ int __net_init xfrm_state_init(struct net *net) &net->xfrm.xfrm_state_lock); return 0; +out_state_cache_input: + xfrm_hash_free(net->xfrm.state_byseq, sz); out_byseq: xfrm_hash_free(net->xfrm.state_byspi, sz); out_byspi: @@ -3117,6 +3173,7 @@ void xfrm_state_fini(struct net *net) xfrm_hash_free(net->xfrm.state_bysrc, sz); WARN_ON(!hlist_empty(net->xfrm.state_bydst)); xfrm_hash_free(net->xfrm.state_bydst, sz); + free_percpu(net->xfrm.state_cache_input); } #ifdef CONFIG_AUDITSYSCALL -- cgit v1.2.3 From e57dfaa4b0a72f6a231a8eedb95d260045bbd8db Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 31 Oct 2024 16:52:57 +0100 Subject: xfrm: Convert struct xfrm_dst_lookup_params -> tos to dscp_t. Add type annotation to the "tos" field of struct xfrm_dst_lookup_params, to ensure that the ECN bits aren't mistakenly taken into account when doing route lookups. Rename that field (tos -> dscp) to make that change explicit. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 ++- net/ipv4/xfrm4_policy.c | 3 ++- net/xfrm/xfrm_policy.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2b87999bd5aa..32c09e85a64c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -354,7 +355,7 @@ void xfrm_if_unregister_cb(void); struct xfrm_dst_lookup_params { struct net *net; - int tos; + dscp_t dscp; int oif; xfrm_address_t *saddr; xfrm_address_t *daddr; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 7e1c2faed1ff..7fb6205619e7 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -24,7 +25,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct flowi4 *fl4, memset(fl4, 0, sizeof(*fl4)); fl4->daddr = params->daddr->a4; - fl4->flowi4_tos = params->tos; + fl4->flowi4_tos = inet_dscp_to_dsfield(params->dscp); fl4->flowi4_l3mdev = l3mdev_master_ifindex_by_index(params->net, params->oif); fl4->flowi4_mark = params->mark; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7e3e10fb9ca0..4408c11c0835 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -312,7 +312,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, params.net = net; params.saddr = saddr; params.daddr = daddr; - params.tos = inet_dscp_to_dsfield(dscp); + params.dscp = dscp; params.oif = oif; params.mark = mark; params.ipproto = x->id.proto; -- cgit v1.2.3