path: root/net/core/dev.c
Diffstat (limited to 'net/core/dev.c')
-rw-r--r--	net/core/dev.c	158
1 file changed, 125 insertions, 33 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 93a25d87b86b..a64cef2c537e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3768,8 +3768,14 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
features &= ~dev->gso_partial_features;
- /* Make sure to clear the IPv4 ID mangling feature if the
- * IPv4 header has the potential to be fragmented.
+ /* Make sure to clear the IPv4 ID mangling feature if the IPv4 header
+ * has the potential to be fragmented so that TSO does not generate
+ * segments with the same ID. For encapsulated packets, the ID mangling
+ * feature is guaranteed not to use the same ID for the outer IPv4
+ * headers of the generated segments if the headers have the potential
+ * to be fragmented, so there is no need to clear the IPv4 ID mangling
+ * feature (see the section about NETIF_F_TSO_MANGLEID in
+ * segmentation-offloads.rst).
*/
if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
struct iphdr *iph = skb->encapsulation ?
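The test the new comment refers to sits just below this hunk and is unchanged by the patch. In current mainline it boils down to clearing the ID-mangling feature whenever the selected IPv4 header can legally be fragmented, roughly (sketch, not part of this diff):

	/* An IPv4 header may be fragmented only when the Don't Fragment bit
	 * is clear; in that case TSO must not reuse IDs across segments.
	 */
	if (!(iph->frag_off & htons(IP_DF)))
		features &= ~NETIF_F_TSO_MANGLEID;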
@@ -3907,6 +3913,38 @@ sw_checksum:
}
EXPORT_SYMBOL(skb_csum_hwoffload_help);
+/* Checks if this SKB belongs to an HW offloaded socket
+ * and whether any SW fallbacks are required based on dev.
+ * Check decrypted mark in case skb_orphan() cleared socket.
+ */
+static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
+ struct net_device *dev)
+{
+#ifdef CONFIG_SOCK_VALIDATE_XMIT
+ struct sk_buff *(*sk_validate)(struct sock *sk, struct net_device *dev,
+ struct sk_buff *skb);
+ struct sock *sk = skb->sk;
+
+ sk_validate = NULL;
+ if (sk) {
+ if (sk_fullsock(sk))
+ sk_validate = sk->sk_validate_xmit_skb;
+ else if (sk_is_inet(sk) && sk->sk_state == TCP_TIME_WAIT)
+ sk_validate = inet_twsk(sk)->tw_validate_xmit_skb;
+ }
+
+ if (sk_validate) {
+ skb = sk_validate(sk, dev, skb);
+ } else if (unlikely(skb_is_decrypted(skb))) {
+ pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
+ kfree_skb(skb);
+ skb = NULL;
+ }
+#endif
+
+ return skb;
+}
+
static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb,
struct net_device *dev)
{
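sk_validate_xmit_skb() returns either a (possibly re-segmented) skb or NULL when the packet was consumed. Its caller, validate_xmit_skb() (outside this hunk), is expected to use it along these lines (hedged sketch):

	/* A NULL return means the helper or the socket callback already
	 * freed the skb, so the caller must not touch it again.
	 */
	skb = sk_validate_xmit_skb(skb, dev);
	if (unlikely(!skb))
		return NULL;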
@@ -4849,9 +4887,40 @@ static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table)
return hash_32(hash, flow_table->log);
}
+#ifdef CONFIG_RFS_ACCEL
+/**
+ * rps_flow_is_active - check whether the flow was recently active.
+ * @rflow: The flow whose activity is checked.
+ * @flow_table: per-queue flowtable that @rflow belongs to.
+ * @cpu: CPU saved in @rflow.
+ *
+ * If the CPU has processed many packets since the flow's last activity
+ * (beyond 10 times the table size), the flow is considered stale.
+ *
+ * Return: true if flow was recently active.
+ */
+static bool rps_flow_is_active(struct rps_dev_flow *rflow,
+ struct rps_dev_flow_table *flow_table,
+ unsigned int cpu)
+{
+ unsigned int flow_last_active;
+ unsigned int sd_input_head;
+
+ if (cpu >= nr_cpu_ids)
+ return false;
+
+ sd_input_head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head);
+ flow_last_active = READ_ONCE(rflow->last_qtail);
+
+ return (int)(sd_input_head - flow_last_active) <
+ (int)(10 << flow_table->log);
+}
+#endif
+
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
- struct rps_dev_flow *rflow, u16 next_cpu)
+ struct rps_dev_flow *rflow, u16 next_cpu, u32 hash,
+ u32 flow_id)
{
if (next_cpu < nr_cpu_ids) {
u32 head;
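The activity window in rps_flow_is_active() above is ten times the flow table size, and the signed comparison of unsigned counters keeps the test correct across wraparound of input_queue_head. A small worked example with illustrative values:

	/* log = 8 -> 256 slots, threshold = 10 << 8 = 2560 packets. */
	unsigned int head = 4000;	/* softnet_data.input_queue_head */
	unsigned int last = 2000;	/* rflow->last_qtail             */
	bool active = (int)(head - last) < (int)(10 << 8);	/* 2000 < 2560 -> true */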
@@ -4859,8 +4928,9 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct netdev_rx_queue *rxqueue;
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *old_rflow;
+ struct rps_dev_flow *tmp_rflow;
+ unsigned int tmp_cpu;
u16 rxq_index;
- u32 flow_id;
int rc;
/* Should we steer this flow to a different hardware queue? */
@@ -4875,14 +4945,29 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (!flow_table)
goto out;
- flow_id = rfs_slot(skb_get_hash(skb), flow_table);
+
+ tmp_rflow = &flow_table->flows[flow_id];
+ tmp_cpu = READ_ONCE(tmp_rflow->cpu);
+
+ if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) {
+ if (rps_flow_is_active(tmp_rflow, flow_table,
+ tmp_cpu)) {
+ if (hash != READ_ONCE(tmp_rflow->hash) ||
+ next_cpu == tmp_cpu)
+ goto out;
+ }
+ }
+
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
rxq_index, flow_id);
if (rc < 0)
goto out;
+
old_rflow = rflow;
- rflow = &flow_table->flows[flow_id];
+ rflow = tmp_rflow;
WRITE_ONCE(rflow->filter, rc);
+ WRITE_ONCE(rflow->hash, hash);
+
if (old_rflow->filter == rc)
WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);
out:
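The guard added above can be read as a three-way test before ndo_rx_flow_steer() is called. A hypothetical helper (not part of the patch) restating the same condition:

	/* Reprogramming the slot is allowed when it is free, when its current
	 * owner has gone idle, or when it already belongs to this flow and
	 * the flow is merely moving to a different CPU.
	 */
	static bool rfs_slot_may_reprogram(struct rps_dev_flow *slot,
					   struct rps_dev_flow_table *flow_table,
					   u32 hash, u16 next_cpu)
	{
		unsigned int cpu = READ_ONCE(slot->cpu);

		if (READ_ONCE(slot->filter) == RPS_NO_FILTER)
			return true;				/* free slot   */
		if (!rps_flow_is_active(slot, flow_table, cpu))
			return true;				/* stale owner */
		return hash == READ_ONCE(slot->hash) && next_cpu != cpu;
	}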
@@ -4908,6 +4993,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow_table *flow_table;
struct rps_map *map;
int cpu = -1;
+ u32 flow_id;
u32 tcpu;
u32 hash;
@@ -4954,7 +5040,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
/* OK, now we know there is a match,
* we can look at the local (per receive queue) flow table
*/
- rflow = &flow_table->flows[rfs_slot(hash, flow_table)];
+ flow_id = rfs_slot(hash, flow_table);
+ rflow = &flow_table->flows[flow_id];
tcpu = rflow->cpu;
/*
@@ -4973,7 +5060,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
rflow->last_qtail)) >= 0)) {
tcpu = next_cpu;
- rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
+ rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash,
+ flow_id);
}
if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
@@ -5017,17 +5105,16 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *rflow;
bool expire = true;
- unsigned int cpu;
rcu_read_lock();
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (flow_table && flow_id < (1UL << flow_table->log)) {
+ unsigned int cpu;
+
rflow = &flow_table->flows[flow_id];
cpu = READ_ONCE(rflow->cpu);
- if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids &&
- ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
- READ_ONCE(rflow->last_qtail)) <
- (int)(10 << flow_table->log)))
+ if (READ_ONCE(rflow->filter) == filter_id &&
+ rps_flow_is_active(rflow, flow_table, cpu))
expire = false;
}
rcu_read_unlock();
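rps_may_expire_flow() is the driver-facing half of this accounting: aRFS-capable drivers poll it from their filter-expiry work to decide whether a hardware steering rule may be removed. A hedged usage sketch (the rule bookkeeping and helper below are illustrative, not from any particular driver):

	/* Illustrative driver-side expiry check. */
	if (rps_may_expire_flow(netdev, rule->rxq_index, rule->flow_id,
				rule->filter_id)) {
		/* The stack no longer needs this flow: drop the HW rule. */
		remove_hw_steering_rule(netdev, rule);
	}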
@@ -5093,8 +5180,9 @@ static void napi_schedule_rps(struct softnet_data *sd)
__napi_schedule_irqoff(&mysd->backlog);
}
-void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
+void kick_defer_list_purge(unsigned int cpu)
{
+ struct softnet_data *sd = &per_cpu(softnet_data, cpu);
unsigned long flags;
if (use_backlog_threads()) {
@@ -5199,7 +5287,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
backlog_unlock_irq_restore(sd, &flags);
cpu_backlog_drop:
- atomic_inc(&sd->dropped);
+ numa_drop_add(&sd->drop_counters, 1);
bad_dev:
dev_core_stats_rx_dropped_inc(skb->dev);
kfree_skb_reason(skb, reason);
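The drop accounting switches from a single atomic counter in softnet_data to numa_drop_add() on sd->drop_counters. The patch does not show that helper; a generic sketch of the per-node-counter idea it presumably implements (types and names here are hypothetical, not the kernel's):

	/* Hypothetical per-NUMA-node drop counters: spreading the counter
	 * across nodes keeps concurrent droppers on different nodes off a
	 * single contended cache line.
	 */
	struct pernode_drop_counter {
		atomic_long_t	drops ____cacheline_aligned_in_smp;
	};

	static inline void pernode_drop_add(struct pernode_drop_counter *ctrs, int n)
	{
		atomic_long_add(n, &ctrs[numa_node_id()].drops);
	}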
@@ -6628,24 +6716,24 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
}
EXPORT_SYMBOL(napi_complete_done);
-static void skb_defer_free_flush(struct softnet_data *sd)
+static void skb_defer_free_flush(void)
{
+ struct llist_node *free_list;
struct sk_buff *skb, *next;
+ struct skb_defer_node *sdn;
+ int node;
- /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
- if (!READ_ONCE(sd->defer_list))
- return;
+ for_each_node(node) {
+ sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node;
- spin_lock(&sd->defer_lock);
- skb = sd->defer_list;
- sd->defer_list = NULL;
- sd->defer_count = 0;
- spin_unlock(&sd->defer_lock);
+ if (llist_empty(&sdn->defer_list))
+ continue;
+ atomic_long_set(&sdn->defer_count, 0);
+ free_list = llist_del_all(&sdn->defer_list);
- while (skb != NULL) {
- next = skb->next;
- napi_consume_skb(skb, 1);
- skb = next;
+ llist_for_each_entry_safe(skb, next, free_list, ll_node) {
+ napi_consume_skb(skb, 1);
+ }
}
}
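skb_defer_free_flush() now drains per-CPU, per-node lock-free lists instead of a spinlock-protected singly linked list. The matching producer, skb_attempt_defer_free(), is outside this diff; a hedged sketch of how it is expected to enqueue (the node indexing and flush trigger shown are plausible choices, not confirmed by this hunk):

	/* Queue onto the target CPU's array, using the caller's NUMA node as
	 * the index so remote producers on different nodes do not contend on
	 * one llist head.
	 */
	cpu = skb->alloc_cpu;
	sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id();
	atomic_long_inc(&sdn->defer_count);
	if (llist_add(&skb->ll_node, &sdn->defer_list))
		kick_defer_list_purge(cpu);	/* list went non-empty: wake that CPU */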
@@ -6773,7 +6861,7 @@ count:
if (work > 0)
__NET_ADD_STATS(dev_net(napi->dev),
LINUX_MIB_BUSYPOLLRXPACKETS, work);
- skb_defer_free_flush(this_cpu_ptr(&softnet_data));
+ skb_defer_free_flush();
bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
@@ -6965,7 +7053,7 @@ static void napi_stop_kthread(struct napi_struct *napi)
* the kthread.
*/
while (true) {
- if (!test_bit(NAPIF_STATE_SCHED_THREADED, &napi->state))
+ if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state))
break;
msleep(20);
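The one-token change above is a correctness fix, not a rename: test_bit() takes a bit index, while the NAPIF_STATE_* constants are masks derived from the NAPI_STATE_* indices. Abridged relationship from include/linux/netdevice.h:

	/*	enum { ..., NAPI_STATE_SCHED_THREADED, ... };		   bit index
	 *	enum { ..., NAPIF_STATE_SCHED_THREADED =
	 *			BIT(NAPI_STATE_SCHED_THREADED), ... };	   bit mask
	 *
	 * test_bit()/set_bit()/clear_bit() expect the index, so passing the
	 * NAPIF_STATE_* mask here would silently test the wrong bit.
	 */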
@@ -7632,7 +7720,7 @@ static void napi_threaded_poll_loop(struct napi_struct *napi)
local_irq_disable();
net_rps_action_and_irq_enable(sd);
}
- skb_defer_free_flush(sd);
+ skb_defer_free_flush();
bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
@@ -7674,7 +7762,7 @@ start:
for (;;) {
struct napi_struct *n;
- skb_defer_free_flush(sd);
+ skb_defer_free_flush();
if (list_empty(&list)) {
if (list_empty(&repoll)) {
@@ -12908,7 +12996,6 @@ static int __init net_dev_init(void)
sd->cpu = i;
#endif
INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
- spin_lock_init(&sd->defer_lock);
gro_init(&sd->backlog.gro);
sd->backlog.poll = process_backlog;
@@ -12918,6 +13005,11 @@ static int __init net_dev_init(void)
if (net_page_pool_create(i))
goto out;
}
+ net_hotdata.skb_defer_nodes =
+ __alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids,
+ __alignof__(struct skb_defer_node));
+ if (!net_hotdata.skb_defer_nodes)
+ goto out;
if (use_backlog_threads())
smpboot_register_percpu_thread(&backlog_threads);
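net_hotdata.skb_defer_nodes is a per-CPU allocation of nr_node_ids consecutive skb_defer_node entries, giving one slot per (CPU, node) pair. The addressing used earlier in skb_defer_free_flush() follows directly (sketch):

	/* Reaching slot (cpu, node): take the CPU's per-cpu base pointer,
	 * then offset by the NUMA node index.
	 */
	struct skb_defer_node *sdn;

	sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node;		/* this CPU   */
	sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + node;	/* remote CPU */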