diff options
| author | David S. Miller <davem@davemloft.net> | 2023-10-01 13:20:36 +0100 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2023-10-01 13:20:36 +0100 |
| commit | b49a948568dcbb5f38cbf5356ea0fb9c9c6f6953 (patch) | |
| tree | b6a000dec2cee2f62ef3a94484e1c80e77171356 | |
| parent | 66ac08a7385fa8a5d3312ca96d1399670cfca0eb (diff) | |
| parent | 8f6c4ff9e0522da9313fbff5295ae208af679fed (diff) | |
Merge branch 'sch_fq-improvements'
Eric Dumazet says:
====================
net_sched: sch_fq: round of improvements
For FQ tenth anniversary, it was time for making it faster.
The FQ part (as in Fair Queue) is rather expensive, because
we have to classify packets and store them in a per-flow structure,
and add this per-flow structure in a hash table. Then the RR lists
also add cache line misses.
Most fq qdisc are almost idle. Trying to share NIC bandwidth has
no benefits, thus the qdisc could behave like a FIFO.
This series brings a 5 % throughput increase in intensive
tcp_rr workload, and 13 % increase for (unpaced) UDP packets.
v2: removed an extra label (build bot).
Fix an accidental increase of stat_internal_packets counter
in fast path.
Added "constify qdisc_priv()" patch to allow fq_fastpath_check()
first parameter to be const.
typo on 'eligible' (Willem)
====================
Reviewed-by: Jamal Hadi Salim <jhs@mojatatu.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
| -rw-r--r-- | include/net/pkt_sched.h | 8 | ||||
| -rw-r--r-- | include/uapi/linux/pkt_sched.h | 1 | ||||
| -rw-r--r-- | net/sched/sch_fq.c | 148 |
3 files changed, 110 insertions, 47 deletions
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 15960564e0c3..9fa1d0794dfa 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -20,10 +20,10 @@ struct qdisc_walker { int (*fn)(struct Qdisc *, unsigned long cl, struct qdisc_walker *); }; -static inline void *qdisc_priv(struct Qdisc *q) -{ - return &q->privdata; -} +#define qdisc_priv(q) \ + _Generic(q, \ + const struct Qdisc * : (const void *)&q->privdata, \ + struct Qdisc * : (void *)&q->privdata) static inline struct Qdisc *qdisc_from_priv(void *priv) { diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 3f85ae578056..579f641846b8 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -962,6 +962,7 @@ struct tc_fq_qd_stats { __u64 ce_mark; /* packets above ce_threshold */ __u64 horizon_drops; __u64 horizon_caps; + __u64 fastpath_packets; }; /* Heavy-Hitter Filter */ diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index f59a2cb2c803..681bbf34b707 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -2,7 +2,7 @@ /* * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing) * - * Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com> + * Copyright (C) 2013-2023 Eric Dumazet <edumazet@google.com> * * Meant to be mostly used for locally generated traffic : * Fast classification depends on skb->sk being set before reaching us. @@ -73,7 +73,13 @@ struct fq_flow { struct sk_buff *tail; /* last skb in the list */ unsigned long age; /* (jiffies | 1UL) when flow was emptied, for gc */ }; - struct rb_node fq_node; /* anchor in fq_root[] trees */ + union { + struct rb_node fq_node; /* anchor in fq_root[] trees */ + /* Following field is only used for q->internal, + * because q->internal is not hashed in fq_root[] + */ + u64 stat_fastpath_packets; + }; struct sock *sk; u32 socket_hash; /* sk_hash */ int qlen; /* number of packets in flow queue */ @@ -104,6 +110,9 @@ struct fq_sched_data { unsigned long unthrottle_latency_ns; struct fq_flow internal; /* for non classified or high prio packets */ + +/* Read mostly cache line */ + u32 quantum; u32 initial_quantum; u32 flow_refill_delay; @@ -117,22 +126,27 @@ struct fq_sched_data { u8 rate_enable; u8 fq_trees_log; u8 horizon_drop; + u32 timer_slack; /* hrtimer slack in ns */ + +/* Read/Write fields. */ + u32 flows; - u32 inactive_flows; + u32 inactive_flows; /* Flows with no packet to send. */ u32 throttled_flows; - u64 stat_gc_flows; - u64 stat_internal_packets; u64 stat_throttled; + struct qdisc_watchdog watchdog; + u64 stat_gc_flows; + +/* Seldom used fields. */ + + u64 stat_internal_packets; /* aka highprio */ u64 stat_ce_mark; u64 stat_horizon_drops; u64 stat_horizon_caps; u64 stat_flows_plimit; u64 stat_pkts_too_long; u64 stat_allocation_errors; - - u32 timer_slack; /* hrtimer slack in ns */ - struct qdisc_watchdog watchdog; }; /* @@ -258,17 +272,64 @@ static void fq_gc(struct fq_sched_data *q, kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree); } -static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) +/* Fast path can be used if : + * 1) Packet tstamp is in the past. + * 2) FQ qlen == 0 OR + * (no flow is currently eligible for transmit, + * AND fast path queue has less than 8 packets) + * 3) No SO_MAX_PACING_RATE on the socket (if any). + * 4) No @maxrate attribute on this qdisc, + * + * FQ can not use generic TCQ_F_CAN_BYPASS infrastructure. + */ +static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb) +{ + const struct fq_sched_data *q = qdisc_priv(sch); + const struct sock *sk; + + if (fq_skb_cb(skb)->time_to_send > q->ktime_cache) + return false; + + if (sch->q.qlen != 0) { + /* Even if some packets are stored in this qdisc, + * we can still enable fast path if all of them are + * scheduled in the future (ie no flows are eligible) + * or in the fast path queue. + */ + if (q->flows != q->inactive_flows + q->throttled_flows) + return false; + + /* Do not allow fast path queue to explode, we want Fair Queue mode + * under pressure. + */ + if (q->internal.qlen >= 8) + return false; + } + + sk = skb->sk; + if (sk && sk_fullsock(sk) && !sk_is_tcp(sk) && + sk->sk_max_pacing_rate != ~0UL) + return false; + + if (q->flow_max_rate != ~0UL) + return false; + + return true; +} + +static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb) { + struct fq_sched_data *q = qdisc_priv(sch); struct rb_node **p, *parent; struct sock *sk = skb->sk; struct rb_root *root; struct fq_flow *f; /* warning: no starvation prevention... */ - if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) + if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) { + q->stat_internal_packets++; /* highprio packet */ return &q->internal; - + } /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket * or a listener (SYNCOOKIE mode) * 1) request sockets are not full blown, @@ -299,11 +360,14 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) sk = (struct sock *)((hash << 1) | 1UL); } + if (fq_fastpath_check(sch, skb)) { + q->internal.stat_fastpath_packets++; + return &q->internal; + } + root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)]; - if (q->flows >= (2U << q->fq_trees_log) && - q->inactive_flows > q->flows/2) - fq_gc(q, root, sk); + fq_gc(q, root, sk); p = &root->rb_node; parent = NULL; @@ -396,7 +460,6 @@ static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow, { fq_erase_head(sch, flow, skb); skb_mark_not_on_list(skb); - flow->qlen--; qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; } @@ -448,49 +511,45 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(sch->q.qlen >= sch->limit)) return qdisc_drop(skb, sch, to_free); + q->ktime_cache = ktime_get_ns(); if (!skb->tstamp) { - fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns(); + fq_skb_cb(skb)->time_to_send = q->ktime_cache; } else { - /* Check if packet timestamp is too far in the future. - * Try first if our cached value, to avoid ktime_get_ns() - * cost in most cases. - */ + /* Check if packet timestamp is too far in the future. */ if (fq_packet_beyond_horizon(skb, q)) { - /* Refresh our cache and check another time */ - q->ktime_cache = ktime_get_ns(); - if (fq_packet_beyond_horizon(skb, q)) { - if (q->horizon_drop) { + if (q->horizon_drop) { q->stat_horizon_drops++; return qdisc_drop(skb, sch, to_free); - } - q->stat_horizon_caps++; - skb->tstamp = q->ktime_cache + q->horizon; } + q->stat_horizon_caps++; + skb->tstamp = q->ktime_cache + q->horizon; } fq_skb_cb(skb)->time_to_send = skb->tstamp; } - f = fq_classify(skb, q); - if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { - q->stat_flows_plimit++; - return qdisc_drop(skb, sch, to_free); - } + f = fq_classify(sch, skb); - f->qlen++; - qdisc_qstats_backlog_inc(sch, skb); - if (fq_flow_is_detached(f)) { - fq_flow_add_tail(&q->new_flows, f); - if (time_after(jiffies, f->age + q->flow_refill_delay)) - f->credit = max_t(u32, f->credit, q->quantum); - q->inactive_flows--; + if (f != &q->internal) { + if (unlikely(f->qlen >= q->flow_plimit)) { + q->stat_flows_plimit++; + return qdisc_drop(skb, sch, to_free); + } + + if (fq_flow_is_detached(f)) { + fq_flow_add_tail(&q->new_flows, f); + if (time_after(jiffies, f->age + q->flow_refill_delay)) + f->credit = max_t(u32, f->credit, q->quantum); + } + + if (f->qlen == 0) + q->inactive_flows--; } + f->qlen++; /* Note: this overwrites f->age */ flow_queue_add(f, skb); - if (unlikely(f == &q->internal)) { - q->stat_internal_packets++; - } + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; @@ -538,6 +597,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) skb = fq_peek(&q->internal); if (unlikely(skb)) { + q->internal.qlen--; fq_dequeue_skb(sch, &q->internal, skb); goto out; } @@ -581,6 +641,8 @@ begin: INET_ECN_set_ce(skb); q->stat_ce_mark++; } + if (--f->qlen == 0) + q->inactive_flows++; fq_dequeue_skb(sch, f, skb); } else { head->first = f->next; @@ -589,7 +651,6 @@ begin: fq_flow_add_tail(&q->old_flows, f); } else { fq_flow_set_detached(f); - q->inactive_flows++; } goto begin; } @@ -1014,6 +1075,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.gc_flows = q->stat_gc_flows; st.highprio_packets = q->stat_internal_packets; + st.fastpath_packets = q->internal.stat_fastpath_packets; st.tcp_retrans = 0; st.throttled = q->stat_throttled; st.flows_plimit = q->stat_flows_plimit; |
