From 0302d2fd6efb0c386e521df0134eb2679a9a138f Mon Sep 17 00:00:00 2001 From: Thomas Hellström Date: Wed, 27 Nov 2024 09:54:30 +0100 Subject: locking/ww_mutex: Fix ww_mutex dummy lockdep map selftest warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The below commit introduces a dummy lockdep map, but didn't get the initialization quite right (it should mimic the initialization of the real ww_mutex lockdep maps). It also introduced a separate locking api selftest failure. Fix these. Closes: https://lore.kernel.org/lkml/Zw19sMtnKdyOVQoh@boqun-archlinux/ Fixes: 823a566221a5 ("locking/ww_mutex: Adjust to lockdep nest_lock requirements") Reported-by: Boqun Feng Suggested-by: Boqun Feng Signed-off-by: Thomas Hellström Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241127085430.3045-1-thomas.hellstrom@linux.intel.com --- include/linux/ww_mutex.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index a401a2f31a77..45ff6f7a872b 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -156,8 +156,8 @@ static inline void ww_acquire_init(struct ww_acquire_ctx *ctx, debug_check_no_locks_freed((void *)ctx, sizeof(*ctx)); lockdep_init_map(&ctx->dep_map, ww_class->acquire_name, &ww_class->acquire_key, 0); - lockdep_init_map(&ctx->first_lock_dep_map, ww_class->mutex_name, - &ww_class->mutex_key, 0); + lockdep_init_map_wait(&ctx->first_lock_dep_map, ww_class->mutex_name, + &ww_class->mutex_key, 0, LD_WAIT_SLEEP); mutex_acquire(&ctx->dep_map, 0, 0, _RET_IP_); mutex_acquire_nest(&ctx->first_lock_dep_map, 0, 0, &ctx->dep_map, _RET_IP_); #endif -- cgit v1.2.3 From b4d83c8323b0c4a899a996fed919cfe10720d289 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 6 Dec 2024 10:19:31 +0100 Subject: headers/cleanup.h: Remove the if_not_guard() facility Linus noticed that the new if_not_guard() definition is fragile: "This macro generates actively wrong code if it happens to be inside an if-statement or a loop without a block. IOW, code like this: for (iterate-over-something) if_not_guard(a) return -BUSY; looks like will build fine, but will generate completely incorrect code." The reason is that the __if_not_guard() macro is multi-statement, so while most kernel developers expect macros to be simple or at least compound statements - but for __if_not_guard() it is not so: #define __if_not_guard(_name, _id, args...) \ BUILD_BUG_ON(!__is_cond_ptr(_name)); \ CLASS(_name, _id)(args); \ if (!__guard_ptr(_name)(&_id)) To add insult to injury, the placement of the BUILD_BUG_ON() line makes the macro appear to compile fine, but it will generate incorrect code as Linus reported, for example if used within iteration or conditional statements that will use the first statement of a macro as a loop body or conditional statement body. [ I'd also like to note that the original submission by David Lechner did not contain the BUILD_BUG_ON() line, so it was safer than what we ended up committing. Mea culpa. ] It doesn't appear to be possible to turn this macro into a robust single or compound statement that could be used in single statements, due to the necessity to define an auto scope variable with an open scope and the necessity of it having to expand to a partial 'if' statement with no body. Instead of trying to work around this fragility, just remove the construct before it gets used. 
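For illustration only (not part of the patch; the loop, variable and guard class names are hypothetical), here is a sketch of the failure mode Linus describes, i.e. what the multi-statement macro does when used as the sole statement of an unbraced loop:

    for (i = 0; i < n; i++)
            if_not_guard(mutex_try, &locks[i])
                    return -EBUSY;

After preprocessing, only the first statement of the macro becomes the loop body; the guard is constructed and tested exactly once, after the loop has already finished, and with the post-loop value of 'i':

    for (i = 0; i < n; i++)
            BUILD_BUG_ON(!__is_cond_ptr(mutex_try));   /* compile-time check is the whole loop body */
    CLASS(mutex_try, __guard)(&locks[i]);              /* __guard stands in for __UNIQUE_ID(guard) */
    if (!__guard_ptr(mutex_try)(&__guard))
            return -EBUSY;
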
Reported-by: Linus Torvalds Signed-off-by: Ingo Molnar Cc: David Lechner Cc: Peter Zijlstra Link: https://lore.kernel.org/r/Z1LBnX9TpZLR5Dkf@gmail.com --- include/linux/cleanup.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 966fcc5ff8ef..ec00e3f7af2b 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -273,12 +273,6 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ * an anonymous instance of the (guard) class, not recommended for * conditional locks. * - * if_not_guard(name, args...) { }: - * convenience macro for conditional guards that calls the statement that - * follows only if the lock was not acquired (typically an error return). - * - * Only for conditional locks. - * * scoped_guard (name, args...) { }: * similar to CLASS(name, scope)(args), except the variable (with the * explicit name 'scope') is declard in a for-loop such that its scope is @@ -350,14 +344,6 @@ _label: \ #define scoped_cond_guard(_name, _fail, args...) \ __scoped_cond_guard(_name, _fail, __UNIQUE_ID(label), args) -#define __if_not_guard(_name, _id, args...) \ - BUILD_BUG_ON(!__is_cond_ptr(_name)); \ - CLASS(_name, _id)(args); \ - if (!__guard_ptr(_name)(&_id)) - -#define if_not_guard(_name, args...) \ - __if_not_guard(_name, __UNIQUE_ID(guard), args) - /* * Additional helper macros for generating lock guards with types, either for * locks that don't have a native type (eg. RCU, preempt) or those that need a -- cgit v1.2.3 From b454abfab52543c44b581afc807b9f97fc1e7a3a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 5 Dec 2024 16:55:18 +0200 Subject: net: mscc: ocelot: be resilient to loss of PTP packets during transmission The Felix DSA driver presents unique challenges that make the simplistic ocelot PTP TX timestamping procedure unreliable: any transmitted packet may be lost in hardware before it ever leaves our local system. This may happen because there is congestion on the DSA conduit, the switch CPU port or even user port (Qdiscs like taprio may delay packets indefinitely by design). The technical problem is that the kernel, i.e. ocelot_port_add_txtstamp_skb(), runs out of timestamp IDs eventually, because it never detects that packets are lost, and keeps the IDs of the lost packets on hold indefinitely. The manifestation of the issue once the entire timestamp ID range becomes busy looks like this in dmesg: mscc_felix 0000:00:00.5: port 0 delivering skb without TX timestamp mscc_felix 0000:00:00.5: port 1 delivering skb without TX timestamp At the surface level, we need a timeout timer so that the kernel knows a timestamp ID is available again. But there is a deeper problem with the implementation, which is the monotonically increasing ocelot_port->ts_id. In the presence of packet loss, it will be impossible to detect that and reuse one of the holes created in the range of free timestamp IDs. What we actually need is a bitmap of 63 timestamp IDs tracking which one is available. That is able to use up holes caused by packet loss, but also gives us a unique opportunity to not implement an actual timer_list for the timeout timer (very complicated in terms of locking). We could only declare a timestamp ID stale on demand (lazily), aka when there's no other timestamp ID available. 
There are pros and cons to this approach: the implementation is much more simple than per-packet timers would be, but most of the stale packets would be quasi-leaked - not really leaked, but blocked in driver memory, since this algorithm sees no reason to free them. An improved technique would be to check for stale timestamp IDs every time we allocate a new one. Assuming a constant flux of PTP packets, this avoids stale packets being blocked in memory, but of course, packets lost at the end of the flux are still blocked until the flux resumes (nobody left to kick them out). Since implementing per-packet timers is way too complicated, this should be good enough. Testing procedure: Persistently block traffic class 5 and try to run PTP on it: $ tc qdisc replace dev swp3 parent root taprio num_tc 8 \ map 0 1 2 3 4 5 6 7 queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \ base-time 0 sched-entry S 0xdf 100000 flags 0x2 [ 126.948141] mscc_felix 0000:00:00.5: port 3 tc 5 min gate length 0 ns not enough for max frame size 1526 at 1000 Mbps, dropping frames over 1 octets including FCS $ ptp4l -i swp3 -2 -P -m --socket_priority 5 --fault_reset_interval ASAP --logSyncInterval -3 ptp4l[70.351]: port 1 (swp3): INITIALIZING to LISTENING on INIT_COMPLETE ptp4l[70.354]: port 0 (/var/run/ptp4l): INITIALIZING to LISTENING on INIT_COMPLETE ptp4l[70.358]: port 0 (/var/run/ptp4lro): INITIALIZING to LISTENING on INIT_COMPLETE [ 70.394583] mscc_felix 0000:00:00.5: port 3 timestamp id 0 ptp4l[70.406]: timed out while polling for tx timestamp ptp4l[70.406]: increasing tx_timestamp_timeout or increasing kworker priority may correct this issue, but a driver bug likely causes it ptp4l[70.406]: port 1 (swp3): send peer delay response failed ptp4l[70.407]: port 1 (swp3): clearing fault immediately ptp4l[70.952]: port 1 (swp3): new foreign master d858d7.fffe.00ca6d-1 [ 71.394858] mscc_felix 0000:00:00.5: port 3 timestamp id 1 ptp4l[71.400]: timed out while polling for tx timestamp ptp4l[71.400]: increasing tx_timestamp_timeout or increasing kworker priority may correct this issue, but a driver bug likely causes it ptp4l[71.401]: port 1 (swp3): send peer delay response failed ptp4l[71.401]: port 1 (swp3): clearing fault immediately [ 72.393616] mscc_felix 0000:00:00.5: port 3 timestamp id 2 ptp4l[72.401]: timed out while polling for tx timestamp ptp4l[72.402]: increasing tx_timestamp_timeout or increasing kworker priority may correct this issue, but a driver bug likely causes it ptp4l[72.402]: port 1 (swp3): send peer delay response failed ptp4l[72.402]: port 1 (swp3): clearing fault immediately ptp4l[72.952]: port 1 (swp3): new foreign master d858d7.fffe.00ca6d-1 [ 73.395291] mscc_felix 0000:00:00.5: port 3 timestamp id 3 ptp4l[73.400]: timed out while polling for tx timestamp ptp4l[73.400]: increasing tx_timestamp_timeout or increasing kworker priority may correct this issue, but a driver bug likely causes it ptp4l[73.400]: port 1 (swp3): send peer delay response failed ptp4l[73.400]: port 1 (swp3): clearing fault immediately [ 74.394282] mscc_felix 0000:00:00.5: port 3 timestamp id 4 ptp4l[74.400]: timed out while polling for tx timestamp ptp4l[74.401]: increasing tx_timestamp_timeout or increasing kworker priority may correct this issue, but a driver bug likely causes it ptp4l[74.401]: port 1 (swp3): send peer delay response failed ptp4l[74.401]: port 1 (swp3): clearing fault immediately ptp4l[74.953]: port 1 (swp3): new foreign master d858d7.fffe.00ca6d-1 [ 75.396830] mscc_felix 0000:00:00.5: port 3 invalidating stale 
timestamp ID 0 which seems lost [ 75.405760] mscc_felix 0000:00:00.5: port 3 timestamp id 0 ptp4l[75.410]: timed out while polling for tx timestamp ptp4l[75.411]: increasing tx_timestamp_timeout or increasing kworker priority may correct this issue, but a driver bug likely causes it ptp4l[75.411]: port 1 (swp3): send peer delay response failed ptp4l[75.411]: port 1 (swp3): clearing fault immediately (...) Remove the blocking condition and see that the port recovers: $ same tc command as above, but use "sched-entry S 0xff" instead $ same ptp4l command as above ptp4l[99.489]: port 1 (swp3): INITIALIZING to LISTENING on INIT_COMPLETE ptp4l[99.490]: port 0 (/var/run/ptp4l): INITIALIZING to LISTENING on INIT_COMPLETE ptp4l[99.492]: port 0 (/var/run/ptp4lro): INITIALIZING to LISTENING on INIT_COMPLETE [ 100.403768] mscc_felix 0000:00:00.5: port 3 invalidating stale timestamp ID 0 which seems lost [ 100.412545] mscc_felix 0000:00:00.5: port 3 invalidating stale timestamp ID 1 which seems lost [ 100.421283] mscc_felix 0000:00:00.5: port 3 invalidating stale timestamp ID 2 which seems lost [ 100.430015] mscc_felix 0000:00:00.5: port 3 invalidating stale timestamp ID 3 which seems lost [ 100.438744] mscc_felix 0000:00:00.5: port 3 invalidating stale timestamp ID 4 which seems lost [ 100.447470] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 100.505919] mscc_felix 0000:00:00.5: port 3 timestamp id 0 ptp4l[100.963]: port 1 (swp3): new foreign master d858d7.fffe.00ca6d-1 [ 101.405077] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 101.507953] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 102.405405] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 102.509391] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 103.406003] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 103.510011] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 104.405601] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 104.510624] mscc_felix 0000:00:00.5: port 3 timestamp id 0 ptp4l[104.965]: selected best master clock d858d7.fffe.00ca6d ptp4l[104.966]: port 1 (swp3): assuming the grand master role ptp4l[104.967]: port 1 (swp3): LISTENING to GRAND_MASTER on RS_GRAND_MASTER [ 105.106201] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 105.232420] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 105.359001] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 105.405500] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 105.485356] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 105.511220] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 105.610938] mscc_felix 0000:00:00.5: port 3 timestamp id 0 [ 105.737237] mscc_felix 0000:00:00.5: port 3 timestamp id 0 (...) Notice that in this new usage pattern, a non-congested port should basically use timestamp ID 0 all the time, progressing to higher numbers only if there are unacknowledged timestamps in flight. Compare this to the old usage, where the timestamp ID used to monotonically increase modulo OCELOT_MAX_PTP_ID. In terms of implementation, this simplifies the bookkeeping of the ocelot_port :: ts_id and ptp_skbs_in_flight. Since we need to traverse the list of two-step timestampable skbs for each new packet anyway, the information can already be computed and does not need to be stored. Also, ocelot_port->tx_skbs is always accessed under the switch-wide ocelot->ts_id_lock IRQ-unsafe spinlock, so we don't need the skb queue's lock and can use the unlocked primitives safely. 
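As an illustration only (condensed from the diff below, with declarations, locking and the bitmap initialization made explicit; refer to the actual patch for the full accounting), the new allocation path boils down to:

    /* Under ocelot->ts_id_lock: lazily reclaim stale IDs, then pick the
     * first timestamp ID that is not in flight.
     */
    DECLARE_BITMAP(ts_id_in_flight, OCELOT_MAX_PTP_ID);

    bitmap_zero(ts_id_in_flight, OCELOT_MAX_PTP_ID);
    skb_queue_walk_safe(&ocelot_port->tx_skbs, skb, skb_tmp) {
            if (time_before(OCELOT_SKB_CB(skb)->ptp_tx_time +
                            OCELOT_PTP_TX_TSTAMP_TIMEOUT, jiffies)) {
                    /* Timestamp never arrived: declare the ID stale */
                    __skb_unlink(skb, &ocelot_port->tx_skbs);
                    kfree_skb(skb);
                    ocelot->ptp_skbs_in_flight--;
            } else {
                    __set_bit(OCELOT_SKB_CB(skb)->ts_id, ts_id_in_flight);
            }
    }

    n = find_first_zero_bit(ts_id_in_flight, OCELOT_MAX_PTP_ID);
    if (n == OCELOT_MAX_PTP_ID)
            return -EBUSY;          /* all timestamp IDs genuinely in flight */

    OCELOT_SKB_CB(clone)->ts_id = n;
    OCELOT_SKB_CB(clone)->ptp_tx_time = jiffies;
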
This problem was actually detected using the tc-taprio offload, and is causing trouble in TSN scenarios, which Felix (NXP LS1028A / VSC9959) supports but Ocelot (VSC7514) does not. Thus, I've selected the commit to blame as the one adding initial timestamping support for the Felix switch. Fixes: c0bcf537667c ("net: dsa: ocelot: add hardware timestamping support for Felix") Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20241205145519.1236778-5-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot_ptp.c | 134 +++++++++++++++++++-------------- include/linux/dsa/ocelot.h | 1 + include/soc/mscc/ocelot.h | 2 - 3 files changed, 80 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mscc/ocelot_ptp.c b/drivers/net/ethernet/mscc/ocelot_ptp.c index d732f99e6391..7eb01d1e1ecd 100644 --- a/drivers/net/ethernet/mscc/ocelot_ptp.c +++ b/drivers/net/ethernet/mscc/ocelot_ptp.c @@ -14,6 +14,8 @@ #include #include "ocelot.h" +#define OCELOT_PTP_TX_TSTAMP_TIMEOUT (5 * HZ) + int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts) { struct ocelot *ocelot = container_of(ptp, struct ocelot, ptp_info); @@ -603,34 +605,88 @@ int ocelot_get_ts_info(struct ocelot *ocelot, int port, } EXPORT_SYMBOL(ocelot_get_ts_info); -static int ocelot_port_add_txtstamp_skb(struct ocelot *ocelot, int port, +static struct sk_buff *ocelot_port_dequeue_ptp_tx_skb(struct ocelot *ocelot, + int port, u8 ts_id, + u32 seqid) +{ + struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct sk_buff *skb, *skb_tmp, *skb_match = NULL; + struct ptp_header *hdr; + + spin_lock(&ocelot->ts_id_lock); + + skb_queue_walk_safe(&ocelot_port->tx_skbs, skb, skb_tmp) { + if (OCELOT_SKB_CB(skb)->ts_id != ts_id) + continue; + + /* Check that the timestamp ID is for the expected PTP + * sequenceId. We don't have to test ptp_parse_header() against + * NULL, because we've pre-validated the packet's ptp_class. + */ + hdr = ptp_parse_header(skb, OCELOT_SKB_CB(skb)->ptp_class); + if (seqid != ntohs(hdr->sequence_id)) + continue; + + __skb_unlink(skb, &ocelot_port->tx_skbs); + ocelot->ptp_skbs_in_flight--; + skb_match = skb; + break; + } + + spin_unlock(&ocelot->ts_id_lock); + + return skb_match; +} + +static int ocelot_port_queue_ptp_tx_skb(struct ocelot *ocelot, int port, struct sk_buff *clone) { struct ocelot_port *ocelot_port = ocelot->ports[port]; + DECLARE_BITMAP(ts_id_in_flight, OCELOT_MAX_PTP_ID); + struct sk_buff *skb, *skb_tmp; + unsigned long n; spin_lock(&ocelot->ts_id_lock); - if (ocelot_port->ptp_skbs_in_flight == OCELOT_MAX_PTP_ID || - ocelot->ptp_skbs_in_flight == OCELOT_PTP_FIFO_SIZE) { + /* To get a better chance of acquiring a timestamp ID, first flush the + * stale packets still waiting in the TX timestamping queue. They are + * probably lost. 
+ */ + skb_queue_walk_safe(&ocelot_port->tx_skbs, skb, skb_tmp) { + if (time_before(OCELOT_SKB_CB(skb)->ptp_tx_time + + OCELOT_PTP_TX_TSTAMP_TIMEOUT, jiffies)) { + dev_warn_ratelimited(ocelot->dev, + "port %d invalidating stale timestamp ID %u which seems lost\n", + port, OCELOT_SKB_CB(skb)->ts_id); + __skb_unlink(skb, &ocelot_port->tx_skbs); + kfree_skb(skb); + ocelot->ptp_skbs_in_flight--; + } else { + __set_bit(OCELOT_SKB_CB(skb)->ts_id, ts_id_in_flight); + } + } + + if (ocelot->ptp_skbs_in_flight == OCELOT_PTP_FIFO_SIZE) { spin_unlock(&ocelot->ts_id_lock); return -EBUSY; } - skb_shinfo(clone)->tx_flags |= SKBTX_IN_PROGRESS; - /* Store timestamp ID in OCELOT_SKB_CB(clone)->ts_id */ - OCELOT_SKB_CB(clone)->ts_id = ocelot_port->ts_id; - - ocelot_port->ts_id++; - if (ocelot_port->ts_id == OCELOT_MAX_PTP_ID) - ocelot_port->ts_id = 0; + n = find_first_zero_bit(ts_id_in_flight, OCELOT_MAX_PTP_ID); + if (n == OCELOT_MAX_PTP_ID) { + spin_unlock(&ocelot->ts_id_lock); + return -EBUSY; + } - ocelot_port->ptp_skbs_in_flight++; + /* Found an available timestamp ID, use it */ + OCELOT_SKB_CB(clone)->ts_id = n; + OCELOT_SKB_CB(clone)->ptp_tx_time = jiffies; ocelot->ptp_skbs_in_flight++; - - skb_queue_tail(&ocelot_port->tx_skbs, clone); + __skb_queue_tail(&ocelot_port->tx_skbs, clone); spin_unlock(&ocelot->ts_id_lock); + dev_dbg_ratelimited(ocelot->dev, "port %d timestamp id %lu\n", port, n); + return 0; } @@ -686,12 +742,14 @@ int ocelot_port_txtstamp_request(struct ocelot *ocelot, int port, if (!(*clone)) return -ENOMEM; - err = ocelot_port_add_txtstamp_skb(ocelot, port, *clone); + /* Store timestamp ID in OCELOT_SKB_CB(clone)->ts_id */ + err = ocelot_port_queue_ptp_tx_skb(ocelot, port, *clone); if (err) { kfree_skb(*clone); return err; } + skb_shinfo(*clone)->tx_flags |= SKBTX_IN_PROGRESS; OCELOT_SKB_CB(skb)->ptp_cmd = ptp_cmd; OCELOT_SKB_CB(*clone)->ptp_class = ptp_class; } @@ -727,26 +785,14 @@ static void ocelot_get_hwtimestamp(struct ocelot *ocelot, spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags); } -static bool ocelot_validate_ptp_skb(struct sk_buff *clone, u16 seqid) -{ - struct ptp_header *hdr; - - hdr = ptp_parse_header(clone, OCELOT_SKB_CB(clone)->ptp_class); - if (WARN_ON(!hdr)) - return false; - - return seqid == ntohs(hdr->sequence_id); -} - void ocelot_get_txtstamp(struct ocelot *ocelot) { int budget = OCELOT_PTP_QUEUE_SZ; while (budget--) { - struct sk_buff *skb, *skb_tmp, *skb_match = NULL; struct skb_shared_hwtstamps shhwtstamps; u32 val, id, seqid, txport; - struct ocelot_port *port; + struct sk_buff *skb_match; struct timespec64 ts; val = ocelot_read(ocelot, SYS_PTP_STATUS); @@ -762,36 +808,14 @@ void ocelot_get_txtstamp(struct ocelot *ocelot) txport = SYS_PTP_STATUS_PTP_MESS_TXPORT_X(val); seqid = SYS_PTP_STATUS_PTP_MESS_SEQ_ID(val); - port = ocelot->ports[txport]; - - spin_lock(&ocelot->ts_id_lock); - port->ptp_skbs_in_flight--; - ocelot->ptp_skbs_in_flight--; - spin_unlock(&ocelot->ts_id_lock); - /* Retrieve its associated skb */ -try_again: - spin_lock(&port->tx_skbs.lock); - - skb_queue_walk_safe(&port->tx_skbs, skb, skb_tmp) { - if (OCELOT_SKB_CB(skb)->ts_id != id) - continue; - __skb_unlink(skb, &port->tx_skbs); - skb_match = skb; - break; - } - - spin_unlock(&port->tx_skbs.lock); - - if (WARN_ON(!skb_match)) + skb_match = ocelot_port_dequeue_ptp_tx_skb(ocelot, txport, id, + seqid); + if (!skb_match) { + dev_warn_ratelimited(ocelot->dev, + "port %d received TX timestamp (seqid %d, ts id %u) for packet previously declared stale\n", + txport, seqid, id); goto next_ts; - 
- if (!ocelot_validate_ptp_skb(skb_match, seqid)) { - dev_err_ratelimited(ocelot->dev, - "port %d received stale TX timestamp for seqid %d, discarding\n", - txport, seqid); - kfree_skb(skb); - goto try_again; } /* Get the h/w timestamp */ diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index 6fbfbde68a37..620a3260fc08 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -15,6 +15,7 @@ struct ocelot_skb_cb { struct sk_buff *clone; unsigned int ptp_class; /* valid only for clones */ + unsigned long ptp_tx_time; /* valid only for clones */ u32 tstamp_lo; u8 ptp_cmd; u8 ts_id; diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 462c653e1017..2db9ae0575b6 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -778,7 +778,6 @@ struct ocelot_port { phy_interface_t phy_mode; - unsigned int ptp_skbs_in_flight; struct sk_buff_head tx_skbs; unsigned int trap_proto; @@ -786,7 +785,6 @@ struct ocelot_port { u16 mrp_ring_id; u8 ptp_cmd; - u8 ts_id; u8 index; -- cgit v1.2.3 From 8d6712c892019b9b9dc5c7039edd3c9d770b510b Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Fri, 6 Dec 2024 10:10:44 +0900 Subject: virtio_ring: add a func argument 'recycle_done' to virtqueue_resize() When virtqueue_resize() has actually recycled all unused buffers, additional work may be required in some cases. Relying solely on its return status is fragile, so introduce a new function argument 'recycle_done', which is invoked when the recycle really occurs. Cc: # v6.11+ Signed-off-by: Koichiro Den Acked-by: Jason Wang Reviewed-by: Xuan Zhuo Signed-off-by: Paolo Abeni --- drivers/net/virtio_net.c | 4 ++-- drivers/virtio/virtio_ring.c | 6 +++++- include/linux/virtio.h | 3 ++- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index fc89c5e1a207..e10bc9e6b072 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3331,7 +3331,7 @@ static int virtnet_rx_resize(struct virtnet_info *vi, virtnet_rx_pause(vi, rq); - err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf); + err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf, NULL); if (err) netdev_err(vi->dev, "resize rx fail: rx queue index: %d err: %d\n", qindex, err); @@ -3394,7 +3394,7 @@ static int virtnet_tx_resize(struct virtnet_info *vi, struct send_queue *sq, virtnet_tx_pause(vi, sq); - err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf); + err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf, NULL); if (err) netdev_err(vi->dev, "resize tx fail: tx queue index: %d err: %d\n", qindex, err); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 82a7d2cbc704..6af8cf6a619e 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2772,6 +2772,7 @@ EXPORT_SYMBOL_GPL(vring_create_virtqueue_dma); * @_vq: the struct virtqueue we're talking about. * @num: new ring num * @recycle: callback to recycle unused buffers + * @recycle_done: callback to be invoked when recycle for all unused buffers done * * When it is really necessary to create a new vring, it will set the current vq * into the reset state. 
Then call the passed callback to recycle the buffer @@ -2792,7 +2793,8 @@ EXPORT_SYMBOL_GPL(vring_create_virtqueue_dma); * */ int virtqueue_resize(struct virtqueue *_vq, u32 num, - void (*recycle)(struct virtqueue *vq, void *buf)) + void (*recycle)(struct virtqueue *vq, void *buf), + void (*recycle_done)(struct virtqueue *vq)) { struct vring_virtqueue *vq = to_vvq(_vq); int err; @@ -2809,6 +2811,8 @@ int virtqueue_resize(struct virtqueue *_vq, u32 num, err = virtqueue_disable_and_recycle(_vq, recycle); if (err) return err; + if (recycle_done) + recycle_done(_vq); if (vq->packed_ring) err = virtqueue_resize_packed(_vq, num); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 57cc4b07fd17..0aa7df4ed5ca 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -109,7 +109,8 @@ dma_addr_t virtqueue_get_avail_addr(const struct virtqueue *vq); dma_addr_t virtqueue_get_used_addr(const struct virtqueue *vq); int virtqueue_resize(struct virtqueue *vq, u32 num, - void (*recycle)(struct virtqueue *vq, void *buf)); + void (*recycle)(struct virtqueue *vq, void *buf), + void (*recycle_done)(struct virtqueue *vq)); int virtqueue_reset(struct virtqueue *vq, void (*recycle)(struct virtqueue *vq, void *buf)); -- cgit v1.2.3 From 8d2da07c813ad333c20eb803e15f8c4541f25350 Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Fri, 6 Dec 2024 10:10:46 +0900 Subject: virtio_ring: add a func argument 'recycle_done' to virtqueue_reset() When virtqueue_reset() has actually recycled all unused buffers, additional work may be required in some cases. Relying solely on its return status is fragile, so introduce a new function argument 'recycle_done', which is invoked when it really occurs. Signed-off-by: Koichiro Den Acked-by: Jason Wang Reviewed-by: Xuan Zhuo Signed-off-by: Paolo Abeni --- drivers/net/virtio_net.c | 4 ++-- drivers/virtio/virtio_ring.c | 6 +++++- include/linux/virtio.h | 3 ++- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 3a0341cc6085..5cf4b2b20431 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -5711,7 +5711,7 @@ static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct receive_queu virtnet_rx_pause(vi, rq); - err = virtqueue_reset(rq->vq, virtnet_rq_unmap_free_buf); + err = virtqueue_reset(rq->vq, virtnet_rq_unmap_free_buf, NULL); if (err) { netdev_err(vi->dev, "reset rx fail: rx queue index: %d err: %d\n", qindex, err); @@ -5740,7 +5740,7 @@ static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi, virtnet_tx_pause(vi, sq); - err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf); + err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf, NULL); if (err) { netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err); pool = NULL; diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 6af8cf6a619e..fdd2d2b07b5a 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2827,6 +2827,7 @@ EXPORT_SYMBOL_GPL(virtqueue_resize); * virtqueue_reset - detach and recycle all unused buffers * @_vq: the struct virtqueue we're talking about. * @recycle: callback to recycle unused buffers + * @recycle_done: callback to be invoked when recycle for all unused buffers done * * Caller must ensure we don't call this with other virtqueue operations * at the same time (except where noted). 
@@ -2838,7 +2839,8 @@ EXPORT_SYMBOL_GPL(virtqueue_resize); * -EPERM: Operation not permitted */ int virtqueue_reset(struct virtqueue *_vq, - void (*recycle)(struct virtqueue *vq, void *buf)) + void (*recycle)(struct virtqueue *vq, void *buf), + void (*recycle_done)(struct virtqueue *vq)) { struct vring_virtqueue *vq = to_vvq(_vq); int err; @@ -2846,6 +2848,8 @@ int virtqueue_reset(struct virtqueue *_vq, err = virtqueue_disable_and_recycle(_vq, recycle); if (err) return err; + if (recycle_done) + recycle_done(_vq); if (vq->packed_ring) virtqueue_reinit_packed(vq); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 0aa7df4ed5ca..dd88682e27e3 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -112,7 +112,8 @@ int virtqueue_resize(struct virtqueue *vq, u32 num, void (*recycle)(struct virtqueue *vq, void *buf), void (*recycle_done)(struct virtqueue *vq)); int virtqueue_reset(struct virtqueue *vq, - void (*recycle)(struct virtqueue *vq, void *buf)); + void (*recycle)(struct virtqueue *vq, void *buf), + void (*recycle_done)(struct virtqueue *vq)); struct virtio_admin_cmd { __le16 opcode; -- cgit v1.2.3 From b76b840fd93374240b59825f1ab8e2f5c9907acb Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 9 Dec 2024 21:23:56 +0900 Subject: dm: Fix dm-zoned-reclaim zone write pointer alignment The zone reclaim processing of the dm-zoned device mapper uses blkdev_issue_zeroout() to align the write pointer of a zone being used for reclaiming another zone, to write the valid data blocks from the zone being reclaimed at the same position relative to the zone start in the reclaim target zone. The first call to blkdev_issue_zeroout() will try to use hardware offload using a REQ_OP_WRITE_ZEROES operation if the device reports a non-zero max_write_zeroes_sectors queue limit. If this operation fails because of the lack of hardware support, blkdev_issue_zeroout() falls back to using a regular write operation with the zero-page as buffer. Currently, such REQ_OP_WRITE_ZEROES failure is automatically handled by the block layer zone write plugging code which will execute a report zones operation to ensure that the write pointer of the target zone of the failed operation has not changed and to "rewind" the zone write pointer offset of the target zone as it was advanced when the write zero operation was submitted. So the REQ_OP_WRITE_ZEROES failure does not cause any issue and blkdev_issue_zeroout() works as expected. However, since the automatic recovery of zone write pointers by the zone write plugging code can potentially cause deadlocks with queue freeze operations, a different recovery must be implemented in preparation for the removal of zone write plugging report zones based recovery. Do this by introducing the new function blk_zone_issue_zeroout(). This function first calls blkdev_issue_zeroout() with the flag BLKDEV_ZERO_NOFALLBACK to intercept failures on the first execution which attempt to use the device hardware offload with the REQ_OP_WRITE_ZEROES operation. If this attempt fails, a report zone operation is issued to restore the zone write pointer offset of the target zone to the correct position and blkdev_issue_zeroout() is called again without the BLKDEV_ZERO_NOFALLBACK flag. The report zones operation performing this recovery is implemented using the helper function disk_zone_sync_wp_offset() which calls the gendisk report_zones file operation with the callback disk_report_zones_cb(). 
This callback updates the target write pointer offset of the target zone using the new function disk_zone_wplug_sync_wp_offset(). dmz_reclaim_align_wp() is modified to change its call to blkdev_issue_zeroout() to a call to blk_zone_issue_zeroout() without any other change needed as the two functions are functionally equivalent. Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Acked-by: Mike Snitzer Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20241209122357.47838-4-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 141 +++++++++++++++++++++++++++++++++++------- drivers/md/dm-zoned-reclaim.c | 6 +- include/linux/blkdev.h | 3 + 3 files changed, 124 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index ee9c67121c6c..781caca0381e 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -115,6 +115,30 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) } EXPORT_SYMBOL_GPL(blk_zone_cond_str); +struct disk_report_zones_cb_args { + struct gendisk *disk; + report_zones_cb user_cb; + void *user_data; +}; + +static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk, + struct blk_zone *zone); + +static int disk_report_zones_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct disk_report_zones_cb_args *args = data; + struct gendisk *disk = args->disk; + + if (disk->zone_wplugs_hash) + disk_zone_wplug_sync_wp_offset(disk, zone); + + if (!args->user_cb) + return 0; + + return args->user_cb(zone, idx, args->user_data); +} + /** * blkdev_report_zones - Get zones information * @bdev: Target block device @@ -694,6 +718,58 @@ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, spin_unlock_irqrestore(&zwplug->lock, flags); } +static unsigned int blk_zone_wp_offset(struct blk_zone *zone) +{ + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + return zone->wp - zone->start; + case BLK_ZONE_COND_FULL: + return zone->len; + case BLK_ZONE_COND_EMPTY: + return 0; + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + default: + /* + * Conventional, offline and read-only zones do not have a valid + * write pointer. 
+ */ + return UINT_MAX; + } +} + +static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk, + struct blk_zone *zone) +{ + struct blk_zone_wplug *zwplug; + unsigned long flags; + + zwplug = disk_get_zone_wplug(disk, zone->start); + if (!zwplug) + return; + + spin_lock_irqsave(&zwplug->lock, flags); + if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) + disk_zone_wplug_set_wp_offset(disk, zwplug, + blk_zone_wp_offset(zone)); + spin_unlock_irqrestore(&zwplug->lock, flags); + + disk_put_zone_wplug(zwplug); +} + +static int disk_zone_sync_wp_offset(struct gendisk *disk, sector_t sector) +{ + struct disk_report_zones_cb_args args = { + .disk = disk, + }; + + return disk->fops->report_zones(disk, sector, 1, + disk_report_zones_cb, &args); +} + static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio, unsigned int wp_offset) { @@ -1280,29 +1356,6 @@ put_zwplug: disk_put_zone_wplug(zwplug); } -static unsigned int blk_zone_wp_offset(struct blk_zone *zone) -{ - switch (zone->cond) { - case BLK_ZONE_COND_IMP_OPEN: - case BLK_ZONE_COND_EXP_OPEN: - case BLK_ZONE_COND_CLOSED: - return zone->wp - zone->start; - case BLK_ZONE_COND_FULL: - return zone->len; - case BLK_ZONE_COND_EMPTY: - return 0; - case BLK_ZONE_COND_NOT_WP: - case BLK_ZONE_COND_OFFLINE: - case BLK_ZONE_COND_READONLY: - default: - /* - * Conventional, offline and read-only zones do not have a valid - * write pointer. - */ - return UINT_MAX; - } -} - static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone, unsigned int idx, void *data) { @@ -1866,6 +1919,48 @@ int blk_revalidate_disk_zones(struct gendisk *disk) } EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); +/** + * blk_zone_issue_zeroout - zero-fill a block range in a zone + * @bdev: blockdev to write + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * + * Description: + * Zero-fill a block range in a zone (@sector must be equal to the zone write + * pointer), handling potential errors due to the (initially unknown) lack of + * hardware offload (See blkdev_issue_zeroout()). + */ +int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask) +{ + int ret; + + if (WARN_ON_ONCE(!bdev_is_zoned(bdev))) + return -EIO; + + ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, + BLKDEV_ZERO_NOFALLBACK); + if (ret != -EOPNOTSUPP) + return ret; + + /* + * The failed call to blkdev_issue_zeroout() advanced the zone write + * pointer. Undo this using a report zone to update the zone write + * pointer to the correct current value. + */ + ret = disk_zone_sync_wp_offset(bdev->bd_disk, sector); + if (ret != 1) + return ret < 0 ? ret : -EIO; + + /* + * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a + * regular write with zero-pages. + */ + return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0); +} +EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout); + #ifdef CONFIG_BLK_DEBUG_FS int queue_zone_wplugs_show(void *data, struct seq_file *m) diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index d58db9a27e6c..76e2c6868548 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -76,9 +76,9 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, * pointer and the requested position. 
*/ nr_blocks = block - wp_block; - ret = blkdev_issue_zeroout(dev->bdev, - dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block), - dmz_blk2sect(nr_blocks), GFP_NOIO, 0); + ret = blk_zone_issue_zeroout(dev->bdev, + dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block), + dmz_blk2sect(nr_blocks), GFP_NOIO); if (ret) { dmz_dev_err(dev, "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d", diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 08a727b40816..4dd698dad2d6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1421,6 +1421,9 @@ static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) return is_seq; } +int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask); + static inline unsigned int queue_dma_alignment(const struct request_queue *q) { return q->limits.dma_alignment; -- cgit v1.2.3 From fe0418eb9bd69a19a948b297c8de815e05f3cde1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 9 Dec 2024 21:23:57 +0900 Subject: block: Prevent potential deadlocks in zone write plug error recovery Zone write plugging for handling writes to zones of a zoned block device always executes a zone report whenever a write BIO to a zone fails. The intent of this is to ensure that the tracking of a zone write pointer is always correct to ensure that the alignment to a zone write pointer of write BIOs can be checked on submission and that we can always correctly emulate zone append operations using regular write BIOs. However, this error recovery scheme introduces a potential deadlock if a device queue freeze is initiated while BIOs are still plugged in a zone write plug and one of these write operations fails. In such a case, the disk zone write plug error recovery work is scheduled and executes a report zone. This in turn can result in a request allocation in the underlying driver to issue the report zones command to the device. But with the device queue freeze already started, this allocation will block, preventing the report zone execution and the continuation of the processing of the plugged BIOs. As plugged BIOs hold a queue usage reference, the queue freeze itself will never complete, resulting in a deadlock. Avoid this problem by completely removing from the zone write plugging code the use of report zones operations after a failed write operation, instead relying on the device user to either execute a report zones, reset the zone, finish the zone, or give up writing to the device (which is a fairly common pattern for file systems which degrade to read-only after write failures). This is not an unreasonable requirement as all well-behaved applications, FSes and device mapper already use report zones to recover from write errors whenever possible by comparing the current position of a zone write pointer with what their assumption about the position is. The changes to remove the automatic error recovery are as follows: - Completely remove the error recovery work and its associated resources (zone write plug list head, disk error list, and disk zone_wplugs_work work struct). This also removes the functions disk_zone_wplug_set_error() and disk_zone_wplug_clear_error(). - Change the BLK_ZONE_WPLUG_ERROR zone write plug flag into BLK_ZONE_WPLUG_NEED_WP_UPDATE. This new flag is set for a zone write plug whenever a write operation targeting the zone of the zone write plug fails. 
This flag indicates that the zone write pointer offset is not reliable and that it must be updated when the next report zone, reset zone, finish zone or disk revalidation is executed. - Modify blk_zone_write_plug_bio_endio() to set the BLK_ZONE_WPLUG_NEED_WP_UPDATE flag for the target zone of a failed write BIO. - Modify the function disk_zone_wplug_set_wp_offset() to clear this new flag, thus implementing recovery of a correct write pointer offset with the reset (all) zone and finish zone operations. - Modify blkdev_report_zones() to always use the disk_report_zones_cb() callback so that disk_zone_wplug_sync_wp_offset() can be called for any zone marked with the BLK_ZONE_WPLUG_NEED_WP_UPDATE flag. This implements recovery of a correct write pointer offset for zone write plugs marked with BLK_ZONE_WPLUG_NEED_WP_UPDATE and within the range of the report zones operation executed by the user. - Modify blk_revalidate_seq_zone() to call disk_zone_wplug_sync_wp_offset() for all sequential write required zones when a zoned block device is revalidated, thus always resolving any inconsistency between the write pointer offset of zone write plugs and the actual write pointer position of sequential zones. Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20241209122357.47838-5-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 308 ++++++++++--------------------------------------- include/linux/blkdev.h | 2 - 2 files changed, 61 insertions(+), 249 deletions(-) (limited to 'include/linux') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 781caca0381e..77f12fc2086f 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -41,7 +41,6 @@ static const char *const zone_cond_name[] = { /* * Per-zone write plug. * @node: hlist_node structure for managing the plug using a hash table. - * @link: To list the plug in the zone write plug error list of the disk. * @ref: Zone write plug reference counter. A zone write plug reference is * always at least 1 when the plug is hashed in the disk plug hash table. * The reference is incremented whenever a new BIO needing plugging is @@ -63,7 +62,6 @@ static const char *const zone_cond_name[] = { */ struct blk_zone_wplug { struct hlist_node node; - struct list_head link; refcount_t ref; spinlock_t lock; unsigned int flags; @@ -80,8 +78,8 @@ struct blk_zone_wplug { * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged, * that is, that write BIOs are being throttled due to a write BIO already * being executed or the zone write plug bio list is not empty. - * - BLK_ZONE_WPLUG_ERROR: Indicates that a write error happened which will be - * recovered with a report zone to update the zone write pointer offset. + * - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone + * write pointer offset and need to update it. * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed * from the disk hash table and that the initial reference to the zone * write plug set when the plug was first added to the hash table has been @@ -91,11 +89,9 @@ struct blk_zone_wplug { * freed once all remaining references from BIOs or functions are dropped. 
*/ #define BLK_ZONE_WPLUG_PLUGGED (1U << 0) -#define BLK_ZONE_WPLUG_ERROR (1U << 1) +#define BLK_ZONE_WPLUG_NEED_WP_UPDATE (1U << 1) #define BLK_ZONE_WPLUG_UNHASHED (1U << 2) -#define BLK_ZONE_WPLUG_BUSY (BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR) - /** * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX. * @zone_cond: BLK_ZONE_COND_XXX. @@ -163,6 +159,11 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, { struct gendisk *disk = bdev->bd_disk; sector_t capacity = get_capacity(disk); + struct disk_report_zones_cb_args args = { + .disk = disk, + .user_cb = cb, + .user_data = data, + }; if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones)) return -EOPNOTSUPP; @@ -170,7 +171,8 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, if (!nr_zones || sector >= capacity) return 0; - return disk->fops->report_zones(disk, sector, nr_zones, cb, data); + return disk->fops->report_zones(disk, sector, nr_zones, + disk_report_zones_cb, &args); } EXPORT_SYMBOL_GPL(blkdev_report_zones); @@ -451,7 +453,7 @@ static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) { if (refcount_dec_and_test(&zwplug->ref)) { WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); - WARN_ON_ONCE(!list_empty(&zwplug->link)); + WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)); call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); @@ -465,8 +467,8 @@ static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) return false; - /* If the zone write plug is still busy, it cannot be removed. */ - if (zwplug->flags & BLK_ZONE_WPLUG_BUSY) + /* If the zone write plug is still plugged, it cannot be removed. */ + if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) return false; /* @@ -549,7 +551,6 @@ again: return NULL; INIT_HLIST_NODE(&zwplug->node); - INIT_LIST_HEAD(&zwplug->link); refcount_set(&zwplug->ref, 2); spin_lock_init(&zwplug->lock); zwplug->flags = 0; @@ -598,115 +599,22 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) } /* - * Abort (fail) all plugged BIOs of a zone write plug that are not aligned - * with the assumed write pointer location of the zone when the BIO will - * be unplugged. - */ -static void disk_zone_wplug_abort_unaligned(struct gendisk *disk, - struct blk_zone_wplug *zwplug) -{ - unsigned int wp_offset = zwplug->wp_offset; - struct bio_list bl = BIO_EMPTY_LIST; - struct bio *bio; - - while ((bio = bio_list_pop(&zwplug->bio_list))) { - if (disk_zone_is_full(disk, zwplug->zone_no, wp_offset) || - (bio_op(bio) != REQ_OP_ZONE_APPEND && - bio_offset_from_zone_start(bio) != wp_offset)) { - blk_zone_wplug_bio_io_error(zwplug, bio); - continue; - } - - wp_offset += bio_sectors(bio); - bio_list_add(&bl, bio); - } - - bio_list_merge(&zwplug->bio_list, &bl); -} - -static inline void disk_zone_wplug_set_error(struct gendisk *disk, - struct blk_zone_wplug *zwplug) -{ - unsigned long flags; - - if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) - return; - - /* - * At this point, we already have a reference on the zone write plug. - * However, since we are going to add the plug to the disk zone write - * plugs work list, increase its reference count. This reference will - * be dropped in disk_zone_wplugs_work() once the error state is - * handled, or in disk_zone_wplug_clear_error() if the zone is reset or - * finished. 
- */ - zwplug->flags |= BLK_ZONE_WPLUG_ERROR; - refcount_inc(&zwplug->ref); - - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); - list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); -} - -static inline void disk_zone_wplug_clear_error(struct gendisk *disk, - struct blk_zone_wplug *zwplug) -{ - unsigned long flags; - - if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) - return; - - /* - * We are racing with the error handling work which drops the reference - * on the zone write plug after handling the error state. So remove the - * plug from the error list and drop its reference count only if the - * error handling has not yet started, that is, if the zone write plug - * is still listed. - */ - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); - if (!list_empty(&zwplug->link)) { - list_del_init(&zwplug->link); - zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR; - disk_put_zone_wplug(zwplug); - } - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); -} - -/* - * Set a zone write plug write pointer offset to either 0 (zone reset case) - * or to the zone size (zone finish case). This aborts all plugged BIOs, which - * is fine to do as doing a zone reset or zone finish while writes are in-flight - * is a mistake from the user which will most likely cause all plugged BIOs to - * fail anyway. + * Set a zone write plug write pointer offset to the specified value. + * This aborts all plugged BIOs, which is fine as this function is called for + * a zone reset operation, a zone finish operation or if the zone needs a wp + * update from a report zone after a write error. */ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, struct blk_zone_wplug *zwplug, unsigned int wp_offset) { - unsigned long flags; - - spin_lock_irqsave(&zwplug->lock, flags); - - /* - * Make sure that a BIO completion or another zone reset or finish - * operation has not already removed the plug from the hash table. - */ - if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { - spin_unlock_irqrestore(&zwplug->lock, flags); - return; - } + lockdep_assert_held(&zwplug->lock); /* Update the zone write pointer and abort all plugged BIOs. */ + zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE; zwplug->wp_offset = wp_offset; disk_zone_wplug_abort(zwplug); - /* - * Updating the write pointer offset puts back the zone - * in a good state. So clear the error flag and decrement the - * error count if we were in error state. - */ - disk_zone_wplug_clear_error(disk, zwplug); - /* * The zone write plug now has no BIO plugged: remove it from the * hash table so that it cannot be seen. 
The plug will be freed @@ -714,8 +622,6 @@ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, */ if (disk_should_remove_zone_wplug(disk, zwplug)) disk_remove_zone_wplug(disk, zwplug); - - spin_unlock_irqrestore(&zwplug->lock, flags); } static unsigned int blk_zone_wp_offset(struct blk_zone *zone) @@ -752,7 +658,7 @@ static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk, return; spin_lock_irqsave(&zwplug->lock, flags); - if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) + if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) disk_zone_wplug_set_wp_offset(disk, zwplug, blk_zone_wp_offset(zone)); spin_unlock_irqrestore(&zwplug->lock, flags); @@ -776,6 +682,7 @@ static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio, struct gendisk *disk = bio->bi_bdev->bd_disk; sector_t sector = bio->bi_iter.bi_sector; struct blk_zone_wplug *zwplug; + unsigned long flags; /* Conventional zones cannot be reset nor finished. */ if (!bdev_zone_is_seq(bio->bi_bdev, sector)) { @@ -801,7 +708,9 @@ static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio, */ zwplug = disk_get_zone_wplug(disk, sector); if (zwplug) { + spin_lock_irqsave(&zwplug->lock, flags); disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset); + spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); } @@ -812,6 +721,7 @@ static bool blk_zone_wplug_handle_reset_all(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; struct blk_zone_wplug *zwplug; + unsigned long flags; sector_t sector; /* @@ -823,7 +733,9 @@ static bool blk_zone_wplug_handle_reset_all(struct bio *bio) sector += disk->queue->limits.chunk_sectors) { zwplug = disk_get_zone_wplug(disk, sector); if (zwplug) { + spin_lock_irqsave(&zwplug->lock, flags); disk_zone_wplug_set_wp_offset(disk, zwplug, 0); + spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); } } @@ -1005,13 +917,23 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, { struct gendisk *disk = bio->bi_bdev->bd_disk; + /* + * If we lost track of the zone write pointer due to a write error, + * the user must either execute a report zones, reset the zone or finish + * the to recover a reliable write pointer position. Fail BIOs if the + * user did not do that as we cannot handle emulated zone append + * otherwise. + */ + if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) + return false; + /* * Check that the user is not attempting to write to a full zone. * We know such BIO will fail, and that would potentially overflow our * write pointer offset beyond the end of the zone. */ if (disk_zone_wplug_is_full(disk, zwplug)) - goto err; + return false; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { /* @@ -1030,24 +952,18 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND); } else { /* - * Check for non-sequential writes early because we avoid a - * whole lot of error handling trouble if we don't send it off - * to the driver. + * Check for non-sequential writes early as we know that BIOs + * with a start sector not unaligned to the zone write pointer + * will fail. */ if (bio_offset_from_zone_start(bio) != zwplug->wp_offset) - goto err; + return false; } /* Advance the zone write pointer offset. */ zwplug->wp_offset += bio_sectors(bio); return true; - -err: - /* We detected an invalid write BIO: schedule error recovery. 
*/ - disk_zone_wplug_set_error(disk, zwplug); - kblockd_schedule_work(&disk->zone_wplugs_work); - return false; } static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) @@ -1097,20 +1013,20 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); /* - * If the zone is already plugged or has a pending error, add the BIO - * to the plug BIO list. Do the same for REQ_NOWAIT BIOs to ensure that - * we will not see a BLK_STS_AGAIN failure if we let the BIO execute. + * If the zone is already plugged, add the BIO to the plug BIO list. + * Do the same for REQ_NOWAIT BIOs to ensure that we will not see a + * BLK_STS_AGAIN failure if we let the BIO execute. * Otherwise, plug and let the BIO execute. */ - if (zwplug->flags & BLK_ZONE_WPLUG_BUSY || (bio->bi_opf & REQ_NOWAIT)) + if ((zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) || + (bio->bi_opf & REQ_NOWAIT)) goto plug; - /* - * If an error is detected when preparing the BIO, add it to the BIO - * list so that error recovery can deal with it. - */ - if (!blk_zone_wplug_prepare_bio(zwplug, bio)) - goto plug; + if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { + spin_unlock_irqrestore(&zwplug->lock, flags); + bio_io_error(bio); + return true; + } zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; @@ -1210,16 +1126,6 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk, spin_lock_irqsave(&zwplug->lock, flags); - /* - * If we had an error, schedule error recovery. The recovery work - * will restart submission of plugged BIOs. - */ - if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) { - spin_unlock_irqrestore(&zwplug->lock, flags); - kblockd_schedule_work(&disk->zone_wplugs_work); - return; - } - /* Schedule submission of the next plugged BIO if we have one. */ if (!bio_list_empty(&zwplug->bio_list)) { disk_zone_wplug_schedule_bio_work(disk, zwplug); @@ -1262,12 +1168,13 @@ void blk_zone_write_plug_bio_endio(struct bio *bio) } /* - * If the BIO failed, mark the plug as having an error to trigger - * recovery. + * If the BIO failed, abort all plugged BIOs and mark the plug as + * needing a write pointer update. */ if (bio->bi_status != BLK_STS_OK) { spin_lock_irqsave(&zwplug->lock, flags); - disk_zone_wplug_set_error(disk, zwplug); + disk_zone_wplug_abort(zwplug); + zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE; spin_unlock_irqrestore(&zwplug->lock, flags); } @@ -1323,6 +1230,7 @@ static void blk_zone_wplug_bio_work(struct work_struct *work) */ spin_lock_irqsave(&zwplug->lock, flags); +again: bio = bio_list_pop(&zwplug->bio_list); if (!bio) { zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; @@ -1331,10 +1239,8 @@ static void blk_zone_wplug_bio_work(struct work_struct *work) } if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { - /* Error recovery will decide what to do with the BIO. 
*/ - bio_list_add_head(&zwplug->bio_list, bio); - spin_unlock_irqrestore(&zwplug->lock, flags); - goto put_zwplug; + blk_zone_wplug_bio_io_error(zwplug, bio); + goto again; } spin_unlock_irqrestore(&zwplug->lock, flags); @@ -1356,97 +1262,6 @@ put_zwplug: disk_put_zone_wplug(zwplug); } -static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone, - unsigned int idx, void *data) -{ - struct blk_zone *zonep = data; - - *zonep = *zone; - return 0; -} - -static void disk_zone_wplug_handle_error(struct gendisk *disk, - struct blk_zone_wplug *zwplug) -{ - sector_t zone_start_sector = - bdev_zone_sectors(disk->part0) * zwplug->zone_no; - unsigned int noio_flag; - struct blk_zone zone; - unsigned long flags; - int ret; - - /* Get the current zone information from the device. */ - noio_flag = memalloc_noio_save(); - ret = disk->fops->report_zones(disk, zone_start_sector, 1, - blk_zone_wplug_report_zone_cb, &zone); - memalloc_noio_restore(noio_flag); - - spin_lock_irqsave(&zwplug->lock, flags); - - /* - * A zone reset or finish may have cleared the error already. In such - * case, do nothing as the report zones may have seen the "old" write - * pointer value before the reset/finish operation completed. - */ - if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) - goto unlock; - - zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR; - - if (ret != 1) { - /* - * We failed to get the zone information, meaning that something - * is likely really wrong with the device. Abort all remaining - * plugged BIOs as otherwise we could endup waiting forever on - * plugged BIOs to complete if there is a queue freeze on-going. - */ - disk_zone_wplug_abort(zwplug); - goto unplug; - } - - /* Update the zone write pointer offset. */ - zwplug->wp_offset = blk_zone_wp_offset(&zone); - disk_zone_wplug_abort_unaligned(disk, zwplug); - - /* Restart BIO submission if we still have any BIO left. 
*/ - if (!bio_list_empty(&zwplug->bio_list)) { - disk_zone_wplug_schedule_bio_work(disk, zwplug); - goto unlock; - } - -unplug: - zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; - if (disk_should_remove_zone_wplug(disk, zwplug)) - disk_remove_zone_wplug(disk, zwplug); - -unlock: - spin_unlock_irqrestore(&zwplug->lock, flags); -} - -static void disk_zone_wplugs_work(struct work_struct *work) -{ - struct gendisk *disk = - container_of(work, struct gendisk, zone_wplugs_work); - struct blk_zone_wplug *zwplug; - unsigned long flags; - - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); - - while (!list_empty(&disk->zone_wplugs_err_list)) { - zwplug = list_first_entry(&disk->zone_wplugs_err_list, - struct blk_zone_wplug, link); - list_del_init(&zwplug->link); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); - - disk_zone_wplug_handle_error(disk, zwplug); - disk_put_zone_wplug(zwplug); - - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); - } - - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); -} - static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) { return 1U << disk->zone_wplugs_hash_bits; @@ -1455,8 +1270,6 @@ static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) void disk_init_zone_resources(struct gendisk *disk) { spin_lock_init(&disk->zone_wplugs_lock); - INIT_LIST_HEAD(&disk->zone_wplugs_err_list); - INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work); } /* @@ -1555,8 +1368,6 @@ void disk_free_zone_resources(struct gendisk *disk) if (!disk->zone_wplugs_pool) return; - cancel_work_sync(&disk->zone_wplugs_work); - if (disk->zone_wplugs_wq) { destroy_workqueue(disk->zone_wplugs_wq); disk->zone_wplugs_wq = NULL; @@ -1753,6 +1564,8 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, if (!disk->zone_wplugs_hash) return 0; + disk_zone_wplug_sync_wp_offset(disk, zone); + wp_offset = blk_zone_wp_offset(zone); if (!wp_offset || wp_offset >= zone->capacity) return 0; @@ -1883,6 +1696,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk) memalloc_noio_restore(noio_flag); return ret; } + ret = disk->fops->report_zones(disk, 0, UINT_MAX, blk_revalidate_zone_cb, &args); if (!ret) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4dd698dad2d6..378d3a1a22fc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -200,8 +200,6 @@ struct gendisk { spinlock_t zone_wplugs_lock; struct mempool_s *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; - struct list_head zone_wplugs_err_list; - struct work_struct zone_wplugs_work; struct workqueue_struct *zone_wplugs_wq; #endif /* CONFIG_BLK_DEV_ZONED */ -- cgit v1.2.3 From b238e187b4a2d3b54d80aec05a9cab6466b79dde Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:10:54 -0800 Subject: bpf: refactor bpf_helper_changes_pkt_data to use helper number Use BPF helper number instead of function pointer in bpf_helper_changes_pkt_data(). This would simplify usage of this function in verifier.c:check_cfg() (in a follow-up patch), where only helper number is easily available and there is no real need to lookup helper proto. 
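An illustrative sketch of the intended call-site shape (not part of this patch; the real call sites appear in the diffs below): with the new signature a caller that only has the instruction at hand can pass insn->imm directly, whereas the old interface required resolving the helper proto first.

	/* old: the proto is needed just to get at the function pointer */
	fn = env->ops->get_func_proto(insn->imm, env->prog);
	changes = fn && bpf_helper_changes_pkt_data(fn->func);

	/* new: the helper id carried in the BPF_CALL instruction is enough */
	changes = bpf_helper_changes_pkt_data(insn->imm);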
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 +- kernel/bpf/core.c | 2 +- kernel/bpf/verifier.c | 2 +- net/core/filter.c | 63 ++++++++++++++++++++++---------------------------- 4 files changed, 31 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 3a21947f2fd4..0477254bc2d3 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1122,7 +1122,7 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena); bool bpf_jit_supports_private_stack(void); u64 bpf_arch_uaddress_limit(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); -bool bpf_helper_changes_pkt_data(void *func); +bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id); static inline bool bpf_dump_raw_ok(const struct cred *cred) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a2327c4fdc8b..6fa8041d4831 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2936,7 +2936,7 @@ void __weak bpf_jit_compile(struct bpf_prog *prog) { } -bool __weak bpf_helper_changes_pkt_data(void *func) +bool __weak bpf_helper_changes_pkt_data(enum bpf_func_id func_id) { return false; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 277c1892bb9a..ad3f6d28e8e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10728,7 +10728,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } /* With LD_ABS/IND some JITs save/restore skb from r1. */ - changes_data = bpf_helper_changes_pkt_data(fn->func); + changes_data = bpf_helper_changes_pkt_data(func_id); if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n", func_id_name(func_id), func_id); diff --git a/net/core/filter.c b/net/core/filter.c index 6625b3f563a4..efb75eed2e35 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7899,42 +7899,35 @@ static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { #endif /* CONFIG_INET */ -bool bpf_helper_changes_pkt_data(void *func) -{ - if (func == bpf_skb_vlan_push || - func == bpf_skb_vlan_pop || - func == bpf_skb_store_bytes || - func == bpf_skb_change_proto || - func == bpf_skb_change_head || - func == sk_skb_change_head || - func == bpf_skb_change_tail || - func == sk_skb_change_tail || - func == bpf_skb_adjust_room || - func == sk_skb_adjust_room || - func == bpf_skb_pull_data || - func == sk_skb_pull_data || - func == bpf_clone_redirect || - func == bpf_l3_csum_replace || - func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head || - func == bpf_xdp_adjust_meta || - func == bpf_msg_pull_data || - func == bpf_msg_push_data || - func == bpf_msg_pop_data || - func == bpf_xdp_adjust_tail || -#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) - func == bpf_lwt_seg6_store_bytes || - func == bpf_lwt_seg6_adjust_srh || - func == bpf_lwt_seg6_action || -#endif -#ifdef CONFIG_INET - func == bpf_sock_ops_store_hdr_opt || -#endif - func == bpf_lwt_in_push_encap || - func == bpf_lwt_xmit_push_encap) +bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_clone_redirect: + case BPF_FUNC_l3_csum_replace: + case BPF_FUNC_l4_csum_replace: + case BPF_FUNC_lwt_push_encap: + case BPF_FUNC_lwt_seg6_action: + case BPF_FUNC_lwt_seg6_adjust_srh: + case BPF_FUNC_lwt_seg6_store_bytes: + case 
BPF_FUNC_msg_pop_data: + case BPF_FUNC_msg_pull_data: + case BPF_FUNC_msg_push_data: + case BPF_FUNC_skb_adjust_room: + case BPF_FUNC_skb_change_head: + case BPF_FUNC_skb_change_proto: + case BPF_FUNC_skb_change_tail: + case BPF_FUNC_skb_pull_data: + case BPF_FUNC_skb_store_bytes: + case BPF_FUNC_skb_vlan_pop: + case BPF_FUNC_skb_vlan_push: + case BPF_FUNC_store_hdr_opt: + case BPF_FUNC_xdp_adjust_head: + case BPF_FUNC_xdp_adjust_meta: + case BPF_FUNC_xdp_adjust_tail: return true; - - return false; + default: + return false; + } } const struct bpf_func_proto bpf_event_output_data_proto __weak; -- cgit v1.2.3 From 51081a3f25c742da5a659d7fc6fd77ebfdd555be Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:10:55 -0800 Subject: bpf: track changes_pkt_data property for global functions When processing calls to certain helpers, verifier invalidates all packet pointers in a current state. For example, consider the following program: __attribute__((__noinline__)) long skb_pull_data(struct __sk_buff *sk, __u32 len) { return bpf_skb_pull_data(sk, len); } SEC("tc") int test_invalidate_checks(struct __sk_buff *sk) { int *p = (void *)(long)sk->data; if ((void *)(p + 1) > (void *)(long)sk->data_end) return TCX_DROP; skb_pull_data(sk, 0); *p = 42; return TCX_PASS; } After a call to bpf_skb_pull_data() the pointer 'p' can't be used safely. See function filter.c:bpf_helper_changes_pkt_data() for a list of such helpers. At the moment verifier invalidates packet pointers when processing helper function calls, and does not traverse global sub-programs when processing calls to global sub-programs. This means that calls to helpers done from global sub-programs do not invalidate pointers in the caller state. E.g. the program above is unsafe, but is not rejected by verifier. This commit fixes the omission by computing field bpf_subprog_info->changes_pkt_data for each sub-program before main verification pass. changes_pkt_data should be set if: - subprogram calls helper for which bpf_helper_changes_pkt_data returns true; - subprogram calls a global function, for which bpf_subprog_info->changes_pkt_data should be set. The verifier.c:check_cfg() pass is modified to compute this information. The commit relies on depth first instruction traversal done by check_cfg() and absence of recursive function calls: - check_cfg() would eventually visit every call to subprogram S in a state when S is fully explored; - when S is fully explored: - every direct helper call within S is explored (and thus changes_pkt_data is set if needed); - every call to subprogram S1 called by S was visited with S1 fully explored (and thus S inherits changes_pkt_data from S1). The downside of such approach is that dead code elimination is not taken into account: if a helper call inside global function is dead because of current configuration, verifier would conservatively assume that the call occurs for the purpose of the changes_pkt_data computation. 
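An illustrative sketch (not from the patch; function names are made up) of the propagation this enables: a helper call marks the sub-program that contains it, and every global caller inherits the mark once its callee is fully explored, so packet pointers are invalidated across the whole call chain.

	__attribute__((__noinline__))
	long pull(struct __sk_buff *sk)
	{
		/* direct helper call: 'pull' gets changes_pkt_data set */
		return bpf_skb_pull_data(sk, 0);
	}

	__attribute__((__noinline__))
	long outer(struct __sk_buff *sk)
	{
		/* 'outer' inherits changes_pkt_data from 'pull' */
		return pull(sk);
	}

	SEC("tc")
	int prog(struct __sk_buff *sk)
	{
		int *p = (void *)(long)sk->data;

		if ((void *)(p + 1) > (void *)(long)sk->data_end)
			return TCX_DROP;
		outer(sk);
		*p = 42; /* now rejected: the call invalidated 'p' */
		return TCX_PASS;
	}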
Reported-by: Nick Zavaritsky Closes: https://lore.kernel.org/bpf/0498CA22-5779-4767-9C0C-A9515CEA711F@gmail.com/ Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 32 +++++++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f4290c179bee..48b7b2eeb7e2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -659,6 +659,7 @@ struct bpf_subprog_info { bool args_cached: 1; /* true if bpf_fastcall stack region is used by functions that can't be inlined */ bool keep_fastcall_stack: 1; + bool changes_pkt_data: 1; enum priv_stack_mode priv_stack_mode; u8 arg_cnt; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ad3f6d28e8e4..6a29b68cebd6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10042,6 +10042,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, verbose(env, "Func#%d ('%s') is global and assumed valid.\n", subprog, sub_name); + if (env->subprog_info[subprog].changes_pkt_data) + clear_all_pkt_pointers(env); /* mark global subprog for verifying after main prog */ subprog_aux(env, subprog)->called = true; clear_caller_saved_regs(env, caller->regs); @@ -16246,6 +16248,29 @@ enforce_retval: return 0; } +static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off) +{ + struct bpf_subprog_info *subprog; + + subprog = find_containing_subprog(env, off); + subprog->changes_pkt_data = true; +} + +/* 't' is an index of a call-site. + * 'w' is a callee entry point. + * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED. + * Rely on DFS traversal order and absence of recursive calls to guarantee that + * callee's change_pkt_data marks would be correct at that moment. + */ +static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) +{ + struct bpf_subprog_info *caller, *callee; + + caller = find_containing_subprog(env, t); + callee = find_containing_subprog(env, w); + caller->changes_pkt_data |= callee->changes_pkt_data; +} + /* non-recursive DFS pseudo code * 1 procedure DFS-iterative(G,v): * 2 label v as discovered @@ -16379,6 +16404,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, bool visit_callee) { int ret, insn_sz; + int w; insn_sz = bpf_is_ldimm64(&insns[t]) ? 
2 : 1; ret = push_insn(t, t + insn_sz, FALLTHROUGH, env); @@ -16390,8 +16416,10 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, mark_jmp_point(env, t + insn_sz); if (visit_callee) { + w = t + insns[t].imm + 1; mark_prune_point(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); + merge_callee_effects(env, t, w); + ret = push_insn(t, w, BRANCH, env); } return ret; } @@ -16708,6 +16736,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) mark_prune_point(env, t); mark_jmp_point(env, t); } + if (bpf_helper_call(insn) && bpf_helper_changes_pkt_data(insn->imm)) + mark_subprog_changes_pkt_data(env, t); if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; -- cgit v1.2.3 From 81f6d0530ba031b5f038a091619bf2ff29568852 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 9 Dec 2024 20:10:57 -0800 Subject: bpf: check changes_pkt_data property for extension programs When processing calls to global sub-programs, the verifier decides whether to invalidate all packet pointers in the current state depending on the changes_pkt_data property of the global sub-program. Because of this, an extension program replacing a global sub-program must be compatible with the changes_pkt_data property of the sub-program being replaced. This commit: - adds changes_pkt_data flag to struct bpf_prog_aux: - this flag is set in check_cfg() for the main sub-program; - in jit_subprogs() for other sub-programs; - modifies bpf_check_attach_btf_id() to check changes_pkt_data flag; - moves call to check_attach_btf_id() after the call to check_cfg(), because it needs changes_pkt_data flag to be set: bpf_check: ... ... - check_attach_btf_id resolve_pseudo_ldimm64 resolve_pseudo_ldimm64 --> bpf_prog_is_offloaded bpf_prog_is_offloaded check_cfg check_cfg + check_attach_btf_id ... ... The following fields are set by check_attach_btf_id(): - env->ops - prog->aux->attach_btf_trace - prog->aux->attach_func_name - prog->aux->attach_func_proto - prog->aux->dst_trampoline - prog->aux->mod - prog->aux->saved_dst_attach_type - prog->aux->saved_dst_prog_type - prog->expected_attach_type None of these fields are used by resolve_pseudo_ldimm64() or bpf_prog_offload_verifier_prep() (for netronome and netdevsim drivers), so the reordering is safe. 
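An illustrative sketch of the new restriction (not from the patch; names are made up): an freplace extension that starts modifying packet data while the global function it replaces does not is now rejected at attach time with "Extension program changes packet data, while original does not".

	/* original object: global function that leaves packet data alone */
	__attribute__((__noinline__))
	long get_len(struct __sk_buff *sk)
	{
		return sk->len;
	}

	/* extension object: would invalidate the callers' packet pointers */
	SEC("freplace/get_len")
	long new_get_len(struct __sk_buff *sk)
	{
		bpf_skb_pull_data(sk, 0); /* changes packet data -> incompatible */
		return sk->len;
	}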
Suggested-by: Alexei Starovoitov Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241210041100.1898468-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eaee2a819f4c..fe392d074973 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1527,6 +1527,7 @@ struct bpf_prog_aux { bool is_extended; /* true if extended by freplace program */ bool jits_use_priv_stack; bool priv_stack_requested; + bool changes_pkt_data; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a29b68cebd6..c2e5d0e6e3d0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16872,6 +16872,7 @@ walk_cfg: } } ret = 0; /* cfg looks good */ + env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data; err_free: kvfree(insn_state); @@ -20361,6 +20362,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->num_exentries = num_exentries; func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb; + func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data; if (!i) func[i]->aux->exception_boundary = env->seen_exception; func[i] = bpf_int_jit_compile(func[i]); @@ -22225,6 +22227,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, "Extension programs should be JITed\n"); return -EINVAL; } + if (prog->aux->changes_pkt_data && + !aux->func[subprog]->aux->changes_pkt_data) { + bpf_log(log, + "Extension program changes packet data, while original does not\n"); + return -EINVAL; + } } if (!tgt_prog->jited) { bpf_log(log, "Can attach to only JITed progs\n"); @@ -22690,10 +22698,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; - ret = check_attach_btf_id(env); - if (ret) - goto skip_full_check; - ret = resolve_pseudo_ldimm64(env); if (ret < 0) goto skip_full_check; @@ -22708,6 +22712,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; + ret = check_attach_btf_id(env); + if (ret) + goto skip_full_check; + ret = mark_fastcall_patterns(env); if (ret < 0) goto skip_full_check; -- cgit v1.2.3 From 7d0d673627e20cfa3b21a829a896ce03b58a4f1c Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 10 Dec 2024 20:08:14 +0100 Subject: bpf: Fix theoretical prog_array UAF in __uprobe_perf_func() Currently, the pointer stored in call->prog_array is loaded in __uprobe_perf_func(), with no RCU annotation and no immediately visible RCU protection, so it looks as if the loaded pointer can immediately be dangling. Later, bpf_prog_run_array_uprobe() starts a RCU-trace read-side critical section, but this is too late. It then uses rcu_dereference_check(), but this use of rcu_dereference_check() does not actually dereference anything. Fix it by aligning the semantics to bpf_prog_run_array(): Let the caller provide rcu_read_lock_trace() protection and then load call->prog_array with rcu_dereference_check(). 
This issue seems to be theoretical: I don't know of any way to reach this code without having handle_swbp() further up the stack, which is already holding a rcu_read_lock_trace() lock, so where we take rcu_read_lock_trace() in __uprobe_perf_func()/bpf_prog_run_array_uprobe() doesn't actually have any effect. Fixes: 8c7dcb84e3b7 ("bpf: implement sleepable uprobes by chaining gps") Suggested-by: Andrii Nakryiko Signed-off-by: Jann Horn Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241210-bpf-fix-uprobe-uaf-v4-1-5fc8959b2b74@google.com --- include/linux/bpf.h | 13 +++++-------- kernel/trace/trace_uprobe.c | 6 +++++- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fe392d074973..805040813f5d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2194,26 +2194,25 @@ bpf_prog_run_array(const struct bpf_prog_array *array, * rcu-protected dynamically sized maps. */ static __always_inline u32 -bpf_prog_run_array_uprobe(const struct bpf_prog_array __rcu *array_rcu, +bpf_prog_run_array_uprobe(const struct bpf_prog_array *array, const void *ctx, bpf_prog_run_fn run_prog) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; - const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; u32 ret = 1; might_fault(); + RCU_LOCKDEP_WARN(!rcu_read_lock_trace_held(), "no rcu lock held"); + + if (unlikely(!array)) + return ret; - rcu_read_lock_trace(); migrate_disable(); run_ctx.is_uprobe = true; - array = rcu_dereference_check(array_rcu, rcu_read_lock_trace_held()); - if (unlikely(!array)) - goto out; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); item = &array->items[0]; while ((prog = READ_ONCE(item->prog))) { @@ -2228,9 +2227,7 @@ bpf_prog_run_array_uprobe(const struct bpf_prog_array __rcu *array_rcu, rcu_read_unlock(); } bpf_reset_run_ctx(old_run_ctx); -out: migrate_enable(); - rcu_read_unlock_trace(); return ret; } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index fed382b7881b..4875e7f5de3d 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1402,9 +1402,13 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, #ifdef CONFIG_BPF_EVENTS if (bpf_prog_array_valid(call)) { + const struct bpf_prog_array *array; u32 ret; - ret = bpf_prog_run_array_uprobe(call->prog_array, regs, bpf_prog_run); + rcu_read_lock_trace(); + array = rcu_dereference_check(call->prog_array, rcu_read_lock_trace_held()); + ret = bpf_prog_run_array_uprobe(array, regs, bpf_prog_run); + rcu_read_unlock_trace(); if (!ret) return; } -- cgit v1.2.3 From d2516c3a53705f783bb6868df0f4a2b977898a71 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 10 Dec 2024 15:12:41 +0100 Subject: net, team, bonding: Add netdev_base_features helper Both bonding and team driver have logic to derive the base feature flags before iterating over their slave devices to refine the set via netdev_increment_features(). Add a small helper netdev_base_features() so this can be reused instead of having it open-coded multiple times. 
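An illustrative sketch (not from the patch; the driver and its private structures are made up) of how another fix_features-style callback could reuse the helper instead of open-coding the two flag operations; the bonding and team hunks below show the real conversions.

	static netdev_features_t foo_fix_features(struct net_device *dev,
						  netdev_features_t features)
	{
		struct foo_priv *priv = netdev_priv(dev); /* hypothetical driver */
		netdev_features_t mask = features;
		struct foo_port *port;

		features = netdev_base_features(features);

		list_for_each_entry(port, &priv->ports, list)
			features = netdev_increment_features(features,
							     port->dev->features,
							     mask);
		return features;
	}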
Signed-off-by: Daniel Borkmann Cc: Nikolay Aleksandrov Cc: Ido Schimmel Cc: Jiri Pirko Reviewed-by: Hangbin Liu Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241210141245.327886-1-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_main.c | 4 +--- drivers/net/team/team_core.c | 3 +-- include/linux/netdev_features.h | 7 +++++++ 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 49dd4fe195e5..42c835c60cd8 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1520,9 +1520,7 @@ static netdev_features_t bond_fix_features(struct net_device *dev, struct slave *slave; mask = features; - - features &= ~NETIF_F_ONE_FOR_ALL; - features |= NETIF_F_ALL_FOR_ALL; + features = netdev_base_features(features); bond_for_each_slave(bond, slave, iter) { features = netdev_increment_features(features, diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c index a1b27b69f010..1df062c67640 100644 --- a/drivers/net/team/team_core.c +++ b/drivers/net/team/team_core.c @@ -2011,8 +2011,7 @@ static netdev_features_t team_fix_features(struct net_device *dev, netdev_features_t mask; mask = features; - features &= ~NETIF_F_ONE_FOR_ALL; - features |= NETIF_F_ALL_FOR_ALL; + features = netdev_base_features(features); rcu_read_lock(); list_for_each_entry_rcu(port, &team->port_list, list) { diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 66e7d26b70a4..11be70a7929f 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -253,4 +253,11 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) NETIF_F_GSO_UDP_TUNNEL | \ NETIF_F_GSO_UDP_TUNNEL_CSUM) +static inline netdev_features_t netdev_base_features(netdev_features_t features) +{ + features &= ~NETIF_F_ONE_FOR_ALL; + features |= NETIF_F_ALL_FOR_ALL; + return features; +} + #endif /* _LINUX_NETDEV_FEATURES_H */ -- cgit v1.2.3 From 2f4873f9b5f8a49113045ad91c021347486de323 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 2 Dec 2024 11:57:27 +0000 Subject: block: Make bio_iov_bvec_set() accept pointer to const iov_iter Make bio_iov_bvec_set() accept a pointer to const iov_iter, which means that we can drop the undesirable casting to struct iov_iter pointer in blk_rq_map_user_bvec(). 
Signed-off-by: John Garry Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241202115727.2320401-1-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk-map.c | 2 +- include/linux/bio.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 699a78c85c75..d5bdc31d88d3 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1171,7 +1171,7 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty) } EXPORT_SYMBOL_GPL(__bio_release_pages); -void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) +void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter) { WARN_ON_ONCE(bio->bi_max_vecs); diff --git a/block/blk-map.c b/block/blk-map.c index b5fd1d857461..894009b2d881 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -574,7 +574,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) bio = blk_rq_map_bio_alloc(rq, 0, GFP_KERNEL); if (!bio) return -ENOMEM; - bio_iov_bvec_set(bio, (struct iov_iter *)iter); + bio_iov_bvec_set(bio, iter); /* check that the data layout matches the hardware restrictions */ ret = bio_split_rw_at(bio, lim, &nsegs, max_bytes); diff --git a/include/linux/bio.h b/include/linux/bio.h index 60830a6a5939..7a1b3b1a8fed 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -423,7 +423,7 @@ void __bio_add_page(struct bio *bio, struct page *page, void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off); int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); -void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter); +void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter); void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -- cgit v1.2.3 From b53127db1dbf7f1047cf35c10922d801dcd40324 Mon Sep 17 00:00:00 2001 From: "Vineeth Pillai (Google)" Date: Thu, 12 Dec 2024 22:22:36 -0500 Subject: sched/dlserver: Fix dlserver double enqueue dlserver can get dequeued during a dlserver pick_task due to the delayed dequeue feature, and this can lead to issues with the dlserver logic, as it still thinks that dlserver is on the runqueue. The dlserver throttling and replenish logic gets confused and can lead to double enqueue of dlserver. Double enqueue of dlserver could happen due to a couple of reasons: Case 1 ------ Delayed dequeue feature[1] can cause dlserver to be stopped during a pick initiated by dlserver: __pick_next_task pick_task_dl -> server_pick_task pick_task_fair pick_next_entity (if (sched_delayed)) dequeue_entities dl_server_stop server_pick_task goes ahead with update_curr_dl_se without knowing that dlserver is dequeued and this confuses the logic and may lead to unintended enqueue while the server is stopped. Case 2 ------ A race condition between a task dequeue on one cpu and the same task's enqueue on this cpu by a remote cpu while the lock is released, causing dlserver double enqueue. One cpu would be in schedule() and releasing the RQ-lock: current->state = TASK_INTERRUPTIBLE(); schedule(); deactivate_task() dl_stop_server(); pick_next_task() pick_next_task_fair() sched_balance_newidle() rq_unlock(this_rq) at which point another CPU can take our RQ-lock and do: try_to_wake_up() ttwu_queue() rq_lock() ... 
activate_task() dl_server_start() --> first enqueue wakeup_preempt() := check_preempt_wakeup_fair() update_curr() update_curr_task() if (current->dl_server) dl_server_update() enqueue_dl_entity() --> second enqueue This bug was not apparent as the enqueue in dl_server_start doesn't usually happen because of the defer logic. But as a side effect of the first case (dequeue during dlserver pick), dl_throttled and dl_yield will be set and this causes the time accounting of dlserver to mess up, leading to an enqueue in dl_server_start. Have an explicit flag representing the status of dlserver to avoid the confusion. This is set in dl_server_start and reset in dl_server_stop. Fixes: 63ba8422f876 ("sched/deadline: Introduce deadline servers") Suggested-by: Peter Zijlstra Signed-off-by: "Vineeth Pillai (Google)" Signed-off-by: Peter Zijlstra (Intel) Tested-by: Marcel Ziswiler # ROCK 5B Link: https://lkml.kernel.org/r/20241213032244.877029-1-vineeth@bitbyteword.org --- include/linux/sched.h | 7 +++++++ kernel/sched/deadline.c | 8 ++++++-- kernel/sched/sched.h | 5 +++++ 3 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d380bffee2ef..66b311fbd5d6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -656,6 +656,12 @@ struct sched_dl_entity { * @dl_defer_armed tells if the deferrable server is waiting * for the replenishment timer to activate it. * + * @dl_server_active tells if the dlserver is active(started). * dlserver is started on first cfs enqueue on an idle runqueue * and is stopped when a dequeue results in 0 cfs tasks on the * runqueue. In other words, dlserver is active only when cpu's * runqueue has atleast one cfs task. * + * @dl_defer_running tells if the deferrable server is actually * running, skipping the defer phase. 
*/ @@ -664,6 +670,7 @@ struct sched_dl_entity { unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; unsigned int dl_server : 1; + unsigned int dl_server_active : 1; unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index db47f33cb7d2..d94f2ed6d1f4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1647,6 +1647,7 @@ void dl_server_start(struct sched_dl_entity *dl_se) if (!dl_se->dl_runtime) return; + dl_se->dl_server_active = 1; enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl)) resched_curr(dl_se->rq); @@ -1661,6 +1662,7 @@ void dl_server_stop(struct sched_dl_entity *dl_se) hrtimer_try_to_cancel(&dl_se->dl_timer); dl_se->dl_defer_armed = 0; dl_se->dl_throttled = 0; + dl_se->dl_server_active = 0; } void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, @@ -2421,8 +2423,10 @@ again: if (dl_server(dl_se)) { p = dl_se->server_pick_task(dl_se); if (!p) { - dl_se->dl_yielded = 1; - update_curr_dl_se(rq, dl_se, 0); + if (dl_server_active(dl_se)) { + dl_se->dl_yielded = 1; + update_curr_dl_se(rq, dl_se, 0); + } goto again; } rq->dl_server = dl_se; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1e494af2cd23..c5d67a43fe52 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -398,6 +398,11 @@ extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq extern int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init); +static inline bool dl_server_active(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_server_active; +} + #ifdef CONFIG_CGROUP_SCHED extern struct list_head task_groups; -- cgit v1.2.3 From c00d738e1673ab801e1577e4e3c780ccf88b1a5b Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 13 Dec 2024 14:19:27 -0800 Subject: bpf: Revert "bpf: Mark raw_tp arguments with PTR_MAYBE_NULL" This patch reverts commit cb4158ce8ec8 ("bpf: Mark raw_tp arguments with PTR_MAYBE_NULL"). The patch was well-intended and meant as a stop-gap fix for branch prediction when the pointer may actually be NULL at runtime. Eventually, it was supposed to be replaced by an automated script or compiler pass detecting possibly NULL arguments and marking them accordingly. However, it caused two main issues observed for production programs and failed to preserve backwards compatibility. First, programs relied on the verifier not exploring the == NULL branch when the pointer is not NULL, thus they started failing with a 'dereference of scalar' error. Next, allowing raw_tp arguments to be modified surfaced the warning in the verifier that warns against reg->off when PTR_MAYBE_NULL is set. More information, context, and discussion on both problems is available in [0]. Overall, this approach had several shortcomings, and the fixes would further complicate the verifier's logic, and the entire masking scheme would have to be removed eventually anyway. Hence, revert the patch in preparation for a better fix that avoids these issues. 
[0]: https://lore.kernel.org/bpf/20241206161053.809580-1-memxor@gmail.com Reported-by: Manu Bretelle Fixes: cb4158ce8ec8 ("bpf: Mark raw_tp arguments with PTR_MAYBE_NULL") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241213221929.3495062-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 -- kernel/bpf/btf.c | 5 +- kernel/bpf/verifier.c | 79 ++-------------------- .../selftests/bpf/progs/test_tp_btf_nullable.c | 6 +- 4 files changed, 9 insertions(+), 87 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 805040813f5d..6e63dd3443b9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3514,10 +3514,4 @@ static inline bool bpf_is_subprog(const struct bpf_prog *prog) return prog->aux->func_idx != 0; } -static inline bool bpf_prog_is_raw_tp(const struct bpf_prog *prog) -{ - return prog->type == BPF_PROG_TYPE_TRACING && - prog->expected_attach_type == BPF_TRACE_RAW_TP; -} - #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a63a03582f02..c4aa304028ce 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6594,10 +6594,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (prog_args_trusted(prog)) info->reg_type |= PTR_TRUSTED; - /* Raw tracepoint arguments always get marked as maybe NULL */ - if (bpf_prog_is_raw_tp(prog)) - info->reg_type |= PTR_MAYBE_NULL; - else if (btf_param_match_suffix(btf, &args[arg], "__nullable")) + if (btf_param_match_suffix(btf, &args[arg], "__nullable")) info->reg_type |= PTR_MAYBE_NULL; if (tgt_prog) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5e541339b2f6..f7f892a52a37 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -420,25 +420,6 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) return rec; } -static bool mask_raw_tp_reg_cond(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - return reg->type == (PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL) && - bpf_prog_is_raw_tp(env->prog) && !reg->ref_obj_id; -} - -static bool mask_raw_tp_reg(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - if (!mask_raw_tp_reg_cond(env, reg)) - return false; - reg->type &= ~PTR_MAYBE_NULL; - return true; -} - -static void unmask_raw_tp_reg(struct bpf_reg_state *reg, bool result) -{ - if (result) - reg->type |= PTR_MAYBE_NULL; -} - static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog) { struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux; @@ -6801,7 +6782,6 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, const char *field_name = NULL; enum bpf_type_flag flag = 0; u32 btf_id = 0; - bool mask; int ret; if (!env->allow_ptr_leaks) { @@ -6873,21 +6853,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (ret < 0) return ret; - /* For raw_tp progs, we allow dereference of PTR_MAYBE_NULL - * trusted PTR_TO_BTF_ID, these are the ones that are possibly - * arguments to the raw_tp. Since internal checks in for trusted - * reg in check_ptr_to_btf_access would consider PTR_MAYBE_NULL - * modifier as problematic, mask it out temporarily for the - * check. Don't apply this to pointers with ref_obj_id > 0, as - * those won't be raw_tp args. 
- * - * We may end up applying this relaxation to other trusted - * PTR_TO_BTF_ID with maybe null flag, since we cannot - * distinguish PTR_MAYBE_NULL tagged for arguments vs normal - * tagging, but that should expand allowed behavior, and not - * cause regression for existing behavior. - */ - mask = mask_raw_tp_reg(env, reg); + if (ret != PTR_TO_BTF_ID) { /* just mark; */ @@ -6948,13 +6914,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, clear_trusted_flags(&flag); } - if (atype == BPF_READ && value_regno >= 0) { + if (atype == BPF_READ && value_regno >= 0) mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); - /* We've assigned a new type to regno, so don't undo masking. */ - if (regno == value_regno) - mask = false; - } - unmask_raw_tp_reg(reg, mask); return 0; } @@ -7329,7 +7290,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (base_type(reg->type) == PTR_TO_BTF_ID && - (mask_raw_tp_reg_cond(env, reg) || !type_may_be_null(reg->type))) { + !type_may_be_null(reg->type)) { err = check_ptr_to_btf_access(env, regs, regno, off, size, t, value_regno); } else if (reg->type == CONST_PTR_TO_MAP) { @@ -9032,7 +8993,6 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, enum bpf_reg_type type = reg->type; u32 *arg_btf_id = NULL; int err = 0; - bool mask; if (arg_type == ARG_DONTCARE) return 0; @@ -9073,11 +9033,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK) arg_btf_id = fn->arg_btf_id[arg]; - mask = mask_raw_tp_reg(env, reg); err = check_reg_type(env, regno, arg_type, arg_btf_id, meta); + if (err) + return err; - err = err ?: check_func_arg_reg_off(env, reg, regno, arg_type); - unmask_raw_tp_reg(reg, mask); + err = check_func_arg_reg_off(env, reg, regno, arg_type); if (err) return err; @@ -9872,17 +9832,14 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, return ret; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { struct bpf_call_arg_meta meta; - bool mask; int err; if (register_is_null(reg) && type_may_be_null(arg->arg_type)) continue; memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */ - mask = mask_raw_tp_reg(env, reg); err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta); err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type); - unmask_raw_tp_reg(reg, mask); if (err) return err; } else { @@ -12205,7 +12162,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ enum bpf_arg_type arg_type = ARG_DONTCARE; u32 regno = i + 1, ref_id, type_size; bool is_ret_buf_sz = false; - bool mask = false; int kf_arg_type; t = btf_type_skip_modifiers(btf, args[i].type, NULL); @@ -12264,15 +12220,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } - mask = mask_raw_tp_reg(env, reg); if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) && (register_is_null(reg) || type_may_be_null(reg->type)) && !is_kfunc_arg_nullable(meta->btf, &args[i])) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); - unmask_raw_tp_reg(reg, mask); return -EACCES; } - unmask_raw_tp_reg(reg, mask); if (reg->ref_obj_id) { if (is_kfunc_release(meta) && meta->ref_obj_id) { @@ -12330,24 +12283,16 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if 
(!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) break; - /* Allow passing maybe NULL raw_tp arguments to - * kfuncs for compatibility. Don't apply this to - * arguments with ref_obj_id > 0. - */ - mask = mask_raw_tp_reg(env, reg); if (!is_trusted_reg(reg)) { if (!is_kfunc_rcu(meta)) { verbose(env, "R%d must be referenced or trusted\n", regno); - unmask_raw_tp_reg(reg, mask); return -EINVAL; } if (!is_rcu_reg(reg)) { verbose(env, "R%d must be a rcu pointer\n", regno); - unmask_raw_tp_reg(reg, mask); return -EINVAL; } } - unmask_raw_tp_reg(reg, mask); fallthrough; case KF_ARG_PTR_TO_CTX: case KF_ARG_PTR_TO_DYNPTR: @@ -12370,9 +12315,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_release(meta) && reg->ref_obj_id) arg_type |= OBJ_RELEASE; - mask = mask_raw_tp_reg(env, reg); ret = check_func_arg_reg_off(env, reg, regno, arg_type); - unmask_raw_tp_reg(reg, mask); if (ret < 0) return ret; @@ -12549,7 +12492,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ ref_tname = btf_name_by_offset(btf, ref_t->name_off); fallthrough; case KF_ARG_PTR_TO_BTF_ID: - mask = mask_raw_tp_reg(env, reg); /* Only base_type is checked, further checks are done here */ if ((base_type(reg->type) != PTR_TO_BTF_ID || (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) && @@ -12558,11 +12500,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "expected %s or socket\n", reg_type_str(env, base_type(reg->type) | (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS))); - unmask_raw_tp_reg(reg, mask); return -EINVAL; } ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i); - unmask_raw_tp_reg(reg, mask); if (ret < 0) return ret; break; @@ -13535,7 +13475,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, */ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn, - struct bpf_reg_state *ptr_reg, + const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { struct bpf_verifier_state *vstate = env->cur_state; @@ -13549,7 +13489,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; - bool mask; int ret; dst_reg = ®s[dst]; @@ -13576,14 +13515,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } - mask = mask_raw_tp_reg(env, ptr_reg); if (ptr_reg->type & PTR_MAYBE_NULL) { verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", dst, reg_type_str(env, ptr_reg->type)); - unmask_raw_tp_reg(ptr_reg, mask); return -EACCES; } - unmask_raw_tp_reg(ptr_reg, mask); switch (base_type(ptr_reg->type)) { case PTR_TO_CTX: @@ -20126,7 +20062,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) * for this case. 
*/ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: - case PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL: if (type == BPF_READ) { if (BPF_MODE(insn->code) == BPF_MEM) insn->code = BPF_LDX | BPF_PROBE_MEM | diff --git a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c index 5aaf2b065f86..bba3e37f749b 100644 --- a/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c +++ b/tools/testing/selftests/bpf/progs/test_tp_btf_nullable.c @@ -7,11 +7,7 @@ #include "bpf_misc.h" SEC("tp_btf/bpf_testmod_test_nullable_bare") -/* This used to be a failure test, but raw_tp nullable arguments can now - * directly be dereferenced, whether they have nullable annotation or not, - * and don't need to be explicitly checked. - */ -__success +__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'") int BPF_PROG(handle_tp_btf_nullable_bare1, struct bpf_testmod_test_read_ctx *nullable_ctx) { return nullable_ctx->len; -- cgit v1.2.3