From a6d1ce5951185ee91bbe6909fe2758f3625561b0 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 11 Oct 2022 00:33:58 +0000 Subject: cgroup: add cgroup_v1v2_get_from_[fd/file]() Add cgroup_v1v2_get_from_fd() and cgroup_v1v2_get_from_file() that support both cgroup1 and cgroup2. Signed-off-by: Yosry Ahmed Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 398f0bce7c21..a88de5bdeaa9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -106,6 +106,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup *cgroup_get_from_path(const char *path); struct cgroup *cgroup_get_from_fd(int fd); +struct cgroup *cgroup_v1v2_get_from_fd(int fd); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); -- cgit v1.2.3 From a2550d3ce53c68f54042bc5e468c4d07491ffe0e Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Wed, 12 Oct 2022 19:18:36 +0200 Subject: net: dsa: qca8k: fix inband mgmt for big-endian systems The header and the data of the skb for the inband mgmt requires to be in little-endian. This is problematic for big-endian system as the mgmt header is written in the cpu byte order. Fix this by converting each value for the mgmt header and data to little-endian, and convert to cpu byte order the mgmt header and data sent by the switch. Fixes: 5950c7c0a68c ("net: dsa: qca8k: add support for mgmt read/write in Ethernet packet") Tested-by: Pawel Dembicki Tested-by: Lech Perczak Signed-off-by: Christian Marangi Reviewed-by: Lech Perczak Signed-off-by: David S. Miller --- drivers/net/dsa/qca/qca8k-8xxx.c | 63 ++++++++++++++++++++++++++++++---------- include/linux/dsa/tag_qca.h | 6 ++-- 2 files changed, 50 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/dsa/qca/qca8k-8xxx.c b/drivers/net/dsa/qca/qca8k-8xxx.c index 5669c92c93f7..644338ca0510 100644 --- a/drivers/net/dsa/qca/qca8k-8xxx.c +++ b/drivers/net/dsa/qca/qca8k-8xxx.c @@ -137,27 +137,42 @@ static void qca8k_rw_reg_ack_handler(struct dsa_switch *ds, struct sk_buff *skb) struct qca8k_mgmt_eth_data *mgmt_eth_data; struct qca8k_priv *priv = ds->priv; struct qca_mgmt_ethhdr *mgmt_ethhdr; + u32 command; u8 len, cmd; + int i; mgmt_ethhdr = (struct qca_mgmt_ethhdr *)skb_mac_header(skb); mgmt_eth_data = &priv->mgmt_eth_data; - cmd = FIELD_GET(QCA_HDR_MGMT_CMD, mgmt_ethhdr->command); - len = FIELD_GET(QCA_HDR_MGMT_LENGTH, mgmt_ethhdr->command); + command = get_unaligned_le32(&mgmt_ethhdr->command); + cmd = FIELD_GET(QCA_HDR_MGMT_CMD, command); + len = FIELD_GET(QCA_HDR_MGMT_LENGTH, command); /* Make sure the seq match the requested packet */ - if (mgmt_ethhdr->seq == mgmt_eth_data->seq) + if (get_unaligned_le32(&mgmt_ethhdr->seq) == mgmt_eth_data->seq) mgmt_eth_data->ack = true; if (cmd == MDIO_READ) { - mgmt_eth_data->data[0] = mgmt_ethhdr->mdio_data; + u32 *val = mgmt_eth_data->data; + + *val = get_unaligned_le32(&mgmt_ethhdr->mdio_data); /* Get the rest of the 12 byte of data. * The read/write function will extract the requested data. */ - if (len > QCA_HDR_MGMT_DATA1_LEN) - memcpy(mgmt_eth_data->data + 1, skb->data, - QCA_HDR_MGMT_DATA2_LEN); + if (len > QCA_HDR_MGMT_DATA1_LEN) { + __le32 *data2 = (__le32 *)skb->data; + int data_len = min_t(int, QCA_HDR_MGMT_DATA2_LEN, + len - QCA_HDR_MGMT_DATA1_LEN); + + val++; + + for (i = sizeof(u32); i <= data_len; i += sizeof(u32)) { + *val = get_unaligned_le32(data2); + val++; + data2++; + } + } } complete(&mgmt_eth_data->rw_done); @@ -169,8 +184,10 @@ static struct sk_buff *qca8k_alloc_mdio_header(enum mdio_cmd cmd, u32 reg, u32 * struct qca_mgmt_ethhdr *mgmt_ethhdr; unsigned int real_len; struct sk_buff *skb; - u32 *data2; + __le32 *data2; + u32 command; u16 hdr; + int i; skb = dev_alloc_skb(QCA_HDR_MGMT_PKT_LEN); if (!skb) @@ -199,20 +216,32 @@ static struct sk_buff *qca8k_alloc_mdio_header(enum mdio_cmd cmd, u32 reg, u32 * hdr |= FIELD_PREP(QCA_HDR_XMIT_DP_BIT, BIT(0)); hdr |= FIELD_PREP(QCA_HDR_XMIT_CONTROL, QCA_HDR_XMIT_TYPE_RW_REG); - mgmt_ethhdr->command = FIELD_PREP(QCA_HDR_MGMT_ADDR, reg); - mgmt_ethhdr->command |= FIELD_PREP(QCA_HDR_MGMT_LENGTH, real_len); - mgmt_ethhdr->command |= FIELD_PREP(QCA_HDR_MGMT_CMD, cmd); - mgmt_ethhdr->command |= FIELD_PREP(QCA_HDR_MGMT_CHECK_CODE, + command = FIELD_PREP(QCA_HDR_MGMT_ADDR, reg); + command |= FIELD_PREP(QCA_HDR_MGMT_LENGTH, real_len); + command |= FIELD_PREP(QCA_HDR_MGMT_CMD, cmd); + command |= FIELD_PREP(QCA_HDR_MGMT_CHECK_CODE, QCA_HDR_MGMT_CHECK_CODE_VAL); + put_unaligned_le32(command, &mgmt_ethhdr->command); + if (cmd == MDIO_WRITE) - mgmt_ethhdr->mdio_data = *val; + put_unaligned_le32(*val, &mgmt_ethhdr->mdio_data); mgmt_ethhdr->hdr = htons(hdr); data2 = skb_put_zero(skb, QCA_HDR_MGMT_DATA2_LEN + QCA_HDR_MGMT_PADDING_LEN); - if (cmd == MDIO_WRITE && len > QCA_HDR_MGMT_DATA1_LEN) - memcpy(data2, val + 1, len - QCA_HDR_MGMT_DATA1_LEN); + if (cmd == MDIO_WRITE && len > QCA_HDR_MGMT_DATA1_LEN) { + int data_len = min_t(int, QCA_HDR_MGMT_DATA2_LEN, + len - QCA_HDR_MGMT_DATA1_LEN); + + val++; + + for (i = sizeof(u32); i <= data_len; i += sizeof(u32)) { + put_unaligned_le32(*val, data2); + data2++; + val++; + } + } return skb; } @@ -220,9 +249,11 @@ static struct sk_buff *qca8k_alloc_mdio_header(enum mdio_cmd cmd, u32 reg, u32 * static void qca8k_mdio_header_fill_seq_num(struct sk_buff *skb, u32 seq_num) { struct qca_mgmt_ethhdr *mgmt_ethhdr; + u32 seq; + seq = FIELD_PREP(QCA_HDR_MGMT_SEQ_NUM, seq_num); mgmt_ethhdr = (struct qca_mgmt_ethhdr *)skb->data; - mgmt_ethhdr->seq = FIELD_PREP(QCA_HDR_MGMT_SEQ_NUM, seq_num); + put_unaligned_le32(seq, &mgmt_ethhdr->seq); } static int qca8k_read_eth(struct qca8k_priv *priv, u32 reg, u32 *val, int len) diff --git a/include/linux/dsa/tag_qca.h b/include/linux/dsa/tag_qca.h index 50be7cbd93a5..0e176da1e43f 100644 --- a/include/linux/dsa/tag_qca.h +++ b/include/linux/dsa/tag_qca.h @@ -61,9 +61,9 @@ struct sk_buff; /* Special struct emulating a Ethernet header */ struct qca_mgmt_ethhdr { - u32 command; /* command bit 31:0 */ - u32 seq; /* seq 63:32 */ - u32 mdio_data; /* first 4byte mdio */ + __le32 command; /* command bit 31:0 */ + __le32 seq; /* seq 63:32 */ + __le32 mdio_data; /* first 4byte mdio */ __be16 hdr; /* qca hdr */ } __packed; -- cgit v1.2.3 From 0d4636f7d72df3179b20a2d32b647881917a5e2a Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Wed, 12 Oct 2022 19:18:37 +0200 Subject: net: dsa: qca8k: fix ethtool autocast mib for big-endian systems The switch sends autocast mib in little-endian. This is problematic for big-endian system as the values needs to be converted. Fix this by converting each mib value to cpu byte order. Fixes: 5c957c7ca78c ("net: dsa: qca8k: add support for mib autocast in Ethernet packet") Tested-by: Pawel Dembicki Tested-by: Lech Perczak Signed-off-by: Christian Marangi Signed-off-by: David S. Miller --- drivers/net/dsa/qca/qca8k-8xxx.c | 20 ++++++++------------ include/linux/dsa/tag_qca.h | 2 +- 2 files changed, 9 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/dsa/qca/qca8k-8xxx.c b/drivers/net/dsa/qca/qca8k-8xxx.c index 644338ca0510..c5c3b4e92f28 100644 --- a/drivers/net/dsa/qca/qca8k-8xxx.c +++ b/drivers/net/dsa/qca/qca8k-8xxx.c @@ -1518,9 +1518,9 @@ static void qca8k_mib_autocast_handler(struct dsa_switch *ds, struct sk_buff *sk struct qca8k_priv *priv = ds->priv; const struct qca8k_mib_desc *mib; struct mib_ethhdr *mib_ethhdr; - int i, mib_len, offset = 0; - u64 *data; + __le32 *data2; u8 port; + int i; mib_ethhdr = (struct mib_ethhdr *)skb_mac_header(skb); mib_eth_data = &priv->mib_eth_data; @@ -1532,28 +1532,24 @@ static void qca8k_mib_autocast_handler(struct dsa_switch *ds, struct sk_buff *sk if (port != mib_eth_data->req_port) goto exit; - data = mib_eth_data->data; + data2 = (__le32 *)skb->data; for (i = 0; i < priv->info->mib_count; i++) { mib = &ar8327_mib[i]; /* First 3 mib are present in the skb head */ if (i < 3) { - data[i] = mib_ethhdr->data[i]; + mib_eth_data->data[i] = get_unaligned_le32(mib_ethhdr->data + i); continue; } - mib_len = sizeof(uint32_t); - /* Some mib are 64 bit wide */ if (mib->size == 2) - mib_len = sizeof(uint64_t); - - /* Copy the mib value from packet to the */ - memcpy(data + i, skb->data + offset, mib_len); + mib_eth_data->data[i] = get_unaligned_le64((__le64 *)data2); + else + mib_eth_data->data[i] = get_unaligned_le32(data2); - /* Set the offset for the next mib */ - offset += mib_len; + data2 += mib->size; } exit: diff --git a/include/linux/dsa/tag_qca.h b/include/linux/dsa/tag_qca.h index 0e176da1e43f..b1b5720d89a5 100644 --- a/include/linux/dsa/tag_qca.h +++ b/include/linux/dsa/tag_qca.h @@ -73,7 +73,7 @@ enum mdio_cmd { }; struct mib_ethhdr { - u32 data[3]; /* first 3 mib counter */ + __le32 data[3]; /* first 3 mib counter */ __be16 hdr; /* qca hdr */ } __packed; -- cgit v1.2.3 From fc8695eb11f07d936a4a9dbd15d7797986bc8b89 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 14 Oct 2022 09:07:46 -0700 Subject: Revert "net: fix cpu_max_bits_warn() usage in netif_attrmask_next{,_and}" This reverts commit 854701ba4c39afae2362ba19a580c461cb183e4f. We have more violations around, which leads to: WARNING: CPU: 2 PID: 1 at include/linux/cpumask.h:110 __netif_set_xps_queue+0x14e/0x770 Let's back this out and retry with a larger clean up in -next. Fixes: 854701ba4c39 ("net: fix cpu_max_bits_warn() usage in netif_attrmask_next{,_and}") Link: https://lore.kernel.org/all/20221014030459.3272206-2-guoren@kernel.org/ Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a36edb0ec199..eddf8ee270e7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3663,8 +3663,9 @@ static inline bool netif_attr_test_online(unsigned long j, static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, unsigned int nr_bits) { - /* n is a prior cpu */ - cpu_max_bits_warn(n + 1, nr_bits); + /* -1 is a legal arg here. */ + if (n != -1) + cpu_max_bits_warn(n, nr_bits); if (srcp) return find_next_bit(srcp, nr_bits, n + 1); @@ -3685,8 +3686,9 @@ static inline int netif_attrmask_next_and(int n, const unsigned long *src1p, const unsigned long *src2p, unsigned int nr_bits) { - /* n is a prior cpu */ - cpu_max_bits_warn(n + 1, nr_bits); + /* -1 is a legal arg here. */ + if (n != -1) + cpu_max_bits_warn(n, nr_bits); if (src1p && src2p) return find_next_and_bit(src1p, src2p, nr_bits, n + 1); -- cgit v1.2.3 From 96de900ae78e7dbedc937fd91bafe2934579c65a Mon Sep 17 00:00:00 2001 From: Shenwei Wang Date: Fri, 14 Oct 2022 09:47:28 -0500 Subject: net: phylink: add mac_managed_pm in phylink_config structure The recent commit 'commit 744d23c71af3 ("net: phy: Warn about incorrect mdio_bus_phy_resume() state")' requires the MAC driver explicitly tell the phy driver who is managing the PM, otherwise you will see warning during resume stage. Add a boolean property in the phylink_config structure so that the MAC driver can use it to tell the PHY driver if it wants to manage the PM. Fixes: 744d23c71af3 ("net: phy: Warn about incorrect mdio_bus_phy_resume() state") Signed-off-by: Shenwei Wang Acked-by: Florian Fainelli Reviewed-by: Russell King (Oracle) Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 3 +++ include/linux/phylink.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 75464df191ef..6547b6cc6cbe 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1661,6 +1661,9 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy, if (phy_interrupt_is_valid(phy)) phy_request_interrupt(phy); + if (pl->config->mac_managed_pm) + phy->mac_managed_pm = true; + return 0; } diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 664dd409feb9..3f01ac8017e0 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -122,6 +122,7 @@ enum phylink_op_type { * (See commit 7cceb599d15d ("net: phylink: avoid mac_config calls") * @poll_fixed_state: if true, starts link_poll, * if MAC link is at %MLO_AN_FIXED mode. + * @mac_managed_pm: if true, indicate the MAC driver is responsible for PHY PM. * @ovr_an_inband: if true, override PCS to MLO_AN_INBAND * @get_fixed_state: callback to execute to determine the fixed link state, * if MAC link is at %MLO_AN_FIXED mode. @@ -134,6 +135,7 @@ struct phylink_config { enum phylink_op_type type; bool legacy_pre_march2020; bool poll_fixed_state; + bool mac_managed_pm; bool ovr_an_inband; void (*get_fixed_state)(struct phylink_config *config, struct phylink_link_state *state); -- cgit v1.2.3 From 25b72d530e7aa185955196b63f53c38f751f1632 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 12 Oct 2022 12:18:54 -0700 Subject: fbdev: MIPS supports iomem addresses Add MIPS to fb_* helpers list for iomem addresses. This silences Sparse warnings about lacking __iomem address space casts: drivers/video/fbdev/pvr2fb.c:800:9: sparse: sparse: incorrect type in argument 1 (different address spaces) drivers/video/fbdev/pvr2fb.c:800:9: sparse: expected void const * drivers/video/fbdev/pvr2fb.c:800:9: sparse: got char [noderef] __iomem *screen_base Reported-by: kernel test robot Link: https://lore.kernel.org/lkml/202210100209.tR2Iqbqk-lkp@intel.com/ Cc: Helge Deller Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Signed-off-by: Kees Cook Signed-off-by: Helge Deller --- include/linux/fb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index 0aff76bcbb00..bcb8658f5b64 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -555,7 +555,7 @@ static inline struct apertures_struct *alloc_apertures(unsigned int max_num) { #elif defined(__i386__) || defined(__alpha__) || defined(__x86_64__) || \ defined(__hppa__) || defined(__sh__) || defined(__powerpc__) || \ - defined(__arm__) || defined(__aarch64__) + defined(__arm__) || defined(__aarch64__) || defined(__mips__) #define fb_readb __raw_readb #define fb_readw __raw_readw -- cgit v1.2.3 From 472a1482325b3a285e0bcf82c0b0edc689b7e8cd Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Sun, 2 Oct 2022 08:04:19 -0400 Subject: counter: Reduce DEFINE_COUNTER_ARRAY_POLARITY() to defining counter_array A spare warning was reported for drivers/counter/ti-ecap-capture.c:: sparse warnings: (new ones prefixed by >>) >> drivers/counter/ti-ecap-capture.c:380:8: sparse: sparse: symbol 'ecap_cnt_pol_array' was not declared. Should it be static? vim +/ecap_cnt_pol_array +380 drivers/counter/ti-ecap-capture.c 379 > 380 static DEFINE_COUNTER_ARRAY_POLARITY(ecap_cnt_pol_array, ecap_cnt_pol_avail, ECAP_NB_CEVT); 381 The first argument to the DEFINE_COUNTER_ARRAY_POLARITY() macro is a token serving as the symbol name in the definition of a new struct counter_array structure. However, this macro actually expands to two statements:: #define DEFINE_COUNTER_ARRAY_POLARITY(_name, _enums, _length) \ DEFINE_COUNTER_AVAILABLE(_name##_available, _enums); \ struct counter_array _name = { \ .type = COUNTER_COMP_SIGNAL_POLARITY, \ .avail = &(_name##_available), \ .length = (_length), \ } Because of this, the "static" on line 380 only applies to the first statement. This patch splits out the DEFINE_COUNTER_AVAILABLE() line and leaves DEFINE_COUNTER_ARRAY_POLARITY() as a simple structure definition to avoid issues like this. Reported-by: kernel test robot Link: https://lore.kernel.org/all/202210020619.NQbyomII-lkp@intel.com/ Cc: Julien Panis Signed-off-by: William Breathitt Gray --- drivers/counter/ti-ecap-capture.c | 3 ++- include/linux/counter.h | 5 ++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/counter/ti-ecap-capture.c b/drivers/counter/ti-ecap-capture.c index af10de30aba5..b8dbf0212a8e 100644 --- a/drivers/counter/ti-ecap-capture.c +++ b/drivers/counter/ti-ecap-capture.c @@ -377,7 +377,8 @@ static const enum counter_signal_polarity ecap_cnt_pol_avail[] = { COUNTER_SIGNAL_POLARITY_NEGATIVE, }; -static DEFINE_COUNTER_ARRAY_POLARITY(ecap_cnt_pol_array, ecap_cnt_pol_avail, ECAP_NB_CEVT); +static DEFINE_COUNTER_AVAILABLE(ecap_cnt_pol_available, ecap_cnt_pol_avail); +static DEFINE_COUNTER_ARRAY_POLARITY(ecap_cnt_pol_array, ecap_cnt_pol_available, ECAP_NB_CEVT); static struct counter_comp ecap_cnt_signal_ext[] = { COUNTER_COMP_ARRAY_POLARITY(ecap_cnt_pol_read, ecap_cnt_pol_write, ecap_cnt_pol_array), diff --git a/include/linux/counter.h b/include/linux/counter.h index c41fa602ed28..b63746637de2 100644 --- a/include/linux/counter.h +++ b/include/linux/counter.h @@ -542,11 +542,10 @@ struct counter_array { #define DEFINE_COUNTER_ARRAY_CAPTURE(_name, _length) \ DEFINE_COUNTER_ARRAY_U64(_name, _length) -#define DEFINE_COUNTER_ARRAY_POLARITY(_name, _enums, _length) \ - DEFINE_COUNTER_AVAILABLE(_name##_available, _enums); \ +#define DEFINE_COUNTER_ARRAY_POLARITY(_name, _available, _length) \ struct counter_array _name = { \ .type = COUNTER_COMP_SIGNAL_POLARITY, \ - .avail = &(_name##_available), \ + .avail = &(_available), \ .length = (_length), \ } -- cgit v1.2.3 From ca6c21327c6af02b7eec31ce4b9a740a18c6c13f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Oct 2022 15:00:39 +0200 Subject: perf: Fix missing SIGTRAPs Marco reported: Due to the implementation of how SIGTRAP are delivered if perf_event_attr::sigtrap is set, we've noticed 3 issues: 1. Missing SIGTRAP due to a race with event_sched_out() (more details below). 2. Hardware PMU events being disabled due to returning 1 from perf_event_overflow(). The only way to re-enable the event is for user space to first "properly" disable the event and then re-enable it. 3. The inability to automatically disable an event after a specified number of overflows via PERF_EVENT_IOC_REFRESH. The worst of the 3 issues is problem (1), which occurs when a pending_disable is "consumed" by a racing event_sched_out(), observed as follows: CPU0 | CPU1 --------------------------------+--------------------------- __perf_event_overflow() | perf_event_disable_inatomic() | pending_disable = CPU0 | ... | _perf_event_enable() | event_function_call() | task_function_call() | /* sends IPI to CPU0 */ | ... __perf_event_enable() +--------------------------- ctx_resched() task_ctx_sched_out() ctx_sched_out() group_sched_out() event_sched_out() pending_disable = -1 perf_pending_event() perf_pending_event_disable() /* Fails to send SIGTRAP because no pending_disable! */ In the above case, not only is that particular SIGTRAP missed, but also all future SIGTRAPs because 'event_limit' is not reset back to 1. To fix, rework pending delivery of SIGTRAP via IRQ-work by introduction of a separate 'pending_sigtrap', no longer using 'event_limit' and 'pending_disable' for its delivery. Additionally; and different to Marco's proposed patch: - recognise that pending_disable effectively duplicates oncpu for the case where it is set. As such, change the irq_work handler to use ->oncpu to target the event and use pending_* as boolean toggles. - observe that SIGTRAP targets the ctx->task, so the context switch optimization that carries contexts between tasks is invalid. If the irq_work were delayed enough to hit after a context switch the SIGTRAP would be delivered to the wrong task. - observe that if the event gets scheduled out (rotation/migration/context-switch/...) the irq-work would be insufficient to deliver the SIGTRAP when the event gets scheduled back in (the irq-work might still be pending on the old CPU). Therefore have event_sched_out() convert the pending sigtrap into a task_work which will deliver the signal at return_to_user. Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events") Reported-by: Dmitry Vyukov Debugged-by: Dmitry Vyukov Reported-by: Marco Elver Debugged-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Marco Elver Tested-by: Marco Elver --- include/linux/perf_event.h | 19 ++++-- kernel/events/core.c | 151 +++++++++++++++++++++++++++++++++----------- kernel/events/ring_buffer.c | 2 +- 3 files changed, 129 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 853f64b6c8c2..0031f7b4d9ab 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -756,11 +756,14 @@ struct perf_event { struct fasync_struct *fasync; /* delayed work for NMIs and such */ - int pending_wakeup; - int pending_kill; - int pending_disable; + unsigned int pending_wakeup; + unsigned int pending_kill; + unsigned int pending_disable; + unsigned int pending_sigtrap; unsigned long pending_addr; /* SIGTRAP */ - struct irq_work pending; + struct irq_work pending_irq; + struct callback_head pending_task; + unsigned int pending_work; atomic_t event_limit; @@ -877,6 +880,14 @@ struct perf_event_context { #endif void *task_ctx_data; /* pmu specific data */ struct rcu_head rcu_head; + + /* + * Sum (event->pending_sigtrap + event->pending_work) + * + * The SIGTRAP is targeted at ctx->task, as such it won't do changing + * that until the signal is delivered. + */ + local_t nr_pending; }; /* diff --git a/kernel/events/core.c b/kernel/events/core.c index aefc1e08e015..01933db7629c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -54,6 +54,7 @@ #include #include #include +#include #include "internal.h" @@ -2276,11 +2277,26 @@ event_sched_out(struct perf_event *event, event->pmu->del(event, 0); event->oncpu = -1; - if (READ_ONCE(event->pending_disable) >= 0) { - WRITE_ONCE(event->pending_disable, -1); + if (event->pending_disable) { + event->pending_disable = 0; perf_cgroup_event_disable(event, ctx); state = PERF_EVENT_STATE_OFF; } + + if (event->pending_sigtrap) { + bool dec = true; + + event->pending_sigtrap = 0; + if (state != PERF_EVENT_STATE_OFF && + !event->pending_work) { + event->pending_work = 1; + dec = false; + task_work_add(current, &event->pending_task, TWA_RESUME); + } + if (dec) + local_dec(&event->ctx->nr_pending); + } + perf_event_set_state(event, state); if (!is_software_event(event)) @@ -2432,7 +2448,7 @@ static void __perf_event_disable(struct perf_event *event, * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * - * When called from perf_pending_event it's OK because event->ctx + * When called from perf_pending_irq it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ @@ -2471,9 +2487,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { - WRITE_ONCE(event->pending_disable, smp_processor_id()); - /* can fail, see perf_pending_event_disable() */ - irq_work_queue(&event->pending); + event->pending_disable = 1; + irq_work_queue(&event->pending_irq); } #define MAX_INTERRUPTS (~0ULL) @@ -3428,11 +3443,23 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { + perf_pmu_disable(pmu); + + /* PMIs are disabled; ctx->nr_pending is stable. */ + if (local_read(&ctx->nr_pending) || + local_read(&next_ctx->nr_pending)) { + /* + * Must not swap out ctx when there's pending + * events that rely on the ctx->task relation. + */ + raw_spin_unlock(&next_ctx->lock); + rcu_read_unlock(); + goto inside_switch; + } + WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); - perf_pmu_disable(pmu); - if (cpuctx->sched_cb_usage && pmu->sched_task) pmu->sched_task(ctx, false); @@ -3473,6 +3500,7 @@ unlock: raw_spin_lock(&ctx->lock); perf_pmu_disable(pmu); +inside_switch: if (cpuctx->sched_cb_usage && pmu->sched_task) pmu->sched_task(ctx, false); task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); @@ -4939,7 +4967,7 @@ static void perf_addr_filters_splice(struct perf_event *event, static void _free_event(struct perf_event *event) { - irq_work_sync(&event->pending); + irq_work_sync(&event->pending_irq); unaccount_event(event); @@ -6439,7 +6467,8 @@ static void perf_sigtrap(struct perf_event *event) return; /* - * perf_pending_event() can race with the task exiting. + * Both perf_pending_task() and perf_pending_irq() can race with the + * task exiting. */ if (current->flags & PF_EXITING) return; @@ -6448,23 +6477,33 @@ static void perf_sigtrap(struct perf_event *event) event->attr.type, event->attr.sig_data); } -static void perf_pending_event_disable(struct perf_event *event) +/* + * Deliver the pending work in-event-context or follow the context. + */ +static void __perf_pending_irq(struct perf_event *event) { - int cpu = READ_ONCE(event->pending_disable); + int cpu = READ_ONCE(event->oncpu); + /* + * If the event isn't running; we done. event_sched_out() will have + * taken care of things. + */ if (cpu < 0) return; + /* + * Yay, we hit home and are in the context of the event. + */ if (cpu == smp_processor_id()) { - WRITE_ONCE(event->pending_disable, -1); - - if (event->attr.sigtrap) { + if (event->pending_sigtrap) { + event->pending_sigtrap = 0; perf_sigtrap(event); - atomic_set_release(&event->event_limit, 1); /* rearm event */ - return; + local_dec(&event->ctx->nr_pending); + } + if (event->pending_disable) { + event->pending_disable = 0; + perf_event_disable_local(event); } - - perf_event_disable_local(event); return; } @@ -6484,35 +6523,62 @@ static void perf_pending_event_disable(struct perf_event *event) * irq_work_queue(); // FAILS * * irq_work_run() - * perf_pending_event() + * perf_pending_irq() * * But the event runs on CPU-B and wants disabling there. */ - irq_work_queue_on(&event->pending, cpu); + irq_work_queue_on(&event->pending_irq, cpu); } -static void perf_pending_event(struct irq_work *entry) +static void perf_pending_irq(struct irq_work *entry) { - struct perf_event *event = container_of(entry, struct perf_event, pending); + struct perf_event *event = container_of(entry, struct perf_event, pending_irq); int rctx; - rctx = perf_swevent_get_recursion_context(); /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ + rctx = perf_swevent_get_recursion_context(); - perf_pending_event_disable(event); - + /* + * The wakeup isn't bound to the context of the event -- it can happen + * irrespective of where the event is. + */ if (event->pending_wakeup) { event->pending_wakeup = 0; perf_event_wakeup(event); } + __perf_pending_irq(event); + if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } +static void perf_pending_task(struct callback_head *head) +{ + struct perf_event *event = container_of(head, struct perf_event, pending_task); + int rctx; + + /* + * If we 'fail' here, that's OK, it means recursion is already disabled + * and we won't recurse 'further'. + */ + preempt_disable_notrace(); + rctx = perf_swevent_get_recursion_context(); + + if (event->pending_work) { + event->pending_work = 0; + perf_sigtrap(event); + local_dec(&event->ctx->nr_pending); + } + + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); + preempt_enable_notrace(); +} + #ifdef CONFIG_GUEST_PERF_EVENTS struct perf_guest_info_callbacks __rcu *perf_guest_cbs; @@ -9212,8 +9278,8 @@ int perf_event_account_interrupt(struct perf_event *event) */ static int __perf_event_overflow(struct perf_event *event, - int throttle, struct perf_sample_data *data, - struct pt_regs *regs) + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) { int events = atomic_read(&event->event_limit); int ret = 0; @@ -9236,24 +9302,36 @@ static int __perf_event_overflow(struct perf_event *event, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - event->pending_addr = data->addr; - perf_event_disable_inatomic(event); } + if (event->attr.sigtrap) { + /* + * Should not be able to return to user space without processing + * pending_sigtrap (kernel events can overflow multiple times). + */ + WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel); + if (!event->pending_sigtrap) { + event->pending_sigtrap = 1; + local_inc(&event->ctx->nr_pending); + } + event->pending_addr = data->addr; + irq_work_queue(&event->pending_irq); + } + READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; - irq_work_queue(&event->pending); + irq_work_queue(&event->pending_irq); } return ret; } int perf_event_overflow(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) + struct perf_sample_data *data, + struct pt_regs *regs) { return __perf_event_overflow(event, 1, data, regs); } @@ -11570,8 +11648,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, init_waitqueue_head(&event->waitq); - event->pending_disable = -1; - init_irq_work(&event->pending, perf_pending_event); + init_irq_work(&event->pending_irq, perf_pending_irq); + init_task_work(&event->pending_task, perf_pending_task); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); @@ -11593,9 +11671,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (parent_event) event->event_caps = parent_event->event_caps; - if (event->attr.sigtrap) - atomic_set(&event->event_limit, 1); - if (task) { event->attach_state = PERF_ATTACH_TASK; /* diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 726132039c38..273a0fe7910a 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -22,7 +22,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) atomic_set(&handle->rb->poll, EPOLLIN); handle->event->pending_wakeup = 1; - irq_work_queue(&handle->event->pending); + irq_work_queue(&handle->event->pending_irq); } /* -- cgit v1.2.3 From ccd30a476f8e864732de220bd50e6f372f5ebcab Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 11 Oct 2022 14:38:38 -0700 Subject: fscrypt: fix keyring memory leak on mount failure Commit d7e7b9af104c ("fscrypt: stop using keyrings subsystem for fscrypt_master_key") moved the keyring destruction from __put_super() to generic_shutdown_super() so that the filesystem's block device(s) are still available. Unfortunately, this causes a memory leak in the case where a mount is attempted with the test_dummy_encryption mount option, but the mount fails after the option has already been processed. To fix this, attempt the keyring destruction in both places. Reported-by: syzbot+104c2a89561289cec13e@syzkaller.appspotmail.com Fixes: d7e7b9af104c ("fscrypt: stop using keyrings subsystem for fscrypt_master_key") Signed-off-by: Eric Biggers Reviewed-by: Christian Brauner (Microsoft) Link: https://lore.kernel.org/r/20221011213838.209879-1-ebiggers@kernel.org --- fs/crypto/keyring.c | 17 +++++++++++------ fs/super.c | 3 ++- include/linux/fscrypt.h | 4 ++-- 3 files changed, 15 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 1cca09aa43f8..2a24b1f0ae68 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -205,14 +205,19 @@ static int allocate_filesystem_keyring(struct super_block *sb) } /* - * This is called at unmount time to release all encryption keys that have been - * added to the filesystem, along with the keyring that contains them. + * Release all encryption keys that have been added to the filesystem, along + * with the keyring that contains them. * - * Note that besides clearing and freeing memory, this might need to evict keys - * from the keyslots of an inline crypto engine. Therefore, this must be called - * while the filesystem's underlying block device(s) are still available. + * This is called at unmount time. The filesystem's underlying block device(s) + * are still available at this time; this is important because after user file + * accesses have been allowed, this function may need to evict keys from the + * keyslots of an inline crypto engine, which requires the block device(s). + * + * This is also called when the super_block is being freed. This is needed to + * avoid a memory leak if mounting fails after the "test_dummy_encryption" + * option was processed, as in that case the unmount-time call isn't made. */ -void fscrypt_sb_delete(struct super_block *sb) +void fscrypt_destroy_keyring(struct super_block *sb) { struct fscrypt_keyring *keyring = sb->s_master_keys; size_t i; diff --git a/fs/super.c b/fs/super.c index 6a82660e1adb..8d39e4f11cfa 100644 --- a/fs/super.c +++ b/fs/super.c @@ -291,6 +291,7 @@ static void __put_super(struct super_block *s) WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); security_sb_free(s); + fscrypt_destroy_keyring(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); call_rcu(&s->rcu, destroy_super_rcu); @@ -479,7 +480,7 @@ void generic_shutdown_super(struct super_block *sb) evict_inodes(sb); /* only nonzero refcount inodes can have marks */ fsnotify_sb_delete(sb); - fscrypt_sb_delete(sb); + fscrypt_destroy_keyring(sb); security_sb_delete(sb); if (sb->s_dio_done_wq) { diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index cad78b569c7e..4f5f8a651213 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -307,7 +307,7 @@ fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) } /* keyring.c */ -void fscrypt_sb_delete(struct super_block *sb); +void fscrypt_destroy_keyring(struct super_block *sb); int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); int fscrypt_add_test_dummy_key(struct super_block *sb, const struct fscrypt_dummy_policy *dummy_policy); @@ -521,7 +521,7 @@ fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) } /* keyring.c */ -static inline void fscrypt_sb_delete(struct super_block *sb) +static inline void fscrypt_destroy_keyring(struct super_block *sb) { } -- cgit v1.2.3 From dbe69b29988465b011f198f2797b1c2b6980b50e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 18 Oct 2022 09:59:34 +0200 Subject: bpf: Fix dispatcher patchable function entry to 5 bytes nop The patchable_function_entry(5) might output 5 single nop instructions (depends on toolchain), which will clash with bpf_arch_text_poke check for 5 bytes nop instruction. Adding early init call for dispatcher that checks and change the patchable entry into expected 5 nop instruction if needed. There's no need to take text_mutex, because we are using it in early init call which is called at pre-smp time. Fixes: ceea991a019c ("bpf: Move bpf_dispatcher function out of ftrace locations") Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20221018075934.574415-1-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 13 +++++++++++++ include/linux/bpf.h | 14 +++++++++++++- kernel/bpf/dispatcher.c | 6 ++++++ 3 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 99620428ad78..00127abd89ee 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -388,6 +389,18 @@ out: return ret; } +int __init bpf_arch_init_dispatcher_early(void *ip) +{ + const u8 *nop_insn = x86_nops[5]; + + if (is_endbr(*(u32 *)ip)) + ip += ENDBR_INSN_SIZE; + + if (memcmp(ip, nop_insn, X86_PATCH_SIZE)) + text_poke_early(ip, nop_insn, X86_PATCH_SIZE); + return 0; +} + int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *old_addr, void *new_addr) { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9e7d46d16032..0566705c1d4e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -27,6 +27,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -970,6 +971,8 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs); +int __init bpf_arch_init_dispatcher_early(void *ip); + #define BPF_DISPATCHER_INIT(_name) { \ .mutex = __MUTEX_INITIALIZER(_name.mutex), \ .func = &_name##_func, \ @@ -983,6 +986,13 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func }, \ } +#define BPF_DISPATCHER_INIT_CALL(_name) \ + static int __init _name##_init(void) \ + { \ + return bpf_arch_init_dispatcher_early(_name##_func); \ + } \ + early_initcall(_name##_init) + #ifdef CONFIG_X86_64 #define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5))) #else @@ -1000,7 +1010,9 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func } \ EXPORT_SYMBOL(bpf_dispatcher_##name##_func); \ struct bpf_dispatcher bpf_dispatcher_##name = \ - BPF_DISPATCHER_INIT(bpf_dispatcher_##name); + BPF_DISPATCHER_INIT(bpf_dispatcher_##name); \ + BPF_DISPATCHER_INIT_CALL(bpf_dispatcher_##name); + #define DECLARE_BPF_DISPATCHER(name) \ unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index fa64b80b8bca..04f0a045dcaa 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -4,6 +4,7 @@ #include #include #include +#include /* The BPF dispatcher is a multiway branch code generator. The * dispatcher is a mechanism to avoid the performance penalty of an @@ -90,6 +91,11 @@ int __weak arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int n return -ENOTSUPP; } +int __weak __init bpf_arch_init_dispatcher_early(void *ip) +{ + return -ENOTSUPP; +} + static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *buf) { s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0]; -- cgit v1.2.3 From 0251d0107cfb0bb5ab2d3f97710487b9522db020 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 19 Oct 2022 08:44:44 +0800 Subject: iommu: Add gfp parameter to iommu_alloc_resv_region Add gfp parameter to iommu_alloc_resv_region() for the callers to specify the memory allocation behavior. Thus iommu_alloc_resv_region() could also be available in critical contexts. Signed-off-by: Lu Baolu Tested-by: Alex Williamson Link: https://lore.kernel.org/r/20220927053109.4053662-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/acpi/arm64/iort.c | 3 ++- drivers/iommu/amd/iommu.c | 7 ++++--- drivers/iommu/apple-dart.c | 2 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 2 +- drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 +- drivers/iommu/intel/iommu.c | 8 +++++--- drivers/iommu/iommu.c | 7 ++++--- drivers/iommu/mtk_iommu.c | 3 ++- drivers/iommu/virtio-iommu.c | 9 ++++++--- include/linux/iommu.h | 2 +- 10 files changed, 27 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index ca2aed86b540..8059baf4ef27 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1142,7 +1142,8 @@ static void iort_iommu_msi_get_resv_regions(struct device *dev, struct iommu_resv_region *region; region = iommu_alloc_resv_region(base + SZ_64K, SZ_64K, - prot, IOMMU_RESV_MSI); + prot, IOMMU_RESV_MSI, + GFP_KERNEL); if (region) list_add_tail(®ion->list, head); } diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 65856e401949..d3b39d0416fa 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2330,7 +2330,8 @@ static void amd_iommu_get_resv_regions(struct device *dev, type = IOMMU_RESV_RESERVED; region = iommu_alloc_resv_region(entry->address_start, - length, prot, type); + length, prot, type, + GFP_KERNEL); if (!region) { dev_err(dev, "Out of memory allocating dm-regions\n"); return; @@ -2340,14 +2341,14 @@ static void amd_iommu_get_resv_regions(struct device *dev, region = iommu_alloc_resv_region(MSI_RANGE_START, MSI_RANGE_END - MSI_RANGE_START + 1, - 0, IOMMU_RESV_MSI); + 0, IOMMU_RESV_MSI, GFP_KERNEL); if (!region) return; list_add_tail(®ion->list, head); region = iommu_alloc_resv_region(HT_RANGE_START, HT_RANGE_END - HT_RANGE_START + 1, - 0, IOMMU_RESV_RESERVED); + 0, IOMMU_RESV_RESERVED, GFP_KERNEL); if (!region) return; list_add_tail(®ion->list, head); diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index 4526575b999e..4f4a323be0d0 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -758,7 +758,7 @@ static void apple_dart_get_resv_regions(struct device *dev, region = iommu_alloc_resv_region(DOORBELL_ADDR, PAGE_SIZE, prot, - IOMMU_RESV_MSI); + IOMMU_RESV_MSI, GFP_KERNEL); if (!region) return; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index ba47c73f5b8c..6d5df91c5c46 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2757,7 +2757,7 @@ static void arm_smmu_get_resv_regions(struct device *dev, int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO; region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH, - prot, IOMMU_RESV_SW_MSI); + prot, IOMMU_RESV_SW_MSI, GFP_KERNEL); if (!region) return; diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 6c1114a4d6cc..30dab1418e3f 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1534,7 +1534,7 @@ static void arm_smmu_get_resv_regions(struct device *dev, int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO; region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH, - prot, IOMMU_RESV_SW_MSI); + prot, IOMMU_RESV_SW_MSI, GFP_KERNEL); if (!region) return; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index a8b36c3fddf1..d5965b4f8b60 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4552,7 +4552,8 @@ static void intel_iommu_get_resv_regions(struct device *device, IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT; resv = iommu_alloc_resv_region(rmrr->base_address, - length, prot, type); + length, prot, type, + GFP_KERNEL); if (!resv) break; @@ -4567,7 +4568,8 @@ static void intel_iommu_get_resv_regions(struct device *device, if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) { reg = iommu_alloc_resv_region(0, 1UL << 24, prot, - IOMMU_RESV_DIRECT_RELAXABLE); + IOMMU_RESV_DIRECT_RELAXABLE, + GFP_KERNEL); if (reg) list_add_tail(®->list, head); } @@ -4576,7 +4578,7 @@ static void intel_iommu_get_resv_regions(struct device *device, reg = iommu_alloc_resv_region(IOAPIC_RANGE_START, IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1, - 0, IOMMU_RESV_MSI); + 0, IOMMU_RESV_MSI, GFP_KERNEL); if (!reg) return; list_add_tail(®->list, head); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 4893c2429ca5..65a3b3d886dc 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -504,7 +504,7 @@ static int iommu_insert_resv_region(struct iommu_resv_region *new, LIST_HEAD(stack); nr = iommu_alloc_resv_region(new->start, new->length, - new->prot, new->type); + new->prot, new->type, GFP_KERNEL); if (!nr) return -ENOMEM; @@ -2579,11 +2579,12 @@ EXPORT_SYMBOL(iommu_put_resv_regions); struct iommu_resv_region *iommu_alloc_resv_region(phys_addr_t start, size_t length, int prot, - enum iommu_resv_type type) + enum iommu_resv_type type, + gfp_t gfp) { struct iommu_resv_region *region; - region = kzalloc(sizeof(*region), GFP_KERNEL); + region = kzalloc(sizeof(*region), gfp); if (!region) return NULL; diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 5a4e00e4bbbc..2ab2ecfe01f8 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -917,7 +917,8 @@ static void mtk_iommu_get_resv_regions(struct device *dev, continue; region = iommu_alloc_resv_region(resv->iova_base, resv->size, - prot, IOMMU_RESV_RESERVED); + prot, IOMMU_RESV_RESERVED, + GFP_KERNEL); if (!region) return; diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index b7c22802f57c..8b1b5c270e50 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -490,11 +490,13 @@ static int viommu_add_resv_mem(struct viommu_endpoint *vdev, fallthrough; case VIRTIO_IOMMU_RESV_MEM_T_RESERVED: region = iommu_alloc_resv_region(start, size, 0, - IOMMU_RESV_RESERVED); + IOMMU_RESV_RESERVED, + GFP_KERNEL); break; case VIRTIO_IOMMU_RESV_MEM_T_MSI: region = iommu_alloc_resv_region(start, size, prot, - IOMMU_RESV_MSI); + IOMMU_RESV_MSI, + GFP_KERNEL); break; } if (!region) @@ -909,7 +911,8 @@ static void viommu_get_resv_regions(struct device *dev, struct list_head *head) */ if (!msi) { msi = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH, - prot, IOMMU_RESV_SW_MSI); + prot, IOMMU_RESV_SW_MSI, + GFP_KERNEL); if (!msi) return; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a325532aeab5..3c9da1f8979e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -455,7 +455,7 @@ extern void iommu_set_default_translated(bool cmd_line); extern bool iommu_default_passthrough(void); extern struct iommu_resv_region * iommu_alloc_resv_region(phys_addr_t start, size_t length, int prot, - enum iommu_resv_type type); + enum iommu_resv_type type, gfp_t gfp); extern int iommu_get_group_resv_regions(struct iommu_group *group, struct list_head *head); -- cgit v1.2.3 From 8a254d90a77580244ec57e82bca7eb65656cc167 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 19 Oct 2022 23:29:58 +0200 Subject: efi: efivars: Fix variable writes without query_variable_store() Commit bbc6d2c6ef22 ("efi: vars: Switch to new wrapper layer") refactored the efivars layer so that the 'business logic' related to which UEFI variables affect the boot flow in which way could be moved out of it, and into the efivarfs driver. This inadvertently broke setting variables on firmware implementations that lack the QueryVariableInfo() boot service, because we no longer tolerate a EFI_UNSUPPORTED result from check_var_size() when calling efivar_entry_set_get_size(), which now ends up calling check_var_size() a second time inadvertently. If QueryVariableInfo() is missing, we support writes of up to 64k - let's move that logic into check_var_size(), and drop the redundant call. Cc: # v6.0 Fixes: bbc6d2c6ef22 ("efi: vars: Switch to new wrapper layer") Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/vars.c | 10 +++++----- fs/efivarfs/vars.c | 16 ---------------- include/linux/efi.h | 3 --- 3 files changed, 5 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c index dd74d2ad3184..433b61587139 100644 --- a/drivers/firmware/efi/vars.c +++ b/drivers/firmware/efi/vars.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -20,19 +21,19 @@ static struct efivars *__efivars; static DEFINE_SEMAPHORE(efivars_lock); -efi_status_t check_var_size(u32 attributes, unsigned long size) +static efi_status_t check_var_size(u32 attributes, unsigned long size) { const struct efivar_operations *fops; fops = __efivars->ops; if (!fops->query_variable_store) - return EFI_UNSUPPORTED; + return (size <= SZ_64K) ? EFI_SUCCESS : EFI_OUT_OF_RESOURCES; return fops->query_variable_store(attributes, size, false); } -EXPORT_SYMBOL_NS_GPL(check_var_size, EFIVAR); +static efi_status_t check_var_size_nonblocking(u32 attributes, unsigned long size) { const struct efivar_operations *fops; @@ -40,11 +41,10 @@ efi_status_t check_var_size_nonblocking(u32 attributes, unsigned long size) fops = __efivars->ops; if (!fops->query_variable_store) - return EFI_UNSUPPORTED; + return (size <= SZ_64K) ? EFI_SUCCESS : EFI_OUT_OF_RESOURCES; return fops->query_variable_store(attributes, size, true); } -EXPORT_SYMBOL_NS_GPL(check_var_size_nonblocking, EFIVAR); /** * efivars_kobject - get the kobject for the registered efivars diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c index a0ef63cfcecb..9e4f47808bd5 100644 --- a/fs/efivarfs/vars.c +++ b/fs/efivarfs/vars.c @@ -651,22 +651,6 @@ int efivar_entry_set_get_size(struct efivar_entry *entry, u32 attributes, if (err) return err; - /* - * Ensure that the available space hasn't shrunk below the safe level - */ - status = check_var_size(attributes, *size + ucs2_strsize(name, 1024)); - if (status != EFI_SUCCESS) { - if (status != EFI_UNSUPPORTED) { - err = efi_status_to_err(status); - goto out; - } - - if (*size > 65536) { - err = -ENOSPC; - goto out; - } - } - status = efivar_set_variable_locked(name, vendor, attributes, *size, data, false); if (status != EFI_SUCCESS) { diff --git a/include/linux/efi.h b/include/linux/efi.h index da3974bf05d3..80f3c1c7827d 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1085,9 +1085,6 @@ efi_status_t efivar_set_variable_locked(efi_char16_t *name, efi_guid_t *vendor, efi_status_t efivar_set_variable(efi_char16_t *name, efi_guid_t *vendor, u32 attr, unsigned long data_size, void *data); -efi_status_t check_var_size(u32 attributes, unsigned long size); -efi_status_t check_var_size_nonblocking(u32 attributes, unsigned long size); - #if IS_ENABLED(CONFIG_EFI_CAPSULE_LOADER) extern bool efi_capsule_pending(int *reset_type); -- cgit v1.2.3 From ed51862f2f57cbce6fed2d4278cfe70a490899fd Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 17 Oct 2022 20:45:39 +0200 Subject: kvm: Add support for arch compat vm ioctls We will introduce the first architecture specific compat vm ioctl in the next patch. Add all necessary boilerplate to allow architectures to override compat vm ioctls when necessary. Signed-off-by: Alexander Graf Message-Id: <20221017184541.2658-2-graf@amazon.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 11 +++++++++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 32f259fa5801..00c3448ba7f8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1390,6 +1390,8 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap); long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg); int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e30f1b4ecfa5..1376a47fedee 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4839,6 +4839,12 @@ struct compat_kvm_clear_dirty_log { }; }; +long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + return -ENOTTY; +} + static long kvm_vm_compat_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4847,6 +4853,11 @@ static long kvm_vm_compat_ioctl(struct file *filp, if (kvm->mm != current->mm || kvm->vm_dead) return -EIO; + + r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg); + if (r != -ENOTTY) + return r; + switch (ioctl) { #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT case KVM_CLEAR_DIRTY_LOG: { -- cgit v1.2.3 From e993ffe3da4bcddea0536b03be1031bf35cd8d85 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 21 Oct 2022 11:16:39 +0100 Subject: net: flag sockets supporting msghdr originated zerocopy We need an efficient way in io_uring to check whether a socket supports zerocopy with msghdr provided ubuf_info. Add a new flag into the struct socket flags fields. Cc: # 6.0 Signed-off-by: Pavel Begunkov Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/3dafafab822b1c66308bb58a0ac738b1e3f53f74.1666346426.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/net.h | 1 + net/ipv4/tcp.c | 1 + net/ipv4/udp.c | 1 + 3 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 711c3593c3b8..18d942bbdf6e 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -41,6 +41,7 @@ struct net; #define SOCK_NOSPACE 2 #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 +#define SOCK_SUPPORT_ZC 5 #ifndef ARCH_HAS_SOCKET_TYPES /** diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f8232811a5be..ef14efa1fb70 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -457,6 +457,7 @@ void tcp_init_sock(struct sock *sk) WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); + set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); sk_sockets_allocated_inc(sk); } EXPORT_SYMBOL(tcp_init_sock); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 662d717d5123..1c646797cc79 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1624,6 +1624,7 @@ int udp_init_sock(struct sock *sk) { skb_queue_head_init(&udp_sk(sk)->reader_queue); sk->sk_destruct = udp_destruct_sock; + set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); return 0; } -- cgit v1.2.3 From 52826d3b2d1d8e1180a84bef7d72596d6a024a38 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 23 Oct 2022 12:01:01 -0700 Subject: kernel/utsname_sysctl.c: Fix hostname polling Commit bfca3dd3d068 ("kernel/utsname_sysctl.c: print kernel arch") added a new entry to the uts_kern_table[] array, but didn't update the UTS_PROC_xyz enumerators of older entries, breaking anything that used them. Which is admittedly not many cases: it's really just the two uses of uts_proc_notify() in kernel/sys.c. But apparently journald-systemd actually uses this to detect hostname changes. Reported-by: Torsten Hilbrich Fixes: bfca3dd3d068 ("kernel/utsname_sysctl.c: print kernel arch") Link: https://lore.kernel.org/lkml/0c2b92a6-0f25-9538-178f-eee3b06da23f@secunet.com/ Link: https://linux-regtracking.leemhuis.info/regzbot/regression/0c2b92a6-0f25-9538-178f-eee3b06da23f@secunet.com/ Cc: Petr Vorel Signed-off-by: Linus Torvalds --- include/linux/utsname.h | 1 + kernel/utsname_sysctl.c | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 2b1737c9b244..bf7613ba412b 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -10,6 +10,7 @@ #include enum uts_proc { + UTS_PROC_ARCH, UTS_PROC_OSTYPE, UTS_PROC_OSRELEASE, UTS_PROC_VERSION, diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 064072c16e3d..f50398cb790d 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -74,6 +74,7 @@ static int proc_do_uts_string(struct ctl_table *table, int write, static DEFINE_CTL_TABLE_POLL(hostname_poll); static DEFINE_CTL_TABLE_POLL(domainname_poll); +// Note: update 'enum uts_proc' to match any changes to this table static struct ctl_table uts_kern_table[] = { { .procname = "arch", -- cgit v1.2.3 From 161a438d730dade2ba2b1bf8785f0759aba4ca5f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 20 Oct 2022 10:39:08 +0200 Subject: efi: random: reduce seed size to 32 bytes We no longer need at least 64 bytes of random seed to permit the early crng init to complete. The RNG is now based on Blake2s, so reduce the EFI seed size to the Blake2s hash size, which is sufficient for our purposes. While at it, drop the READ_ONCE(), which was supposed to prevent size from being evaluated after seed was unmapped. However, this cannot actually happen, so READ_ONCE() is unnecessary here. Cc: # v4.14+ Signed-off-by: Ard Biesheuvel Reviewed-by: Jason A. Donenfeld Acked-by: Ilias Apalodimas --- drivers/firmware/efi/efi.c | 2 +- include/linux/efi.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 3ecdc43a3f2b..a46df5d1d094 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -611,7 +611,7 @@ int __init efi_config_parse_tables(const efi_config_table_t *config_tables, seed = early_memremap(efi_rng_seed, sizeof(*seed)); if (seed != NULL) { - size = READ_ONCE(seed->size); + size = min(seed->size, EFI_RANDOM_SEED_SIZE); early_memunmap(seed, sizeof(*seed)); } else { pr_err("Could not map UEFI random seed!\n"); diff --git a/include/linux/efi.h b/include/linux/efi.h index 80f3c1c7827d..929d559ad41d 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1222,7 +1222,7 @@ efi_status_t efi_random_get_seed(void); arch_efi_call_virt_teardown(); \ }) -#define EFI_RANDOM_SEED_SIZE 64U +#define EFI_RANDOM_SEED_SIZE 32U // BLAKE2S_HASH_SIZE struct linux_efi_random_seed { u32 size; -- cgit v1.2.3 From 31970608a6d3796c3adbfbfd379fa3092de65c5d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 26 Sep 2022 12:45:32 -0700 Subject: overflow: Fix kern-doc markup for functions Fix the kern-doc markings for several of the overflow helpers and move their location into the core kernel API documentation, where it belongs (it's not driver-specific). Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: linux-hardening@vger.kernel.org Reviewed-by: Akira Yokosawa Signed-off-by: Kees Cook --- Documentation/core-api/kernel-api.rst | 6 ++++++ Documentation/driver-api/basics.rst | 3 --- include/linux/overflow.h | 38 +++++++++++++++-------------------- 3 files changed, 22 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 0793c400d4b0..06f4ab122697 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -118,6 +118,12 @@ Text Searching CRC and Math Functions in Linux =============================== +Arithmetic Overflow Checking +---------------------------- + +.. kernel-doc:: include/linux/overflow.h + :internal: + CRC Functions ------------- diff --git a/Documentation/driver-api/basics.rst b/Documentation/driver-api/basics.rst index 3e2dae954898..4b4d8e28d3be 100644 --- a/Documentation/driver-api/basics.rst +++ b/Documentation/driver-api/basics.rst @@ -107,9 +107,6 @@ Kernel utility functions .. kernel-doc:: kernel/panic.c :export: -.. kernel-doc:: include/linux/overflow.h - :internal: - Device Resource Management -------------------------- diff --git a/include/linux/overflow.h b/include/linux/overflow.h index 19dfdd74835e..1d3be1a2204c 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -51,8 +51,8 @@ static inline bool __must_check __must_check_overflow(bool overflow) return unlikely(overflow); } -/** check_add_overflow() - Calculate addition with overflow checking - * +/** + * check_add_overflow() - Calculate addition with overflow checking * @a: first addend * @b: second addend * @d: pointer to store sum @@ -66,8 +66,8 @@ static inline bool __must_check __must_check_overflow(bool overflow) #define check_add_overflow(a, b, d) \ __must_check_overflow(__builtin_add_overflow(a, b, d)) -/** check_sub_overflow() - Calculate subtraction with overflow checking - * +/** + * check_sub_overflow() - Calculate subtraction with overflow checking * @a: minuend; value to subtract from * @b: subtrahend; value to subtract from @a * @d: pointer to store difference @@ -81,8 +81,8 @@ static inline bool __must_check __must_check_overflow(bool overflow) #define check_sub_overflow(a, b, d) \ __must_check_overflow(__builtin_sub_overflow(a, b, d)) -/** check_mul_overflow() - Calculate multiplication with overflow checking - * +/** + * check_mul_overflow() - Calculate multiplication with overflow checking * @a: first factor * @b: second factor * @d: pointer to store product @@ -96,23 +96,24 @@ static inline bool __must_check __must_check_overflow(bool overflow) #define check_mul_overflow(a, b, d) \ __must_check_overflow(__builtin_mul_overflow(a, b, d)) -/** check_shl_overflow() - Calculate a left-shifted value and check overflow - * +/** + * check_shl_overflow() - Calculate a left-shifted value and check overflow * @a: Value to be shifted * @s: How many bits left to shift * @d: Pointer to where to store the result * * Computes *@d = (@a << @s) * - * Returns true if '*d' cannot hold the result or when 'a << s' doesn't + * Returns true if '*@d' cannot hold the result or when '@a << @s' doesn't * make sense. Example conditions: - * - 'a << s' causes bits to be lost when stored in *d. - * - 's' is garbage (e.g. negative) or so large that the result of - * 'a << s' is guaranteed to be 0. - * - 'a' is negative. - * - 'a << s' sets the sign bit, if any, in '*d'. * - * '*d' will hold the results of the attempted shift, but is not + * - '@a << @s' causes bits to be lost when stored in *@d. + * - '@s' is garbage (e.g. negative) or so large that the result of + * '@a << @s' is guaranteed to be 0. + * - '@a' is negative. + * - '@a << @s' sets the sign bit, if any, in '*@d'. + * + * '*@d' will hold the results of the attempted shift, but is not * considered "safe for use" if true is returned. */ #define check_shl_overflow(a, s, d) __must_check_overflow(({ \ @@ -129,7 +130,6 @@ static inline bool __must_check __must_check_overflow(bool overflow) /** * size_mul() - Calculate size_t multiplication with saturation at SIZE_MAX - * * @factor1: first factor * @factor2: second factor * @@ -149,7 +149,6 @@ static inline size_t __must_check size_mul(size_t factor1, size_t factor2) /** * size_add() - Calculate size_t addition with saturation at SIZE_MAX - * * @addend1: first addend * @addend2: second addend * @@ -169,7 +168,6 @@ static inline size_t __must_check size_add(size_t addend1, size_t addend2) /** * size_sub() - Calculate size_t subtraction with saturation at SIZE_MAX - * * @minuend: value to subtract from * @subtrahend: value to subtract from @minuend * @@ -192,7 +190,6 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) /** * array_size() - Calculate size of 2-dimensional array. - * * @a: dimension one * @b: dimension two * @@ -205,7 +202,6 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) /** * array3_size() - Calculate size of 3-dimensional array. - * * @a: dimension one * @b: dimension two * @c: dimension three @@ -220,7 +216,6 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) /** * flex_array_size() - Calculate size of a flexible array member * within an enclosing structure. - * * @p: Pointer to the structure. * @member: Name of the flexible array member. * @count: Number of elements in the array. @@ -237,7 +232,6 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) /** * struct_size() - Calculate size of structure with trailing flexible array. - * * @p: Pointer to the structure. * @member: Name of the array member. * @count: Number of elements in the array. -- cgit v1.2.3 From 52491a38b2c2411f3f0229dc6ad610349c704a41 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Oct 2022 21:12:19 +0000 Subject: KVM: Initialize gfn_to_pfn_cache locks in dedicated helper Move the gfn_to_pfn_cache lock initialization to another helper and call the new helper during VM/vCPU creation. There are race conditions possible due to kvm_gfn_to_pfn_cache_init()'s ability to re-initialize the cache's locks. For example: a race between ioctl(KVM_XEN_HVM_EVTCHN_SEND) and kvm_gfn_to_pfn_cache_init() leads to a corrupted shinfo gpc lock. (thread 1) | (thread 2) | kvm_xen_set_evtchn_fast | read_lock_irqsave(&gpc->lock, ...) | | kvm_gfn_to_pfn_cache_init | rwlock_init(&gpc->lock) read_unlock_irqrestore(&gpc->lock, ...) | Rename "cache_init" and "cache_destroy" to activate+deactivate to avoid implying that the cache really is destroyed/freed. Note, there more races in the newly named kvm_gpc_activate() that will be addressed separately. Fixes: 982ed0de4753 ("KVM: Reinstate gfn_to_pfn_cache with invalidation support") Cc: stable@vger.kernel.org Suggested-by: Sean Christopherson Signed-off-by: Michal Luczaj [sean: call out that this is a bug fix] Signed-off-by: Sean Christopherson Message-Id: <20221013211234.1318131-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 12 +++++----- arch/x86/kvm/xen.c | 57 +++++++++++++++++++++++++----------------------- include/linux/kvm_host.h | 24 ++++++++++++++------ virt/kvm/pfncache.c | 21 ++++++++++-------- 4 files changed, 66 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 104b72df33d6..521b433f978c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2315,11 +2315,11 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, /* we verify if the enable bit is set... */ if (system_time & 1) { - kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu, - KVM_HOST_USES_PFN, system_time & ~1ULL, - sizeof(struct pvclock_vcpu_time_info)); + kvm_gpc_activate(vcpu->kvm, &vcpu->arch.pv_time, vcpu, + KVM_HOST_USES_PFN, system_time & ~1ULL, + sizeof(struct pvclock_vcpu_time_info)); } else { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time); } return; @@ -3388,7 +3388,7 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time); vcpu->arch.time = 0; } @@ -11829,6 +11829,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; + kvm_gpc_init(&vcpu->arch.pv_time); + if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 93c628d3e3a9..b2be60c6efa4 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -42,13 +42,13 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) int idx = srcu_read_lock(&kvm->srcu); if (gfn == GPA_INVALID) { - kvm_gfn_to_pfn_cache_destroy(kvm, gpc); + kvm_gpc_deactivate(kvm, gpc); goto out; } do { - ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, KVM_HOST_USES_PFN, - gpa, PAGE_SIZE); + ret = kvm_gpc_activate(kvm, gpc, NULL, KVM_HOST_USES_PFN, gpa, + PAGE_SIZE); if (ret) goto out; @@ -554,15 +554,15 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) offsetof(struct compat_vcpu_info, time)); if (data->u.gpa == GPA_INVALID) { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); r = 0; break; } - r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, - &vcpu->arch.xen.vcpu_info_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct vcpu_info)); + r = kvm_gpc_activate(vcpu->kvm, + &vcpu->arch.xen.vcpu_info_cache, NULL, + KVM_HOST_USES_PFN, data->u.gpa, + sizeof(struct vcpu_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -570,16 +570,16 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: if (data->u.gpa == GPA_INVALID) { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache); + kvm_gpc_deactivate(vcpu->kvm, + &vcpu->arch.xen.vcpu_time_info_cache); r = 0; break; } - r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct pvclock_vcpu_time_info)); + r = kvm_gpc_activate(vcpu->kvm, + &vcpu->arch.xen.vcpu_time_info_cache, + NULL, KVM_HOST_USES_PFN, data->u.gpa, + sizeof(struct pvclock_vcpu_time_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); break; @@ -590,16 +590,15 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } if (data->u.gpa == GPA_INVALID) { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.runstate_cache); + kvm_gpc_deactivate(vcpu->kvm, + &vcpu->arch.xen.runstate_cache); r = 0; break; } - r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, - &vcpu->arch.xen.runstate_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct vcpu_runstate_info)); + r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache, + NULL, KVM_HOST_USES_PFN, data->u.gpa, + sizeof(struct vcpu_runstate_info)); break; case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT: @@ -1816,7 +1815,12 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) { vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx; vcpu->arch.xen.poll_evtchn = 0; + timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); + + kvm_gpc_init(&vcpu->arch.xen.runstate_cache); + kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache); + kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache); } void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) @@ -1824,18 +1828,17 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) if (kvm_xen_timer_enabled(vcpu)) kvm_xen_stop_timer(vcpu); - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.runstate_cache); - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.vcpu_info_cache); - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache); + del_timer_sync(&vcpu->arch.xen.poll_timer); } void kvm_xen_init_vm(struct kvm *kvm) { idr_init(&kvm->arch.xen.evtchn_ports); + kvm_gpc_init(&kvm->arch.xen.shinfo_cache); } void kvm_xen_destroy_vm(struct kvm *kvm) @@ -1843,7 +1846,7 @@ void kvm_xen_destroy_vm(struct kvm *kvm) struct evtchnfd *evtchnfd; int i; - kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache); + kvm_gpc_deactivate(kvm, &kvm->arch.xen.shinfo_cache); idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { if (!evtchnfd->deliver.port.port) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 00c3448ba7f8..18592bdf4c1b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1240,8 +1240,18 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); /** - * kvm_gfn_to_pfn_cache_init - prepare a cached kernel mapping and HPA for a - * given guest physical address. + * kvm_gpc_init - initialize gfn_to_pfn_cache. + * + * @gpc: struct gfn_to_pfn_cache object. + * + * This sets up a gfn_to_pfn_cache by initializing locks. Note, the cache must + * be zero-allocated (or zeroed by the caller before init). + */ +void kvm_gpc_init(struct gfn_to_pfn_cache *gpc); + +/** + * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest + * physical address. * * @kvm: pointer to kvm instance. * @gpc: struct gfn_to_pfn_cache object. @@ -1265,9 +1275,9 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); * kvm_gfn_to_pfn_cache_check() to ensure that the cache is valid before * accessing the target page. */ -int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, - struct kvm_vcpu *vcpu, enum pfn_cache_usage usage, - gpa_t gpa, unsigned long len); +int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, + struct kvm_vcpu *vcpu, enum pfn_cache_usage usage, + gpa_t gpa, unsigned long len); /** * kvm_gfn_to_pfn_cache_check - check validity of a gfn_to_pfn_cache. @@ -1324,7 +1334,7 @@ int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); /** - * kvm_gfn_to_pfn_cache_destroy - destroy and unlink a gfn_to_pfn_cache. + * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache. * * @kvm: pointer to kvm instance. * @gpc: struct gfn_to_pfn_cache object. @@ -1332,7 +1342,7 @@ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); * This removes a cache from the @kvm's list to be processed on MMU notifier * invocation. */ -void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); +void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc); void kvm_sigset_activate(struct kvm_vcpu *vcpu); void kvm_sigset_deactivate(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 68ff41d39545..08f97cf97264 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -346,17 +346,20 @@ void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) } EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_unmap); +void kvm_gpc_init(struct gfn_to_pfn_cache *gpc) +{ + rwlock_init(&gpc->lock); + mutex_init(&gpc->refresh_lock); +} +EXPORT_SYMBOL_GPL(kvm_gpc_init); -int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, - struct kvm_vcpu *vcpu, enum pfn_cache_usage usage, - gpa_t gpa, unsigned long len) +int kvm_gpc_activate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, + struct kvm_vcpu *vcpu, enum pfn_cache_usage usage, + gpa_t gpa, unsigned long len) { WARN_ON_ONCE(!usage || (usage & KVM_GUEST_AND_HOST_USE_PFN) != usage); if (!gpc->active) { - rwlock_init(&gpc->lock); - mutex_init(&gpc->refresh_lock); - gpc->khva = NULL; gpc->pfn = KVM_PFN_ERR_FAULT; gpc->uhva = KVM_HVA_ERR_BAD; @@ -371,9 +374,9 @@ int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, } return kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpa, len); } -EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_init); +EXPORT_SYMBOL_GPL(kvm_gpc_activate); -void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) +void kvm_gpc_deactivate(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) { if (gpc->active) { spin_lock(&kvm->gpc_lock); @@ -384,4 +387,4 @@ void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) gpc->active = false; } } -EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_destroy); +EXPORT_SYMBOL_GPL(kvm_gpc_deactivate); -- cgit v1.2.3 From 2d87d455ead2cbdee7e60463cddc5bff3f98c912 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 27 Oct 2022 16:57:09 +0800 Subject: blk-mq: don't add non-pt request with ->end_io to batch dm-rq implements ->end_io callback for request issued to underlying queue, and it isn't passthrough request. Commit ab3e1d3bbab9 ("block: allow end_io based requests in the completion batch handling") doesn't clear rq->bio and rq->__data_len for request with ->end_io in blk_mq_end_request_batch(), and this way is actually dangerous, but so far it is only for nvme passthrough request. dm-rq needs to clean up remained bios in case of partial completion, and req->bio is required, then use-after-free is triggered, so the underlying clone request can't be completed in blk_mq_end_request_batch. Fix panic by not adding such request into batch list, and the issue can be triggered simply by exposing nvme pci to dm-mpath simply. Fixes: ab3e1d3bbab9 ("block: allow end_io based requests in the completion batch handling") Cc: dm-devel@redhat.com Cc: Mike Snitzer Reported-by: Changhui Zhong Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20221027085709.513175-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ba18e9bdb799..d6119c5d1069 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -853,7 +853,8 @@ static inline bool blk_mq_add_to_batch(struct request *req, struct io_comp_batch *iob, int ioerror, void (*complete)(struct io_comp_batch *)) { - if (!iob || (req->rq_flags & RQF_ELV) || ioerror) + if (!iob || (req->rq_flags & RQF_ELV) || ioerror || + (req->end_io && !blk_rq_is_passthrough(req))) return false; if (!iob->complete) -- cgit v1.2.3 From bacd22df95147ed673bec4692ab2d4d585935241 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 26 Oct 2022 14:51:45 +0100 Subject: net/mlx5: Fix possible use-after-free in async command interface mlx5_cmd_cleanup_async_ctx should return only after all its callback handlers were completed. Before this patch, the below race between mlx5_cmd_cleanup_async_ctx and mlx5_cmd_exec_cb_handler was possible and lead to a use-after-free: 1. mlx5_cmd_cleanup_async_ctx is called while num_inflight is 2 (i.e. elevated by 1, a single inflight callback). 2. mlx5_cmd_cleanup_async_ctx decreases num_inflight to 1. 3. mlx5_cmd_exec_cb_handler is called, decreases num_inflight to 0 and is about to call wake_up(). 4. mlx5_cmd_cleanup_async_ctx calls wait_event, which returns immediately as the condition (num_inflight == 0) holds. 5. mlx5_cmd_cleanup_async_ctx returns. 6. The caller of mlx5_cmd_cleanup_async_ctx frees the mlx5_async_ctx object. 7. mlx5_cmd_exec_cb_handler goes on and calls wake_up() on the freed object. Fix it by syncing using a completion object. Mark it completed when num_inflight reaches 0. Trace: BUG: KASAN: use-after-free in do_raw_spin_lock+0x23d/0x270 Read of size 4 at addr ffff888139cd12f4 by task swapper/5/0 CPU: 5 PID: 0 Comm: swapper/5 Not tainted 6.0.0-rc3_for_upstream_debug_2022_08_30_13_10 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x57/0x7d print_report.cold+0x2d5/0x684 ? do_raw_spin_lock+0x23d/0x270 kasan_report+0xb1/0x1a0 ? do_raw_spin_lock+0x23d/0x270 do_raw_spin_lock+0x23d/0x270 ? rwlock_bug.part.0+0x90/0x90 ? __delete_object+0xb8/0x100 ? lock_downgrade+0x6e0/0x6e0 _raw_spin_lock_irqsave+0x43/0x60 ? __wake_up_common_lock+0xb9/0x140 __wake_up_common_lock+0xb9/0x140 ? __wake_up_common+0x650/0x650 ? destroy_tis_callback+0x53/0x70 [mlx5_core] ? kasan_set_track+0x21/0x30 ? destroy_tis_callback+0x53/0x70 [mlx5_core] ? kfree+0x1ba/0x520 ? do_raw_spin_unlock+0x54/0x220 mlx5_cmd_exec_cb_handler+0x136/0x1a0 [mlx5_core] ? mlx5_cmd_cleanup_async_ctx+0x220/0x220 [mlx5_core] ? mlx5_cmd_cleanup_async_ctx+0x220/0x220 [mlx5_core] mlx5_cmd_comp_handler+0x65a/0x12b0 [mlx5_core] ? dump_command+0xcc0/0xcc0 [mlx5_core] ? lockdep_hardirqs_on_prepare+0x400/0x400 ? cmd_comp_notifier+0x7e/0xb0 [mlx5_core] cmd_comp_notifier+0x7e/0xb0 [mlx5_core] atomic_notifier_call_chain+0xd7/0x1d0 mlx5_eq_async_int+0x3ce/0xa20 [mlx5_core] atomic_notifier_call_chain+0xd7/0x1d0 ? irq_release+0x140/0x140 [mlx5_core] irq_int_handler+0x19/0x30 [mlx5_core] __handle_irq_event_percpu+0x1f2/0x620 handle_irq_event+0xb2/0x1d0 handle_edge_irq+0x21e/0xb00 __common_interrupt+0x79/0x1a0 common_interrupt+0x78/0xa0 asm_common_interrupt+0x22/0x40 RIP: 0010:default_idle+0x42/0x60 Code: c1 83 e0 07 48 c1 e9 03 83 c0 03 0f b6 14 11 38 d0 7c 04 84 d2 75 14 8b 05 eb 47 22 02 85 c0 7e 07 0f 00 2d e0 9f 48 00 fb f4 48 c7 c7 80 08 7f 85 e8 d1 d3 3e fe eb de 66 66 2e 0f 1f 84 00 RSP: 0018:ffff888100dbfdf0 EFLAGS: 00000242 RAX: 0000000000000001 RBX: ffffffff84ecbd48 RCX: 1ffffffff0afe110 RDX: 0000000000000004 RSI: 0000000000000000 RDI: ffffffff835cc9bc RBP: 0000000000000005 R08: 0000000000000001 R09: ffff88881dec4ac3 R10: ffffed1103bd8958 R11: 0000017d0ca571c9 R12: 0000000000000005 R13: ffffffff84f024e0 R14: 0000000000000000 R15: dffffc0000000000 ? default_idle_call+0xcc/0x450 default_idle_call+0xec/0x450 do_idle+0x394/0x450 ? arch_cpu_idle_exit+0x40/0x40 ? do_idle+0x17/0x450 cpu_startup_entry+0x19/0x20 start_secondary+0x221/0x2b0 ? set_cpu_sibling_map+0x2070/0x2070 secondary_startup_64_no_verify+0xcd/0xdb Allocated by task 49502: kasan_save_stack+0x1e/0x40 __kasan_kmalloc+0x81/0xa0 kvmalloc_node+0x48/0xe0 mlx5e_bulk_async_init+0x35/0x110 [mlx5_core] mlx5e_tls_priv_tx_list_cleanup+0x84/0x3e0 [mlx5_core] mlx5e_ktls_cleanup_tx+0x38f/0x760 [mlx5_core] mlx5e_cleanup_nic_tx+0xa7/0x100 [mlx5_core] mlx5e_detach_netdev+0x1ca/0x2b0 [mlx5_core] mlx5e_suspend+0xdb/0x140 [mlx5_core] mlx5e_remove+0x89/0x190 [mlx5_core] auxiliary_bus_remove+0x52/0x70 device_release_driver_internal+0x40f/0x650 driver_detach+0xc1/0x180 bus_remove_driver+0x125/0x2f0 auxiliary_driver_unregister+0x16/0x50 mlx5e_cleanup+0x26/0x30 [mlx5_core] cleanup+0xc/0x4e [mlx5_core] __x64_sys_delete_module+0x2b5/0x450 do_syscall_64+0x3d/0x90 entry_SYSCALL_64_after_hwframe+0x46/0xb0 Freed by task 49502: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_set_free_info+0x20/0x30 ____kasan_slab_free+0x11d/0x1b0 kfree+0x1ba/0x520 mlx5e_tls_priv_tx_list_cleanup+0x2e7/0x3e0 [mlx5_core] mlx5e_ktls_cleanup_tx+0x38f/0x760 [mlx5_core] mlx5e_cleanup_nic_tx+0xa7/0x100 [mlx5_core] mlx5e_detach_netdev+0x1ca/0x2b0 [mlx5_core] mlx5e_suspend+0xdb/0x140 [mlx5_core] mlx5e_remove+0x89/0x190 [mlx5_core] auxiliary_bus_remove+0x52/0x70 device_release_driver_internal+0x40f/0x650 driver_detach+0xc1/0x180 bus_remove_driver+0x125/0x2f0 auxiliary_driver_unregister+0x16/0x50 mlx5e_cleanup+0x26/0x30 [mlx5_core] cleanup+0xc/0x4e [mlx5_core] __x64_sys_delete_module+0x2b5/0x450 do_syscall_64+0x3d/0x90 entry_SYSCALL_64_after_hwframe+0x46/0xb0 Fixes: e355477ed9e4 ("net/mlx5: Make mlx5_cmd_exec_cb() a safe API") Signed-off-by: Tariq Toukan Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20221026135153.154807-8-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 10 +++++----- include/linux/mlx5/driver.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 0377392848d9..46ba4c2faad2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -2004,7 +2004,7 @@ void mlx5_cmd_init_async_ctx(struct mlx5_core_dev *dev, ctx->dev = dev; /* Starts at 1 to avoid doing wake_up if we are not cleaning up */ atomic_set(&ctx->num_inflight, 1); - init_waitqueue_head(&ctx->wait); + init_completion(&ctx->inflight_done); } EXPORT_SYMBOL(mlx5_cmd_init_async_ctx); @@ -2018,8 +2018,8 @@ EXPORT_SYMBOL(mlx5_cmd_init_async_ctx); */ void mlx5_cmd_cleanup_async_ctx(struct mlx5_async_ctx *ctx) { - atomic_dec(&ctx->num_inflight); - wait_event(ctx->wait, atomic_read(&ctx->num_inflight) == 0); + if (!atomic_dec_and_test(&ctx->num_inflight)) + wait_for_completion(&ctx->inflight_done); } EXPORT_SYMBOL(mlx5_cmd_cleanup_async_ctx); @@ -2032,7 +2032,7 @@ static void mlx5_cmd_exec_cb_handler(int status, void *_work) status = cmd_status_err(ctx->dev, status, work->opcode, work->out); work->user_callback(status, work); if (atomic_dec_and_test(&ctx->num_inflight)) - wake_up(&ctx->wait); + complete(&ctx->inflight_done); } int mlx5_cmd_exec_cb(struct mlx5_async_ctx *ctx, void *in, int in_size, @@ -2050,7 +2050,7 @@ int mlx5_cmd_exec_cb(struct mlx5_async_ctx *ctx, void *in, int in_size, ret = cmd_exec(ctx->dev, in, in_size, out, out_size, mlx5_cmd_exec_cb_handler, work, false); if (ret && atomic_dec_and_test(&ctx->num_inflight)) - wake_up(&ctx->wait); + complete(&ctx->inflight_done); return ret; } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a12929bc31b2..af2ceb4160bc 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -970,7 +970,7 @@ void mlx5_cmd_allowed_opcode(struct mlx5_core_dev *dev, u16 opcode); struct mlx5_async_ctx { struct mlx5_core_dev *dev; atomic_t num_inflight; - struct wait_queue_head wait; + struct completion inflight_done; }; struct mlx5_async_work; -- cgit v1.2.3 From 67eae54bc227b30dedcce9db68b063ba1adb7838 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 24 Oct 2022 15:33:35 -0400 Subject: mm/uffd: fix vma check on userfault for wp We used to have a report that pte-marker code can be reached even when uffd-wp is not compiled in for file memories, here: https://lore.kernel.org/all/YzeR+R6b4bwBlBHh@x1n/T/#u I just got time to revisit this and found that the root cause is we simply messed up with the vma check, so that for !PTE_MARKER_UFFD_WP system, we will allow UFFDIO_REGISTER of MINOR & WP upon shmem as the check was wrong: if (vm_flags & VM_UFFD_MINOR) return is_vm_hugetlb_page(vma) || vma_is_shmem(vma); Where we'll allow anything to pass on shmem as long as minor mode is requested. Axel did it right when introducing minor mode but I messed it up in b1f9e876862d when moving code around. Fix it. Link: https://lkml.kernel.org/r/20221024193336.1233616-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20221024193336.1233616-2-peterx@redhat.com Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Signed-off-by: Peter Xu Cc: Axel Rasmussen Cc: Andrea Arcangeli Cc: Nadav Amit Cc: Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index f07e6998bb68..9df0b9a762cc 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -146,9 +146,9 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) static inline bool vma_can_userfault(struct vm_area_struct *vma, unsigned long vm_flags) { - if (vm_flags & VM_UFFD_MINOR) - return is_vm_hugetlb_page(vma) || vma_is_shmem(vma); - + if ((vm_flags & VM_UFFD_MINOR) && + (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) + return false; #ifndef CONFIG_PTE_MARKER_UFFD_WP /* * If user requested uffd-wp but not enabled pte markers for -- cgit v1.2.3 From 78a498c3a227f2ac773a8234b2ce092a4403f2c3 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 24 Oct 2022 23:21:44 +0200 Subject: x86: fortify: kmsan: fix KMSAN fortify builds Ensure that KMSAN builds replace memset/memcpy/memmove calls with the respective __msan_XXX functions, and that none of the macros are redefined twice. This should allow building kernel with both CONFIG_KMSAN and CONFIG_FORTIFY_SOURCE. Link: https://lkml.kernel.org/r/20221024212144.2852069-5-glider@google.com Link: https://github.com/google/kmsan/issues/89 Signed-off-by: Alexander Potapenko Reported-by: Tamas K Lengyel Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Kees Cook Signed-off-by: Andrew Morton --- arch/x86/include/asm/string_64.h | 11 +++++++---- include/linux/fortify-string.h | 17 +++++++++++++++-- include/linux/kmsan_string.h | 21 +++++++++++++++++++++ mm/kmsan/instrumentation.c | 1 + 4 files changed, 44 insertions(+), 6 deletions(-) create mode 100644 include/linux/kmsan_string.h (limited to 'include/linux') diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 3b87d889b6e1..888731ccf1f6 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -10,10 +10,13 @@ /* Even with __builtin_ the compiler may decide to use the out of line function. */ +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) +#include +#endif + #define __HAVE_ARCH_MEMCPY 1 -#if defined(__SANITIZE_MEMORY__) +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) #undef memcpy -void *__msan_memcpy(void *dst, const void *src, size_t size); #define memcpy __msan_memcpy #else extern void *memcpy(void *to, const void *from, size_t len); @@ -21,7 +24,7 @@ extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); #define __HAVE_ARCH_MEMSET -#if defined(__SANITIZE_MEMORY__) +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) extern void *__msan_memset(void *s, int c, size_t n); #undef memset #define memset __msan_memset @@ -67,7 +70,7 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t n) } #define __HAVE_ARCH_MEMMOVE -#if defined(__SANITIZE_MEMORY__) +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) #undef memmove void *__msan_memmove(void *dest, const void *src, size_t len); #define memmove __msan_memmove diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 4029fe368a4f..18a31b125f9d 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -43,11 +43,24 @@ extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); #else -#define __underlying_memchr __builtin_memchr -#define __underlying_memcmp __builtin_memcmp + +#if defined(__SANITIZE_MEMORY__) +/* + * For KMSAN builds all memcpy/memset/memmove calls should be replaced by the + * corresponding __msan_XXX functions. + */ +#include +#define __underlying_memcpy __msan_memcpy +#define __underlying_memmove __msan_memmove +#define __underlying_memset __msan_memset +#else #define __underlying_memcpy __builtin_memcpy #define __underlying_memmove __builtin_memmove #define __underlying_memset __builtin_memset +#endif + +#define __underlying_memchr __builtin_memchr +#define __underlying_memcmp __builtin_memcmp #define __underlying_strcat __builtin_strcat #define __underlying_strcpy __builtin_strcpy #define __underlying_strlen __builtin_strlen diff --git a/include/linux/kmsan_string.h b/include/linux/kmsan_string.h new file mode 100644 index 000000000000..7287da6f52ef --- /dev/null +++ b/include/linux/kmsan_string.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KMSAN string functions API used in other headers. + * + * Copyright (C) 2022 Google LLC + * Author: Alexander Potapenko + * + */ +#ifndef _LINUX_KMSAN_STRING_H +#define _LINUX_KMSAN_STRING_H + +/* + * KMSAN overrides the default memcpy/memset/memmove implementations in the + * kernel, which requires having __msan_XXX function prototypes in several other + * headers. Keep them in one place instead of open-coding. + */ +void *__msan_memcpy(void *dst, const void *src, size_t size); +void *__msan_memset(void *s, int c, size_t n); +void *__msan_memmove(void *dest, const void *src, size_t len); + +#endif /* _LINUX_KMSAN_STRING_H */ diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c index 280d15413268..271f135f97a1 100644 --- a/mm/kmsan/instrumentation.c +++ b/mm/kmsan/instrumentation.c @@ -14,6 +14,7 @@ #include "kmsan.h" #include +#include #include #include -- cgit v1.2.3 From 6f7630b1b5bc672b54c1285ee6aba752b446672c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 28 Oct 2022 15:32:07 -0700 Subject: fortify: Capture __bos() results in const temp vars In two recent run-time memcpy() bound checking bug reports (NFS[1] and JFS[2]), the _detection_ was working correctly (in the sense that the requested copy size was larger than the destination field size), but the _warning text_ was showing the destination field size as SIZE_MAX ("unknown size"). This should be impossible, since the detection function will explicitly give up if the destination field size is unknown. For example, the JFS warning was: memcpy: detected field-spanning write (size 132) of single field "ip->i_link" at fs/jfs/namei.c:950 (size 18446744073709551615) Other cases of this warning (e.g.[3]) have reported correctly, and the reproducer only happens under GCC (at least 10.2 and 12.1), so this currently appears to be a GCC bug. Explicitly capturing the __builtin_object_size() results in const temporary variables fixes the report. For example, the JFS reproducer now correctly reports the field size (128): memcpy: detected field-spanning write (size 132) of single field "ip->i_link" at fs/jfs/namei.c:950 (size 128) Examination of the .text delta (which is otherwise identical), shows the literal value used in the report changing: - mov $0xffffffffffffffff,%rcx + mov $0x80,%ecx [1] https://lore.kernel.org/lkml/Y0zEzZwhOxTDcBTB@codemonkey.org.uk/ [2] https://syzkaller.appspot.com/bug?id=23d613df5259b977dac1696bec77f61a85890e3d [3] https://lore.kernel.org/all/202210110948.26b43120-yujie.liu@intel.com/ Cc: "Dr. David Alan Gilbert" Cc: llvm@lists.linux.dev Cc: linux-hardening@vger.kernel.org Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 4029fe368a4f..0f00a551939a 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -441,13 +441,18 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, #define __fortify_memcpy_chk(p, q, size, p_size, q_size, \ p_size_field, q_size_field, op) ({ \ - size_t __fortify_size = (size_t)(size); \ - WARN_ONCE(fortify_memcpy_chk(__fortify_size, p_size, q_size, \ - p_size_field, q_size_field, #op), \ + const size_t __fortify_size = (size_t)(size); \ + const size_t __p_size = (p_size); \ + const size_t __q_size = (q_size); \ + const size_t __p_size_field = (p_size_field); \ + const size_t __q_size_field = (q_size_field); \ + WARN_ONCE(fortify_memcpy_chk(__fortify_size, __p_size, \ + __q_size, __p_size_field, \ + __q_size_field, #op), \ #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \ __fortify_size, \ "field \"" #p "\" at " __FILE__ ":" __stringify(__LINE__), \ - p_size_field); \ + __p_size_field); \ __underlying_##op(p, q, __fortify_size); \ }) -- cgit v1.2.3