From b71f9804f66c2592d4c3a2397b7374a4039005a5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Apr 2025 22:46:52 -0700 Subject: timekeeping: Prevent coarse clocks going backwards Lei Chen raised an issue with CLOCK_MONOTONIC_COARSE seeing time inconsistencies. Lei tracked down that this was being caused by the adjustment: tk->tkr_mono.xtime_nsec -= offset; which is made to compensate for the unaccumulated cycles in offset when the multiplicator is adjusted forward, so that the non-_COARSE clockids don't see inconsistencies. However, the _COARSE clockid getter functions use the adjusted xtime_nsec value directly and do not compensate the negative offset via the clocksource delta multiplied with the new multiplicator. In that case the caller can observe time going backwards in consecutive calls. By design, this negative adjustment should be fine, because the logic run from timekeeping_adjust() is done after it accumulated approximately multiplicator * interval_cycles into xtime_nsec. The accumulated value is always larger then the mult_adj * offset value, which is subtracted from xtime_nsec. Both operations are done together under the tk_core.lock, so the net change to xtime_nsec is always always be positive. However, do_adjtimex() calls into timekeeping_advance() as well, to apply the NTP frequency adjustment immediately. In this case, timekeeping_advance() does not return early when the offset is smaller then interval_cycles. In that case there is no time accumulated into xtime_nsec. But the subsequent call into timekeeping_adjust(), which modifies the multiplicator, subtracts from xtime_nsec to correct for the new multiplicator. Here because there was no accumulation, xtime_nsec becomes smaller than before, which opens a window up to the next accumulation, where the _COARSE clockid getters, which don't compensate for the offset, can observe the inconsistency. This has been tried to be fixed by forwarding the timekeeper in the case that adjtimex() adjusts the multiplier, which resets the offset to zero: 757b000f7b93 ("timekeeping: Fix possible inconsistencies in _COARSE clockids") That works correctly, but unfortunately causes a regression on the adjtimex() side. There are two issues: 1) The forwarding of the base time moves the update out of the original period and establishes a new one. 2) The clearing of the accumulated NTP error is changing the behaviour as well. User-space expects that multiplier/frequency updates are in effect, when the syscall returns, so delaying the update to the next tick is not solving the problem either. Commit 757b000f7b93 was reverted so that the established expectations of user space implementations (ntpd, chronyd) are restored, but that obviously brought the inconsistencies back. One of the initial approaches to fix this was to establish a separate storage for the coarse time getter nanoseconds part by calculating it from the offset. That was dropped on the floor because not having yet another state to maintain was simpler. But given the result of the above exercise, this solution turns out to be the right one. Bring it back in a slightly modified form. Thus introduce timekeeper::coarse_nsec and store that nanoseconds part in it, switch the time getter functions and the VDSO update to use that value. coarse_nsec is set on operations which forward or initialize the timekeeper and after time was accumulated during a tick. If there is no accumulation the timestamp is unchanged. This leaves the adjtimex() behaviour unmodified and prevents coarse time from going backwards. [ jstultz: Simplified the coarse_nsec calculation and kept behavior so coarse clockids aren't adjusted on each inter-tick adjtimex call, slightly reworked the comments and commit message ] Fixes: da15cfdae033 ("time: Introduce CLOCK_REALTIME_COARSE") Reported-by: Lei Chen Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/all/20250419054706.2319105-1-jstultz@google.com Closes: https://lore.kernel.org/lkml/20250310030004.3705801-1-lei.chen@smartx.com/ --- include/linux/timekeeper_internal.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index e39d4d563b19..785048a3b3e6 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -51,7 +51,7 @@ struct tk_read_base { * @offs_real: Offset clock monotonic -> clock realtime * @offs_boot: Offset clock monotonic -> clock boottime * @offs_tai: Offset clock monotonic -> clock tai - * @tai_offset: The current UTC to TAI offset in seconds + * @coarse_nsec: The nanoseconds part for coarse time getters * @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW * @raw_sec: CLOCK_MONOTONIC_RAW time in seconds * @clock_was_set_seq: The sequence number of clock was set events @@ -76,6 +76,7 @@ struct tk_read_base { * ntp shifted nano seconds. * @ntp_err_mult: Multiplication factor for scaled math conversion * @skip_second_overflow: Flag used to avoid updating NTP twice with same second + * @tai_offset: The current UTC to TAI offset in seconds * * Note: For timespec(64) based interfaces wall_to_monotonic is what * we need to add to xtime (or xtime corrected for sub jiffy times) @@ -100,7 +101,7 @@ struct tk_read_base { * which results in the following cacheline layout: * * 0: seqcount, tkr_mono - * 1: xtime_sec ... tai_offset + * 1: xtime_sec ... coarse_nsec * 2: tkr_raw, raw_sec * 3,4: Internal variables * @@ -121,7 +122,7 @@ struct timekeeper { ktime_t offs_real; ktime_t offs_boot; ktime_t offs_tai; - s32 tai_offset; + u32 coarse_nsec; /* Cacheline 2: */ struct tk_read_base tkr_raw; @@ -144,6 +145,7 @@ struct timekeeper { u32 ntp_error_shift; u32 ntp_err_mult; u32 skip_second_overflow; + s32 tai_offset; }; #ifdef CONFIG_GENERIC_TIME_VSYSCALL -- cgit v1.2.3 From f31fe8165d365379d858c53bef43254c7d6d1cfd Mon Sep 17 00:00:00 2001 From: Naman Jain Date: Fri, 2 May 2025 13:18:10 +0530 Subject: uio_hv_generic: Fix sysfs creation path for ring buffer On regular bootup, devices get registered to VMBus first, so when uio_hv_generic driver for a particular device type is probed, the device is already initialized and added, so sysfs creation in hv_uio_probe() works fine. However, when the device is removed and brought back, the channel gets rescinded and the device again gets registered to VMBus. However this time, the uio_hv_generic driver is already registered to probe for that device and in this case sysfs creation is tried before the device's kobject gets initialized completely. Fix this by moving the core logic of sysfs creation of ring buffer, from uio_hv_generic to HyperV's VMBus driver, where the rest of the sysfs attributes for the channels are defined. While doing that, make use of attribute groups and macros, instead of creating sysfs directly, to ensure better error handling and code flow. Problematic path: vmbus_process_offer (A new offer comes for the VMBus device) vmbus_add_channel_work vmbus_device_register |-> device_register | |... | |-> hv_uio_probe | |... | |-> sysfs_create_bin_file (leads to a warning as | the primary channel's kobject, which is used to | create the sysfs file, is not yet initialized) |-> kset_create_and_add |-> vmbus_add_channel_kobj (initialization of the primary channel's kobject happens later) Above code flow is sequential and the warning is always reproducible in this path. Fixes: 9ab877a6ccf8 ("uio_hv_generic: make ring buffer attribute for primary channel") Cc: stable@kernel.org Suggested-by: Saurabh Sengar Suggested-by: Michael Kelley Reviewed-by: Michael Kelley Tested-by: Michael Kelley Reviewed-by: Dexuan Cui Signed-off-by: Naman Jain Link: https://lore.kernel.org/r/20250502074811.2022-2-namjain@linux.microsoft.com Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hyperv_vmbus.h | 6 +++ drivers/hv/vmbus_drv.c | 100 ++++++++++++++++++++++++++++++++++++++++++- drivers/uio/uio_hv_generic.c | 39 ++++++++--------- include/linux/hyperv.h | 6 +++ 4 files changed, 128 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 29780f3a7478..0b450e53161e 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -477,4 +477,10 @@ static inline int hv_debug_add_dev_dir(struct hv_device *dev) #endif /* CONFIG_HYPERV_TESTING */ +/* Create and remove sysfs entry for memory mapped ring buffers for a channel */ +int hv_create_ring_sysfs(struct vmbus_channel *channel, + int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel, + struct vm_area_struct *vma)); +int hv_remove_ring_sysfs(struct vmbus_channel *channel); + #endif /* _HYPERV_VMBUS_H */ diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 8d3cff42bdbb..0f16a83cc2d6 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1802,6 +1802,27 @@ static ssize_t subchannel_id_show(struct vmbus_channel *channel, } static VMBUS_CHAN_ATTR_RO(subchannel_id); +static int hv_mmap_ring_buffer_wrapper(struct file *filp, struct kobject *kobj, + const struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + struct vmbus_channel *channel = container_of(kobj, struct vmbus_channel, kobj); + + /* + * hv_(create|remove)_ring_sysfs implementation ensures that mmap_ring_buffer + * is not NULL. + */ + return channel->mmap_ring_buffer(channel, vma); +} + +static struct bin_attribute chan_attr_ring_buffer = { + .attr = { + .name = "ring", + .mode = 0600, + }, + .size = 2 * SZ_2M, + .mmap = hv_mmap_ring_buffer_wrapper, +}; static struct attribute *vmbus_chan_attrs[] = { &chan_attr_out_mask.attr, &chan_attr_in_mask.attr, @@ -1821,6 +1842,11 @@ static struct attribute *vmbus_chan_attrs[] = { NULL }; +static struct bin_attribute *vmbus_chan_bin_attrs[] = { + &chan_attr_ring_buffer, + NULL +}; + /* * Channel-level attribute_group callback function. Returns the permission for * each attribute, and returns 0 if an attribute is not visible. @@ -1841,9 +1867,24 @@ static umode_t vmbus_chan_attr_is_visible(struct kobject *kobj, return attr->mode; } +static umode_t vmbus_chan_bin_attr_is_visible(struct kobject *kobj, + const struct bin_attribute *attr, int idx) +{ + const struct vmbus_channel *channel = + container_of(kobj, struct vmbus_channel, kobj); + + /* Hide ring attribute if channel's ring_sysfs_visible is set to false */ + if (attr == &chan_attr_ring_buffer && !channel->ring_sysfs_visible) + return 0; + + return attr->attr.mode; +} + static const struct attribute_group vmbus_chan_group = { .attrs = vmbus_chan_attrs, - .is_visible = vmbus_chan_attr_is_visible + .bin_attrs = vmbus_chan_bin_attrs, + .is_visible = vmbus_chan_attr_is_visible, + .is_bin_visible = vmbus_chan_bin_attr_is_visible, }; static const struct kobj_type vmbus_chan_ktype = { @@ -1851,6 +1892,63 @@ static const struct kobj_type vmbus_chan_ktype = { .release = vmbus_chan_release, }; +/** + * hv_create_ring_sysfs() - create "ring" sysfs entry corresponding to ring buffers for a channel. + * @channel: Pointer to vmbus_channel structure + * @hv_mmap_ring_buffer: function pointer for initializing the function to be called on mmap of + * channel's "ring" sysfs node, which is for the ring buffer of that channel. + * Function pointer is of below type: + * int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel, + * struct vm_area_struct *vma)) + * This has a pointer to the channel and a pointer to vm_area_struct, + * used for mmap, as arguments. + * + * Sysfs node for ring buffer of a channel is created along with other fields, however its + * visibility is disabled by default. Sysfs creation needs to be controlled when the use-case + * is running. + * For example, HV_NIC device is used either by uio_hv_generic or hv_netvsc at any given point of + * time, and "ring" sysfs is needed only when uio_hv_generic is bound to that device. To avoid + * exposing the ring buffer by default, this function is reponsible to enable visibility of + * ring for userspace to use. + * Note: Race conditions can happen with userspace and it is not encouraged to create new + * use-cases for this. This was added to maintain backward compatibility, while solving + * one of the race conditions in uio_hv_generic while creating sysfs. + * + * Returns 0 on success or error code on failure. + */ +int hv_create_ring_sysfs(struct vmbus_channel *channel, + int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel, + struct vm_area_struct *vma)) +{ + struct kobject *kobj = &channel->kobj; + + channel->mmap_ring_buffer = hv_mmap_ring_buffer; + channel->ring_sysfs_visible = true; + + return sysfs_update_group(kobj, &vmbus_chan_group); +} +EXPORT_SYMBOL_GPL(hv_create_ring_sysfs); + +/** + * hv_remove_ring_sysfs() - remove ring sysfs entry corresponding to ring buffers for a channel. + * @channel: Pointer to vmbus_channel structure + * + * Hide "ring" sysfs for a channel by changing its is_visible attribute and updating sysfs group. + * + * Returns 0 on success or error code on failure. + */ +int hv_remove_ring_sysfs(struct vmbus_channel *channel) +{ + struct kobject *kobj = &channel->kobj; + int ret; + + channel->ring_sysfs_visible = false; + ret = sysfs_update_group(kobj, &vmbus_chan_group); + channel->mmap_ring_buffer = NULL; + return ret; +} +EXPORT_SYMBOL_GPL(hv_remove_ring_sysfs); + /* * vmbus_add_channel_kobj - setup a sub-directory under device/channels */ diff --git a/drivers/uio/uio_hv_generic.c b/drivers/uio/uio_hv_generic.c index 1b19b5647495..69c1df0f4ca5 100644 --- a/drivers/uio/uio_hv_generic.c +++ b/drivers/uio/uio_hv_generic.c @@ -131,15 +131,12 @@ static void hv_uio_rescind(struct vmbus_channel *channel) vmbus_device_unregister(channel->device_obj); } -/* Sysfs API to allow mmap of the ring buffers +/* Function used for mmap of ring buffer sysfs interface. * The ring buffer is allocated as contiguous memory by vmbus_open */ -static int hv_uio_ring_mmap(struct file *filp, struct kobject *kobj, - const struct bin_attribute *attr, - struct vm_area_struct *vma) +static int +hv_uio_ring_mmap(struct vmbus_channel *channel, struct vm_area_struct *vma) { - struct vmbus_channel *channel - = container_of(kobj, struct vmbus_channel, kobj); void *ring_buffer = page_address(channel->ringbuffer_page); if (channel->state != CHANNEL_OPENED_STATE) @@ -149,15 +146,6 @@ static int hv_uio_ring_mmap(struct file *filp, struct kobject *kobj, channel->ringbuffer_pagecount << PAGE_SHIFT); } -static const struct bin_attribute ring_buffer_bin_attr = { - .attr = { - .name = "ring", - .mode = 0600, - }, - .size = 2 * SZ_2M, - .mmap = hv_uio_ring_mmap, -}; - /* Callback from VMBUS subsystem when new channel created. */ static void hv_uio_new_channel(struct vmbus_channel *new_sc) @@ -178,8 +166,7 @@ hv_uio_new_channel(struct vmbus_channel *new_sc) /* Disable interrupts on sub channel */ new_sc->inbound.ring_buffer->interrupt_mask = 1; set_channel_read_mode(new_sc, HV_CALL_ISR); - - ret = sysfs_create_bin_file(&new_sc->kobj, &ring_buffer_bin_attr); + ret = hv_create_ring_sysfs(new_sc, hv_uio_ring_mmap); if (ret) { dev_err(device, "sysfs create ring bin file failed; %d\n", ret); vmbus_close(new_sc); @@ -350,10 +337,18 @@ hv_uio_probe(struct hv_device *dev, goto fail_close; } - ret = sysfs_create_bin_file(&channel->kobj, &ring_buffer_bin_attr); - if (ret) - dev_notice(&dev->device, - "sysfs create ring bin file failed; %d\n", ret); + /* + * This internally calls sysfs_update_group, which returns a non-zero value if it executes + * before sysfs_create_group. This is expected as the 'ring' will be created later in + * vmbus_device_register() -> vmbus_add_channel_kobj(). Thus, no need to check the return + * value and print warning. + * + * Creating/exposing sysfs in driver probe is not encouraged as it can lead to race + * conditions with userspace. For backward compatibility, "ring" sysfs could not be removed + * or decoupled from uio_hv_generic probe. Userspace programs can make use of inotify + * APIs to make sure that ring is created. + */ + hv_create_ring_sysfs(channel, hv_uio_ring_mmap); hv_set_drvdata(dev, pdata); @@ -375,7 +370,7 @@ hv_uio_remove(struct hv_device *dev) if (!pdata) return; - sysfs_remove_bin_file(&dev->channel->kobj, &ring_buffer_bin_attr); + hv_remove_ring_sysfs(dev->channel); uio_unregister_device(&pdata->info); hv_uio_cleanup(dev, pdata); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 675959fb97ba..d6ffe01962c2 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1002,6 +1002,12 @@ struct vmbus_channel { /* The max size of a packet on this channel */ u32 max_pkt_size; + + /* function to mmap ring buffer memory to the channel's sysfs ring attribute */ + int (*mmap_ring_buffer)(struct vmbus_channel *channel, struct vm_area_struct *vma); + + /* boolean to control visibility of sysfs for ring buffer */ + bool ring_sysfs_visible; }; #define lock_requestor(channel, flags) \ -- cgit v1.2.3 From e12a42f64fc3d74872b349eedd47f90c6676b78a Mon Sep 17 00:00:00 2001 From: Michael-CY Lee Date: Mon, 5 May 2025 16:19:46 +0800 Subject: wifi: mac80211: fix the type of status_code for negotiated TID to Link Mapping The status code should be type of __le16. Fixes: 83e897a961b8 ("wifi: ieee80211: add definitions for negotiated TID to Link map") Fixes: 8f500fbc6c65 ("wifi: mac80211: process and save negotiated TID to Link mapping request") Signed-off-by: Michael-CY Lee Link: https://patch.msgid.link/20250505081946.3927214-1-michael-cy.lee@mediatek.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 +- net/mac80211/mlme.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 508d466de1cc..457b4fba88bd 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1526,7 +1526,7 @@ struct ieee80211_mgmt { struct { u8 action_code; u8 dialog_token; - u8 status_code; + __le16 status_code; u8 variable[]; } __packed ttlm_res; struct { diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 5d1f2d6d09ad..35eaf0812c5b 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -7675,6 +7675,7 @@ ieee80211_send_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.ttlm_res); int ttlm_max_len = 2 + 1 + sizeof(struct ieee80211_ttlm_elem) + 1 + 2 * 2 * IEEE80211_TTLM_NUM_TIDS; + u16 status_code; skb = dev_alloc_skb(local->tx_headroom + hdr_len + ttlm_max_len); if (!skb) @@ -7697,19 +7698,18 @@ ieee80211_send_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, WARN_ON(1); fallthrough; case NEG_TTLM_RES_REJECT: - mgmt->u.action.u.ttlm_res.status_code = - WLAN_STATUS_DENIED_TID_TO_LINK_MAPPING; + status_code = WLAN_STATUS_DENIED_TID_TO_LINK_MAPPING; break; case NEG_TTLM_RES_ACCEPT: - mgmt->u.action.u.ttlm_res.status_code = WLAN_STATUS_SUCCESS; + status_code = WLAN_STATUS_SUCCESS; break; case NEG_TTLM_RES_SUGGEST_PREFERRED: - mgmt->u.action.u.ttlm_res.status_code = - WLAN_STATUS_PREF_TID_TO_LINK_MAPPING_SUGGESTED; + status_code = WLAN_STATUS_PREF_TID_TO_LINK_MAPPING_SUGGESTED; ieee80211_neg_ttlm_add_suggested_map(skb, neg_ttlm); break; } + mgmt->u.action.u.ttlm_res.status_code = cpu_to_le16(status_code); ieee80211_tx_skb(sdata, skb); } @@ -7875,7 +7875,7 @@ void ieee80211_process_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, * This can be better implemented in the future, to handle request * rejections. */ - if (mgmt->u.action.u.ttlm_res.status_code != WLAN_STATUS_SUCCESS) + if (le16_to_cpu(mgmt->u.action.u.ttlm_res.status_code) != WLAN_STATUS_SUCCESS) __ieee80211_disconnect(sdata); } -- cgit v1.2.3 From 78cd408356fe3edbac66598772fd347bf3e32c1f Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 5 May 2025 18:19:19 -0700 Subject: net: add missing instance lock to dev_set_promiscuity Accidentally spotted while trying to understand what else needs to be renamed to netif_ prefix. Most of the calls to dev_set_promiscuity are adjacent to dev_set_allmulti or dev_disable_lro so it should be safe to add the lock. Note that new netif_set_promiscuity is currently unused, the locked paths call __dev_set_promiscuity directly. Fixes: ad7c7b2172c3 ("net: hold netdev instance lock during sysfs operations") Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250506011919.2882313-1-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + net/core/dev.c | 14 +------------- net/core/dev_api.c | 23 +++++++++++++++++++++++ 3 files changed, 25 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2d11d013cabe..7ea022750e4e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4972,6 +4972,7 @@ static inline void __dev_mc_unsync(struct net_device *dev, /* Functions used for secondary unicast and multicast support */ void dev_set_rx_mode(struct net_device *dev); +int netif_set_promiscuity(struct net_device *dev, int inc); int dev_set_promiscuity(struct net_device *dev, int inc); int netif_set_allmulti(struct net_device *dev, int inc, bool notify); int dev_set_allmulti(struct net_device *dev, int inc); diff --git a/net/core/dev.c b/net/core/dev.c index 92e004c354ea..11da1e272ec2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9193,18 +9193,7 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) return 0; } -/** - * dev_set_promiscuity - update promiscuity count on a device - * @dev: device - * @inc: modifier - * - * Add or remove promiscuity from a device. While the count in the device - * remains above zero the interface remains promiscuous. Once it hits zero - * the device reverts back to normal filtering operation. A negative inc - * value is used to drop promiscuity on the device. - * Return 0 if successful or a negative errno code on error. - */ -int dev_set_promiscuity(struct net_device *dev, int inc) +int netif_set_promiscuity(struct net_device *dev, int inc) { unsigned int old_flags = dev->flags; int err; @@ -9216,7 +9205,6 @@ int dev_set_promiscuity(struct net_device *dev, int inc) dev_set_rx_mode(dev); return err; } -EXPORT_SYMBOL(dev_set_promiscuity); int netif_set_allmulti(struct net_device *dev, int inc, bool notify) { diff --git a/net/core/dev_api.c b/net/core/dev_api.c index 90898cd540ce..f9a160ab596f 100644 --- a/net/core/dev_api.c +++ b/net/core/dev_api.c @@ -267,6 +267,29 @@ void dev_disable_lro(struct net_device *dev) } EXPORT_SYMBOL(dev_disable_lro); +/** + * dev_set_promiscuity() - update promiscuity count on a device + * @dev: device + * @inc: modifier + * + * Add or remove promiscuity from a device. While the count in the device + * remains above zero the interface remains promiscuous. Once it hits zero + * the device reverts back to normal filtering operation. A negative inc + * value is used to drop promiscuity on the device. + * Return 0 if successful or a negative errno code on error. + */ +int dev_set_promiscuity(struct net_device *dev, int inc) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_promiscuity(dev, inc); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_promiscuity); + /** * dev_set_allmulti() - update allmulti count on a device * @dev: device -- cgit v1.2.3 From a0309faf1cb0622cac7c820150b7abf2024acff5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 25 Apr 2025 17:11:07 -0700 Subject: mm: vmalloc: support more granular vrealloc() sizing Introduce struct vm_struct::requested_size so that the requested (re)allocation size is retained separately from the allocated area size. This means that KASAN will correctly poison the correct spans of requested bytes. This also means we can support growing the usable portion of an allocation that can already be supported by the existing area's existing allocation. Link: https://lkml.kernel.org/r/20250426001105.it.679-kees@kernel.org Fixes: 3ddc2fefe6f3 ("mm: vmalloc: implement vrealloc()") Signed-off-by: Kees Cook Reported-by: Erhard Furtner Closes: https://lore.kernel.org/all/20250408192503.6149a816@outsider.home/ Reviewed-by: Danilo Krummrich Cc: Michal Hocko Cc: "Uladzislau Rezki (Sony)" Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 1 + mm/vmalloc.c | 31 ++++++++++++++++++++++++------- 2 files changed, 25 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 31e9ffd936e3..5ca8d4dd149d 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -61,6 +61,7 @@ struct vm_struct { unsigned int nr_pages; phys_addr_t phys_addr; const void *caller; + unsigned long requested_size; }; struct vmap_area { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3ed720a787ec..2d7511654831 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1940,7 +1940,7 @@ static inline void setup_vmalloc_vm(struct vm_struct *vm, { vm->flags = flags; vm->addr = (void *)va->va_start; - vm->size = va_size(va); + vm->size = vm->requested_size = va_size(va); vm->caller = caller; va->vm = vm; } @@ -3133,6 +3133,7 @@ struct vm_struct *__get_vm_area_node(unsigned long size, area->flags = flags; area->caller = caller; + area->requested_size = requested_size; va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area); if (IS_ERR(va)) { @@ -4063,6 +4064,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof); */ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) { + struct vm_struct *vm = NULL; + size_t alloced_size = 0; size_t old_size = 0; void *n; @@ -4072,15 +4075,17 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) } if (p) { - struct vm_struct *vm; - vm = find_vm_area(p); if (unlikely(!vm)) { WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p); return NULL; } - old_size = get_vm_area_size(vm); + alloced_size = get_vm_area_size(vm); + old_size = vm->requested_size; + if (WARN(alloced_size < old_size, + "vrealloc() has mismatched area vs requested sizes (%p)\n", p)) + return NULL; } /* @@ -4088,14 +4093,26 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) * would be a good heuristic for when to shrink the vm_area? */ if (size <= old_size) { - /* Zero out spare memory. */ - if (want_init_on_alloc(flags)) + /* Zero out "freed" memory. */ + if (want_init_on_free()) memset((void *)p + size, 0, old_size - size); + vm->requested_size = size; kasan_poison_vmalloc(p + size, old_size - size); - kasan_unpoison_vmalloc(p, size, KASAN_VMALLOC_PROT_NORMAL); return (void *)p; } + /* + * We already have the bytes available in the allocation; use them. + */ + if (size <= alloced_size) { + kasan_unpoison_vmalloc(p + old_size, size - old_size, + KASAN_VMALLOC_PROT_NORMAL); + /* Zero out "alloced" memory. */ + if (want_init_on_alloc(flags)) + memset((void *)p + old_size, 0, size - old_size); + vm->requested_size = size; + } + /* TODO: Grow the vm_area, i.e. allocate and map additional pages. */ n = __vmalloc_noprof(size, flags); if (!n) -- cgit v1.2.3