From 077772468ec141b22e1e7c0c58bc09e2f9dc8762 Mon Sep 17 00:00:00 2001 From: Wang Dongsheng Date: Sun, 1 Jul 2018 23:15:46 -0700 Subject: net: phy: marvell: change default m88e1510 LED configuration The m88e1121 LED default configuration does not apply to m88e151x. So add a function to replace m88e1121 LED configuration. Signed-off-by: Wang Dongsheng Signed-off-by: David S. Miller --- include/linux/marvell_phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h index 4f5f8c21e283..1eb6f244588d 100644 --- a/include/linux/marvell_phy.h +++ b/include/linux/marvell_phy.h @@ -27,6 +27,8 @@ */ #define MARVELL_PHY_ID_88E6390 0x01410f90 +#define MARVELL_PHY_FAMILY_ID(id) ((id) >> 4) + /* struct phy_device dev_flags definitions */ #define MARVELL_PHY_M1145_FLAGS_RESISTANCE 0x00000001 #define MARVELL_PHY_M1118_DNS323_LEDS 0x00000002 -- cgit v1.2.3 From 000244d3dc1f8114e38fe9ee2d9a0986404d9cbe Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 6 Jul 2018 14:44:02 +0200 Subject: net: bridge: fix br_vlan_get_{pvid,info} return values These two functions return the regular -EINVAL failure in the normal code path, but return a nonstandard '-1' error otherwise, which gets interpreted as -EPERM. Let's change it to -EINVAL for the dummy functions as well. Fixes: 4d4fd36126d6 ("net: bridge: Publish bridge accessor functions") Signed-off-by: Arnd Bergmann Acked-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/linux/if_bridge.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 7843b98e1c6e..c20c7e197d07 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -105,13 +105,13 @@ static inline bool br_vlan_enabled(const struct net_device *dev) static inline int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) { - return -1; + return -EINVAL; } static inline int br_vlan_get_info(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo) { - return -1; + return -EINVAL; } #endif -- cgit v1.2.3 From d8d7218ad842e18fc6976b87c08ed749e8d56313 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 6 Jul 2018 11:49:00 +0900 Subject: xdp: XDP_REDIRECT should check IFF_UP and MTU Otherwise we end up with attempting to send packets from down devices or to send oversized packets, which may cause unexpected driver/device behaviour. Generic XDP has already done this check, so reuse the logic in native XDP. 
Fixes: 814abfabef3c ("xdp: add bpf_redirect helper function") Signed-off-by: Toshiaki Makita Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 6 +++--- kernel/bpf/devmap.c | 7 ++++++- net/core/filter.c | 9 +++++++-- 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 300baad62c88..c73dd7396886 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -765,8 +765,8 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); -static inline int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, - struct net_device *fwd) +static inline int xdp_ok_fwd_dev(const struct net_device *fwd, + unsigned int pktlen) { unsigned int len; @@ -774,7 +774,7 @@ static inline int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, return -ENETDOWN; len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; - if (skb->len > len) + if (pktlen > len) return -EMSGSIZE; return 0; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 642c97f6d1b8..d361fc1e3bf3 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -334,10 +334,15 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; struct xdp_frame *xdpf; + int err; if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; + err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + if (unlikely(err)) + return err; + xdpf = convert_to_xdp_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; @@ -350,7 +355,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, { int err; - err = __xdp_generic_ok_fwd_dev(skb, dst->dev); + err = xdp_ok_fwd_dev(dst->dev, skb->len); if (unlikely(err)) return err; skb->dev = dst->dev; diff --git a/net/core/filter.c b/net/core/filter.c index 470268024a40..5fa66a33927f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ 
-3128,12 +3128,16 @@ static int __bpf_tx_xdp(struct net_device *dev, u32 index) { struct xdp_frame *xdpf; - int sent; + int err, sent; if (!dev->netdev_ops->ndo_xdp_xmit) { return -EOPNOTSUPP; } + err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + if (unlikely(err)) + return err; + xdpf = convert_to_xdp_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; @@ -3367,7 +3371,8 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, goto err; } - if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) + err = xdp_ok_fwd_dev(fwd, skb->len); + if (unlikely(err)) goto err; skb->dev = fwd; -- cgit v1.2.3 From f292b87d3ac020418644d8a4bbf29814890505cb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 6 Jul 2018 14:34:29 -0700 Subject: bpf: include errno.h from bpf-cgroup.h Commit fdb5c4531c1e ("bpf: fix attach type BPF_LIRC_MODE2 dependency wrt CONFIG_CGROUP_BPF") caused some build issues, detected by 0-DAY kernel test infrastructure. The problem is that cgroup_bpf_prog_attach/detach/query() functions can return -EINVAL error code, which is not defined. Fix this adding errno.h to includes. Fixes: fdb5c4531c1e ("bpf: fix attach type BPF_LIRC_MODE2 dependency wrt CONFIG_CGROUP_BPF") Signed-off-by: Roman Gushchin Cc: Sean Young Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 79795c5fa7c3..d50c2f0a655a 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -2,6 +2,7 @@ #ifndef _BPF_CGROUP_H #define _BPF_CGROUP_H +#include #include #include -- cgit v1.2.3 From b4e7a7a88b5d060650094b8d3454bc521d669f6a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 8 Jun 2018 11:17:54 -0400 Subject: drm_mode_create_lease_ioctl(): fix open-coded filp_clone_open() Failure of ->open() should *not* be followed by fput(). 
Fixed by using filp_clone_open(), which gets the cleanups right. Cc: stable@vger.kernel.org Acked-by: Linus Torvalds Signed-off-by: Al Viro --- drivers/gpu/drm/drm_lease.c | 16 +--------------- fs/internal.h | 1 - include/linux/fs.h | 1 + 3 files changed, 2 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/drm_lease.c b/drivers/gpu/drm/drm_lease.c index 50c73c0a20b9..d638c0fb3418 100644 --- a/drivers/gpu/drm/drm_lease.c +++ b/drivers/gpu/drm/drm_lease.c @@ -553,24 +553,13 @@ int drm_mode_create_lease_ioctl(struct drm_device *dev, /* Clone the lessor file to create a new file for us */ DRM_DEBUG_LEASE("Allocating lease file\n"); - path_get(&lessor_file->f_path); - lessee_file = alloc_file(&lessor_file->f_path, - lessor_file->f_mode, - fops_get(lessor_file->f_inode->i_fop)); - + lessee_file = filp_clone_open(lessor_file); if (IS_ERR(lessee_file)) { ret = PTR_ERR(lessee_file); goto out_lessee; } - /* Initialize the new file for DRM */ - DRM_DEBUG_LEASE("Initializing the file with %p\n", lessee_file->f_op->open); - ret = lessee_file->f_op->open(lessee_file->f_inode, lessee_file); - if (ret) - goto out_lessee_file; - lessee_priv = lessee_file->private_data; - /* Change the file to a master one */ drm_master_put(&lessee_priv->master); lessee_priv->master = lessee; @@ -588,9 +577,6 @@ int drm_mode_create_lease_ioctl(struct drm_device *dev, DRM_DEBUG_LEASE("drm_mode_create_lease_ioctl succeeded\n"); return 0; -out_lessee_file: - fput(lessee_file); - out_lessee: drm_master_put(&lessee); diff --git a/fs/internal.h b/fs/internal.h index 980d005b21b4..5645b4ebf494 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -127,7 +127,6 @@ int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, extern int open_check_o_direct(struct file *f); extern int vfs_open(const struct path *, struct file *, const struct cred *); -extern struct file *filp_clone_open(struct file *); /* * inode.c diff --git a/include/linux/fs.h 
b/include/linux/fs.h index 5c91108846db..aa9b4c169ed2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2422,6 +2422,7 @@ extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(struct dentry *, struct vfsmount *, const char *, int, umode_t); extern struct file * dentry_open(const struct path *, int, const struct cred *); +extern struct file *filp_clone_open(struct file *); extern int filp_close(struct file *, fl_owner_t id); extern struct filename *getname_flags(const char __user *, int, int *); -- cgit v1.2.3 From 8b7008620b8452728cadead460a36f64ed78c460 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 11 Jul 2018 14:39:42 +0200 Subject: net: Don't copy pfmemalloc flag in __copy_skb_header() The pfmemalloc flag indicates that the skb was allocated from the PFMEMALLOC reserves, and the flag is currently copied on skb copy and clone. However, an skb copied from an skb flagged with pfmemalloc wasn't necessarily allocated from PFMEMALLOC reserves, and on the other hand an skb allocated that way might be copied from an skb that wasn't. So we should not copy the flag on skb copy, and rather decide whether to allow an skb to be associated with sockets unrelated to page reclaim depending only on how it was allocated. Move the pfmemalloc flag before headers_start[0] using an existing 1-bit hole, so that __copy_skb_header() doesn't copy it. When cloning, we'll now take care of this flag explicitly, contravening to the warning comment of __skb_clone(). While at it, restore the newline usage introduced by commit b19372273164 ("net: reorganize sk_buff for faster __copy_skb_header()") to visually separate bytes used in bitfields after headers_start[0], that was gone after commit a9e419dc7be6 ("netfilter: merge ctinfo into nfct pointer storage area"), and describe the pfmemalloc flag in the kernel-doc structure comment. 
This doesn't change the size of sk_buff or cacheline boundaries, but consolidates the 15 bits hole before tc_index into a 2 bytes hole before csum, that could now be filled more easily. Reported-by: Patrick Talbert Fixes: c93bdd0e03e8 ("netvm: allow skb allocation to use PFMEMALLOC reserves") Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- include/linux/skbuff.h | 10 +++++----- net/core/skbuff.c | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 164cdedf6012..610a201126ee 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -630,6 +630,7 @@ typedef unsigned char *sk_buff_data_t; * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices * @xmit_more: More SKBs are pending for this queue + * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport @@ -735,7 +736,7 @@ struct sk_buff { peeked:1, head_frag:1, xmit_more:1, - __unused:1; /* one bit hole */ + pfmemalloc:1; /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() @@ -754,31 +755,30 @@ struct sk_buff { __u8 __pkt_type_offset[0]; __u8 pkt_type:3; - __u8 pfmemalloc:1; __u8 ignore_df:1; - __u8 nf_trace:1; __u8 ip_summed:2; __u8 ooo_okay:1; + __u8 l4_hash:1; __u8 sw_hash:1; __u8 wifi_acked_valid:1; __u8 wifi_acked:1; - __u8 no_fcs:1; /* Indicates the inner headers are valid in the skbuff. 
*/ __u8 encapsulation:1; __u8 encap_hdr_csum:1; __u8 csum_valid:1; + __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 csum_not_inet:1; - __u8 dst_pending_confirm:1; #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif __u8 ipvs_property:1; + __u8 inner_protocol_type:1; __u8 remcsum_offload:1; #ifdef CONFIG_NET_SWITCHDEV diff --git a/net/core/skbuff.c b/net/core/skbuff.c index eba8dae22c25..4df3164bb5fc 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -858,6 +858,8 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) n->cloned = 1; n->nohdr = 0; n->peeked = 0; + if (skb->pfmemalloc) + n->pfmemalloc = 1; n->destructor = NULL; C(tail); C(end); -- cgit v1.2.3 From d1b47a7c9efcf3c3384b70f6e3c8f1423b44d8c7 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Mon, 16 Jul 2018 11:16:30 -0400 Subject: mm: don't do zero_resv_unavail if memmap is not allocated Moving zero_resv_unavail before memmap_init_zone(), caused a regression on x86-32. The cause is that we access struct pages before they are allocated when CONFIG_FLAT_NODE_MEM_MAP is used. free_area_init_nodes() zero_resv_unavail() mm_zero_struct_page(pfn_to_page(pfn)); <- struct page is not alloced free_area_init_node() if CONFIG_FLAT_NODE_MEM_MAP alloc_node_mem_map() memblock_virt_alloc_node_nopanic() <- struct page alloced here On the other hand memblock_virt_alloc_node_nopanic() zeroes all the memory that it returns, so we do not need to do zero_resv_unavail() here. 
Fixes: e181ae0c5db9 ("mm: zero unavailable pages before memmap init") Signed-off-by: Pavel Tatashin Tested-by: Matt Hart Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/page_alloc.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a0fbb9ffe380..3982c83fdcbf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2132,7 +2132,7 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn, struct mminit_pfnnid_cache *state); #endif -#ifdef CONFIG_HAVE_MEMBLOCK +#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) void zero_resv_unavail(void); #else static inline void zero_resv_unavail(void) {} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5d800d61ddb7..a790ef4be74e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6383,7 +6383,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat); } -#ifdef CONFIG_HAVE_MEMBLOCK +#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) /* * Only struct pages that are backed by physical memory are zeroed and * initialized by going through __init_single_page(). 
But, there are some @@ -6421,7 +6421,7 @@ void __paginginit zero_resv_unavail(void) if (pgcnt) pr_info("Reserved but unavailable: %lld pages", pgcnt); } -#endif /* CONFIG_HAVE_MEMBLOCK */ +#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */ #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -- cgit v1.2.3 From 6e2059b53f9885f202b086d7b4ca10a98926e974 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 10 Jul 2018 22:41:26 +0800 Subject: ipv4/igmp: init group mode as INCLUDE when join source group Based on RFC3376 5.1 If no interface state existed for that multicast address before the change (i.e., the change consisted of creating a new per-interface record), or if no state exists after the change (i.e., the change consisted of deleting a per-interface record), then the "non-existent" state is considered to have a filter mode of INCLUDE and an empty source list. Which means a new multicast group should start with state IN(). Function ip_mc_join_group() works correctly for IGMP ASM(Any-Source Multicast) mode. It adds a group with state EX() and inits crcount to mc_qrv, so the kernel will send a TO_EX() report message after adding group. But for IGMPv3 SSM(Source-specific multicast) JOIN_SOURCE_GROUP mode, we split the group joining into two steps. First we join the group like ASM, i.e. via ip_mc_join_group(). So the state changes from IN() to EX(). Then we add the source-specific address with INCLUDE mode. So the state changes from EX() to IN(A). Before the first step sends a group change record, we finished the second step. So we will only send the second change record. i.e. TO_IN(A). Regarding the RFC stands, we should actually send an ALLOW(A) message for SSM JOIN_SOURCE_GROUP as the state should mimic the 'IN() to IN(A)' transition. The issue was exposed by commit a052517a8ff65 ("net/multicast: should not send source list records when have filter mode change"). Before this change, we used to send both ALLOW(A) and TO_IN(A). After this change we only send TO_IN(A). 
Fix it by adding a new parameter to init group mode. Also add new wrapper functions so we don't need to change too much code. v1 -> v2: In my first version I only cleared the group change record. But this is not enough. Because when a new group joins, it will init as EXCLUDE and trigger a filter mode change in ip/ip6_mc_add_src(), which will clear all source addresses' sf_crcount. This will prevent early joined addresses from sending state change records if multiple source addresses are joined at the same time. In v2 patch, I fixed it by directly initializing the mode to INCLUDE for SSM JOIN_SOURCE_GROUP. I also split the original patch into two separate patches for IPv4 and IPv6. Fixes: a052517a8ff65 ("net/multicast: should not send source list records when have filter mode change") Reviewed-by: Stefano Brivio Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/linux/igmp.h | 2 ++ net/ipv4/igmp.c | 58 ++++++++++++++++++++++++++++++++++++-------------- net/ipv4/ip_sockglue.c | 4 ++-- 3 files changed, 46 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index f8231854b5d6..119f53941c12 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -109,6 +109,8 @@ struct ip_mc_list { extern int ip_check_mc_rcu(struct in_device *dev, __be32 mc_addr, __be32 src_addr, u8 proto); extern int igmp_rcv(struct sk_buff *); extern int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr); +extern int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr, + unsigned int mode); extern int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr); extern void ip_mc_drop_socket(struct sock *sk); extern int ip_mc_source(int add, int omode, struct sock *sk, diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 85b617b655bc..b3c899a630a0 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1200,13 +1200,14 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) 
spin_lock_bh(&im->lock); if (pmc) { im->interface = pmc->interface; - im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; im->sfmode = pmc->sfmode; if (pmc->sfmode == MCAST_INCLUDE) { im->tomb = pmc->tomb; im->sources = pmc->sources; for (psf = im->sources; psf; psf = psf->sf_next) - psf->sf_crcount = im->crcount; + psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + } else { + im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; } in_dev_put(pmc->interface); kfree(pmc); @@ -1288,7 +1289,7 @@ static void igmp_group_dropped(struct ip_mc_list *im) #endif } -static void igmp_group_added(struct ip_mc_list *im) +static void igmp_group_added(struct ip_mc_list *im, unsigned int mode) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST @@ -1316,7 +1317,13 @@ static void igmp_group_added(struct ip_mc_list *im) } /* else, v3 */ - im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + /* Based on RFC3376 5.1, for newly added INCLUDE SSM, we should + * not send filter-mode change record as the mode should be from + * IN() to IN(A). + */ + if (mode == MCAST_EXCLUDE) + im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + igmp_ifc_event(in_dev); #endif } @@ -1381,8 +1388,7 @@ static void ip_mc_hash_remove(struct in_device *in_dev, /* * A socket has joined a multicast group on device dev. 
*/ - -void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) +void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, unsigned int mode) { struct ip_mc_list *im; #ifdef CONFIG_IP_MULTICAST @@ -1394,7 +1400,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == addr) { im->users++; - ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0); + ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0); goto out; } } @@ -1408,8 +1414,8 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) in_dev_hold(in_dev); im->multiaddr = addr; /* initial mode is (EX, empty) */ - im->sfmode = MCAST_EXCLUDE; - im->sfcount[MCAST_EXCLUDE] = 1; + im->sfmode = mode; + im->sfcount[mode] = 1; refcount_set(&im->refcnt, 1); spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST @@ -1426,12 +1432,17 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, im); #endif - igmp_group_added(im); + igmp_group_added(im, mode); if (!in_dev->dead) ip_rt_multicast_event(in_dev); out: return; } + +void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) +{ + __ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE); +} EXPORT_SYMBOL(ip_mc_inc_group); static int ip_mc_check_iphdr(struct sk_buff *skb) @@ -1688,7 +1699,7 @@ void ip_mc_remap(struct in_device *in_dev) #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif - igmp_group_added(pmc); + igmp_group_added(pmc, pmc->sfmode); } } @@ -1751,7 +1762,7 @@ void ip_mc_up(struct in_device *in_dev) #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif - igmp_group_added(pmc); + igmp_group_added(pmc, pmc->sfmode); } } @@ -2130,8 +2141,8 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc) /* Join a multicast group */ - -int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) +static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr, + unsigned int mode) { __be32 addr = imr->imr_multiaddr.s_addr; 
struct ip_mc_socklist *iml, *i; @@ -2172,15 +2183,30 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) memcpy(&iml->multi, imr, sizeof(*imr)); iml->next_rcu = inet->mc_list; iml->sflist = NULL; - iml->sfmode = MCAST_EXCLUDE; + iml->sfmode = mode; rcu_assign_pointer(inet->mc_list, iml); - ip_mc_inc_group(in_dev, addr); + __ip_mc_inc_group(in_dev, addr, mode); err = 0; done: return err; } + +/* Join ASM (Any-Source Multicast) group + */ +int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) +{ + return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE); +} EXPORT_SYMBOL(ip_mc_join_group); +/* Join SSM (Source-Specific Multicast) group + */ +int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr, + unsigned int mode) +{ + return __ip_mc_join_group(sk, imr, mode); +} + static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct in_device *in_dev) { diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index fc32fdbeefa6..64c76dcf7386 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -984,7 +984,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr; mreq.imr_address.s_addr = mreqs.imr_interface; mreq.imr_ifindex = 0; - err = ip_mc_join_group(sk, &mreq); + err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) break; omode = MCAST_INCLUDE; @@ -1061,7 +1061,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, mreq.imr_multiaddr = psin->sin_addr; mreq.imr_address.s_addr = 0; mreq.imr_ifindex = greqs.gsr_interface; - err = ip_mc_join_group(sk, &mreq); + err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) break; greqs.gsr_interface = mreq.imr_ifindex; -- cgit v1.2.3 From c133459765fae249ba482f62e12f987aec4376f0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 13 Jul 2018 21:25:19 -0700 Subject: net/ethernet/freescale/fman: fix cross-build error CC [M] 
drivers/net/ethernet/freescale/fman/fman.o In file included from ../drivers/net/ethernet/freescale/fman/fman.c:35: ../include/linux/fsl/guts.h: In function 'guts_set_dmacr': ../include/linux/fsl/guts.h:165:2: error: implicit declaration of function 'clrsetbits_be32' [-Werror=implicit-function-declaration] clrsetbits_be32(&guts->dmacr, 3 << shift, device << shift); ^~~~~~~~~~~~~~~ Signed-off-by: Randy Dunlap Cc: Madalin Bucur Cc: netdev@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: David S. Miller --- include/linux/fsl/guts.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fsl/guts.h b/include/linux/fsl/guts.h index 3efa3b861d44..941b11811f85 100644 --- a/include/linux/fsl/guts.h +++ b/include/linux/fsl/guts.h @@ -16,6 +16,7 @@ #define __FSL_GUTS_H__ #include +#include /** * Global Utility Registers. -- cgit v1.2.3 From 9ba546c01976a426292af99e682a557075d6c010 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Jul 2018 15:48:46 +0200 Subject: aio: don't expose __aio_sigset in uapi glibc uses a different definition of sigset_t than the kernel does, and the current version would pull in both. To fix this just do not expose the type at all - this somewhat mirrors pselect() where we do not even have a type for the magic sigmask argument, but just use pointer arithmetic. 
Fixes: 7a074e96 ("aio: implement io_pgetevents") Signed-off-by: Christoph Hellwig Reported-by: Adrian Reber Signed-off-by: Al Viro --- fs/aio.c | 5 +++++ include/linux/syscalls.h | 1 + include/uapi/linux/aio_abi.h | 6 ------ 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/aio.c b/fs/aio.c index e1d20124ec0e..b1a42e45698b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -2042,6 +2042,11 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, return ret; } +struct __aio_sigset { + const sigset_t __user *sigmask; + size_t sigsetsize; +}; + SYSCALL_DEFINE6(io_pgetevents, aio_context_t, ctx_id, long, min_nr, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 73810808cdf2..b06b5eeda8e8 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -11,6 +11,7 @@ #ifndef _LINUX_SYSCALLS_H #define _LINUX_SYSCALLS_H +struct __aio_sigset; struct epoll_event; struct iattr; struct inode; diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index d00221345c19..ce43d340f010 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -29,7 +29,6 @@ #include #include -#include #include typedef __kernel_ulong_t aio_context_t; @@ -108,10 +107,5 @@ struct iocb { #undef IFBIG #undef IFLITTLE -struct __aio_sigset { - const sigset_t __user *sigmask; - size_t sigsetsize; -}; - #endif /* __LINUX__AIO_ABI_H */ -- cgit v1.2.3 From a5fb9fb023a1435f2b42bccd7f547560f3a21dc3 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Wed, 18 Jul 2018 15:40:26 -0500 Subject: PCI: OF: Fix I/O space page leak When testing the R-Car PCIe driver on the Condor board, if the PCIe PHY driver was left disabled, the kernel crashed with this BUG: kernel BUG at lib/ioremap.c:72! 
Internal error: Oops - BUG: 0 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 39 Comm: kworker/0:1 Not tainted 4.17.0-dirty #1092 Hardware name: Renesas Condor board based on r8a77980 (DT) Workqueue: events deferred_probe_work_func pstate: 80000005 (Nzcv daif -PAN -UAO) pc : ioremap_page_range+0x370/0x3c8 lr : ioremap_page_range+0x40/0x3c8 sp : ffff000008da39e0 x29: ffff000008da39e0 x28: 00e8000000000f07 x27: ffff7dfffee00000 x26: 0140000000000000 x25: ffff7dfffef00000 x24: 00000000000fe100 x23: ffff80007b906000 x22: ffff000008ab8000 x21: ffff000008bb1d58 x20: ffff7dfffef00000 x19: ffff800009c30fb8 x18: 0000000000000001 x17: 00000000000152d0 x16: 00000000014012d0 x15: 0000000000000000 x14: 0720072007200720 x13: 0720072007200720 x12: 0720072007200720 x11: 0720072007300730 x10: 00000000000000ae x9 : 0000000000000000 x8 : ffff7dffff000000 x7 : 0000000000000000 x6 : 0000000000000100 x5 : 0000000000000000 x4 : 000000007b906000 x3 : ffff80007c61a880 x2 : ffff7dfffeefffff x1 : 0000000040000000 x0 : 00e80000fe100f07 Process kworker/0:1 (pid: 39, stack limit = 0x (ptrval)) Call trace: ioremap_page_range+0x370/0x3c8 pci_remap_iospace+0x7c/0xac pci_parse_request_of_pci_ranges+0x13c/0x190 rcar_pcie_probe+0x4c/0xb04 platform_drv_probe+0x50/0xbc driver_probe_device+0x21c/0x308 __device_attach_driver+0x98/0xc8 bus_for_each_drv+0x54/0x94 __device_attach+0xc4/0x12c device_initial_probe+0x10/0x18 bus_probe_device+0x90/0x98 deferred_probe_work_func+0xb0/0x150 process_one_work+0x12c/0x29c worker_thread+0x200/0x3fc kthread+0x108/0x134 ret_from_fork+0x10/0x18 Code: f9004ba2 54000080 aa0003fb 17ffff48 (d4210000) It turned out that pci_remap_iospace() wasn't undone when the driver's probe failed, and since devm_phy_optional_get() returned -EPROBE_DEFER, the probe was retried, finally causing the BUG due to trying to remap already remapped pages. Introduce the devm_pci_remap_iospace() managed API and replace the pci_remap_iospace() call with it to fix the bug. 
Fixes: dbf9826d5797 ("PCI: generic: Convert to DT resource parsing API") Signed-off-by: Sergei Shtylyov [lorenzo.pieralisi@arm.com: split commit/updated the commit log] Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Linus Walleij --- drivers/pci/of.c | 2 +- drivers/pci/pci.c | 38 ++++++++++++++++++++++++++++++++++++++ include/linux/pci.h | 2 ++ 3 files changed, 41 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/of.c b/drivers/pci/of.c index d088c9147f10..69a60d6ebd73 100644 --- a/drivers/pci/of.c +++ b/drivers/pci/of.c @@ -612,7 +612,7 @@ int pci_parse_request_of_pci_ranges(struct device *dev, switch (resource_type(res)) { case IORESOURCE_IO: - err = pci_remap_iospace(res, iobase); + err = devm_pci_remap_iospace(dev, res, iobase); if (err) { dev_warn(dev, "error %d: failed to map resource %pR\n", err, res); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 97acba712e4e..316496e99da9 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -3579,6 +3579,44 @@ void pci_unmap_iospace(struct resource *res) } EXPORT_SYMBOL(pci_unmap_iospace); +static void devm_pci_unmap_iospace(struct device *dev, void *ptr) +{ + struct resource **res = ptr; + + pci_unmap_iospace(*res); +} + +/** + * devm_pci_remap_iospace - Managed pci_remap_iospace() + * @dev: Generic device to remap IO address for + * @res: Resource describing the I/O space + * @phys_addr: physical address of range to be mapped + * + * Managed pci_remap_iospace(). Map is automatically unmapped on driver + * detach. 
+ */ +int devm_pci_remap_iospace(struct device *dev, const struct resource *res, + phys_addr_t phys_addr) +{ + const struct resource **ptr; + int error; + + ptr = devres_alloc(devm_pci_unmap_iospace, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + error = pci_remap_iospace(res, phys_addr); + if (error) { + devres_free(ptr); + } else { + *ptr = res; + devres_add(dev, ptr); + } + + return error; +} +EXPORT_SYMBOL(devm_pci_remap_iospace); + /** * devm_pci_remap_cfgspace - Managed pci_remap_cfgspace() * @dev: Generic device to remap IO address for diff --git a/include/linux/pci.h b/include/linux/pci.h index 340029b2fb38..abd5d5e17aee 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1240,6 +1240,8 @@ int pci_register_io_range(struct fwnode_handle *fwnode, phys_addr_t addr, unsigned long pci_address_to_pio(phys_addr_t addr); phys_addr_t pci_pio_to_address(unsigned long pio); int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr); +int devm_pci_remap_iospace(struct device *dev, const struct resource *res, + phys_addr_t phys_addr); void pci_unmap_iospace(struct resource *res); void __iomem *devm_pci_remap_cfgspace(struct device *dev, resource_size_t offset, -- cgit v1.2.3 From d7037ad73daa9598b8caa7d5fdf41e8ceee6ef73 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 8 Jul 2018 12:14:59 +0300 Subject: net/mlx5: Fix QP fragmented buffer allocation Fix bad alignment of SQ buffer in fragmented QP allocation. It should start directly after RQ buffer ends. Take special care of the end case where the RQ buffer does not occupy a whole page. RQ size is a power of two, so would be the case only for small RQ sizes (RQ size < PAGE_SIZE). Fix wrong assignments for sqb->size (mistakenly assigned RQ size), and for npages value of RQ and SQ. 
Fixes: 3a2f70331226 ("net/mlx5: Use order-0 allocations for all WQ types") Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/alloc.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/wq.c | 34 ++++++++++++++++--------- include/linux/mlx5/driver.h | 18 ++++++++++--- 3 files changed, 38 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c index 323ffe8bf7e4..456f30007ad6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c @@ -123,7 +123,7 @@ int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size, int i; buf->size = size; - buf->npages = 1 << get_order(size); + buf->npages = DIV_ROUND_UP(size, PAGE_SIZE); buf->page_shift = PAGE_SHIFT; buf->frags = kcalloc(buf->npages, sizeof(struct mlx5_buf_list), GFP_KERNEL); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c index b97bb72b4db4..86478a6b99c5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c @@ -113,35 +113,45 @@ err_db_free: return err; } -static void mlx5e_qp_set_frag_buf(struct mlx5_frag_buf *buf, - struct mlx5_wq_qp *qp) +static void mlx5_qp_set_frag_buf(struct mlx5_frag_buf *buf, + struct mlx5_wq_qp *qp) { + struct mlx5_frag_buf_ctrl *sq_fbc; struct mlx5_frag_buf *rqb, *sqb; - rqb = &qp->rq.fbc.frag_buf; + rqb = &qp->rq.fbc.frag_buf; *rqb = *buf; rqb->size = mlx5_wq_cyc_get_byte_size(&qp->rq); - rqb->npages = 1 << get_order(rqb->size); + rqb->npages = DIV_ROUND_UP(rqb->size, PAGE_SIZE); - sqb = &qp->sq.fbc.frag_buf; - *sqb = *buf; - sqb->size = mlx5_wq_cyc_get_byte_size(&qp->rq); - sqb->npages = 1 << get_order(sqb->size); + sq_fbc = &qp->sq.fbc; + sqb = &sq_fbc->frag_buf; + *sqb = *buf; + sqb->size = mlx5_wq_cyc_get_byte_size(&qp->sq); + sqb->npages = 
DIV_ROUND_UP(sqb->size, PAGE_SIZE); sqb->frags += rqb->npages; /* first part is for the rq */ + if (sq_fbc->strides_offset) + sqb->frags--; } int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, void *qpc, struct mlx5_wq_qp *wq, struct mlx5_wq_ctrl *wq_ctrl) { + u32 sq_strides_offset; int err; mlx5_fill_fbc(MLX5_GET(qpc, qpc, log_rq_stride) + 4, MLX5_GET(qpc, qpc, log_rq_size), &wq->rq.fbc); - mlx5_fill_fbc(ilog2(MLX5_SEND_WQE_BB), - MLX5_GET(qpc, qpc, log_sq_size), - &wq->sq.fbc); + + sq_strides_offset = + ((wq->rq.fbc.frag_sz_m1 + 1) % PAGE_SIZE) / MLX5_SEND_WQE_BB; + + mlx5_fill_fbc_offset(ilog2(MLX5_SEND_WQE_BB), + MLX5_GET(qpc, qpc, log_sq_size), + sq_strides_offset, + &wq->sq.fbc); err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node); if (err) { @@ -156,7 +166,7 @@ int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, goto err_db_free; } - mlx5e_qp_set_frag_buf(&wq_ctrl->buf, wq); + mlx5_qp_set_frag_buf(&wq_ctrl->buf, wq); wq->rq.db = &wq_ctrl->db.db[MLX5_RCV_DBR]; wq->sq.db = &wq_ctrl->db.db[MLX5_SND_DBR]; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 80cbb7fdce4a..83957920653a 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -358,6 +358,7 @@ struct mlx5_frag_buf_ctrl { struct mlx5_frag_buf frag_buf; u32 sz_m1; u32 frag_sz_m1; + u32 strides_offset; u8 log_sz; u8 log_stride; u8 log_frag_strides; @@ -983,14 +984,22 @@ static inline u32 mlx5_base_mkey(const u32 key) return key & 0xffffff00u; } -static inline void mlx5_fill_fbc(u8 log_stride, u8 log_sz, - struct mlx5_frag_buf_ctrl *fbc) +static inline void mlx5_fill_fbc_offset(u8 log_stride, u8 log_sz, + u32 strides_offset, + struct mlx5_frag_buf_ctrl *fbc) { fbc->log_stride = log_stride; fbc->log_sz = log_sz; fbc->sz_m1 = (1 << fbc->log_sz) - 1; fbc->log_frag_strides = PAGE_SHIFT - fbc->log_stride; fbc->frag_sz_m1 = (1 << fbc->log_frag_strides) - 1; + fbc->strides_offset = strides_offset; 
+} + +static inline void mlx5_fill_fbc(u8 log_stride, u8 log_sz, + struct mlx5_frag_buf_ctrl *fbc) +{ + mlx5_fill_fbc_offset(log_stride, log_sz, 0, fbc); } static inline void mlx5_core_init_cq_frag_buf(struct mlx5_frag_buf_ctrl *fbc, @@ -1004,7 +1013,10 @@ static inline void mlx5_core_init_cq_frag_buf(struct mlx5_frag_buf_ctrl *fbc, static inline void *mlx5_frag_buf_get_wqe(struct mlx5_frag_buf_ctrl *fbc, u32 ix) { - unsigned int frag = (ix >> fbc->log_frag_strides); + unsigned int frag; + + ix += fbc->strides_offset; + frag = ix >> fbc->log_frag_strides; return fbc->frag_buf.frags[frag].buf + ((fbc->frag_sz_m1 & ix) << fbc->log_stride); -- cgit v1.2.3 From 2db1581e1f432ac6b4efe152c57fdfb4de85c154 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sun, 8 Jul 2018 14:23:21 +0800 Subject: Revert "iommu/vt-d: Clean up pasid quirk for pre-production devices" This reverts commit ab96746aaa344fb720a198245a837e266fad3b62. The commit ab96746aaa34 ("iommu/vt-d: Clean up pasid quirk for pre-production devices") triggers ECS mode on some platforms which have broken ECS support. As the result, graphic device will be inoperable on boot. 
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107017 Cc: Ashok Raj Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 32 ++++++++++++++++++++++++++++++-- include/linux/intel-iommu.h | 1 + 2 files changed, 31 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index b344a883f116..115ff26e9ced 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -484,14 +484,37 @@ static int dmar_forcedac; static int intel_iommu_strict; static int intel_iommu_superpage = 1; static int intel_iommu_ecs = 1; +static int intel_iommu_pasid28; static int iommu_identity_mapping; #define IDENTMAP_ALL 1 #define IDENTMAP_GFX 2 #define IDENTMAP_AZALIA 4 -#define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap)) -#define pasid_enabled(iommu) (ecs_enabled(iommu) && ecap_pasid(iommu->ecap)) +/* Broadwell and Skylake have broken ECS support — normal so-called "second + * level" translation of DMA requests-without-PASID doesn't actually happen + * unless you also set the NESTE bit in an extended context-entry. Which of + * course means that SVM doesn't work because it's trying to do nested + * translation of the physical addresses it finds in the process page tables, + * through the IOVA->phys mapping found in the "second level" page tables. + * + * The VT-d specification was retroactively changed to change the definition + * of the capability bits and pretend that Broadwell/Skylake never happened... + * but unfortunately the wrong bit was changed. It's ECS which is broken, but + * for some reason it was the PASID capability bit which was redefined (from + * bit 28 on BDW/SKL to bit 40 in future). + * + * So our test for ECS needs to eschew those implementations which set the old + * PASID capabiity bit 28, since those are the ones on which ECS is broken. 
+ * Unless we are working around the 'pasid28' limitations, that is, by putting + * the device into passthrough mode for normal DMA and thus masking the bug. + */ +#define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \ + (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap))) +/* PASID support is thus enabled if ECS is enabled and *either* of the old + * or new capability bits are set. */ +#define pasid_enabled(iommu) (ecs_enabled(iommu) && \ + (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap))) int intel_iommu_gfx_mapped; EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); @@ -554,6 +577,11 @@ static int __init intel_iommu_setup(char *str) printk(KERN_INFO "Intel-IOMMU: disable extended context table support\n"); intel_iommu_ecs = 0; + } else if (!strncmp(str, "pasid28", 7)) { + printk(KERN_INFO + "Intel-IOMMU: enable pre-production PASID support\n"); + intel_iommu_pasid28 = 1; + iommu_identity_mapping |= IDENTMAP_GFX; } else if (!strncmp(str, "tboot_noforce", 13)) { printk(KERN_INFO "Intel-IOMMU: not forcing on after tboot. 
This could expose security risk for tboot\n"); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 1df940196ab2..ef169d67df92 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -121,6 +121,7 @@ #define ecap_srs(e) ((e >> 31) & 0x1) #define ecap_ers(e) ((e >> 30) & 0x1) #define ecap_prs(e) ((e >> 29) & 0x1) +#define ecap_broken_pasid(e) ((e >> 28) & 0x1) #define ecap_dis(e) ((e >> 27) & 0x1) #define ecap_nest(e) ((e >> 26) & 0x1) #define ecap_mts(e) ((e >> 25) & 0x1) -- cgit v1.2.3 From 3928d4f5ee37cdc523894f6e549e6aae521d8980 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 21 Jul 2018 13:48:51 -0700 Subject: mm: use helper functions for allocating and freeing vm_area structs The vm_area_struct is one of the most fundamental memory management objects, but the management of it is entirely open-coded everywhere, ranging from allocation and freeing (using kmem_cache_[z]alloc and kmem_cache_free) to initializing all the fields. We want to unify this in order to end up having some unified initialization of the vmas, and the first step to this is to at least have basic allocation functions. Right now those functions are literally just wrappers around the kmem_cache_*() calls. This is a purely mechanical conversion: # new vma: kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL) -> vm_area_alloc() # copy old vma kmem_cache_alloc(vm_area_cachep, GFP_KERNEL) -> vm_area_dup(old) # free vma kmem_cache_free(vm_area_cachep, vma) -> vm_area_free(vma) to the point where the old vma passed in to the vm_area_dup() function isn't even used yet (because I've left all the old manual initialization alone).
Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 4 ++-- arch/ia64/mm/init.c | 8 ++++---- fs/exec.c | 4 ++-- include/linux/mm.h | 4 +++- kernel/fork.c | 21 ++++++++++++++++++--- mm/mmap.c | 22 +++++++++++----------- mm/nommu.c | 8 ++++---- 7 files changed, 44 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 3b38c717008a..e859246badca 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2278,7 +2278,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t DPRINT(("smpl_buf @%p\n", smpl_buf)); /* allocate vma */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (!vma) { DPRINT(("Cannot allocate vma\n")); goto error_kmem; @@ -2346,7 +2346,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t return 0; error: - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); error_kmem: pfm_rvfree(smpl_buf, size); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 18278b448530..3f2321bffb72 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -114,7 +114,7 @@ ia64_init_addr_space (void) * the problem. When the process attempts to write to the register backing store * for the first time, it will get a SEGFAULT in this case. 
*/ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (vma) { INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; @@ -125,7 +125,7 @@ ia64_init_addr_space (void) down_write(&current->mm->mmap_sem); if (insert_vm_struct(current->mm, vma)) { up_write(&current->mm->mmap_sem); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return; } up_write(&current->mm->mmap_sem); @@ -133,7 +133,7 @@ ia64_init_addr_space (void) /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */ if (!(current->personality & MMAP_PAGE_ZERO)) { - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (vma) { INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; @@ -144,7 +144,7 @@ ia64_init_addr_space (void) down_write(&current->mm->mmap_sem); if (insert_vm_struct(current->mm, vma)) { up_write(&current->mm->mmap_sem); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return; } up_write(&current->mm->mmap_sem); diff --git a/fs/exec.c b/fs/exec.c index 2d4e0075bd24..9bd83989ea25 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -290,7 +290,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; - bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + bprm->vma = vma = vm_area_alloc(); if (!vma) return -ENOMEM; @@ -326,7 +326,7 @@ err: up_write(&mm->mmap_sem); err_free: bprm->vma = NULL; - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return err; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 3982c83fdcbf..de2fd86c6154 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -155,7 +155,9 @@ extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, * mmap() functions).
*/ -extern struct kmem_cache *vm_area_cachep; +struct vm_area_struct *vm_area_alloc(void); +struct vm_area_struct *vm_area_dup(struct vm_area_struct *); +void vm_area_free(struct vm_area_struct *); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; diff --git a/kernel/fork.c b/kernel/fork.c index 9440d61b925c..0e23deb5acfc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -303,11 +303,26 @@ struct kmem_cache *files_cachep; struct kmem_cache *fs_cachep; /* SLAB cache for vm_area_struct structures */ -struct kmem_cache *vm_area_cachep; +static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; +struct vm_area_struct *vm_area_alloc(void) +{ + return kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); +} + +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) +{ + return kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); +} + +void vm_area_free(struct vm_area_struct *vma) +{ + kmem_cache_free(vm_area_cachep, vma); +} + static void account_kernel_stack(struct task_struct *tsk, int account) { void *stack = task_stack_page(tsk); @@ -455,7 +470,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto fail_nomem; charge = len; } - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + tmp = vm_area_dup(mpnt); if (!tmp) goto fail_nomem; *tmp = *mpnt; @@ -539,7 +554,7 @@ fail_uprobe_end: fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: - kmem_cache_free(vm_area_cachep, tmp); + vm_area_free(tmp); fail_nomem: retval = -ENOMEM; vm_unacct_memory(charge); diff --git a/mm/mmap.c b/mm/mmap.c index 5801b5f0a634..4286ad2dd1f5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -182,7 +182,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return next; } @@ -911,7 +911,7 @@ again: anon_vma_merge(vma, next); mm->map_count--; 
mpol_put(vma_policy(next)); - kmem_cache_free(vm_area_cachep, next); + vm_area_free(next); /* * In mprotect's case 6 (see comments on vma_merge), * we must remove another next too. It would clutter @@ -1729,7 +1729,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (!vma) { error = -ENOMEM; goto unacct_error; @@ -1832,7 +1832,7 @@ allow_write_and_free_vma: if (vm_flags & VM_DENYWRITE) allow_write_access(file); free_vma: - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); unacct_error: if (charged) vm_unacct_memory(charged); @@ -2620,7 +2620,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return err; } - new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + new = vm_area_dup(vma); if (!new) return -ENOMEM; @@ -2669,7 +2669,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, out_free_mpol: mpol_put(vma_policy(new)); out_free_vma: - kmem_cache_free(vm_area_cachep, new); + vm_area_free(new); return err; } @@ -2984,7 +2984,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla /* * create a vma struct for an anonymous mapping */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (!vma) { vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; @@ -3202,7 +3202,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, } *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { - new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + new_vma = vm_area_dup(vma); if (!new_vma) goto out; *new_vma = *vma; @@ -3226,7 +3226,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: - kmem_cache_free(vm_area_cachep, new_vma); + vm_area_free(new_vma); out: return 
NULL; } @@ -3350,7 +3350,7 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); @@ -3376,7 +3376,7 @@ static struct vm_area_struct *__install_special_mapping( return vma; out: - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return ERR_PTR(ret); } diff --git a/mm/nommu.c b/mm/nommu.c index 4452d8bd9ae4..006e3fe65017 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -769,7 +769,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); put_nommu_region(vma->vm_region); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); } /* @@ -1204,7 +1204,7 @@ unsigned long do_mmap(struct file *file, if (!region) goto error_getting_region; - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + vma = vm_area_alloc(); if (!vma) goto error_getting_vma; @@ -1368,7 +1368,7 @@ error: kmem_cache_free(vm_region_jar, region); if (vma->vm_file) fput(vma->vm_file); - kmem_cache_free(vm_area_cachep, vma); + vm_area_free(vma); return ret; sharing_violation: @@ -1469,7 +1469,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (!region) return -ENOMEM; - new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + new = vm_area_dup(vma); if (!new) { kmem_cache_free(vm_region_jar, region); return -ENOMEM; -- cgit v1.2.3 From 490fc053865c9cc40f1085ef8a5504f5341f79d2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 21 Jul 2018 15:24:03 -0700 Subject: mm: make vm_area_alloc() initialize core fields Like vm_area_dup(), it initializes the anon_vma_chain head, and the basic mm pointer. The rest of the fields end up being different for different users, although the plan is to also initialize the 'vm_ops' field to a dummy entry. 
Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 4 +--- arch/ia64/mm/init.c | 8 ++------ fs/exec.c | 4 +--- include/linux/mm.h | 2 +- kernel/fork.c | 10 ++++++++-- mm/mmap.c | 12 +++--------- mm/nommu.c | 3 +-- 7 files changed, 17 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index e859246badca..46bff1661836 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2278,17 +2278,15 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t DPRINT(("smpl_buf @%p\n", smpl_buf)); /* allocate vma */ - vma = vm_area_alloc(); + vma = vm_area_alloc(mm); if (!vma) { DPRINT(("Cannot allocate vma\n")); goto error_kmem; } - INIT_LIST_HEAD(&vma->anon_vma_chain); /* * partially initialize the vma for the sampling buffer */ - vma->vm_mm = mm; vma->vm_file = get_file(filp); vma->vm_flags = VM_READ|VM_MAYREAD|VM_DONTEXPAND|VM_DONTDUMP; vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 3f2321bffb72..bdb14a369137 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -114,10 +114,8 @@ ia64_init_addr_space (void) * the problem. When the process attempts to write to the register backing store * for the first time, it will get a SEGFAULT in this case. 
*/ - vma = vm_area_alloc(); + vma = vm_area_alloc(current->mm); if (vma) { - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = current->mm; vma->vm_start = current->thread.rbs_bot & PAGE_MASK; vma->vm_end = vma->vm_start + PAGE_SIZE; vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT; @@ -133,10 +131,8 @@ ia64_init_addr_space (void) /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */ if (!(current->personality & MMAP_PAGE_ZERO)) { - vma = vm_area_alloc(); + vma = vm_area_alloc(current->mm); if (vma) { - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = current->mm; vma->vm_end = PAGE_SIZE; vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | diff --git a/fs/exec.c b/fs/exec.c index 9bd83989ea25..72e961a62adb 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -290,7 +290,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; - bprm->vma = vma = vm_area_alloc(); + bprm->vma = vma = vm_area_alloc(mm); if (!vma) return -ENOMEM; @@ -298,7 +298,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm) err = -EINTR; goto err_free; } - vma->vm_mm = mm; /* * Place the stack at the largest stack address the architecture @@ -311,7 +310,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm) vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - INIT_LIST_HEAD(&vma->anon_vma_chain); err = insert_vm_struct(mm, vma); if (err) diff --git a/include/linux/mm.h b/include/linux/mm.h index de2fd86c6154..d3a3842316b8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -155,7 +155,7 @@ extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, * mmap() functions). 
*/ -struct vm_area_struct *vm_area_alloc(void); +struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); diff --git a/kernel/fork.c b/kernel/fork.c index 67253e41bfb0..a191c05e757d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -308,9 +308,15 @@ static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -struct vm_area_struct *vm_area_alloc(void) +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { - return kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + + if (vma) { + vma->vm_mm = mm; + INIT_LIST_HEAD(&vma->anon_vma_chain); + } + return vma; } struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) diff --git a/mm/mmap.c b/mm/mmap.c index b0ed8ce1b67e..ff1944d8d458 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1729,19 +1729,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. 
*/ - vma = vm_area_alloc(); + vma = vm_area_alloc(mm); if (!vma) { error = -ENOMEM; goto unacct_error; } - vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_flags = vm_flags; vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; - INIT_LIST_HEAD(&vma->anon_vma_chain); if (file) { if (vm_flags & VM_DENYWRITE) { @@ -2979,14 +2977,12 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla /* * create a vma struct for an anonymous mapping */ - vma = vm_area_alloc(); + vma = vm_area_alloc(mm); if (!vma) { vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; } - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_pgoff = pgoff; @@ -3343,12 +3339,10 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; - vma = vm_area_alloc(); + vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&vma->anon_vma_chain); - vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; diff --git a/mm/nommu.c b/mm/nommu.c index c2560e9cc803..1d22fdbf7d7c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1204,7 +1204,7 @@ unsigned long do_mmap(struct file *file, if (!region) goto error_getting_region; - vma = vm_area_alloc(); + vma = vm_area_alloc(current->mm); if (!vma) goto error_getting_vma; @@ -1212,7 +1212,6 @@ unsigned long do_mmap(struct file *file, region->vm_flags = vm_flags; region->vm_pgoff = pgoff; - INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_flags = vm_flags; vma->vm_pgoff = pgoff; -- cgit v1.2.3 From f95de8aa9f824d96421cb7ca81552b4ad8768a31 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 19 Jul 2018 15:56:59 +0800 Subject: bpfilter: Fix mismatch in function argument types Fix following warning: net/ipv4/bpfilter/sockopt.c:28:5: error: symbol 'bpfilter_ip_set_sockopt' redeclared with different type net/ipv4/bpfilter/sockopt.c:34:5: error: symbol 
'bpfilter_ip_get_sockopt' redeclared with different type Signed-off-by: YueHaibing Acked-by: Martin KaFai Lau Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpfilter.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h index 687b1760bb9f..f02cee0225d4 100644 --- a/include/linux/bpfilter.h +++ b/include/linux/bpfilter.h @@ -5,10 +5,10 @@ #include struct sock; -int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char *optval, +int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen); -int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char *optval, - int *optlen); +int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, + int __user *optlen); extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname, char __user *optval, unsigned int optlen, bool is_set); -- cgit v1.2.3 From f88a333b44318643282b8acc92af90deda441f5e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 22 Jul 2018 15:07:11 +0100 Subject: alpha: fix osf_wait4() breakage kernel_wait4() expects a userland address for status - it's only rusage that goes as a kernel one (and needs a copyout afterwards) [ Also, fix the prototype of kernel_wait4() to have that __user annotation - Linus ] Fixes: 92ebce5ac55d ("osf_wait4: switch to kernel_wait4()") Cc: stable@kernel.org # v4.13+ Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/alpha/kernel/osf_sys.c | 5 +---- include/linux/sched/task.h | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 6e921754c8fc..c210a25dd6da 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1180,13 +1180,10 @@ SYSCALL_DEFINE2(osf_getrusage, int, who, struct rusage32 __user *, ru) SYSCALL_DEFINE4(osf_wait4, pid_t, pid, int __user 
*, ustatus, int, options, struct rusage32 __user *, ur) { - unsigned int status = 0; struct rusage r; - long err = kernel_wait4(pid, &status, options, &r); + long err = kernel_wait4(pid, ustatus, options, &r); if (err <= 0) return err; - if (put_user(status, ustatus)) - return -EFAULT; if (!ur) return err; if (put_tv_to_tv32(&ur->ru_utime, &r.ru_utime)) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 5be31eb7b266..108ede99e533 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -75,7 +75,7 @@ extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); -extern long kernel_wait4(pid_t, int *, int, struct rusage *); +extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); extern void free_task(struct task_struct *tsk); -- cgit v1.2.3 From 0fc09f920983f61be625658c62cc40ac25a7b3a5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 23 Jul 2018 08:37:50 -0600 Subject: blk-mq: export setting request completion state This is preparing for drivers that want to directly alter the state of their requests. No functional change here. 
Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 +--- include/linux/blk-mq.h | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index d394cdd8d8c6..5291a95ba362 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -558,10 +558,8 @@ static void __blk_mq_complete_request(struct request *rq) bool shared = false; int cpu; - if (cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) != - MQ_RQ_IN_FLIGHT) + if (!blk_mq_mark_complete(rq)) return; - if (rq->internal_tag != -1) blk_mq_sched_completed_request(rq); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index e3147eb74222..ca3f2c2edd85 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -287,6 +287,20 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); void blk_mq_quiesce_queue_nowait(struct request_queue *q); +/** + * blk_mq_mark_complete() - Set request state to complete + * @rq: request to set to complete state + * + * Returns true if request state was successfully set to complete. If + * successful, the caller is responsibile for seeing this request is ended, as + * blk_mq_complete_request will not work again. + */ +static inline bool blk_mq_mark_complete(struct request *rq) +{ + return cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) == + MQ_RQ_IN_FLIGHT; +} + /* * Driver command data is immediately after the request. So subtract request * size to get back to the original request, add request size to get the PDU. -- cgit v1.2.3 From 73c8d8945505acdcbae137c2e00a1232e0be709f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 14 Jul 2018 01:28:15 +0900 Subject: ring_buffer: tracing: Inherit the tracing setting to next ring buffer Maintain the tracing on/off setting of the ring_buffer when switching to the trace buffer snapshot. 
Taking a snapshot is done by swapping the backup ring buffer (max_tr_buffer). But since the tracing on/off setting is defined by the ring buffer, when swapping it, the tracing on/off setting can also be changed. This causes a strange result like below: /sys/kernel/debug/tracing # cat tracing_on 1 /sys/kernel/debug/tracing # echo 0 > tracing_on /sys/kernel/debug/tracing # cat tracing_on 0 /sys/kernel/debug/tracing # echo 1 > snapshot /sys/kernel/debug/tracing # cat tracing_on 1 /sys/kernel/debug/tracing # echo 1 > snapshot /sys/kernel/debug/tracing # cat tracing_on 0 We don't touch tracing_on, but snapshot changes tracing_on setting each time. This is an anomaly, because user doesn't know that each "ring_buffer" stores its own tracing-enable state and the snapshot is done by swapping ring buffers. Link: http://lkml.kernel.org/r/153149929558.11274.11730609978254724394.stgit@devbox Cc: Ingo Molnar Cc: Shuah Khan Cc: Tom Zanussi Cc: Hiraku Toyooka Cc: stable@vger.kernel.org Fixes: debdd57f5145 ("tracing: Make a snapshot feature available from userspace") Signed-off-by: Masami Hiramatsu [ Updated commit log and comment in the code ] Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 1 + kernel/trace/ring_buffer.c | 16 ++++++++++++++++ kernel/trace/trace.c | 6 ++++++ 3 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b72ebdff0b77..003d09ab308d 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -165,6 +165,7 @@ void ring_buffer_record_enable(struct ring_buffer *buffer); void ring_buffer_record_off(struct ring_buffer *buffer); void ring_buffer_record_on(struct ring_buffer *buffer); int ring_buffer_record_is_on(struct ring_buffer *buffer); +int ring_buffer_record_is_set_on(struct ring_buffer *buffer); void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu); void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, 
int cpu); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6a46af21765c..0b0b688ea166 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3226,6 +3226,22 @@ int ring_buffer_record_is_on(struct ring_buffer *buffer) return !atomic_read(&buffer->record_disabled); } +/** + * ring_buffer_record_is_set_on - return true if the ring buffer is set writable + * @buffer: The ring buffer to see if write is set enabled + * + * Returns true if the ring buffer is set writable by ring_buffer_record_on(). + * Note that this does NOT mean it is in a writable state. + * + * It may return true when the ring buffer has been disabled by + * ring_buffer_record_disable(), as that is a temporary disabling of + * the ring buffer. + */ +int ring_buffer_record_is_set_on(struct ring_buffer *buffer) +{ + return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF); +} + /** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 87cf25171fb8..823687997b01 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1373,6 +1373,12 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&tr->max_lock); + /* Inherit the recordable setting from trace_buffer */ + if (ring_buffer_record_is_set_on(tr->trace_buffer.buffer)) + ring_buffer_record_on(tr->max_buffer.buffer); + else + ring_buffer_record_off(tr->max_buffer.buffer); + swap(tr->trace_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); -- cgit v1.2.3 From b512719f771a82180211c9a315b8a7f628832b3d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Jul 2018 16:37:08 -0700 Subject: delayacct: fix crash in delayacct_blkio_end() after delayacct init failure While forking, if delayacct init fails due to memory shortage, it continues expecting all delayacct users to check task->delays pointer against NULL before dereferencing it, which all of them used to do. Commit c96f5471ce7d ("delayacct: Account blkio completion on the correct task"), while updating delayacct_blkio_end() to take the target task instead of always using %current, made the function test NULL on %current->delays and then continue to operate on @p->delays. If %current succeeded init while @p didn't, it leads to the following crash. 
BUG: unable to handle kernel NULL pointer dereference at 0000000000000004 IP: __delayacct_blkio_end+0xc/0x40 PGD 8000001fd07e1067 P4D 8000001fd07e1067 PUD 1fcffbb067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 4 PID: 25774 Comm: QIOThread0 Not tainted 4.16.0-9_fbk1_rc2_1180_g6b593215b4d7 #9 RIP: 0010:__delayacct_blkio_end+0xc/0x40 Call Trace: try_to_wake_up+0x2c0/0x600 autoremove_wake_function+0xe/0x30 __wake_up_common+0x74/0x120 wake_up_page_bit+0x9c/0xe0 mpage_end_io+0x27/0x70 blk_update_request+0x78/0x2c0 scsi_end_request+0x2c/0x1e0 scsi_io_completion+0x20b/0x5f0 blk_mq_complete_request+0xa2/0x100 ata_scsi_qc_complete+0x79/0x400 ata_qc_complete_multiple+0x86/0xd0 ahci_handle_port_interrupt+0xc9/0x5c0 ahci_handle_port_intr+0x54/0xb0 ahci_single_level_irq_intr+0x3b/0x60 __handle_irq_event_percpu+0x43/0x190 handle_irq_event_percpu+0x20/0x50 handle_irq_event+0x2a/0x50 handle_edge_irq+0x80/0x1c0 handle_irq+0xaf/0x120 do_IRQ+0x41/0xc0 common_interrupt+0xf/0xf Fix it by updating delayacct_blkio_end() to check @p->delays instead. 
Link: http://lkml.kernel.org/r/20180724175542.GP1934745@devbig577.frc2.facebook.com Fixes: c96f5471ce7d ("delayacct: Account blkio completion on the correct task") Signed-off-by: Tejun Heo Reported-by: Dave Jones Debugged-by: Dave Jones Reviewed-by: Andrew Morton Cc: Josh Snyder Cc: <stable@vger.kernel.org> [4.15+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index e6c0448ebcc7..31c865d1842e 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -124,7 +124,7 @@ static inline void delayacct_blkio_start(void) static inline void delayacct_blkio_end(struct task_struct *p) { - if (current->delays) + if (p->delays) __delayacct_blkio_end(p); delayacct_clear_flag(DELAYACCT_PF_BLKIO); } -- cgit v1.2.3 From 027232da7c7c1c7f04383f93bd798e475dde5285 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 26 Jul 2018 16:37:25 -0700 Subject: mm: introduce vma_init() Not all VMAs are allocated with vm_area_alloc(). Some of them are allocated on the stack or in the data segment. The new helper can be used to initialize a VMA properly regardless of where it was allocated. Link: http://lkml.kernel.org/r/20180724121139.62570-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Acked-by: Linus Torvalds Reviewed-by: Andrew Morton Cc: Dmitry Vyukov Cc: Oleg Nesterov Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ++++++ kernel/fork.c | 6 ++---- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index d3a3842316b8..31540f166987 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -452,6 +452,12 @@ struct vm_operations_struct { unsigned long addr); }; +static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) +{ + vma->vm_mm = mm; + INIT_LIST_HEAD(&vma->anon_vma_chain); +} + struct mmu_gather; struct inode; diff --git a/kernel/fork.c b/kernel/fork.c index a191c05e757d..1b27babc4c78 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -312,10 +312,8 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); - if (vma) { - vma->vm_mm = mm; - INIT_LIST_HEAD(&vma->anon_vma_chain); - } + if (vma) + vma_init(vma, mm); return vma; } -- cgit v1.2.3 From bfd40eaff5abb9f62c8ef94ca13ed0d94a560f10 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 26 Jul 2018 16:37:35 -0700 Subject: mm: fix vma_is_anonymous() false-positives vma_is_anonymous() relies on ->vm_ops being NULL to detect anonymous VMA. This is unreliable as ->mmap may not set ->vm_ops. False-positive vma_is_anonymous() may lead to crashes: next ffff8801ce5e7040 prev ffff8801d20eca50 mm ffff88019c1e13c0 prot 27 anon_vma ffff88019680cdd8 vm_ops 0000000000000000 pgoff 0 file ffff8801b2ec2d00 private_data 0000000000000000 flags: 0xff(read|write|exec|shared|mayread|maywrite|mayexec|mayshare) ------------[ cut here ]------------ kernel BUG at mm/memory.c:1422! 
invalid opcode: 0000 [#1] SMP KASAN CPU: 0 PID: 18486 Comm: syz-executor3 Not tainted 4.18.0-rc3+ #136 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:zap_pmd_range mm/memory.c:1421 [inline] RIP: 0010:zap_pud_range mm/memory.c:1466 [inline] RIP: 0010:zap_p4d_range mm/memory.c:1487 [inline] RIP: 0010:unmap_page_range+0x1c18/0x2220 mm/memory.c:1508 Call Trace: unmap_single_vma+0x1a0/0x310 mm/memory.c:1553 zap_page_range_single+0x3cc/0x580 mm/memory.c:1644 unmap_mapping_range_vma mm/memory.c:2792 [inline] unmap_mapping_range_tree mm/memory.c:2813 [inline] unmap_mapping_pages+0x3a7/0x5b0 mm/memory.c:2845 unmap_mapping_range+0x48/0x60 mm/memory.c:2880 truncate_pagecache+0x54/0x90 mm/truncate.c:800 truncate_setsize+0x70/0xb0 mm/truncate.c:826 simple_setattr+0xe9/0x110 fs/libfs.c:409 notify_change+0xf13/0x10f0 fs/attr.c:335 do_truncate+0x1ac/0x2b0 fs/open.c:63 do_sys_ftruncate+0x492/0x560 fs/open.c:205 __do_sys_ftruncate fs/open.c:215 [inline] __se_sys_ftruncate fs/open.c:213 [inline] __x64_sys_ftruncate+0x59/0x80 fs/open.c:213 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Reproducer: #include <stdio.h> #include <stddef.h> #include <stdint.h> #include <stdlib.h> #include <sys/types.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <sys/mman.h> #include <unistd.h> #include <fcntl.h> #include <string.h> #define KCOV_INIT_TRACE _IOR('c', 1, unsigned long) #define KCOV_ENABLE _IO('c', 100) #define KCOV_DISABLE _IO('c', 101) #define COVER_SIZE (1024<<10) #define KCOV_TRACE_PC 0 #define KCOV_TRACE_CMP 1 int main(int argc, char **argv) { int fd; unsigned long *cover; system("mount -t debugfs none /sys/kernel/debug"); fd = open("/sys/kernel/debug/kcov", O_RDWR); ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE); cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); munmap(cover, COVER_SIZE * sizeof(unsigned long)); cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long), PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); memset(cover, 0, COVER_SIZE * sizeof(unsigned long)); 
ftruncate(fd, 3UL << 20); return 0; } This can be fixed by assigning anonymous VMAs own vm_ops and not relying on it being NULL. If ->mmap() failed to set ->vm_ops, mmap_region() will set it to dummy_vm_ops. This way we will have non-NULL ->vm_ops for all VMAs. Link: http://lkml.kernel.org/r/20180724121139.62570-4-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reported-by: syzbot+3f84280d52be9b7083cc@syzkaller.appspotmail.com Acked-by: Linus Torvalds Reviewed-by: Andrew Morton Cc: Dmitry Vyukov Cc: Oleg Nesterov Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/mem.c | 1 + fs/exec.c | 1 + include/linux/mm.h | 8 ++++++++ mm/mmap.c | 3 +++ mm/nommu.c | 2 ++ 5 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/drivers/char/mem.c b/drivers/char/mem.c index ffeb60d3434c..df66a9dd0aae 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -708,6 +708,7 @@ static int mmap_zero(struct file *file, struct vm_area_struct *vma) #endif if (vma->vm_flags & VM_SHARED) return shmem_zero_setup(vma); + vma_set_anonymous(vma); return 0; } diff --git a/fs/exec.c b/fs/exec.c index 72e961a62adb..bdd0eacefdf5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -293,6 +293,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) bprm->vma = vma = vm_area_alloc(mm); if (!vma) return -ENOMEM; + vma_set_anonymous(vma); if (down_write_killable(&mm->mmap_sem)) { err = -EINTR; diff --git a/include/linux/mm.h b/include/linux/mm.h index 31540f166987..7ba6d356d18f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -454,10 +454,18 @@ struct vm_operations_struct { static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { + static const struct vm_operations_struct dummy_vm_ops = {}; + vma->vm_mm = mm; + vma->vm_ops = &dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); } +static inline void vma_set_anonymous(struct vm_area_struct *vma) +{ + vma->vm_ops = NULL; +} + struct 
mmu_gather; struct inode; diff --git a/mm/mmap.c b/mm/mmap.c index ff1944d8d458..17bbf4d3e24f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1778,6 +1778,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, error = shmem_zero_setup(vma); if (error) goto free_vma; + } else { + vma_set_anonymous(vma); } vma_link(mm, vma, prev, rb_link, rb_parent); @@ -2983,6 +2985,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla return -ENOMEM; } + vma_set_anonymous(vma); vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_pgoff = pgoff; diff --git a/mm/nommu.c b/mm/nommu.c index 1d22fdbf7d7c..9fc9e43335b6 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1145,6 +1145,8 @@ static int do_mmap_private(struct vm_area_struct *vma, if (ret < len) memset(base + ret, 0, len - ret); + } else { + vma_set_anonymous(vma); } return 0; -- cgit v1.2.3 From fa3fc2ad99b4f025446d1cff589a8d2dd7db92f2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 26 Jul 2018 16:37:38 -0700 Subject: include/linux/eventfd.h: include linux/errno.h The new gasket staging driver ran into a randconfig build failure when CONFIG_EVENTFD is disabled: In file included from drivers/staging/gasket/gasket_interrupt.h:11, from drivers/staging/gasket/gasket_interrupt.c:4: include/linux/eventfd.h: In function 'eventfd_ctx_fdget': include/linux/eventfd.h:51:9: error: implicit declaration of function 'ERR_PTR' [-Werror=implicit-function-declaration] I can't see anything wrong with including eventfd.h before err.h, so the easiest fix is to make it possible to do this by including the file where it is needed. 
Link: http://lkml.kernel.org/r/20180724110737.3985088-1-arnd@arndb.de Fixes: 9a69f5087ccc ("drivers/staging: Gasket driver framework + Apex driver") Signed-off-by: Arnd Bergmann Cc: Eric Biggers Cc: Al Viro Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/eventfd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index 7094718b653b..ffcc7724ca21 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -11,6 +11,7 @@ #include <linux/fcntl.h> #include <linux/wait.h> +#include <linux/err.h> /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining -- cgit v1.2.3