Diffstat (limited to 'net/core')
-rw-r--r--  net/core/Makefile          |    2
-rw-r--r--  net/core/datagram.c        |   13
-rw-r--r--  net/core/dev.c             |  188
-rw-r--r--  net/core/devlink.c         |  111
-rw-r--r--  net/core/dst.c             |    1
-rw-r--r--  net/core/ethtool.c         |   75
-rw-r--r--  net/core/failover.c        |  315
-rw-r--r--  net/core/fib_rules.c       |  495
-rw-r--r--  net/core/filter.c          | 1423
-rw-r--r--  net/core/flow_dissector.c  |   19
-rw-r--r--  net/core/neighbour.c       |   49
-rw-r--r--  net/core/net-procfs.c      |   65
-rw-r--r--  net/core/net-traces.c      |    4
-rw-r--r--  net/core/page_pool.c       |  317
-rw-r--r--  net/core/pktgen.c          |    3
-rw-r--r--  net/core/rtnetlink.c       |   34
-rw-r--r--  net/core/skbuff.c          |   25
-rw-r--r--  net/core/sock.c            |   47
-rw-r--r--  net/core/xdp.c             |  299
19 files changed, 2815 insertions, 670 deletions
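Among the changes below, devlink drops devlink_port_split_set() in favour of devlink_port_attrs_set() and gains devlink_port_get_phys_port_name(). A minimal driver-side sketch of the new pair, assuming an already registered devlink port and using invented helper and variable names, could look like this:

```c
#include <linux/netdevice.h>
#include <net/devlink.h>

/* Hypothetical helper, not part of this diff: record the port attributes
 * and query the physical port name that the core derives from them. */
static void example_report_port_name(struct devlink_port *dl_port,
				     u32 front_panel_port, bool split,
				     u32 subport)
{
	char name[IFNAMSIZ];

	/* Replaces devlink_port_split_set(): flavour, front-panel number
	 * and (optional) split subport are recorded in one call. */
	devlink_port_attrs_set(dl_port, DEVLINK_PORT_FLAVOUR_PHYSICAL,
			       front_panel_port, split, subport);

	/* Yields "p<N>" for a whole port, "p<N>s<M>" for a split port,
	 * per the naming scheme in devlink_port_get_phys_port_name(). */
	if (!devlink_port_get_phys_port_name(dl_port, name, sizeof(name)))
		pr_info("phys port name: %s\n", name);
}
```

The resulting names match what the devlink hunk below builds for DEVLINK_PORT_FLAVOUR_PHYSICAL ports.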
diff --git a/net/core/Makefile b/net/core/Makefile index 6dbbba8c57ae..80175e6a2eb8 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -14,6 +14,7 @@ obj-y		     += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \  			fib_notifier.o xdp.o  obj-y += net-sysfs.o +obj-$(CONFIG_PAGE_POOL) += page_pool.o  obj-$(CONFIG_PROC_FS) += net-procfs.o  obj-$(CONFIG_NET_PKTGEN) += pktgen.o  obj-$(CONFIG_NETPOLL) += netpoll.o @@ -30,3 +31,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o  obj-$(CONFIG_HWBM) += hwbm.o  obj-$(CONFIG_NET_DEVLINK) += devlink.o  obj-$(CONFIG_GRO_CELLS) += gro_cells.o +obj-$(CONFIG_FAILOVER) += failover.o diff --git a/net/core/datagram.c b/net/core/datagram.c index 9938952c5c78..f19bf3dc2bd6 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -819,9 +819,8 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);  /**   * 	datagram_poll - generic datagram poll - *	@file: file struct   *	@sock: socket - *	@wait: poll table + *	@events to wait for   *   *	Datagram poll: Again totally generic. This also handles   *	sequenced packet sockets providing the socket receive queue @@ -831,14 +830,10 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);   *	and you use a different write policy from sock_writeable()   *	then please supply your own write_space callback.   */ -__poll_t datagram_poll(struct file *file, struct socket *sock, -			   poll_table *wait) +__poll_t datagram_poll_mask(struct socket *sock, __poll_t events)  {  	struct sock *sk = sock->sk; -	__poll_t mask; - -	sock_poll_wait(file, sk_sleep(sk), wait); -	mask = 0; +	__poll_t mask = 0;  	/* exceptional events? */  	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) @@ -871,4 +866,4 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,  	return mask;  } -EXPORT_SYMBOL(datagram_poll); +EXPORT_SYMBOL(datagram_poll_mask); diff --git a/net/core/dev.c b/net/core/dev.c index 2af787e8b130..57b7bab5f70b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1285,6 +1285,7 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)  	return len;  } +EXPORT_SYMBOL(dev_set_alias);  /**   *	dev_get_alias - get ifalias of a device @@ -1586,7 +1587,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)  	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)  	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)  	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) -	}; +	}  #undef N  	return "UNKNOWN_NETDEV_EVENT";  } @@ -1754,38 +1755,38 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)  EXPORT_SYMBOL(call_netdevice_notifiers);  #ifdef CONFIG_NET_INGRESS -static struct static_key ingress_needed __read_mostly; +static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);  void net_inc_ingress_queue(void)  { -	static_key_slow_inc(&ingress_needed); +	static_branch_inc(&ingress_needed_key);  }  EXPORT_SYMBOL_GPL(net_inc_ingress_queue);  void net_dec_ingress_queue(void)  { -	static_key_slow_dec(&ingress_needed); +	static_branch_dec(&ingress_needed_key);  }  EXPORT_SYMBOL_GPL(net_dec_ingress_queue);  #endif  #ifdef CONFIG_NET_EGRESS -static struct static_key egress_needed __read_mostly; +static DEFINE_STATIC_KEY_FALSE(egress_needed_key);  void net_inc_egress_queue(void)  { -	static_key_slow_inc(&egress_needed); +	static_branch_inc(&egress_needed_key);  }  EXPORT_SYMBOL_GPL(net_inc_egress_queue);  void net_dec_egress_queue(void)  { -	static_key_slow_dec(&egress_needed); +	static_branch_dec(&egress_needed_key);  }  EXPORT_SYMBOL_GPL(net_dec_egress_queue);  #endif -static struct static_key 
netstamp_needed __read_mostly; +static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);  #ifdef HAVE_JUMP_LABEL  static atomic_t netstamp_needed_deferred;  static atomic_t netstamp_wanted; @@ -1796,9 +1797,9 @@ static void netstamp_clear(struct work_struct *work)  	wanted = atomic_add_return(deferred, &netstamp_wanted);  	if (wanted > 0) -		static_key_enable(&netstamp_needed); +		static_branch_enable(&netstamp_needed_key);  	else -		static_key_disable(&netstamp_needed); +		static_branch_disable(&netstamp_needed_key);  }  static DECLARE_WORK(netstamp_work, netstamp_clear);  #endif @@ -1818,7 +1819,7 @@ void net_enable_timestamp(void)  	atomic_inc(&netstamp_needed_deferred);  	schedule_work(&netstamp_work);  #else -	static_key_slow_inc(&netstamp_needed); +	static_branch_inc(&netstamp_needed_key);  #endif  }  EXPORT_SYMBOL(net_enable_timestamp); @@ -1838,7 +1839,7 @@ void net_disable_timestamp(void)  	atomic_dec(&netstamp_needed_deferred);  	schedule_work(&netstamp_work);  #else -	static_key_slow_dec(&netstamp_needed); +	static_branch_dec(&netstamp_needed_key);  #endif  }  EXPORT_SYMBOL(net_disable_timestamp); @@ -1846,15 +1847,15 @@ EXPORT_SYMBOL(net_disable_timestamp);  static inline void net_timestamp_set(struct sk_buff *skb)  {  	skb->tstamp = 0; -	if (static_key_false(&netstamp_needed)) +	if (static_branch_unlikely(&netstamp_needed_key))  		__net_timestamp(skb);  } -#define net_timestamp_check(COND, SKB)			\ -	if (static_key_false(&netstamp_needed)) {		\ -		if ((COND) && !(SKB)->tstamp)	\ -			__net_timestamp(SKB);		\ -	}						\ +#define net_timestamp_check(COND, SKB)				\ +	if (static_branch_unlikely(&netstamp_needed_key)) {	\ +		if ((COND) && !(SKB)->tstamp)			\ +			__net_timestamp(SKB);			\ +	}							\  bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)  { @@ -2614,17 +2615,16 @@ EXPORT_SYMBOL(netif_device_attach);   * Returns a Tx hash based on the given packet descriptor a Tx queues' number   * to be used as a distribution range.   */ -u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, -		  unsigned int num_tx_queues) +static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb)  {  	u32 hash;  	u16 qoffset = 0; -	u16 qcount = num_tx_queues; +	u16 qcount = dev->real_num_tx_queues;  	if (skb_rx_queue_recorded(skb)) {  		hash = skb_get_rx_queue(skb); -		while (unlikely(hash >= num_tx_queues)) -			hash -= num_tx_queues; +		while (unlikely(hash >= qcount)) +			hash -= qcount;  		return hash;  	} @@ -2637,7 +2637,6 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,  	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;  } -EXPORT_SYMBOL(__skb_tx_hash);  static void skb_warn_bad_offload(const struct sk_buff *skb)  { @@ -2884,11 +2883,7 @@ void netdev_rx_csum_fault(struct net_device *dev)  EXPORT_SYMBOL(netdev_rx_csum_fault);  #endif -/* Actually, we should eliminate this check as soon as we know, that: - * 1. IOMMU is present and allows to map all the memory. - * 2. No high memory really exists on this machine. - */ - +/* XXX: check that highmem exists at all on the given machine. 
*/  static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)  {  #ifdef CONFIG_HIGHMEM @@ -2902,20 +2897,6 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)  				return 1;  		}  	} - -	if (PCI_DMA_BUS_IS_PHYS) { -		struct device *pdev = dev->dev.parent; - -		if (!pdev) -			return 0; -		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { -			skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; -			dma_addr_t addr = page_to_phys(skb_frag_page(frag)); - -			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) -				return 1; -		} -	}  #endif  	return 0;  } @@ -3113,6 +3094,10 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device  	if (unlikely(!skb))  		goto out_null; +	skb = sk_validate_xmit_skb(skb, dev); +	if (unlikely(!skb)) +		goto out_null; +  	if (netif_needs_gso(skb, features)) {  		struct sk_buff *segs; @@ -3241,7 +3226,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,  			rc = NET_XMIT_DROP;  		} else {  			rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; -			__qdisc_run(q); +			qdisc_run(q);  		}  		if (unlikely(to_free)) @@ -3529,7 +3514,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)  #ifdef CONFIG_NET_CLS_ACT  	skb->tc_at_ingress = 0;  # ifdef CONFIG_NET_EGRESS -	if (static_key_false(&egress_needed)) { +	if (static_branch_unlikely(&egress_needed_key)) {  		skb = sch_handle_egress(skb, &rc, dev);  		if (!skb)  			goto out; @@ -3624,6 +3609,44 @@ int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)  }  EXPORT_SYMBOL(dev_queue_xmit_accel); +int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) +{ +	struct net_device *dev = skb->dev; +	struct sk_buff *orig_skb = skb; +	struct netdev_queue *txq; +	int ret = NETDEV_TX_BUSY; +	bool again = false; + +	if (unlikely(!netif_running(dev) || +		     !netif_carrier_ok(dev))) +		goto drop; + +	skb = validate_xmit_skb_list(skb, dev, &again); +	if (skb != orig_skb) +		goto drop; + +	skb_set_queue_mapping(skb, queue_id); +	txq = skb_get_tx_queue(dev, skb); + +	local_bh_disable(); + +	HARD_TX_LOCK(dev, txq, smp_processor_id()); +	if (!netif_xmit_frozen_or_drv_stopped(txq)) +		ret = netdev_start_xmit(skb, dev, txq, false); +	HARD_TX_UNLOCK(dev, txq); + +	local_bh_enable(); + +	if (!dev_xmit_complete(ret)) +		kfree_skb(skb); + +	return ret; +drop: +	atomic_long_inc(&dev->tx_dropped); +	kfree_skb_list(skb); +	return NET_XMIT_DROP; +} +EXPORT_SYMBOL(dev_direct_xmit);  /*************************************************************************   *			Receiver routines @@ -3993,12 +4016,12 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)  }  static u32 netif_receive_generic_xdp(struct sk_buff *skb, +				     struct xdp_buff *xdp,  				     struct bpf_prog *xdp_prog)  {  	struct netdev_rx_queue *rxqueue; +	void *orig_data, *orig_data_end;  	u32 metalen, act = XDP_DROP; -	struct xdp_buff xdp; -	void *orig_data;  	int hlen, off;  	u32 mac_len; @@ -4033,31 +4056,42 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,  	 */  	mac_len = skb->data - skb_mac_header(skb);  	hlen = skb_headlen(skb) + mac_len; -	xdp.data = skb->data - mac_len; -	xdp.data_meta = xdp.data; -	xdp.data_end = xdp.data + hlen; -	xdp.data_hard_start = skb->data - skb_headroom(skb); -	orig_data = xdp.data; +	xdp->data = skb->data - mac_len; +	xdp->data_meta = xdp->data; +	xdp->data_end = xdp->data + hlen; +	xdp->data_hard_start = skb->data - skb_headroom(skb); +	orig_data_end = xdp->data_end; +	orig_data = xdp->data;  	rxqueue 
= netif_get_rxqueue(skb); -	xdp.rxq = &rxqueue->xdp_rxq; +	xdp->rxq = &rxqueue->xdp_rxq; -	act = bpf_prog_run_xdp(xdp_prog, &xdp); +	act = bpf_prog_run_xdp(xdp_prog, xdp); -	off = xdp.data - orig_data; +	off = xdp->data - orig_data;  	if (off > 0)  		__skb_pull(skb, off);  	else if (off < 0)  		__skb_push(skb, -off);  	skb->mac_header += off; +	/* check if bpf_xdp_adjust_tail was used. it can only "shrink" +	 * pckt. +	 */ +	off = orig_data_end - xdp->data_end; +	if (off != 0) { +		skb_set_tail_pointer(skb, xdp->data_end - xdp->data); +		skb->len -= off; + +	} +  	switch (act) {  	case XDP_REDIRECT:  	case XDP_TX:  		__skb_push(skb, mac_len);  		break;  	case XDP_PASS: -		metalen = xdp.data - xdp.data_meta; +		metalen = xdp->data - xdp->data_meta;  		if (metalen)  			skb_metadata_set(skb, metalen);  		break; @@ -4102,22 +4136,24 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)  }  EXPORT_SYMBOL_GPL(generic_xdp_tx); -static struct static_key generic_xdp_needed __read_mostly; +static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);  int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)  {  	if (xdp_prog) { -		u32 act = netif_receive_generic_xdp(skb, xdp_prog); +		struct xdp_buff xdp; +		u32 act;  		int err; +		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);  		if (act != XDP_PASS) {  			switch (act) {  			case XDP_REDIRECT:  				err = xdp_do_generic_redirect(skb->dev, skb, -							      xdp_prog); +							      &xdp, xdp_prog);  				if (err)  					goto out_redir; -			/* fallthru to submit skb */ +				break;  			case XDP_TX:  				generic_xdp_tx(skb, xdp_prog);  				break; @@ -4140,7 +4176,7 @@ static int netif_rx_internal(struct sk_buff *skb)  	trace_netif_rx(skb); -	if (static_key_false(&generic_xdp_needed)) { +	if (static_branch_unlikely(&generic_xdp_needed_key)) {  		int ret;  		preempt_disable(); @@ -4512,7 +4548,7 @@ another_round:  skip_taps:  #ifdef CONFIG_NET_INGRESS -	if (static_key_false(&ingress_needed)) { +	if (static_branch_unlikely(&ingress_needed_key)) {  		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);  		if (!skb)  			goto out; @@ -4672,9 +4708,9 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)  			bpf_prog_put(old);  		if (old && !new) { -			static_key_slow_dec(&generic_xdp_needed); +			static_branch_dec(&generic_xdp_needed_key);  		} else if (new && !old) { -			static_key_slow_inc(&generic_xdp_needed); +			static_branch_inc(&generic_xdp_needed_key);  			dev_disable_lro(dev);  			dev_disable_gro_hw(dev);  		} @@ -4702,7 +4738,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb)  	if (skb_defer_rx_timestamp(skb))  		return NET_RX_SUCCESS; -	if (static_key_false(&generic_xdp_needed)) { +	if (static_branch_unlikely(&generic_xdp_needed_key)) {  		int ret;  		preempt_disable(); @@ -6749,15 +6785,15 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)  			dev->flags & IFF_PROMISC ? 
"entered" : "left");  		if (audit_enabled) {  			current_uid_gid(&uid, &gid); -			audit_log(current->audit_context, GFP_ATOMIC, -				AUDIT_ANOM_PROMISCUOUS, -				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", -				dev->name, (dev->flags & IFF_PROMISC), -				(old_flags & IFF_PROMISC), -				from_kuid(&init_user_ns, audit_get_loginuid(current)), -				from_kuid(&init_user_ns, uid), -				from_kgid(&init_user_ns, gid), -				audit_get_sessionid(current)); +			audit_log(audit_context(), GFP_ATOMIC, +				  AUDIT_ANOM_PROMISCUOUS, +				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", +				  dev->name, (dev->flags & IFF_PROMISC), +				  (old_flags & IFF_PROMISC), +				  from_kuid(&init_user_ns, audit_get_loginuid(current)), +				  from_kuid(&init_user_ns, uid), +				  from_kgid(&init_user_ns, gid), +				  audit_get_sessionid(current));  		}  		dev_change_rx_flags(dev, IFF_PROMISC); @@ -7870,6 +7906,8 @@ int register_netdevice(struct net_device *dev)  	int ret;  	struct net *net = dev_net(dev); +	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE < +		     NETDEV_FEATURE_COUNT);  	BUG_ON(dev_boot_phase);  	ASSERT_RTNL(); @@ -8785,7 +8823,7 @@ static struct hlist_head * __net_init netdev_create_hash(void)  	int i;  	struct hlist_head *hash; -	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); +	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);  	if (hash != NULL)  		for (i = 0; i < NETDEV_HASHENTRIES; i++)  			INIT_HLIST_HEAD(&hash[i]); diff --git a/net/core/devlink.c b/net/core/devlink.c index ad1317376798..22099705cc41 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -453,6 +453,27 @@ static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)  				msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);  } +static int devlink_nl_port_attrs_put(struct sk_buff *msg, +				     struct devlink_port *devlink_port) +{ +	struct devlink_port_attrs *attrs = &devlink_port->attrs; + +	if (!attrs->set) +		return 0; +	if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) +		return -EMSGSIZE; +	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number)) +		return -EMSGSIZE; +	if (!attrs->split) +		return 0; +	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->port_number)) +		return -EMSGSIZE; +	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, +			attrs->split_subport_number)) +		return -EMSGSIZE; +	return 0; +} +  static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,  				struct devlink_port *devlink_port,  				enum devlink_command cmd, u32 portid, @@ -492,9 +513,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,  				   ibdev->name))  			goto nla_put_failure;  	} -	if (devlink_port->split && -	    nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, -			devlink_port->split_group)) +	if (devlink_nl_port_attrs_put(msg, devlink_port))  		goto nla_put_failure;  	genlmsg_end(msg, hdr); @@ -683,12 +702,13 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,  	return 0;  } -static int devlink_port_split(struct devlink *devlink, -			      u32 port_index, u32 count) +static int devlink_port_split(struct devlink *devlink, u32 port_index, +			      u32 count, struct netlink_ext_ack *extack)  {  	if (devlink->ops && devlink->ops->port_split) -		return devlink->ops->port_split(devlink, port_index, count); +		return devlink->ops->port_split(devlink, port_index, count, +						extack);  	return -EOPNOTSUPP;  } @@ -705,14 +725,15 @@ static int 
devlink_nl_cmd_port_split_doit(struct sk_buff *skb,  	port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);  	count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); -	return devlink_port_split(devlink, port_index, count); +	return devlink_port_split(devlink, port_index, count, info->extack);  } -static int devlink_port_unsplit(struct devlink *devlink, u32 port_index) +static int devlink_port_unsplit(struct devlink *devlink, u32 port_index, +				struct netlink_ext_ack *extack)  {  	if (devlink->ops && devlink->ops->port_unsplit) -		return devlink->ops->port_unsplit(devlink, port_index); +		return devlink->ops->port_unsplit(devlink, port_index, extack);  	return -EOPNOTSUPP;  } @@ -726,7 +747,7 @@ static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb,  		return -EINVAL;  	port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); -	return devlink_port_unsplit(devlink, port_index); +	return devlink_port_unsplit(devlink, port_index, info->extack);  }  static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, @@ -1807,7 +1828,6 @@ send_done:  nla_put_failure:  	err = -EMSGSIZE;  err_table_put: -	genlmsg_cancel(skb, hdr);  	nlmsg_free(skb);  	return err;  } @@ -2013,7 +2033,6 @@ int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx)  	return 0;  nla_put_failure: -	genlmsg_cancel(dump_ctx->skb, dump_ctx->hdr);  	nlmsg_free(dump_ctx->skb);  	return -EMSGSIZE;  } @@ -2230,7 +2249,6 @@ send_done:  nla_put_failure:  	err = -EMSGSIZE;  err_table_put: -	genlmsg_cancel(skb, hdr);  	nlmsg_free(skb);  	return err;  } @@ -2532,7 +2550,6 @@ nla_put_failure:  	err = -EMSGSIZE;  err_resource_put:  err_skb_send_alloc: -	genlmsg_cancel(skb, hdr);  	nlmsg_free(skb);  	return err;  } @@ -2584,7 +2601,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)  		NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed");  		return err;  	} -	return devlink->ops->reload(devlink); +	return devlink->ops->reload(devlink, info->extack);  }  static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { @@ -2737,7 +2754,8 @@ static const struct genl_ops devlink_nl_ops[] = {  		.doit = devlink_nl_cmd_eswitch_set_doit,  		.policy = devlink_nl_policy,  		.flags = GENL_ADMIN_PERM, -		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, +		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | +				  DEVLINK_NL_FLAG_NO_LOCK,  	},  	{  		.cmd = DEVLINK_CMD_DPIPE_TABLE_GET, @@ -2971,19 +2989,64 @@ void devlink_port_type_clear(struct devlink_port *devlink_port)  EXPORT_SYMBOL_GPL(devlink_port_type_clear);  /** - *	devlink_port_split_set - Set port is split + *	devlink_port_attrs_set - Set port attributes   *   *	@devlink_port: devlink port - *	@split_group: split group - identifies group split port is part of + *	@flavour: flavour of the port + *	@port_number: number of the port that is facing user, for example + *	              the front panel port number + *	@split: indicates if this is split port + *	@split_subport_number: if the port is split, this is the number + *	                       of subport.   
*/ -void devlink_port_split_set(struct devlink_port *devlink_port, -			    u32 split_group) -{ -	devlink_port->split = true; -	devlink_port->split_group = split_group; +void devlink_port_attrs_set(struct devlink_port *devlink_port, +			    enum devlink_port_flavour flavour, +			    u32 port_number, bool split, +			    u32 split_subport_number) +{ +	struct devlink_port_attrs *attrs = &devlink_port->attrs; + +	attrs->set = true; +	attrs->flavour = flavour; +	attrs->port_number = port_number; +	attrs->split = split; +	attrs->split_subport_number = split_subport_number;  	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);  } -EXPORT_SYMBOL_GPL(devlink_port_split_set); +EXPORT_SYMBOL_GPL(devlink_port_attrs_set); + +int devlink_port_get_phys_port_name(struct devlink_port *devlink_port, +				    char *name, size_t len) +{ +	struct devlink_port_attrs *attrs = &devlink_port->attrs; +	int n = 0; + +	if (!attrs->set) +		return -EOPNOTSUPP; + +	switch (attrs->flavour) { +	case DEVLINK_PORT_FLAVOUR_PHYSICAL: +		if (!attrs->split) +			n = snprintf(name, len, "p%u", attrs->port_number); +		else +			n = snprintf(name, len, "p%us%u", attrs->port_number, +				     attrs->split_subport_number); +		break; +	case DEVLINK_PORT_FLAVOUR_CPU: +	case DEVLINK_PORT_FLAVOUR_DSA: +		/* As CPU and DSA ports do not have a netdevice associated +		 * case should not ever happen. +		 */ +		WARN_ON(1); +		return -EINVAL; +	} + +	if (n >= len) +		return -EINVAL; + +	return 0; +} +EXPORT_SYMBOL_GPL(devlink_port_get_phys_port_name);  int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,  			u32 size, u16 ingress_pools_count, diff --git a/net/core/dst.c b/net/core/dst.c index 007aa0b08291..2d9b37f8944a 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -58,6 +58,7 @@ const struct dst_metrics dst_default_metrics = {  	 */  	.refcnt = REFCOUNT_INIT(1),  }; +EXPORT_SYMBOL(dst_default_metrics);  void dst_init(struct dst_entry *dst, struct dst_ops *ops,  	      struct net_device *dev, int initial_ref, int initial_obsolete, diff --git a/net/core/ethtool.c b/net/core/ethtool.c index ba02f0dfe85c..e677a20180cf 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -92,6 +92,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]  	[NETIF_F_GSO_PARTIAL_BIT] =	 "tx-gso-partial",  	[NETIF_F_GSO_SCTP_BIT] =	 "tx-sctp-segmentation",  	[NETIF_F_GSO_ESP_BIT] =		 "tx-esp-segmentation", +	[NETIF_F_GSO_UDP_L4_BIT] =	 "tx-udp-segmentation",  	[NETIF_F_FCOE_CRC_BIT] =         "tx-checksum-fcoe-crc",  	[NETIF_F_SCTP_CRC_BIT] =        "tx-checksum-sctp", @@ -109,6 +110,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]  	[NETIF_F_HW_ESP_TX_CSUM_BIT] =	 "esp-tx-csum-hw-offload",  	[NETIF_F_RX_UDP_TUNNEL_PORT_BIT] =	 "rx-udp_tunnel-port-offload",  	[NETIF_F_HW_TLS_RECORD_BIT] =	"tls-hw-record", +	[NETIF_F_HW_TLS_TX_BIT] =	 "tls-hw-tx-offload",  };  static const char @@ -210,23 +212,6 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)  	return ret;  } -static int phy_get_sset_count(struct phy_device *phydev) -{ -	int ret; - -	if (phydev->drv->get_sset_count && -	    phydev->drv->get_strings && -	    phydev->drv->get_stats) { -		mutex_lock(&phydev->lock); -		ret = phydev->drv->get_sset_count(phydev); -		mutex_unlock(&phydev->lock); - -		return ret; -	} - -	return -EOPNOTSUPP; -} -  static int __ethtool_get_sset_count(struct net_device *dev, int sset)  {  	const struct ethtool_ops *ops = dev->ethtool_ops; @@ -243,12 +228,9 @@ 
static int __ethtool_get_sset_count(struct net_device *dev, int sset)  	if (sset == ETH_SS_PHY_TUNABLES)  		return ARRAY_SIZE(phy_tunable_strings); -	if (sset == ETH_SS_PHY_STATS) { -		if (dev->phydev) -			return phy_get_sset_count(dev->phydev); -		else -			return -EOPNOTSUPP; -	} +	if (sset == ETH_SS_PHY_STATS && dev->phydev && +	    !ops->get_ethtool_phy_stats) +		return phy_ethtool_get_sset_count(dev->phydev);  	if (ops->get_sset_count && ops->get_strings)  		return ops->get_sset_count(dev, sset); @@ -271,17 +253,10 @@ static void __ethtool_get_strings(struct net_device *dev,  		memcpy(data, tunable_strings, sizeof(tunable_strings));  	else if (stringset == ETH_SS_PHY_TUNABLES)  		memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings)); -	else if (stringset == ETH_SS_PHY_STATS) { -		struct phy_device *phydev = dev->phydev; - -		if (phydev) { -			mutex_lock(&phydev->lock); -			phydev->drv->get_strings(phydev, data); -			mutex_unlock(&phydev->lock); -		} else { -			return; -		} -	} else +	else if (stringset == ETH_SS_PHY_STATS && dev->phydev && +		 !ops->get_ethtool_phy_stats) +		phy_ethtool_get_strings(dev->phydev, data); +	else  		/* ops->get_strings is valid because checked earlier */  		ops->get_strings(dev, stringset, data);  } @@ -936,7 +911,7 @@ static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev,  	memset(&info, 0, sizeof(info));  	info.cmd = ETHTOOL_GSSET_INFO; -	info_buf = kzalloc(n_bits * sizeof(u32), GFP_USER); +	info_buf = kcalloc(n_bits, sizeof(u32), GFP_USER);  	if (!info_buf)  		return -ENOMEM; @@ -1042,7 +1017,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,  	if (info.cmd == ETHTOOL_GRXCLSRLALL) {  		if (info.rule_cnt > 0) {  			if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) -				rule_buf = kzalloc(info.rule_cnt * sizeof(u32), +				rule_buf = kcalloc(info.rule_cnt, sizeof(u32),  						   GFP_USER);  			if (!rule_buf)  				return -ENOMEM; @@ -1841,7 +1816,7 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr)  		return -EFAULT;  	test.len = test_len; -	data = kmalloc(test_len * sizeof(u64), GFP_USER); +	data = kmalloc_array(test_len, sizeof(u64), GFP_USER);  	if (!data)  		return -ENOMEM; @@ -1877,7 +1852,7 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)  	WARN_ON_ONCE(!ret);  	gstrings.len = ret; -	data = vzalloc(gstrings.len * ETH_GSTRING_LEN); +	data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN));  	if (gstrings.len && !data)  		return -ENOMEM; @@ -1977,7 +1952,7 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)  		return -EFAULT;  	stats.n_stats = n_stats; -	data = vzalloc(n_stats * sizeof(u64)); +	data = vzalloc(array_size(n_stats, sizeof(u64)));  	if (n_stats && !data)  		return -ENOMEM; @@ -1998,15 +1973,19 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)  static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)  { -	struct ethtool_stats stats; +	const struct ethtool_ops *ops = dev->ethtool_ops;  	struct phy_device *phydev = dev->phydev; +	struct ethtool_stats stats;  	u64 *data;  	int ret, n_stats; -	if (!phydev) +	if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count))  		return -EOPNOTSUPP; -	n_stats = phy_get_sset_count(phydev); +	if (dev->phydev && !ops->get_ethtool_phy_stats) +		n_stats = phy_ethtool_get_sset_count(dev->phydev); +	else +		n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS);  	if (n_stats < 0)  		return n_stats;  	if 
(n_stats > S32_MAX / sizeof(u64)) @@ -2017,13 +1996,17 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)  		return -EFAULT;  	stats.n_stats = n_stats; -	data = vzalloc(n_stats * sizeof(u64)); +	data = vzalloc(array_size(n_stats, sizeof(u64)));  	if (n_stats && !data)  		return -ENOMEM; -	mutex_lock(&phydev->lock); -	phydev->drv->get_stats(phydev, &stats, data); -	mutex_unlock(&phydev->lock); +	if (dev->phydev && !ops->get_ethtool_phy_stats) { +		ret = phy_ethtool_get_stats(dev->phydev, &stats, data); +		if (ret < 0) +			return ret; +	} else { +		ops->get_ethtool_phy_stats(dev, &stats, data); +	}  	ret = -EFAULT;  	if (copy_to_user(useraddr, &stats, sizeof(stats))) diff --git a/net/core/failover.c b/net/core/failover.c new file mode 100644 index 000000000000..4a92a98ccce9 --- /dev/null +++ b/net/core/failover.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018, Intel Corporation. */ + +/* A common module to handle registrations and notifications for paravirtual + * drivers to enable accelerated datapath and support VF live migration. + * + * The notifier and event handling code is based on netvsc driver. + */ + +#include <linux/module.h> +#include <linux/etherdevice.h> +#include <uapi/linux/if_arp.h> +#include <linux/rtnetlink.h> +#include <linux/if_vlan.h> +#include <net/failover.h> + +static LIST_HEAD(failover_list); +static DEFINE_SPINLOCK(failover_lock); + +static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops) +{ +	struct net_device *failover_dev; +	struct failover *failover; + +	spin_lock(&failover_lock); +	list_for_each_entry(failover, &failover_list, list) { +		failover_dev = rtnl_dereference(failover->failover_dev); +		if (ether_addr_equal(failover_dev->perm_addr, mac)) { +			*ops = rtnl_dereference(failover->ops); +			spin_unlock(&failover_lock); +			return failover_dev; +		} +	} +	spin_unlock(&failover_lock); +	return NULL; +} + +/** + * failover_slave_register - Register a slave netdev + * + * @slave_dev: slave netdev that is being registered + * + * Registers a slave device to a failover instance. Only ethernet devices + * are supported. 
+ */ +static int failover_slave_register(struct net_device *slave_dev) +{ +	struct netdev_lag_upper_info lag_upper_info; +	struct net_device *failover_dev; +	struct failover_ops *fops; +	int err; + +	if (slave_dev->type != ARPHRD_ETHER) +		goto done; + +	ASSERT_RTNL(); + +	failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); +	if (!failover_dev) +		goto done; + +	if (fops && fops->slave_pre_register && +	    fops->slave_pre_register(slave_dev, failover_dev)) +		goto done; + +	err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame, +					 failover_dev); +	if (err) { +		netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n", +			   err); +		goto done; +	} + +	lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; +	err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL, +					   &lag_upper_info, NULL); +	if (err) { +		netdev_err(slave_dev, "can not set failover device %s (err = %d)\n", +			   failover_dev->name, err); +		goto err_upper_link; +	} + +	slave_dev->priv_flags |= IFF_FAILOVER_SLAVE; + +	if (fops && fops->slave_register && +	    !fops->slave_register(slave_dev, failover_dev)) +		return NOTIFY_OK; + +	netdev_upper_dev_unlink(slave_dev, failover_dev); +	slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE; +err_upper_link: +	netdev_rx_handler_unregister(slave_dev); +done: +	return NOTIFY_DONE; +} + +/** + * failover_slave_unregister - Unregister a slave netdev + * + * @slave_dev: slave netdev that is being unregistered + * + * Unregisters a slave device from a failover instance. + */ +int failover_slave_unregister(struct net_device *slave_dev) +{ +	struct net_device *failover_dev; +	struct failover_ops *fops; + +	if (!netif_is_failover_slave(slave_dev)) +		goto done; + +	ASSERT_RTNL(); + +	failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); +	if (!failover_dev) +		goto done; + +	if (fops && fops->slave_pre_unregister && +	    fops->slave_pre_unregister(slave_dev, failover_dev)) +		goto done; + +	netdev_rx_handler_unregister(slave_dev); +	netdev_upper_dev_unlink(slave_dev, failover_dev); +	slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE; + +	if (fops && fops->slave_unregister && +	    !fops->slave_unregister(slave_dev, failover_dev)) +		return NOTIFY_OK; + +done: +	return NOTIFY_DONE; +} +EXPORT_SYMBOL_GPL(failover_slave_unregister); + +static int failover_slave_link_change(struct net_device *slave_dev) +{ +	struct net_device *failover_dev; +	struct failover_ops *fops; + +	if (!netif_is_failover_slave(slave_dev)) +		goto done; + +	ASSERT_RTNL(); + +	failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); +	if (!failover_dev) +		goto done; + +	if (!netif_running(failover_dev)) +		goto done; + +	if (fops && fops->slave_link_change && +	    !fops->slave_link_change(slave_dev, failover_dev)) +		return NOTIFY_OK; + +done: +	return NOTIFY_DONE; +} + +static int failover_slave_name_change(struct net_device *slave_dev) +{ +	struct net_device *failover_dev; +	struct failover_ops *fops; + +	if (!netif_is_failover_slave(slave_dev)) +		goto done; + +	ASSERT_RTNL(); + +	failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); +	if (!failover_dev) +		goto done; + +	if (!netif_running(failover_dev)) +		goto done; + +	if (fops && fops->slave_name_change && +	    !fops->slave_name_change(slave_dev, failover_dev)) +		return NOTIFY_OK; + +done: +	return NOTIFY_DONE; +} + +static int +failover_event(struct notifier_block *this, unsigned long event, void *ptr) +{ +	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + +	
/* Skip parent events */ +	if (netif_is_failover(event_dev)) +		return NOTIFY_DONE; + +	switch (event) { +	case NETDEV_REGISTER: +		return failover_slave_register(event_dev); +	case NETDEV_UNREGISTER: +		return failover_slave_unregister(event_dev); +	case NETDEV_UP: +	case NETDEV_DOWN: +	case NETDEV_CHANGE: +		return failover_slave_link_change(event_dev); +	case NETDEV_CHANGENAME: +		return failover_slave_name_change(event_dev); +	default: +		return NOTIFY_DONE; +	} +} + +static struct notifier_block failover_notifier = { +	.notifier_call = failover_event, +}; + +static void +failover_existing_slave_register(struct net_device *failover_dev) +{ +	struct net *net = dev_net(failover_dev); +	struct net_device *dev; + +	rtnl_lock(); +	for_each_netdev(net, dev) { +		if (netif_is_failover(dev)) +			continue; +		if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) +			failover_slave_register(dev); +	} +	rtnl_unlock(); +} + +/** + * failover_register - Register a failover instance + * + * @dev: failover netdev + * @ops: failover ops + * + * Allocate and register a failover instance for a failover netdev. ops + * provides handlers for slave device register/unregister/link change/ + * name change events. + * + * Return: pointer to failover instance + */ +struct failover *failover_register(struct net_device *dev, +				   struct failover_ops *ops) +{ +	struct failover *failover; + +	if (dev->type != ARPHRD_ETHER) +		return ERR_PTR(-EINVAL); + +	failover = kzalloc(sizeof(*failover), GFP_KERNEL); +	if (!failover) +		return ERR_PTR(-ENOMEM); + +	rcu_assign_pointer(failover->ops, ops); +	dev_hold(dev); +	dev->priv_flags |= IFF_FAILOVER; +	rcu_assign_pointer(failover->failover_dev, dev); + +	spin_lock(&failover_lock); +	list_add_tail(&failover->list, &failover_list); +	spin_unlock(&failover_lock); + +	netdev_info(dev, "failover master:%s registered\n", dev->name); + +	failover_existing_slave_register(dev); + +	return failover; +} +EXPORT_SYMBOL_GPL(failover_register); + +/** + * failover_unregister - Unregister a failover instance + * + * @failover: pointer to failover instance + * + * Unregisters and frees a failover instance. 
+ */ +void failover_unregister(struct failover *failover) +{ +	struct net_device *failover_dev; + +	failover_dev = rcu_dereference(failover->failover_dev); + +	netdev_info(failover_dev, "failover master:%s unregistered\n", +		    failover_dev->name); + +	failover_dev->priv_flags &= ~IFF_FAILOVER; +	dev_put(failover_dev); + +	spin_lock(&failover_lock); +	list_del(&failover->list); +	spin_unlock(&failover_lock); + +	kfree(failover); +} +EXPORT_SYMBOL_GPL(failover_unregister); + +static __init int +failover_init(void) +{ +	register_netdevice_notifier(&failover_notifier); + +	return 0; +} +module_init(failover_init); + +static __exit +void failover_exit(void) +{ +	unregister_netdevice_notifier(&failover_notifier); +} +module_exit(failover_exit); + +MODULE_DESCRIPTION("Generic failover infrastructure/interface"); +MODULE_LICENSE("GPL v2"); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 33958f84c173..126ffc5bc630 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -387,247 +387,304 @@ unsigned int fib_rules_seq_read(struct net *net, int family)  }  EXPORT_SYMBOL_GPL(fib_rules_seq_read); -static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb, -			    struct fib_rules_ops *ops) -{ -	int err = -EINVAL; - -	if (frh->src_len) -		if (tb[FRA_SRC] == NULL || -		    frh->src_len > (ops->addr_size * 8) || -		    nla_len(tb[FRA_SRC]) != ops->addr_size) -			goto errout; - -	if (frh->dst_len) -		if (tb[FRA_DST] == NULL || -		    frh->dst_len > (ops->addr_size * 8) || -		    nla_len(tb[FRA_DST]) != ops->addr_size) -			goto errout; - -	err = 0; -errout: -	return err; -} - -static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, -		       struct nlattr **tb, struct fib_rule *rule) +static struct fib_rule *rule_find(struct fib_rules_ops *ops, +				  struct fib_rule_hdr *frh, +				  struct nlattr **tb, +				  struct fib_rule *rule, +				  bool user_priority)  {  	struct fib_rule *r;  	list_for_each_entry(r, &ops->rules_list, list) { -		if (r->action != rule->action) +		if (rule->action && r->action != rule->action)  			continue; -		if (r->table != rule->table) +		if (rule->table && r->table != rule->table)  			continue; -		if (r->pref != rule->pref) +		if (user_priority && r->pref != rule->pref)  			continue; -		if (memcmp(r->iifname, rule->iifname, IFNAMSIZ)) +		if (rule->iifname[0] && +		    memcmp(r->iifname, rule->iifname, IFNAMSIZ))  			continue; -		if (memcmp(r->oifname, rule->oifname, IFNAMSIZ)) +		if (rule->oifname[0] && +		    memcmp(r->oifname, rule->oifname, IFNAMSIZ))  			continue; -		if (r->mark != rule->mark) +		if (rule->mark && r->mark != rule->mark)  			continue; -		if (r->mark_mask != rule->mark_mask) +		if (rule->mark_mask && r->mark_mask != rule->mark_mask)  			continue; -		if (r->tun_id != rule->tun_id) +		if (rule->tun_id && r->tun_id != rule->tun_id)  			continue;  		if (r->fr_net != rule->fr_net)  			continue; -		if (r->l3mdev != rule->l3mdev) +		if (rule->l3mdev && r->l3mdev != rule->l3mdev)  			continue; -		if (!uid_eq(r->uid_range.start, rule->uid_range.start) || -		    !uid_eq(r->uid_range.end, rule->uid_range.end)) +		if (uid_range_set(&rule->uid_range) && +		    (!uid_eq(r->uid_range.start, rule->uid_range.start) || +		    !uid_eq(r->uid_range.end, rule->uid_range.end)))  			continue; -		if (r->ip_proto != rule->ip_proto) +		if (rule->ip_proto && r->ip_proto != rule->ip_proto)  			continue; -		if (!fib_rule_port_range_compare(&r->sport_range, +		if (fib_rule_port_range_set(&rule->sport_range) && +		    
!fib_rule_port_range_compare(&r->sport_range,  						 &rule->sport_range))  			continue; -		if (!fib_rule_port_range_compare(&r->dport_range, +		if (fib_rule_port_range_set(&rule->dport_range) && +		    !fib_rule_port_range_compare(&r->dport_range,  						 &rule->dport_range))  			continue;  		if (!ops->compare(r, frh, tb))  			continue; -		return 1; +		return r; +	} + +	return NULL; +} + +#ifdef CONFIG_NET_L3_MASTER_DEV +static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, +			      struct netlink_ext_ack *extack) +{ +	nlrule->l3mdev = nla_get_u8(nla); +	if (nlrule->l3mdev != 1) { +		NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute"); +		return -1;  	} +  	return 0;  } +#else +static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, +			      struct netlink_ext_ack *extack) +{ +	NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel"); +	return -1; +} +#endif -int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, -		   struct netlink_ext_ack *extack) +static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, +		       struct netlink_ext_ack *extack, +		       struct fib_rules_ops *ops, +		       struct nlattr *tb[], +		       struct fib_rule **rule, +		       bool *user_priority)  {  	struct net *net = sock_net(skb->sk);  	struct fib_rule_hdr *frh = nlmsg_data(nlh); -	struct fib_rules_ops *ops = NULL; -	struct fib_rule *rule, *r, *last = NULL; -	struct nlattr *tb[FRA_MAX+1]; -	int err = -EINVAL, unresolved = 0; - -	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) -		goto errout; +	struct fib_rule *nlrule = NULL; +	int err = -EINVAL; -	ops = lookup_rules_ops(net, frh->family); -	if (ops == NULL) { -		err = -EAFNOSUPPORT; -		goto errout; +	if (frh->src_len) +		if (!tb[FRA_SRC] || +		    frh->src_len > (ops->addr_size * 8) || +		    nla_len(tb[FRA_SRC]) != ops->addr_size) { +			NL_SET_ERR_MSG(extack, "Invalid source address"); +			goto errout;  	} -	err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); -	if (err < 0) -		goto errout; - -	err = validate_rulemsg(frh, tb, ops); -	if (err < 0) -		goto errout; +	if (frh->dst_len) +		if (!tb[FRA_DST] || +		    frh->dst_len > (ops->addr_size * 8) || +		    nla_len(tb[FRA_DST]) != ops->addr_size) { +			NL_SET_ERR_MSG(extack, "Invalid dst address"); +			goto errout; +	} -	rule = kzalloc(ops->rule_size, GFP_KERNEL); -	if (rule == NULL) { +	nlrule = kzalloc(ops->rule_size, GFP_KERNEL); +	if (!nlrule) {  		err = -ENOMEM;  		goto errout;  	} -	refcount_set(&rule->refcnt, 1); -	rule->fr_net = net; +	refcount_set(&nlrule->refcnt, 1); +	nlrule->fr_net = net; -	rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY]) -	                              : fib_default_rule_pref(ops); +	if (tb[FRA_PRIORITY]) { +		nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]); +		*user_priority = true; +	} else { +		nlrule->pref = fib_default_rule_pref(ops); +	} -	rule->proto = tb[FRA_PROTOCOL] ? +	nlrule->proto = tb[FRA_PROTOCOL] ?  		
nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC;  	if (tb[FRA_IIFNAME]) {  		struct net_device *dev; -		rule->iifindex = -1; -		nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); -		dev = __dev_get_by_name(net, rule->iifname); +		nlrule->iifindex = -1; +		nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); +		dev = __dev_get_by_name(net, nlrule->iifname);  		if (dev) -			rule->iifindex = dev->ifindex; +			nlrule->iifindex = dev->ifindex;  	}  	if (tb[FRA_OIFNAME]) {  		struct net_device *dev; -		rule->oifindex = -1; -		nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); -		dev = __dev_get_by_name(net, rule->oifname); +		nlrule->oifindex = -1; +		nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); +		dev = __dev_get_by_name(net, nlrule->oifname);  		if (dev) -			rule->oifindex = dev->ifindex; +			nlrule->oifindex = dev->ifindex;  	}  	if (tb[FRA_FWMARK]) { -		rule->mark = nla_get_u32(tb[FRA_FWMARK]); -		if (rule->mark) +		nlrule->mark = nla_get_u32(tb[FRA_FWMARK]); +		if (nlrule->mark)  			/* compatibility: if the mark value is non-zero all bits  			 * are compared unless a mask is explicitly specified.  			 */ -			rule->mark_mask = 0xFFFFFFFF; +			nlrule->mark_mask = 0xFFFFFFFF;  	}  	if (tb[FRA_FWMASK]) -		rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); +		nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);  	if (tb[FRA_TUN_ID]) -		rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); +		nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);  	err = -EINVAL; -	if (tb[FRA_L3MDEV]) { -#ifdef CONFIG_NET_L3_MASTER_DEV -		rule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]); -		if (rule->l3mdev != 1) -#endif -			goto errout_free; -	} +	if (tb[FRA_L3MDEV] && +	    fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0) +		goto errout_free; -	rule->action = frh->action; -	rule->flags = frh->flags; -	rule->table = frh_get_table(frh, tb); +	nlrule->action = frh->action; +	nlrule->flags = frh->flags; +	nlrule->table = frh_get_table(frh, tb);  	if (tb[FRA_SUPPRESS_PREFIXLEN]) -		rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); +		nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);  	else -		rule->suppress_prefixlen = -1; +		nlrule->suppress_prefixlen = -1;  	if (tb[FRA_SUPPRESS_IFGROUP]) -		rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); +		nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);  	else -		rule->suppress_ifgroup = -1; +		nlrule->suppress_ifgroup = -1;  	if (tb[FRA_GOTO]) { -		if (rule->action != FR_ACT_GOTO) +		if (nlrule->action != FR_ACT_GOTO) { +			NL_SET_ERR_MSG(extack, "Unexpected goto");  			goto errout_free; +		} -		rule->target = nla_get_u32(tb[FRA_GOTO]); +		nlrule->target = nla_get_u32(tb[FRA_GOTO]);  		/* Backward jumps are prohibited to avoid endless loops */ -		if (rule->target <= rule->pref) +		if (nlrule->target <= nlrule->pref) { +			NL_SET_ERR_MSG(extack, "Backward goto not supported");  			goto errout_free; - -		list_for_each_entry(r, &ops->rules_list, list) { -			if (r->pref == rule->target) { -				RCU_INIT_POINTER(rule->ctarget, r); -				break; -			}  		} - -		if (rcu_dereference_protected(rule->ctarget, 1) == NULL) -			unresolved = 1; -	} else if (rule->action == FR_ACT_GOTO) +	} else if (nlrule->action == FR_ACT_GOTO) { +		NL_SET_ERR_MSG(extack, "Missing goto target for action goto");  		goto errout_free; +	} -	if (rule->l3mdev && rule->table) +	if (nlrule->l3mdev && nlrule->table) { +		NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive");  		goto errout_free; +	}  	if (tb[FRA_UID_RANGE]) {  		if 
(current_user_ns() != net->user_ns) {  			err = -EPERM; +			NL_SET_ERR_MSG(extack, "No permission to set uid");  			goto errout_free;  		} -		rule->uid_range = nla_get_kuid_range(tb); +		nlrule->uid_range = nla_get_kuid_range(tb); -		if (!uid_range_set(&rule->uid_range) || -		    !uid_lte(rule->uid_range.start, rule->uid_range.end)) +		if (!uid_range_set(&nlrule->uid_range) || +		    !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) { +			NL_SET_ERR_MSG(extack, "Invalid uid range");  			goto errout_free; +		}  	} else { -		rule->uid_range = fib_kuid_range_unset; +		nlrule->uid_range = fib_kuid_range_unset;  	}  	if (tb[FRA_IP_PROTO]) -		rule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); +		nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);  	if (tb[FRA_SPORT_RANGE]) {  		err = nla_get_port_range(tb[FRA_SPORT_RANGE], -					 &rule->sport_range); -		if (err) +					 &nlrule->sport_range); +		if (err) { +			NL_SET_ERR_MSG(extack, "Invalid sport range");  			goto errout_free; +		}  	}  	if (tb[FRA_DPORT_RANGE]) {  		err = nla_get_port_range(tb[FRA_DPORT_RANGE], -					 &rule->dport_range); -		if (err) +					 &nlrule->dport_range); +		if (err) { +			NL_SET_ERR_MSG(extack, "Invalid dport range");  			goto errout_free; +		}  	} +	*rule = nlrule; + +	return 0; + +errout_free: +	kfree(nlrule); +errout: +	return err; +} + +int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, +		   struct netlink_ext_ack *extack) +{ +	struct net *net = sock_net(skb->sk); +	struct fib_rule_hdr *frh = nlmsg_data(nlh); +	struct fib_rules_ops *ops = NULL; +	struct fib_rule *rule = NULL, *r, *last = NULL; +	struct nlattr *tb[FRA_MAX + 1]; +	int err = -EINVAL, unresolved = 0; +	bool user_priority = false; + +	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { +		NL_SET_ERR_MSG(extack, "Invalid msg length"); +		goto errout; +	} + +	ops = lookup_rules_ops(net, frh->family); +	if (!ops) { +		err = -EAFNOSUPPORT; +		NL_SET_ERR_MSG(extack, "Rule family not supported"); +		goto errout; +	} + +	err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); +	if (err < 0) { +		NL_SET_ERR_MSG(extack, "Error parsing msg"); +		goto errout; +	} + +	err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority); +	if (err) +		goto errout; +  	if ((nlh->nlmsg_flags & NLM_F_EXCL) && -	    rule_exists(ops, frh, tb, rule)) { +	    rule_find(ops, frh, tb, rule, user_priority)) {  		err = -EEXIST;  		goto errout_free;  	} -	err = ops->configure(rule, skb, frh, tb); +	err = ops->configure(rule, skb, frh, tb, extack);  	if (err < 0)  		goto errout_free; @@ -637,6 +694,16 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,  		goto errout_free;  	list_for_each_entry(r, &ops->rules_list, list) { +		if (r->pref == rule->target) { +			RCU_INIT_POINTER(rule->ctarget, r); +			break; +		} +	} + +	if (rcu_dereference_protected(rule->ctarget, 1) == NULL) +		unresolved = 1; + +	list_for_each_entry(r, &ops->rules_list, list) {  		if (r->pref > rule->pref)  			break;  		last = r; @@ -690,171 +757,97 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,  {  	struct net *net = sock_net(skb->sk);  	struct fib_rule_hdr *frh = nlmsg_data(nlh); -	struct fib_rule_port_range sprange = {0, 0}; -	struct fib_rule_port_range dprange = {0, 0};  	struct fib_rules_ops *ops = NULL; -	struct fib_rule *rule, *r; +	struct fib_rule *rule = NULL, *r, *nlrule = NULL;  	struct nlattr *tb[FRA_MAX+1]; -	struct fib_kuid_range range;  	int err = -EINVAL; +	bool user_priority = false; -	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) 
+	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { +		NL_SET_ERR_MSG(extack, "Invalid msg length");  		goto errout; +	}  	ops = lookup_rules_ops(net, frh->family);  	if (ops == NULL) {  		err = -EAFNOSUPPORT; +		NL_SET_ERR_MSG(extack, "Rule family not supported");  		goto errout;  	}  	err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); -	if (err < 0) +	if (err < 0) { +		NL_SET_ERR_MSG(extack, "Error parsing msg");  		goto errout; +	} -	err = validate_rulemsg(frh, tb, ops); -	if (err < 0) +	err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority); +	if (err)  		goto errout; -	if (tb[FRA_UID_RANGE]) { -		range = nla_get_kuid_range(tb); -		if (!uid_range_set(&range)) { -			err = -EINVAL; -			goto errout; -		} -	} else { -		range = fib_kuid_range_unset; +	rule = rule_find(ops, frh, tb, nlrule, user_priority); +	if (!rule) { +		err = -ENOENT; +		goto errout;  	} -	if (tb[FRA_SPORT_RANGE]) { -		err = nla_get_port_range(tb[FRA_SPORT_RANGE], -					 &sprange); -		if (err) -			goto errout; +	if (rule->flags & FIB_RULE_PERMANENT) { +		err = -EPERM; +		goto errout;  	} -	if (tb[FRA_DPORT_RANGE]) { -		err = nla_get_port_range(tb[FRA_DPORT_RANGE], -					 &dprange); +	if (ops->delete) { +		err = ops->delete(rule);  		if (err)  			goto errout;  	} -	list_for_each_entry(rule, &ops->rules_list, list) { -		if (tb[FRA_PROTOCOL] && -		    (rule->proto != nla_get_u8(tb[FRA_PROTOCOL]))) -			continue; - -		if (frh->action && (frh->action != rule->action)) -			continue; - -		if (frh_get_table(frh, tb) && -		    (frh_get_table(frh, tb) != rule->table)) -			continue; - -		if (tb[FRA_PRIORITY] && -		    (rule->pref != nla_get_u32(tb[FRA_PRIORITY]))) -			continue; - -		if (tb[FRA_IIFNAME] && -		    nla_strcmp(tb[FRA_IIFNAME], rule->iifname)) -			continue; - -		if (tb[FRA_OIFNAME] && -		    nla_strcmp(tb[FRA_OIFNAME], rule->oifname)) -			continue; - -		if (tb[FRA_FWMARK] && -		    (rule->mark != nla_get_u32(tb[FRA_FWMARK]))) -			continue; - -		if (tb[FRA_FWMASK] && -		    (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK]))) -			continue; - -		if (tb[FRA_TUN_ID] && -		    (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID]))) -			continue; - -		if (tb[FRA_L3MDEV] && -		    (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV]))) -			continue; - -		if (uid_range_set(&range) && -		    (!uid_eq(rule->uid_range.start, range.start) || -		     !uid_eq(rule->uid_range.end, range.end))) -			continue; - -		if (tb[FRA_IP_PROTO] && -		    (rule->ip_proto != nla_get_u8(tb[FRA_IP_PROTO]))) -			continue; - -		if (fib_rule_port_range_set(&sprange) && -		    !fib_rule_port_range_compare(&rule->sport_range, &sprange)) -			continue; - -		if (fib_rule_port_range_set(&dprange) && -		    !fib_rule_port_range_compare(&rule->dport_range, &dprange)) -			continue; - -		if (!ops->compare(rule, frh, tb)) -			continue; - -		if (rule->flags & FIB_RULE_PERMANENT) { -			err = -EPERM; -			goto errout; -		} - -		if (ops->delete) { -			err = ops->delete(rule); -			if (err) -				goto errout; -		} +	if (rule->tun_id) +		ip_tunnel_unneed_metadata(); -		if (rule->tun_id) -			ip_tunnel_unneed_metadata(); +	list_del_rcu(&rule->list); -		list_del_rcu(&rule->list); - -		if (rule->action == FR_ACT_GOTO) { -			ops->nr_goto_rules--; -			if (rtnl_dereference(rule->ctarget) == NULL) -				ops->unresolved_rules--; -		} +	if (rule->action == FR_ACT_GOTO) { +		ops->nr_goto_rules--; +		if (rtnl_dereference(rule->ctarget) == NULL) +			ops->unresolved_rules--; +	} -		/* -		 * Check if this rule is a target to any of them. 
If so, -		 * adjust to the next one with the same preference or -		 * disable them. As this operation is eventually very -		 * expensive, it is only performed if goto rules, except -		 * current if it is goto rule, have actually been added. -		 */ -		if (ops->nr_goto_rules > 0) { -			struct fib_rule *n; - -			n = list_next_entry(rule, list); -			if (&n->list == &ops->rules_list || n->pref != rule->pref) -				n = NULL; -			list_for_each_entry(r, &ops->rules_list, list) { -				if (rtnl_dereference(r->ctarget) != rule) -					continue; -				rcu_assign_pointer(r->ctarget, n); -				if (!n) -					ops->unresolved_rules++; -			} +	/* +	 * Check if this rule is a target to any of them. If so, +	 * adjust to the next one with the same preference or +	 * disable them. As this operation is eventually very +	 * expensive, it is only performed if goto rules, except +	 * current if it is goto rule, have actually been added. +	 */ +	if (ops->nr_goto_rules > 0) { +		struct fib_rule *n; + +		n = list_next_entry(rule, list); +		if (&n->list == &ops->rules_list || n->pref != rule->pref) +			n = NULL; +		list_for_each_entry(r, &ops->rules_list, list) { +			if (rtnl_dereference(r->ctarget) != rule) +				continue; +			rcu_assign_pointer(r->ctarget, n); +			if (!n) +				ops->unresolved_rules++;  		} - -		call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, -					NULL); -		notify_rule_change(RTM_DELRULE, rule, ops, nlh, -				   NETLINK_CB(skb).portid); -		fib_rule_put(rule); -		flush_route_cache(ops); -		rules_ops_put(ops); -		return 0;  	} -	err = -ENOENT; +	call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, +				NULL); +	notify_rule_change(RTM_DELRULE, rule, ops, nlh, +			   NETLINK_CB(skb).portid); +	fib_rule_put(rule); +	flush_route_cache(ops); +	rules_ops_put(ops); +	kfree(nlrule); +	return 0; +  errout: +	if (nlrule) +		kfree(nlrule);  	rules_ops_put(ops);  	return err;  } diff --git a/net/core/filter.c b/net/core/filter.c index 201ff36b17a8..3d9ba7e5965a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -57,7 +57,17 @@  #include <net/sock_reuseport.h>  #include <net/busy_poll.h>  #include <net/tcp.h> +#include <net/xfrm.h>  #include <linux/bpf_trace.h> +#include <net/xdp_sock.h> +#include <linux/inetdevice.h> +#include <net/ip_fib.h> +#include <net/flow.h> +#include <net/arp.h> +#include <net/ipv6.h> +#include <linux/seg6_local.h> +#include <net/seg6.h> +#include <net/seg6_local.h>  /**   *	sk_filter_trim_cap - run a packet through a socket filter @@ -111,12 +121,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)  }  EXPORT_SYMBOL(sk_filter_trim_cap); -BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb) +BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)  {  	return skb_get_poff(skb);  } -BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) +BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)  {  	struct nlattr *nla; @@ -136,7 +146,7 @@ BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)  	return 0;  } -BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) +BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)  {  	struct nlattr *nla; @@ -160,13 +170,94 @@ BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)  	return 0;  } -BPF_CALL_0(__get_raw_cpu_id) +BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, +	   data, int, headlen, int, offset) +{ +	u8 tmp, *ptr; +	const int len = sizeof(tmp); + +	if (offset >= 
0) { +		if (headlen - offset >= len) +			return *(u8 *)(data + offset); +		if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) +			return tmp; +	} else { +		ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); +		if (likely(ptr)) +			return *(u8 *)ptr; +	} + +	return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, +	   int, offset) +{ +	return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len, +					 offset); +} + +BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, +	   data, int, headlen, int, offset) +{ +	u16 tmp, *ptr; +	const int len = sizeof(tmp); + +	if (offset >= 0) { +		if (headlen - offset >= len) +			return get_unaligned_be16(data + offset); +		if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) +			return be16_to_cpu(tmp); +	} else { +		ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); +		if (likely(ptr)) +			return get_unaligned_be16(ptr); +	} + +	return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, +	   int, offset) +{ +	return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len, +					  offset); +} + +BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, +	   data, int, headlen, int, offset) +{ +	u32 tmp, *ptr; +	const int len = sizeof(tmp); + +	if (likely(offset >= 0)) { +		if (headlen - offset >= len) +			return get_unaligned_be32(data + offset); +		if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) +			return be32_to_cpu(tmp); +	} else { +		ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); +		if (likely(ptr)) +			return get_unaligned_be32(ptr); +	} + +	return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, +	   int, offset) +{ +	return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len, +					  offset); +} + +BPF_CALL_0(bpf_get_raw_cpu_id)  {  	return raw_smp_processor_id();  }  static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { -	.func		= __get_raw_cpu_id, +	.func		= bpf_get_raw_cpu_id,  	.gpl_only	= false,  	.ret_type	= RET_INTEGER,  }; @@ -316,16 +407,16 @@ static bool convert_bpf_extensions(struct sock_filter *fp,  		/* Emit call(arg1=CTX, arg2=A, arg3=X) */  		switch (fp->k) {  		case SKF_AD_OFF + SKF_AD_PAY_OFFSET: -			*insn = BPF_EMIT_CALL(__skb_get_pay_offset); +			*insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset);  			break;  		case SKF_AD_OFF + SKF_AD_NLATTR: -			*insn = BPF_EMIT_CALL(__skb_get_nlattr); +			*insn = BPF_EMIT_CALL(bpf_skb_get_nlattr);  			break;  		case SKF_AD_OFF + SKF_AD_NLATTR_NEST: -			*insn = BPF_EMIT_CALL(__skb_get_nlattr_nest); +			*insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest);  			break;  		case SKF_AD_OFF + SKF_AD_CPU: -			*insn = BPF_EMIT_CALL(__get_raw_cpu_id); +			*insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id);  			break;  		case SKF_AD_OFF + SKF_AD_RANDOM:  			*insn = BPF_EMIT_CALL(bpf_user_rnd_u32); @@ -352,26 +443,87 @@ static bool convert_bpf_extensions(struct sock_filter *fp,  	return true;  } +static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) +{ +	const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS); +	int size = bpf_size_to_bytes(BPF_SIZE(fp->code)); +	bool endian = BPF_SIZE(fp->code) == BPF_H || +		      BPF_SIZE(fp->code) == BPF_W; +	bool indirect = BPF_MODE(fp->code) == BPF_IND; +	const int ip_align = NET_IP_ALIGN; +	struct bpf_insn *insn = *insnp; +	int offset = fp->k; + +	if (!indirect && +	    
((unaligned_ok && offset >= 0) || +	     (!unaligned_ok && offset >= 0 && +	      offset + ip_align >= 0 && +	      offset + ip_align % size == 0))) { +		*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); +		*insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); +		*insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian); +		*insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D, +				      offset); +		if (endian) +			*insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); +		*insn++ = BPF_JMP_A(8); +	} + +	*insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); +	*insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D); +	*insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H); +	if (!indirect) { +		*insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset); +	} else { +		*insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X); +		if (fp->k) +			*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset); +	} + +	switch (BPF_SIZE(fp->code)) { +	case BPF_B: +		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8); +		break; +	case BPF_H: +		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16); +		break; +	case BPF_W: +		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32); +		break; +	default: +		return false; +	} + +	*insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2); +	*insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); +	*insn   = BPF_EXIT_INSN(); + +	*insnp = insn; +	return true; +} +  /**   *	bpf_convert_filter - convert filter program   *	@prog: the user passed filter program   *	@len: the length of the user passed filter program   *	@new_prog: allocated 'struct bpf_prog' or NULL   *	@new_len: pointer to store length of converted program + *	@seen_ld_abs: bool whether we've seen ld_abs/ind   *   * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'   * style extended BPF (eBPF).   * Conversion workflow:   *   * 1) First pass for calculating the new program length: - *   bpf_convert_filter(old_prog, old_len, NULL, &new_len) + *   bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs)   *   * 2) 2nd pass to remap in two passes: 1st pass finds new   *    jump offsets, 2nd pass remapping: - *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len); + *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs)   */  static int bpf_convert_filter(struct sock_filter *prog, int len, -			      struct bpf_prog *new_prog, int *new_len) +			      struct bpf_prog *new_prog, int *new_len, +			      bool *seen_ld_abs)  {  	int new_flen = 0, pass = 0, target, i, stack_off;  	struct bpf_insn *new_insn, *first_insn = NULL; @@ -410,12 +562,27 @@ do_pass:  		 * do this ourself. Initial CTX is present in BPF_REG_ARG1.  		 */  		*new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); +		if (*seen_ld_abs) { +			/* For packet access in classic BPF, cache skb->data +			 * in callee-saved BPF R8 and skb->len - skb->data_len +			 * (headlen) in BPF R9. Since classic BPF is read-only +			 * on CTX, we only need to cache it once. 
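+			 * The loads below cache skb->data in BPF_REG_D and
+			 * the headlen in BPF_REG_H; convert_bpf_ld_abs() uses
+			 * these registers both for its inlined fast path and
+			 * as arguments to the bpf_skb_load_helper_*() calls.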
+			 */ +			*new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), +						  BPF_REG_D, BPF_REG_CTX, +						  offsetof(struct sk_buff, data)); +			*new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX, +						  offsetof(struct sk_buff, len)); +			*new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX, +						  offsetof(struct sk_buff, data_len)); +			*new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP); +		}  	} else {  		new_insn += 3;  	}  	for (i = 0; i < len; fp++, i++) { -		struct bpf_insn tmp_insns[6] = { }; +		struct bpf_insn tmp_insns[32] = { };  		struct bpf_insn *insn = tmp_insns;  		if (addrs) @@ -458,6 +625,11 @@ do_pass:  			    BPF_MODE(fp->code) == BPF_ABS &&  			    convert_bpf_extensions(fp, &insn))  				break; +			if (BPF_CLASS(fp->code) == BPF_LD && +			    convert_bpf_ld_abs(fp, &insn)) { +				*seen_ld_abs = true; +				break; +			}  			if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||  			    fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { @@ -567,21 +739,31 @@ jmp_rest:  			break;  		/* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ -		case BPF_LDX | BPF_MSH | BPF_B: -			/* tmp = A */ -			*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); +		case BPF_LDX | BPF_MSH | BPF_B: { +			struct sock_filter tmp = { +				.code	= BPF_LD | BPF_ABS | BPF_B, +				.k	= fp->k, +			}; + +			*seen_ld_abs = true; + +			/* X = A */ +			*insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);  			/* A = BPF_R0 = *(u8 *) (skb->data + K) */ -			*insn++ = BPF_LD_ABS(BPF_B, fp->k); +			convert_bpf_ld_abs(&tmp, &insn); +			insn++;  			/* A &= 0xf */  			*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);  			/* A <<= 2 */  			*insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); +			/* tmp = X */ +			*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X);  			/* X = A */  			*insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);  			/* A = tmp */  			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);  			break; - +		}  		/* RET_K is remaped into 2 insns. RET_A case doesn't need an  		 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.  		 */ @@ -663,6 +845,8 @@ jmp_rest:  	if (!new_prog) {  		/* Only calculating new length. */  		*new_len = new_insn - first_insn; +		if (*seen_ld_abs) +			*new_len += 4; /* Prologue bits. */  		return 0;  	} @@ -1024,6 +1208,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)  	struct sock_filter *old_prog;  	struct bpf_prog *old_fp;  	int err, new_len, old_len = fp->len; +	bool seen_ld_abs = false;  	/* We are free to overwrite insns et al right here as it  	 * won't be used at this point in time anymore internally @@ -1045,7 +1230,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)  	}  	/* 1st pass: calculate the new program length. */ -	err = bpf_convert_filter(old_prog, old_len, NULL, &new_len); +	err = bpf_convert_filter(old_prog, old_len, NULL, &new_len, +				 &seen_ld_abs);  	if (err)  		goto out_err_free; @@ -1064,7 +1250,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)  	fp->len = new_len;  	/* 2nd pass: remap sock_filter insns into bpf_insn insns. */ -	err = bpf_convert_filter(old_prog, old_len, fp, &new_len); +	err = bpf_convert_filter(old_prog, old_len, fp, &new_len, +				 &seen_ld_abs);  	if (err)  		/* 2nd bpf_convert_filter() can fail only if it fails  		 * to allocate memory, remapping must succeed. 
Note, @@ -1512,6 +1699,47 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {  	.arg4_type	= ARG_CONST_SIZE,  }; +BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, +	   u32, offset, void *, to, u32, len, u32, start_header) +{ +	u8 *ptr; + +	if (unlikely(offset > 0xffff || len > skb_headlen(skb))) +		goto err_clear; + +	switch (start_header) { +	case BPF_HDR_START_MAC: +		ptr = skb_mac_header(skb) + offset; +		break; +	case BPF_HDR_START_NET: +		ptr = skb_network_header(skb) + offset; +		break; +	default: +		goto err_clear; +	} + +	if (likely(ptr >= skb_mac_header(skb) && +		   ptr + len <= skb_tail_pointer(skb))) { +		memcpy(to, ptr, len); +		return 0; +	} + +err_clear: +	memset(to, 0, len); +	return -EFAULT; +} + +static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = { +	.func		= bpf_skb_load_bytes_relative, +	.gpl_only	= false, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type	= ARG_ANYTHING, +	.arg3_type	= ARG_PTR_TO_UNINIT_MEM, +	.arg4_type	= ARG_CONST_SIZE, +	.arg5_type	= ARG_ANYTHING, +}; +  BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)  {  	/* Idea is the following: should the needed direct read/write @@ -1857,6 +2085,33 @@ static const struct bpf_func_proto bpf_redirect_proto = {  	.arg2_type      = ARG_ANYTHING,  }; +BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, +	   struct bpf_map *, map, void *, key, u64, flags) +{ +	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + +	/* If user passes invalid input drop the packet. */ +	if (unlikely(flags & ~(BPF_F_INGRESS))) +		return SK_DROP; + +	tcb->bpf.flags = flags; +	tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key); +	if (!tcb->bpf.sk_redir) +		return SK_DROP; + +	return SK_PASS; +} + +static const struct bpf_func_proto bpf_sk_redirect_hash_proto = { +	.func           = bpf_sk_redirect_hash, +	.gpl_only       = false, +	.ret_type       = RET_INTEGER, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type      = ARG_CONST_MAP_PTR, +	.arg3_type      = ARG_PTR_TO_MAP_KEY, +	.arg4_type      = ARG_ANYTHING, +}; +  BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,  	   struct bpf_map *, map, u32, key, u64, flags)  { @@ -1866,9 +2121,10 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,  	if (unlikely(flags & ~(BPF_F_INGRESS)))  		return SK_DROP; -	tcb->bpf.key = key;  	tcb->bpf.flags = flags; -	tcb->bpf.map = map; +	tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key); +	if (!tcb->bpf.sk_redir) +		return SK_DROP;  	return SK_PASS;  } @@ -1876,16 +2132,8 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,  struct sock *do_sk_redirect_map(struct sk_buff *skb)  {  	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); -	struct sock *sk = NULL; - -	if (tcb->bpf.map) { -		sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key); -		tcb->bpf.key = 0; -		tcb->bpf.map = NULL; -	} - -	return sk; +	return tcb->bpf.sk_redir;  }  static const struct bpf_func_proto bpf_sk_redirect_map_proto = { @@ -1898,32 +2146,49 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = {  	.arg4_type      = ARG_ANYTHING,  }; -BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, -	   struct bpf_map *, map, u32, key, u64, flags) +BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg, +	   struct bpf_map *, map, void *, key, u64, flags)  {  	/* If user passes invalid input drop the packet. 
*/  	if (unlikely(flags & ~(BPF_F_INGRESS)))  		return SK_DROP; -	msg->key = key;  	msg->flags = flags; -	msg->map = map; +	msg->sk_redir = __sock_hash_lookup_elem(map, key); +	if (!msg->sk_redir) +		return SK_DROP;  	return SK_PASS;  } -struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) +static const struct bpf_func_proto bpf_msg_redirect_hash_proto = { +	.func           = bpf_msg_redirect_hash, +	.gpl_only       = false, +	.ret_type       = RET_INTEGER, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type      = ARG_CONST_MAP_PTR, +	.arg3_type      = ARG_PTR_TO_MAP_KEY, +	.arg4_type      = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, +	   struct bpf_map *, map, u32, key, u64, flags)  { -	struct sock *sk = NULL; +	/* If user passes invalid input drop the packet. */ +	if (unlikely(flags & ~(BPF_F_INGRESS))) +		return SK_DROP; -	if (msg->map) { -		sk = __sock_map_lookup_elem(msg->map, msg->key); +	msg->flags = flags; +	msg->sk_redir = __sock_map_lookup_elem(map, key); +	if (!msg->sk_redir) +		return SK_DROP; -		msg->key = 0; -		msg->map = NULL; -	} +	return SK_PASS; +} -	return sk; +struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) +{ +	return msg->sk_redir;  }  static const struct bpf_func_proto bpf_msg_redirect_map_proto = { @@ -2186,7 +2451,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,  	return ret;  } -const struct bpf_func_proto bpf_skb_vlan_push_proto = { +static const struct bpf_func_proto bpf_skb_vlan_push_proto = {  	.func           = bpf_skb_vlan_push,  	.gpl_only       = false,  	.ret_type       = RET_INTEGER, @@ -2194,7 +2459,6 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = {  	.arg2_type      = ARG_ANYTHING,  	.arg3_type      = ARG_ANYTHING,  }; -EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);  BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)  { @@ -2208,13 +2472,12 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)  	return ret;  } -const struct bpf_func_proto bpf_skb_vlan_pop_proto = { +static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {  	.func           = bpf_skb_vlan_pop,  	.gpl_only       = false,  	.ret_type       = RET_INTEGER,  	.arg1_type      = ARG_PTR_TO_CTX,  }; -EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);  static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)  { @@ -2699,8 +2962,9 @@ static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)  BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)  { +	void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);  	unsigned long metalen = xdp_get_metalen(xdp); -	void *data_start = xdp->data_hard_start + metalen; +	void *data_start = xdp_frame_end + metalen;  	void *data = xdp->data + offset;  	if (unlikely(data < data_start || @@ -2724,14 +2988,39 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {  	.arg2_type	= ARG_ANYTHING,  }; +BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) +{ +	void *data_end = xdp->data_end + offset; + +	/* only shrinking is allowed for now. 
*/ +	if (unlikely(offset >= 0)) +		return -EINVAL; + +	if (unlikely(data_end < xdp->data + ETH_HLEN)) +		return -EINVAL; + +	xdp->data_end = data_end; + +	return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = { +	.func		= bpf_xdp_adjust_tail, +	.gpl_only	= false, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type	= ARG_ANYTHING, +}; +  BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)  { +	void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);  	void *meta = xdp->data_meta + offset;  	unsigned long metalen = xdp->data - meta;  	if (xdp_data_meta_unsupported(xdp))  		return -ENOTSUPP; -	if (unlikely(meta < xdp->data_hard_start || +	if (unlikely(meta < xdp_frame_end ||  		     meta > xdp->data))  		return -EINVAL;  	if (unlikely((metalen & (sizeof(__u32) - 1)) || @@ -2756,16 +3045,20 @@ static int __bpf_tx_xdp(struct net_device *dev,  			struct xdp_buff *xdp,  			u32 index)  { -	int err; +	struct xdp_frame *xdpf; +	int sent;  	if (!dev->netdev_ops->ndo_xdp_xmit) {  		return -EOPNOTSUPP;  	} -	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); -	if (err) -		return err; -	dev->netdev_ops->ndo_xdp_flush(dev); +	xdpf = convert_to_xdp_frame(xdp); +	if (unlikely(!xdpf)) +		return -EOVERFLOW; + +	sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH); +	if (sent <= 0) +		return sent;  	return 0;  } @@ -2776,24 +3069,33 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,  {  	int err; -	if (map->map_type == BPF_MAP_TYPE_DEVMAP) { -		struct net_device *dev = fwd; - -		if (!dev->netdev_ops->ndo_xdp_xmit) -			return -EOPNOTSUPP; +	switch (map->map_type) { +	case BPF_MAP_TYPE_DEVMAP: { +		struct bpf_dtab_netdev *dst = fwd; -		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); +		err = dev_map_enqueue(dst, xdp, dev_rx);  		if (err)  			return err;  		__dev_map_insert_ctx(map, index); - -	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { +		break; +	} +	case BPF_MAP_TYPE_CPUMAP: {  		struct bpf_cpu_map_entry *rcpu = fwd;  		err = cpu_map_enqueue(rcpu, xdp, dev_rx);  		if (err)  			return err;  		__cpu_map_insert_ctx(map, index); +		break; +	} +	case BPF_MAP_TYPE_XSKMAP: { +		struct xdp_sock *xs = fwd; + +		err = __xsk_map_redirect(map, xdp, xs); +		return err; +	} +	default: +		break;  	}  	return 0;  } @@ -2812,6 +3114,9 @@ void xdp_do_flush_map(void)  		case BPF_MAP_TYPE_CPUMAP:  			__cpu_map_flush(map);  			break; +		case BPF_MAP_TYPE_XSKMAP: +			__xsk_map_flush(map); +			break;  		default:  			break;  		} @@ -2826,6 +3131,8 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)  		return __dev_map_lookup_elem(map, index);  	case BPF_MAP_TYPE_CPUMAP:  		return __cpu_map_lookup_elem(map, index); +	case BPF_MAP_TYPE_XSKMAP: +		return __xsk_map_lookup_elem(map, index);  	default:  		return NULL;  	} @@ -2923,13 +3230,14 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd)  static int xdp_do_generic_redirect_map(struct net_device *dev,  				       struct sk_buff *skb, +				       struct xdp_buff *xdp,  				       struct bpf_prog *xdp_prog)  {  	struct redirect_info *ri = this_cpu_ptr(&redirect_info);  	unsigned long map_owner = ri->map_owner;  	struct bpf_map *map = ri->map; -	struct net_device *fwd = NULL;  	u32 index = ri->ifindex; +	void *fwd = NULL;  	int err = 0;  	ri->ifindex = 0; @@ -2951,6 +3259,14 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,  		if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd))))  			goto err;  		skb->dev = 
fwd; +		generic_xdp_tx(skb, xdp_prog); +	} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { +		struct xdp_sock *xs = fwd; + +		err = xsk_generic_rcv(xs, xdp); +		if (err) +			goto err; +		consume_skb(skb);  	} else {  		/* TODO: Handle BPF_MAP_TYPE_CPUMAP */  		err = -EBADRQC; @@ -2965,7 +3281,7 @@ err:  }  int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, -			    struct bpf_prog *xdp_prog) +			    struct xdp_buff *xdp, struct bpf_prog *xdp_prog)  {  	struct redirect_info *ri = this_cpu_ptr(&redirect_info);  	u32 index = ri->ifindex; @@ -2973,7 +3289,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,  	int err = 0;  	if (ri->map) -		return xdp_do_generic_redirect_map(dev, skb, xdp_prog); +		return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog);  	ri->ifindex = 0;  	fwd = dev_get_by_index_rcu(dev_net(dev), index); @@ -2987,6 +3303,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,  	skb->dev = fwd;  	_trace_xdp_redirect(dev, xdp_prog, index); +	generic_xdp_tx(skb, xdp_prog);  	return 0;  err:  	_trace_xdp_redirect_err(dev, xdp_prog, index, err); @@ -3045,27 +3362,6 @@ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {  	.arg3_type      = ARG_ANYTHING,  }; -bool bpf_helper_changes_pkt_data(void *func) -{ -	if (func == bpf_skb_vlan_push || -	    func == bpf_skb_vlan_pop || -	    func == bpf_skb_store_bytes || -	    func == bpf_skb_change_proto || -	    func == bpf_skb_change_head || -	    func == bpf_skb_change_tail || -	    func == bpf_skb_adjust_room || -	    func == bpf_skb_pull_data || -	    func == bpf_clone_redirect || -	    func == bpf_l3_csum_replace || -	    func == bpf_l4_csum_replace || -	    func == bpf_xdp_adjust_head || -	    func == bpf_xdp_adjust_meta || -	    func == bpf_msg_pull_data) -		return true; - -	return false; -} -  static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,  				  unsigned long off, unsigned long len)  { @@ -3148,6 +3444,7 @@ set_compat:  	to->tunnel_id = be64_to_cpu(info->key.tun_id);  	to->tunnel_tos = info->key.tos;  	to->tunnel_ttl = info->key.ttl; +	to->tunnel_ext = 0;  	if (flags & BPF_F_TUNINFO_IPV6) {  		memcpy(to->remote_ipv6, &info->key.u.ipv6.src, @@ -3155,6 +3452,8 @@ set_compat:  		to->tunnel_label = be32_to_cpu(info->key.label);  	} else {  		to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); +		memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); +		to->tunnel_label = 0;  	}  	if (unlikely(size != sizeof(struct bpf_tunnel_key))) @@ -3364,6 +3663,27 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {  	.arg3_type	= ARG_ANYTHING,  }; +#ifdef CONFIG_SOCK_CGROUP_DATA +BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) +{ +	struct sock *sk = skb_to_full_sk(skb); +	struct cgroup *cgrp; + +	if (!sk || !sk_fullsock(sk)) +		return 0; + +	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); +	return cgrp->kn->id.id; +} + +static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { +	.func           = bpf_skb_cgroup_id, +	.gpl_only       = false, +	.ret_type       = RET_INTEGER, +	.arg1_type      = ARG_PTR_TO_CTX, +}; +#endif +  static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,  				  unsigned long off, unsigned long len)  { @@ -3711,6 +4031,603 @@ static const struct bpf_func_proto bpf_bind_proto = {  	.arg3_type	= ARG_CONST_SIZE,  }; +#ifdef CONFIG_XFRM +BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, +	   struct bpf_xfrm_state *, to, u32, size, u64, flags) +{ +	const struct 
sec_path *sp = skb_sec_path(skb); +	const struct xfrm_state *x; + +	if (!sp || unlikely(index >= sp->len || flags)) +		goto err_clear; + +	x = sp->xvec[index]; + +	if (unlikely(size != sizeof(struct bpf_xfrm_state))) +		goto err_clear; + +	to->reqid = x->props.reqid; +	to->spi = x->id.spi; +	to->family = x->props.family; +	to->ext = 0; + +	if (to->family == AF_INET6) { +		memcpy(to->remote_ipv6, x->props.saddr.a6, +		       sizeof(to->remote_ipv6)); +	} else { +		to->remote_ipv4 = x->props.saddr.a4; +		memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); +	} + +	return 0; +err_clear: +	memset(to, 0, size); +	return -EINVAL; +} + +static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { +	.func		= bpf_skb_get_xfrm_state, +	.gpl_only	= false, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type	= ARG_ANYTHING, +	.arg3_type	= ARG_PTR_TO_UNINIT_MEM, +	.arg4_type	= ARG_CONST_SIZE, +	.arg5_type	= ARG_ANYTHING, +}; +#endif + +#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, +				  const struct neighbour *neigh, +				  const struct net_device *dev) +{ +	memcpy(params->dmac, neigh->ha, ETH_ALEN); +	memcpy(params->smac, dev->dev_addr, ETH_ALEN); +	params->h_vlan_TCI = 0; +	params->h_vlan_proto = 0; + +	return dev->ifindex; +} +#endif + +#if IS_ENABLED(CONFIG_INET) +static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, +			       u32 flags, bool check_mtu) +{ +	struct in_device *in_dev; +	struct neighbour *neigh; +	struct net_device *dev; +	struct fib_result res; +	struct fib_nh *nh; +	struct flowi4 fl4; +	int err; +	u32 mtu; + +	dev = dev_get_by_index_rcu(net, params->ifindex); +	if (unlikely(!dev)) +		return -ENODEV; + +	/* verify forwarding is enabled on this interface */ +	in_dev = __in_dev_get_rcu(dev); +	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) +		return 0; + +	if (flags & BPF_FIB_LOOKUP_OUTPUT) { +		fl4.flowi4_iif = 1; +		fl4.flowi4_oif = params->ifindex; +	} else { +		fl4.flowi4_iif = params->ifindex; +		fl4.flowi4_oif = 0; +	} +	fl4.flowi4_tos = params->tos & IPTOS_RT_MASK; +	fl4.flowi4_scope = RT_SCOPE_UNIVERSE; +	fl4.flowi4_flags = 0; + +	fl4.flowi4_proto = params->l4_protocol; +	fl4.daddr = params->ipv4_dst; +	fl4.saddr = params->ipv4_src; +	fl4.fl4_sport = params->sport; +	fl4.fl4_dport = params->dport; + +	if (flags & BPF_FIB_LOOKUP_DIRECT) { +		u32 tbid = l3mdev_fib_table_rcu(dev) ? 
: RT_TABLE_MAIN; +		struct fib_table *tb; + +		tb = fib_get_table(net, tbid); +		if (unlikely(!tb)) +			return 0; + +		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); +	} else { +		fl4.flowi4_mark = 0; +		fl4.flowi4_secid = 0; +		fl4.flowi4_tun_key.tun_id = 0; +		fl4.flowi4_uid = sock_net_uid(net, NULL); + +		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); +	} + +	if (err || res.type != RTN_UNICAST) +		return 0; + +	if (res.fi->fib_nhs > 1) +		fib_select_path(net, &res, &fl4, NULL); + +	if (check_mtu) { +		mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); +		if (params->tot_len > mtu) +			return 0; +	} + +	nh = &res.fi->fib_nh[res.nh_sel]; + +	/* do not handle lwt encaps right now */ +	if (nh->nh_lwtstate) +		return 0; + +	dev = nh->nh_dev; +	if (unlikely(!dev)) +		return 0; + +	if (nh->nh_gw) +		params->ipv4_dst = nh->nh_gw; + +	params->rt_metric = res.fi->fib_priority; + +	/* xdp and cls_bpf programs are run in RCU-bh so +	 * rcu_read_lock_bh is not needed here +	 */ +	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); +	if (neigh) +		return bpf_fib_set_fwd_params(params, neigh, dev); + +	return 0; +} +#endif + +#if IS_ENABLED(CONFIG_IPV6) +static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, +			       u32 flags, bool check_mtu) +{ +	struct in6_addr *src = (struct in6_addr *) params->ipv6_src; +	struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; +	struct neighbour *neigh; +	struct net_device *dev; +	struct inet6_dev *idev; +	struct fib6_info *f6i; +	struct flowi6 fl6; +	int strict = 0; +	int oif; +	u32 mtu; + +	/* link local addresses are never forwarded */ +	if (rt6_need_strict(dst) || rt6_need_strict(src)) +		return 0; + +	dev = dev_get_by_index_rcu(net, params->ifindex); +	if (unlikely(!dev)) +		return -ENODEV; + +	idev = __in6_dev_get_safely(dev); +	if (unlikely(!idev || !net->ipv6.devconf_all->forwarding)) +		return 0; + +	if (flags & BPF_FIB_LOOKUP_OUTPUT) { +		fl6.flowi6_iif = 1; +		oif = fl6.flowi6_oif = params->ifindex; +	} else { +		oif = fl6.flowi6_iif = params->ifindex; +		fl6.flowi6_oif = 0; +		strict = RT6_LOOKUP_F_HAS_SADDR; +	} +	fl6.flowlabel = params->flowinfo; +	fl6.flowi6_scope = 0; +	fl6.flowi6_flags = 0; +	fl6.mp_hash = 0; + +	fl6.flowi6_proto = params->l4_protocol; +	fl6.daddr = *dst; +	fl6.saddr = *src; +	fl6.fl6_sport = params->sport; +	fl6.fl6_dport = params->dport; + +	if (flags & BPF_FIB_LOOKUP_DIRECT) { +		u32 tbid = l3mdev_fib_table_rcu(dev) ? 
: RT_TABLE_MAIN; +		struct fib6_table *tb; + +		tb = ipv6_stub->fib6_get_table(net, tbid); +		if (unlikely(!tb)) +			return 0; + +		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict); +	} else { +		fl6.flowi6_mark = 0; +		fl6.flowi6_secid = 0; +		fl6.flowi6_tun_key.tun_id = 0; +		fl6.flowi6_uid = sock_net_uid(net, NULL); + +		f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict); +	} + +	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) +		return 0; + +	if (unlikely(f6i->fib6_flags & RTF_REJECT || +	    f6i->fib6_type != RTN_UNICAST)) +		return 0; + +	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0) +		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6, +						       fl6.flowi6_oif, NULL, +						       strict); + +	if (check_mtu) { +		mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src); +		if (params->tot_len > mtu) +			return 0; +	} + +	if (f6i->fib6_nh.nh_lwtstate) +		return 0; + +	if (f6i->fib6_flags & RTF_GATEWAY) +		*dst = f6i->fib6_nh.nh_gw; + +	dev = f6i->fib6_nh.nh_dev; +	params->rt_metric = f6i->fib6_metric; + +	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is +	 * not needed here. Can not use __ipv6_neigh_lookup_noref here +	 * because we need to get nd_tbl via the stub +	 */ +	neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, +				      ndisc_hashfn, dst, dev); +	if (neigh) +		return bpf_fib_set_fwd_params(params, neigh, dev); + +	return 0; +} +#endif + +BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, +	   struct bpf_fib_lookup *, params, int, plen, u32, flags) +{ +	if (plen < sizeof(*params)) +		return -EINVAL; + +	if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)) +		return -EINVAL; + +	switch (params->family) { +#if IS_ENABLED(CONFIG_INET) +	case AF_INET: +		return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, +					   flags, true); +#endif +#if IS_ENABLED(CONFIG_IPV6) +	case AF_INET6: +		return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, +					   flags, true); +#endif +	} +	return -EAFNOSUPPORT; +} + +static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { +	.func		= bpf_xdp_fib_lookup, +	.gpl_only	= true, +	.ret_type	= RET_INTEGER, +	.arg1_type      = ARG_PTR_TO_CTX, +	.arg2_type      = ARG_PTR_TO_MEM, +	.arg3_type      = ARG_CONST_SIZE, +	.arg4_type	= ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, +	   struct bpf_fib_lookup *, params, int, plen, u32, flags) +{ +	struct net *net = dev_net(skb->dev); +	int index = -EAFNOSUPPORT; + +	if (plen < sizeof(*params)) +		return -EINVAL; + +	if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)) +		return -EINVAL; + +	switch (params->family) { +#if IS_ENABLED(CONFIG_INET) +	case AF_INET: +		index = bpf_ipv4_fib_lookup(net, params, flags, false); +		break; +#endif +#if IS_ENABLED(CONFIG_IPV6) +	case AF_INET6: +		index = bpf_ipv6_fib_lookup(net, params, flags, false); +		break; +#endif +	} + +	if (index > 0) { +		struct net_device *dev; + +		dev = dev_get_by_index_rcu(net, index); +		if (!is_skb_forwardable(dev, skb)) +			index = 0; +	} + +	return index; +} + +static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { +	.func		= bpf_skb_fib_lookup, +	.gpl_only	= true, +	.ret_type	= RET_INTEGER, +	.arg1_type      = ARG_PTR_TO_CTX, +	.arg2_type      = ARG_PTR_TO_MEM, +	.arg3_type      = ARG_CONST_SIZE, +	.arg4_type	= ARG_ANYTHING, +}; + +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) +static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) +{ +	int err; +	struct ipv6_sr_hdr 
*srh = (struct ipv6_sr_hdr *)hdr; + +	if (!seg6_validate_srh(srh, len)) +		return -EINVAL; + +	switch (type) { +	case BPF_LWT_ENCAP_SEG6_INLINE: +		if (skb->protocol != htons(ETH_P_IPV6)) +			return -EBADMSG; + +		err = seg6_do_srh_inline(skb, srh); +		break; +	case BPF_LWT_ENCAP_SEG6: +		skb_reset_inner_headers(skb); +		skb->encapsulation = 1; +		err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6); +		break; +	default: +		return -EINVAL; +	} + +	bpf_compute_data_pointers(skb); +	if (err) +		return err; + +	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); +	skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + +	return seg6_lookup_nexthop(skb, NULL, 0); +} +#endif /* CONFIG_IPV6_SEG6_BPF */ + +BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, +	   u32, len) +{ +	switch (type) { +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) +	case BPF_LWT_ENCAP_SEG6: +	case BPF_LWT_ENCAP_SEG6_INLINE: +		return bpf_push_seg6_encap(skb, type, hdr, len); +#endif +	default: +		return -EINVAL; +	} +} + +static const struct bpf_func_proto bpf_lwt_push_encap_proto = { +	.func		= bpf_lwt_push_encap, +	.gpl_only	= false, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type	= ARG_ANYTHING, +	.arg3_type	= ARG_PTR_TO_MEM, +	.arg4_type	= ARG_CONST_SIZE +}; + +BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, +	   const void *, from, u32, len) +{ +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) +	struct seg6_bpf_srh_state *srh_state = +		this_cpu_ptr(&seg6_bpf_srh_states); +	void *srh_tlvs, *srh_end, *ptr; +	struct ipv6_sr_hdr *srh; +	int srhoff = 0; + +	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) +		return -EINVAL; + +	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); +	srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); +	srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); + +	ptr = skb->data + offset; +	if (ptr >= srh_tlvs && ptr + len <= srh_end) +		srh_state->valid = 0; +	else if (ptr < (void *)&srh->flags || +		 ptr + len > (void *)&srh->segments) +		return -EFAULT; + +	if (unlikely(bpf_try_make_writable(skb, offset + len))) +		return -EFAULT; + +	memcpy(skb->data + offset, from, len); +	return 0; +#else /* CONFIG_IPV6_SEG6_BPF */ +	return -EOPNOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { +	.func		= bpf_lwt_seg6_store_bytes, +	.gpl_only	= false, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type	= ARG_ANYTHING, +	.arg3_type	= ARG_PTR_TO_MEM, +	.arg4_type	= ARG_CONST_SIZE +}; + +BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, +	   u32, action, void *, param, u32, param_len) +{ +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) +	struct seg6_bpf_srh_state *srh_state = +		this_cpu_ptr(&seg6_bpf_srh_states); +	struct ipv6_sr_hdr *srh; +	int srhoff = 0; +	int err; + +	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) +		return -EINVAL; +	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + +	if (!srh_state->valid) { +		if (unlikely((srh_state->hdrlen & 7) != 0)) +			return -EBADMSG; + +		srh->hdrlen = (u8)(srh_state->hdrlen >> 3); +		if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) +			return -EBADMSG; + +		srh_state->valid = 1; +	} + +	switch (action) { +	case SEG6_LOCAL_ACTION_END_X: +		if (param_len != sizeof(struct in6_addr)) +			return -EINVAL; +		return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); +	case SEG6_LOCAL_ACTION_END_T: +		if (param_len != sizeof(int)) +			return -EINVAL; +		return 
seg6_lookup_nexthop(skb, NULL, *(int *)param); +	case SEG6_LOCAL_ACTION_END_B6: +		err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, +					  param, param_len); +		if (!err) +			srh_state->hdrlen = +				((struct ipv6_sr_hdr *)param)->hdrlen << 3; +		return err; +	case SEG6_LOCAL_ACTION_END_B6_ENCAP: +		err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, +					  param, param_len); +		if (!err) +			srh_state->hdrlen = +				((struct ipv6_sr_hdr *)param)->hdrlen << 3; +		return err; +	default: +		return -EINVAL; +	} +#else /* CONFIG_IPV6_SEG6_BPF */ +	return -EOPNOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { +	.func		= bpf_lwt_seg6_action, +	.gpl_only	= false, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type	= ARG_ANYTHING, +	.arg3_type	= ARG_PTR_TO_MEM, +	.arg4_type	= ARG_CONST_SIZE +}; + +BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, +	   s32, len) +{ +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) +	struct seg6_bpf_srh_state *srh_state = +		this_cpu_ptr(&seg6_bpf_srh_states); +	void *srh_end, *srh_tlvs, *ptr; +	struct ipv6_sr_hdr *srh; +	struct ipv6hdr *hdr; +	int srhoff = 0; +	int ret; + +	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) +		return -EINVAL; +	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + +	srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + +			((srh->first_segment + 1) << 4)); +	srh_end = (void *)((unsigned char *)srh + sizeof(*srh) + +			srh_state->hdrlen); +	ptr = skb->data + offset; + +	if (unlikely(ptr < srh_tlvs || ptr > srh_end)) +		return -EFAULT; +	if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end)) +		return -EFAULT; + +	if (len > 0) { +		ret = skb_cow_head(skb, len); +		if (unlikely(ret < 0)) +			return ret; + +		ret = bpf_skb_net_hdr_push(skb, offset, len); +	} else { +		ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len); +	} + +	bpf_compute_data_pointers(skb); +	if (unlikely(ret < 0)) +		return ret; + +	hdr = (struct ipv6hdr *)skb->data; +	hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + +	srh_state->hdrlen += len; +	srh_state->valid = 0; +	return 0; +#else /* CONFIG_IPV6_SEG6_BPF */ +	return -EOPNOTSUPP; +#endif +} + +static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { +	.func		= bpf_lwt_seg6_adjust_srh, +	.gpl_only	= false, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type	= ARG_ANYTHING, +	.arg3_type	= ARG_ANYTHING, +}; + +bool bpf_helper_changes_pkt_data(void *func) +{ +	if (func == bpf_skb_vlan_push || +	    func == bpf_skb_vlan_pop || +	    func == bpf_skb_store_bytes || +	    func == bpf_skb_change_proto || +	    func == bpf_skb_change_head || +	    func == bpf_skb_change_tail || +	    func == bpf_skb_adjust_room || +	    func == bpf_skb_pull_data || +	    func == bpf_clone_redirect || +	    func == bpf_l3_csum_replace || +	    func == bpf_l4_csum_replace || +	    func == bpf_xdp_adjust_head || +	    func == bpf_xdp_adjust_meta || +	    func == bpf_msg_pull_data || +	    func == bpf_xdp_adjust_tail || +	    func == bpf_lwt_push_encap || +	    func == bpf_lwt_seg6_store_bytes || +	    func == bpf_lwt_seg6_adjust_srh || +	    func == bpf_lwt_seg6_action +	    ) +		return true; + +	return false; +} +  static const struct bpf_func_proto *  bpf_base_func_proto(enum bpf_func_id func_id)  { @@ -3781,6 +4698,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  	switch (func_id) {  	case BPF_FUNC_skb_load_bytes:  		return &bpf_skb_load_bytes_proto; +	case 
BPF_FUNC_skb_load_bytes_relative: +		return &bpf_skb_load_bytes_relative_proto;  	case BPF_FUNC_get_socket_cookie:  		return &bpf_get_socket_cookie_proto;  	case BPF_FUNC_get_socket_uid: @@ -3798,6 +4717,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  		return &bpf_skb_store_bytes_proto;  	case BPF_FUNC_skb_load_bytes:  		return &bpf_skb_load_bytes_proto; +	case BPF_FUNC_skb_load_bytes_relative: +		return &bpf_skb_load_bytes_relative_proto;  	case BPF_FUNC_skb_pull_data:  		return &bpf_skb_pull_data_proto;  	case BPF_FUNC_csum_diff: @@ -3852,6 +4773,16 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  		return &bpf_get_socket_cookie_proto;  	case BPF_FUNC_get_socket_uid:  		return &bpf_get_socket_uid_proto; +	case BPF_FUNC_fib_lookup: +		return &bpf_skb_fib_lookup_proto; +#ifdef CONFIG_XFRM +	case BPF_FUNC_skb_get_xfrm_state: +		return &bpf_skb_get_xfrm_state_proto; +#endif +#ifdef CONFIG_SOCK_CGROUP_DATA +	case BPF_FUNC_skb_cgroup_id: +		return &bpf_skb_cgroup_id_proto; +#endif  	default:  		return bpf_base_func_proto(func_id);  	} @@ -3875,33 +4806,10 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  		return &bpf_xdp_redirect_proto;  	case BPF_FUNC_redirect_map:  		return &bpf_xdp_redirect_map_proto; -	default: -		return bpf_base_func_proto(func_id); -	} -} - -static const struct bpf_func_proto * -lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ -	switch (func_id) { -	case BPF_FUNC_skb_load_bytes: -		return &bpf_skb_load_bytes_proto; -	case BPF_FUNC_skb_pull_data: -		return &bpf_skb_pull_data_proto; -	case BPF_FUNC_csum_diff: -		return &bpf_csum_diff_proto; -	case BPF_FUNC_get_cgroup_classid: -		return &bpf_get_cgroup_classid_proto; -	case BPF_FUNC_get_route_realm: -		return &bpf_get_route_realm_proto; -	case BPF_FUNC_get_hash_recalc: -		return &bpf_get_hash_recalc_proto; -	case BPF_FUNC_perf_event_output: -		return &bpf_skb_event_output_proto; -	case BPF_FUNC_get_smp_processor_id: -		return &bpf_get_smp_processor_id_proto; -	case BPF_FUNC_skb_under_cgroup: -		return &bpf_skb_under_cgroup_proto; +	case BPF_FUNC_xdp_adjust_tail: +		return &bpf_xdp_adjust_tail_proto; +	case BPF_FUNC_fib_lookup: +		return &bpf_xdp_fib_lookup_proto;  	default:  		return bpf_base_func_proto(func_id);  	} @@ -3919,6 +4827,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  		return &bpf_sock_ops_cb_flags_set_proto;  	case BPF_FUNC_sock_map_update:  		return &bpf_sock_map_update_proto; +	case BPF_FUNC_sock_hash_update: +		return &bpf_sock_hash_update_proto;  	default:  		return bpf_base_func_proto(func_id);  	} @@ -3930,6 +4840,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  	switch (func_id) {  	case BPF_FUNC_msg_redirect_map:  		return &bpf_msg_redirect_map_proto; +	case BPF_FUNC_msg_redirect_hash: +		return &bpf_msg_redirect_hash_proto;  	case BPF_FUNC_msg_apply_bytes:  		return &bpf_msg_apply_bytes_proto;  	case BPF_FUNC_msg_cork_bytes: @@ -3961,12 +4873,52 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  		return &bpf_get_socket_uid_proto;  	case BPF_FUNC_sk_redirect_map:  		return &bpf_sk_redirect_map_proto; +	case BPF_FUNC_sk_redirect_hash: +		return &bpf_sk_redirect_hash_proto;  	default:  		return bpf_base_func_proto(func_id);  	}  }  static const struct bpf_func_proto * +lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ +	switch (func_id) { +	case 
BPF_FUNC_skb_load_bytes: +		return &bpf_skb_load_bytes_proto; +	case BPF_FUNC_skb_pull_data: +		return &bpf_skb_pull_data_proto; +	case BPF_FUNC_csum_diff: +		return &bpf_csum_diff_proto; +	case BPF_FUNC_get_cgroup_classid: +		return &bpf_get_cgroup_classid_proto; +	case BPF_FUNC_get_route_realm: +		return &bpf_get_route_realm_proto; +	case BPF_FUNC_get_hash_recalc: +		return &bpf_get_hash_recalc_proto; +	case BPF_FUNC_perf_event_output: +		return &bpf_skb_event_output_proto; +	case BPF_FUNC_get_smp_processor_id: +		return &bpf_get_smp_processor_id_proto; +	case BPF_FUNC_skb_under_cgroup: +		return &bpf_skb_under_cgroup_proto; +	default: +		return bpf_base_func_proto(func_id); +	} +} + +static const struct bpf_func_proto * +lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ +	switch (func_id) { +	case BPF_FUNC_lwt_push_encap: +		return &bpf_lwt_push_encap_proto; +	default: +		return lwt_out_func_proto(func_id, prog); +	} +} + +static const struct bpf_func_proto *  lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  {  	switch (func_id) { @@ -3997,7 +4949,22 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  	case BPF_FUNC_set_hash_invalid:  		return &bpf_set_hash_invalid_proto;  	default: -		return lwt_inout_func_proto(func_id, prog); +		return lwt_out_func_proto(func_id, prog); +	} +} + +static const struct bpf_func_proto * +lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ +	switch (func_id) { +	case BPF_FUNC_lwt_seg6_store_bytes: +		return &bpf_lwt_seg6_store_bytes_proto; +	case BPF_FUNC_lwt_seg6_action: +		return &bpf_lwt_seg6_action_proto; +	case BPF_FUNC_lwt_seg6_adjust_srh: +		return &bpf_lwt_seg6_adjust_srh_proto; +	default: +		return lwt_out_func_proto(func_id, prog);  	}  } @@ -4105,7 +5072,6 @@ static bool lwt_is_valid_access(int off, int size,  	return bpf_skb_is_valid_access(off, size, type, prog, info);  } -  /* Attach type specific accesses */  static bool __sock_filter_check_attach_type(int off,  					    enum bpf_access_type access_type, @@ -4221,6 +5187,41 @@ static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,  	return insn - insn_buf;  } +static int bpf_gen_ld_abs(const struct bpf_insn *orig, +			  struct bpf_insn *insn_buf) +{ +	bool indirect = BPF_MODE(orig->code) == BPF_IND; +	struct bpf_insn *insn = insn_buf; + +	/* We're guaranteed here that CTX is in R6. 
*/ +	*insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX); +	if (!indirect) { +		*insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm); +	} else { +		*insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg); +		if (orig->imm) +			*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm); +	} + +	switch (BPF_SIZE(orig->code)) { +	case BPF_B: +		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache); +		break; +	case BPF_H: +		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache); +		break; +	case BPF_W: +		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache); +		break; +	} + +	*insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2); +	*insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); +	*insn++ = BPF_EXIT_INSN(); + +	return insn - insn_buf; +} +  static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,  			       const struct bpf_prog *prog)  { @@ -4279,8 +5280,15 @@ static bool xdp_is_valid_access(int off, int size,  				const struct bpf_prog *prog,  				struct bpf_insn_access_aux *info)  { -	if (type == BPF_WRITE) +	if (type == BPF_WRITE) { +		if (bpf_prog_is_dev_bound(prog->aux)) { +			switch (off) { +			case offsetof(struct xdp_md, rx_queue_index): +				return __is_valid_xdp_access(off, size); +			} +		}  		return false; +	}  	switch (off) {  	case offsetof(struct xdp_md, data): @@ -4327,6 +5335,7 @@ static bool sock_addr_is_valid_access(int off, int size,  		switch (prog->expected_attach_type) {  		case BPF_CGROUP_INET4_BIND:  		case BPF_CGROUP_INET4_CONNECT: +		case BPF_CGROUP_UDP4_SENDMSG:  			break;  		default:  			return false; @@ -4336,6 +5345,24 @@ static bool sock_addr_is_valid_access(int off, int size,  		switch (prog->expected_attach_type) {  		case BPF_CGROUP_INET6_BIND:  		case BPF_CGROUP_INET6_CONNECT: +		case BPF_CGROUP_UDP6_SENDMSG: +			break; +		default: +			return false; +		} +		break; +	case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): +		switch (prog->expected_attach_type) { +		case BPF_CGROUP_UDP4_SENDMSG: +			break; +		default: +			return false; +		} +		break; +	case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], +				msg_src_ip6[3]): +		switch (prog->expected_attach_type) { +		case BPF_CGROUP_UDP6_SENDMSG:  			break;  		default:  			return false; @@ -4346,6 +5373,9 @@ static bool sock_addr_is_valid_access(int off, int size,  	switch (off) {  	case bpf_ctx_range(struct bpf_sock_addr, user_ip4):  	case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): +	case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): +	case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], +				msg_src_ip6[3]):  		/* Only narrow read access allowed for now. 
*/  		if (type == BPF_READ) {  			bpf_ctx_record_field_size(info, size_default); @@ -4465,18 +5495,23 @@ static bool sk_msg_is_valid_access(int off, int size,  	switch (off) {  	case offsetof(struct sk_msg_md, data):  		info->reg_type = PTR_TO_PACKET; +		if (size != sizeof(__u64)) +			return false;  		break;  	case offsetof(struct sk_msg_md, data_end):  		info->reg_type = PTR_TO_PACKET_END; +		if (size != sizeof(__u64)) +			return false;  		break; +	default: +		if (size != sizeof(__u32)) +			return false;  	}  	if (off < 0 || off >= sizeof(struct sk_msg_md))  		return false;  	if (off % size != 0)  		return false; -	if (size != sizeof(__u64)) -		return false;  	return true;  } @@ -5095,6 +6130,23 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,  		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,  					SK_FL_PROTO_SHIFT);  		break; + +	case offsetof(struct bpf_sock_addr, msg_src_ip4): +		/* Treat t_ctx as struct in_addr for msg_src_ip4. */ +		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( +			struct bpf_sock_addr_kern, struct in_addr, t_ctx, +			s_addr, BPF_SIZE(si->code), 0, tmp_reg); +		break; + +	case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], +				msg_src_ip6[3]): +		off = si->off; +		off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]); +		/* Treat t_ctx as struct in6_addr for msg_src_ip6. */ +		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( +			struct bpf_sock_addr_kern, struct in6_addr, t_ctx, +			s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); +		break;  	}  	return insn - insn_buf; @@ -5152,7 +6204,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,  		break;  	case offsetof(struct bpf_sock_ops, local_ip4): -		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4); +		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, +					  skc_rcv_saddr) != 4);  		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(  					      struct bpf_sock_ops_kern, sk), @@ -5469,6 +6522,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,  				     struct bpf_prog *prog, u32 *target_size)  {  	struct bpf_insn *insn = insn_buf; +#if IS_ENABLED(CONFIG_IPV6) +	int off; +#endif  	switch (si->off) {  	case offsetof(struct sk_msg_md, data): @@ -5481,6 +6537,107 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,  				      si->dst_reg, si->src_reg,  				      offsetof(struct sk_msg_buff, data_end));  		break; +	case offsetof(struct sk_msg_md, family): +		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); + +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( +					      struct sk_msg_buff, sk), +				      si->dst_reg, si->src_reg, +				      offsetof(struct sk_msg_buff, sk)); +		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, +				      offsetof(struct sock_common, skc_family)); +		break; + +	case offsetof(struct sk_msg_md, remote_ip4): +		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); + +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( +						struct sk_msg_buff, sk), +				      si->dst_reg, si->src_reg, +				      offsetof(struct sk_msg_buff, sk)); +		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, +				      offsetof(struct sock_common, skc_daddr)); +		break; + +	case offsetof(struct sk_msg_md, local_ip4): +		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, +					  skc_rcv_saddr) != 4); + +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( +					      struct sk_msg_buff, sk), +				      si->dst_reg, si->src_reg, +				      offsetof(struct sk_msg_buff, sk)); +		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, +				
      offsetof(struct sock_common, +					       skc_rcv_saddr)); +		break; + +	case offsetof(struct sk_msg_md, remote_ip6[0]) ... +	     offsetof(struct sk_msg_md, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) +		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, +					  skc_v6_daddr.s6_addr32[0]) != 4); + +		off = si->off; +		off -= offsetof(struct sk_msg_md, remote_ip6[0]); +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( +						struct sk_msg_buff, sk), +				      si->dst_reg, si->src_reg, +				      offsetof(struct sk_msg_buff, sk)); +		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, +				      offsetof(struct sock_common, +					       skc_v6_daddr.s6_addr32[0]) + +				      off); +#else +		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif +		break; + +	case offsetof(struct sk_msg_md, local_ip6[0]) ... +	     offsetof(struct sk_msg_md, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) +		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, +					  skc_v6_rcv_saddr.s6_addr32[0]) != 4); + +		off = si->off; +		off -= offsetof(struct sk_msg_md, local_ip6[0]); +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( +						struct sk_msg_buff, sk), +				      si->dst_reg, si->src_reg, +				      offsetof(struct sk_msg_buff, sk)); +		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, +				      offsetof(struct sock_common, +					       skc_v6_rcv_saddr.s6_addr32[0]) + +				      off); +#else +		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif +		break; + +	case offsetof(struct sk_msg_md, remote_port): +		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); + +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( +						struct sk_msg_buff, sk), +				      si->dst_reg, si->src_reg, +				      offsetof(struct sk_msg_buff, sk)); +		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, +				      offsetof(struct sock_common, skc_dport)); +#ifndef __BIG_ENDIAN_BITFIELD +		*insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif +		break; + +	case offsetof(struct sk_msg_md, local_port): +		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); + +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( +						struct sk_msg_buff, sk), +				      si->dst_reg, si->src_reg, +				      offsetof(struct sk_msg_buff, sk)); +		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, +				      offsetof(struct sock_common, skc_num)); +		break;  	}  	return insn - insn_buf; @@ -5490,6 +6647,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = {  	.get_func_proto		= sk_filter_func_proto,  	.is_valid_access	= sk_filter_is_valid_access,  	.convert_ctx_access	= bpf_convert_ctx_access, +	.gen_ld_abs		= bpf_gen_ld_abs,  };  const struct bpf_prog_ops sk_filter_prog_ops = { @@ -5501,6 +6659,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {  	.is_valid_access	= tc_cls_act_is_valid_access,  	.convert_ctx_access	= tc_cls_act_convert_ctx_access,  	.gen_prologue		= tc_cls_act_prologue, +	.gen_ld_abs		= bpf_gen_ld_abs,  };  const struct bpf_prog_ops tc_cls_act_prog_ops = { @@ -5527,13 +6686,23 @@ const struct bpf_prog_ops cg_skb_prog_ops = {  	.test_run		= bpf_prog_test_run_skb,  }; -const struct bpf_verifier_ops lwt_inout_verifier_ops = { -	.get_func_proto		= lwt_inout_func_proto, +const struct bpf_verifier_ops lwt_in_verifier_ops = { +	.get_func_proto		= lwt_in_func_proto,  	.is_valid_access	= lwt_is_valid_access,  	.convert_ctx_access	= bpf_convert_ctx_access,  }; -const struct bpf_prog_ops lwt_inout_prog_ops = { +const struct bpf_prog_ops lwt_in_prog_ops = { +	.test_run		= bpf_prog_test_run_skb, +}; + +const struct bpf_verifier_ops 
lwt_out_verifier_ops = { +	.get_func_proto		= lwt_out_func_proto, +	.is_valid_access	= lwt_is_valid_access, +	.convert_ctx_access	= bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_out_prog_ops = {  	.test_run		= bpf_prog_test_run_skb,  }; @@ -5548,6 +6717,16 @@ const struct bpf_prog_ops lwt_xmit_prog_ops = {  	.test_run		= bpf_prog_test_run_skb,  }; +const struct bpf_verifier_ops lwt_seg6local_verifier_ops = { +	.get_func_proto		= lwt_seg6local_func_proto, +	.is_valid_access	= lwt_is_valid_access, +	.convert_ctx_access	= bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_seg6local_prog_ops = { +	.test_run		= bpf_prog_test_run_skb, +}; +  const struct bpf_verifier_ops cg_sock_verifier_ops = {  	.get_func_proto		= sock_filter_func_proto,  	.is_valid_access	= sock_filter_is_valid_access, diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d29f09bc5ff9..53f96e4f7bf5 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1253,7 +1253,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)  EXPORT_SYMBOL(skb_get_hash_perturb);  u32 __skb_get_poff(const struct sk_buff *skb, void *data, -		   const struct flow_keys *keys, int hlen) +		   const struct flow_keys_basic *keys, int hlen)  {  	u32 poff = keys->control.thoff; @@ -1314,9 +1314,9 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data,   */  u32 skb_get_poff(const struct sk_buff *skb)  { -	struct flow_keys keys; +	struct flow_keys_basic keys; -	if (!skb_flow_dissect_flow_keys(skb, &keys, 0)) +	if (!skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0))  		return 0;  	return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); @@ -1334,7 +1334,7 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)  	keys->ports.src = fl6->fl6_sport;  	keys->ports.dst = fl6->fl6_dport;  	keys->keyid.keyid = fl6->fl6_gre_key; -	keys->tags.flow_label = (__force u32)fl6->flowlabel; +	keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);  	keys->basic.ip_proto = fl6->flowi6_proto;  	return flow_hash_from_keys(keys); @@ -1403,7 +1403,7 @@ static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = {  	},  }; -static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { +static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = {  	{  		.key_id = FLOW_DISSECTOR_KEY_CONTROL,  		.offset = offsetof(struct flow_keys, control), @@ -1417,7 +1417,8 @@ static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {  struct flow_dissector flow_keys_dissector __read_mostly;  EXPORT_SYMBOL(flow_keys_dissector); -struct flow_dissector flow_keys_buf_dissector __read_mostly; +struct flow_dissector flow_keys_basic_dissector __read_mostly; +EXPORT_SYMBOL(flow_keys_basic_dissector);  static int __init init_default_flow_dissectors(void)  { @@ -1427,9 +1428,9 @@ static int __init init_default_flow_dissectors(void)  	skb_flow_dissector_init(&flow_keys_dissector_symmetric,  				flow_keys_dissector_symmetric_keys,  				ARRAY_SIZE(flow_keys_dissector_symmetric_keys)); -	skb_flow_dissector_init(&flow_keys_buf_dissector, -				flow_keys_buf_dissector_keys, -				ARRAY_SIZE(flow_keys_buf_dissector_keys)); +	skb_flow_dissector_init(&flow_keys_basic_dissector, +				flow_keys_basic_dissector_keys, +				ARRAY_SIZE(flow_keys_basic_dissector_keys));  	return 0;  } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ce519861be59..8e3fda9e725c 100644 --- a/net/core/neighbour.c +++ 
b/net/core/neighbour.c @@ -59,7 +59,7 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,  				    struct net_device *dev);  #ifdef CONFIG_PROC_FS -static const struct file_operations neigh_stat_seq_fops; +static const struct seq_operations neigh_stat_seq_ops;  #endif  /* @@ -119,13 +119,14 @@ unsigned long neigh_rand_reach_time(unsigned long base)  EXPORT_SYMBOL(neigh_rand_reach_time); -static bool neigh_del(struct neighbour *n, __u8 state, +static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags,  		      struct neighbour __rcu **np, struct neigh_table *tbl)  {  	bool retval = false;  	write_lock(&n->lock); -	if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state)) { +	if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state) && +	    !(n->flags & flags)) {  		struct neighbour *neigh;  		neigh = rcu_dereference_protected(n->next, @@ -157,7 +158,7 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl)  	while ((n = rcu_dereference_protected(*np,  					      lockdep_is_held(&tbl->lock)))) {  		if (n == ndel) -			return neigh_del(n, 0, np, tbl); +			return neigh_del(n, 0, 0, np, tbl);  		np = &n->next;  	}  	return false; @@ -185,7 +186,8 @@ static int neigh_forced_gc(struct neigh_table *tbl)  			 * - nobody refers to it.  			 * - it is not permanent  			 */ -			if (neigh_del(n, NUD_PERMANENT, np, tbl)) { +			if (neigh_del(n, NUD_PERMANENT, NTF_EXT_LEARNED, np, +				      tbl)) {  				shrunk = 1;  				continue;  			} @@ -820,7 +822,8 @@ static void neigh_periodic_work(struct work_struct *work)  			write_lock(&n->lock);  			state = n->nud_state; -			if (state & (NUD_PERMANENT | NUD_IN_TIMER)) { +			if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || +			    (n->flags & NTF_EXT_LEARNED)) {  				write_unlock(&n->lock);  				goto next_elt;  			} @@ -1136,6 +1139,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,  	if (neigh->dead)  		goto out; +	neigh_update_ext_learned(neigh, flags, ¬ify); +  	if (!(new & NUD_VALID)) {  		neigh_del_timer(neigh);  		if (old & NUD_CONNECTED) @@ -1558,8 +1563,8 @@ void neigh_table_init(int index, struct neigh_table *tbl)  		panic("cannot create neighbour cache statistics");  #ifdef CONFIG_PROC_FS -	if (!proc_create_data(tbl->id, 0, init_net.proc_net_stat, -			      &neigh_stat_seq_fops, tbl)) +	if (!proc_create_seq_data(tbl->id, 0, init_net.proc_net_stat, +			      &neigh_stat_seq_ops, tbl))  		panic("cannot create neighbour proc dir entry");  #endif @@ -1781,6 +1786,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,  			flags &= ~NEIGH_UPDATE_F_OVERRIDE;  	} +	if (ndm->ndm_flags & NTF_EXT_LEARNED) +		flags |= NEIGH_UPDATE_F_EXT_LEARNED; +  	if (ndm->ndm_flags & NTF_USE) {  		neigh_event_send(neigh, NULL);  		err = 0; @@ -2786,7 +2794,7 @@ EXPORT_SYMBOL(neigh_seq_stop);  static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)  { -	struct neigh_table *tbl = seq->private; +	struct neigh_table *tbl = PDE_DATA(file_inode(seq->file));  	int cpu;  	if (*pos == 0) @@ -2803,7 +2811,7 @@ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)  static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)  { -	struct neigh_table *tbl = seq->private; +	struct neigh_table *tbl = PDE_DATA(file_inode(seq->file));  	int cpu;  	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { @@ -2822,7 +2830,7 @@ static void neigh_stat_seq_stop(struct seq_file *seq, void *v)  static int neigh_stat_seq_show(struct seq_file *seq, void *v)  { -	struct neigh_table *tbl = 
seq->private; +	struct neigh_table *tbl = PDE_DATA(file_inode(seq->file));  	struct neigh_statistics *st = v;  	if (v == SEQ_START_TOKEN) { @@ -2861,25 +2869,6 @@ static const struct seq_operations neigh_stat_seq_ops = {  	.stop	= neigh_stat_seq_stop,  	.show	= neigh_stat_seq_show,  }; - -static int neigh_stat_seq_open(struct inode *inode, struct file *file) -{ -	int ret = seq_open(file, &neigh_stat_seq_ops); - -	if (!ret) { -		struct seq_file *sf = file->private_data; -		sf->private = PDE_DATA(inode); -	} -	return ret; -}; - -static const struct file_operations neigh_stat_seq_fops = { -	.open 	 = neigh_stat_seq_open, -	.read	 = seq_read, -	.llseek	 = seq_lseek, -	.release = seq_release, -}; -  #endif /* CONFIG_PROC_FS */  static inline size_t neigh_nlmsg_size(void) diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 9737302907b1..63881f72ef71 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -175,19 +175,6 @@ static const struct seq_operations dev_seq_ops = {  	.show  = dev_seq_show,  }; -static int dev_seq_open(struct inode *inode, struct file *file) -{ -	return seq_open_net(inode, file, &dev_seq_ops, -			    sizeof(struct seq_net_private)); -} - -static const struct file_operations dev_seq_fops = { -	.open    = dev_seq_open, -	.read    = seq_read, -	.llseek  = seq_lseek, -	.release = seq_release_net, -}; -  static const struct seq_operations softnet_seq_ops = {  	.start = softnet_seq_start,  	.next  = softnet_seq_next, @@ -195,18 +182,6 @@ static const struct seq_operations softnet_seq_ops = {  	.show  = softnet_seq_show,  }; -static int softnet_seq_open(struct inode *inode, struct file *file) -{ -	return seq_open(file, &softnet_seq_ops); -} - -static const struct file_operations softnet_seq_fops = { -	.open    = softnet_seq_open, -	.read    = seq_read, -	.llseek  = seq_lseek, -	.release = seq_release, -}; -  static void *ptype_get_idx(loff_t pos)  {  	struct packet_type *pt = NULL; @@ -297,30 +272,18 @@ static const struct seq_operations ptype_seq_ops = {  	.show  = ptype_seq_show,  }; -static int ptype_seq_open(struct inode *inode, struct file *file) -{ -	return seq_open_net(inode, file, &ptype_seq_ops, -			sizeof(struct seq_net_private)); -} - -static const struct file_operations ptype_seq_fops = { -	.open    = ptype_seq_open, -	.read    = seq_read, -	.llseek  = seq_lseek, -	.release = seq_release_net, -}; - -  static int __net_init dev_proc_net_init(struct net *net)  {  	int rc = -ENOMEM; -	if (!proc_create("dev", 0444, net->proc_net, &dev_seq_fops)) +	if (!proc_create_net("dev", 0444, net->proc_net, &dev_seq_ops, +			sizeof(struct seq_net_private)))  		goto out; -	if (!proc_create("softnet_stat", 0444, net->proc_net, -			 &softnet_seq_fops)) +	if (!proc_create_seq("softnet_stat", 0444, net->proc_net, +			 &softnet_seq_ops))  		goto out_dev; -	if (!proc_create("ptype", 0444, net->proc_net, &ptype_seq_fops)) +	if (!proc_create_net("ptype", 0444, net->proc_net, &ptype_seq_ops, +			sizeof(struct seq_net_private)))  		goto out_softnet;  	if (wext_proc_init(net)) @@ -377,22 +340,10 @@ static const struct seq_operations dev_mc_seq_ops = {  	.show  = dev_mc_seq_show,  }; -static int dev_mc_seq_open(struct inode *inode, struct file *file) -{ -	return seq_open_net(inode, file, &dev_mc_seq_ops, -			    sizeof(struct seq_net_private)); -} - -static const struct file_operations dev_mc_seq_fops = { -	.open    = dev_mc_seq_open, -	.read    = seq_read, -	.llseek  = seq_lseek, -	.release = seq_release_net, -}; -  static int __net_init dev_mc_net_init(struct net *net)  { 
-	if (!proc_create("dev_mcast", 0, net->proc_net, &dev_mc_seq_fops)) +	if (!proc_create_net("dev_mcast", 0, net->proc_net, &dev_mc_seq_ops, +			sizeof(struct seq_net_private)))  		return -ENOMEM;  	return 0;  } diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 380934580fa1..419af6dfe29f 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -35,10 +35,6 @@  #include <trace/events/tcp.h>  #include <trace/events/fib.h>  #include <trace/events/qdisc.h> -#if IS_ENABLED(CONFIG_IPV6) -#include <trace/events/fib6.h> -EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); -#endif  #if IS_ENABLED(CONFIG_BRIDGE)  #include <trace/events/bridge.h>  EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add); diff --git a/net/core/page_pool.c b/net/core/page_pool.c new file mode 100644 index 000000000000..68bf07206744 --- /dev/null +++ b/net/core/page_pool.c @@ -0,0 +1,317 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * page_pool.c + *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com> + *	Copyright (C) 2016 Red Hat, Inc. + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> + +#include <net/page_pool.h> +#include <linux/dma-direction.h> +#include <linux/dma-mapping.h> +#include <linux/page-flags.h> +#include <linux/mm.h> /* for __put_page() */ + +static int page_pool_init(struct page_pool *pool, +			  const struct page_pool_params *params) +{ +	unsigned int ring_qsize = 1024; /* Default */ + +	memcpy(&pool->p, params, sizeof(pool->p)); + +	/* Validate only known flags were used */ +	if (pool->p.flags & ~(PP_FLAG_ALL)) +		return -EINVAL; + +	if (pool->p.pool_size) +		ring_qsize = pool->p.pool_size; + +	/* Sanity limit mem that can be pinned down */ +	if (ring_qsize > 32768) +		return -E2BIG; + +	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. +	 * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, +	 * which is the XDP_TX use-case. +	 */ +	if ((pool->p.dma_dir != DMA_FROM_DEVICE) && +	    (pool->p.dma_dir != DMA_BIDIRECTIONAL)) +		return -EINVAL; + +	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) +		return -ENOMEM; + +	return 0; +} + +struct page_pool *page_pool_create(const struct page_pool_params *params) +{ +	struct page_pool *pool; +	int err = 0; + +	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid); +	if (!pool) +		return ERR_PTR(-ENOMEM); + +	err = page_pool_init(pool, params); +	if (err < 0) { +		pr_warn("%s() gave up with errno %d\n", __func__, err); +		kfree(pool); +		return ERR_PTR(err); +	} +	return pool; +} +EXPORT_SYMBOL(page_pool_create); + +/* fast path */ +static struct page *__page_pool_get_cached(struct page_pool *pool) +{ +	struct ptr_ring *r = &pool->ring; +	struct page *page; + +	/* Quicker fallback, avoid locks when ring is empty */ +	if (__ptr_ring_empty(r)) +		return NULL; + +	/* Test for safe-context, caller should provide this guarantee */ +	if (likely(in_serving_softirq())) { +		if (likely(pool->alloc.count)) { +			/* Fast-path */ +			page = pool->alloc.cache[--pool->alloc.count]; +			return page; +		} +		/* Slower-path: Alloc array empty, time to refill +		 * +		 * Open-coded bulk ptr_ring consumer. +		 * +		 * Discussion: the ring consumer lock is not really +		 * needed due to the softirq/NAPI protection, but +		 * later need the ability to reclaim pages on the +		 * ring. Thus, keeping the locks. 
+	 */ +		spin_lock(&r->consumer_lock); +		while ((page = __ptr_ring_consume(r))) { +			if (pool->alloc.count == PP_ALLOC_CACHE_REFILL) +				break; +			pool->alloc.cache[pool->alloc.count++] = page; +		} +		spin_unlock(&r->consumer_lock); +		return page; +	} + +	/* Slow-path: Get page from locked ring queue */ +	page = ptr_ring_consume(&pool->ring); +	return page; +} + +/* slow path */ +noinline +static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, +						 gfp_t _gfp) +{ +	struct page *page; +	gfp_t gfp = _gfp; +	dma_addr_t dma; + +	/* We could always set __GFP_COMP, and avoid this branch, as +	 * prep_new_page() can handle order-0 with __GFP_COMP. +	 */ +	if (pool->p.order) +		gfp |= __GFP_COMP; + +	/* FUTURE development: +	 * +	 * Current slow-path essentially falls back to single page +	 * allocations, which doesn't improve performance.  This code +	 * needs bulk allocation support from the page allocator code. +	 */ + +	/* Cache was empty, do real allocation */ +	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); +	if (!page) +		return NULL; + +	if (!(pool->p.flags & PP_FLAG_DMA_MAP)) +		goto skip_dma_map; + +	/* Setup DMA mapping: use page->private for DMA-addr +	 * This mapping is kept for lifetime of page, until leaving pool. +	 */ +	dma = dma_map_page(pool->p.dev, page, 0, +			   (PAGE_SIZE << pool->p.order), +			   pool->p.dma_dir); +	if (dma_mapping_error(pool->p.dev, dma)) { +		put_page(page); +		return NULL; +	} +	set_page_private(page, dma); /* page->private = dma; */ + +skip_dma_map: +	/* A page that was just alloc'ed should/must have refcnt 1. */ +	return page; +} + +/* page_pool_alloc_pages() replaces alloc_pages() API calls, but the + * caller must provide the synchronization guarantee for the allocation side. + */ +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) +{ +	struct page *page; + +	/* Fast-path: Get a page from cache */ +	page = __page_pool_get_cached(pool); +	if (page) +		return page; + +	/* Slow-path: cache empty, do real allocation */ +	page = __page_pool_alloc_pages_slow(pool, gfp); +	return page; +} +EXPORT_SYMBOL(page_pool_alloc_pages); + +/* Cleanup page_pool state from page */ +static void __page_pool_clean_page(struct page_pool *pool, +				   struct page *page) +{ +	if (!(pool->p.flags & PP_FLAG_DMA_MAP)) +		return; + +	/* DMA unmap */ +	dma_unmap_page(pool->p.dev, page_private(page), +		       PAGE_SIZE << pool->p.order, pool->p.dma_dir); +	set_page_private(page, 0); +} + +/* Return a page to the page allocator, cleaning up our state */ +static void __page_pool_return_page(struct page_pool *pool, struct page *page) +{ +	__page_pool_clean_page(pool, page); +	put_page(page); +	/* An optimization would be to call __free_pages(page, pool->p.order) +	 * knowing page is not part of page-cache (thus avoiding a +	 * __page_cache_release() call). +	 */ +} + +static bool __page_pool_recycle_into_ring(struct page_pool *pool, +				   struct page *page) +{ +	int ret; +	/* BH protection not needed if current is serving softirq */ +	if (in_serving_softirq()) +		ret = ptr_ring_produce(&pool->ring, page); +	else +		ret = ptr_ring_produce_bh(&pool->ring, page); + +	return (ret == 0) ? true : false; +} + +/* Only allow direct recycling in special circumstances, into the + * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case. + * + * Caller must provide appropriate safe context.
+ */ +static bool __page_pool_recycle_direct(struct page *page, +				       struct page_pool *pool) +{ +	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) +		return false; + +	/* Caller MUST have verified/know (page_ref_count(page) == 1) */ +	pool->alloc.cache[pool->alloc.count++] = page; +	return true; +} + +void __page_pool_put_page(struct page_pool *pool, +			  struct page *page, bool allow_direct) +{ +	/* This allocator is optimized for the XDP mode that uses +	 * one-frame-per-page, but has fallbacks that act like the +	 * regular page allocator APIs. +	 * +	 * refcnt == 1 means page_pool owns page, and can recycle it. +	 */ +	if (likely(page_ref_count(page) == 1)) { +		/* Read barrier done in page_ref_count / READ_ONCE */ + +		if (allow_direct && in_serving_softirq()) +			if (__page_pool_recycle_direct(page, pool)) +				return; + +		if (!__page_pool_recycle_into_ring(pool, page)) { +			/* Cache full, fallback to free pages */ +			__page_pool_return_page(pool, page); +		} +		return; +	} +	/* Fallback/non-XDP mode: the API user has an elevated refcnt. +	 * +	 * Many drivers split up the page into fragments, and some +	 * want to keep doing this to save memory and do refcnt based +	 * recycling. Support this use case too, to ease drivers +	 * switching between XDP/non-XDP. +	 * +	 * In case page_pool maintains the DMA mapping, the API user must +	 * call page_pool_put_page() once.  In this elevated refcnt +	 * case, the DMA is unmapped/released, as the driver is likely +	 * doing refcnt based recycle tricks, meaning another process +	 * will be invoking put_page. +	 */ +	__page_pool_clean_page(pool, page); +	put_page(page); +} +EXPORT_SYMBOL(__page_pool_put_page); + +static void __page_pool_empty_ring(struct page_pool *pool) +{ +	struct page *page; + +	/* Empty recycle ring */ +	while ((page = ptr_ring_consume(&pool->ring))) { +		/* Verify the refcnt invariant of cached pages */ +		if (!(page_ref_count(page) == 1)) +			pr_crit("%s() page_pool refcnt %d violation\n", +				__func__, page_ref_count(page)); + +		__page_pool_return_page(pool, page); +	} +} + +static void __page_pool_destroy_rcu(struct rcu_head *rcu) +{ +	struct page_pool *pool; + +	pool = container_of(rcu, struct page_pool, rcu); + +	WARN(pool->alloc.count, "API usage violation"); + +	__page_pool_empty_ring(pool); +	ptr_ring_cleanup(&pool->ring, NULL); +	kfree(pool); +} + +/* Cleanup and release resources */ +void page_pool_destroy(struct page_pool *pool) +{ +	struct page *page; + +	/* Empty alloc cache, assume the caller made sure this is +	 * no longer in use, and page_pool_alloc_pages() cannot be +	 * called concurrently. +	 */ +	while (pool->alloc.count) { +		page = pool->alloc.cache[--pool->alloc.count]; +		__page_pool_return_page(pool, page); +	} + +	/* No more consumers should exist, but producers could still +	 * be in-flight.
+	 */ +	__page_pool_empty_ring(pool); + +	/* An xdp_mem_allocator can still ref page_pool pointer */ +	call_rcu(&pool->rcu, __page_pool_destroy_rcu); +} +EXPORT_SYMBOL(page_pool_destroy); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 7e4ede34cc52..49368e21d228 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3603,7 +3603,8 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)  		return -ENOMEM;  	strcpy(pkt_dev->odevname, ifname); -	pkt_dev->flows = vzalloc_node(MAX_CFLOWS * sizeof(struct flow_state), +	pkt_dev->flows = vzalloc_node(array_size(MAX_CFLOWS, +						 sizeof(struct flow_state)),  				      node);  	if (pkt_dev->flows == NULL) {  		kfree(pkt_dev); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 45936922d7e2..5ef61222fdef 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -59,6 +59,9 @@  #include <net/rtnetlink.h>  #include <net/net_namespace.h> +#define RTNL_MAX_TYPE		48 +#define RTNL_SLAVE_MAX_TYPE	36 +  struct rtnl_link {  	rtnl_doit_func		doit;  	rtnl_dumpit_func	dumpit; @@ -389,6 +392,11 @@ int rtnl_link_register(struct rtnl_link_ops *ops)  {  	int err; +	/* Sanity-check max sizes to avoid stack buffer overflow. */ +	if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE || +		    ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) +		return -EINVAL; +  	rtnl_lock();  	err = __rtnl_link_register(ops);  	rtnl_unlock(); @@ -785,13 +793,15 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,  		       long expires, u32 error)  {  	struct rta_cacheinfo ci = { -		.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse), -		.rta_used = dst->__use, -		.rta_clntref = atomic_read(&(dst->__refcnt)),  		.rta_error = error,  		.rta_id =  id,  	}; +	if (dst) { +		ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); +		ci.rta_used = dst->__use; +		ci.rta_clntref = atomic_read(&dst->__refcnt); +	}  	if (expires) {  		unsigned long clock; @@ -2256,6 +2266,10 @@ static int do_setlink(const struct sk_buff *skb,  	const struct net_device_ops *ops = dev->netdev_ops;  	int err; +	err = validate_linkmsg(dev, tb); +	if (err < 0) +		return err; +  	if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) {  		struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev),  							    tb, CAP_NET_ADMIN); @@ -2619,10 +2633,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,  		goto errout;  	} -	err = validate_linkmsg(dev, tb); -	if (err < 0) -		goto errout; -  	err = do_setlink(skb, dev, ifm, extack, tb, ifname, 0);  errout:  	return err; @@ -2900,13 +2910,16 @@ replay:  	}  	if (1) { -		struct nlattr *attr[ops ? ops->maxtype + 1 : 1]; -		struct nlattr *slave_attr[m_ops ? 
m_ops->slave_maxtype + 1 : 1]; +		struct nlattr *attr[RTNL_MAX_TYPE + 1]; +		struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];  		struct nlattr **data = NULL;  		struct nlattr **slave_data = NULL;  		struct net *dest_net, *link_net = NULL;  		if (ops) { +			if (ops->maxtype > RTNL_MAX_TYPE) +				return -EINVAL; +  			if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {  				err = nla_parse_nested(attr, ops->maxtype,  						       linkinfo[IFLA_INFO_DATA], @@ -2923,6 +2936,9 @@ replay:  		}  		if (m_ops) { +			if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) +				return -EINVAL; +  			if (m_ops->slave_maxtype &&  			    linkinfo[IFLA_INFO_SLAVE_DATA]) {  				err = nla_parse_nested(slave_attr, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 345b51837ca8..c642304f178c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1305,7 +1305,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)  	skb->inner_mac_header += off;  } -static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)  {  	__copy_skb_header(new, old); @@ -1313,6 +1313,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)  	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;  	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;  } +EXPORT_SYMBOL(skb_copy_header);  static inline int skb_alloc_rx_flag(const struct sk_buff *skb)  { @@ -1355,7 +1356,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)  	BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); -	copy_skb_header(n, skb); +	skb_copy_header(n, skb);  	return n;  }  EXPORT_SYMBOL(skb_copy); @@ -1419,7 +1420,7 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,  		skb_clone_fraglist(n);  	} -	copy_skb_header(n, skb); +	skb_copy_header(n, skb);  out:  	return n;  } @@ -1599,7 +1600,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,  	BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,  			     skb->len + head_copy_len)); -	copy_skb_header(n, skb); +	skb_copy_header(n, skb);  	skb_headers_offset_update(n, newheadroom - oldheadroom); @@ -1839,6 +1840,20 @@ done:  }  EXPORT_SYMBOL(___pskb_trim); +/* Note : use pskb_trim_rcsum() instead of calling this directly + */ +int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) +{ +	if (skb->ip_summed == CHECKSUM_COMPLETE) { +		int delta = skb->len - len; + +		skb->csum = csum_sub(skb->csum, +				     skb_checksum(skb, len, delta, 0)); +	} +	return __pskb_trim(skb, len); +} +EXPORT_SYMBOL(pskb_trim_rcsum_slow); +  /**   *	__pskb_pull_tail - advance tail of skb header   *	@skb: buffer to reallocate @@ -4926,6 +4941,8 @@ static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)  		thlen = tcp_hdrlen(skb);  	} else if (unlikely(skb_is_gso_sctp(skb))) {  		thlen = sizeof(struct sctphdr); +	} else if (shinfo->gso_type & SKB_GSO_UDP_L4) { +		thlen = sizeof(struct udphdr);  	}  	/* UFO sets gso_size to the size of the fragmentation  	 * payload, i.e. 
the size of the L4 (UDP) header is already diff --git a/net/core/sock.c b/net/core/sock.c index 3b6d02854e57..bcc41829a16d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -226,7 +226,8 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX];    x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \    x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \    x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \ -  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_MAX" +  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \ +  x "AF_MAX"  static const char *const af_family_key_strings[AF_MAX+1] = {  	_sock_locks("sk_lock-") @@ -262,7 +263,8 @@ static const char *const af_family_rlock_key_strings[AF_MAX+1] = {    "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,    "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,    "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      , -  "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX" +  "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_XDP"      , +  "rlock-AF_MAX"  };  static const char *const af_family_wlock_key_strings[AF_MAX+1] = {    "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     , @@ -279,7 +281,8 @@ static const char *const af_family_wlock_key_strings[AF_MAX+1] = {    "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,    "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,    "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      , -  "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX" +  "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_XDP"      , +  "wlock-AF_MAX"  };  static const char *const af_family_elock_key_strings[AF_MAX+1] = {    "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     , @@ -296,7 +299,8 @@ static const char *const af_family_elock_key_strings[AF_MAX+1] = {    "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,    "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,    "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      , -  "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX" +  "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_XDP"      , +  "elock-AF_MAX"  };  /* @@ -323,8 +327,8 @@ EXPORT_SYMBOL(sysctl_optmem_max);  int sysctl_tstamp_allow_data __read_mostly = 1; -struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE; -EXPORT_SYMBOL_GPL(memalloc_socks); +DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); +EXPORT_SYMBOL_GPL(memalloc_socks_key);  /**   * sk_set_memalloc - sets %SOCK_MEMALLOC @@ -338,7 +342,7 @@ void sk_set_memalloc(struct sock *sk)  {  	sock_set_flag(sk, SOCK_MEMALLOC);  	sk->sk_allocation |= __GFP_MEMALLOC; -	static_key_slow_inc(&memalloc_socks); +	static_branch_inc(&memalloc_socks_key);  }  EXPORT_SYMBOL_GPL(sk_set_memalloc); @@ -346,7 +350,7 @@ void sk_clear_memalloc(struct sock *sk)  {  	sock_reset_flag(sk, SOCK_MEMALLOC);  	sk->sk_allocation &= ~__GFP_MEMALLOC; -	static_key_slow_dec(&memalloc_socks); +	static_branch_dec(&memalloc_socks_key);  	/*  	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward @@ -905,7 +909,10 @@ set_rcvbuf:  	case SO_RCVLOWAT:  		if (val < 0)  			val = INT_MAX; -		sk->sk_rcvlowat = val ? : 1; +		if (sock->ops->set_rcvlowat) +			ret = sock->ops->set_rcvlowat(sk, val); +		else +			sk->sk_rcvlowat = val ? 
: 1;  		break;  	case SO_RCVTIMEO: @@ -2567,12 +2574,6 @@ int sock_no_getname(struct socket *sock, struct sockaddr *saddr,  }  EXPORT_SYMBOL(sock_no_getname); -__poll_t sock_no_poll(struct file *file, struct socket *sock, poll_table *pt) -{ -	return 0; -} -EXPORT_SYMBOL(sock_no_poll); -  int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)  {  	return -EOPNOTSUPP; @@ -3439,22 +3440,10 @@ static const struct seq_operations proto_seq_ops = {  	.show   = proto_seq_show,  }; -static int proto_seq_open(struct inode *inode, struct file *file) -{ -	return seq_open_net(inode, file, &proto_seq_ops, -			    sizeof(struct seq_net_private)); -} - -static const struct file_operations proto_seq_fops = { -	.open		= proto_seq_open, -	.read		= seq_read, -	.llseek		= seq_lseek, -	.release	= seq_release_net, -}; -  static __net_init int proto_init_net(struct net *net)  { -	if (!proc_create("protocols", 0444, net->proc_net, &proto_seq_fops)) +	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, +			sizeof(struct seq_net_private)))  		return -ENOMEM;  	return 0; diff --git a/net/core/xdp.c b/net/core/xdp.c index 097a0f74e004..9d1f22072d5d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -5,6 +5,10 @@   */  #include <linux/types.h>  #include <linux/mm.h> +#include <linux/slab.h> +#include <linux/idr.h> +#include <linux/rhashtable.h> +#include <net/page_pool.h>  #include <net/xdp.h> @@ -13,6 +17,105 @@  #define REG_STATE_UNREGISTERED	0x2  #define REG_STATE_UNUSED	0x3 +static DEFINE_IDA(mem_id_pool); +static DEFINE_MUTEX(mem_id_lock); +#define MEM_ID_MAX 0xFFFE +#define MEM_ID_MIN 1 +static int mem_id_next = MEM_ID_MIN; + +static bool mem_id_init; /* false */ +static struct rhashtable *mem_id_ht; + +struct xdp_mem_allocator { +	struct xdp_mem_info mem; +	union { +		void *allocator; +		struct page_pool *page_pool; +		struct zero_copy_allocator *zc_alloc; +	}; +	struct rhash_head node; +	struct rcu_head rcu; +}; + +static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed) +{ +	const u32 *k = data; +	const u32 key = *k; + +	BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id) +		     != sizeof(u32)); + +	/* Use cyclic increasing ID as direct hash key, see rht_bucket_index */ +	return key << RHT_HASH_RESERVED_SPACE; +} + +static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg, +			  const void *ptr) +{ +	const struct xdp_mem_allocator *xa = ptr; +	u32 mem_id = *(u32 *)arg->key; + +	return xa->mem.id != mem_id; +} + +static const struct rhashtable_params mem_id_rht_params = { +	.nelem_hint = 64, +	.head_offset = offsetof(struct xdp_mem_allocator, node), +	.key_offset  = offsetof(struct xdp_mem_allocator, mem.id), +	.key_len = FIELD_SIZEOF(struct xdp_mem_allocator, mem.id), +	.max_size = MEM_ID_MAX, +	.min_size = 8, +	.automatic_shrinking = true, +	.hashfn    = xdp_mem_id_hashfn, +	.obj_cmpfn = xdp_mem_id_cmp, +}; + +static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) +{ +	struct xdp_mem_allocator *xa; + +	xa = container_of(rcu, struct xdp_mem_allocator, rcu); + +	/* Allow this ID to be reused */ +	ida_simple_remove(&mem_id_pool, xa->mem.id); + +	/* Notice, driver is expected to free the *allocator, +	 * e.g. page_pool, and MUST also use RCU free. 
+	 */ + +	/* Poison memory */ +	xa->mem.id = 0xFFFF; +	xa->mem.type = 0xF0F0; +	xa->allocator = (void *)0xDEAD9001; + +	kfree(xa); +} + +static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) +{ +	struct xdp_mem_allocator *xa; +	int id = xdp_rxq->mem.id; +	int err; + +	if (id == 0) +		return; + +	mutex_lock(&mem_id_lock); + +	xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); +	if (!xa) { +		mutex_unlock(&mem_id_lock); +		return; +	} + +	err = rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params); +	WARN_ON(err); + +	call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); + +	mutex_unlock(&mem_id_lock); +} +  void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)  {  	/* Simplify driver cleanup code paths, allow unreg "unused" */ @@ -21,8 +124,14 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)  	WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); +	__xdp_rxq_info_unreg_mem_model(xdp_rxq); +  	xdp_rxq->reg_state = REG_STATE_UNREGISTERED;  	xdp_rxq->dev = NULL; + +	/* Reset mem info to defaults */ +	xdp_rxq->mem.id = 0; +	xdp_rxq->mem.type = 0;  }  EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); @@ -71,3 +180,193 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)  	return (xdp_rxq->reg_state == REG_STATE_REGISTERED);  }  EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg); + +static int __mem_id_init_hash_table(void) +{ +	struct rhashtable *rht; +	int ret; + +	if (unlikely(mem_id_init)) +		return 0; + +	rht = kzalloc(sizeof(*rht), GFP_KERNEL); +	if (!rht) +		return -ENOMEM; + +	ret = rhashtable_init(rht, &mem_id_rht_params); +	if (ret < 0) { +		kfree(rht); +		return ret; +	} +	mem_id_ht = rht; +	smp_mb(); /* mutex lock should provide enough pairing */ +	mem_id_init = true; + +	return 0; +} + +/* Allocate a cyclic ID that maps to allocator pointer. + * See: https://www.kernel.org/doc/html/latest/core-api/idr.html + * + * Caller must lock mem_id_lock. 
+ */ +static int __mem_id_cyclic_get(gfp_t gfp) +{ +	int retries = 1; +	int id; + +again: +	id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp); +	if (id < 0) { +		if (id == -ENOSPC) { +			/* Cyclic allocator, reset next id */ +			if (retries--) { +				mem_id_next = MEM_ID_MIN; +				goto again; +			} +		} +		return id; /* errno */ +	} +	mem_id_next = id + 1; + +	return id; +} + +static bool __is_supported_mem_type(enum xdp_mem_type type) +{ +	if (type == MEM_TYPE_PAGE_POOL) +		return is_page_pool_compiled_in(); + +	if (type >= MEM_TYPE_MAX) +		return false; + +	return true; +} + +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, +			       enum xdp_mem_type type, void *allocator) +{ +	struct xdp_mem_allocator *xdp_alloc; +	gfp_t gfp = GFP_KERNEL; +	int id, errno, ret; +	void *ptr; + +	if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { +		WARN(1, "Missing register, driver bug"); +		return -EFAULT; +	} + +	if (!__is_supported_mem_type(type)) +		return -EOPNOTSUPP; + +	xdp_rxq->mem.type = type; + +	if (!allocator) { +		if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY) +			return -EINVAL; /* Setup time check page_pool req */ +		return 0; +	} + +	/* Delay init of rhashtable to save memory if feature isn't used */ +	if (!mem_id_init) { +		mutex_lock(&mem_id_lock); +		ret = __mem_id_init_hash_table(); +		mutex_unlock(&mem_id_lock); +		if (ret < 0) { +			WARN_ON(1); +			return ret; +		} +	} + +	xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp); +	if (!xdp_alloc) +		return -ENOMEM; + +	mutex_lock(&mem_id_lock); +	id = __mem_id_cyclic_get(gfp); +	if (id < 0) { +		errno = id; +		goto err; +	} +	xdp_rxq->mem.id = id; +	xdp_alloc->mem  = xdp_rxq->mem; +	xdp_alloc->allocator = allocator; + +	/* Insert allocator into ID lookup table */ +	ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); +	if (IS_ERR(ptr)) { +		errno = PTR_ERR(ptr); +		goto err; +	} + +	mutex_unlock(&mem_id_lock); + +	return 0; +err: +	mutex_unlock(&mem_id_lock); +	kfree(xdp_alloc); +	return errno; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); + +/* XDP RX runs under NAPI protection, and in different delivery error + * scenarios (e.g. queue full), it is possible to return the xdp_frame + * while still leveraging this protection.  The @napi_direct boolean + * is used for those call sites.  This allows faster recycling + * of xdp_frames/pages in those cases. + */ +static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, +			 unsigned long handle) +{ +	struct xdp_mem_allocator *xa; +	struct page *page; + +	switch (mem->type) { +	case MEM_TYPE_PAGE_POOL: +		rcu_read_lock(); +		/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ +		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); +		page = virt_to_head_page(data); +		if (xa) +			page_pool_put_page(xa->page_pool, page, napi_direct); +		else +			put_page(page); +		rcu_read_unlock(); +		break; +	case MEM_TYPE_PAGE_SHARED: +		page_frag_free(data); +		break; +	case MEM_TYPE_PAGE_ORDER0: +		page = virt_to_page(data); /* Assumes order0 page */ +		put_page(page); +		break; +	case MEM_TYPE_ZERO_COPY: +		/* NB! Only valid from an xdp_buff!
*/ +		rcu_read_lock(); +		/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ +		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); +		xa->zc_alloc->free(xa->zc_alloc, handle); +		rcu_read_unlock(); +	default: +		/* Not possible, checked in xdp_rxq_info_reg_mem_model() */ +		break; +	} +} + +void xdp_return_frame(struct xdp_frame *xdpf) +{ +	__xdp_return(xdpf->data, &xdpf->mem, false, 0); +} +EXPORT_SYMBOL_GPL(xdp_return_frame); + +void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) +{ +	__xdp_return(xdpf->data, &xdpf->mem, true, 0); +} +EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); + +void xdp_return_buff(struct xdp_buff *xdp) +{ +	__xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle); +} +EXPORT_SYMBOL_GPL(xdp_return_buff);
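
Usage sketch (not part of the diff above): how a driver might consume the new page_pool API from net/core/page_pool.c. The mydrv_* names and the RX-queue struct are illustrative assumptions; page_pool_create(), page_pool_alloc_pages(), page_pool_put_page() and the page_pool_params fields are the ones this patch introduces (page_pool_put_page() being the header-side wrapper around __page_pool_put_page(), as used by net/core/xdp.c above).

#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/dma-direction.h>
#include <net/page_pool.h>

struct mydrv_rxq {			/* illustrative driver RX queue */
	struct page_pool *page_pool;
};

static int mydrv_rxq_create_pool(struct mydrv_rxq *rxq, struct device *dev)
{
	struct page_pool_params pp_params = {
		.order		= 0,			/* one page per RX frame */
		.flags		= PP_FLAG_DMA_MAP,	/* pool keeps the DMA mapping */
		.pool_size	= 256,			/* ptr_ring size, capped at 32768 */
		.nid		= NUMA_NO_NODE,
		.dev		= dev,
		.dma_dir	= DMA_FROM_DEVICE,	/* DMA_BIDIRECTIONAL if doing XDP_TX */
	};

	rxq->page_pool = page_pool_create(&pp_params);
	return PTR_ERR_OR_ZERO(rxq->page_pool);		/* page_pool_create() returns ERR_PTR */
}

/* NAPI/softirq context only: allocation is not synchronized by the pool,
 * the caller provides that guarantee (see __page_pool_get_cached()).
 */
static struct page *mydrv_rx_alloc(struct mydrv_rxq *rxq)
{
	return page_pool_alloc_pages(rxq->page_pool, GFP_ATOMIC | __GFP_NOWARN);
}

/* XDP_DROP fast path: recycle straight into the alloc-side cache. */
static void mydrv_rx_drop(struct mydrv_rxq *rxq, struct page *page)
{
	page_pool_put_page(rxq->page_pool, page, true /* allow_direct */);
}

With PP_FLAG_DMA_MAP set, the pool stores the DMA address in page->private (see set_page_private() above), so the driver can post page_private(page) to the RX ring without mapping the page itself.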
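
Usage sketch (not part of the diff above): registering that page_pool as the memory model of an RX queue via the new xdp_rxq_info_reg_mem_model(), so xdp_return_frame()/xdp_return_buff() recycle through the mem.id -> allocator rhashtable instead of falling back to put_page(). mydrv_setup_xdp_rxq() is a hypothetical wrapper; xdp_rxq_info_reg() is the pre-existing registration call, assumed here to take (netdev, queue_index).

#include <linux/netdevice.h>
#include <net/xdp.h>
#include <net/page_pool.h>

static int mydrv_setup_xdp_rxq(struct xdp_rxq_info *xdp_rxq,
			       struct net_device *netdev, u32 queue_index,
			       struct page_pool *pool)
{
	int err;

	err = xdp_rxq_info_reg(xdp_rxq, netdev, queue_index);
	if (err)
		return err;

	/* Stores "pool" under a cyclic mem.id in the rhashtable, so that a
	 * later xdp_return_frame(xdpf) can look it up via xdpf->mem.id and
	 * call page_pool_put_page() instead of freeing the page.
	 */
	err = xdp_rxq_info_reg_mem_model(xdp_rxq, MEM_TYPE_PAGE_POOL, pool);
	if (err)
		xdp_rxq_info_unreg(xdp_rxq);

	return err;
}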
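
Usage sketch (not part of the diff above): the net-procfs.c, neighbour.c and sock.c hunks all drop the hand-written seq_open()/file_operations boilerplate in favour of proc_create_net()/proc_create_seq()/proc_create_seq_data(). Below is a minimal single-record example of the resulting pattern with hypothetical foo_* names; note that the private pointer now comes from PDE_DATA(file_inode(seq->file)) rather than from a custom open() stashing it in seq->private.

#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/net_namespace.h>

struct foo_stats { unsigned long lookups; };	/* hypothetical */

static void *foo_stat_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos == 0 ? SEQ_START_TOKEN : NULL;	/* single record */
}

static void *foo_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void foo_stat_seq_stop(struct seq_file *seq, void *v)
{
}

static int foo_stat_seq_show(struct seq_file *seq, void *v)
{
	/* Private data comes from the proc entry, not from a custom open() */
	struct foo_stats *st = PDE_DATA(file_inode(seq->file));

	seq_printf(seq, "lookups %lu\n", st->lookups);
	return 0;
}

static const struct seq_operations foo_stat_seq_ops = {
	.start	= foo_stat_seq_start,
	.next	= foo_stat_seq_next,
	.stop	= foo_stat_seq_stop,
	.show	= foo_stat_seq_show,
};

static int foo_stat_proc_init(struct foo_stats *st)
{
	if (!proc_create_seq_data("foo_stats", 0444, init_net.proc_net_stat,
				  &foo_stat_seq_ops, st))
		return -ENOMEM;
	return 0;
}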
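
Usage sketch (not part of the diff above): the new pskb_trim_rcsum_slow() in skbuff.c is the out-of-line helper behind pskb_trim_rcsum(); per the comment above, callers keep using the inline pskb_trim_rcsum() wrapper, and the checksum is only adjusted when the device delivered CHECKSUM_COMPLETE. A hypothetical receive-path caller trimming hardware padding:

#include <linux/skbuff.h>

/* mydrv_trim_pad() is illustrative; frame_len is the length reported by
 * the NIC.  pskb_trim_rcsum() keeps skb->csum consistent by subtracting
 * the checksum of the removed tail (see pskb_trim_rcsum_slow() above).
 */
static int mydrv_trim_pad(struct sk_buff *skb, unsigned int frame_len)
{
	if (skb->len <= frame_len)
		return 0;

	return pskb_trim_rcsum(skb, frame_len);
}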
