diff options
Diffstat (limited to 'drivers/net/tun.c')
| -rw-r--r-- | drivers/net/tun.c | 304 | 
1 files changed, 223 insertions, 81 deletions
| diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 50e9cc19023a..060135ceaf0e 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -113,7 +113,6 @@ do {								\  } while (0)  #endif -#define TUN_HEADROOM 256  #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)  /* TUN device flags */ @@ -563,12 +562,11 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)  		e->rps_rxhash = hash;  } -/* We try to identify a flow through its rxhash first. The reason that +/* We try to identify a flow through its rxhash. The reason that   * we do not check rxq no. is because some cards(e.g 82599), chooses   * the rxq based on the txq where the last packet of the flow comes. As   * the userspace application move between processors, we may get a - * different rxq no. here. If we could not get rxhash, then we would - * hope the rxq no. may help here. + * different rxq no. here.   */  static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)  { @@ -579,18 +577,13 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)  	numqueues = READ_ONCE(tun->numqueues);  	txq = __skb_get_hash_symmetric(skb); -	if (txq) { -		e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); -		if (e) { -			tun_flow_save_rps_rxhash(e, txq); -			txq = e->queue_index; -		} else -			/* use multiply and shift instead of expensive divide */ -			txq = ((u64)txq * numqueues) >> 32; -	} else if (likely(skb_rx_queue_recorded(skb))) { -		txq = skb_get_rx_queue(skb); -		while (unlikely(txq >= numqueues)) -			txq -= numqueues; +	e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); +	if (e) { +		tun_flow_save_rps_rxhash(e, txq); +		txq = e->queue_index; +	} else { +		/* use multiply and shift instead of expensive divide */ +		txq = ((u64)txq * numqueues) >> 32;  	}  	return txq; @@ -870,6 +863,9 @@ static int tun_attach(struct tun_struct *tun, struct file *file,  		tun_napi_init(tun, tfile, napi, napi_frags);  	} +	if 
(rtnl_dereference(tun->xdp_prog)) +		sock_set_flag(&tfile->sk, SOCK_XDP); +  	tun_set_real_num_queues(tun);  	/* device is allowed to go away first, so no need to hold extra @@ -1045,16 +1041,13 @@ static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)  		/* Select queue was not called for the skbuff, so we extract the  		 * RPS hash and save it into the flow_table here.  		 */ +		struct tun_flow_entry *e;  		__u32 rxhash;  		rxhash = __skb_get_hash_symmetric(skb); -		if (rxhash) { -			struct tun_flow_entry *e; -			e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], -					rxhash); -			if (e) -				tun_flow_save_rps_rxhash(e, rxhash); -		} +		e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash); +		if (e) +			tun_flow_save_rps_rxhash(e, rxhash);  	}  #endif  } @@ -1205,13 +1198,29 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,  		       struct netlink_ext_ack *extack)  {  	struct tun_struct *tun = netdev_priv(dev); +	struct tun_file *tfile;  	struct bpf_prog *old_prog; +	int i;  	old_prog = rtnl_dereference(tun->xdp_prog);  	rcu_assign_pointer(tun->xdp_prog, prog);  	if (old_prog)  		bpf_prog_put(old_prog); +	for (i = 0; i < tun->numqueues; i++) { +		tfile = rtnl_dereference(tun->tfiles[i]); +		if (prog) +			sock_set_flag(&tfile->sk, SOCK_XDP); +		else +			sock_reset_flag(&tfile->sk, SOCK_XDP); +	} +	list_for_each_entry(tfile, &tun->disabled, next) { +		if (prog) +			sock_set_flag(&tfile->sk, SOCK_XDP); +		else +			sock_reset_flag(&tfile->sk, SOCK_XDP); +	} +  	return 0;  } @@ -1575,6 +1584,55 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,  	return true;  } +static struct sk_buff *__tun_build_skb(struct page_frag *alloc_frag, char *buf, +				       int buflen, int len, int pad) +{ +	struct sk_buff *skb = build_skb(buf, buflen); + +	if (!skb) +		return ERR_PTR(-ENOMEM); + +	skb_reserve(skb, pad); +	skb_put(skb, len); + +	get_page(alloc_frag->page); +	alloc_frag->offset += buflen; + +	
return skb; +} + +static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, +		       struct xdp_buff *xdp, u32 act) +{ +	int err; + +	switch (act) { +	case XDP_REDIRECT: +		err = xdp_do_redirect(tun->dev, xdp, xdp_prog); +		if (err) +			return err; +		break; +	case XDP_TX: +		err = tun_xdp_tx(tun->dev, xdp); +		if (err < 0) +			return err; +		break; +	case XDP_PASS: +		break; +	default: +		bpf_warn_invalid_xdp_action(act); +		/* fall through */ +	case XDP_ABORTED: +		trace_xdp_exception(tun->dev, xdp_prog, act); +		/* fall through */ +	case XDP_DROP: +		this_cpu_inc(tun->pcpu_stats->rx_dropped); +		break; +	} + +	return act; +} +  static struct sk_buff *tun_build_skb(struct tun_struct *tun,  				     struct tun_file *tfile,  				     struct iov_iter *from, @@ -1582,18 +1640,17 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,  				     int len, int *skb_xdp)  {  	struct page_frag *alloc_frag = &current->task_frag; -	struct sk_buff *skb;  	struct bpf_prog *xdp_prog;  	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); -	unsigned int delta = 0;  	char *buf;  	size_t copied; -	int err, pad = TUN_RX_PAD; +	int pad = TUN_RX_PAD; +	int err = 0;  	rcu_read_lock();  	xdp_prog = rcu_dereference(tun->xdp_prog);  	if (xdp_prog) -		pad += TUN_HEADROOM; +		pad += XDP_PACKET_HEADROOM;  	buflen += SKB_DATA_ALIGN(len + pad);  	rcu_read_unlock(); @@ -1612,17 +1669,18 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,  	 * of xdp_prog above, this should be rare and for simplicity  	 * we do XDP on skb in case the headroom is not enough.  	 
*/ -	if (hdr->gso_type || !xdp_prog) +	if (hdr->gso_type || !xdp_prog) {  		*skb_xdp = 1; -	else -		*skb_xdp = 0; +		return __tun_build_skb(alloc_frag, buf, buflen, len, pad); +	} + +	*skb_xdp = 0;  	local_bh_disable();  	rcu_read_lock();  	xdp_prog = rcu_dereference(tun->xdp_prog); -	if (xdp_prog && !*skb_xdp) { +	if (xdp_prog) {  		struct xdp_buff xdp; -		void *orig_data;  		u32 act;  		xdp.data_hard_start = buf; @@ -1630,66 +1688,33 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,  		xdp_set_data_meta_invalid(&xdp);  		xdp.data_end = xdp.data + len;  		xdp.rxq = &tfile->xdp_rxq; -		orig_data = xdp.data; -		act = bpf_prog_run_xdp(xdp_prog, &xdp); -		switch (act) { -		case XDP_REDIRECT: -			get_page(alloc_frag->page); -			alloc_frag->offset += buflen; -			err = xdp_do_redirect(tun->dev, &xdp, xdp_prog); -			xdp_do_flush_map(); -			if (err) -				goto err_redirect; -			rcu_read_unlock(); -			local_bh_enable(); -			return NULL; -		case XDP_TX: +		act = bpf_prog_run_xdp(xdp_prog, &xdp); +		if (act == XDP_REDIRECT || act == XDP_TX) {  			get_page(alloc_frag->page);  			alloc_frag->offset += buflen; -			if (tun_xdp_tx(tun->dev, &xdp) < 0) -				goto err_redirect; -			rcu_read_unlock(); -			local_bh_enable(); -			return NULL; -		case XDP_PASS: -			delta = orig_data - xdp.data; -			len = xdp.data_end - xdp.data; -			break; -		default: -			bpf_warn_invalid_xdp_action(act); -			/* fall through */ -		case XDP_ABORTED: -			trace_xdp_exception(tun->dev, xdp_prog, act); -			/* fall through */ -		case XDP_DROP: -			goto err_xdp;  		} -	} +		err = tun_xdp_act(tun, xdp_prog, &xdp, act); +		if (err < 0) +			goto err_xdp; +		if (err == XDP_REDIRECT) +			xdp_do_flush_map(); +		if (err != XDP_PASS) +			goto out; -	skb = build_skb(buf, buflen); -	if (!skb) { -		rcu_read_unlock(); -		local_bh_enable(); -		return ERR_PTR(-ENOMEM); +		pad = xdp.data - xdp.data_hard_start; +		len = xdp.data_end - xdp.data;  	} - -	skb_reserve(skb, pad - delta); -	skb_put(skb, len); -	
get_page(alloc_frag->page); -	alloc_frag->offset += buflen; -  	rcu_read_unlock();  	local_bh_enable(); -	return skb; +	return __tun_build_skb(alloc_frag, buf, buflen, len, pad); -err_redirect: -	put_page(alloc_frag->page);  err_xdp: +	put_page(alloc_frag->page); +out:  	rcu_read_unlock();  	local_bh_enable(); -	this_cpu_inc(tun->pcpu_stats->rx_dropped);  	return NULL;  } @@ -2264,6 +2289,8 @@ static void tun_setup(struct net_device *dev)  static int tun_validate(struct nlattr *tb[], struct nlattr *data[],  			struct netlink_ext_ack *extack)  { +	if (!data) +		return 0;  	return -EINVAL;  } @@ -2350,18 +2377,133 @@ static void tun_sock_write_space(struct sock *sk)  	kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);  } +static int tun_xdp_one(struct tun_struct *tun, +		       struct tun_file *tfile, +		       struct xdp_buff *xdp, int *flush) +{ +	struct tun_xdp_hdr *hdr = xdp->data_hard_start; +	struct virtio_net_hdr *gso = &hdr->gso; +	struct tun_pcpu_stats *stats; +	struct bpf_prog *xdp_prog; +	struct sk_buff *skb = NULL; +	u32 rxhash = 0, act; +	int buflen = hdr->buflen; +	int err = 0; +	bool skb_xdp = false; + +	xdp_prog = rcu_dereference(tun->xdp_prog); +	if (xdp_prog) { +		if (gso->gso_type) { +			skb_xdp = true; +			goto build; +		} +		xdp_set_data_meta_invalid(xdp); +		xdp->rxq = &tfile->xdp_rxq; + +		act = bpf_prog_run_xdp(xdp_prog, xdp); +		err = tun_xdp_act(tun, xdp_prog, xdp, act); +		if (err < 0) { +			put_page(virt_to_head_page(xdp->data)); +			return err; +		} + +		switch (err) { +		case XDP_REDIRECT: +			*flush = true; +			/* fall through */ +		case XDP_TX: +			return 0; +		case XDP_PASS: +			break; +		default: +			put_page(virt_to_head_page(xdp->data)); +			return 0; +		} +	} + +build: +	skb = build_skb(xdp->data_hard_start, buflen); +	if (!skb) { +		err = -ENOMEM; +		goto out; +	} + +	skb_reserve(skb, xdp->data - xdp->data_hard_start); +	skb_put(skb, xdp->data_end - xdp->data); + +	if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) { +		
this_cpu_inc(tun->pcpu_stats->rx_frame_errors); +		kfree_skb(skb); +		err = -EINVAL; +		goto out; +	} + +	skb->protocol = eth_type_trans(skb, tun->dev); +	skb_reset_network_header(skb); +	skb_probe_transport_header(skb, 0); + +	if (skb_xdp) { +		err = do_xdp_generic(xdp_prog, skb); +		if (err != XDP_PASS) +			goto out; +	} + +	if (!rcu_dereference(tun->steering_prog)) +		rxhash = __skb_get_hash_symmetric(skb); + +	netif_receive_skb(skb); + +	stats = get_cpu_ptr(tun->pcpu_stats); +	u64_stats_update_begin(&stats->syncp); +	stats->rx_packets++; +	stats->rx_bytes += skb->len; +	u64_stats_update_end(&stats->syncp); +	put_cpu_ptr(stats); + +	if (rxhash) +		tun_flow_update(tun, rxhash, tfile); + +out: +	return err; +} +  static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)  { -	int ret; +	int ret, i;  	struct tun_file *tfile = container_of(sock, struct tun_file, socket);  	struct tun_struct *tun = tun_get(tfile); +	struct tun_msg_ctl *ctl = m->msg_control; +	struct xdp_buff *xdp;  	if (!tun)  		return -EBADFD; -	ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter, +	if (ctl && (ctl->type == TUN_MSG_PTR)) { +		int n = ctl->num; +		int flush = 0; + +		local_bh_disable(); +		rcu_read_lock(); + +		for (i = 0; i < n; i++) { +			xdp = &((struct xdp_buff *)ctl->ptr)[i]; +			tun_xdp_one(tun, tfile, xdp, &flush); +		} + +		if (flush) +			xdp_do_flush_map(); + +		rcu_read_unlock(); +		local_bh_enable(); + +		ret = total_len; +		goto out; +	} + +	ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,  			   m->msg_flags & MSG_DONTWAIT,  			   m->msg_flags & MSG_MORE); +out:  	tun_put(tun);  	return ret;  } | 
