From e6247027e5173c00efb2084d688d06ff835bc3b0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Dec 2013 04:45:08 -0800 Subject: net: introduce dev_consume_skb_any() Some network drivers use dev_kfree_skb_any() and dev_kfree_skb_irq() helpers to free skbs, both for dropped packets and TX completed ones. We need to separate the two causes to get better diagnostics given by dropwatch or "perf record -e skb:kfree_skb" This patch provides two new helpers, dev_consume_skb_any() and dev_consume_skb_irq() to be used for consumed skbs. __dev_kfree_skb_irq() is slightly optimized to remove one atomic_dec_and_test() in fast path, and use this_cpu_{r|w} accessors. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 53 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 9 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7f0ed423a360..9d55e5188b96 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2368,17 +2368,52 @@ static inline int netif_copy_real_num_queues(struct net_device *to_dev, #define DEFAULT_MAX_NUM_RSS_QUEUES (8) int netif_get_num_default_rss_queues(void); -/* Use this variant when it is known for sure that it - * is executing from hardware interrupt context or with hardware interrupts - * disabled. - */ -void dev_kfree_skb_irq(struct sk_buff *skb); +enum skb_free_reason { + SKB_REASON_CONSUMED, + SKB_REASON_DROPPED, +}; + +void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason); +void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason); -/* Use this variant in places where it could be invoked - * from either hardware interrupt or other context, with hardware interrupts - * either disabled or enabled. +/* + * It is not allowed to call kfree_skb() or consume_skb() from hardware + * interrupt context or with hardware interrupts being disabled. + * (in_irq() || irqs_disabled()) + * + * We provide four helpers that can be used in following contexts : + * + * dev_kfree_skb_irq(skb) when caller drops a packet from irq context, + * replacing kfree_skb(skb) + * + * dev_consume_skb_irq(skb) when caller consumes a packet from irq context. + * Typically used in place of consume_skb(skb) in TX completion path + * + * dev_kfree_skb_any(skb) when caller doesn't know its current irq context, + * replacing kfree_skb(skb) + * + * dev_consume_skb_any(skb) when caller doesn't know its current irq context, + * and consumed a packet. Used in place of consume_skb(skb) */ -void dev_kfree_skb_any(struct sk_buff *skb); +static inline void dev_kfree_skb_irq(struct sk_buff *skb) +{ + __dev_kfree_skb_irq(skb, SKB_REASON_DROPPED); +} + +static inline void dev_consume_skb_irq(struct sk_buff *skb) +{ + __dev_kfree_skb_irq(skb, SKB_REASON_CONSUMED); +} + +static inline void dev_kfree_skb_any(struct sk_buff *skb) +{ + __dev_kfree_skb_any(skb, SKB_REASON_DROPPED); +} + +static inline void dev_consume_skb_any(struct sk_buff *skb) +{ + __dev_kfree_skb_any(skb, SKB_REASON_CONSUMED); +} int netif_rx(struct sk_buff *skb); int netif_rx_ni(struct sk_buff *skb); -- cgit v1.2.3 From 37cb0620073cb64101d9307931c135c70b2e3f04 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Tue, 10 Dec 2013 20:45:41 -0800 Subject: tipc: remove TIPC usage of field af_packet_priv in struct net_device TIPC is currently using the field 'af_packet_priv' in struct net_device as a handle to find the bearer instance associated to the given network device. But, by doing so it is blocking other networking cleanups, such as the one discussed here: http://patchwork.ozlabs.org/patch/178044/ This commit removes this usage from TIPC. Instead, we introduce a new field, 'tipc_ptr', to the net_device structure, to serve this purpose. When TIPC bearer is enabled, the bearer object is associated to 'tipc_ptr'. When a TIPC packet arrives in the recv_msg() upcall from a networking device, the bearer object can now be obtained from 'tipc_ptr'. When a bearer is disabled, the bearer object is detached from its underlying network device by setting 'tipc_ptr' to NULL. Additionally, an RCU lock is used to protect the new pointer. Henceforth, the existing tipc_net_lock is used in write mode to serialize write accesses to this pointer, while the new RCU lock is applied on the read side to ensure that the pointer is 100% valid within its wrapped area for all readers. Signed-off-by: Ying Xue Cc: Patrick McHardy Reviewed-by: Paul Gortmaker Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/tipc/bearer.h | 2 ++ net/tipc/eth_media.c | 55 +++++++++++++++++++++++++++-------------------- net/tipc/ib_media.c | 54 ++++++++++++++++++++++++++-------------------- 4 files changed, 68 insertions(+), 46 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9d55e5188b96..0ca8100f9fbc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1282,6 +1282,9 @@ struct net_device { #endif #if IS_ENABLED(CONFIG_NET_DSA) struct dsa_switch_tree *dsa_ptr; /* dsa specific data */ +#endif +#if IS_ENABLED(CONFIG_TIPC) + struct tipc_bearer __rcu *tipc_ptr; /* TIPC specific data */ #endif void *atalk_ptr; /* AppleTalk link */ struct in_device __rcu *ip_ptr; /* IPv4 specific data */ diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index e50266aa4d10..91b8d8b92373 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -105,6 +105,7 @@ struct tipc_media { /** * struct tipc_bearer - Generic TIPC bearer structure + * @dev: ptr to associated network device * @usr_handle: pointer to additional media-specific information about bearer * @mtu: max packet size bearer can support * @lock: spinlock for controlling access to bearer @@ -127,6 +128,7 @@ struct tipc_media { * care of initializing all other fields. */ struct tipc_bearer { + struct net_device *dev; void *usr_handle; /* initalized by media */ u32 mtu; /* initalized by media */ struct tipc_media_addr addr; /* initalized by media */ diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index 37fb145476ec..c5f685dee15b 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -63,6 +63,9 @@ static int eth_started; static int recv_notification(struct notifier_block *nb, unsigned long evt, void *dv); +static int recv_msg(struct sk_buff *buf, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); + /* * Network device notifier info */ @@ -71,6 +74,11 @@ static struct notifier_block notifier = { .priority = 0 }; +static struct packet_type tipc_packet_type __read_mostly = { + .type = __constant_htons(ETH_P_TIPC), + .func = recv_msg, +}; + /** * eth_media_addr_set - initialize Ethernet media address structure * @@ -128,20 +136,25 @@ static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr, static int recv_msg(struct sk_buff *buf, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - struct eth_media *eb_ptr = (struct eth_media *)pt->af_packet_priv; + struct tipc_bearer *b_ptr; if (!net_eq(dev_net(dev), &init_net)) { kfree_skb(buf); return NET_RX_DROP; } - if (likely(eb_ptr->bearer)) { + rcu_read_lock(); + b_ptr = rcu_dereference(dev->tipc_ptr); + if (likely(b_ptr)) { if (likely(buf->pkt_type <= PACKET_BROADCAST)) { buf->next = NULL; - tipc_recv_msg(buf, eb_ptr->bearer); + tipc_recv_msg(buf, b_ptr); + rcu_read_unlock(); return NET_RX_SUCCESS; } } + rcu_read_unlock(); + kfree_skb(buf); return NET_RX_DROP; } @@ -151,10 +164,7 @@ static int recv_msg(struct sk_buff *buf, struct net_device *dev, */ static void setup_media(struct work_struct *work) { - struct eth_media *eb_ptr = - container_of(work, struct eth_media, setup); - - dev_add_pack(&eb_ptr->tipc_packet_type); + dev_add_pack(&tipc_packet_type); } /** @@ -183,15 +193,11 @@ static int enable_media(struct tipc_bearer *tb_ptr) /* Create Ethernet bearer for device */ eb_ptr->dev = dev; - eb_ptr->tipc_packet_type.type = htons(ETH_P_TIPC); - eb_ptr->tipc_packet_type.dev = dev; - eb_ptr->tipc_packet_type.func = recv_msg; - eb_ptr->tipc_packet_type.af_packet_priv = eb_ptr; - INIT_LIST_HEAD(&(eb_ptr->tipc_packet_type.list)); INIT_WORK(&eb_ptr->setup, setup_media); schedule_work(&eb_ptr->setup); /* Associate TIPC bearer with Ethernet bearer */ + tb_ptr->dev = dev; eb_ptr->bearer = tb_ptr; tb_ptr->usr_handle = (void *)eb_ptr; memset(tb_ptr->bcast_addr.value, 0, sizeof(tb_ptr->bcast_addr.value)); @@ -200,6 +206,7 @@ static int enable_media(struct tipc_bearer *tb_ptr) tb_ptr->bcast_addr.broadcast = 1; tb_ptr->mtu = dev->mtu; eth_media_addr_set(tb_ptr, &tb_ptr->addr, (char *)dev->dev_addr); + rcu_assign_pointer(dev->tipc_ptr, tb_ptr); return 0; } @@ -213,7 +220,7 @@ static void cleanup_media(struct work_struct *work) struct eth_media *eb_ptr = container_of(work, struct eth_media, cleanup); - dev_remove_pack(&eb_ptr->tipc_packet_type); + dev_remove_pack(&tipc_packet_type); dev_put(eb_ptr->dev); eb_ptr->dev = NULL; } @@ -232,6 +239,7 @@ static void disable_media(struct tipc_bearer *tb_ptr) eb_ptr->bearer = NULL; INIT_WORK(&eb_ptr->cleanup, cleanup_media); schedule_work(&eb_ptr->cleanup); + RCU_INIT_POINTER(tb_ptr->dev->tipc_ptr, NULL); } /** @@ -243,21 +251,20 @@ static void disable_media(struct tipc_bearer *tb_ptr) static int recv_notification(struct notifier_block *nb, unsigned long evt, void *ptr) { + struct tipc_bearer *b_ptr; struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct eth_media *eb_ptr = ð_media_array[0]; - struct eth_media *stop = ð_media_array[MAX_ETH_MEDIA]; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; - while ((eb_ptr->dev != dev)) { - if (++eb_ptr == stop) - return NOTIFY_DONE; /* couldn't find device */ - } - if (!eb_ptr->bearer) + rcu_read_lock(); + b_ptr = rcu_dereference(dev->tipc_ptr); + if (!b_ptr) { + rcu_read_unlock(); return NOTIFY_DONE; /* bearer had been disabled */ + } - eb_ptr->bearer->mtu = dev->mtu; + b_ptr->mtu = dev->mtu; switch (evt) { case NETDEV_CHANGE: @@ -266,13 +273,15 @@ static int recv_notification(struct notifier_block *nb, unsigned long evt, case NETDEV_DOWN: case NETDEV_CHANGEMTU: case NETDEV_CHANGEADDR: - tipc_reset_bearer(eb_ptr->bearer); + tipc_reset_bearer(b_ptr); break; case NETDEV_UNREGISTER: case NETDEV_CHANGENAME: - tipc_disable_bearer(eb_ptr->bearer->name); + tipc_disable_bearer(b_ptr->name); break; } + rcu_read_unlock(); + return NOTIFY_OK; } diff --git a/net/tipc/ib_media.c b/net/tipc/ib_media.c index 48e1c07842e6..9fdf03cd672b 100644 --- a/net/tipc/ib_media.c +++ b/net/tipc/ib_media.c @@ -62,6 +62,13 @@ struct ib_media { static struct ib_media ib_media_array[MAX_IB_MEDIA]; static int ib_started; +static int recv_msg(struct sk_buff *buf, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); + +static struct packet_type tipc_packet_type __read_mostly = { + .type = __constant_htons(ETH_P_TIPC), + .func = recv_msg, +}; /** * ib_media_addr_set - initialize Infiniband media address structure @@ -120,20 +127,25 @@ static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr, static int recv_msg(struct sk_buff *buf, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - struct ib_media *ib_ptr = (struct ib_media *)pt->af_packet_priv; + struct tipc_bearer *b_ptr; if (!net_eq(dev_net(dev), &init_net)) { kfree_skb(buf); return NET_RX_DROP; } - if (likely(ib_ptr->bearer)) { + rcu_read_lock(); + b_ptr = rcu_dereference(dev->tipc_ptr); + if (likely(b_ptr)) { if (likely(buf->pkt_type <= PACKET_BROADCAST)) { buf->next = NULL; - tipc_recv_msg(buf, ib_ptr->bearer); + tipc_recv_msg(buf, b_ptr); + rcu_read_unlock(); return NET_RX_SUCCESS; } } + rcu_read_unlock(); + kfree_skb(buf); return NET_RX_DROP; } @@ -143,10 +155,7 @@ static int recv_msg(struct sk_buff *buf, struct net_device *dev, */ static void setup_media(struct work_struct *work) { - struct ib_media *ib_ptr = - container_of(work, struct ib_media, setup); - - dev_add_pack(&ib_ptr->tipc_packet_type); + dev_add_pack(&tipc_packet_type); } /** @@ -175,15 +184,11 @@ static int enable_media(struct tipc_bearer *tb_ptr) /* Create InfiniBand bearer for device */ ib_ptr->dev = dev; - ib_ptr->tipc_packet_type.type = htons(ETH_P_TIPC); - ib_ptr->tipc_packet_type.dev = dev; - ib_ptr->tipc_packet_type.func = recv_msg; - ib_ptr->tipc_packet_type.af_packet_priv = ib_ptr; - INIT_LIST_HEAD(&(ib_ptr->tipc_packet_type.list)); INIT_WORK(&ib_ptr->setup, setup_media); schedule_work(&ib_ptr->setup); /* Associate TIPC bearer with InfiniBand bearer */ + tb_ptr->dev = dev; ib_ptr->bearer = tb_ptr; tb_ptr->usr_handle = (void *)ib_ptr; memset(tb_ptr->bcast_addr.value, 0, sizeof(tb_ptr->bcast_addr.value)); @@ -192,6 +197,7 @@ static int enable_media(struct tipc_bearer *tb_ptr) tb_ptr->bcast_addr.broadcast = 1; tb_ptr->mtu = dev->mtu; ib_media_addr_set(tb_ptr, &tb_ptr->addr, (char *)dev->dev_addr); + rcu_assign_pointer(dev->tipc_ptr, tb_ptr); return 0; } @@ -205,7 +211,7 @@ static void cleanup_bearer(struct work_struct *work) struct ib_media *ib_ptr = container_of(work, struct ib_media, cleanup); - dev_remove_pack(&ib_ptr->tipc_packet_type); + dev_remove_pack(&tipc_packet_type); dev_put(ib_ptr->dev); ib_ptr->dev = NULL; } @@ -224,6 +230,7 @@ static void disable_media(struct tipc_bearer *tb_ptr) ib_ptr->bearer = NULL; INIT_WORK(&ib_ptr->cleanup, cleanup_bearer); schedule_work(&ib_ptr->cleanup); + RCU_INIT_POINTER(tb_ptr->dev->tipc_ptr, NULL); } /** @@ -235,21 +242,20 @@ static void disable_media(struct tipc_bearer *tb_ptr) static int recv_notification(struct notifier_block *nb, unsigned long evt, void *ptr) { + struct tipc_bearer *b_ptr; struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct ib_media *ib_ptr = &ib_media_array[0]; - struct ib_media *stop = &ib_media_array[MAX_IB_MEDIA]; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; - while ((ib_ptr->dev != dev)) { - if (++ib_ptr == stop) - return NOTIFY_DONE; /* couldn't find device */ - } - if (!ib_ptr->bearer) + rcu_read_lock(); + b_ptr = rcu_dereference(dev->tipc_ptr); + if (!b_ptr) { + rcu_read_unlock(); return NOTIFY_DONE; /* bearer had been disabled */ + } - ib_ptr->bearer->mtu = dev->mtu; + b_ptr->mtu = dev->mtu; switch (evt) { case NETDEV_CHANGE: @@ -258,13 +264,15 @@ static int recv_notification(struct notifier_block *nb, unsigned long evt, case NETDEV_DOWN: case NETDEV_CHANGEMTU: case NETDEV_CHANGEADDR: - tipc_reset_bearer(ib_ptr->bearer); + tipc_reset_bearer(b_ptr); break; case NETDEV_UNREGISTER: case NETDEV_CHANGENAME: - tipc_disable_bearer(ib_ptr->bearer->name); + tipc_disable_bearer(b_ptr->name); break; } + rcu_read_unlock(); + return NOTIFY_OK; } -- cgit v1.2.3 From 299603e8370a93dd5d8e8d800f0dff1ce2c53d36 Mon Sep 17 00:00:00 2001 From: Jerry Chu Date: Wed, 11 Dec 2013 20:53:45 -0800 Subject: net-gro: Prepare GRO stack for the upcoming tunneling support This patch modifies the GRO stack to avoid the use of "network_header" and associated macros like ip_hdr() and ipv6_hdr() in order to allow an arbitary number of IP hdrs (v4 or v6) to be used in the encapsulation chain. This lays the foundation for various IP tunneling support (IP-in-IP, GRE, VXLAN, SIT,...) to be added later. With this patch, the GRO stack traversing now is mostly based on skb_gro_offset rather than special hdr offsets saved in skb (e.g., skb->network_header). As a result all but the top layer (i.e., the the transport layer) must have hdrs of the same length in order for a pkt to be considered for aggregation. Therefore when adding a new encap layer (e.g., for tunneling), one must check and skip flows (e.g., by setting NAPI_GRO_CB(p)->same_flow to 0) that have a different hdr length. Note that unlike the network header, the transport header can and will continue to be set by the GRO code since there will be at most one "transport layer" in the encap chain. Signed-off-by: H.K. Jerry Chu Suggested-by: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- net/core/dev.c | 76 ++++++++++++++++------------------------------- net/ipv4/af_inet.c | 25 ++++++++++++---- net/ipv4/tcp_offload.c | 9 +++--- net/ipv6/ip6_offload.c | 54 ++++++++++++++++++++++++++------- net/ipv6/tcpv6_offload.c | 6 ++-- 6 files changed, 97 insertions(+), 75 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0ca8100f9fbc..5260d2eae2e6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1676,7 +1676,7 @@ struct offload_callbacks { int (*gso_send_check)(struct sk_buff *skb); struct sk_buff **(*gro_receive)(struct sk_buff **head, struct sk_buff *skb); - int (*gro_complete)(struct sk_buff *skb); + int (*gro_complete)(struct sk_buff *skb, int nhoff); }; struct packet_offload { diff --git a/net/core/dev.c b/net/core/dev.c index 355df36360b4..c95d664b2b42 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3752,7 +3752,7 @@ static int napi_gro_complete(struct sk_buff *skb) if (ptype->type != type || !ptype->callbacks.gro_complete) continue; - err = ptype->callbacks.gro_complete(skb); + err = ptype->callbacks.gro_complete(skb, 0); break; } rcu_read_unlock(); @@ -3818,6 +3818,23 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) } } +static void skb_gro_reset_offset(struct sk_buff *skb) +{ + const struct skb_shared_info *pinfo = skb_shinfo(skb); + const skb_frag_t *frag0 = &pinfo->frags[0]; + + NAPI_GRO_CB(skb)->data_offset = 0; + NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; + + if (skb_mac_header(skb) == skb_tail_pointer(skb) && + pinfo->nr_frags && + !PageHighMem(skb_frag_page(frag0))) { + NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); + NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); + } +} + static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff **pp = NULL; @@ -3833,6 +3850,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (skb_is_gso(skb) || skb_has_frag_list(skb)) goto normal; + skb_gro_reset_offset(skb); gro_list_prepare(napi, skb); rcu_read_lock(); @@ -3938,27 +3956,8 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) return ret; } -static void skb_gro_reset_offset(struct sk_buff *skb) -{ - const struct skb_shared_info *pinfo = skb_shinfo(skb); - const skb_frag_t *frag0 = &pinfo->frags[0]; - - NAPI_GRO_CB(skb)->data_offset = 0; - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; - - if (skb_mac_header(skb) == skb_tail_pointer(skb) && - pinfo->nr_frags && - !PageHighMem(skb_frag_page(frag0))) { - NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); - NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); - } -} - gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { - skb_gro_reset_offset(skb); - return napi_skb_finish(dev_gro_receive(napi, skb), skb); } EXPORT_SYMBOL(napi_gro_receive); @@ -3992,12 +3991,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff * { switch (ret) { case GRO_NORMAL: - case GRO_HELD: - skb->protocol = eth_type_trans(skb, skb->dev); - - if (ret == GRO_HELD) - skb_gro_pull(skb, -ETH_HLEN); - else if (netif_receive_skb(skb)) + if (netif_receive_skb(skb)) ret = GRO_DROP; break; @@ -4006,6 +4000,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff * napi_reuse_skb(napi, skb); break; + case GRO_HELD: case GRO_MERGED: break; } @@ -4016,36 +4011,15 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff * static struct sk_buff *napi_frags_skb(struct napi_struct *napi) { struct sk_buff *skb = napi->skb; - struct ethhdr *eth; - unsigned int hlen; - unsigned int off; napi->skb = NULL; - skb_reset_mac_header(skb); - skb_gro_reset_offset(skb); - - off = skb_gro_offset(skb); - hlen = off + sizeof(*eth); - eth = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, hlen)) { - eth = skb_gro_header_slow(skb, hlen, off); - if (unlikely(!eth)) { - napi_reuse_skb(napi, skb); - skb = NULL; - goto out; - } + if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) { + napi_reuse_skb(napi, skb); + return NULL; } + skb->protocol = eth_type_trans(skb, skb->dev); - skb_gro_pull(skb, sizeof(*eth)); - - /* - * This works because the only protocols we care about don't require - * special handling. We'll fix it up properly at the end. - */ - skb->protocol = eth->h_proto; - -out: return skb; } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 70011e029ac1..ef4f9df6d698 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1377,8 +1377,12 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, if (!NAPI_GRO_CB(p)->same_flow) continue; - iph2 = ip_hdr(p); - + iph2 = (struct iphdr *)(p->data + off); + /* The above works because, with the exception of the top + * (inner most) layer, we only aggregate pkts with the same + * hdr length so all the hdrs we'll need to verify will start + * at the same offset. + */ if ((iph->protocol ^ iph2->protocol) | ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { @@ -1397,6 +1401,11 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, } NAPI_GRO_CB(skb)->flush |= flush; + skb_set_network_header(skb, off); + /* The above will be needed by the transport layer if there is one + * immediately following this IP hdr. + */ + skb_gro_pull(skb, sizeof(*iph)); skb_set_transport_header(skb, skb_gro_offset(skb)); @@ -1411,10 +1420,10 @@ out: return pp; } -static int inet_gro_complete(struct sk_buff *skb) +static int inet_gro_complete(struct sk_buff *skb, int nhoff) { - __be16 newlen = htons(skb->len - skb_network_offset(skb)); - struct iphdr *iph = ip_hdr(skb); + __be16 newlen = htons(skb->len - nhoff); + struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); const struct net_offload *ops; int proto = iph->protocol; int err = -ENOSYS; @@ -1427,7 +1436,11 @@ static int inet_gro_complete(struct sk_buff *skb) if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out_unlock; - err = ops->callbacks.gro_complete(skb); + /* Only need to add sizeof(*iph) to get to the next hdr below + * because any hdr with option will have been flushed in + * inet_gro_receive(). + */ + err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph)); out_unlock: rcu_read_unlock(); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 05606353c7e7..2658a27f540d 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -240,7 +240,7 @@ int tcp_gro_complete(struct sk_buff *skb) { struct tcphdr *th = tcp_hdr(skb); - skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_start = (unsigned char *)th - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); skb->ip_summed = CHECKSUM_PARTIAL; @@ -272,6 +272,7 @@ static int tcp_v4_gso_send_check(struct sk_buff *skb) static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) { + /* Use the IP hdr immediately proceeding for this transport */ const struct iphdr *iph = skb_gro_network_header(skb); __wsum wsum; @@ -303,13 +304,13 @@ skip_csum: return tcp_gro_receive(head, skb); } -static int tcp4_gro_complete(struct sk_buff *skb) +static int tcp4_gro_complete(struct sk_buff *skb, int thoff) { const struct iphdr *iph = ip_hdr(skb); struct tcphdr *th = tcp_hdr(skb); - th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb), - iph->saddr, iph->daddr, 0); + th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, + iph->daddr, 0); skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; return tcp_gro_complete(skb); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 4b851692b1f6..7540a0ed75ae 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -154,6 +154,35 @@ out: return segs; } +/* Return the total length of all the extension hdrs, following the same + * logic in ipv6_gso_pull_exthdrs() when parsing ext-hdrs. + */ +static int ipv6_exthdrs_len(struct ipv6hdr *iph, + const struct net_offload **opps) +{ + struct ipv6_opt_hdr *opth = NULL; + int len = 0, proto, optlen; + + proto = iph->nexthdr; + for (;;) { + if (proto != NEXTHDR_HOP) { + *opps = rcu_dereference(inet6_offloads[proto]); + if (unlikely(!(*opps))) + break; + if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR)) + break; + } + if (opth == NULL) + opth = (void *)(iph+1); + else + opth = (void *)opth + optlen; + optlen = ipv6_optlen(opth); + len += optlen; + proto = opth->nexthdr; + } + return len; +} + static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -177,6 +206,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, goto out; } + skb_set_network_header(skb, off); skb_gro_pull(skb, sizeof(*iph)); skb_set_transport_header(skb, skb_gro_offset(skb)); @@ -211,12 +241,16 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, if (!NAPI_GRO_CB(p)->same_flow) continue; - iph2 = ipv6_hdr(p); + iph2 = (struct ipv6hdr *)(p->data + off); first_word = *(__be32 *)iph ^ *(__be32 *)iph2 ; - /* All fields must match except length and Traffic Class. */ - if (nlen != skb_network_header_len(p) || - (first_word & htonl(0xF00FFFFF)) || + /* All fields must match except length and Traffic Class. + * XXX skbs on the gro_list have all been parsed and pulled + * already so we don't need to compare nlen + * (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops))) + * memcmp() alone below is suffcient, right? + */ + if ((first_word & htonl(0xF00FFFFF)) || memcmp(&iph->nexthdr, &iph2->nexthdr, nlen - offsetof(struct ipv6hdr, nexthdr))) { NAPI_GRO_CB(p)->same_flow = 0; @@ -245,21 +279,21 @@ out: return pp; } -static int ipv6_gro_complete(struct sk_buff *skb) +static int ipv6_gro_complete(struct sk_buff *skb, int nhoff) { const struct net_offload *ops; - struct ipv6hdr *iph = ipv6_hdr(skb); + struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff); int err = -ENOSYS; - iph->payload_len = htons(skb->len - skb_network_offset(skb) - - sizeof(*iph)); + iph->payload_len = htons(skb->len - nhoff - sizeof(*iph)); rcu_read_lock(); - ops = rcu_dereference(inet6_offloads[NAPI_GRO_CB(skb)->proto]); + + nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out_unlock; - err = ops->callbacks.gro_complete(skb); + err = ops->callbacks.gro_complete(skb, nhoff); out_unlock: rcu_read_unlock(); diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 6d18157dc32c..0d78132ff18a 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -66,13 +66,13 @@ skip_csum: return tcp_gro_receive(head, skb); } -static int tcp6_gro_complete(struct sk_buff *skb) +static int tcp6_gro_complete(struct sk_buff *skb, int thoff) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct tcphdr *th = tcp_hdr(skb); - th->check = ~tcp_v6_check(skb->len - skb_transport_offset(skb), - &iph->saddr, &iph->daddr, 0); + th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr, + &iph->daddr, 0); skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; return tcp_gro_complete(skb); -- cgit v1.2.3 From e001bfad913bf119fb67c1e8dd2d4ec1f5d392fa Mon Sep 17 00:00:00 2001 From: dingtianhong Date: Fri, 13 Dec 2013 10:19:55 +0800 Subject: bonding: create bond_first_slave_rcu() The bond_first_slave_rcu() will be used to instead of bond_first_slave() in rcu_read_lock(). According to the Jay Vosburgh's suggestion, the struct netdev_adjacent should hide from users who wanted to use it directly. so I package a new function to get the first slave of the bond. Suggested-by: Nikolay Aleksandrov Suggested-by: Jay Vosburgh Suggested-by: Veaceslav Falico Signed-off-by: Ding Tianhong Signed-off-by: David S. Miller --- drivers/net/bonding/bonding.h | 4 ++++ include/linux/netdevice.h | 1 + net/core/dev.c | 21 +++++++++++++++++++++ 3 files changed, 26 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h index 8283cbdec50a..8f0d6d0c383b 100644 --- a/drivers/net/bonding/bonding.h +++ b/drivers/net/bonding/bonding.h @@ -101,6 +101,10 @@ netdev_adjacent_get_private(bond_slave_list(bond)->prev) : \ NULL) +/* Caller must have rcu_read_lock */ +#define bond_first_slave_rcu(bond) \ + netdev_lower_get_first_private_rcu(bond->dev) + #define bond_is_first_slave(bond, pos) (pos == bond_first_slave(bond)) #define bond_is_last_slave(bond, pos) (pos == bond_last_slave(bond)) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5260d2eae2e6..2c74d20dad34 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2907,6 +2907,7 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev, priv = netdev_lower_get_next_private_rcu(dev, &(iter))) void *netdev_adjacent_get_private(struct list_head *adj_list); +void *netdev_lower_get_first_private_rcu(struct net_device *dev); struct net_device *netdev_master_upper_dev_get(struct net_device *dev); struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev); int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev); diff --git a/net/core/dev.c b/net/core/dev.c index c95d664b2b42..9d4369ece679 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4543,6 +4543,27 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev, } EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); +/** + * netdev_lower_get_first_private_rcu - Get the first ->private from the + * lower neighbour list, RCU + * variant + * @dev: device + * + * Gets the first netdev_adjacent->private from the dev's lower neighbour + * list. The caller must hold RCU read lock. + */ +void *netdev_lower_get_first_private_rcu(struct net_device *dev) +{ + struct netdev_adjacent *lower; + + lower = list_first_or_null_rcu(&dev->adj_list.lower, + struct netdev_adjacent, list); + if (lower) + return lower->private; + return NULL; +} +EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); + /** * netdev_master_upper_dev_get_rcu - Get master upper device * @dev: device -- cgit v1.2.3 From 477bb93320cec7ae74d5ccfad4f2bfa0b28fbe90 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 13 Dec 2013 12:35:56 -0800 Subject: net: remove dead code for add/del multiple These function to manipulate multiple addresses are not used anywhere in current net-next tree. Some out of tree code maybe using these but too bad; they should submit their code upstream.. Also, make __hw_addr_flush local since only used by dev_addr_lists.c Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 11 ------ net/core/dev_addr_lists.c | 97 +---------------------------------------------- 2 files changed, 1 insertion(+), 107 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2c74d20dad34..a0dfcc8c002b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2810,17 +2810,10 @@ int register_netdev(struct net_device *dev); void unregister_netdev(struct net_device *dev); /* General hardware address lists handling functions */ -int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, unsigned char addr_type); -void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, unsigned char addr_type); int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); -void __hw_addr_flush(struct netdev_hw_addr_list *list); void __hw_addr_init(struct netdev_hw_addr_list *list); /* Functions used for device addresses handling */ @@ -2828,10 +2821,6 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); int dev_addr_del(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); -int dev_addr_add_multiple(struct net_device *to_dev, - struct net_device *from_dev, unsigned char addr_type); -int dev_addr_del_multiple(struct net_device *to_dev, - struct net_device *from_dev, unsigned char addr_type); void dev_addr_flush(struct net_device *dev); int dev_addr_init(struct net_device *dev); diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index ec40a849fc42..bb504a919e33 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -186,47 +186,6 @@ static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, return err; } -int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, unsigned char addr_type) -{ - int err; - struct netdev_hw_addr *ha, *ha2; - unsigned char type; - - list_for_each_entry(ha, &from_list->list, list) { - type = addr_type ? addr_type : ha->type; - err = __hw_addr_add(to_list, ha->addr, addr_len, type); - if (err) - goto unroll; - } - return 0; - -unroll: - list_for_each_entry(ha2, &from_list->list, list) { - if (ha2 == ha) - break; - type = addr_type ? addr_type : ha2->type; - __hw_addr_del(to_list, ha2->addr, addr_len, type); - } - return err; -} -EXPORT_SYMBOL(__hw_addr_add_multiple); - -void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len, unsigned char addr_type) -{ - struct netdev_hw_addr *ha; - unsigned char type; - - list_for_each_entry(ha, &from_list->list, list) { - type = addr_type ? addr_type : ha->type; - __hw_addr_del(to_list, ha->addr, addr_len, type); - } -} -EXPORT_SYMBOL(__hw_addr_del_multiple); - /* This function only works where there is a strict 1-1 relationship * between source and destionation of they synch. If you ever need to * sync addresses to more then 1 destination, you need to use @@ -264,7 +223,7 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, } EXPORT_SYMBOL(__hw_addr_unsync); -void __hw_addr_flush(struct netdev_hw_addr_list *list) +static void __hw_addr_flush(struct netdev_hw_addr_list *list) { struct netdev_hw_addr *ha, *tmp; @@ -274,7 +233,6 @@ void __hw_addr_flush(struct netdev_hw_addr_list *list) } list->count = 0; } -EXPORT_SYMBOL(__hw_addr_flush); void __hw_addr_init(struct netdev_hw_addr_list *list) { @@ -400,59 +358,6 @@ int dev_addr_del(struct net_device *dev, const unsigned char *addr, } EXPORT_SYMBOL(dev_addr_del); -/** - * dev_addr_add_multiple - Add device addresses from another device - * @to_dev: device to which addresses will be added - * @from_dev: device from which addresses will be added - * @addr_type: address type - 0 means type will be used from from_dev - * - * Add device addresses of the one device to another. - ** - * The caller must hold the rtnl_mutex. - */ -int dev_addr_add_multiple(struct net_device *to_dev, - struct net_device *from_dev, - unsigned char addr_type) -{ - int err; - - ASSERT_RTNL(); - - if (from_dev->addr_len != to_dev->addr_len) - return -EINVAL; - err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, - to_dev->addr_len, addr_type); - if (!err) - call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); - return err; -} -EXPORT_SYMBOL(dev_addr_add_multiple); - -/** - * dev_addr_del_multiple - Delete device addresses by another device - * @to_dev: device where the addresses will be deleted - * @from_dev: device supplying the addresses to be deleted - * @addr_type: address type - 0 means type will be used from from_dev - * - * Deletes addresses in to device by the list of addresses in from device. - * - * The caller must hold the rtnl_mutex. - */ -int dev_addr_del_multiple(struct net_device *to_dev, - struct net_device *from_dev, - unsigned char addr_type) -{ - ASSERT_RTNL(); - - if (from_dev->addr_len != to_dev->addr_len) - return -EINVAL; - __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, - to_dev->addr_len, addr_type); - call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); - return 0; -} -EXPORT_SYMBOL(dev_addr_del_multiple); - /* * Unicast list handling functions */ -- cgit v1.2.3 From 1d143d9f0c833fcf38cc737eb0a8698fa2dd144c Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Sun, 29 Dec 2013 14:01:29 -0800 Subject: net: core functions cleanup The following functions are not used outside of net/core/dev.c and should be declared static. call_netdevice_notifiers_info __dev_remove_offload netdev_has_any_upper_dev __netdev_adjacent_dev_remove __netdev_adjacent_dev_link_lists __netdev_adjacent_dev_unlink_lists __netdev_adjacent_dev_unlink __netdev_adjacent_dev_link_neighbour __netdev_adjacent_dev_unlink_neighbour And the following are never used and should be deleted netdev_lower_dev_get_private_rcu __netdev_find_adj_rcu Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ---- net/core/dev.c | 82 +++++++++++++++-------------------------------- 2 files changed, 26 insertions(+), 62 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 88afa8048a7c..bec60c481966 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1741,8 +1741,6 @@ netdev_notifier_info_to_dev(const struct netdev_notifier_info *info) return info->dev; } -int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev, - struct netdev_notifier_info *info); int call_netdevice_notifiers(unsigned long val, struct net_device *dev); @@ -1809,7 +1807,6 @@ void dev_remove_pack(struct packet_type *pt); void __dev_remove_pack(struct packet_type *pt); void dev_add_offload(struct packet_offload *po); void dev_remove_offload(struct packet_offload *po); -void __dev_remove_offload(struct packet_offload *po); struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short flags, unsigned short mask); @@ -2867,7 +2864,6 @@ extern int weight_p; extern int bpf_jit_enable; bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev); -bool netdev_has_any_upper_dev(struct net_device *dev); struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter); @@ -2907,8 +2903,6 @@ int netdev_master_upper_dev_link_private(struct net_device *dev, void *private); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); -void *netdev_lower_dev_get_private_rcu(struct net_device *dev, - struct net_device *lower_dev); void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev); int skb_checksum_help(struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index cc9ab80581d7..77f43aa373fe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -480,7 +480,7 @@ EXPORT_SYMBOL(dev_add_offload); * and must not be freed until after all the CPU's have gone * through a quiescent state. */ -void __dev_remove_offload(struct packet_offload *po) +static void __dev_remove_offload(struct packet_offload *po) { struct list_head *head = &offload_base; struct packet_offload *po1; @@ -498,7 +498,6 @@ void __dev_remove_offload(struct packet_offload *po) out: spin_unlock(&offload_lock); } -EXPORT_SYMBOL(__dev_remove_offload); /** * dev_remove_offload - remove packet offload handler @@ -1566,14 +1565,14 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); * are as for raw_notifier_call_chain(). */ -int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev, - struct netdev_notifier_info *info) +static int call_netdevice_notifiers_info(unsigned long val, + struct net_device *dev, + struct netdev_notifier_info *info) { ASSERT_RTNL(); netdev_notifier_info_init(info, dev); return raw_notifier_call_chain(&netdev_chain, val, info); } -EXPORT_SYMBOL(call_netdevice_notifiers_info); /** * call_netdevice_notifiers - call all network notifier blocks @@ -4355,19 +4354,6 @@ struct netdev_adjacent { struct rcu_head rcu; }; -static struct netdev_adjacent *__netdev_find_adj_rcu(struct net_device *dev, - struct net_device *adj_dev, - struct list_head *adj_list) -{ - struct netdev_adjacent *adj; - - list_for_each_entry_rcu(adj, adj_list, list) { - if (adj->dev == adj_dev) - return adj; - } - return NULL; -} - static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, struct net_device *adj_dev, struct list_head *adj_list) @@ -4406,13 +4392,12 @@ EXPORT_SYMBOL(netdev_has_upper_dev); * Find out if a device is linked to an upper device and return true in case * it is. The caller must hold the RTNL lock. */ -bool netdev_has_any_upper_dev(struct net_device *dev) +static bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->all_adj_list.upper); } -EXPORT_SYMBOL(netdev_has_any_upper_dev); /** * netdev_master_upper_dev_get - Get master upper device @@ -4644,9 +4629,9 @@ free_adj: return ret; } -void __netdev_adjacent_dev_remove(struct net_device *dev, - struct net_device *adj_dev, - struct list_head *dev_list) +static void __netdev_adjacent_dev_remove(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *dev_list) { struct netdev_adjacent *adj; char linkname[IFNAMSIZ+7]; @@ -4684,11 +4669,11 @@ void __netdev_adjacent_dev_remove(struct net_device *dev, kfree_rcu(adj, rcu); } -int __netdev_adjacent_dev_link_lists(struct net_device *dev, - struct net_device *upper_dev, - struct list_head *up_list, - struct list_head *down_list, - void *private, bool master) +static int __netdev_adjacent_dev_link_lists(struct net_device *dev, + struct net_device *upper_dev, + struct list_head *up_list, + struct list_head *down_list, + void *private, bool master) { int ret; @@ -4707,8 +4692,8 @@ int __netdev_adjacent_dev_link_lists(struct net_device *dev, return 0; } -int __netdev_adjacent_dev_link(struct net_device *dev, - struct net_device *upper_dev) +static int __netdev_adjacent_dev_link(struct net_device *dev, + struct net_device *upper_dev) { return __netdev_adjacent_dev_link_lists(dev, upper_dev, &dev->all_adj_list.upper, @@ -4716,26 +4701,26 @@ int __netdev_adjacent_dev_link(struct net_device *dev, NULL, false); } -void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, - struct net_device *upper_dev, - struct list_head *up_list, - struct list_head *down_list) +static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, + struct net_device *upper_dev, + struct list_head *up_list, + struct list_head *down_list) { __netdev_adjacent_dev_remove(dev, upper_dev, up_list); __netdev_adjacent_dev_remove(upper_dev, dev, down_list); } -void __netdev_adjacent_dev_unlink(struct net_device *dev, - struct net_device *upper_dev) +static void __netdev_adjacent_dev_unlink(struct net_device *dev, + struct net_device *upper_dev) { __netdev_adjacent_dev_unlink_lists(dev, upper_dev, &dev->all_adj_list.upper, &upper_dev->all_adj_list.lower); } -int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, - struct net_device *upper_dev, - void *private, bool master) +static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, + struct net_device *upper_dev, + void *private, bool master) { int ret = __netdev_adjacent_dev_link(dev, upper_dev); @@ -4754,8 +4739,8 @@ int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, return 0; } -void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, - struct net_device *upper_dev) +static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, + struct net_device *upper_dev) { __netdev_adjacent_dev_unlink(dev, upper_dev); __netdev_adjacent_dev_unlink_lists(dev, upper_dev, @@ -4944,21 +4929,6 @@ void netdev_upper_dev_unlink(struct net_device *dev, } EXPORT_SYMBOL(netdev_upper_dev_unlink); -void *netdev_lower_dev_get_private_rcu(struct net_device *dev, - struct net_device *lower_dev) -{ - struct netdev_adjacent *lower; - - if (!lower_dev) - return NULL; - lower = __netdev_find_adj_rcu(dev, lower_dev, &dev->adj_list.lower); - if (!lower) - return NULL; - - return lower->private; -} -EXPORT_SYMBOL(netdev_lower_dev_get_private_rcu); - void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev) { -- cgit v1.2.3 From 86f8515f9721fa171483f0fe0391968fbb949cc9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 29 Dec 2013 17:27:11 +0100 Subject: net: netprio: rename config to be more consistent with cgroup configs While we're at it and introduced CGROUP_NET_CLASSID, lets also make NETPRIO_CGROUP more consistent with the rest of cgroups and rename it into CONFIG_CGROUP_NET_PRIO so that for networking, we now have CONFIG_CGROUP_NET_{PRIO,CLASSID}. This not only makes the CONFIG option consistent among networking cgroups, but also among cgroups CONFIG conventions in general as the vast majority has a prefix of CONFIG_CGROUP_. Signed-off-by: Daniel Borkmann Cc: Zefan Li Cc: cgroups@vger.kernel.org Acked-by: Li Zefan Signed-off-by: Pablo Neira Ayuso --- include/linux/cgroup_subsys.h | 2 +- include/linux/netdevice.h | 2 +- include/net/netprio_cgroup.h | 18 ++++++------------ include/net/sock.h | 2 +- net/Kconfig | 4 ++-- net/core/Makefile | 2 +- net/core/dev.c | 2 +- net/core/sock.c | 2 +- 8 files changed, 14 insertions(+), 20 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 58bf94de4b8e..7b99d717411d 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -43,7 +43,7 @@ SUBSYS(blkio) SUBSYS(perf) #endif -#if IS_SUBSYS_ENABLED(CONFIG_NETPRIO_CGROUP) +#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_NET_PRIO) SUBSYS(net_prio) #endif diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5260d2eae2e6..45cf68194aa8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1444,7 +1444,7 @@ struct net_device { /* max exchange id for FCoE LRO by ddp */ unsigned int fcoe_ddp_xid; #endif -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) struct netprio_map __rcu *priomap; #endif /* phy device may attach itself for hardware timestamping */ diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index 099d02782e22..dafc09f0fdbc 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -13,12 +13,12 @@ #ifndef _NETPRIO_CGROUP_H #define _NETPRIO_CGROUP_H + #include #include #include - -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) struct netprio_map { struct rcu_head rcu; u32 priomap_len; @@ -27,8 +27,7 @@ struct netprio_map { void sock_update_netprioidx(struct sock *sk); -#if IS_BUILTIN(CONFIG_NETPRIO_CGROUP) - +#if IS_BUILTIN(CONFIG_CGROUP_NET_PRIO) static inline u32 task_netprioidx(struct task_struct *p) { struct cgroup_subsys_state *css; @@ -40,9 +39,7 @@ static inline u32 task_netprioidx(struct task_struct *p) rcu_read_unlock(); return idx; } - -#elif IS_MODULE(CONFIG_NETPRIO_CGROUP) - +#elif IS_MODULE(CONFIG_CGROUP_NET_PRIO) static inline u32 task_netprioidx(struct task_struct *p) { struct cgroup_subsys_state *css; @@ -56,9 +53,7 @@ static inline u32 task_netprioidx(struct task_struct *p) return idx; } #endif - -#else /* !CONFIG_NETPRIO_CGROUP */ - +#else /* !CONFIG_CGROUP_NET_PRIO */ static inline u32 task_netprioidx(struct task_struct *p) { return 0; @@ -66,6 +61,5 @@ static inline u32 task_netprioidx(struct task_struct *p) #define sock_update_netprioidx(sk) -#endif /* CONFIG_NETPRIO_CGROUP */ - +#endif /* CONFIG_CGROUP_NET_PRIO */ #endif /* _NET_CLS_CGROUP_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 2ef3c3eca47a..ef5e2be6eaf3 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -395,7 +395,7 @@ struct sock { unsigned short sk_ack_backlog; unsigned short sk_max_ack_backlog; __u32 sk_priority; -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) __u32 sk_cgrp_prioidx; #endif struct pid *sk_peer_pid; diff --git a/net/Kconfig b/net/Kconfig index 7da10b830d70..e411046a62e3 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -238,12 +238,12 @@ config XPS depends on SMP default y -config NETPRIO_CGROUP +config CGROUP_NET_PRIO tristate "Network priority cgroup" depends on CGROUPS ---help--- Cgroup subsystem for use in assigning processes to network priorities on - a per-interface basis + a per-interface basis. config CGROUP_NET_CLASSID boolean "Network classid cgroup" diff --git a/net/core/Makefile b/net/core/Makefile index 4839a2796964..9628c20acff6 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -21,5 +21,5 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o -obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o +obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o diff --git a/net/core/dev.c b/net/core/dev.c index c95d664b2b42..888a79b2b8b9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2747,7 +2747,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, return rc; } -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) static void skb_update_prio(struct sk_buff *skb) { struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); diff --git a/net/core/sock.c b/net/core/sock.c index 3f150729fb15..a29735c9a05d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1308,7 +1308,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) module_put(owner); } -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) void sock_update_netprioidx(struct sock *sk) { if (in_interrupt()) -- cgit v1.2.3 From 8f84985fec10de64a6b4cdfea45f2b0ab8f07c78 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Sat, 4 Jan 2014 13:57:59 +0800 Subject: net: unify the pcpu_tstats and br_cpu_netstats as one They are same, so unify them as one, pcpu_sw_netstats. Define pcpu_sw_netstat in netdevice.h, remove pcpu_tstats from if_tunnel and remove br_cpu_netstats from br_private.h Cc: Cong Wang Cc: Stephen Hemminger Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 11 ++++++----- include/linux/if_tunnel.h | 9 --------- include/linux/netdevice.h | 11 ++++++++++- include/net/ip6_tunnel.h | 2 +- include/net/ip_tunnels.h | 4 ++-- net/bridge/br_device.c | 10 +++++----- net/bridge/br_input.c | 2 +- net/bridge/br_private.h | 10 +--------- net/ipv4/ip_tunnel.c | 9 +++++---- net/ipv4/ip_vti.c | 2 +- net/ipv6/ip6_gre.c | 10 +++++----- net/ipv6/ip6_tunnel.c | 12 ++++++------ net/ipv6/ip6_vti.c | 10 +++++----- net/ipv6/sit.c | 10 +++++----- net/openvswitch/vport.c | 12 ++++++------ net/openvswitch/vport.h | 2 +- 16 files changed, 60 insertions(+), 66 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index aef44aa44fe3..474a99ed0222 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1081,7 +1081,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct iphdr *oip = NULL; struct ipv6hdr *oip6 = NULL; struct vxlan_dev *vxlan; - struct pcpu_tstats *stats; + struct pcpu_sw_netstats *stats; union vxlan_addr saddr; __u32 vni; int err = 0; @@ -1587,11 +1587,12 @@ EXPORT_SYMBOL_GPL(vxlan_xmit_skb); static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, struct vxlan_dev *dst_vxlan) { - struct pcpu_tstats *tx_stats = this_cpu_ptr(src_vxlan->dev->tstats); - struct pcpu_tstats *rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats); + struct pcpu_sw_netstats *tx_stats, *rx_stats; union vxlan_addr loopback; union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip; + tx_stats = this_cpu_ptr(src_vxlan->dev->tstats); + rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats); skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; skb->dev = dst_vxlan->dev; @@ -1897,12 +1898,12 @@ static int vxlan_init(struct net_device *dev) struct vxlan_sock *vs; int i; - dev->tstats = alloc_percpu(struct pcpu_tstats); + dev->tstats = alloc_percpu(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; for_each_possible_cpu(i) { - struct pcpu_tstats *vxlan_stats; + struct pcpu_sw_netstats *vxlan_stats; vxlan_stats = per_cpu_ptr(dev->tstats, i); u64_stats_init(&vxlan_stats->syncp); } diff --git a/include/linux/if_tunnel.h b/include/linux/if_tunnel.h index f4e56ecd0b1a..712710bc0580 100644 --- a/include/linux/if_tunnel.h +++ b/include/linux/if_tunnel.h @@ -13,13 +13,4 @@ #define for_each_ip_tunnel_rcu(pos, start) \ for (pos = rcu_dereference(start); pos; pos = rcu_dereference(pos->next)) -/* often modified stats are per cpu, other are shared (netdev->stats) */ -struct pcpu_tstats { - u64 rx_packets; - u64 rx_bytes; - u64 tx_packets; - u64 tx_bytes; - struct u64_stats_sync syncp; -}; - #endif /* _IF_TUNNEL_H_ */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bec60c481966..51c0fe258163 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1409,7 +1409,7 @@ struct net_device { union { void *ml_priv; struct pcpu_lstats __percpu *lstats; /* loopback stats */ - struct pcpu_tstats __percpu *tstats; /* tunnel stats */ + struct pcpu_sw_netstats __percpu *tstats; struct pcpu_dstats __percpu *dstats; /* dummy stats */ struct pcpu_vstats __percpu *vstats; /* veth stats */ }; @@ -1685,6 +1685,15 @@ struct packet_offload { struct list_head list; }; +/* often modified stats are per cpu, other are shared (netdev->stats) */ +struct pcpu_sw_netstats { + u64 rx_packets; + u64 rx_bytes; + u64 tx_packets; + u64 tx_bytes; + struct u64_stats_sync syncp; +}; + #include /* netdevice notifier chain. Please remember to update the rtnetlink diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 6d1549c4893c..a5593dab6af7 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -79,7 +79,7 @@ static inline void ip6tunnel_xmit(struct sk_buff *skb, struct net_device *dev) err = ip6_local_out(skb); if (net_xmit_eval(err) == 0) { - struct pcpu_tstats *tstats = this_cpu_ptr(dev->tstats); + struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); tstats->tx_bytes += pkt_len; tstats->tx_packets++; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 9e25b1bc31da..cd729becbb07 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -162,10 +162,10 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool gre_csum, static inline void iptunnel_xmit_stats(int err, struct net_device_stats *err_stats, - struct pcpu_tstats __percpu *stats) + struct pcpu_sw_netstats __percpu *stats) { if (err > 0) { - struct pcpu_tstats *tstats = this_cpu_ptr(stats); + struct pcpu_sw_netstats *tstats = this_cpu_ptr(stats); u64_stats_update_begin(&tstats->syncp); tstats->tx_bytes += err; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index f00cfd2a0143..e4401a531afb 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -32,7 +32,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) const unsigned char *dest = skb->data; struct net_bridge_fdb_entry *dst; struct net_bridge_mdb_entry *mdst; - struct br_cpu_netstats *brstats = this_cpu_ptr(br->stats); + struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats); u16 vid = 0; rcu_read_lock(); @@ -90,12 +90,12 @@ static int br_dev_init(struct net_device *dev) struct net_bridge *br = netdev_priv(dev); int i; - br->stats = alloc_percpu(struct br_cpu_netstats); + br->stats = alloc_percpu(struct pcpu_sw_netstats); if (!br->stats) return -ENOMEM; for_each_possible_cpu(i) { - struct br_cpu_netstats *br_dev_stats; + struct pcpu_sw_netstats *br_dev_stats; br_dev_stats = per_cpu_ptr(br->stats, i); u64_stats_init(&br_dev_stats->syncp); } @@ -135,12 +135,12 @@ static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct net_bridge *br = netdev_priv(dev); - struct br_cpu_netstats tmp, sum = { 0 }; + struct pcpu_sw_netstats tmp, sum = { 0 }; unsigned int cpu; for_each_possible_cpu(cpu) { unsigned int start; - const struct br_cpu_netstats *bstats + const struct pcpu_sw_netstats *bstats = per_cpu_ptr(br->stats, cpu); do { start = u64_stats_fetch_begin_bh(&bstats->syncp); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 7e73c32e205d..bf8dc7d308d6 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -28,7 +28,7 @@ static int br_pass_frame_up(struct sk_buff *skb) { struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev; struct net_bridge *br = netdev_priv(brdev); - struct br_cpu_netstats *brstats = this_cpu_ptr(br->stats); + struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats); u64_stats_update_begin(&brstats->syncp); brstats->rx_packets++; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 2e77d923c8ee..3733f152351c 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -210,21 +210,13 @@ static inline struct net_bridge_port *br_port_get_rtnl(const struct net_device * rtnl_dereference(dev->rx_handler_data) : NULL; } -struct br_cpu_netstats { - u64 rx_packets; - u64 rx_bytes; - u64 tx_packets; - u64 tx_bytes; - struct u64_stats_sync syncp; -}; - struct net_bridge { spinlock_t lock; struct list_head port_list; struct net_device *dev; - struct br_cpu_netstats __percpu *stats; + struct pcpu_sw_netstats __percpu *stats; spinlock_t hash_lock; struct hlist_head hash[BR_HASH_SIZE]; #ifdef CONFIG_BRIDGE_NETFILTER diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index e2c9cff26eb5..07a5ed374262 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -132,7 +132,8 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, int i; for_each_possible_cpu(i) { - const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + const struct pcpu_sw_netstats *tstats = + per_cpu_ptr(dev->tstats, i); u64 rx_packets, rx_bytes, tx_packets, tx_bytes; unsigned int start; @@ -460,7 +461,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, bool log_ecn_error) { - struct pcpu_tstats *tstats; + struct pcpu_sw_netstats *tstats; const struct iphdr *iph = ip_hdr(skb); int err; @@ -1049,12 +1050,12 @@ int ip_tunnel_init(struct net_device *dev) int i, err; dev->destructor = ip_tunnel_dev_free; - dev->tstats = alloc_percpu(struct pcpu_tstats); + dev->tstats = alloc_percpu(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; for_each_possible_cpu(i) { - struct pcpu_tstats *ipt_stats; + struct pcpu_sw_netstats *ipt_stats; ipt_stats = per_cpu_ptr(dev->tstats, i); u64_stats_init(&ipt_stats->syncp); } diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 52b802a0cd8c..0783200ad8d2 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -60,7 +60,7 @@ static int vti_rcv(struct sk_buff *skb) tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->saddr, iph->daddr, 0); if (tunnel != NULL) { - struct pcpu_tstats *tstats; + struct pcpu_sw_netstats *tstats; u32 oldmark = skb->mark; int ret; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index e27fb78c61f2..e7a440dd5c0d 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -498,7 +498,7 @@ static int ip6gre_rcv(struct sk_buff *skb) &ipv6h->saddr, &ipv6h->daddr, key, gre_proto); if (tunnel) { - struct pcpu_tstats *tstats; + struct pcpu_sw_netstats *tstats; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; @@ -1265,12 +1265,12 @@ static int ip6gre_tunnel_init(struct net_device *dev) if (ipv6_addr_any(&tunnel->parms.raddr)) dev->header_ops = &ip6gre_header_ops; - dev->tstats = alloc_percpu(struct pcpu_tstats); + dev->tstats = alloc_percpu(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; for_each_possible_cpu(i) { - struct pcpu_tstats *ip6gre_tunnel_stats; + struct pcpu_sw_netstats *ip6gre_tunnel_stats; ip6gre_tunnel_stats = per_cpu_ptr(dev->tstats, i); u64_stats_init(&ip6gre_tunnel_stats->syncp); } @@ -1466,12 +1466,12 @@ static int ip6gre_tap_init(struct net_device *dev) ip6gre_tnl_link_config(tunnel, 1); - dev->tstats = alloc_percpu(struct pcpu_tstats); + dev->tstats = alloc_percpu(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; for_each_possible_cpu(i) { - struct pcpu_tstats *ip6gre_tap_stats; + struct pcpu_sw_netstats *ip6gre_tap_stats; ip6gre_tap_stats = per_cpu_ptr(dev->tstats, i); u64_stats_init(&ip6gre_tap_stats->syncp); } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 8d7c9867a445..02894216a46d 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -102,11 +101,12 @@ struct ip6_tnl_net { static struct net_device_stats *ip6_get_stats(struct net_device *dev) { - struct pcpu_tstats sum = { 0 }; + struct pcpu_sw_netstats sum = { 0 }; int i; for_each_possible_cpu(i) { - const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + const struct pcpu_sw_netstats *tstats = + per_cpu_ptr(dev->tstats, i); sum.rx_packets += tstats->rx_packets; sum.rx_bytes += tstats->rx_bytes; @@ -784,7 +784,7 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr)) != NULL) { - struct pcpu_tstats *tstats; + struct pcpu_sw_netstats *tstats; if (t->parms.proto != ipproto && t->parms.proto != 0) { rcu_read_unlock(); @@ -1497,12 +1497,12 @@ ip6_tnl_dev_init_gen(struct net_device *dev) t->dev = dev; t->net = dev_net(dev); - dev->tstats = alloc_percpu(struct pcpu_tstats); + dev->tstats = alloc_percpu(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; for_each_possible_cpu(i) { - struct pcpu_tstats *ip6_tnl_stats; + struct pcpu_sw_netstats *ip6_tnl_stats; ip6_tnl_stats = per_cpu_ptr(dev->tstats, i); u64_stats_init(&ip6_tnl_stats->syncp); } diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index ed94ba61dda0..da1d9e4d62ca 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -77,11 +76,12 @@ struct vti6_net { static struct net_device_stats *vti6_get_stats(struct net_device *dev) { - struct pcpu_tstats sum = { 0 }; + struct pcpu_sw_netstats sum = { 0 }; int i; for_each_possible_cpu(i) { - const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + const struct pcpu_sw_netstats *tstats = + per_cpu_ptr(dev->tstats, i); sum.rx_packets += tstats->rx_packets; sum.rx_bytes += tstats->rx_bytes; @@ -312,7 +312,7 @@ static int vti6_rcv(struct sk_buff *skb) if ((t = vti6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr)) != NULL) { - struct pcpu_tstats *tstats; + struct pcpu_sw_netstats *tstats; if (t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) { rcu_read_unlock(); @@ -753,7 +753,7 @@ static inline int vti6_dev_init_gen(struct net_device *dev) t->dev = dev; t->net = dev_net(dev); - dev->tstats = alloc_percpu(struct pcpu_tstats); + dev->tstats = alloc_percpu(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; return 0; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 366fbba3359a..9937b2616713 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -671,7 +671,7 @@ static int ipip6_rcv(struct sk_buff *skb) tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->saddr, iph->daddr); if (tunnel != NULL) { - struct pcpu_tstats *tstats; + struct pcpu_sw_netstats *tstats; if (tunnel->parms.iph.protocol != IPPROTO_IPV6 && tunnel->parms.iph.protocol != 0) @@ -1361,12 +1361,12 @@ static int ipip6_tunnel_init(struct net_device *dev) memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); ipip6_tunnel_bind_dev(dev); - dev->tstats = alloc_percpu(struct pcpu_tstats); + dev->tstats = alloc_percpu(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; for_each_possible_cpu(i) { - struct pcpu_tstats *ipip6_tunnel_stats; + struct pcpu_sw_netstats *ipip6_tunnel_stats; ipip6_tunnel_stats = per_cpu_ptr(dev->tstats, i); u64_stats_init(&ipip6_tunnel_stats->syncp); } @@ -1391,12 +1391,12 @@ static int __net_init ipip6_fb_tunnel_init(struct net_device *dev) iph->ihl = 5; iph->ttl = 64; - dev->tstats = alloc_percpu(struct pcpu_tstats); + dev->tstats = alloc_percpu(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; for_each_possible_cpu(i) { - struct pcpu_tstats *ipip6_fb_stats; + struct pcpu_sw_netstats *ipip6_fb_stats; ipip6_fb_stats = per_cpu_ptr(dev->tstats, i); u64_stats_init(&ipip6_fb_stats->syncp); } diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index d830a95f03a4..f5275dd29cd9 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -136,14 +136,14 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, vport->ops = ops; INIT_HLIST_NODE(&vport->dp_hash_node); - vport->percpu_stats = alloc_percpu(struct pcpu_tstats); + vport->percpu_stats = alloc_percpu(struct pcpu_sw_netstats); if (!vport->percpu_stats) { kfree(vport); return ERR_PTR(-ENOMEM); } for_each_possible_cpu(i) { - struct pcpu_tstats *vport_stats; + struct pcpu_sw_netstats *vport_stats; vport_stats = per_cpu_ptr(vport->percpu_stats, i); u64_stats_init(&vport_stats->syncp); } @@ -275,8 +275,8 @@ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) spin_unlock_bh(&vport->stats_lock); for_each_possible_cpu(i) { - const struct pcpu_tstats *percpu_stats; - struct pcpu_tstats local_stats; + const struct pcpu_sw_netstats *percpu_stats; + struct pcpu_sw_netstats local_stats; unsigned int start; percpu_stats = per_cpu_ptr(vport->percpu_stats, i); @@ -344,7 +344,7 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, struct ovs_key_ipv4_tunnel *tun_key) { - struct pcpu_tstats *stats; + struct pcpu_sw_netstats *stats; stats = this_cpu_ptr(vport->percpu_stats); u64_stats_update_begin(&stats->syncp); @@ -370,7 +370,7 @@ int ovs_vport_send(struct vport *vport, struct sk_buff *skb) int sent = vport->ops->send(vport, skb); if (likely(sent > 0)) { - struct pcpu_tstats *stats; + struct pcpu_sw_netstats *stats; stats = this_cpu_ptr(vport->percpu_stats); diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 1a9fbcec6e1b..bc97ef7fa2af 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -87,7 +87,7 @@ struct vport { struct hlist_node dp_hash_node; const struct vport_ops *ops; - struct pcpu_tstats __percpu *percpu_stats; + struct pcpu_sw_netstats __percpu *percpu_stats; spinlock_t stats_lock; struct vport_err_stats err_stats; -- cgit v1.2.3 From bf5a755f5e9186406bbf50f4087100af5bd68e40 Mon Sep 17 00:00:00 2001 From: Jerry Chu Date: Tue, 7 Jan 2014 10:23:19 -0800 Subject: net-gre-gro: Add GRE support to the GRO stack This patch built on top of Commit 299603e8370a93dd5d8e8d800f0dff1ce2c53d36 ("net-gro: Prepare GRO stack for the upcoming tunneling support") to add the support of the standard GRE (RFC1701/RFC2784/RFC2890) to the GRO stack. It also serves as an example for supporting other encapsulation protocols in the GRO stack in the future. The patch supports version 0 and all the flags (key, csum, seq#) but will flush any pkt with the S (seq#) flag. This is because the S flag is not support by GSO, and a GRO pkt may end up in the forwarding path, thus requiring GSO support to break it up correctly. Currently the "packet_offload" structure only contains L3 (ETH_P_IP/ ETH_P_IPV6) GRO offload support so the encapped pkts are limited to IP pkts (i.e., w/o L2 hdr). But support for other protocol type can be easily added, so is the support for GRE variations like NVGRE. The patch also support csum offload. Specifically if the csum flag is on and the h/w is capable of checksumming the payload (CHECKSUM_COMPLETE), the code will take advantage of the csum computed by the h/w when validating the GRE csum. Note that commit 60769a5dcd8755715c7143b4571d5c44f01796f1 "ipv4: gre: add GRO capability" already introduces GRO capability to IPv4 GRE tunnels, using the gro_cells infrastructure. But GRO is done after GRE hdr has been removed (i.e., decapped). The following patch applies GRO when pkts first come in (before hitting the GRE tunnel code). There is some performance advantage for applying GRO as early as possible. Also this approach is transparent to other subsystem like Open vSwitch where GRE decap is handled outside of the IP stack hence making it harder for the gro_cells stuff to apply. On the other hand, some NICs are still not capable of hashing on the inner hdr of a GRE pkt (RSS). In that case the GRO processing of pkts from the same remote host will all happen on the same CPU and the performance may be suboptimal. I'm including some rough preliminary performance numbers below. Note that the performance will be highly dependent on traffic load, mix as usual. Moreover it also depends on NIC offload features hence the following is by no means a comprehesive study. Local testing and tuning will be needed to decide the best setting. All tests spawned 50 copies of netperf TCP_STREAM and ran for 30 secs. (super_netperf 50 -H 192.168.1.18 -l 30) An IP GRE tunnel with only the key flag on (e.g., ip tunnel add gre1 mode gre local 10.246.17.18 remote 10.246.17.17 ttl 255 key 123) is configured. The GRO support for pkts AFTER decap are controlled through the device feature of the GRE device (e.g., ethtool -K gre1 gro on/off). 1.1 ethtool -K gre1 gro off; ethtool -K eth0 gro off thruput: 9.16Gbps CPU utilization: 19% 1.2 ethtool -K gre1 gro on; ethtool -K eth0 gro off thruput: 5.9Gbps CPU utilization: 15% 1.3 ethtool -K gre1 gro off; ethtool -K eth0 gro on thruput: 9.26Gbps CPU utilization: 12-13% 1.4 ethtool -K gre1 gro on; ethtool -K eth0 gro on thruput: 9.26Gbps CPU utilization: 10% The following tests were performed on a different NIC that is capable of csum offload. I.e., the h/w is capable of computing IP payload csum (CHECKSUM_COMPLETE). 2.1 ethtool -K gre1 gro on (hence will use gro_cells) 2.1.1 ethtool -K eth0 gro off; csum offload disabled thruput: 8.53Gbps CPU utilization: 9% 2.1.2 ethtool -K eth0 gro off; csum offload enabled thruput: 8.97Gbps CPU utilization: 7-8% 2.1.3 ethtool -K eth0 gro on; csum offload disabled thruput: 8.83Gbps CPU utilization: 5-6% 2.1.4 ethtool -K eth0 gro on; csum offload enabled thruput: 8.98Gbps CPU utilization: 5% 2.2 ethtool -K gre1 gro off 2.2.1 ethtool -K eth0 gro off; csum offload disabled thruput: 5.93Gbps CPU utilization: 9% 2.2.2 ethtool -K eth0 gro off; csum offload enabled thruput: 5.62Gbps CPU utilization: 8% 2.2.3 ethtool -K eth0 gro on; csum offload disabled thruput: 7.69Gbps CPU utilization: 8% 2.2.4 ethtool -K eth0 gro on; csum offload enabled thruput: 8.96Gbps CPU utilization: 5-6% Signed-off-by: H.K. Jerry Chu Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 18 +++++- net/core/dev.c | 26 ++++++++ net/ipv4/af_inet.c | 10 ++- net/ipv4/gre_offload.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_offload.c | 7 +- net/ipv6/ip6_offload.c | 2 +- 6 files changed, 216 insertions(+), 7 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d9c961aa6a7f..a2a70cc70e7b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1632,7 +1632,10 @@ struct napi_gro_cb { int data_offset; /* This is non-zero if the packet cannot be merged with the new skb. */ - int flush; + u16 flush; + + /* Save the IP ID here and check when we get to the transport layer */ + u16 flush_id; /* Number of segments aggregated. */ u16 count; @@ -1651,6 +1654,9 @@ struct napi_gro_cb { /* Used in ipv6_gro_receive() */ int proto; + /* used to support CHECKSUM_COMPLETE for tunneling protocols */ + __wsum csum; + /* used in skb_gro_receive() slow path */ struct sk_buff *last; }; @@ -1900,6 +1906,14 @@ static inline void *skb_gro_network_header(struct sk_buff *skb) skb_network_offset(skb); } +static inline void skb_gro_postpull_rcsum(struct sk_buff *skb, + const void *start, unsigned int len) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) + NAPI_GRO_CB(skb)->csum = csum_sub(NAPI_GRO_CB(skb)->csum, + csum_partial(start, len, 0)); +} + static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, @@ -2440,6 +2454,8 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); +struct packet_offload *gro_find_receive_by_type(__be16 type); +struct packet_offload *gro_find_complete_by_type(__be16 type); static inline void napi_free_frags(struct napi_struct *napi) { diff --git a/net/core/dev.c b/net/core/dev.c index b3c574a88026..ce01847793c0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3846,6 +3846,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff skb_gro_reset_offset(skb); gro_list_prepare(napi, skb); + NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */ rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { @@ -3922,6 +3923,31 @@ normal: goto pull; } +struct packet_offload *gro_find_receive_by_type(__be16 type) +{ + struct list_head *offload_head = &offload_base; + struct packet_offload *ptype; + + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_receive) + continue; + return ptype; + } + return NULL; +} + +struct packet_offload *gro_find_complete_by_type(__be16 type) +{ + struct list_head *offload_head = &offload_base; + struct packet_offload *ptype; + + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_complete) + continue; + return ptype; + } + return NULL; +} static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) { diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b8bc1a3d5cf1..6268a4751e64 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1391,9 +1391,15 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, NAPI_GRO_CB(p)->flush |= (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | - (__force int)((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)) | - ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); + ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)); + /* Save the IP ID check to be included later when we get to + * the transport layer so only the inner most IP ID is checked. + * This is because some GSO/TSO implementations do not + * correctly increment the IP ID for the outer hdrs. + */ + NAPI_GRO_CB(p)->flush_id = + ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); NAPI_GRO_CB(p)->flush |= flush; } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 9138cfb10140..746a7b10d434 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -116,10 +116,170 @@ out: return segs; } +/* Compute the whole skb csum in s/w and store it, then verify GRO csum + * starting from gro_offset. + */ +static __sum16 gro_skb_checksum(struct sk_buff *skb) +{ + __sum16 sum; + + skb->csum = skb_checksum(skb, 0, skb->len, 0); + NAPI_GRO_CB(skb)->csum = csum_sub(skb->csum, + csum_partial(skb->data, skb_gro_offset(skb), 0)); + sum = csum_fold(NAPI_GRO_CB(skb)->csum); + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) { + if (unlikely(!sum)) + netdev_rx_csum_fault(skb->dev); + } else + skb->ip_summed = CHECKSUM_COMPLETE; + + return sum; +} + +static struct sk_buff **gre_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + struct sk_buff **pp = NULL; + struct sk_buff *p; + const struct gre_base_hdr *greh; + unsigned int hlen, grehlen; + unsigned int off; + int flush = 1; + struct packet_offload *ptype; + __be16 type; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*greh); + greh = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + greh = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!greh)) + goto out; + } + + /* Only support version 0 and K (key), C (csum) flags. Note that + * although the support for the S (seq#) flag can be added easily + * for GRO, this is problematic for GSO hence can not be enabled + * here because a GRO pkt may end up in the forwarding path, thus + * requiring GSO support to break it up correctly. + */ + if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0) + goto out; + + type = greh->protocol; + + rcu_read_lock(); + ptype = gro_find_receive_by_type(type); + if (ptype == NULL) + goto out_unlock; + + grehlen = GRE_HEADER_SECTION; + + if (greh->flags & GRE_KEY) + grehlen += GRE_HEADER_SECTION; + + if (greh->flags & GRE_CSUM) + grehlen += GRE_HEADER_SECTION; + + hlen = off + grehlen; + if (skb_gro_header_hard(skb, hlen)) { + greh = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!greh)) + goto out_unlock; + } + if (greh->flags & GRE_CSUM) { /* Need to verify GRE csum first */ + __sum16 csum = 0; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum = csum_fold(NAPI_GRO_CB(skb)->csum); + /* Don't trust csum error calculated/reported by h/w */ + if (skb->ip_summed == CHECKSUM_NONE || csum != 0) + csum = gro_skb_checksum(skb); + + /* GRE CSUM is the 1's complement of the 1's complement sum + * of the GRE hdr plus payload so it should add up to 0xffff + * (and 0 after csum_fold()) just like the IPv4 hdr csum. + */ + if (csum) + goto out_unlock; + } + flush = 0; + + for (p = *head; p; p = p->next) { + const struct gre_base_hdr *greh2; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + /* The following checks are needed to ensure only pkts + * from the same tunnel are considered for aggregation. + * The criteria for "the same tunnel" includes: + * 1) same version (we only support version 0 here) + * 2) same protocol (we only support ETH_P_IP for now) + * 3) same set of flags + * 4) same key if the key field is present. + */ + greh2 = (struct gre_base_hdr *)(p->data + off); + + if (greh2->flags != greh->flags || + greh2->protocol != greh->protocol) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + if (greh->flags & GRE_KEY) { + /* compare keys */ + if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + } + + skb_gro_pull(skb, grehlen); + + /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ + skb_gro_postpull_rcsum(skb, greh, grehlen); + + pp = ptype->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +int gre_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff); + struct packet_offload *ptype; + unsigned int grehlen = sizeof(*greh); + int err = -ENOENT; + __be16 type; + + type = greh->protocol; + if (greh->flags & GRE_KEY) + grehlen += GRE_HEADER_SECTION; + + if (greh->flags & GRE_CSUM) + grehlen += GRE_HEADER_SECTION; + + rcu_read_lock(); + ptype = gro_find_complete_by_type(type); + if (ptype != NULL) + err = ptype->callbacks.gro_complete(skb, nhoff + grehlen); + + rcu_read_unlock(); + return err; +} + static const struct net_offload gre_offload = { .callbacks = { .gso_send_check = gre_gso_send_check, .gso_segment = gre_gso_segment, + .gro_receive = gre_gro_receive, + .gro_complete = gre_gro_complete, }, }; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 2658a27f540d..771a3950d87a 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -197,7 +197,8 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) goto out_check_final; found: - flush = NAPI_GRO_CB(p)->flush; + /* Include the IP ID check below from the inner most IP hdr */ + flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id; flush |= (__force int)(flags & TCP_FLAG_CWR); flush |= (__force int)((flags ^ tcp_flag_word(th2)) & ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); @@ -230,7 +231,7 @@ out_check_final: pp = head; out: - NAPI_GRO_CB(skb)->flush |= flush; + NAPI_GRO_CB(skb)->flush |= (flush != 0); return pp; } @@ -280,7 +281,7 @@ static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff * if (NAPI_GRO_CB(skb)->flush) goto skip_csum; - wsum = skb->csum; + wsum = NAPI_GRO_CB(skb)->csum; switch (skb->ip_summed) { case CHECKSUM_NONE: diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 6fb4162fa785..1e8683b135bb 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -190,7 +190,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, unsigned int nlen; unsigned int hlen; unsigned int off; - int flush = 1; + u16 flush = 1; int proto; __wsum csum; -- cgit v1.2.3 From 5bb025fae53889cc99a21058c5dd369bf8cce820 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Tue, 14 Jan 2014 21:58:51 +0100 Subject: net: rename sysfs symlinks on device name change Currently, we don't rename the upper/lower_ifc symlinks in /sys/class/net/*/ , which might result stale/duplicate links/names. Fix this by adding netdev_adjacent_rename_links(dev, oldname) which renames all the upper/lower interface's links to dev from the upper/lower_oldname to the new name. We don't need a rollback because only we control these symlinks and if we fail to rename them - sysfs will anyway complain. Reported-by: Ding Tianhong CC: Ding Tianhong CC: "David S. Miller" CC: Eric Dumazet CC: Nicolas Dichtel CC: Cong Wang Signed-off-by: Veaceslav Falico Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5c88ab19b3eb..30f6513f3b4d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2941,6 +2941,7 @@ int netdev_master_upper_dev_link_private(struct net_device *dev, void *private); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev); +void netdev_adjacent_rename_links(struct net_device *dev, char *oldname); void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev); int skb_checksum_help(struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 130d3bd0ce6f..995755733997 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1119,6 +1119,8 @@ rollback: write_seqcount_end(&devnet_rename_seq); + netdev_adjacent_rename_links(dev, oldname); + write_lock_bh(&dev_base_lock); hlist_del_rcu(&dev->name_hlist); write_unlock_bh(&dev_base_lock); @@ -1138,6 +1140,7 @@ rollback: err = ret; write_seqcount_begin(&devnet_rename_seq); memcpy(dev->name, oldname, IFNAMSIZ); + memcpy(oldname, newname, IFNAMSIZ); goto rollback; } else { pr_err("%s: name change rollback failed: %d\n", @@ -4999,6 +5002,25 @@ void netdev_upper_dev_unlink(struct net_device *dev, } EXPORT_SYMBOL(netdev_upper_dev_unlink); +void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) +{ + struct netdev_adjacent *iter; + + list_for_each_entry(iter, &dev->adj_list.upper, list) { + netdev_adjacent_sysfs_del(iter->dev, oldname, + &iter->dev->adj_list.lower); + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.lower); + } + + list_for_each_entry(iter, &dev->adj_list.lower, list) { + netdev_adjacent_sysfs_del(iter->dev, oldname, + &iter->dev->adj_list.upper); + netdev_adjacent_sysfs_add(iter->dev, dev, + &iter->dev->adj_list.upper); + } +} + void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev) { -- cgit v1.2.3 From 1d486bfb66971ebacc2a46a23431ace9af70dc66 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Thu, 16 Jan 2014 00:02:18 +0100 Subject: net: add NETDEV_PRECHANGEMTU to notify before mtu change happens Currently, if a device changes its mtu, first the change happens (invloving all the side effects), and after that the NETDEV_CHANGEMTU is sent so that other devices can catch up with the new mtu. However, if they return NOTIFY_BAD, then the change is reverted and error returned. This is a really long and costy operation (sometimes). To fix this, add NETDEV_PRECHANGEMTU notification which is called prior to any change actually happening, and if any callee returns NOTIFY_BAD - the change is aborted. This way we're skipping all the playing with apply/revert the mtu. CC: "David S. Miller" CC: Jiri Pirko CC: Eric Dumazet CC: Nicolas Dichtel CC: Cong Wang Signed-off-by: Veaceslav Falico Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- net/core/dev.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 30f6513f3b4d..d7668b881d08 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1718,7 +1718,7 @@ struct pcpu_sw_netstats { #define NETDEV_CHANGE 0x0004 /* Notify device state change */ #define NETDEV_REGISTER 0x0005 #define NETDEV_UNREGISTER 0x0006 -#define NETDEV_CHANGEMTU 0x0007 +#define NETDEV_CHANGEMTU 0x0007 /* notify after mtu change happened */ #define NETDEV_CHANGEADDR 0x0008 #define NETDEV_GOING_DOWN 0x0009 #define NETDEV_CHANGENAME 0x000A @@ -1734,6 +1734,7 @@ struct pcpu_sw_netstats { #define NETDEV_JOIN 0x0014 #define NETDEV_CHANGEUPPER 0x0015 #define NETDEV_RESEND_IGMP 0x0016 +#define NETDEV_PRECHANGEMTU 0x0017 /* notify before mtu change happened */ int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); diff --git a/net/core/dev.c b/net/core/dev.c index b2c1869b04e3..f87bedd51eed 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5392,6 +5392,11 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) if (!netif_device_present(dev)) return -ENODEV; + err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); + err = notifier_to_errno(err); + if (err) + return err; + orig_mtu = dev->mtu; err = __dev_set_mtu(dev, new_mtu); -- cgit v1.2.3 From a953be53ce40440acb4740edb48577b9468d4c3d Mon Sep 17 00:00:00 2001 From: Michael Dalton Date: Thu, 16 Jan 2014 22:23:28 -0800 Subject: net-sysfs: add support for device-specific rx queue sysfs attributes Extend existing support for netdevice receive queue sysfs attributes to permit a device-specific attribute group. Initial use case for this support will be to allow the virtio-net device to export per-receive queue mergeable receive buffer size. Signed-off-by: Michael Dalton Signed-off-by: David S. Miller --- include/linux/netdevice.h | 35 +++++++++++++++++++++++++++++---- net/core/dev.c | 12 ++++++------ net/core/net-sysfs.c | 50 +++++++++++++++++++++++++++-------------------- 3 files changed, 66 insertions(+), 31 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d7668b881d08..e985231fe04b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -668,15 +668,28 @@ extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id); #endif +#endif /* CONFIG_RPS */ /* This structure contains an instance of an RX queue. */ struct netdev_rx_queue { +#ifdef CONFIG_RPS struct rps_map __rcu *rps_map; struct rps_dev_flow_table __rcu *rps_flow_table; +#endif struct kobject kobj; struct net_device *dev; } ____cacheline_aligned_in_smp; -#endif /* CONFIG_RPS */ + +/* + * RX queue sysfs structures and functions. + */ +struct rx_queue_attribute { + struct attribute attr; + ssize_t (*show)(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, char *buf); + ssize_t (*store)(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, const char *buf, size_t len); +}; #ifdef CONFIG_XPS /* @@ -1313,7 +1326,7 @@ struct net_device { unicast) */ -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS struct netdev_rx_queue *_rx; /* Number of RX queues allocated at register_netdev() time */ @@ -1424,6 +1437,8 @@ struct net_device { struct device dev; /* space for optional device, statistics, and wireless sysfs groups */ const struct attribute_group *sysfs_groups[4]; + /* space for optional per-rx queue attributes */ + const struct attribute_group *sysfs_rx_queue_group; /* rtnetlink link ops */ const struct rtnl_link_ops *rtnl_link_ops; @@ -2375,7 +2390,7 @@ static inline bool netif_is_multiqueue(const struct net_device *dev) int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq); #else static inline int netif_set_real_num_rx_queues(struct net_device *dev, @@ -2394,7 +2409,7 @@ static inline int netif_copy_real_num_queues(struct net_device *to_dev, from_dev->real_num_tx_queues); if (err) return err; -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS return netif_set_real_num_rx_queues(to_dev, from_dev->real_num_rx_queues); #else @@ -2402,6 +2417,18 @@ static inline int netif_copy_real_num_queues(struct net_device *to_dev, #endif } +#ifdef CONFIG_SYSFS +static inline unsigned int get_netdev_rx_queue_index( + struct netdev_rx_queue *queue) +{ + struct net_device *dev = queue->dev; + int index = queue - dev->_rx; + + BUG_ON(index >= dev->num_rx_queues); + return index; +} +#endif + #define DEFAULT_MAX_NUM_RSS_QUEUES (8) int netif_get_num_default_rss_queues(void); diff --git a/net/core/dev.c b/net/core/dev.c index f87bedd51eed..288df6232006 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2083,7 +2083,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) } EXPORT_SYMBOL(netif_set_real_num_tx_queues); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS /** * netif_set_real_num_rx_queues - set actual number of RX queues used * @dev: Network device @@ -5764,7 +5764,7 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, } EXPORT_SYMBOL(netif_stacked_transfer_operstate); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; @@ -6309,7 +6309,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, return NULL; } -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS if (rxqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); return NULL; @@ -6365,7 +6365,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (netif_alloc_netdev_queues(dev)) goto free_all; -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS dev->num_rx_queues = rxqs; dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) @@ -6385,7 +6385,7 @@ free_all: free_pcpu: free_percpu(dev->pcpu_refcnt); netif_free_tx_queues(dev); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS kfree(dev->_rx); #endif @@ -6410,7 +6410,7 @@ void free_netdev(struct net_device *dev) release_net(dev_net(dev)); netif_free_tx_queues(dev); -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS kfree(dev->_rx); #endif diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 49843bf7e43e..7eeadeecc5a2 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -498,17 +498,7 @@ static struct attribute_group wireless_group = { #define net_class_groups NULL #endif /* CONFIG_SYSFS */ -#ifdef CONFIG_RPS -/* - * RX queue sysfs structures and functions. - */ -struct rx_queue_attribute { - struct attribute attr; - ssize_t (*show)(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, char *buf); - ssize_t (*store)(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, const char *buf, size_t len); -}; +#ifdef CONFIG_SYSFS #define to_rx_queue_attr(_attr) container_of(_attr, \ struct rx_queue_attribute, attr) @@ -543,6 +533,7 @@ static const struct sysfs_ops rx_queue_sysfs_ops = { .store = rx_queue_attr_store, }; +#ifdef CONFIG_RPS static ssize_t show_rps_map(struct netdev_rx_queue *queue, struct rx_queue_attribute *attribute, char *buf) { @@ -718,16 +709,20 @@ static struct rx_queue_attribute rps_cpus_attribute = static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute = __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR, show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); +#endif /* CONFIG_RPS */ static struct attribute *rx_queue_default_attrs[] = { +#ifdef CONFIG_RPS &rps_cpus_attribute.attr, &rps_dev_flow_table_cnt_attribute.attr, +#endif NULL }; static void rx_queue_release(struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); +#ifdef CONFIG_RPS struct rps_map *map; struct rps_dev_flow_table *flow_table; @@ -743,6 +738,7 @@ static void rx_queue_release(struct kobject *kobj) RCU_INIT_POINTER(queue->rps_flow_table, NULL); call_rcu(&flow_table->rcu, rps_dev_flow_table_release); } +#endif memset(kobj, 0, sizeof(*kobj)); dev_put(queue->dev); @@ -763,25 +759,36 @@ static int rx_queue_add_kobject(struct net_device *net, int index) kobj->kset = net->queues_kset; error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, "rx-%u", index); - if (error) { - kobject_put(kobj); - return error; + if (error) + goto exit; + + if (net->sysfs_rx_queue_group) { + error = sysfs_create_group(kobj, net->sysfs_rx_queue_group); + if (error) + goto exit; } kobject_uevent(kobj, KOBJ_ADD); dev_hold(queue->dev); + return error; +exit: + kobject_put(kobj); return error; } -#endif /* CONFIG_RPS */ +#endif /* CONFIG_SYFS */ int net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) { -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS int i; int error = 0; +#ifndef CONFIG_RPS + if (!net->sysfs_rx_queue_group) + return 0; +#endif for (i = old_num; i < new_num; i++) { error = rx_queue_add_kobject(net, i); if (error) { @@ -790,8 +797,12 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) } } - while (--i >= new_num) + while (--i >= new_num) { + if (net->sysfs_rx_queue_group) + sysfs_remove_group(&net->_rx[i].kobj, + net->sysfs_rx_queue_group); kobject_put(&net->_rx[i].kobj); + } return error; #else @@ -1155,9 +1166,6 @@ static int register_queue_kobjects(struct net_device *net) NULL, &net->dev.kobj); if (!net->queues_kset) return -ENOMEM; -#endif - -#ifdef CONFIG_RPS real_rx = net->real_num_rx_queues; #endif real_tx = net->real_num_tx_queues; @@ -1184,7 +1192,7 @@ static void remove_queue_kobjects(struct net_device *net) { int real_rx = 0, real_tx = 0; -#ifdef CONFIG_RPS +#ifdef CONFIG_SYSFS real_rx = net->real_num_rx_queues; #endif real_tx = net->real_num_tx_queues; -- cgit v1.2.3 From 1d3ee88ae0d605629bf369ab0b868dae8ca62a48 Mon Sep 17 00:00:00 2001 From: "sfeldma@cumulusnetworks.com" Date: Thu, 16 Jan 2014 22:57:56 -0800 Subject: bonding: add netlink attributes to slave link dev If link is IFF_SLAVE, extend link dev netlink attributes to include slave attributes with new IFLA_SLAVE nest. Add netlink notification (RTM_NEWLINK) when slave status changes from backup to active, or visa-versa. Adds new ndo_get_slave op to net_device_ops to fill skb with IFLA_SLAVE attributes. Currently only used by bonding driver, but could be used by other aggregating devices with slaves. Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 1 + drivers/net/bonding/bond_netlink.c | 36 +++++++++++++++++++++++++ drivers/net/bonding/bonding.h | 11 ++++++-- include/linux/netdevice.h | 5 ++++ include/uapi/linux/if_link.h | 13 +++++++++ net/core/rtnetlink.c | 54 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 118 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index df85cec3e5d9..3220b488dd1e 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3883,6 +3883,7 @@ static const struct net_device_ops bond_netdev_ops = { #endif .ndo_add_slave = bond_enslave, .ndo_del_slave = bond_release, + .ndo_get_slave = bond_get_slave, .ndo_fix_features = bond_fix_features, }; diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index 555c7837d8e6..21c648854a8c 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -22,6 +22,42 @@ #include #include "bonding.h" +int bond_get_slave(struct net_device *slave_dev, struct sk_buff *skb) +{ + struct slave *slave = bond_slave_get_rtnl(slave_dev); + const struct aggregator *agg; + + if (nla_put_u8(skb, IFLA_SLAVE_STATE, bond_slave_state(slave))) + goto nla_put_failure; + + if (nla_put_u8(skb, IFLA_SLAVE_MII_STATUS, slave->link)) + goto nla_put_failure; + + if (nla_put_u32(skb, IFLA_SLAVE_LINK_FAILURE_COUNT, + slave->link_failure_count)) + goto nla_put_failure; + + if (nla_put(skb, IFLA_SLAVE_PERM_HWADDR, + slave_dev->addr_len, slave->perm_hwaddr)) + goto nla_put_failure; + + if (nla_put_u16(skb, IFLA_SLAVE_QUEUE_ID, slave->queue_id)) + goto nla_put_failure; + + if (slave->bond->params.mode == BOND_MODE_8023AD) { + agg = SLAVE_AD_INFO(slave).port.aggregator; + if (agg) + if (nla_put_u16(skb, IFLA_SLAVE_AD_AGGREGATOR_ID, + agg->aggregator_identifier)) + goto nla_put_failure; + } + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = { [IFLA_BOND_MODE] = { .type = NLA_U8 }, [IFLA_BOND_ACTIVE_SLAVE] = { .type = NLA_U32 }, diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h index 309757d8482b..8a935f8f2b3c 100644 --- a/drivers/net/bonding/bonding.h +++ b/drivers/net/bonding/bonding.h @@ -285,12 +285,18 @@ static inline bool bond_is_lb(const struct bonding *bond) static inline void bond_set_active_slave(struct slave *slave) { - slave->backup = 0; + if (slave->backup) { + slave->backup = 0; + rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, GFP_KERNEL); + } } static inline void bond_set_backup_slave(struct slave *slave) { - slave->backup = 1; + if (!slave->backup) { + slave->backup = 1; + rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, GFP_KERNEL); + } } static inline int bond_slave_state(struct slave *slave) @@ -426,6 +432,7 @@ int bond_sysfs_slave_add(struct slave *slave); void bond_sysfs_slave_del(struct slave *slave); int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev); int bond_release(struct net_device *bond_dev, struct net_device *slave_dev); +int bond_get_slave(struct net_device *slave_dev, struct sk_buff *skb); int bond_xmit_hash(struct bonding *bond, struct sk_buff *skb, int count); int bond_parse_parm(const char *mode_arg, const struct bond_parm_tbl *tbl); int bond_parm_tbl_lookup(int mode, const struct bond_parm_tbl *tbl); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e985231fe04b..83ce2aee65e6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -921,6 +921,9 @@ struct netdev_phys_port_id { * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); * Called to release previously enslaved netdev. * + * int (*ndo_get_slave)(struct net_device *slave_dev, struct sk_buff *skb); + * Called to fill netlink skb with slave info. + * * Feature/offload setting functions. * netdev_features_t (*ndo_fix_features)(struct net_device *dev, * netdev_features_t features); @@ -1093,6 +1096,8 @@ struct net_device_ops { struct net_device *slave_dev); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); + int (*ndo_get_slave)(struct net_device *slave_dev, + struct sk_buff *skb); netdev_features_t (*ndo_fix_features)(struct net_device *dev, netdev_features_t features); int (*ndo_set_features)(struct net_device *dev, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 3e6bd3c7445d..ba2f3bf5fdf5 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -144,6 +144,7 @@ enum { IFLA_NUM_RX_QUEUES, IFLA_CARRIER, IFLA_PHYS_PORT_ID, + IFLA_SLAVE, __IFLA_MAX }; @@ -368,6 +369,18 @@ enum { #define IFLA_BOND_AD_INFO_MAX (__IFLA_BOND_AD_INFO_MAX - 1) +enum { + IFLA_SLAVE_STATE, + IFLA_SLAVE_MII_STATUS, + IFLA_SLAVE_LINK_FAILURE_COUNT, + IFLA_SLAVE_PERM_HWADDR, + IFLA_SLAVE_QUEUE_ID, + IFLA_SLAVE_AD_AGGREGATOR_ID, + __IFLA_SLAVE_MAX, +}; + +#define IFLA_SLAVE_MAX (__IFLA_SLAVE_MAX - 1) + /* SR-IOV virtual function management section */ enum { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e6e7d582f901..4f85de7aca33 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -721,6 +721,28 @@ static size_t rtnl_port_size(const struct net_device *dev) return port_self_size; } +static size_t rtnl_bond_slave_size(const struct net_device *dev) +{ + struct net_device *bond; + size_t slave_size = + nla_total_size(sizeof(struct nlattr)) + /* IFLA_SLAVE */ + nla_total_size(1) + /* IFLA_SLAVE_STATE */ + nla_total_size(1) + /* IFLA_SLAVE_MII_STATUS */ + nla_total_size(4) + /* IFLA_SLAVE_LINK_FAILURE_COUNT */ + nla_total_size(MAX_ADDR_LEN) + /* IFLA_SLAVE_PERM_HWADDR */ + nla_total_size(2) + /* IFLA_SLAVE_QUEUE_ID */ + nla_total_size(2) + /* IFLA_SLAVE_AD_AGGREGATOR_ID */ + 0; + + if (netif_is_bond_slave((struct net_device *)dev)) { + bond = netdev_master_upper_dev_get((struct net_device *)dev); + if (bond && bond->netdev_ops->ndo_get_slave) + return slave_size; + } + + return 0; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -750,6 +772,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ + + rtnl_bond_slave_size(dev) /* IFLA_SLAVE */ + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */ } @@ -847,6 +870,34 @@ static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) return 0; } +static size_t rtnl_bond_slave_fill(struct sk_buff *skb, struct net_device *dev) +{ + struct net_device *bond; + struct nlattr *nest; + int err; + + if (!netif_is_bond_slave(dev)) + return 0; + + bond = netdev_master_upper_dev_get(dev); + if (!bond || !bond->netdev_ops->ndo_get_slave) + return 0; + + nest = nla_nest_start(skb, IFLA_SLAVE); + if (!nest) + return -EMSGSIZE; + + err = bond->netdev_ops->ndo_get_slave(dev, skb); + if (err) { + nla_nest_cancel(skb, nest); + return (err == -EMSGSIZE) ? err : 0; + } + + nla_nest_end(skb, nest); + + return 0; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask) @@ -1001,6 +1052,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_port_fill(skb, dev)) goto nla_put_failure; + if (rtnl_bond_slave_fill(skb, dev)) + goto nla_put_failure; + if (dev->rtnl_link_ops) { if (rtnl_link_fill(skb, dev) < 0) goto nla_put_failure; -- cgit v1.2.3 From b582ef0990d457f7ce8ccf827af51a575ca0b4a6 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Mon, 20 Jan 2014 13:59:19 +0200 Subject: net: Add GRO support for UDP encapsulating protocols Add GRO handlers for protocols that do UDP encapsulation, with the intent of being able to coalesce packets which encapsulate packets belonging to the same TCP session. For GRO purposes, the destination UDP port takes the role of the ether type field in the ethernet header or the next protocol in the IP header. The UDP GRO handler will only attempt to coalesce packets whose destination port is registered to have gro handler. Use a mark on the skb GRO CB data to disallow (flush) running the udp gro receive code twice on a packet. This solves the problem of udp encapsulated packets whose inner VM packet is udp and happen to carry a port which has registered offloads. Signed-off-by: Shlomo Pongratz Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 +++- include/net/protocol.h | 3 + net/core/dev.c | 1 + net/ipv4/udp_offload.c | 143 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 156 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 83ce2aee65e6..c31022980e18 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1675,7 +1675,10 @@ struct napi_gro_cb { unsigned long age; /* Used in ipv6_gro_receive() */ - int proto; + u16 proto; + + /* Used in udp_gro_receive */ + u16 udp_mark; /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; @@ -1714,6 +1717,11 @@ struct packet_offload { struct list_head list; }; +struct udp_offload { + __be16 port; + struct offload_callbacks callbacks; +}; + /* often modified stats are per cpu, other are shared (netdev->stats) */ struct pcpu_sw_netstats { u64 rx_packets; diff --git a/include/net/protocol.h b/include/net/protocol.h index 0e5f8665d7fb..a7e986b08147 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -108,6 +108,9 @@ int inet_del_offload(const struct net_offload *prot, unsigned char num); void inet_register_protosw(struct inet_protosw *p); void inet_unregister_protosw(struct inet_protosw *p); +int udp_add_offload(struct udp_offload *prot); +void udp_del_offload(struct udp_offload *prot); + #if IS_ENABLED(CONFIG_IPV6) int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char num); int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char num); diff --git a/net/core/dev.c b/net/core/dev.c index a578af589198..da92305c344f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3893,6 +3893,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; + NAPI_GRO_CB(skb)->udp_mark = 0; pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); break; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 79c62bdcd3c5..ee853c55deea 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -14,6 +14,15 @@ #include #include +static DEFINE_SPINLOCK(udp_offload_lock); +static struct udp_offload_priv *udp_offload_base __read_mostly; + +struct udp_offload_priv { + struct udp_offload *offload; + struct rcu_head rcu; + struct udp_offload_priv __rcu *next; +}; + static int udp4_ufo_send_check(struct sk_buff *skb) { if (!pskb_may_pull(skb, sizeof(struct udphdr))) @@ -89,10 +98,144 @@ out: return segs; } +int udp_add_offload(struct udp_offload *uo) +{ + struct udp_offload_priv **head = &udp_offload_base; + struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_KERNEL); + + if (!new_offload) + return -ENOMEM; + + new_offload->offload = uo; + + spin_lock(&udp_offload_lock); + rcu_assign_pointer(new_offload->next, rcu_dereference(*head)); + rcu_assign_pointer(*head, rcu_dereference(new_offload)); + spin_unlock(&udp_offload_lock); + + return 0; +} +EXPORT_SYMBOL(udp_add_offload); + +static void udp_offload_free_routine(struct rcu_head *head) +{ + struct udp_offload_priv *ou_priv = container_of(head, struct udp_offload_priv, rcu); + kfree(ou_priv); +} + +void udp_del_offload(struct udp_offload *uo) +{ + struct udp_offload_priv __rcu **head = &udp_offload_base; + struct udp_offload_priv *uo_priv; + + spin_lock(&udp_offload_lock); + + uo_priv = rcu_dereference(*head); + for (; uo_priv != NULL; + uo_priv = rcu_dereference(*head)) { + + if (uo_priv->offload == uo) { + rcu_assign_pointer(*head, rcu_dereference(uo_priv->next)); + goto unlock; + } + head = &uo_priv->next; + } + pr_warn("udp_del_offload: didn't find offload for port %d\n", htons(uo->port)); +unlock: + spin_unlock(&udp_offload_lock); + if (uo_priv != NULL) + call_rcu(&uo_priv->rcu, udp_offload_free_routine); +} +EXPORT_SYMBOL(udp_del_offload); + +static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ + struct udp_offload_priv *uo_priv; + struct sk_buff *p, **pp = NULL; + struct udphdr *uh, *uh2; + unsigned int hlen, off; + int flush = 1; + + if (NAPI_GRO_CB(skb)->udp_mark || + (!skb->encapsulation && skb->ip_summed != CHECKSUM_COMPLETE)) + goto out; + + /* mark that this skb passed once through the udp gro layer */ + NAPI_GRO_CB(skb)->udp_mark = 1; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*uh); + uh = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + uh = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!uh)) + goto out; + } + + rcu_read_lock(); + uo_priv = rcu_dereference(udp_offload_base); + for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { + if (uo_priv->offload->port == uh->dest && + uo_priv->offload->callbacks.gro_receive) + goto unflush; + } + goto out_unlock; + +unflush: + flush = 0; + + for (p = *head; p; p = p->next) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + uh2 = (struct udphdr *)(p->data + off); + if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + + skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ + pp = uo_priv->offload->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + return pp; +} + +static int udp_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct udp_offload_priv *uo_priv; + __be16 newlen = htons(skb->len - nhoff); + struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); + int err = -ENOSYS; + + uh->len = newlen; + + rcu_read_lock(); + + uo_priv = rcu_dereference(udp_offload_base); + for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { + if (uo_priv->offload->port == uh->dest && + uo_priv->offload->callbacks.gro_complete) + break; + } + + if (uo_priv != NULL) + err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr)); + + rcu_read_unlock(); + return err; +} + static const struct net_offload udpv4_offload = { .callbacks = { .gso_send_check = udp4_ufo_send_check, .gso_segment = udp4_ufo_fragment, + .gro_receive = udp_gro_receive, + .gro_complete = udp_gro_complete, }, }; -- cgit v1.2.3 From a9517d0f43832d787a4a0348163d659641bfd83c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 22 Jan 2014 09:05:57 +0100 Subject: rtnetlink: remove ndo_get_slave No longer used API bond-specific can be removed now. This is now handled in a generic way in rtnl_link_ops. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c31022980e18..440a02ee6f92 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -921,9 +921,6 @@ struct netdev_phys_port_id { * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); * Called to release previously enslaved netdev. * - * int (*ndo_get_slave)(struct net_device *slave_dev, struct sk_buff *skb); - * Called to fill netlink skb with slave info. - * * Feature/offload setting functions. * netdev_features_t (*ndo_fix_features)(struct net_device *dev, * netdev_features_t features); @@ -1096,8 +1093,6 @@ struct net_device_ops { struct net_device *slave_dev); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); - int (*ndo_get_slave)(struct net_device *slave_dev, - struct sk_buff *skb); netdev_features_t (*ndo_fix_features)(struct net_device *dev, netdev_features_t features); int (*ndo_set_features)(struct net_device *dev, -- cgit v1.2.3